diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc index 5236f4c8baa8699edd9a27360f86ab4b8582e5e7..04b844e8e4e64f9098c8b3d4d5000599f75ed713 100644 --- a/mace/kernels/opencl/activation_opencl.cc +++ b/mace/kernels/opencl/activation_opencl.cc @@ -110,7 +110,7 @@ void ActivationFunctor::operator()(const Tensor *input, input_shape_ = input->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 0}; std::string tuning_key = Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index ab4d97bca81d93f7d19c87fb72f10ea14b3e3c11..a67c6aaf120f8a04acf6ba065261fa9aa6b8f377 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -105,7 +105,7 @@ void AddNFunctor::operator()( input_shape_ = input_tensors[0]->shape(); } - const std::vector lws = {kwg_size_ / 16, 16, 1}; + const std::vector lws = {kwg_size_ / 16, 16, 0}; std::stringstream ss; ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1] << "_" << output_shape[2] << "_" << output_shape[3]; diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index d29d8b42070577950a1a26304b751e5cd065f366..84f926334c71494436744a7df5ff9151a67e4ed6 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -116,7 +116,7 @@ void BatchNormFunctor::operator()(const Tensor *input, input_shape_ = input->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 0}; std::string tuning_key = Concat("batch_norm_opencl_kernel_", activation_, output->dim(0), output->dim(1), output->dim(2), output->dim(3), folded_constant_); diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc index a03652ff2f3a9f3100b94c67233ea3a34c3b4dbd..0a6a460798e47da02bf91a37e4ba8282d1ef0535 100644 --- a/mace/kernels/opencl/channel_shuffle.cc +++ b/mace/kernels/opencl/channel_shuffle.cc @@ -90,7 +90,7 @@ void ChannelShuffleFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 0}; std::stringstream ss; ss << "channel_shuffle_opencl_kernel_" << output->dim(0) << "_" diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index 0b278bd3928cf6f1195a8b0fb6f18798958f8325..b22896ea84a71253409fdd54cc7c6a1d0aec919d 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -95,7 +95,7 @@ static void Concat2(cl::Kernel *kernel, *prev_input_shape = input0->shape(); } - const std::vector lws = {8, *kwg_size / 64, 8, 1}; + const std::vector lws = {8, *kwg_size / 64, 8, 0}; std::stringstream ss; ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index 51a860ad601f526eb4cc1ebb0fc2ad0e6e491d76..9f7694bc6517b66cc4dc146babfc7354c06081c5 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -130,7 +130,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - const std::vector lws = {8, *kwg_size / 64, 8, 1}; + const std::vector lws = {8, *kwg_size / 64, 8, 0}; std::string tuning_key = Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index bea5601a9d86eb1e6a48519c1f6c95928f411848..fb86602c99fc5efaefa2b7f2cb9d5c272798a50c 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -128,7 +128,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - const std::vector lws = {4, *kwg_size / 32, 8, 1}; + const std::vector lws = {4, *kwg_size / 32, 8, 0}; std::string tuning_key = Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc index 0e2f30cbe68b019e5995ca7d301501b3aec9a7de..9db8df0f64bf676a2d090532299df6a138541162 100644 --- a/mace/kernels/opencl/conv_2d_opencl_general.cc +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -130,7 +130,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - const std::vector lws = {8, *kwg_size / 64, 8, 1}; + const std::vector lws = {8, *kwg_size / 64, 8, 0}; std::string tuning_key = Concat("conv2d_general_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/cwise_opencl.cc b/mace/kernels/opencl/cwise_opencl.cc index 4256f5ba54b186a0646e92420a9621e73d2712bb..cf716c27937c9c14b8df53f4061e3722aa4e8797 100644 --- a/mace/kernels/opencl/cwise_opencl.cc +++ b/mace/kernels/opencl/cwise_opencl.cc @@ -76,7 +76,7 @@ void CWiseFunctor::operator()(const Tensor *input, input_shape_ = input->shape(); } - const std::vector lws = {kwg_size_ / 16, 16, 1}; + const std::vector lws = {kwg_size_ / 16, 16, 0}; std::stringstream ss; ss << "cwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); diff --git a/mace/kernels/opencl/depth_to_space_opencl.cc b/mace/kernels/opencl/depth_to_space_opencl.cc index 56e1e106f3a33c7ca2dd448dcf35f7ca49fb9745..1ecbc891bfe9682b3c0f64a23f393e5ada3c5be4 100644 --- a/mace/kernels/opencl/depth_to_space_opencl.cc +++ b/mace/kernels/opencl/depth_to_space_opencl.cc @@ -134,7 +134,7 @@ void DepthToSpaceOpFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 0}; TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc index 1b2b372574bf62dc5f3b55cc9ca9757f4bf83b54..534aa64e6152e2f6eaa930f8e98c8960711d5479 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl.cc +++ b/mace/kernels/opencl/depthwise_conv_opencl.cc @@ -149,7 +149,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - const std::vector lws = {8, *kwg_size / 64, 8, 1}; + const std::vector lws = {8, *kwg_size / 64, 8, 0}; std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation, batch, height, width, channels, multiplier); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc index 9a875a820cb675007e5f74d520c7589627474223..629ba89045b043f2b1f7965eefc875a32a78b8ae 100644 --- a/mace/kernels/opencl/eltwise_opencl.cc +++ b/mace/kernels/opencl/eltwise_opencl.cc @@ -85,7 +85,7 @@ void EltwiseFunctor::operator()(const Tensor *input0, input_shape_ = input0->shape(); } - const std::vector lws = {kwg_size_ / 16, 16, 1}; + const std::vector lws = {kwg_size_ / 16, 16, 0}; std::stringstream ss; ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); diff --git a/mace/kernels/opencl/fully_connected_opencl.cc b/mace/kernels/opencl/fully_connected_opencl.cc index bf0961b82ff99dad3c183ac2f0306fb93598dcc0..50f8ed5cd5e9c3ad40b8f9cc0d3a5928fc1fb93c 100644 --- a/mace/kernels/opencl/fully_connected_opencl.cc +++ b/mace/kernels/opencl/fully_connected_opencl.cc @@ -233,7 +233,7 @@ void FCWTXKernel(cl::Kernel *kernel, uint32_t kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - *lws = {16, kwg_size/16, 1}; + *lws = {16, kwg_size/16, 0}; } if (!IsVecEqual(*prev_input_shape, input->shape())) { const index_t batch = output->dim(0); diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index d822a7c050f3b4a021e9e72255e1332050d42461..06df28895506f64463e9d00451b3162d30a7b2c6 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -223,23 +223,23 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, std::min(gws[2], kwg_size / (local_ws[0] * local_ws[1])); return { // TODO(heliangliang): tuning these magic numbers - {local_ws[0], local_ws[1], local_ws[2], 1}, - {kwg_size / 16, 4, 4, 1}, - {kwg_size / 32, 4, 8, 1}, - {kwg_size / 32, 8, 4, 1}, - {kwg_size / 64, 8, 8, 1}, - {kwg_size / 64, 16, 4, 1}, - {kwg_size / 128, 8, 16, 1}, - {kwg_size / 128, 16, 8, 1}, - {kwg_size / 128, 32, 4, 1}, - {1, kwg_size / 32, 32, 1}, - {1, kwg_size / 64, 64, 1}, - {1, kwg_size / 128, 128, 1}, - {4, kwg_size / 16, 4, 1}, - {4, kwg_size / 28, 7, 1}, - {4, kwg_size / 32, 8, 1}, - {4, kwg_size / 56, 14, 1}, - {1, kwg_size, 1, 1}, + {local_ws[0], local_ws[1], local_ws[2], 0}, + {kwg_size / 16, 4, 4, 0}, + {kwg_size / 32, 4, 8, 0}, + {kwg_size / 32, 8, 4, 0}, + {kwg_size / 64, 8, 8, 0}, + {kwg_size / 64, 16, 4, 0}, + {kwg_size / 128, 8, 16, 0}, + {kwg_size / 128, 16, 8, 0}, + {kwg_size / 128, 32, 4, 0}, + {1, kwg_size / 32, 32, 0}, + {1, kwg_size / 64, 64, 0}, + {1, kwg_size / 128, 128, 0}, + {4, kwg_size / 16, 4, 0}, + {4, kwg_size / 28, 7, 0}, + {4, kwg_size / 32, 8, 0}, + {4, kwg_size / 56, 14, 0}, + {1, kwg_size, 1, 0}, }; }; cl::Event event; @@ -248,46 +248,35 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, MACE_CHECK(params.size() == 4) << "Tuning parameters of 3D kernel must be 4D"; cl_int error = CL_SUCCESS; - std::vector roundup_gws(3); + std::vector internal_gws(gws, gws+3); if (!runtime->IsNonUniformWorkgroupsSupported()) { for (size_t i = 0; i < 3; ++i) { - roundup_gws[i] = RoundUp(gws[i], params[i]); + internal_gws[i] = RoundUp(gws[i], params[i]); } } if (timer == nullptr) { - uint32_t num_blocks = params[3]; - const uint32_t block_size = gws[2] / num_blocks; - if (gws[2] % num_blocks > 0) num_blocks++; + uint32_t block_size = params[3] == 0 ? internal_gws[2] : params[3]; + const uint32_t num_blocks = RoundUpDiv(internal_gws[2], + block_size); for (uint32_t i = 0; i < num_blocks; ++i) { - uint32_t gws2 = - (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NDRange(0, 0, i * block_size), - cl::NDRange(gws[0], gws[1], gws2), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - } else { - uint32_t roundup_gws2 = RoundUp(gws2, params[2]); - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NDRange(0, 0, i * block_size), - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + uint32_t gws2 = block_size; + if (runtime->IsNonUniformWorkgroupsSupported() + && (i == num_blocks - 1)) { + gws2 = (internal_gws[2] - (i * block_size)); } + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, 0, i * block_size), + cl::NDRange(internal_gws[0], internal_gws[1], gws2), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); MACE_CHECK_CL_SUCCESS(error); } } else { timer->ClearTiming(); - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - } else { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NullRange, - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - } + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NullRange, + cl::NDRange(internal_gws[0], internal_gws[1], internal_gws[2]), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); MACE_CHECK_CL_SUCCESS(error); timer->AccumulateTiming(); tuning_result->assign(params.begin(), params.end()); @@ -297,24 +286,22 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, timer->ClearTiming(); uint32_t num_blocks = std::min( static_cast(elapse_time / kMaxKernelExeTime) + 1, gws[2]); - (*tuning_result)[3] = num_blocks; - const uint32_t block_size = gws[2] / num_blocks; - if (gws[2] % num_blocks > 0) num_blocks++; + uint32_t block_size = gws[2] / num_blocks; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + block_size = RoundUp(block_size, params[2]); + } + (*tuning_result)[3] = block_size; + num_blocks = RoundUpDiv(internal_gws[2], block_size); for (uint32_t i = 0; i < num_blocks; ++i) { - uint32_t gws2 = - (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NDRange(0, 0, i * block_size), - cl::NDRange(gws[0], gws[1], gws2), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - } else { - uint32_t roundup_gws2 = RoundUp(gws2, params[2]); - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NDRange(0, 0, i * block_size), - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + uint32_t gws2 = block_size; + if (runtime->IsNonUniformWorkgroupsSupported() + && (i == num_blocks - 1)) { + gws2 = (internal_gws[2] - (i * block_size)); } + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, 0, i * block_size), + cl::NDRange(internal_gws[0], internal_gws[1], gws2), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); MACE_CHECK_CL_SUCCESS(error); timer->AccumulateTiming(); } @@ -349,16 +336,16 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, uint32_t local_ws[2]; local_ws[0] = std::min(gws[0], kwg_size); local_ws[1] = std::min(gws[1], kwg_size / local_ws[0]); - return {{local_ws[0], local_ws[1], 1}, - {local_ws[1], local_ws[0], 1}, - {kwg_size / 4, 4, 1}, - {kwg_size / 16, 16, 1}, - {kwg_size / 32, 32, 1}, - {kwg_size / 64, 64, 1}, - {kwg_size / 128, 128, 1}, - {kwg_size / 256, 256, 1}, - {kwg_size, 1, 1}, - {1, kwg_size, 1}}; + return {{local_ws[0], local_ws[1], 0}, + {local_ws[1], local_ws[0], 0}, + {kwg_size / 4, 4, 0}, + {kwg_size / 16, 16, 0}, + {kwg_size / 32, 32, 0}, + {kwg_size / 64, 64, 0}, + {kwg_size / 128, 128, 0}, + {kwg_size / 256, 256, 0}, + {kwg_size, 1, 0}, + {1, kwg_size, 0}}; }; cl::Event event; auto func = [&](const std::vector ¶ms, Timer *timer, @@ -366,44 +353,34 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, MACE_CHECK(params.size() == 3) << "Tuning parameters of 2D kernel must be 3d"; cl_int error = CL_SUCCESS; - std::vector roundup_gws(2); + std::vector internal_gws(gws, gws+2); if (!runtime->IsNonUniformWorkgroupsSupported()) { for (size_t i = 0; i < 2; ++i) { - roundup_gws[i] = RoundUp(gws[i], params[i]); + internal_gws[i] = RoundUp(gws[i], params[i]); } } if (timer == nullptr) { - uint32_t num_blocks = params[2]; - const uint32_t block_size = gws[1] / num_blocks; - if (gws[1] % num_blocks > 0) num_blocks++; + uint32_t block_size = params[2] == 0 ? internal_gws[1] : params[2]; + const uint32_t num_blocks = RoundUpDiv(internal_gws[1], + block_size); for (uint32_t i = 0; i < num_blocks; ++i) { - uint32_t gws1 = - (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1), - cl::NDRange(params[0], params[1]), nullptr, &event); - } else { - uint32_t roundup_gws1 = RoundUp(gws1, params[1]); - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NDRange(0, i * block_size), - cl::NDRange(roundup_gws[0], roundup_gws1), - cl::NDRange(params[0], params[1]), nullptr, &event); + uint32_t gws1 = block_size; + if (runtime->IsNonUniformWorkgroupsSupported() + && (i == num_blocks - 1)) { + gws1 = (internal_gws[1] - (i * block_size)); } + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, i * block_size), + cl::NDRange(internal_gws[0], gws1), + cl::NDRange(params[0], params[1]), nullptr, &event); MACE_CHECK_CL_SUCCESS(error); } } else { timer->ClearTiming(); - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), - cl::NDRange(params[0], params[1]), nullptr, &event); - } else { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), - cl::NDRange(params[0], params[1]), nullptr, &event); - } + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NullRange, cl::NDRange(internal_gws[0], internal_gws[1]), + cl::NDRange(params[0], params[1]), nullptr, &event); MACE_CHECK_CL_SUCCESS(error); timer->AccumulateTiming(); tuning_result->assign(params.begin(), params.end()); @@ -413,24 +390,22 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, timer->ClearTiming(); uint32_t num_blocks = std::min( static_cast(elapse_time / kMaxKernelExeTime) + 1, gws[1]); - (*tuning_result)[2] = num_blocks; - const uint32_t block_size = gws[1] / num_blocks; - if (gws[1] % num_blocks > 0) num_blocks++; + uint32_t block_size = gws[1] / num_blocks; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + block_size = RoundUp(block_size, params[1]); + } + (*tuning_result)[2] = block_size; + num_blocks = RoundUpDiv(internal_gws[1], block_size); for (uint32_t i = 0; i < num_blocks; ++i) { - uint32_t gws1 = - (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NDRange(0, i * block_size), - cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]), - nullptr, &event); - } else { - uint32_t roundup_gws1 = RoundUp(gws1, params[1]); - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NDRange(0, i * block_size), - cl::NDRange(roundup_gws[0], roundup_gws1), - cl::NDRange(params[0], params[1]), nullptr, &event); + uint32_t gws1 = block_size; + if (runtime->IsNonUniformWorkgroupsSupported() + && (i == num_blocks - 1)) { + gws1 = (internal_gws[1] - (i * block_size)); } + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, i * block_size), + cl::NDRange(internal_gws[0], gws1), + cl::NDRange(params[0], params[1]), nullptr, &event); MACE_CHECK_CL_SUCCESS(error); timer->AccumulateTiming(); } diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index a462f948b21f06ce5322c7216a34aa86d8e0e991..d941040ed83ed1151569c5139eb30f73436358b5 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -84,7 +84,7 @@ void MatMulFunctor::operator()(const Tensor *A, kernel_.setArg(idx++, static_cast(height_blocks)); kernel_.setArg(idx++, static_cast(RoundUpDiv4(A->dim(2)))); - const std::vector lws = {kwg_size_ / 64, 64, 1}; + const std::vector lws = {kwg_size_ / 64, 64, 0}; std::stringstream ss; ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_" << C->dim(2) << "_" << C->dim(3); diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc index df408555e097075927fd8f24fe6a76841f2c63b8..45f27d6dae21ec2f8b2c3d5b9d4913c913f753b0 100644 --- a/mace/kernels/opencl/pad.cc +++ b/mace/kernels/opencl/pad.cc @@ -100,7 +100,7 @@ void PadFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 0}; std::string tuning_key = Concat("pad", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index ea1e3f2cdaac470cb41aa80ecd4f4237fea903a5..e3d9081e007446d54e5ccdb68014d48caa1855f2 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -134,7 +134,7 @@ void PoolingFunctor::operator()(const Tensor *input, }; } - std::vector lws = {8, kwg_size_ / 64, 8, 1}; + std::vector lws = {8, kwg_size_ / 64, 8, 0}; std::stringstream ss; ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc index 15cbaf575fc4cbcf24c08f6b42be94495f4fd825..be4fe3cd103d7f5ce25fe41c0a26d06d7b69d8fd 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear_opencl.cc @@ -99,7 +99,7 @@ void ResizeBilinearFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 0}; std::stringstream ss; ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc index 54cf842c912fd3987cbc270ec5f75569d3011b93..7e463997b1f70c1c7593ead2fb5e48dd70a728dd 100644 --- a/mace/kernels/opencl/softmax_opencl.cc +++ b/mace/kernels/opencl/softmax_opencl.cc @@ -81,7 +81,7 @@ void SoftmaxFunctor::operator()(const Tensor *logits, input_shape_ = logits->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 0}; std::stringstream ss; ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc index e4702c9c42f19d1552ba348b56df1e51c463bd4d..c5b9df5a0b72bcf758c0728c63bb7e4a883afde9 100644 --- a/mace/kernels/opencl/space_to_batch_opencl.cc +++ b/mace/kernels/opencl/space_to_batch_opencl.cc @@ -105,7 +105,7 @@ void SpaceToBatchFunctor::operator()( space_shape_ = space_tensor->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 0}; std::stringstream ss; ss << kernel_name << "_" << batch_tensor->dim(0) << "_" << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_" diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index 480825f6003fe8562977109bccb20ac82d4dc113..591a6208e654ac5edbf12f7687f844e5e52f6acc 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -54,7 +54,7 @@ void WinogradTransformFunctor::operator()( static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } std::vector output_shape(4); - std::vector filter_shape = {3, 3, input_tensor->dim(3), 1}; + std::vector filter_shape = {3, 3, 1, input_tensor->dim(3)}; std::vector paddings(2); if (paddings_.empty()) { kernels::CalcNHWCPaddingAndOutputSize( @@ -101,7 +101,7 @@ void WinogradTransformFunctor::operator()( input_shape_ = input_tensor->shape(); } - const std::vector lws = {kwg_size_ / 8, 8, 1}; + const std::vector lws = {kwg_size_ / 8, 8, 0}; std::stringstream ss; ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_" << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_" @@ -215,7 +215,7 @@ void WinogradInverseTransformFunctor::operator()( input_shape_ = input_tensor->shape(); } - const std::vector lws = {kwg_size_ / 8, 8, 1}; + const std::vector lws = {kwg_size_ / 8, 8, 0}; std::stringstream ss; ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_" diff --git a/mace/python/tools/caffe_converter_lib.py b/mace/python/tools/caffe_converter_lib.py index 14e39039df14edb4b62723574e97a6e93b2d8257..69950ccd990dd801207ee9d0d572f034d63af6f5 100644 --- a/mace/python/tools/caffe_converter_lib.py +++ b/mace/python/tools/caffe_converter_lib.py @@ -559,6 +559,7 @@ class CaffeConverter(object): paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None) filter_shape = np.asarray(op.data[0].shape) + filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI input_format = 'NHWC' output_shape = Shapes.conv_pool_shape(