diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h index d68d4f2baf502bd51ae8ab8cc059f55cd7f57cb5..745f174466fefee3683746213247d1e549306309 100644 --- a/mace/kernels/activation.h +++ b/mace/kernels/activation.h @@ -8,6 +8,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/core/types.h" +#include "mace/core/runtime/opencl/cl2_header.h" namespace mace { namespace kernels { @@ -128,6 +129,7 @@ class ActivationFunctor { ActivationType activation_; T relux_max_limit_; T prelu_alpha_; + cl::Kernel kernel_; }; } // namespace kernels diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h index 391ce05aace61c99573e9e1ac221c00b4efd7ed3..0b4828a4491e18be8a1c35d4d82d70d1790d0abf 100644 --- a/mace/kernels/addn.h +++ b/mace/kernels/addn.h @@ -7,6 +7,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/core/runtime/opencl/cl2_header.h" namespace mace { namespace kernels { @@ -42,6 +43,8 @@ template struct AddNFunctor { void operator()(const std::vector &input_tensors, Tensor *output_tensor, StatsFuture *future); + + cl::Kernel kernel_; }; } // namespace kernels diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h index e7499274c0b6bd2612e1b3ae65400d031d896c52..bd8fc7e98b91d3a9668b76b430943570df20be12 100644 --- a/mace/kernels/batch_norm.h +++ b/mace/kernels/batch_norm.h @@ -9,6 +9,7 @@ #include "mace/core/public/mace.h" #include "mace/core/tensor.h" #include "mace/kernels/activation.h" +#include "mace/core/runtime/opencl/cl2_header.h" namespace mace { namespace kernels { @@ -133,6 +134,7 @@ struct BatchNormFunctor : BatchNormFunctorBase { const float epsilon, Tensor *output, StatsFuture *future); + cl::Kernel kernel_; }; } // namepsace kernels diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h index 53b3a8d901dffa982e0854ddc8e1777be1037f9f..5b87026debf2f760e3d82c09251b99ca1426aa3b 100644 --- a/mace/kernels/bias_add.h +++ b/mace/kernels/bias_add.h @@ -8,6 +8,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/core/public/mace.h" +#include "mace/core/runtime/opencl/cl2_header.h" namespace mace { namespace kernels { @@ -63,6 +64,7 @@ struct BiasAddFunctor { const Tensor *bias, Tensor *output, StatsFuture *future); + cl::Kernel kernel_; }; } // namepsace kernels diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h index 4844f6dffcca6084c3146ba37bd23d65e471c007..424a4e917da65793b2a9c3e2f302f7690e66e952 100644 --- a/mace/kernels/concat.h +++ b/mace/kernels/concat.h @@ -10,6 +10,7 @@ #include "mace/core/types.h" #include "mace/core/public/mace.h" #include "mace/core/tensor.h" +#include "mace/core/runtime/opencl/cl2_header.h" namespace mace { namespace kernels { @@ -81,6 +82,7 @@ struct ConcatFunctor : ConcatFunctorBase{ void operator()(const std::vector &input_list, Tensor *output, StatsFuture *future); + cl::Kernel kernel_; }; diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index 628acceb10a21cd67bd62036876b67a1767c29c8..01c55434c85f1c5303cd555c3703861a7bf84e65 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -9,6 +9,7 @@ #include "mace/core/tensor.h" #include "mace/kernels/activation.h" #include "mace/kernels/conv_pool_2d_util.h" +#include "mace/core/runtime/opencl/cl2_header.h" namespace mace { namespace kernels { @@ -176,6 +177,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { const Tensor *bias, Tensor *output, StatsFuture *future); + + cl::Kernel kernel_; }; } // namespace kernels diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h index 92bc4e33dacc0fa339ff618053deb10ac401a534..caff18938e3ff4a4fefd65964cb36fce7fe457d7 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.h @@ -9,6 +9,7 @@ #include "mace/core/future.h" #include "mace/core/public/mace.h" #include "mace/kernels/conv_pool_2d_util.h" +#include "mace/core/runtime/opencl/cl2_header.h" namespace mace { namespace kernels { @@ -57,7 +58,6 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { StatsFuture *future) { MACE_CHECK_NOTNULL(input); MACE_CHECK_NOTNULL(filter); - MACE_CHECK_NOTNULL(bias); MACE_CHECK_NOTNULL(output); // Create a fake conv_2d filter to calculate the paddings and output size @@ -113,7 +113,7 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { Tensor::MappingGuard output_mapper(output); const T *input_ptr = input->data(); const T *filter_ptr = filter->data(); - const T *bias_ptr = bias->data(); + const T *bias_ptr = bias == nullptr ? nullptr : bias->data(); T *output_ptr = output->mutable_data(); #pragma omp parallel for collapse(2) @@ -153,6 +153,10 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { } } } + + output_ptr = output->mutable_data(); + DoActivation(output_ptr, output_ptr, output->NumElements(), activation_, + relux_max_limit_, prelu_alpha_); } }; @@ -178,13 +182,15 @@ struct DepthwiseConv2dFunctor dilations, activation, relux_max_limit, - prelu_alpha) {} + prelu_alpha){} void operator()(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output, StatsFuture *future); + + cl::Kernel kernel_; }; } // namespace kernels diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h index d88e3843888d786291e34ecbfaafdff5cbe5d788..4b486c7280d92f2684d56bd6aed5a6b9477aea61 100644 --- a/mace/kernels/matmul.h +++ b/mace/kernels/matmul.h @@ -7,6 +7,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/core/runtime/opencl/cl2_header.h" namespace mace { namespace kernels { @@ -58,6 +59,8 @@ struct MatMulFunctor { const Tensor *B, Tensor *C, StatsFuture *future); + + cl::Kernel kernel_; }; } // namespace kernels diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc index 17688e9cc3e0083bfae3a5fba1b3e384f2543be1..935b3576020415a92562cefd87ea8456c440e234 100644 --- a/mace/kernels/opencl/activation_opencl.cc +++ b/mace/kernels/opencl/activation_opencl.cc @@ -22,52 +22,60 @@ void ActivationFunctor::operator()(const Tensor *input, const index_t channels = input->dim(3); const index_t channel_blocks = RoundUpDiv4(channels); + std::string tuning_key_prefix; - auto runtime = OpenCLRuntime::Global(); + if (kernel_.get() == nullptr) { + auto runtime = OpenCLRuntime::Global(); - std::set built_options; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); - built_options.emplace("-Dactivation=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - switch (activation_) { - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case PRELU: - built_options.emplace("-DUSE_PRELU"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - defeult: - LOG(FATAL) << "Unknown activation type: " << activation_; + std::set built_options; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); + built_options.emplace("-Dactivation=" + kernel_name); + auto dt = DataTypeToEnum::value; + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + switch (activation_) { + case RELU: + tuning_key_prefix = "relu_opencl_kernel_"; + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + tuning_key_prefix = "relux_opencl_kernel_"; + built_options.emplace("-DUSE_RELUX"); + break; + case PRELU: + tuning_key_prefix = "prelu_opencl_kernel_"; + built_options.emplace("-DUSE_PRELU"); + break; + case TANH: + tuning_key_prefix = "tanh_opencl_kernel_"; + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + tuning_key_prefix = "sigmoid_opencl_kernel_"; + built_options.emplace("-DUSE_SIGMOID"); + break; + defeult: + LOG(FATAL) << "Unknown activation type: " << activation_; + } + kernel_ = + runtime->BuildKernel("activation", kernel_name, built_options); + int idx = 0; + kernel_.setArg( + idx++, *(static_cast(input->buffer()))); + kernel_.setArg(idx++, static_cast(relux_max_limit_)); + kernel_.setArg(idx++, static_cast(prelu_alpha_)); + kernel_.setArg(idx++, + *(static_cast(output->buffer()))); } - cl::Kernel activation_kernel = - runtime->BuildKernel("activation", kernel_name, built_options); - int idx = 0; - activation_kernel.setArg( - idx++, *(static_cast(input->buffer()))); - activation_kernel.setArg(idx++, relux_max_limit_); - activation_kernel.setArg(idx++, prelu_alpha_); - activation_kernel.setArg(idx++, - *(static_cast(output->buffer()))); const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width), static_cast(height * batch)}; const std::vector lws = {8, 16, 8, 1}; std::string tuning_key = - Concat("relu_opencl_kernel_", activation_, output->dim(0), output->dim(1), + Concat(tuning_key_prefix, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(activation_kernel, tuning_key, gws, lws, future); + TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); } template struct ActivationFunctor; diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index c16741d62421252cc888fbda6ff1a9d497a10ec9..df096e7fbf374fde681e281782860e31ba44edde 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -11,53 +11,6 @@ namespace mace { namespace kernels { -template -static void AddN(const std::vector &input_tensors, - Tensor *output, StatsFuture *future) { - if (input_tensors.size() > 4) { - MACE_NOT_IMPLEMENTED; - } - - const index_t batch = output->dim(0); - const index_t height = output->dim(1); - const index_t width = output->dim(2); - const index_t channels = output->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - const index_t width_pixels = channel_blocks * width; - const index_t batch_height_pixels = batch * height; - - auto runtime = OpenCLRuntime::Global(); - std::set built_options; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn"); - built_options.emplace("-Daddn=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size())); - auto addn_kernel = runtime->BuildKernel("addn", kernel_name, built_options); - - uint32_t idx = 0; - for (auto input : input_tensors) { - addn_kernel.setArg(idx++, - *(static_cast(input->buffer()))); - } - addn_kernel.setArg(idx++, *(static_cast(output->buffer()))); - - const uint32_t gws[2] = { - static_cast(width_pixels), - static_cast(batch_height_pixels) - }; - const std::vector lws = {64, 16, 1}; - std::stringstream ss; - ss << "addn_opencl_kernel_" - << output->dim(0) << "_" - << output->dim(1) << "_" - << output->dim(2) << "_" - << output->dim(3); - TuningOrRun2DKernel(addn_kernel, ss.str(), gws, lws, future); -} - template void AddNFunctor::operator()( const std::vector &input_tensors, @@ -84,7 +37,44 @@ void AddNFunctor::operator()( CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); output_tensor->ResizeImage(output_shape, output_image_shape); - AddN(input_tensors, output_tensor, future); + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t width_pixels = channel_blocks * width; + const index_t batch_height_pixels = batch * height; + + if (kernel_.get() == nullptr) { + if (input_tensors.size() > 4) { + MACE_NOT_IMPLEMENTED; + } + auto runtime = OpenCLRuntime::Global(); + std::set built_options; + auto dt = DataTypeToEnum::value; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn"); + built_options.emplace("-Daddn=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size())); + kernel_ = runtime->BuildKernel("addn", kernel_name, built_options); + + uint32_t idx = 0; + for (auto input : input_tensors) { + kernel_.setArg(idx++, + *(static_cast(input->buffer()))); + } + kernel_.setArg(idx++, *(static_cast(output_tensor->buffer()))); + } + + const uint32_t gws[2] = { + static_cast(width_pixels), + static_cast(batch_height_pixels) + }; + const std::vector lws = {64, 16, 1}; + std::stringstream ss; + ss << "addn_opencl_kernel_" + << output_shape[0] << "_" + << output_shape[1] << "_" + << output_shape[2] << "_" + << output_shape[3]; + TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); }; template diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index 5942dcfbfa5f7ed104286ed47107c5fbb0733f9a..3b63c7ac579b902c97361ef4c9ba8b38ab650a38 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -30,55 +30,57 @@ void BatchNormFunctor::operator()(const Tensor *input, const index_t channel_blocks = RoundUpDiv4(channels); - auto runtime = OpenCLRuntime::Global(); - std::set built_options; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); - built_options.emplace("-Dbatch_norm=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (folded_constant_) { - built_options.emplace("-DFOLDED_CONSTANT"); - } - switch (activation_) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case PRELU: - built_options.emplace("-DUSE_PRELU"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - defeult: - LOG(FATAL) << "Unknown activation type: " << activation_; - } + if (kernel_.get() == nullptr) { + auto runtime = OpenCLRuntime::Global(); + std::set built_options; + auto dt = DataTypeToEnum::value; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); + built_options.emplace("-Dbatch_norm=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (folded_constant_) { + built_options.emplace("-DFOLDED_CONSTANT"); + } + switch (activation_) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case PRELU: + built_options.emplace("-DUSE_PRELU"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + defeult: + LOG(FATAL) << "Unknown activation type: " << activation_; + } - auto bm_kernel = - runtime->BuildKernel("batch_norm", kernel_name, built_options); + kernel_ = + runtime->BuildKernel("batch_norm", kernel_name, built_options); - uint32_t idx = 0; - bm_kernel.setArg(idx++, *(static_cast(input->buffer()))); - bm_kernel.setArg(idx++, *(static_cast(scale->buffer()))); - bm_kernel.setArg(idx++, - *(static_cast(offset->buffer()))); - if (!folded_constant_) { - bm_kernel.setArg(idx++, - *(static_cast(mean->buffer()))); - bm_kernel.setArg(idx++, *(static_cast(var->buffer()))); - bm_kernel.setArg(idx++, epsilon); + uint32_t idx = 0; + kernel_.setArg(idx++, *(static_cast(input->buffer()))); + kernel_.setArg(idx++, *(static_cast(scale->buffer()))); + kernel_.setArg(idx++, + *(static_cast(offset->buffer()))); + if (!folded_constant_) { + kernel_.setArg(idx++, + *(static_cast(mean->buffer()))); + kernel_.setArg(idx++, *(static_cast(var->buffer()))); + kernel_.setArg(idx++, epsilon); + } + kernel_.setArg(idx++, *(static_cast(output->buffer()))); + kernel_.setArg(idx++, relux_max_limit_); + kernel_.setArg(idx++, prelu_alpha_); } - bm_kernel.setArg(idx++, *(static_cast(output->buffer()))); - bm_kernel.setArg(idx++, relux_max_limit_); - bm_kernel.setArg(idx++, prelu_alpha_); const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width), @@ -87,7 +89,7 @@ void BatchNormFunctor::operator()(const Tensor *input, std::string tuning_key = Concat("batch_norm_opencl_kernel_", activation_, output->dim(0), output->dim(1), output->dim(2), output->dim(3), folded_constant_); - TuningOrRun3DKernel(bm_kernel, tuning_key, gws, lws, future); + TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); } template struct BatchNormFunctor; diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc index b85bb860534ef362b4f61332308fb48d3f915bfc..84eff1bfabcaad80d913fdd1aa0a73279883e4ad 100644 --- a/mace/kernels/opencl/bias_add_opencl.cc +++ b/mace/kernels/opencl/bias_add_opencl.cc @@ -24,30 +24,30 @@ void BiasAddFunctor::operator()( const index_t channel_blocks = RoundUpDiv4(channels); + auto runtime = OpenCLRuntime::Global(); + if (kernel_.get() == nullptr) { + std::set built_options; + auto dt = DataTypeToEnum::value; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); + built_options.emplace("-Dbias_add=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options); + + uint32_t idx = 0; + kernel_.setArg(idx++, *(static_cast(input->buffer()))); + kernel_.setArg(idx++, *(static_cast(bias->buffer()))); + kernel_.setArg(idx++, *(static_cast(output->buffer()))); + } + const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width), static_cast(height * batch)}; - - auto runtime = OpenCLRuntime::Global(); - std::set built_options; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); - built_options.emplace("-Dbias_add=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - auto bias_kernel = runtime->BuildKernel("bias_add", kernel_name, built_options); - - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bias_kernel); const std::vector lws = {8, 16, 8}; - uint32_t idx = 0; - bias_kernel.setArg(idx++, *(static_cast(input->buffer()))); - bias_kernel.setArg(idx++, *(static_cast(bias->buffer()))); - bias_kernel.setArg(idx++, *(static_cast(output->buffer()))); - cl::Event event; cl_int error = runtime->command_queue().enqueueNDRangeKernel( - bias_kernel, cl::NullRange, + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); diff --git a/mace/kernels/opencl/cl/activation.cl b/mace/kernels/opencl/cl/activation.cl index 02ebff0aaeeb6a416f79b14203896ea40b512fe7..fe8619e20b7f567bc36dfa4bc5d6c53bd5f792fb 100644 --- a/mace/kernels/opencl/cl/activation.cl +++ b/mace/kernels/opencl/cl/activation.cl @@ -1,8 +1,8 @@ #include __kernel void activation(__read_only image2d_t input, - __private const DATA_TYPE relux_max_limit, - __private const DATA_TYPE prelu_alpha, + __private const float relux_max_limit, + __private const float prelu_alpha, __write_only image2d_t output) { const int ch_blk = get_global_id(0); const int w = get_global_id(1); diff --git a/mace/kernels/opencl/cl/depthwise_conv2d.cl b/mace/kernels/opencl/cl/depthwise_conv2d.cl index 280ba54c8fda24181df63d832d438c0c02efb502..d9c94007d8dbb0f212c2e8aac8f346a0d4c776a7 100644 --- a/mace/kernels/opencl/cl/depthwise_conv2d.cl +++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl @@ -2,24 +2,24 @@ // Only multiplier = 1 is supported __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ - __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ + __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ #ifdef BIAS - __read_only image2d_t bias, /* cout%4 * cout/4 */ + __read_only image2d_t bias, /* cout%4 * cout/4 */ #endif - __write_only image2d_t output, - __private const DATA_TYPE relux_max_limit, - __private const DATA_TYPE prelu_alpha, - __private const short in_height, - __private const short in_width, - __private const short in_ch_blks, - __private const short out_height, - __private const short out_width, - __private const short filter_height, - __private const short filter_width, - __private const short padding_top, - __private const short padding_left, - __private const short dilation_h, - __private const short dilation_w) { + __write_only image2d_t output, + __private const DATA_TYPE relux_max_limit, + __private const DATA_TYPE prelu_alpha, + __private const short in_height, + __private const short in_width, + __private const short in_ch_blks, + __private const short out_height, + __private const short out_width, + __private const short filter_height, + __private const short filter_width, + __private const short padding_top, + __private const short padding_left, + __private const short dilation_h, + __private const short dilation_w) { const short out_ch_blk = get_global_id(0); const short out_w_blk = get_global_id(1); const short out_w_blks = get_global_size(1); @@ -52,7 +52,6 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left; int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left; int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left; - int in_width4 = ((out_w_blk + (out_w_blks << 2)) << 1) - padding_left; const int height_idx = (out_h << 1) - padding_top; #else const short in_width_stride = mul24(out_w_blks, STRIDE); @@ -90,7 +89,7 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h READ_INPUT(3); #undef READ_INPUT - + DATA_TYPE4 weights = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx, in_ch_blk)); @@ -127,3 +126,120 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h if (w >= out_width) return; WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3); } + +__kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ + __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ +#ifdef BIAS + __read_only image2d_t bias, /* cout%4 * cout/4 */ +#endif + __write_only image2d_t output, + __private const DATA_TYPE relux_max_limit, + __private const DATA_TYPE prelu_alpha, + __private const short in_height, + __private const short in_width, + __private const short in_ch_blks, + __private const short out_height, + __private const short out_width, + __private const short filter_height, + __private const short filter_width, + __private const short padding_top, + __private const short padding_left) { + const short out_ch_blk = get_global_id(0); + const short out_w_blk = get_global_id(1) << 2; + const short out_hb = get_global_id(2); + const short rounded_in_ch = in_ch_blks << 2; + const short in_ch_blk = out_ch_blk; // multiplier = 1 + +#ifdef BIAS + DATA_TYPE4 out0 = + READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0)); + DATA_TYPE4 out1 = out0; + DATA_TYPE4 out2 = out0; + DATA_TYPE4 out3 = out0; +#else + DATA_TYPE4 out0 = 0; + DATA_TYPE4 out1 = 0; + DATA_TYPE4 out2 = 0; + DATA_TYPE4 out3 = 0; +#endif + + const short out_h = out_hb % out_height; + const short in_width0 = out_w_blk - padding_left; + const short in_width1 = in_width0 + 1; + const short in_width2 = in_width1 + 1; + const short in_width3 = in_width2 + 1; + const short height_idx = out_h - padding_top; + + const short batch_idx = mul24((out_hb / out_height), in_height); + const short rounded_in_ch_x_filter_width = mul24(rounded_in_ch, filter_width); + + const short in_idx = mul24(in_ch_blk, in_width); + short filter_idx = 0; + short in_hb_idx = height_idx; + + const short in_w_idx0 = select(in_idx + in_width0, + -1, + (in_width0 < 0 || in_width0 >= in_width)); + const short in_w_idx1 = select(in_idx + in_width1, + -1, + (in_width1 < 0 || in_width1 >= in_width)); + const short in_w_idx2 = select(in_idx + in_width2, + -1, + (in_width2 < 0 || in_width2 >= in_width)); + + short in_w; + DATA_TYPE4 in0, in1, in2, in3; + for (short filter_h_idx = 0; filter_h_idx < filter_height; ++filter_h_idx) { + short in_hb = select(in_hb_idx + batch_idx, + -1, + (in_hb_idx < 0 || in_hb_idx >= in_height)); + in1 = READ_IMAGET(input, SAMPLER, (int2)(in_w_idx0, in_hb)); + in2 = READ_IMAGET(input, SAMPLER, (int2)(in_w_idx1, in_hb)); + in3 = READ_IMAGET(input, SAMPLER, (int2)(in_w_idx2, in_hb)); + + for (short filter_w_idx = 0; filter_w_idx < filter_width; ++filter_w_idx) { + in0 = in1; + in1 = in2; + in2 = in3; + + in_w = in_width3 + filter_w_idx; + in_w = select(in_idx + in_w, + -1, + (in_w < 0 || in_w >= in_width)); + in3 = READ_IMAGET(input, SAMPLER, (int2)(in_w, in_hb)); + + DATA_TYPE4 weights = READ_IMAGET(filter, SAMPLER, + (int2)(filter_idx, in_ch_blk)); + + out0 = mad(in0, weights, out0); + out1 = mad(in1, weights, out1); + out2 = mad(in2, weights, out2); + out3 = mad(in3, weights, out3); + ++filter_idx; + } + in_hb_idx += 1; + } + +#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID) + out0 = do_activation(out0, relux_max_limit, prelu_alpha); + out1 = do_activation(out1, relux_max_limit, prelu_alpha); + out2 = do_activation(out2, relux_max_limit, prelu_alpha); + out3 = do_activation(out3, relux_max_limit, prelu_alpha); +#endif + + const short out_x_base = mul24(out_ch_blk, out_width); + short w = out_w_blk; + WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0); + + w += 1; + if (w >= out_width) return; + WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1); + + w += 1; + if (w >= out_width) return; + WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2); + + w += 1; + if (w >= out_width) return; + WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3); +} diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index 48f14bea8f313e19d483fd3c1b6e57f483382413..b1712d149817244fa80df61f4ae96df8971c707f 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -11,7 +11,8 @@ namespace mace { namespace kernels { -static void Concat2(const Tensor *input0, +static void Concat2(cl::Kernel *kernel, + const Tensor *input0, const Tensor *input1, const DataType dt, Tensor *output, @@ -23,27 +24,29 @@ static void Concat2(const Tensor *input0, const int channel_blk = RoundUpDiv4(channel); - auto runtime = OpenCLRuntime::Global(); - std::set built_options; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); - built_options.emplace("-Dconcat_channel=" + kernel_name); - if (input0->dtype() == output->dtype()) { - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - } else { - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - } - if (input0->dim(3) % 4 == 0) { - built_options.emplace("-DDIVISIBLE_FOUR"); - } - auto concat_kernel = runtime->BuildKernel("concat", kernel_name, built_options); + if (kernel->get() == nullptr) { + auto runtime = OpenCLRuntime::Global(); + std::set built_options; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); + built_options.emplace("-Dconcat_channel=" + kernel_name); + if (input0->dtype() == output->dtype()) { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + } else { + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + } + if (input0->dim(3) % 4 == 0) { + built_options.emplace("-DDIVISIBLE_FOUR"); + } + *kernel = runtime->BuildKernel("concat", kernel_name, built_options); - uint32_t idx = 0; - concat_kernel.setArg(idx++, *(static_cast(input0->buffer()))); - concat_kernel.setArg(idx++, *(static_cast(input1->buffer()))); - concat_kernel.setArg(idx++, static_cast(input0->dim(3))); - concat_kernel.setArg(idx++, *(static_cast(output->buffer()))); + uint32_t idx = 0; + kernel->setArg(idx++, *(static_cast(input0->buffer()))); + kernel->setArg(idx++, *(static_cast(input1->buffer()))); + kernel->setArg(idx++, static_cast(input0->dim(3))); + kernel->setArg(idx++, *(static_cast(output->buffer()))); + } const uint32_t gws[3] = { static_cast(channel_blk), @@ -57,7 +60,7 @@ static void Concat2(const Tensor *input0, << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - TuningOrRun3DKernel(concat_kernel, ss.str(), gws, lws, future); + TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future); } template @@ -90,7 +93,7 @@ void ConcatFunctor::operator()(const std::vector::value, + Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum::value, output, future); break; default:MACE_NOT_IMPLEMENTED; diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc index 03883d3a6fac483298fcfbc15d39564ad5ae8c06..5a6d93cbf29e3417c93b02d40d00e3925142c249 100644 --- a/mace/kernels/opencl/conv_2d_opencl.cc +++ b/mace/kernels/opencl/conv_2d_opencl.cc @@ -3,64 +3,44 @@ // #include "mace/kernels/conv_2d.h" -#include "mace/kernels/activation.h" #include "mace/kernels/opencl/helper.h" namespace mace { namespace kernels { -extern void Conv2dOpenclK1x1S1(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float prelu_alpha, - const DataType dt, - Tensor *output, - StatsFuture *future); +extern void Conv2dOpenclK1x1(cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha, + const DataType dt, + Tensor *output, + StatsFuture *future); -extern void Conv2dOpenclK1x1S2(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float prelu_alpha, - const DataType dt, - Tensor *output, - StatsFuture *future); +extern void Conv2dOpenclK3x3(cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha, + const DataType dt, + Tensor *output, + StatsFuture *future); -extern void Conv2dOpenclK3x3S1(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float prelu_alpha, - const DataType dt, - Tensor *output, - StatsFuture *future); - -extern void Conv2dOpenclK3x3S2(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float prelu_alpha, - const DataType dt, - Tensor *output, - StatsFuture *future); - -extern void Conv2dOpencl(const Tensor *input, +extern void Conv2dOpencl(cl::Kernel *kernel, + const Tensor *input, const Tensor *filter, const Tensor *bias, - const uint32_t stride, + const int stride, const int *padding, const int *dilations, const ActivationType activation, @@ -70,24 +50,21 @@ extern void Conv2dOpencl(const Tensor *input, Tensor *output, StatsFuture *future); -template +template void Conv2dFunctor::operator()(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output, StatsFuture *future) { typedef void (*Conv2dOpenclFunction)( - const Tensor *input, const Tensor *filter, const Tensor *bias, + cl::Kernel *kernel, + const Tensor *input, const Tensor *filter, const Tensor *bias, const int stride, const int *padding, const int *dilations, const ActivationType activation, const float relux_max_limit, const float prelu_alpha, const DataType dt, Tensor *output, StatsFuture *future); // Selection matrix: kernel_size x stride_size - static const Conv2dOpenclFunction selector[5][2] = { - {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2}, - {nullptr, nullptr}, - {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2}, - {nullptr, nullptr}, - {nullptr, nullptr}}; + static const Conv2dOpenclFunction selector[5] = + {Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr}; index_t kernel_h = filter->dim(0); index_t kernel_w = filter->dim(1); @@ -113,20 +90,23 @@ void Conv2dFunctor::operator()(const Tensor *input, output->ResizeImage(output_shape, output_image_shape); if (kernel_h == kernel_w && kernel_h <= 5 && - selector[kernel_h - 1][strides_[0] - 1] != nullptr) { - auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; - conv2d_func(input, filter, bias, paddings.data(), dilations_, activation_, + selector[kernel_h - 1] != nullptr && + 0 < strides_[0] && strides_[0] < 3 ) { + auto conv2d_func = selector[kernel_h - 1]; + conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, prelu_alpha_, DataTypeToEnum::value, output, future); } else { - Conv2dOpencl(input, filter, bias, strides_[0], paddings.data(), dilations_, + Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, prelu_alpha_, DataTypeToEnum::value, output, future); } } -template struct Conv2dFunctor; -template struct Conv2dFunctor; +template +struct Conv2dFunctor; +template +struct Conv2dFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index d96f80ceea72699b4e762ae6cacb48dc5c3e4eb8..013df8509081c708849f0f2853141b2d0e8e2256 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -3,26 +3,26 @@ // #include "mace/kernels/conv_2d.h" -#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/activation.h" #include "mace/kernels/opencl/helper.h" #include "mace/utils/tuner.h" -#include "mace/utils/utils.h" namespace mace { namespace kernels { -void Conv1x1(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const ActivationType activation, - const float relux_max_limit, - const float prelu_alpha, - const DataType dt, - Tensor *output, - StatsFuture *future) { +extern void Conv2dOpenclK1x1(cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha, + const DataType dt, + Tensor *output, + StatsFuture *future) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -36,62 +36,64 @@ void Conv1x1(const Tensor *input, const index_t width_blocks = RoundUpDiv4(width); const index_t input_channel_blocks = RoundUpDiv4(input_channels); - MACE_CHECK(input_batch == batch); + if (kernel->get() == nullptr) { + MACE_CHECK(input_batch == batch); - std::set built_options; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1"); - built_options.emplace("-Dconv_2d_1x1=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - built_options.emplace("-DSTRIDE=" + ToString(stride)); - if (bias != nullptr) { - built_options.emplace("-DBIAS"); - } - switch (activation) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case PRELU: - built_options.emplace("-DUSE_PRELU"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - defeult: - LOG(FATAL) << "Unknown activation type: " << activation; - } + std::set built_options; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1"); + built_options.emplace("-Dconv_2d_1x1=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DSTRIDE=" + ToString(stride)); + if (bias != nullptr) { + built_options.emplace("-DBIAS"); + } + switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case PRELU: + built_options.emplace("-DUSE_PRELU"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + defeult: + LOG(FATAL) << "Unknown activation type: " << activation; + } - auto runtime = OpenCLRuntime::Global(); - auto conv_2d_kernel = - runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options); + auto runtime = OpenCLRuntime::Global(); + *kernel = + runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options); - uint32_t idx = 0; - conv_2d_kernel.setArg(idx++, - *(static_cast(input->buffer()))); - conv_2d_kernel.setArg(idx++, - *(static_cast(filter->buffer()))); - if (bias != nullptr) { - conv_2d_kernel.setArg(idx++, - *(static_cast(bias->buffer()))); + uint32_t idx = 0; + kernel->setArg(idx++, + *(static_cast(input->buffer()))); + kernel->setArg(idx++, + *(static_cast(filter->buffer()))); + if (bias != nullptr) { + kernel->setArg(idx++, + *(static_cast(bias->buffer()))); + } + kernel->setArg(idx++, + *(static_cast(output->buffer()))); + // FIXME handle flexable data type: half not supported + kernel->setArg(idx++, relux_max_limit); + kernel->setArg(idx++, prelu_alpha); + kernel->setArg(idx++, static_cast(input_height)); + kernel->setArg(idx++, static_cast(input_width)); + kernel->setArg(idx++, static_cast(input_channel_blocks)); + kernel->setArg(idx++, static_cast(height)); + kernel->setArg(idx++, static_cast(width)); } - conv_2d_kernel.setArg(idx++, - *(static_cast(output->buffer()))); - // FIXME handle flexable data type: half not supported - conv_2d_kernel.setArg(idx++, relux_max_limit); - conv_2d_kernel.setArg(idx++, prelu_alpha); - conv_2d_kernel.setArg(idx++, static_cast(input_height)); - conv_2d_kernel.setArg(idx++, static_cast(input_width)); - conv_2d_kernel.setArg(idx++, static_cast(input_channel_blocks)); - conv_2d_kernel.setArg(idx++, static_cast(height)); - conv_2d_kernel.setArg(idx++, static_cast(width)); const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width_blocks), @@ -100,38 +102,9 @@ void Conv1x1(const Tensor *input, std::string tuning_key = Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future); + TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); } -extern void Conv2dOpenclK1x1S1(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float prelu_alpha, - const DataType dt, - Tensor *output, - StatsFuture *future) { - Conv1x1(input, filter, bias, 1, activation, relux_max_limit, prelu_alpha, dt, - output, future); -}; - -extern void Conv2dOpenclK1x1S2(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float prelu_alpha, - const DataType dt, - Tensor *output, - StatsFuture *future) { - Conv1x1(input, filter, bias, 2, activation, relux_max_limit, prelu_alpha, dt, - output, future); -}; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index 853dfe0a008080f60dcbdfca6c1bb1bcc0534f27..13e367e37705d1f1a241be066bf1649d835b49f9 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -13,18 +13,19 @@ namespace mace { namespace kernels { -static void Conv2d3x3S12(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const uint32_t stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float prelu_alpha, - const DataType dt, - Tensor *output, - StatsFuture *future) { +extern void Conv2dOpenclK3x3(cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha, + const DataType dt, + Tensor *output, + StatsFuture *future) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -35,61 +36,63 @@ static void Conv2d3x3S12(const Tensor *input, const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t width_blocks = RoundUpDiv(width); - std::set built_options; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); - built_options.emplace("-Dconv_2d_3x3=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - built_options.emplace(bias != nullptr ? "-DBIAS" : ""); - built_options.emplace("-DSTRIDE=" + ToString(stride)); - switch (activation) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case PRELU: - built_options.emplace("-DUSE_PRELU"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - defeult: - LOG(FATAL) << "Unknown activation type: " << activation; - } + if (kernel->get() == nullptr) { + std::set built_options; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); + built_options.emplace("-Dconv_2d_3x3=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + built_options.emplace("-DSTRIDE=" + ToString(stride)); + switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case PRELU: + built_options.emplace("-DUSE_PRELU"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + defeult: + LOG(FATAL) << "Unknown activation type: " << activation; + } - auto runtime = OpenCLRuntime::Global(); - auto conv_2d_kernel = - runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options); + auto runtime = OpenCLRuntime::Global(); + *kernel = + runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options); - uint32_t idx = 0; - conv_2d_kernel.setArg(idx++, - *(static_cast(input->buffer()))); - conv_2d_kernel.setArg(idx++, - *(static_cast(filter->buffer()))); - if (bias != nullptr) { - conv_2d_kernel.setArg(idx++, - *(static_cast(bias->buffer()))); + uint32_t idx = 0; + kernel->setArg(idx++, + *(static_cast(input->buffer()))); + kernel->setArg(idx++, + *(static_cast(filter->buffer()))); + if (bias != nullptr) { + kernel->setArg(idx++, + *(static_cast(bias->buffer()))); + } + kernel->setArg(idx++, + *(static_cast(output->buffer()))); + kernel->setArg(idx++, relux_max_limit); + kernel->setArg(idx++, prelu_alpha); + kernel->setArg(idx++, static_cast(input->dim(1))); + kernel->setArg(idx++, static_cast(input->dim(2))); + kernel->setArg(idx++, static_cast(input_channel_blocks)); + kernel->setArg(idx++, static_cast(height)); + kernel->setArg(idx++, static_cast(width)); + kernel->setArg(idx++, padding[0] / 2); + kernel->setArg(idx++, padding[1] / 2); + kernel->setArg(idx++, dilations[0]); + kernel->setArg(idx++, dilations[1]); } - conv_2d_kernel.setArg(idx++, - *(static_cast(output->buffer()))); - conv_2d_kernel.setArg(idx++, relux_max_limit); - conv_2d_kernel.setArg(idx++, prelu_alpha); - conv_2d_kernel.setArg(idx++, static_cast(input->dim(1))); - conv_2d_kernel.setArg(idx++, static_cast(input->dim(2))); - conv_2d_kernel.setArg(idx++, static_cast(input_channel_blocks)); - conv_2d_kernel.setArg(idx++, static_cast(height)); - conv_2d_kernel.setArg(idx++, static_cast(width)); - conv_2d_kernel.setArg(idx++, padding[0] / 2); - conv_2d_kernel.setArg(idx++, padding[1] / 2); - conv_2d_kernel.setArg(idx++, dilations[0]); - conv_2d_kernel.setArg(idx++, dilations[1]); const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width_blocks), @@ -98,37 +101,8 @@ static void Conv2d3x3S12(const Tensor *input, std::string tuning_key = Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future); + TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); } -void Conv2dOpenclK3x3S1(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float prelu_alpha, - const DataType dt, - Tensor *output, - StatsFuture *future) { - Conv2d3x3S12(input, filter, bias, 1, padding, dilations, activation, - relux_max_limit, prelu_alpha, dt, output, future); -}; - -void Conv2dOpenclK3x3S2(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float prelu_alpha, - const DataType dt, - Tensor *output, - StatsFuture *future) { - Conv2d3x3S12(input, filter, bias, 2, padding, dilations, activation, - relux_max_limit, prelu_alpha, dt, output, future); -}; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc index 1bff90f65ba8e08a46cf1f283e969f26a4e72a19..ef6401d35e421a0c1f2dc72d4d3a02d4e0934db6 100644 --- a/mace/kernels/opencl/conv_2d_opencl_general.cc +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -13,18 +13,19 @@ namespace mace { namespace kernels { -void Conv2dOpencl(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const uint32_t stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float prelu_alpha, - const DataType dt, - Tensor *output, - StatsFuture *future) { +extern void Conv2dOpencl(cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha, + const DataType dt, + Tensor *output, + StatsFuture *future) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -35,64 +36,66 @@ void Conv2dOpencl(const Tensor *input, const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t width_blocks = RoundUpDiv4(width); - std::set built_options; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); - built_options.emplace("-Dconv_2d=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - built_options.emplace(bias != nullptr ? "-DBIAS" : ""); - built_options.emplace("-DSTRIDE=" + ToString(stride)); - switch (activation) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case PRELU: - built_options.emplace("-DUSE_PRELU"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - defeult: - LOG(FATAL) << "Unknown activation type: " << activation; - } + if (kernel->get() == nullptr) { + std::set built_options; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); + built_options.emplace("-Dconv_2d=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + built_options.emplace("-DSTRIDE=" + ToString(stride)); + switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case PRELU: + built_options.emplace("-DUSE_PRELU"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + defeult: + LOG(FATAL) << "Unknown activation type: " << activation; + } - auto runtime = OpenCLRuntime::Global(); - auto conv_2d_kernel = - runtime->BuildKernel("conv_2d", kernel_name, built_options); + auto runtime = OpenCLRuntime::Global(); + *kernel = + runtime->BuildKernel("conv_2d", kernel_name, built_options); - uint32_t idx = 0; - conv_2d_kernel.setArg(idx++, - *(static_cast(input->buffer()))); - conv_2d_kernel.setArg(idx++, - *(static_cast(filter->buffer()))); - if (bias != nullptr) { - conv_2d_kernel.setArg(idx++, - *(static_cast(bias->buffer()))); + uint32_t idx = 0; + kernel->setArg(idx++, + *(static_cast(input->buffer()))); + kernel->setArg(idx++, + *(static_cast(filter->buffer()))); + if (bias != nullptr) { + kernel->setArg(idx++, + *(static_cast(bias->buffer()))); + } + kernel->setArg(idx++, + *(static_cast(output->buffer()))); + kernel->setArg(idx++, relux_max_limit); + kernel->setArg(idx++, prelu_alpha); + kernel->setArg(idx++, static_cast(input->dim(1))); + kernel->setArg(idx++, static_cast(input->dim(2))); + kernel->setArg(idx++, static_cast(input_channel_blocks)); + kernel->setArg(idx++, static_cast(height)); + kernel->setArg(idx++, static_cast(width)); + kernel->setArg(idx++, static_cast(filter->dim(0))); + kernel->setArg(idx++, static_cast(filter->dim(1))); + kernel->setArg(idx++, static_cast(stride)); + kernel->setArg(idx++, padding[0] / 2); + kernel->setArg(idx++, padding[1] / 2); + kernel->setArg(idx++, dilations[0]); + kernel->setArg(idx++, dilations[1]); } - conv_2d_kernel.setArg(idx++, - *(static_cast(output->buffer()))); - conv_2d_kernel.setArg(idx++, relux_max_limit); - conv_2d_kernel.setArg(idx++, prelu_alpha); - conv_2d_kernel.setArg(idx++, static_cast(input->dim(1))); - conv_2d_kernel.setArg(idx++, static_cast(input->dim(2))); - conv_2d_kernel.setArg(idx++, static_cast(input_channel_blocks)); - conv_2d_kernel.setArg(idx++, static_cast(height)); - conv_2d_kernel.setArg(idx++, static_cast(width)); - conv_2d_kernel.setArg(idx++, static_cast(filter->dim(0))); - conv_2d_kernel.setArg(idx++, static_cast(filter->dim(1))); - conv_2d_kernel.setArg(idx++, static_cast(stride)); - conv_2d_kernel.setArg(idx++, padding[0] / 2); - conv_2d_kernel.setArg(idx++, padding[1] / 2); - conv_2d_kernel.setArg(idx++, dilations[0]); - conv_2d_kernel.setArg(idx++, dilations[1]); const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width_blocks), @@ -101,7 +104,7 @@ void Conv2dOpencl(const Tensor *input, std::string tuning_key = Concat("conv2d_general_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future); + TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); } } // namespace kernels diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc index ef4f3b4f40740cf1e25bd55b1b57ae4508a532c4..79c9196d94021ebf32ef0d96e2e46c4e0bfdd475 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl.cc +++ b/mace/kernels/opencl/depthwise_conv_opencl.cc @@ -11,7 +11,8 @@ namespace mace { namespace kernels { -void DepthwiseConv2d(const Tensor *input, // NHWC +void DepthwiseConv2d(cl::Kernel *kernel, + const Tensor *input, // NHWC const Tensor *filter, // HWIM const Tensor *bias, const int stride, @@ -28,80 +29,88 @@ void DepthwiseConv2d(const Tensor *input, // NHWC const index_t width = output->dim(2); const index_t channels = output->dim(3); - const index_t input_batch = input->dim(0); - const index_t input_height = input->dim(1); - const index_t input_width = input->dim(2); const index_t input_channels = input->dim(3); - - const index_t filter_height = filter->dim(0); - const index_t filter_width = filter->dim(1); const index_t multiplier = filter->dim(3); - MACE_CHECK(multiplier == 1, "Multiplier > 1 not supported"); - MACE_CHECK(multiplier * input_channels == channels); - MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=", - input_channels); const index_t channel_blocks = RoundUpDiv4(channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t width_blocks = RoundUpDiv4(width); - - auto runtime = OpenCLRuntime::Global(); - std::set built_options; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); - built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - built_options.emplace(bias != nullptr ? "-DBIAS" : ""); - built_options.emplace("-DSTRIDE=" + ToString(stride)); - switch (activation) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case PRELU: - built_options.emplace("-DUSE_PRELU"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - defeult: - LOG(FATAL) << "Unknown activation type: " << activation; - } - - auto dw_conv2d_kernel = - runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options); - - uint32_t idx = 0; - dw_conv2d_kernel.setArg(idx++, - *(static_cast(input->buffer()))); - dw_conv2d_kernel.setArg( - idx++, *(static_cast(filter->buffer()))); - if (bias != nullptr) { - dw_conv2d_kernel.setArg( - idx++, *(static_cast(bias->buffer()))); + if(kernel->get() == nullptr) { + const index_t input_batch = input->dim(0); + const index_t input_height = input->dim(1); + const index_t input_width = input->dim(2); + + const index_t filter_height = filter->dim(0); + const index_t filter_width = filter->dim(1); + MACE_CHECK(multiplier == 1, "Multiplier > 1 not supported"); + MACE_CHECK(multiplier * input_channels == channels); + MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=", + input_channels); + + auto runtime = OpenCLRuntime::Global(); + std::set built_options; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); + if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) { + kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d_s1"); + built_options.emplace("-Ddepthwise_conv2d_s1=" + kernel_name); + } else { + built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); + } + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + built_options.emplace("-DSTRIDE=" + ToString(stride)); + switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case PRELU: + built_options.emplace("-DUSE_PRELU"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + defeult: + LOG(FATAL) << "Unknown activation type: " << activation; + } + + *kernel = runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options); + + uint32_t idx = 0; + kernel->setArg(idx++, + *(static_cast(input->buffer()))); + kernel->setArg( + idx++, *(static_cast(filter->buffer()))); + if (bias != nullptr) { + kernel->setArg( + idx++, *(static_cast(bias->buffer()))); + } + kernel->setArg( + idx++, *(static_cast(output->buffer()))); + kernel->setArg(idx++, relux_max_limit); + kernel->setArg(idx++, prelu_alpha); + kernel->setArg(idx++, static_cast(input_height)); + kernel->setArg(idx++, static_cast(input_width)); + kernel->setArg(idx++, static_cast(input_channel_blocks)); + kernel->setArg(idx++, static_cast(height)); + kernel->setArg(idx++, static_cast(width)); + kernel->setArg(idx++, static_cast(filter_height)); + kernel->setArg(idx++, static_cast(filter_width)); + kernel->setArg(idx++, static_cast(paddings[0] / 2)); + kernel->setArg(idx++, static_cast(paddings[1] / 2)); + if (stride != 1 || dilations[0] != 1 || dilations[1] != 1) { + kernel->setArg(idx++, static_cast(dilations[0])); + kernel->setArg(idx++, static_cast(dilations[1])); + } } - dw_conv2d_kernel.setArg( - idx++, *(static_cast(output->buffer()))); - dw_conv2d_kernel.setArg(idx++, relux_max_limit); - dw_conv2d_kernel.setArg(idx++, prelu_alpha); - dw_conv2d_kernel.setArg(idx++, static_cast(input_height)); - dw_conv2d_kernel.setArg(idx++, static_cast(input_width)); - dw_conv2d_kernel.setArg(idx++, static_cast(input_channel_blocks)); - dw_conv2d_kernel.setArg(idx++, static_cast(height)); - dw_conv2d_kernel.setArg(idx++, static_cast(width)); - dw_conv2d_kernel.setArg(idx++, static_cast(filter_height)); - dw_conv2d_kernel.setArg(idx++, static_cast(filter_width)); - dw_conv2d_kernel.setArg(idx++, static_cast(paddings[0] / 2)); - dw_conv2d_kernel.setArg(idx++, static_cast(paddings[1] / 2)); - dw_conv2d_kernel.setArg(idx++, static_cast(dilations[0])); - dw_conv2d_kernel.setArg(idx++, static_cast(dilations[1])); const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width_blocks), @@ -109,7 +118,7 @@ void DepthwiseConv2d(const Tensor *input, // NHWC const std::vector lws = {8, 16, 8, 1}; std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation, batch, height, width, channels, multiplier); - TuningOrRun3DKernel(dw_conv2d_kernel, tuning_key, gws, lws, future); + TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); } template @@ -153,7 +162,7 @@ void DepthwiseConv2dFunctor::operator()( CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); output->ResizeImage(output_shape, output_image_shape); - DepthwiseConv2d(input, filter, bias, strides_[0], paddings.data(), dilations_, + DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, prelu_alpha_, DataTypeToEnum::value, output, future); } diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 84e102e08b3f3d78e18a07a85809b59df5b49021..451a19d068121bd55c7af4366d76a8afedfdecb6 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -121,18 +121,24 @@ std::vector CalWinogradShape(const std::vector &shape, std::string DtToCLDt(const DataType dt) { switch (dt) { - case DT_FLOAT:return "float"; - case DT_HALF:return "half"; - default:LOG(FATAL) << "Unsupported data type"; + case DT_FLOAT: + return "float"; + case DT_HALF: + return "half"; + default: + LOG(FATAL) << "Unsupported data type"; return ""; } } std::string DtToCLCMDDt(const DataType dt) { switch (dt) { - case DT_FLOAT:return "f"; - case DT_HALF:return "h"; - default:LOG(FATAL) << "Not supported data type for opencl cmd data type"; + case DT_FLOAT: + return "f"; + case DT_HALF: + return "h"; + default: + LOG(FATAL) << "Not supported data type for opencl cmd data type"; return ""; } } @@ -140,8 +146,10 @@ std::string DtToCLCMDDt(const DataType dt) { std::string DtToUpstreamCLDt(const DataType dt) { switch (dt) { case DT_FLOAT: - case DT_HALF:return "float"; - default:LOG(FATAL) << "Unsupported data type"; + case DT_HALF: + return "float"; + default: + LOG(FATAL) << "Unsupported data type"; return ""; } } @@ -149,8 +157,10 @@ std::string DtToUpstreamCLDt(const DataType dt) { std::string DtToUpstreamCLCMDDt(const DataType dt) { switch (dt) { case DT_FLOAT: - case DT_HALF:return "f"; - default:LOG(FATAL) << "Not supported data type for opencl cmd data type"; + case DT_HALF: + return "f"; + default: + LOG(FATAL) << "Not supported data type for opencl cmd data type"; return ""; } } @@ -161,8 +171,8 @@ void TuningOrRun3DKernel(cl::Kernel &kernel, const std::vector &lws, StatsFuture *future) { auto runtime = OpenCLRuntime::Global(); - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel); auto params_generator = [&]() -> std::vector> { + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel); std::vector local_ws(3, 0); local_ws[0] = std::min(gws[0], kwg_size); local_ws[1] = std::min(gws[1], kwg_size / local_ws[0]); @@ -258,8 +268,8 @@ void TuningOrRun2DKernel(cl::Kernel &kernel, const std::vector &lws, StatsFuture *future) { auto runtime = OpenCLRuntime::Global(); - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel); auto params_generator = [&]() -> std::vector> { + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel); uint32_t local_ws[2]; local_ws[0] = std::min(gws[0], kwg_size); local_ws[1] = std::min(gws[1], kwg_size / local_ws[0]); diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index 4406308eace8e424c2b5a2d6cf4f449d797f2ab2..44a92c611c4159248a1028eabf3a2b2903bbe76a 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -29,26 +29,28 @@ void MatMulFunctor::operator()( const index_t height_blocks = RoundUpDiv4(height); const index_t width_blocks = RoundUpDiv4(width); - auto runtime = OpenCLRuntime::Global(); - std::set built_options; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); - built_options.emplace("-Dmatmul=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - auto matmul_kernel = runtime->BuildKernel("matmul", kernel_name, built_options); + if (kernel_.get() == nullptr) { + auto runtime = OpenCLRuntime::Global(); + std::set built_options; + auto dt = DataTypeToEnum::value; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); + built_options.emplace("-Dmatmul=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options); - uint32_t idx = 0; - matmul_kernel.setArg(idx++, - *(static_cast(A->buffer()))); - matmul_kernel.setArg(idx++, - *(static_cast(B->buffer()))); - matmul_kernel.setArg(idx++, *(static_cast(C->buffer()))); - matmul_kernel.setArg(idx++, static_cast(height)); - matmul_kernel.setArg(idx++, static_cast(width)); - matmul_kernel.setArg(idx++, static_cast(A->dim(2))); - matmul_kernel.setArg(idx++, static_cast(height_blocks)); - matmul_kernel.setArg(idx++, static_cast(RoundUpDiv4(A->dim(2)))); + uint32_t idx = 0; + kernel_.setArg(idx++, + *(static_cast(A->buffer()))); + kernel_.setArg(idx++, + *(static_cast(B->buffer()))); + kernel_.setArg(idx++, *(static_cast(C->buffer()))); + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(A->dim(2))); + kernel_.setArg(idx++, static_cast(height_blocks)); + kernel_.setArg(idx++, static_cast(RoundUpDiv4(A->dim(2)))); + } const uint32_t gws[2] = { static_cast(width_blocks), @@ -61,7 +63,7 @@ void MatMulFunctor::operator()( << C->dim(1) << "_" << C->dim(2) << "_" << C->dim(3); - TuningOrRun2DKernel(matmul_kernel, ss.str(), gws, lws, future); + TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); }; diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index 248bf6a7a30953e6fce662f017859a97f6b44527..8a4274e8699371fa2062443ca143aaec1464aac5 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -11,68 +11,6 @@ namespace mace { namespace kernels { -static void Pooling(const Tensor *input, - const int *stride, - const int *paddings, - const int pooling_size, - const PoolingType type, - const DataType dt, - Tensor *output, - StatsFuture *future) { - index_t batch = output->dim(0); - index_t out_height = output->dim(1); - index_t out_width = output->dim(2); - index_t channels = output->dim(3); - - index_t channel_blocks = (channels + 3) / 4; - - auto runtime = OpenCLRuntime::Global(); - std::set built_options; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); - built_options.emplace("-Dpooling=" + kernel_name); - if (type == MAX && input->dtype() == output->dtype()) { - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - built_options.emplace(dt == DT_HALF ? "-DFP16" : ""); - } else { - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - } - if (type == AVG) { - built_options.emplace("-DPOOL_AVG"); - } - auto pooling_kernel = runtime->BuildKernel("pooling", kernel_name, built_options); - - uint32_t idx = 0; - pooling_kernel.setArg(idx++, *(static_cast(input->buffer()))); - pooling_kernel.setArg(idx++, static_cast(input->dim(1))); - pooling_kernel.setArg(idx++, static_cast(input->dim(2))); - pooling_kernel.setArg(idx++, static_cast(out_height)); - pooling_kernel.setArg(idx++, paddings[0] / 2); - pooling_kernel.setArg(idx++, paddings[1] / 2); - pooling_kernel.setArg(idx++, stride[0]); - pooling_kernel.setArg(idx++, pooling_size); - pooling_kernel.setArg(idx++, *(static_cast(output->buffer()))); - - const uint32_t gws[3] = { - static_cast(channel_blocks), - static_cast(out_width), - static_cast(batch * out_height), - }; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel); - std::vector lws(4, 1); - lws[0] = std::min(channel_blocks, kwg_size); - lws[1] = std::min(out_width, kwg_size / lws[0]); - lws[2] = std::min(out_height * batch, kwg_size / (lws[0] * lws[1])); - std::stringstream ss; - ss << "pooling_opencl_kernel_" - << output->dim(0) << "_" - << output->dim(1) << "_" - << output->dim(2) << "_" - << output->dim(3); - TuningOrRun3DKernel(pooling_kernel, ss.str(), gws, lws, future); -} - template void PoolingFunctor::operator()(const Tensor *input, Tensor *output, @@ -95,8 +33,57 @@ void PoolingFunctor::operator()(const Tensor *input, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); output->ResizeImage(output_shape, output_image_shape); - Pooling(input, strides_, paddings.data(), kernels_[0], pooling_type_, - DataTypeToEnum::value, output, future); + index_t batch = output->dim(0); + index_t out_height = output->dim(1); + index_t out_width = output->dim(2); + index_t channels = output->dim(3); + + index_t channel_blocks = (channels + 3) / 4; + + if (kernel_.get() == nullptr) { + const DataType dt = DataTypeToEnum::value; + auto runtime = OpenCLRuntime::Global(); + std::set built_options; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); + built_options.emplace("-Dpooling=" + kernel_name); + if (pooling_type_ == MAX && input->dtype() == output->dtype()) { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + built_options.emplace(dt == DT_HALF ? "-DFP16" : ""); + } else { + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + } + if (pooling_type_ == AVG) { + built_options.emplace("-DPOOL_AVG"); + } + kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options); + + uint32_t idx = 0; + kernel_.setArg(idx++, *(static_cast(input->buffer()))); + kernel_.setArg(idx++, static_cast(input->dim(1))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(out_height)); + kernel_.setArg(idx++, paddings[0] / 2); + kernel_.setArg(idx++, paddings[1] / 2); + kernel_.setArg(idx++, strides_[0]); + kernel_.setArg(idx++, kernels_[0]); + kernel_.setArg(idx++, *(static_cast(output->buffer()))); + } + + const uint32_t gws[3] = { + static_cast(channel_blocks), + static_cast(out_width), + static_cast(batch * out_height), + }; + std::vector lws = {8, 16, 8, 1}; + std::stringstream ss; + ss << "pooling_opencl_kernel_" + << output->dim(0) << "_" + << output->dim(1) << "_" + << output->dim(2) << "_" + << output->dim(3); + TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); } diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc index 97550999b1a98b77abab858e7d8d21436802c4d8..f4e910459fa9e8534c14c2d68b132c325a754d16 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear_opencl.cc @@ -21,40 +21,42 @@ void ResizeBilinearFunctor::operator()( const index_t channels = input->dim(3); const index_t channel_blocks = RoundUpDiv4(channels); + const index_t out_height = out_height_; + const index_t out_width = out_width_; - index_t out_height = out_height_; - index_t out_width = out_width_; - MACE_CHECK(out_height > 0 && out_width > 0); - std::vector output_shape {batch, out_height, out_width, channels}; - if (input->is_image()) { - std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); - output->ResizeImage(output_shape, output_image_shape); - } else { - output->Resize(output_shape); - } + if (kernel_.get() == nullptr) { + MACE_CHECK(out_height > 0 && out_width > 0); + std::vector output_shape{batch, out_height, out_width, channels}; + if (input->is_image()) { + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); + output->ResizeImage(output_shape, output_image_shape); + } else { + output->Resize(output_shape); + } - float height_scale = - CalculateResizeScale(in_height, out_height, align_corners_); - float width_scale = CalculateResizeScale(in_width, out_width, align_corners_); + float height_scale = + CalculateResizeScale(in_height, out_height, align_corners_); + float width_scale = CalculateResizeScale(in_width, out_width, align_corners_); - auto runtime = OpenCLRuntime::Global(); - std::set built_options; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); - built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - auto rb_kernel = runtime->BuildKernel("resize_bilinear", kernel_name, built_options); + auto runtime = OpenCLRuntime::Global(); + std::set built_options; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); + built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); + auto dt = DataTypeToEnum::value; + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + kernel_ = runtime->BuildKernel("resize_bilinear", kernel_name, built_options); - uint32_t idx = 0; - rb_kernel.setArg(idx++, *(static_cast(input->buffer()))); - rb_kernel.setArg(idx++, *(static_cast(output->buffer()))); - rb_kernel.setArg(idx++, height_scale); - rb_kernel.setArg(idx++, width_scale); - rb_kernel.setArg(idx++, static_cast(in_height)); - rb_kernel.setArg(idx++, static_cast(in_width)); - rb_kernel.setArg(idx++, static_cast(out_height)); + uint32_t idx = 0; + kernel_.setArg(idx++, *(static_cast(input->buffer()))); + kernel_.setArg(idx++, *(static_cast(output->buffer()))); + kernel_.setArg(idx++, height_scale); + kernel_.setArg(idx++, width_scale); + kernel_.setArg(idx++, static_cast(in_height)); + kernel_.setArg(idx++, static_cast(in_width)); + kernel_.setArg(idx++, static_cast(out_height)); + } const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(out_width), @@ -66,7 +68,7 @@ void ResizeBilinearFunctor::operator()( << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - TuningOrRun3DKernel(rb_kernel, ss.str(), gws, lws, future); + TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); } template struct ResizeBilinearFunctor; diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc index 25e99661afe7d3afc6dbc407a5c9b7260c986ea9..55a487757ebcc399d61813db5935259454dfd935 100644 --- a/mace/kernels/opencl/softmax_opencl.cc +++ b/mace/kernels/opencl/softmax_opencl.cc @@ -23,21 +23,23 @@ void SoftmaxFunctor::operator()(const Tensor *logits, const index_t channel_blocks = RoundUpDiv4(channels); const int remain_channels = channel_blocks * 4 - channels; - auto runtime = OpenCLRuntime::Global(); + if (kernel_.get() == nullptr) { + auto runtime = OpenCLRuntime::Global(); - std::set built_options; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); - built_options.emplace("-Dsoftmax=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - cl::Kernel softmax_kernel = runtime->BuildKernel("softmax", kernel_name, built_options); + std::set built_options; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); + built_options.emplace("-Dsoftmax=" + kernel_name); + auto dt = DataTypeToEnum::value; + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options); - uint32_t idx = 0; - softmax_kernel.setArg(idx++, *(static_cast(logits->buffer()))); - softmax_kernel.setArg(idx++, static_cast(channels)); - softmax_kernel.setArg(idx++, remain_channels); - softmax_kernel.setArg(idx++, *(static_cast(output->buffer()))); + uint32_t idx = 0; + kernel_.setArg(idx++, *(static_cast(logits->buffer()))); + kernel_.setArg(idx++, static_cast(channels)); + kernel_.setArg(idx++, remain_channels); + kernel_.setArg(idx++, *(static_cast(output->buffer()))); + } const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width), static_cast(height * batch)}; @@ -48,7 +50,7 @@ void SoftmaxFunctor::operator()(const Tensor *logits, << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - TuningOrRun3DKernel(softmax_kernel, ss.str(), gws, lws, future); + TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); } template diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc index 744fe0273084751a403dbe28553d93042e784c93..eccd6cef666d06f879d150e03c89169972318cdd 100644 --- a/mace/kernels/opencl/space_to_batch_opencl.cc +++ b/mace/kernels/opencl/space_to_batch_opencl.cc @@ -20,9 +20,9 @@ void SpaceToBatchFunctor::operator()(Tensor *space_tensor const std::vector &output_shape, Tensor *batch_tensor, StatsFuture *future) { + const char *kernel_name = nullptr; std::vector output_image_shape; CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); - const char *kernel_name = nullptr; if (b2s_) { space_tensor->ResizeImage(output_shape, output_image_shape); kernel_name = "batch_to_space"; @@ -30,32 +30,34 @@ void SpaceToBatchFunctor::operator()(Tensor *space_tensor batch_tensor->ResizeImage(output_shape, output_image_shape); kernel_name = "space_to_batch"; } - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - auto runtime = OpenCLRuntime::Global(); - std::set built_options; - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - built_options.emplace(kernel_name_ss.str()); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); - auto s2b_kernel = runtime->BuildKernel("space_to_batch", kernel_name, built_options); + if (kernel_.get() == nullptr) { + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + auto runtime = OpenCLRuntime::Global(); + std::set built_options; + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + built_options.emplace(kernel_name_ss.str()); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); + kernel_ = runtime->BuildKernel("space_to_batch", kernel_name, built_options); - uint32_t idx = 0; - if (b2s_) { - s2b_kernel.setArg(idx++, *(static_cast(batch_tensor->buffer()))); - s2b_kernel.setArg(idx++, *(static_cast(space_tensor->buffer()))); - } else { - s2b_kernel.setArg(idx++, *(static_cast(space_tensor->buffer()))); - s2b_kernel.setArg(idx++, *(static_cast(batch_tensor->buffer()))); + uint32_t idx = 0; + if (b2s_) { + kernel_.setArg(idx++, *(static_cast(batch_tensor->buffer()))); + kernel_.setArg(idx++, *(static_cast(space_tensor->buffer()))); + } else { + kernel_.setArg(idx++, *(static_cast(space_tensor->buffer()))); + kernel_.setArg(idx++, *(static_cast(batch_tensor->buffer()))); + } + kernel_.setArg(idx++, block_shape_[0]); + kernel_.setArg(idx++, block_shape_[1]); + kernel_.setArg(idx++, paddings_[0]); + kernel_.setArg(idx++, paddings_[2]); + kernel_.setArg(idx++, static_cast(space_tensor->dim(1))); + kernel_.setArg(idx++, static_cast(space_tensor->dim(2))); + kernel_.setArg(idx++, static_cast(batch_tensor->dim(1))); + kernel_.setArg(idx++, static_cast(batch_tensor->dim(2))); } - s2b_kernel.setArg(idx++, block_shape_[0]); - s2b_kernel.setArg(idx++, block_shape_[1]); - s2b_kernel.setArg(idx++, paddings_[0]); - s2b_kernel.setArg(idx++, paddings_[2]); - s2b_kernel.setArg(idx++, static_cast(space_tensor->dim(1))); - s2b_kernel.setArg(idx++, static_cast(space_tensor->dim(2))); - s2b_kernel.setArg(idx++, static_cast(batch_tensor->dim(1))); - s2b_kernel.setArg(idx++, static_cast(batch_tensor->dim(2))); const uint32_t chan_blk = RoundUpDiv4(batch_tensor->dim(3)); const uint32_t gws[3] = {chan_blk, @@ -68,7 +70,7 @@ void SpaceToBatchFunctor::operator()(Tensor *space_tensor << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_" << batch_tensor->dim(3); - TuningOrRun3DKernel(s2b_kernel, ss.str(), gws, lws, future); + TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); } template struct SpaceToBatchFunctor; diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index 273806061f3bd068725451cc88787c59bc79bce5..a842ba719174505e115fd26bc8bd8cd7f2301898 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -25,31 +25,34 @@ void WinogradTransformFunctor::operator()(const Tensor *i const index_t round_h = (output_shape[1] + 1) / 2; const index_t round_w = (output_shape[2] + 1) / 2; const index_t out_width = input_tensor->dim(0) * round_h * round_w; - output_shape = {16, input_tensor->dim(3), out_width, 1}; - std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape); - output_tensor->ResizeImage(output_shape, image_shape); - string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2"); - std::set built_options; - built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); - auto runtime = OpenCLRuntime::Global(); - auto wino_kernel = runtime->BuildKernel("winograd_transform", - obfuscated_kernel_name, - built_options); - - uint32_t idx = 0; - wino_kernel.setArg(idx++, *(static_cast(input_tensor->buffer()))); - wino_kernel.setArg(idx++, *(static_cast(output_tensor->buffer()))); - wino_kernel.setArg(idx++, static_cast(input_tensor->dim(1))); - wino_kernel.setArg(idx++, static_cast(input_tensor->dim(2))); - wino_kernel.setArg(idx++, static_cast(input_tensor->dim(3))); - wino_kernel.setArg(idx++, static_cast(round_h * round_w)); - wino_kernel.setArg(idx++, static_cast(round_w)); - wino_kernel.setArg(idx++, static_cast(paddings[0] / 2)); - wino_kernel.setArg(idx++, static_cast(paddings[1] / 2)); + if (kernel_.get() == nullptr) { + output_shape = {16, input_tensor->dim(3), out_width, 1}; + std::vector image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape); + output_tensor->ResizeImage(output_shape, image_shape); + + string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2"); + std::set built_options; + built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); + auto runtime = OpenCLRuntime::Global(); + kernel_ = runtime->BuildKernel("winograd_transform", + obfuscated_kernel_name, + built_options); + + uint32_t idx = 0; + kernel_.setArg(idx++, *(static_cast(input_tensor->buffer()))); + kernel_.setArg(idx++, *(static_cast(output_tensor->buffer()))); + kernel_.setArg(idx++, static_cast(input_tensor->dim(1))); + kernel_.setArg(idx++, static_cast(input_tensor->dim(2))); + kernel_.setArg(idx++, static_cast(input_tensor->dim(3))); + kernel_.setArg(idx++, static_cast(round_h * round_w)); + kernel_.setArg(idx++, static_cast(round_w)); + kernel_.setArg(idx++, static_cast(paddings[0] / 2)); + kernel_.setArg(idx++, static_cast(paddings[1] / 2)); + } const uint32_t gws[2] = {static_cast(out_width), static_cast(RoundUpDiv4(input_tensor->dim(3)))}; @@ -60,7 +63,7 @@ void WinogradTransformFunctor::operator()(const Tensor *i << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_" << input_tensor->dim(3); - TuningOrRun2DKernel(wino_kernel, ss.str(), gws, lws, future); + TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); } template @@ -73,53 +76,55 @@ void WinogradInverseTransformFunctor::operator()(const Te CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape); output_tensor->ResizeImage(output_shape, image_shape); - string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); - std::set built_options; - built_options.emplace("-Dwinograd_inverse_transform_2x2=" + obfuscated_kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); - built_options.emplace(bias != nullptr ? "-DBIAS" : ""); - switch (activation_) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case PRELU: - built_options.emplace("-DUSE_PRELU"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - defeult: - LOG(FATAL) << "Unknown activation type: " << activation_; - } - - auto runtime = OpenCLRuntime::Global(); - auto wino_kernel = runtime->BuildKernel("winograd_transform", - obfuscated_kernel_name, - built_options); - - const uint32_t round_h = (height_ + 1) / 2; - const uint32_t round_w = (width_ + 1) / 2; - uint32_t idx = 0; - wino_kernel.setArg(idx++, *(static_cast(input_tensor->buffer()))); - if (bias != nullptr) { - wino_kernel.setArg(idx++, *(static_cast(bias->buffer()))); + if (kernel_.get() == nullptr) { + string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); + std::set built_options; + built_options.emplace("-Dwinograd_inverse_transform_2x2=" + obfuscated_kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); + built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + switch (activation_) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case PRELU: + built_options.emplace("-DUSE_PRELU"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + defeult: + LOG(FATAL) << "Unknown activation type: " << activation_; + } + + auto runtime = OpenCLRuntime::Global(); + kernel_ = runtime->BuildKernel("winograd_transform", + obfuscated_kernel_name, + built_options); + + const uint32_t round_h = (height_ + 1) / 2; + const uint32_t round_w = (width_ + 1) / 2; + uint32_t idx = 0; + kernel_.setArg(idx++, *(static_cast(input_tensor->buffer()))); + if (bias != nullptr) { + kernel_.setArg(idx++, *(static_cast(bias->buffer()))); + } + kernel_.setArg(idx++, *(static_cast(output_tensor->buffer()))); + kernel_.setArg(idx++, static_cast(output_shape[1])); + kernel_.setArg(idx++, static_cast(output_shape[2])); + kernel_.setArg(idx++, static_cast(round_h * round_w)); + kernel_.setArg(idx++, static_cast(round_w)); + kernel_.setArg(idx++, relux_max_limit_); + kernel_.setArg(idx++, prelu_alpha_); } - wino_kernel.setArg(idx++, *(static_cast(output_tensor->buffer()))); - wino_kernel.setArg(idx++, static_cast(output_shape[1])); - wino_kernel.setArg(idx++, static_cast(output_shape[2])); - wino_kernel.setArg(idx++, static_cast(round_h * round_w)); - wino_kernel.setArg(idx++, static_cast(round_w)); - wino_kernel.setArg(idx++, relux_max_limit_); - wino_kernel.setArg(idx++, prelu_alpha_); const uint32_t gws[2] = {static_cast(input_tensor->dim(2)), static_cast(RoundUpDiv4(input_tensor->dim(1)))}; @@ -131,7 +136,7 @@ void WinogradInverseTransformFunctor::operator()(const Te << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_" << input_tensor->dim(3); - TuningOrRun2DKernel(wino_kernel, ss.str(), gws, lws, future); + TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); } template diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h index 1c1d90b2fd035eec8c644404c7f721414d1952a1..99d2363f8b003a3802c67b7177d286fd19ca1643 100644 --- a/mace/kernels/pooling.h +++ b/mace/kernels/pooling.h @@ -9,6 +9,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/kernels/conv_pool_2d_util.h" +#include "mace/core/runtime/opencl/cl2_header.h" namespace mace { @@ -171,6 +172,8 @@ struct PoolingFunctor : PoolingFunctorBase { void operator()(const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future); + + cl::Kernel kernel_; }; } // namespace kernels diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h index 8d3d7fe3a4baa73ada8aff5e59bbc7d7c3a46c71..43e6a2df6140e5a2700bbe8dea528e72411724e0 100644 --- a/mace/kernels/resize_bilinear.h +++ b/mace/kernels/resize_bilinear.h @@ -6,6 +6,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/core/runtime/opencl/cl2_header.h" namespace mace { namespace kernels { @@ -169,6 +170,8 @@ struct ResizeBilinearFunctor : ResizeBilinearFunctorBase : ResizeBilinearFunctorBase(size, align_corners) {} void operator()(const Tensor *input, Tensor *output, StatsFuture *future); + + cl::Kernel kernel_; }; } // namespace kernels diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h index c686c60cec60098200e7da0db31472321dbe41ae..b29514a29e9ec4e1e37df3e643878ca914529b74 100644 --- a/mace/kernels/softmax.h +++ b/mace/kernels/softmax.h @@ -8,6 +8,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/core/public/mace.h" +#include "mace/core/runtime/opencl/cl2_header.h" namespace mace { namespace kernels { @@ -55,6 +56,8 @@ struct SoftmaxFunctor { void operator()(const Tensor *logits, Tensor *output, StatsFuture *future); + + cl::Kernel kernel_; }; } // namepsace kernels diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h index 9e2fd8762c4a7e8fa5d0d3b405d8701a6a914ed7..31b31bf944feff5e6bf3f3685601b356de530ef1 100644 --- a/mace/kernels/space_to_batch.h +++ b/mace/kernels/space_to_batch.h @@ -8,6 +8,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/core/public/mace.h" +#include "mace/core/runtime/opencl/cl2_header.h" namespace mace { namespace kernels { @@ -51,6 +52,8 @@ struct SpaceToBatchFunctor: SpaceToBatchFunctorBase{ Tensor *batch_tensor, StatsFuture *future); + cl::Kernel kernel_; + }; } // namespace kernels diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h index a71bda24b120f3eab77171dd2836c606151b6486..d6ba1e6251972a97dec91ac3da4fface16bf1bdd 100644 --- a/mace/kernels/winograd_transform.h +++ b/mace/kernels/winograd_transform.h @@ -9,6 +9,7 @@ #include "mace/core/tensor.h" #include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/activation.h" +#include "mace/core/runtime/opencl/cl2_header.h" namespace mace { namespace kernels { @@ -43,6 +44,8 @@ struct WinogradTransformFunctor : WinogradTransformFuncto void operator()(const Tensor *input, Tensor *output, StatsFuture *future); + + cl::Kernel kernel_; }; struct WinogradInverseTransformFunctorBase { @@ -100,6 +103,8 @@ struct WinogradInverseTransformFunctor : WinogradInverseT const Tensor *bias, Tensor *output, StatsFuture *future); + + cl::Kernel kernel_; }; } // namespace kernels diff --git a/mace/ops/depthwise_conv2d.h b/mace/ops/depthwise_conv2d.h index ed4ff152e444a5a9a5712e7112e01922e7fefd8b..04ad7f1b521ab0447a31e6bbcd15cc3bdab83f73 100644 --- a/mace/ops/depthwise_conv2d.h +++ b/mace/ops/depthwise_conv2d.h @@ -22,9 +22,11 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase { functor_(this->strides_.data(), this->padding_, this->dilations_.data(), - kernels::ActivationType::NOOP, - 0.0f, - 0.0f) {} + kernels::StringToActivationType( + OperatorBase::GetSingleArgument("activation", + "NOOP")), + OperatorBase::GetSingleArgument("max_limit", 0.0f), + OperatorBase::GetSingleArgument("alpha", 0.0f)) {} bool Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/depthwise_conv_2d_benchmark.cc b/mace/ops/depthwise_conv_2d_benchmark.cc index 3a963dd0408afc4069ce654b4a5002d3d8252504..1f7dfa3ca6bb3ecbf2e1a2528af165bd2ab7c8b6 100644 --- a/mace/ops/depthwise_conv_2d_benchmark.cc +++ b/mace/ops/depthwise_conv_2d_benchmark.cc @@ -94,16 +94,16 @@ static void DepthwiseConv2d(int iters, BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL); BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1, float); -BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1, float); +//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1, float); BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1, float); -BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1, float); -BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1, float); -BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1, float); -BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1, float); -BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1, float); -BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1, float); -BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1, float); -BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1, float); -BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1, float); +//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1, float); +//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1, float); +//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1, float); +//BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1, float); +//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1, float); +//BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1, float); +//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1, float); +//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1, float); +//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1, float); } // namespace mace