diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h index 0a856fc945b3ce2eebc7eaa8e9d0f6fc4985629a..83acf4fb70311c710bcde1d7b08c1e6c4630879f 100644 --- a/mace/kernels/activation.h +++ b/mace/kernels/activation.h @@ -135,6 +135,7 @@ class ActivationFunctor { T relux_max_limit_; T prelu_alpha_; cl::Kernel kernel_; + std::string tuning_key_prefix_; }; } // namespace kernels diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index a457edfb2354eae313c0ba14ea9a467602af6680..b9bf8a983bd4bbafca9e16a52e576bcbf378924c 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -228,11 +228,12 @@ struct Conv2dFunctor : Conv2dFunctorBase { MACE_CHECK_NOTNULL(output); std::vector output_shape(4); - if (paddings_.empty()) { - paddings_.resize(2); - kernels::CalcNHWCPaddingAndOutputSize( - input->shape().data(), filter->shape().data(), dilations_, strides_, - padding_type_, output_shape.data(), paddings_.data()); + std::vector paddings(2); + kernels::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter->shape().data(), dilations_, strides_, + padding_type_, output_shape.data(), paddings.data()); + if (!paddings_.empty()) { + paddings = paddings_; } output->Resize(output_shape); @@ -260,13 +261,13 @@ struct Conv2dFunctor : Conv2dFunctorBase { MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); - int padded_height = input_height + paddings_[0]; - int padded_width = input_width + paddings_[1]; + int padded_height = input_height + paddings[0]; + int padded_width = input_width + paddings[1]; Tensor padded_input; // Keep this alive during kernel execution - if (paddings_[0] > 0 || paddings_[1] > 0) { - ConstructNHWCInputWithPadding(input, paddings_.data(), &padded_input); + if (paddings[0] > 0 || paddings[1] > 0) { + ConstructNHWCInputWithPadding(input, paddings.data(), &padded_input); input = &padded_input; } diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h index 65e8b500b4b3f8afc24d5a11698d060fed510709..c72a4a6d59ff68e5a94a539c6c85c782f4aa9d1f 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.h @@ -294,11 +294,12 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { fake_filter_shape[3] = 1; std::vector output_shape(4); - if (paddings_.empty()) { - paddings_.resize(2); - kernels::CalcNHWCPaddingAndOutputSize( - input->shape().data(), fake_filter_shape.data(), dilations_, strides_, - padding_type_, output_shape.data(), paddings_.data()); + std::vector paddings(2); + kernels::CalcNHWCPaddingAndOutputSize( + input->shape().data(), fake_filter_shape.data(), dilations_, strides_, + padding_type_, output_shape.data(), paddings.data()); + if (!paddings_.empty()) { + paddings = paddings_; } auto input_shape = fake_filter_shape; output->Resize(output_shape); @@ -329,10 +330,10 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); // The left-upper most offset of the padded input - int paddings_top = paddings_[0] / 2; - int paddings_bottom = paddings_[0] - paddings_top; - int paddings_left = paddings_[1] / 2; - int paddings_right = paddings_[1] - paddings_left; + int paddings_top = paddings[0] / 2; + int paddings_bottom = paddings[0] - paddings_top; + int paddings_left = paddings[1] / 2; + int paddings_right = paddings[1] - paddings_left; int padded_h_start = 0 - paddings_top; int padded_w_start = 0 - paddings_left; diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc index 50ad306368f503860cf4d6688bd10b74fd678d9b..934ce8da666b8e9910815b9f56de250a3511fd68 100644 --- a/mace/kernels/opencl/activation_opencl.cc +++ b/mace/kernels/opencl/activation_opencl.cc @@ -22,7 +22,6 @@ void ActivationFunctor::operator()(const Tensor *input, const index_t channels = input->dim(3); const index_t channel_blocks = RoundUpDiv4(channels); - std::string tuning_key_prefix; if (kernel_.get() == nullptr) { auto runtime = OpenCLRuntime::Global(); @@ -35,23 +34,23 @@ void ActivationFunctor::operator()(const Tensor *input, built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); switch (activation_) { case RELU: - tuning_key_prefix = "relu_opencl_kernel_"; + tuning_key_prefix_ = "relu_opencl_kernel_"; built_options.emplace("-DUSE_RELU"); break; case RELUX: - tuning_key_prefix = "relux_opencl_kernel_"; + tuning_key_prefix_ = "relux_opencl_kernel_"; built_options.emplace("-DUSE_RELUX"); break; case PRELU: - tuning_key_prefix = "prelu_opencl_kernel_"; + tuning_key_prefix_ = "prelu_opencl_kernel_"; built_options.emplace("-DUSE_PRELU"); break; case TANH: - tuning_key_prefix = "tanh_opencl_kernel_"; + tuning_key_prefix_ = "tanh_opencl_kernel_"; built_options.emplace("-DUSE_TANH"); break; case SIGMOID: - tuning_key_prefix = "sigmoid_opencl_kernel_"; + tuning_key_prefix_ = "sigmoid_opencl_kernel_"; built_options.emplace("-DUSE_SIGMOID"); break; default: @@ -60,12 +59,10 @@ void ActivationFunctor::operator()(const Tensor *input, kernel_ = runtime->BuildKernel("activation", kernel_name, built_options); int idx = 0; - kernel_.setArg( - idx++, *(static_cast(input->buffer()))); + kernel_.setArg(idx++, *(static_cast(input->buffer()))); kernel_.setArg(idx++, static_cast(relux_max_limit_)); kernel_.setArg(idx++, static_cast(prelu_alpha_)); - kernel_.setArg(idx++, - *(static_cast(output->buffer()))); + kernel_.setArg(idx++, *(static_cast(output->buffer()))); } const uint32_t gws[3] = {static_cast(channel_blocks), @@ -73,7 +70,7 @@ void ActivationFunctor::operator()(const Tensor *input, static_cast(height * batch)}; const std::vector lws = {8, 16, 8, 1}; std::string tuning_key = - Concat(tuning_key_prefix, output->dim(0), output->dim(1), + Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); } diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc index 48e775e1ad3476d6f7b5adcaadb905db1d6d4e38..8c0733b341697b965f7b804c625d035b51dec6f4 100644 --- a/mace/kernels/opencl/conv_2d_opencl.cc +++ b/mace/kernels/opencl/conv_2d_opencl.cc @@ -68,7 +68,8 @@ void Conv2dFunctor::operator()(const Tensor *input, index_t kernel_h = filter->dim(0); index_t kernel_w = filter->dim(1); - if (!input->is_image() || strides_[0] != strides_[1] || strides_[0] > 2 || + if (!input->is_image() || strides_[0] != strides_[1] || + ((kernel_h == 1 || kernel_h == 3) && strides_[0] > 2) || (dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) { LOG(WARNING) << "OpenCL conv2d kernel with " << "filter" << kernel_h << "x" << kernel_w << "," @@ -80,11 +81,12 @@ void Conv2dFunctor::operator()(const Tensor *input, } std::vector output_shape(4); - if (paddings_.empty()) { - paddings_.resize(2); - kernels::CalcNHWCPaddingAndOutputSize( - input->shape().data(), filter->shape().data(), dilations_, strides_, - padding_type_, output_shape.data(), paddings_.data()); + std::vector paddings(2); + kernels::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter->shape().data(), dilations_, strides_, + padding_type_, output_shape.data(), paddings.data()); + if (!paddings_.empty()) { + paddings = paddings_; } std::vector output_image_shape; @@ -95,11 +97,11 @@ void Conv2dFunctor::operator()(const Tensor *input, selector[kernel_h - 1] != nullptr && 0 < strides_[0] && strides_[0] < 3 ) { auto conv2d_func = selector[kernel_h - 1]; - conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings_.data(), dilations_, activation_, + conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, prelu_alpha_, DataTypeToEnum::value, output, future); } else { - Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings_.data(), dilations_, + Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, prelu_alpha_, DataTypeToEnum::value, output, future); } diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc index 440b18b43cbf2060bcefd176e9d647b5bb66b2a7..67304bd896bd5e5df14c273c2a839dccfea28390 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl.cc +++ b/mace/kernels/opencl/depthwise_conv_opencl.cc @@ -153,18 +153,19 @@ void DepthwiseConv2dFunctor::operator()( fake_filter_shape[3] = 1; std::vector output_shape(4); - if (paddings_.empty()) { - paddings_.resize(2); - kernels::CalcNHWCPaddingAndOutputSize( - input->shape().data(), fake_filter_shape.data(), dilations_, strides_, - padding_type_, output_shape.data(), paddings_.data()); + std::vector paddings(2); + kernels::CalcNHWCPaddingAndOutputSize( + input->shape().data(), fake_filter_shape.data(), dilations_, strides_, + padding_type_, output_shape.data(), paddings.data()); + if (!paddings_.empty()) { + paddings = paddings_; } std::vector output_image_shape; CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); output->ResizeImage(output_shape, output_image_shape); - DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings_.data(), dilations_, + DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, prelu_alpha_, DataTypeToEnum::value, output, future); } diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index 39bb994ef8ed0f6b27740f8d5d89ac75da57e2bd..9b612e48a558599751b7bde26df063689ea54c6a 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -23,12 +23,13 @@ void PoolingFunctor::operator()(const Tensor *input, input->dim(3), input->dim(3) }; - if (paddings_.empty()) { - paddings_.resize(2); - kernels::CalcNHWCPaddingAndOutputSize( - input->shape().data(), filter_shape.data(), - dilations_, strides_, this->padding_type_, - output_shape.data(), paddings_.data()); + std::vector paddings(2); + kernels::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter_shape.data(), + dilations_, strides_, this->padding_type_, + output_shape.data(), paddings.data()); + if (!paddings_.empty()) { + paddings = paddings_; } std::vector output_image_shape; @@ -66,8 +67,8 @@ void PoolingFunctor::operator()(const Tensor *input, kernel_.setArg(idx++, static_cast(input->dim(1))); kernel_.setArg(idx++, static_cast(input->dim(2))); kernel_.setArg(idx++, static_cast(out_height)); - kernel_.setArg(idx++, paddings_[0] / 2); - kernel_.setArg(idx++, paddings_[1] / 2); + kernel_.setArg(idx++, paddings[0] / 2); + kernel_.setArg(idx++, paddings[1] / 2); kernel_.setArg(idx++, strides_[0]); kernel_.setArg(idx++, kernels_[0]); kernel_.setArg(idx++, *(static_cast(output->buffer()))); diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index 0396af97fe3c4571a8906c1dd5f97c7004436357..31ca09f482d75999eb79aa59fca27f5bd0e9929d 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -17,11 +17,12 @@ void WinogradTransformFunctor::operator()(const Tensor *i StatsFuture *future) { std::vector output_shape(4); std::vector filter_shape = {3, 3, input_tensor->dim(3), 1}; - if (paddings_.empty()) { - paddings_.resize(2); - kernels::CalcNHWCPaddingAndOutputSize( - input_tensor->shape().data(), filter_shape.data(), dilations_.data(), - strides_.data(), padding_type_, output_shape.data(), paddings_.data()); + std::vector paddings(2); + kernels::CalcNHWCPaddingAndOutputSize( + input_tensor->shape().data(), filter_shape.data(), dilations_.data(), + strides_.data(), padding_type_, output_shape.data(), paddings.data()); + if (!paddings_.empty()) { + paddings = paddings_; } const index_t round_h = (output_shape[1] + 1) / 2; @@ -52,8 +53,8 @@ void WinogradTransformFunctor::operator()(const Tensor *i kernel_.setArg(idx++, static_cast(input_tensor->dim(3))); kernel_.setArg(idx++, static_cast(round_h * round_w)); kernel_.setArg(idx++, static_cast(round_w)); - kernel_.setArg(idx++, static_cast(paddings_[0] / 2)); - kernel_.setArg(idx++, static_cast(paddings_[1] / 2)); + kernel_.setArg(idx++, static_cast(paddings[0] / 2)); + kernel_.setArg(idx++, static_cast(paddings[1] / 2)); } const uint32_t gws[2] = {static_cast(out_width), diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h index 0a563d9ab2d2d57ba21e8ba0d1d78fd56d71c688..7b13b2172a6acb390e5a48cb9b98df566f23b8ae 100644 --- a/mace/kernels/pooling.h +++ b/mace/kernels/pooling.h @@ -64,13 +64,13 @@ struct PoolingFunctor : PoolingFunctorBase { input_tensor->dim(3), input_tensor->dim(3) }; - if (paddings_.empty()) { - paddings_.resize(2); - kernels::CalcNHWCPaddingAndOutputSize( - input_tensor->shape().data(), filter_shape.data(), - dilations_, strides_, this->padding_type_, - output_shape.data(), paddings_.data()); - + std::vector paddings(2); + kernels::CalcNHWCPaddingAndOutputSize( + input_tensor->shape().data(), filter_shape.data(), + dilations_, strides_, this->padding_type_, + output_shape.data(), paddings.data()); + if (!paddings_.empty()) { + paddings = paddings_; } output_tensor->Resize(output_shape); @@ -99,8 +99,8 @@ struct PoolingFunctor : PoolingFunctorBase { int dilation_w = dilations_[1]; // The left-upper most offset of the padded input - int padded_h_start = 0 - paddings_[0] / 2; - int padded_w_start = 0 - paddings_[1] / 2; + int padded_h_start = 0 - paddings[0] / 2; + int padded_w_start = 0 - paddings[1] / 2; if (pooling_type_ == MAX) { #pragma omp parallel for collapse(4)