diff --git a/mace/core/operator.cc b/mace/core/operator.cc index eca09f3b102b00ea530a34230a254c24f6a103ce..7554fbba99a176071ba7815ef3db1a08a8efaf92 100644 --- a/mace/core/operator.cc +++ b/mace/core/operator.cc @@ -59,6 +59,7 @@ std::unique_ptr OperatorRegistry::CreateOperator( } } +extern void Register_Activation(OperatorRegistry *op_registry); extern void Register_AddN(OperatorRegistry *op_registry); extern void Register_BatchNorm(OperatorRegistry *op_registry); extern void Register_BatchToSpaceND(OperatorRegistry *op_registry); @@ -68,17 +69,17 @@ extern void Register_ChannelShuffle(OperatorRegistry *op_registry); extern void Register_Concat(OperatorRegistry *op_registry); extern void Register_Conv2D(OperatorRegistry *op_registry); extern void Register_DepthwiseConv2d(OperatorRegistry *op_registry); +extern void Register_FoldedBatchNorm(OperatorRegistry *op_registry); extern void Register_FusedConv2D(OperatorRegistry *op_registry); extern void Register_GlobalAvgPooling(OperatorRegistry *op_registry); extern void Register_ImageToBuffer(OperatorRegistry *op_registry); extern void Register_Pooling(OperatorRegistry *op_registry); -extern void Register_Relu(OperatorRegistry *op_registry); extern void Register_ResizeBilinear(OperatorRegistry *op_registry); -extern void Register_SpaceToBatchND(OperatorRegistry *op_registry); extern void Register_Softmax(OperatorRegistry *op_registry); -extern void Register_FoldedBatchNorm(OperatorRegistry *op_registry); +extern void Register_SpaceToBatchND(OperatorRegistry *op_registry); OperatorRegistry::OperatorRegistry() { + Register_Activation(this); Register_AddN(this); Register_BatchNorm(this); Register_BatchToSpaceND(this); @@ -88,15 +89,14 @@ OperatorRegistry::OperatorRegistry() { Register_Concat(this); Register_Conv2D(this); Register_DepthwiseConv2d(this); + Register_FoldedBatchNorm(this); Register_FusedConv2D(this); Register_GlobalAvgPooling(this); Register_ImageToBuffer(this); Register_Pooling(this); - Register_Relu(this); Register_ResizeBilinear(this); - Register_SpaceToBatchND(this); Register_Softmax(this); - Register_FoldedBatchNorm(this); + Register_SpaceToBatchND(this); } } // namespace mace diff --git a/mace/core/runtime/opencl/opencl_development.cc b/mace/core/runtime/opencl/opencl_development.cc index 43127e798fd950dbbb769ace7834cc1551a0556a..b0ebcd245b97baff041afa8a75f0810c0a034dda 100644 --- a/mace/core/runtime/opencl/opencl_development.cc +++ b/mace/core/runtime/opencl/opencl_development.cc @@ -22,8 +22,9 @@ bool GetSourceOrBinaryProgram(const std::string &program_name, return false; } cl::Program::Sources sources; - std::string kernel_source(it_source->second.begin(), it_source->second.end()); - sources.push_back(ObfuscateString(kernel_source)); + std::string content(it_source->second.begin(), it_source->second.end()); + std::string kernel_source = ObfuscateString(content); + sources.push_back(kernel_source); *program = cl::Program(context, sources); return true; diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h new file mode 100644 index 0000000000000000000000000000000000000000..d68d4f2baf502bd51ae8ab8cc059f55cd7f57cb5 --- /dev/null +++ b/mace/kernels/activation.h @@ -0,0 +1,136 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. 
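// This new header unifies activation handling: ActivationType enumerates the supported nonlinearities (NOOP, RELU, RELUX, PRELU, TANH, SIGMOID), StringToActivationType parses the op attribute, DoActivation applies the chosen function elementwise on the CPU, and the per-device ActivationFunctor specializations let conv2d and batch-norm fuse any of these activations instead of a hard-coded ReLU.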
+// + +#ifndef MACE_KERNELS_ACTIVATION_H_ +#define MACE_KERNELS_ACTIVATION_H_ + +#include "mace/core/future.h" +#include "mace/core/tensor.h" +#include "mace/core/types.h" + +namespace mace { +namespace kernels { + +enum ActivationType { + NOOP = 0, + RELU = 1, + RELUX = 2, + PRELU = 3, + TANH = 4, + SIGMOID = 5 +}; + +inline ActivationType StringToActivationType(const std::string type) { + if (type == "RELU") { + return ActivationType::RELU; + } else if (type == "RELUX") { + return ActivationType::RELUX; + } else if (type == "PRELU") { + return ActivationType::PRELU; + } else if (type == "TANH") { + return ActivationType::TANH; + } else if (type == "SIGMOID") { + return ActivationType::SIGMOID; + } else if (type == "NOOP") { + return ActivationType::NOOP; + } else { + LOG(FATAL) << "Unknown activation type: " << type; + } + return ActivationType::NOOP; +} + +template +void DoActivation(const T *input_ptr, + T *output_ptr, + const index_t size, + const ActivationType type, + const float relux_max_limit, + const float prelu_alpha) { + MACE_CHECK(DataTypeToEnum::value != DataType::DT_HALF); + + switch (type) { + case NOOP: + break; + case RELU: + for (index_t i = 0; i < size; ++i) { + output_ptr[i] = std::max(input_ptr[i], static_cast(0)); + } + break; + case RELUX: + for (index_t i = 0; i < size; ++i) { + output_ptr[i] = std::min(std::max(input_ptr[i], static_cast(0)), + static_cast(relux_max_limit)); + } + break; + case PRELU: + for (index_t i = 0; i < size; ++i) { + T in = input_ptr[i]; + if (in < 0) { + output_ptr[i] = in * prelu_alpha; + } else { + output_ptr[i] = in; + } + } + break; + case TANH: + for (index_t i = 0; i < size; ++i) { + T in_exp = std::exp(-2 * input_ptr[i]); + output_ptr[i] = (1 - in_exp) / (1 + in_exp); + } + break; + case SIGMOID: + for (index_t i = 0; i < size; ++i) { + output_ptr[i] = 1 / (1 + std::exp(-input_ptr[i])); + } + break; + default: + LOG(FATAL) << "Unknown activation type: " << type; + } +} + +template +class ActivationFunctor { + public: + ActivationFunctor(ActivationType type, T relux_max_limit, T prelu_alpha) + : activation_(type), + relux_max_limit_(relux_max_limit), + prelu_alpha_(prelu_alpha) {} + + void operator()(const Tensor *input, Tensor *output, StatsFuture *future) { + const T *input_ptr = input->data(); + T *output_ptr = output->mutable_data(); + DoActivation(input_ptr, output_ptr, output->size(), activation_, relux_max_limit_, + prelu_alpha_); + } + + private: + ActivationType activation_; + T relux_max_limit_; + T prelu_alpha_; +}; + +template <> +void ActivationFunctor::operator()( + const Tensor *input, Tensor *output, StatsFuture *future); + +template +class ActivationFunctor { + public: + ActivationFunctor(ActivationType type, T relux_max_limit, T prelu_alpha) + : activation_(type), + relux_max_limit_(relux_max_limit), + prelu_alpha_(prelu_alpha) {} + + void operator()(const Tensor *input, Tensor *output, StatsFuture *future); + + private: + ActivationType activation_; + T relux_max_limit_; + T prelu_alpha_; +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_ACTIVATION_H_ diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h index 5c62e33da27639854f4b5d70867eebe75cfd9652..e7499274c0b6bd2612e1b3ae65400d031d896c52 100644 --- a/mace/kernels/batch_norm.h +++ b/mace/kernels/batch_norm.h @@ -6,25 +6,37 @@ #define MACE_KERNELS_BATCH_NORM_H_ #include "mace/core/future.h" -#include "mace/core/tensor.h" #include "mace/core/public/mace.h" +#include "mace/core/tensor.h" +#include "mace/kernels/activation.h" 
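// The functor below now carries an activation descriptor in place of the old fused_relu flag. A hypothetical call site, to illustrate the new constructor (tensor setup elided; ReLU6-style clamping shown via RELUX):
//
//   BatchNormFunctor<DeviceType::OPENCL, float> bn(
//       false,                  // folded_constant
//       ActivationType::RELUX,  // fused activation
//       6.0f,                   // relux_max_limit
//       0.0f);                  // prelu_alpha (unused for RELUX)
//   bn(input, scale, offset, mean, var, 1e-5f, output, &future);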
namespace mace { namespace kernels { struct BatchNormFunctorBase { - BatchNormFunctorBase(bool folded_constant, bool fused_relu) : - folded_constant_(folded_constant), - fused_relu_(fused_relu){} + BatchNormFunctorBase(bool folded_constant, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha) + : folded_constant_(folded_constant), + activation_(activation), + relux_max_limit_(relux_max_limit), + prelu_alpha_(prelu_alpha) {} const bool folded_constant_; - const bool fused_relu_; + const ActivationType activation_; + const float relux_max_limit_; + const float prelu_alpha_; }; template -struct BatchNormFunctor : BatchNormFunctorBase{ - BatchNormFunctor(const bool folded_constant, const bool fused_relu) : - BatchNormFunctorBase(folded_constant, fused_relu) {} +struct BatchNormFunctor : BatchNormFunctorBase { + BatchNormFunctor(const bool folded_constant, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha) + : BatchNormFunctorBase( + folded_constant, activation, relux_max_limit, prelu_alpha) {} void operator()(const Tensor *input, const Tensor *scale, @@ -85,32 +97,34 @@ struct BatchNormFunctor : BatchNormFunctorBase{ } else { output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c]; } - if (fused_relu_) { - output_ptr[pos] = std::max(output_ptr[pos], static_cast(0)); - } ++pos; } } } } + DoActivation(output_ptr, output_ptr, output->NumElements(), activation_, + relux_max_limit_, prelu_alpha_); } }; template <> -void BatchNormFunctor::operator()( - const Tensor *input, - const Tensor *scale, - const Tensor *offset, - const Tensor *mean, - const Tensor *var, - const float epsilon, - Tensor *output, - StatsFuture *future); +void BatchNormFunctor::operator()(const Tensor *input, + const Tensor *scale, + const Tensor *offset, + const Tensor *mean, + const Tensor *var, + const float epsilon, + Tensor *output, + StatsFuture *future); template struct BatchNormFunctor : BatchNormFunctorBase { - BatchNormFunctor(const bool folded_constant, const bool fused_relu) : - BatchNormFunctorBase(folded_constant, fused_relu) {} + BatchNormFunctor(const bool folded_constant, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha) + : BatchNormFunctorBase( + folded_constant, activation, relux_max_limit, prelu_alpha) {} void operator()(const Tensor *input, const Tensor *scale, const Tensor *offset, diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index 37ca87f405df211ed3752a28048714fd5618317e..08b04477b6b43aeb8ae84f5c05e54cc042d349fc 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -7,6 +7,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/activation.h" #include "mace/kernels/conv_pool_2d_util.h" namespace mace { @@ -15,20 +16,39 @@ namespace kernels { struct Conv2dFunctorBase { Conv2dFunctorBase(const int *strides, const Padding &paddings, - const int *dilations) - : strides_(strides), dilations_(dilations), paddings_(paddings) {} - - const int *strides_; // [stride_h, stride_w] - const int *dilations_; // [dilation_h, dilation_w] - Padding paddings_; + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha) + : strides_(strides), + dilations_(dilations), + paddings_(paddings), + activation_(activation), + relux_max_limit_(relux_max_limit), + prelu_alpha_(prelu_alpha) {} + + const int *strides_; // [stride_h, stride_w] + const int *dilations_; // [dilation_h, 
dilation_w] + const Padding paddings_; + const ActivationType activation_; + const float relux_max_limit_; + const float prelu_alpha_; }; -template +template struct Conv2dFunctor : Conv2dFunctorBase { Conv2dFunctor(const int *strides, const Padding &paddings, - const int *dilations) - : Conv2dFunctorBase(strides, paddings, dilations) {} + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha) + : Conv2dFunctorBase(strides, + paddings, + dilations, + activation, + relux_max_limit, + prelu_alpha) {} void operator()(const Tensor *input, const Tensor *filter, @@ -42,8 +62,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { std::vector output_shape(4); std::vector paddings(2); kernels::CalcNHWCPaddingAndOutputSize( - input->shape().data(), filter->shape().data(), dilations_, - strides_, paddings_, output_shape.data(), paddings.data()); + input->shape().data(), filter->shape().data(), dilations_, strides_, + paddings_, output_shape.data(), paddings.data()); output->Resize(output_shape); index_t batch = output->dim(0); @@ -101,7 +121,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { if (inh < 0 || inh >= input_height || inw < 0 || inw >= input_width) { MACE_CHECK(inh >= padded_h_start && inh < padded_h_stop && - inw >= padded_w_start && inw < padded_w_stop, + inw >= padded_w_start && inw < padded_w_stop, "Out of range read from input: ", inh, ", ", inw); // else padding with 0: @@ -109,8 +129,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { } else { index_t input_offset = n * input_height * input_width * input_channels + - inh * input_width * input_channels + inw * input_channels + - inc; + inh * input_width * input_channels + + inw * input_channels + inc; sum += input_data[input_offset] * *filter_ptr; } filter_ptr += channels; @@ -123,24 +143,33 @@ struct Conv2dFunctor : Conv2dFunctorBase { } } } - + output_data = output->mutable_data(); + DoActivation(output_data, output_data, output->NumElements(), activation_, + relux_max_limit_, prelu_alpha_); } - }; -template<> +template <> void Conv2dFunctor::operator()(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output, StatsFuture *future); -template +template struct Conv2dFunctor : Conv2dFunctorBase { Conv2dFunctor(const int *strides, const Padding &paddings, - const int *dilations) - : Conv2dFunctorBase(strides, paddings, dilations) {} + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha) + : Conv2dFunctorBase(strides, + paddings, + dilations, + activation, + relux_max_limit, + prelu_alpha) {} void operator()(const Tensor *input, const Tensor *filter, diff --git a/mace/kernels/fused_conv_2d.h b/mace/kernels/fused_conv_2d.h deleted file mode 100644 index 53a7dbb10a58fbfdf02220e6df217081bec3ee45..0000000000000000000000000000000000000000 --- a/mace/kernels/fused_conv_2d.h +++ /dev/null @@ -1,74 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. 
-// - -#ifndef MACE_KERNELS_FUSED_CONV_2D_H_ -#define MACE_KERNELS_FUSED_CONV_2D_H_ - -#include "mace/core/tensor.h" -#include "mace/kernels/conv_pool_2d_util.h" -#include "mace/kernels/conv_2d.h" - -namespace mace { -namespace kernels { - -struct FusedConv2dFunctorBase { - FusedConv2dFunctorBase(const int *strides, - const Padding &paddings, - const int *dilations) - : strides_(strides), dilations_(dilations), paddings_(paddings) {} - - const int *strides_; // [stride_h, stride_w] - const int *dilations_; // [dilation_h, dilation_w] - Padding paddings_; -}; - -template -struct FusedConv2dFunctor : FusedConv2dFunctorBase { - FusedConv2dFunctor(const int *strides, - const Padding &paddings, - const int *dilations) - : FusedConv2dFunctorBase(strides, paddings, dilations) {} - - void operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - Conv2dFunctor(strides_, paddings_, dilations_)(input, filter, bias, - output, future); - T *output_data = output->mutable_data(); - - T zero_value; - if (DataTypeToEnum::value == DataType::DT_HALF) { - zero_value = half_float::half_cast(0.0f); - } else { - zero_value = 0; - } - auto output_size = output->size(); - for (int n = 0; n < output_size; ++n) { - *output_data = *output_data < 0 ? zero_value : *output_data; - output_data++; - } - } - -}; - -template -struct FusedConv2dFunctor : FusedConv2dFunctorBase { - FusedConv2dFunctor(const int *strides, - const Padding &paddings, - const int *dilations) - : FusedConv2dFunctorBase(strides, paddings, dilations) {} - - void operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future); -}; - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_FUSED_CONV_2D_H_ diff --git a/mace/kernels/neon/relu_neon.cc b/mace/kernels/neon/relu_neon.cc index 356e14ece1cb7e3bdb816ba02200ca63626f8fc9..ad74b819224397b9b532a19729df32cb190e93a7 100644 --- a/mace/kernels/neon/relu_neon.cc +++ b/mace/kernels/neon/relu_neon.cc @@ -9,7 +9,7 @@ namespace mace { namespace kernels { template <> -void ReluFunctor::operator()(const Tensor *input_tensor, +void ActivationFunctor::operator()(const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) { const float *input = input_tensor->data(); diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc new file mode 100644 index 0000000000000000000000000000000000000000..44eaa47e52a9558a27f8ba70128b7c06eb457a65 --- /dev/null +++ b/mace/kernels/opencl/activation_opencl.cc @@ -0,0 +1,117 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. 
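// OpenCL counterpart of the new activation kernel: the host code below maps activation_ to a single -DUSE_* build option, compiles the "activation" program (see mace/kernels/opencl/cl/activation.cl), passes relux_max_limit_ and prelu_alpha_ as kernel arguments, and auto-tunes the local work-group size with the same params_generator scheme used by the other image-based kernels.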
+// + +#include "mace/kernels/activation.h" +#include "mace/core/runtime/opencl/cl2_header.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/kernels/opencl/helper.h" +#include "mace/utils/tuner.h" +#include "mace/utils/utils.h" + +namespace mace { +namespace kernels { + +template +void ActivationFunctor::operator()(const Tensor *input, + Tensor *output, + StatsFuture *future) { + const index_t batch = input->dim(0); + const index_t height = input->dim(1); + const index_t width = input->dim(2); + const index_t channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + + auto runtime = OpenCLRuntime::Global(); + + std::set built_options; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); + built_options.emplace("-Dactivation=" + kernel_name); + auto dt = DataTypeToEnum::value; + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + switch (activation_) { + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case PRELU: + built_options.emplace("-DUSE_PRELU"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation_; + } + cl::Kernel activation_kernel = + runtime->BuildKernel("activation", kernel_name, built_options); + int idx = 0; + activation_kernel.setArg( + idx++, *(static_cast(input->buffer()))); + activation_kernel.setArg(idx++, relux_max_limit_); + activation_kernel.setArg(idx++, prelu_alpha_); + activation_kernel.setArg(idx++, + *(static_cast(output->buffer()))); + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + const std::vector lws = {8, 16, 8}; + const uint32_t kwg_size = + runtime->GetKernelMaxWorkGroupSize(activation_kernel); + auto params_generator = [&]() -> std::vector> { + std::vector local_ws(3, 0); + local_ws[0] = std::min(channel_blocks, kwg_size); + local_ws[1] = std::min(width, kwg_size / local_ws[0]); + local_ws[2] = std::min(height * batch, + kwg_size / (local_ws[0] * local_ws[1])); + return { + {local_ws[0], local_ws[1], local_ws[2]}, + {kwg_size / 16, 4, 4}, + {kwg_size / 32, 4, 8}, + {kwg_size / 32, 8, 4}, + {kwg_size / 64, 8, 8}, + {kwg_size / 64, 16, 4}, + {kwg_size / 128, 8, 16}, + {kwg_size / 128, 16, 8}, + {kwg_size / 128, 32, 4}, + {1, kwg_size / 32, 32}, + {1, kwg_size / 64, 64}, + {1, kwg_size / 128, 128}, + {3, 15, 9}, + {7, 15, 9}, + {9, 7, 15}, + {15, 7, 9}, + {1, kwg_size, 1}, + {4, 15, 8}, // SNPE size + }; + }; + cl::Event event; + auto func = [&](const std::vector &params) -> cl_int { + cl_int error = runtime->command_queue().enqueueNDRangeKernel( + activation_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + return error; + }; + std::string tuning_key = + Concat("relu_opencl_kernel_", activation_, output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + OpenCLProfilingTimer timer(&event); + Tuner::Get()->template TuneOrRun( + tuning_key, lws, params_generator, func, &timer); + SetFuture(future, event); +} + +template struct ActivationFunctor; +template struct ActivationFunctor; +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/opencl/batch_norm_opencl.cc
b/mace/kernels/opencl/batch_norm_opencl.cc index ba0e0c2e97f82acad0a1b67c6803e5592615c03d..2d6c95a37963b6ffceb9d216d58017cedd01cb00 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -5,23 +5,22 @@ #include "mace/kernels/batch_norm.h" #include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/kernels/opencl/helper.h" #include "mace/utils/tuner.h" #include "mace/utils/utils.h" -#include "mace/kernels/opencl/helper.h" namespace mace { namespace kernels { -template -void BatchNormFunctor::operator()( - const Tensor *input, - const Tensor *scale, - const Tensor *offset, - const Tensor *mean, - const Tensor *var, - const float epsilon, - Tensor *output, - StatsFuture *future) { +template +void BatchNormFunctor::operator()(const Tensor *input, + const Tensor *scale, + const Tensor *offset, + const Tensor *mean, + const Tensor *var, + const float epsilon, + Tensor *output, + StatsFuture *future) { MACE_CHECK(folded_constant_ || (mean != nullptr && var != nullptr)); const index_t batch = input->dim(0); @@ -41,21 +40,45 @@ void BatchNormFunctor::operator()( if (folded_constant_) { built_options.emplace("-DFOLDED_CONSTANT"); } - if (fused_relu_) { - built_options.emplace("-DFUSED_RELU"); + switch (activation_) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case PRELU: + built_options.emplace("-DUSE_PRELU"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation_; } - auto bm_kernel = runtime->BuildKernel("batch_norm", kernel_name, built_options); + + auto bm_kernel = + runtime->BuildKernel("batch_norm", kernel_name, built_options); uint32_t idx = 0; bm_kernel.setArg(idx++, *(static_cast(input->buffer()))); bm_kernel.setArg(idx++, *(static_cast(scale->buffer()))); - bm_kernel.setArg(idx++, *(static_cast(offset->buffer()))); + bm_kernel.setArg(idx++, + *(static_cast(offset->buffer()))); if (!folded_constant_) { - bm_kernel.setArg(idx++, *(static_cast(mean->buffer()))); + bm_kernel.setArg(idx++, + *(static_cast(mean->buffer()))); bm_kernel.setArg(idx++, *(static_cast(var->buffer()))); bm_kernel.setArg(idx++, epsilon); } bm_kernel.setArg(idx++, *(static_cast(output->buffer()))); + bm_kernel.setArg(idx++, relux_max_limit_); + bm_kernel.setArg(idx++, prelu_alpha_); const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width), @@ -66,64 +89,48 @@ void BatchNormFunctor::operator()( std::vector local_ws(3, 0); local_ws[0] = std::min(channel_blocks, kwg_size); local_ws[1] = std::min(width, kwg_size / local_ws[0]); - local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{local_ws[0], local_ws[1], local_ws[2]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}, - {8, 128, 1}, //SNPE size + local_ws[2] = std::min(height * batch, + kwg_size / (local_ws[0] * local_ws[1])); + return { + {local_ws[0], local_ws[1], local_ws[2]}, + {kwg_size / 16, 4, 4}, + {kwg_size / 32, 4, 8}, + {kwg_size / 32, 8, 4}, +
{kwg_size / 64, 8, 8}, + {kwg_size / 64, 16, 4}, + {kwg_size / 128, 8, 16}, + {kwg_size / 128, 16, 8}, + {kwg_size / 128, 32, 4}, + {1, kwg_size / 32, 32}, + {1, kwg_size / 64, 64}, + {1, kwg_size / 128, 128}, + {3, 15, 9}, + {7, 15, 9}, + {9, 7, 15}, + {15, 7, 9}, + {1, kwg_size, 1}, + {8, 128, 1}, // SNPE size }; }; cl::Event event; auto func = [&](const std::vector ¶ms) -> cl_int { cl_int error = runtime->command_queue().enqueueNDRangeKernel( - bm_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), - nullptr, &event); + bm_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; return error; }; - std::stringstream ss; - ss << "batch_norm_opencl_kernel_" - << output->dim(0) << "_" - << output->dim(1) << "_" - << output->dim(2) << "_" - << output->dim(3) << "_" - << folded_constant_; + std::string tuning_key = + Concat("batch_norm_opencl_kernel_", activation_, output->dim(0), + output->dim(1), output->dim(2), output->dim(3), folded_constant_); OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun(ss.str(), - lws, - params_generator, - func, - &timer); - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } + Tuner::Get()->template TuneOrRun( + tuning_key, lws, params_generator, func, &timer); + SetFuture(future, event); } -template -struct BatchNormFunctor; -template -struct BatchNormFunctor; +template struct BatchNormFunctor; +template struct BatchNormFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/cl/activation.cl b/mace/kernels/opencl/cl/activation.cl new file mode 100644 index 0000000000000000000000000000000000000000..02ebff0aaeeb6a416f79b14203896ea40b512fe7 --- /dev/null +++ b/mace/kernels/opencl/cl/activation.cl @@ -0,0 +1,17 @@ +#include + +__kernel void activation(__read_only image2d_t input, + __private const DATA_TYPE relux_max_limit, + __private const DATA_TYPE prelu_alpha, + __write_only image2d_t output) { + const int ch_blk = get_global_id(0); + const int w = get_global_id(1); + const int hb = get_global_id(2); + const int width = get_global_size(1); + + const int pos = mad24(ch_blk, width, w); + DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); + DATA_TYPE4 out = do_activation(in, relux_max_limit, prelu_alpha); + WRITE_IMAGET(output, (int2)(pos, hb), out); +} + diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl index f40609664de78c01a9dac3d66aec3f3c5b90d99e..995abc8c1bba8b8dfd3a9cb30fc52f1c05cfa0ba 100644 --- a/mace/kernels/opencl/cl/batch_norm.cl +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -8,7 +8,9 @@ __kernel void batch_norm(__read_only image2d_t input, __read_only image2d_t var, __private const float epsilon, #endif - __write_only image2d_t output) { + __write_only image2d_t output, + __private const DATA_TYPE relux_max_limit, + __private const DATA_TYPE prelu_alpha) { const int ch_blk = get_global_id(0); const int w = get_global_id(1); const int hb = get_global_id(2); @@ -33,8 +35,8 @@ __kernel void batch_norm(__read_only image2d_t input, DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); DATA_TYPE4 out = mad(in, bn_scale, bn_offset); -#ifdef FUSED_RELU - out = fmax(out, 0); +#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || 
defined(USE_TANH) || defined(USE_SIGMOID) + out = do_activation(out, relux_max_limit, prelu_alpha); #endif WRITE_IMAGET(output, (int2)(pos, hb), out); diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h index 499c8164ddc3a0c5158c97e70c6f6ec55f0ccd87..792d2b49b7a82bb40064feb917aa2435805353a6 100644 --- a/mace/kernels/opencl/cl/common.h +++ b/mace/kernels/opencl/cl/common.h @@ -18,7 +18,29 @@ #define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE) #define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE) - __constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + +inline DATA_TYPE4 do_activation(DATA_TYPE4 in, + __private const DATA_TYPE relux_max_limit, + __private const DATA_TYPE prelu_alpha) { + DATA_TYPE4 out; +#ifdef USE_RELU + out = fmax(in, 0); +#endif +#ifdef USE_RELUX + out = clamp(in, 0, relux_max_limit); +#endif +#ifdef USE_PRELU + out = select(prelu_alpha * in, in, in >= 0); +#endif +#ifdef USE_TANH + out = tanh(in); +#endif +#ifdef USE_SIGMOID + out = native_recip(1.0 + native_exp(-in)); +#endif + return out; +} + #endif // MACE_KERNELS_OPENCL_CL_COMMON_H_ diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl index 8929b0488d2c18cf033fdb3a4ae2fc19e0c24326..ce33c093ab2adfaf4bec1876fc20fc289c6d0e10 100644 --- a/mace/kernels/opencl/cl/conv_2d.cl +++ b/mace/kernels/opencl/cl/conv_2d.cl @@ -6,6 +6,8 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t bias, /* cout%4 * cout/4 */ #endif __write_only image2d_t output, + __private const DATA_TYPE relux_max_limit, + __private const DATA_TYPE prelu_alpha, __private const int in_height, __private const int in_width, __private const int in_ch_blks, @@ -115,12 +117,11 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ } } -#ifdef FUSED_RELU - // TODO relux - out0 = fmax(out0, 0); - out1 = fmax(out1, 0); - out2 = fmax(out2, 0); - out3 = fmax(out3, 0); +#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID) + out0 = do_activation(out0, relux_max_limit, prelu_alpha); + out1 = do_activation(out1, relux_max_limit, prelu_alpha); + out2 = do_activation(out2, relux_max_limit, prelu_alpha); + out3 = do_activation(out3, relux_max_limit, prelu_alpha); #endif const int out_x_base = mul24(out_ch_blk, out_width); diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index e741833a31fde27d0764110629b07dd861dfc3af..a34c69ce734e284ff21be5a63d45afe88008484a 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -6,6 +6,8 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] __read_only image2d_t bias, /* cout%4 * cout/4 */ #endif __write_only image2d_t output, + __private const DATA_TYPE relux_max_limit, + __private const DATA_TYPE prelu_alpha, __private const int in_height, __private const int in_width, __private const int in_ch_blks, @@ -90,12 +92,11 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] filter_x_base += 4; } -#ifdef FUSED_RELU - // TODO relux - out0 = fmax(out0, 0); - out1 = fmax(out1, 0); - out2 = fmax(out2, 0); - out3 = fmax(out3, 0); +#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID) + out0 = do_activation(out0, relux_max_limit, prelu_alpha); + out1 = do_activation(out1, relux_max_limit, prelu_alpha); + 
out2 = do_activation(out2, relux_max_limit, prelu_alpha); + out3 = do_activation(out3, relux_max_limit, prelu_alpha); #endif const int out_x_base = mul24(out_ch_blk, width); diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index 019db2378755314bee009dceb084233a7a72db5a..448c2c9e00ea17a653f66ebbbb102a874c2731b6 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -6,6 +6,8 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] __read_only image2d_t bias, /* cout%4 * cout/4 */ #endif __write_only image2d_t output, + __private const DATA_TYPE relux_max_limit, + __private const DATA_TYPE prelu_alpha, __private const int in_height, __private const int in_width, __private const int in_ch_blks, @@ -122,13 +124,12 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] } } -#ifdef FUSED_RELU - // TODO relux - out0 = fmax(out0, 0); - out1 = fmax(out1, 0); - out2 = fmax(out2, 0); - out3 = fmax(out3, 0); - out4 = fmax(out4, 0); +#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID) + out0 = do_activation(out0, relux_max_limit, prelu_alpha); + out1 = do_activation(out1, relux_max_limit, prelu_alpha); + out2 = do_activation(out2, relux_max_limit, prelu_alpha); + out3 = do_activation(out3, relux_max_limit, prelu_alpha); + out4 = do_activation(out4, relux_max_limit, prelu_alpha); #endif const int out_x_base = mul24(out_ch_blk, out_width); diff --git a/mace/kernels/opencl/cl/relu.cl b/mace/kernels/opencl/cl/relu.cl deleted file mode 100644 index e0762bdb0312af28ae49a2b54927e36bfc5b78dc..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/cl/relu.cl +++ /dev/null @@ -1,29 +0,0 @@ -#include - -// Supported data type: half/float -__kernel void relu(__read_only image2d_t input, - __write_only image2d_t output) { - const int ch_blk = get_global_id(0); - const int w = get_global_id(1); - const int hb = get_global_id(2); - const int width = get_global_size(1); - - const int pos = mad24(ch_blk, width, w); - DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); - DATA_TYPE4 out = fmax(in, 0); - WRITE_IMAGET(output, (int2)(pos, hb), out); -} - -__kernel void relux(__read_only image2d_t input, - __private const DATA_TYPE max_limit, - __write_only image2d_t output) { - const int ch_blk = get_global_id(0); - const int w = get_global_id(1); - const int hb = get_global_id(2); - const int width = get_global_size(1); - - const int pos = mad24(ch_blk, width, w); - DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); - DATA_TYPE4 out = clamp(in, 0, max_limit); - WRITE_IMAGET(output, (int2)(pos, hb), out); -} diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc index 5fa46128d9c8f0cb2f7bfe5fd42f6299997613b9..072a0abf0ad16ad3ec7e09d57c7ce90b268cc33b 100644 --- a/mace/kernels/opencl/conv_2d_opencl.cc +++ b/mace/kernels/opencl/conv_2d_opencl.cc @@ -3,52 +3,84 @@ // #include "mace/kernels/conv_2d.h" +#include "mace/kernels/activation.h" #include "mace/kernels/opencl/helper.h" namespace mace { namespace kernels { -extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, - const Tensor *bias, const bool fused_relu, - const int *padding, const int *dilations, - const DataType dt, Tensor *output, +extern void Conv2dOpenclK1x1S1(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *padding, + const int *dilations, + const 
ActivationType activation, + const float relux_max_limit, + const float prelu_alpha, + const DataType dt, + Tensor *output, StatsFuture *future); -extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, - const Tensor *bias, const bool fused_relu, - const int *padding, const int *dilations, - const DataType dt, Tensor *output, +extern void Conv2dOpenclK1x1S2(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha, + const DataType dt, + Tensor *output, StatsFuture *future); -extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, - const Tensor *bias, const bool fused_relu, - const int *padding, const int *dilations, - const DataType dt, Tensor *output, +extern void Conv2dOpenclK3x3S1(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha, + const DataType dt, + Tensor *output, StatsFuture *future); -extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, - const Tensor *bias, const bool fused_relu, - const int *padding, const int *dilations, - const DataType dt, Tensor *output, +extern void Conv2dOpenclK3x3S2(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha, + const DataType dt, + Tensor *output, StatsFuture *future); -extern void Conv2dOpencl(const Tensor *input, const Tensor *filter, - const Tensor *bias, const bool fused_relu, - const uint32_t stride, const int *padding, - const int *dilations, const DataType dt, - Tensor *output, StatsFuture *future); +extern void Conv2dOpencl(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const uint32_t stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha, + const DataType dt, + Tensor *output, + StatsFuture *future); -template +template void Conv2dFunctor::operator()(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output, StatsFuture *future) { - typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter, - const Tensor *bias, const bool fused_relu, - const int *padding, const int *dilations, - const DataType dt, Tensor *output, - StatsFuture *future); + typedef void (*Conv2dOpenclFunction)( + const Tensor *input, const Tensor *filter, const Tensor *bias, + const int *padding, const int *dilations, const ActivationType activation, + const float relux_max_limit, const float prelu_alpha, const DataType dt, + Tensor *output, StatsFuture *future); // Selection matrix: kernel_size x stride_size static const Conv2dOpenclFunction selector[5][2] = { {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2}, @@ -73,8 +105,8 @@ void Conv2dFunctor::operator()(const Tensor *input, std::vector output_shape(4); std::vector paddings(2); kernels::CalcNHWCPaddingAndOutputSize( - input->shape().data(), filter->shape().data(), dilations_, - strides_, paddings_, output_shape.data(), paddings.data()); + input->shape().data(), filter->shape().data(), dilations_, strides_, + paddings_, output_shape.data(), paddings.data()); std::vector output_image_shape; CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); @@ -83,20 
+115,18 @@ void Conv2dFunctor::operator()(const Tensor *input, if (kernel_h == kernel_w && kernel_h <= 5 && selector[kernel_h - 1][strides_[0] - 1] != nullptr) { auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; - conv2d_func(input, filter, bias, false, paddings.data(), dilations_, - DataTypeToEnum::value, output, future); + conv2d_func(input, filter, bias, paddings.data(), dilations_, activation_, + relux_max_limit_, prelu_alpha_, DataTypeToEnum::value, + output, future); } else { - Conv2dOpencl(input, filter, bias, false, strides_[0], - paddings.data(), dilations_, DataTypeToEnum::value, - output, future); + Conv2dOpencl(input, filter, bias, strides_[0], paddings.data(), dilations_, + activation_, relux_max_limit_, prelu_alpha_, + DataTypeToEnum::value, output, future); } - } -template -struct Conv2dFunctor; -template -struct Conv2dFunctor; +template struct Conv2dFunctor; +template struct Conv2dFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index 0269c314a2ee98774c84deb686de4b7d854e0afb..a8e9192d2c410f5e5bd7c5802b8f332f50c5b400 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -5,9 +5,10 @@ #include "mace/kernels/conv_2d.h" #include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/kernels/activation.h" #include "mace/kernels/opencl/helper.h" -#include "mace/utils/utils.h" #include "mace/utils/tuner.h" +#include "mace/utils/utils.h" namespace mace { namespace kernels { @@ -15,8 +16,10 @@ namespace kernels { void Conv1x1(const Tensor *input, const Tensor *filter, const Tensor *bias, - const bool fused_relu, const int stride, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha, const DataType dt, Tensor *output, StatsFuture *future) { @@ -44,20 +47,46 @@ void Conv1x1(const Tensor *input, if (bias != nullptr) { built_options.emplace("-DBIAS"); } - if (fused_relu) { - built_options.emplace("-DFUSED_RELU"); + switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case PRELU: + built_options.emplace("-DUSE_PRELU"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation; } auto runtime = OpenCLRuntime::Global(); - auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options); + auto conv_2d_kernel = + runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options); uint32_t idx = 0; - conv_2d_kernel.setArg(idx++, *(static_cast(input->buffer()))); - conv_2d_kernel.setArg(idx++, *(static_cast(filter->buffer()))); + conv_2d_kernel.setArg(idx++, + *(static_cast(input->buffer()))); + conv_2d_kernel.setArg(idx++, + *(static_cast(filter->buffer()))); if (bias != nullptr) { - conv_2d_kernel.setArg(idx++, *(static_cast(bias->buffer()))); + conv_2d_kernel.setArg(idx++, + *(static_cast(bias->buffer()))); } - conv_2d_kernel.setArg(idx++, *(static_cast(output->buffer()))); + conv_2d_kernel.setArg(idx++, + *(static_cast(output->buffer()))); + // FIXME handle flexible data type: half not supported + conv_2d_kernel.setArg(idx++, relux_max_limit); + conv_2d_kernel.setArg(idx++, prelu_alpha); conv_2d_kernel.setArg(idx++, static_cast(input_height));
conv_2d_kernel.setArg(idx++, static_cast(input_width)); conv_2d_kernel.setArg(idx++, static_cast(input_channel_blocks)); @@ -69,86 +98,79 @@ void Conv1x1(const Tensor *input, static_cast(height * batch)}; const std::vector lws = {8, 15, 8}; const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); - auto params_generator = [&]()->std::vector> { + auto params_generator = [&]() -> std::vector> { std::vector local_ws(3, 0); local_ws[0] = std::min(channel_blocks, kwg_size); local_ws[1] = std::min(width_blocks, kwg_size / local_ws[0]); - local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{local_ws[0], local_ws[1], local_ws[2]}, - {kwg_size/16, 4, 4}, - {kwg_size/32, 4, 8}, - {kwg_size/32, 8, 4}, - {kwg_size/64, 8, 8}, - {kwg_size/64, 16, 4}, - {kwg_size/128, 8, 16}, - {kwg_size/128, 16, 8}, - {kwg_size/128, 32, 4}, - {1, kwg_size/32, 32}, - {1, kwg_size/64, 64}, - {1, kwg_size/128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}, - {4, 15, 8}, //SNPE size + local_ws[2] = std::min(height * batch, + kwg_size / (local_ws[0] * local_ws[1])); + return { + {local_ws[0], local_ws[1], local_ws[2]}, + {kwg_size / 16, 4, 4}, + {kwg_size / 32, 4, 8}, + {kwg_size / 32, 8, 4}, + {kwg_size / 64, 8, 8}, + {kwg_size / 64, 16, 4}, + {kwg_size / 128, 8, 16}, + {kwg_size / 128, 16, 8}, + {kwg_size / 128, 32, 4}, + {1, kwg_size / 32, 32}, + {1, kwg_size / 64, 64}, + {1, kwg_size / 128, 128}, + {3, 15, 9}, + {7, 15, 9}, + {9, 7, 15}, + {15, 7, 9}, + {1, kwg_size, 1}, + {4, 15, 8}, // SNPE size }; }; cl::Event event; - auto func = [&](const std::vector& params)->cl_int { + auto func = [&](const std::vector ¶ms) -> cl_int { cl_int error = runtime->command_queue().enqueueNDRangeKernel( - conv_2d_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), - nullptr, &event); + conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; return error; }; - std::stringstream ss; - ss << "conv2d_1x1_opencl_kernel_" - << output->dim(0) << "_" - << output->dim(1) << "_" - << output->dim(2) << "_" - << output->dim(3); + std::string tuning_key = + Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun(ss.str(), - lws, - params_generator, - func, - &timer); - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } + Tuner::Get()->template TuneOrRun( + tuning_key, lws, params_generator, func, &timer); + SetFuture(future, event); } extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, const Tensor *bias, - const bool fused_relu, const int *padding, const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha, const DataType dt, Tensor *output, StatsFuture *future) { - Conv1x1(input, filter, bias, fused_relu, 1, dt, output, future); + Conv1x1(input, filter, bias, 1, activation, relux_max_limit, prelu_alpha, dt, + output, future); }; extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, const Tensor *bias, - const bool fused_relu, const int *padding, const int *dilations, + const ActivationType activation, + const float 
relux_max_limit, + const float prelu_alpha, const DataType dt, Tensor *output, StatsFuture *future) { - Conv1x1(input, filter, bias, fused_relu, 2, dt, output, future); + Conv1x1(input, filter, bias, 2, activation, relux_max_limit, prelu_alpha, dt, + output, future); }; } // namespace kernels diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index 16d86330b21c0a42059c1b49e9f5bddc076df0be..9779107b4a1bd1f524d4faa2766bbb37776b603d 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -2,21 +2,29 @@ // Copyright (c) 2017 XiaoMi All rights reserved. // +#include "mace/kernels/conv_2d.h" #include "mace/core/common.h" #include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/conv_2d.h" +#include "mace/kernels/activation.h" #include "mace/kernels/opencl/helper.h" -#include "mace/utils/utils.h" #include "mace/utils/tuner.h" +#include "mace/utils/utils.h" namespace mace { namespace kernels { -static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, - const Tensor *bias, const bool fused_relu, - const uint32_t stride, const int *padding, - const int *dilations, const DataType dt, - Tensor *output, StatsFuture *future) { +static void Conv2d3x3S12(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const uint32_t stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha, + const DataType dt, + Tensor *output, + StatsFuture *future) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -34,20 +42,45 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace("-DSTRIDE=" + ToString(stride)); - if (fused_relu) { - built_options.emplace("-DFUSED_RELU"); + switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case PRELU: + built_options.emplace("-DUSE_PRELU"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation; } auto runtime = OpenCLRuntime::Global(); - auto conv_2d_kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options); + auto conv_2d_kernel = + runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options); uint32_t idx = 0; - conv_2d_kernel.setArg(idx++, *(static_cast(input->buffer()))); - conv_2d_kernel.setArg(idx++, *(static_cast(filter->buffer()))); + conv_2d_kernel.setArg(idx++, + *(static_cast(input->buffer()))); + conv_2d_kernel.setArg(idx++, + *(static_cast(filter->buffer()))); if (bias != nullptr) { - conv_2d_kernel.setArg(idx++, *(static_cast(bias->buffer()))); + conv_2d_kernel.setArg(idx++, + *(static_cast(bias->buffer()))); } - conv_2d_kernel.setArg(idx++, *(static_cast(output->buffer()))); + conv_2d_kernel.setArg(idx++, + *(static_cast(output->buffer()))); + conv_2d_kernel.setArg(idx++, relux_max_limit); + conv_2d_kernel.setArg(idx++, prelu_alpha); conv_2d_kernel.setArg(idx++, static_cast(input->dim(1))); conv_2d_kernel.setArg(idx++, static_cast(input->dim(2))); conv_2d_kernel.setArg(idx++, static_cast(input_channel_blocks)); @@ -67,83 +100,75 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, std::vector local_ws(3, 0); local_ws[0] = std::min(channel_blocks, kwg_size); local_ws[1] = std::min(width_blocks, kwg_size / local_ws[0]); - local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{local_ws[0], local_ws[1], local_ws[2]}, - {local_ws[2], local_ws[1], local_ws[0]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}, - {4, 15, 8}, //SNPE size + local_ws[2] = std::min(height * batch, + kwg_size / (local_ws[0] * local_ws[1])); + return { + {local_ws[0], local_ws[1], local_ws[2]}, + {local_ws[2], local_ws[1], local_ws[0]}, + {kwg_size / 16, 4, 4}, + {kwg_size / 32, 4, 8}, + {kwg_size / 32, 8, 4}, + {kwg_size / 64, 8, 8}, + {kwg_size / 64, 16, 4}, + {kwg_size / 128, 8, 16}, + {kwg_size / 128, 16, 8}, + {kwg_size / 128, 32, 4}, + {1, kwg_size / 32, 32}, + {1, kwg_size / 64, 64}, + {1, kwg_size / 128, 128}, + {3, 15, 9}, + {7, 15, 9}, + {9, 7, 15}, + {15, 7, 9}, + {1, kwg_size, 1}, + {4, 15, 8}, // SNPE size }; }; cl::Event event; auto func = [&](const std::vector &params) -> cl_int { cl_int error = runtime->command_queue().enqueueNDRangeKernel( - conv_2d_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), - nullptr, &event); + conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; return error; }; - std::stringstream ss; - ss << "conv2d_3x3_opencl_kernel_" - <<
output->dim(0) << "_" - << output->dim(1) << "_" - << output->dim(2) << "_" - << output->dim(3); + std::string tuning_key = + Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun(ss.str(), - lws, - params_generator, - func, - &timer); - - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } + Tuner::Get()->template TuneOrRun( + tuning_key, lws, params_generator, func, &timer); + SetFuture(future, event); } void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, const Tensor *bias, - const bool fused_relu, const int *padding, const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha, const DataType dt, Tensor *output, StatsFuture *future) { - Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dilations, dt, output, future); + Conv2d3x3S12(input, filter, bias, 1, padding, dilations, activation, + relux_max_limit, prelu_alpha, dt, output, future); }; void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, const Tensor *bias, - const bool fused_relu, const int *padding, const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha, const DataType dt, Tensor *output, StatsFuture *future) { - Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dilations, dt, output, future); + Conv2d3x3S12(input, filter, bias, 2, padding, dilations, activation, + relux_max_limit, prelu_alpha, dt, output, future); }; } // namespace kernels diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc index caf6d5870372a37bc21bb501755577857775c0da..8929579907b006ffeaf9b3ac3bb25260077880ee 100644 --- a/mace/kernels/opencl/conv_2d_opencl_general.cc +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -2,21 +2,29 @@ // Copyright (c) 2017 XiaoMi All rights reserved. // +#include "mace/kernels/conv_2d.h" #include "mace/core/common.h" #include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/conv_2d.h" +#include "mace/kernels/activation.h" #include "mace/kernels/opencl/helper.h" -#include "mace/utils/utils.h" #include "mace/utils/tuner.h" +#include "mace/utils/utils.h" namespace mace { namespace kernels { -void Conv2dOpencl(const Tensor *input, const Tensor *filter, - const Tensor *bias, const bool fused_relu, - const uint32_t stride, const int *padding, - const int *dilations, const DataType dt, - Tensor *output, StatsFuture *future) { +void Conv2dOpencl(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const uint32_t stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha, + const DataType dt, + Tensor *output, + StatsFuture *future) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -34,20 +42,45 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter, built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace("-DSTRIDE=" + ToString(stride)); - if (fused_relu) { - built_options.emplace("-DFUSED_RELU"); + switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case PRELU: + built_options.emplace("-DUSE_PRELU"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation; } auto runtime = OpenCLRuntime::Global(); - auto conv_2d_kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options); + auto conv_2d_kernel = + runtime->BuildKernel("conv_2d", kernel_name, built_options); uint32_t idx = 0; - conv_2d_kernel.setArg(idx++, *(static_cast(input->buffer()))); - conv_2d_kernel.setArg(idx++, *(static_cast(filter->buffer()))); + conv_2d_kernel.setArg(idx++, + *(static_cast(input->buffer()))); + conv_2d_kernel.setArg(idx++, + *(static_cast(filter->buffer()))); if (bias != nullptr) { - conv_2d_kernel.setArg(idx++, *(static_cast(bias->buffer()))); + conv_2d_kernel.setArg(idx++, + *(static_cast(bias->buffer()))); } - conv_2d_kernel.setArg(idx++, *(static_cast(output->buffer()))); + conv_2d_kernel.setArg(idx++, + *(static_cast(output->buffer()))); + conv_2d_kernel.setArg(idx++, relux_max_limit); + conv_2d_kernel.setArg(idx++, prelu_alpha); conv_2d_kernel.setArg(idx++, static_cast(input->dim(1))); conv_2d_kernel.setArg(idx++, static_cast(input->dim(2))); conv_2d_kernel.setArg(idx++, static_cast(input_channel_blocks)); @@ -69,60 +102,46 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter, std::vector local_ws(3, 0); local_ws[0] = std::min(channel_blocks, kwg_size); local_ws[1] = std::min(width_blocks, kwg_size / local_ws[0]); - local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{local_ws[0], local_ws[1], local_ws[2]}, - {local_ws[2], local_ws[1], local_ws[0]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}, - {4, 15, 8}, //SNPE size + local_ws[2] = std::min(height * batch, + kwg_size / (local_ws[0] * local_ws[1])); + return { + {local_ws[0], local_ws[1], local_ws[2]}, + {local_ws[2], local_ws[1], local_ws[0]}, + {kwg_size / 16, 4, 4}, + {kwg_size / 32, 4, 8}, + {kwg_size / 32, 8, 4}, + {kwg_size / 64, 8, 8}, + {kwg_size / 64, 16, 4}, + {kwg_size / 128, 8, 16}, + {kwg_size / 128, 16, 8}, + {kwg_size / 128, 32, 4}, + {1, kwg_size / 32, 32}, + {1, kwg_size / 64, 64}, + {1, kwg_size / 128, 128}, + {3, 15, 9}, + {7, 15, 9}, + {9, 7, 15}, + {15, 7, 9}, + {1, kwg_size, 1}, + {4, 15, 8}, // SNPE size }; }; cl::Event event; auto func = [&](const std::vector &params) -> cl_int { cl_int error = runtime->command_queue().enqueueNDRangeKernel( - conv_2d_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), - nullptr, &event); + conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; return error; }; - std::stringstream ss; - ss << "conv2d_general_opencl_kernel_" - <<
output->dim(0) << "_" - << output->dim(1) << "_" - << output->dim(2) << "_" - << output->dim(3); + std::string tuning_key = + Concat("conv2d_general_opencl_kernel_", activation, output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun(ss.str(), - lws, - params_generator, - func, - &timer); - - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } + Tuner::Get()->template TuneOrRun( + tuning_key, lws, params_generator, func, &timer); + SetFuture(future, event); } } // namespace kernels diff --git a/mace/kernels/opencl/fused_conv_2d_opencl.cc b/mace/kernels/opencl/fused_conv_2d_opencl.cc deleted file mode 100644 index 2f4e608699042b72af545dda471dcd843a2cfdf5..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/fused_conv_2d_opencl.cc +++ /dev/null @@ -1,99 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#include "mace/kernels/fused_conv_2d.h" -#include "mace/kernels/opencl/helper.h" - -namespace mace { -namespace kernels { - -extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, - const Tensor *bias, const bool fused_relu, - const int *padding, const int *dilations, - const DataType dt, Tensor *output, - StatsFuture *future); - -extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, - const Tensor *bias, const bool fused_relu, - const int *padding, const int *dilations, - const DataType dt, Tensor *output, - StatsFuture *future); - -extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, - const Tensor *bias, const bool fused_relu, - const int *padding, const int *dilations, - const DataType dt, Tensor *output, - StatsFuture *future); - -extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, - const Tensor *bias, const bool fused_relu, - const int *padding, const int *dilations, - const DataType dt, Tensor *output, - StatsFuture *future); - -extern void Conv2dOpencl(const Tensor *input, const Tensor *filter, - const Tensor *bias, const bool fused_relu, - const uint32_t stride, const int *padding, - const int *dilations, const DataType dt, - Tensor *output, StatsFuture *future); - -template -void FusedConv2dFunctor::operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter, - const Tensor *bias, const bool fused_relu, - const int *padding, const int *dilations, - const DataType dt, Tensor *output, - StatsFuture *future); - // Selection matrix: kernel_size x stride_size - static const Conv2dOpenclFunction selector[5][2] = { - {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2}, - {nullptr, nullptr}, - {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2}, - {nullptr, nullptr}, - {nullptr, nullptr}}; - index_t kernel_h = filter->dim(0); - index_t kernel_w = filter->dim(1); - if (!input->is_image() || strides_[0] != strides_[1] || strides_[0] > 2 || - (dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) { - LOG(WARNING) << "OpenCL conv2d kernel with " - << "filter" << kernel_h << "x" << kernel_w << "," - << " stride " << strides_[0] << "x" << strides_[1] - << ",dilations " << dilations_[0] << "x" << dilations_[1] - << " and input image: " << input->is_image() - << " is not implemented yet."; - MACE_NOT_IMPLEMENTED; - } - - std::vector output_shape(4); - 
std::vector paddings(2); - kernels::CalcNHWCPaddingAndOutputSize( - input->shape().data(), filter->shape().data(), dilations_, - strides_, paddings_, output_shape.data(), paddings.data()); - - std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); - output->ResizeImage(output_shape, output_image_shape); - - if (kernel_h == kernel_w && kernel_h <= 5 && - selector[kernel_h - 1][strides_[0] - 1] != nullptr) { - auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; - conv2d_func(input, filter, bias, true, paddings.data(), dilations_, - DataTypeToEnum::value, output, future); - } else { - Conv2dOpencl(input, filter, bias, true, strides_[0], paddings.data(), - dilations_, DataTypeToEnum::value, output, future); - } -} - -template -struct FusedConv2dFunctor; -template -struct FusedConv2dFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index c0752f4587ad3fe6ba4bf8e021d2bce161a3bf5f..2927dbfff77000166027cd377ff05dc1337bcc00 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -4,8 +4,12 @@ #ifndef MACE_KERNELS_OPENCL_HELPER_H_ #define MACE_KERNELS_OPENCL_HELPER_H_ + +#include "mace/core/runtime/opencl/cl2_header.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/types.h" #include "mace/utils/utils.h" +#include "mace/core/future.h" namespace mace { namespace kernels { @@ -28,6 +32,40 @@ std::string DtToCLDt(const DataType dt); std::string DtToUpstreamCLDt(const DataType dt); +inline void SetFuture(StatsFuture *future, const cl::Event &event) { + if (future != nullptr) { + future->wait_fn = [event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + OpenCLRuntime::Global()->GetCallStats(event, stats); + } + }; + } +} + +namespace { +template +void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) { + (*ss) << v; +} + +template +void AppendToStream(std::stringstream *ss, + const std::string &delimiter, + T first, + Args... args) { + (*ss) << first << delimiter; + AppendToStream(ss, delimiter, args...); +} +} // namespace + +template +std::string Concat(Args... args) { + std::stringstream ss; + AppendToStream(&ss, "_", args...); + return ss.str(); +} + } // namespace kernels } // namespace mace #endif // MACE_KERNELS_OPENCL_HELPER_H_ diff --git a/mace/kernels/opencl/relu_opencl.cc b/mace/kernels/opencl/relu_opencl.cc deleted file mode 100644 index 30f81739621dacc05970f2aefada1fc3a61f12d3..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/relu_opencl.cc +++ /dev/null @@ -1,120 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. 
-// - -#include "mace/kernels/relu.h" -#include "mace/core/runtime/opencl/cl2_header.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/utils.h" -#include "mace/utils/tuner.h" - -namespace mace { -namespace kernels { - -template -void ReluFunctor::operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - - const index_t batch = input->dim(0); - const index_t height = input->dim(1); - const index_t width = input->dim(2); - const index_t channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - - auto runtime = OpenCLRuntime::Global(); - - std::set built_options; - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - cl::Kernel relu_kernel; - if (max_limit_ < 0) { - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("relu"); - built_options.emplace("-Drelu=" + kernel_name); - relu_kernel = runtime->BuildKernel("relu", kernel_name, built_options); - - uint32_t idx = 0; - relu_kernel.setArg(idx++, *(static_cast(input->buffer()))); - relu_kernel.setArg(idx++, *(static_cast(output->buffer()))); - } else { - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("relux"); - built_options.emplace("-Drelux=" + kernel_name); - relu_kernel = runtime->BuildKernel("relu", kernel_name, built_options); - - uint32_t idx = 0; - relu_kernel.setArg(idx++, *(static_cast(input->buffer()))); - relu_kernel.setArg(idx++, max_limit_); - relu_kernel.setArg(idx++, *(static_cast(output->buffer()))); - } - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - const std::vector lws = {8, 16, 8}; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel); - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(channel_blocks, kwg_size); - local_ws[1] = std::min(width, kwg_size / local_ws[0]); - local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{local_ws[0], local_ws[1], local_ws[2]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}, - {4, 15, 8}, //SNPE size - }; - }; - cl::Event event; - auto func = [&](const std::vector &params) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - relu_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), - nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; - std::stringstream ss; - ss << "relu_opencl_kernel_" - << output->dim(0) << "_" - << output->dim(1) << "_" - << output->dim(2) << "_" - << output->dim(3); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun(ss.str(), - lws, - params_generator, - func, - &timer); - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } -} - -template -struct ReluFunctor; -template -struct ReluFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/relu.h
b/mace/kernels/relu.h deleted file mode 100644 index 19135f5e83b1d8021505429f9cf03879ff0728e4..0000000000000000000000000000000000000000 --- a/mace/kernels/relu.h +++ /dev/null @@ -1,49 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#ifndef MACE_KERNELS_RELU_H_ -#define MACE_KERNELS_RELU_H_ - -#include "mace/core/future.h" -#include "mace/core/tensor.h" - -namespace mace { -namespace kernels { - -template -struct ReluFunctor { - T max_limit_; - - void operator()(const Tensor *input, Tensor *output, StatsFuture *future) { - const T *input_ptr = input->data(); - T *output_ptr = output->mutable_data(); - index_t size = input->size(); - if (max_limit_ < 0) { - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = std::max(input_ptr[i], static_cast(0)); - } - } else { - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = std::min(std::max(input_ptr[i], static_cast(0)), max_limit_); - } - } - } -}; - -template <> -void ReluFunctor::operator()(const Tensor *input, - Tensor *output, - StatsFuture *future); - -template -struct ReluFunctor { - T max_limit_; - - void operator()(const Tensor *input, Tensor *output, StatsFuture *future); -}; - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_RELU_H_ diff --git a/mace/ops/relu.cc b/mace/ops/activation.cc similarity index 59% rename from mace/ops/relu.cc rename to mace/ops/activation.cc index f9f7b3be9b080fc739af959ef0bc469a0f17cc45..5cdffef16dd86852f112356413c829e8c9e5ff4a 100644 --- a/mace/ops/relu.cc +++ b/mace/ops/activation.cc @@ -2,36 +2,36 @@ // Copyright (c) 2017 XiaoMi All rights reserved. // -#include "mace/ops/relu.h" +#include "mace/ops/activation.h" namespace mace { -void Register_Relu(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Relu") +void Register_Activation(OperatorRegistry *op_registry) { + REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") .Device(DeviceType::CPU) .TypeConstraint("T") .Build(), - ReluOp); + ActivationOp); #if MACE_ENABLE_NEON - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Relu") + REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") .Device(DeviceType::NEON) .TypeConstraint("T") .Build(), - ReluOp); + ActivationOp); #endif // MACE_ENABLE_NEON - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Relu") + REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") .Device(DeviceType::OPENCL) .TypeConstraint("T") .Build(), - ReluOp); + ActivationOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("Relu") + REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") .Device(DeviceType::OPENCL) .TypeConstraint("T") .Build(), - ReluOp); + ActivationOp); } } // namespace mace diff --git a/mace/ops/activation.h b/mace/ops/activation.h new file mode 100644 index 0000000000000000000000000000000000000000..04ca0249c18a19fb2c47529d97535f4cd8663073 --- /dev/null +++ b/mace/ops/activation.h @@ -0,0 +1,39 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. 
+// + +#ifndef MACE_OPS_ACTIVATION_H_ +#define MACE_OPS_ACTIVATION_H_ + +#include "mace/core/operator.h" +#include "mace/kernels/activation.h" + +namespace mace { + +template +class ActivationOp : public Operator { + public: + ActivationOp(const OperatorDef &operator_def, Workspace *ws) + : Operator(operator_def, ws), + functor_(kernels::StringToActivationType( + OperatorBase::GetSingleArgument("activation", + "NOOP")), + OperatorBase::GetSingleArgument("max_limit", 0.0f), + OperatorBase::GetSingleArgument("alpha", 0.0f)) {} + + bool Run(StatsFuture *future) override { + const Tensor *input_tensor = this->inputs_[0]; + Tensor *output_tensor = this->outputs_[0]; + output_tensor->ResizeLike(input_tensor); + + functor_(input_tensor, output_tensor, future); + return true; + } + + private: + kernels::ActivationFunctor functor_; +}; + +} // namespace mace + +#endif // MACE_OPS_ACTIVATION_H_ diff --git a/mace/ops/activation_benchmark.cc b/mace/ops/activation_benchmark.cc new file mode 100644 index 0000000000000000000000000000000000000000..63d0cf7fa3dd6545d98db7a3c834ac065268eead --- /dev/null +++ b/mace/ops/activation_benchmark.cc @@ -0,0 +1,311 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include +#include "mace/core/operator.h" +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +template +static void ReluBenchmark( + int iters, int batch, int channels, int height, int width) { + mace::testing::StopTiming(); + + OpsTestNet net; + + // Add input data + net.AddRandomInput("Input", {batch, height, width, channels}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", + kernels::BufferType::IN_OUT); + + OpDefBuilder("Activation", "ReluBM") + .Input("InputImage") + .Output("Output") + .AddStringArg("activation", "RELU") + .Finalize(net.NewOperatorDef()); + } else { + OpDefBuilder("Activation", "ReluBM") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "RELU") + .Finalize(net.NewOperatorDef()); + } + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + } + net.Sync(); +} + +#define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::ItemsProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + ReluBenchmark(iters, N, C, H, W); \ + } \ + BENCHMARK(BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + +#define BM_RELU(N, C, H, W, TYPE) \ + BM_RELU_MACRO(N, C, H, W, TYPE, CPU); \ + BM_RELU_MACRO(N, C, H, W, TYPE, OPENCL); + +BM_RELU(1, 1, 512, 512, float); +BM_RELU(1, 3, 128, 128, float); +BM_RELU(1, 3, 512, 512, float); +BM_RELU(1, 32, 112, 112, float); +BM_RELU(1, 64, 256, 256, float); + +template +static void ReluxBenchmark( + int iters, int batch, int channels, int height, int width) { + mace::testing::StopTiming(); + + OpsTestNet net; + + // Add input data + net.AddRandomInput("Input", {batch, height, width, channels}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", + kernels::BufferType::IN_OUT); + + OpDefBuilder("Activation", "ReluxBM") + .Input("InputImage") + .Output("Output") + .AddStringArg("activation", "RELUX") + .AddFloatArg("max_limit", 6.0) + .Finalize(net.NewOperatorDef()); + } else { + OpDefBuilder("Activation", "ReluxBM") + .Input("Input") + .Output("Output") + 
.AddStringArg("activation", "RELUX") + .AddFloatArg("max_limit", 6.0) + .Finalize(net.NewOperatorDef()); + } + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + } + net.Sync(); +} + +#define BM_RELUX_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::ItemsProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + ReluxBenchmark(iters, N, C, H, W); \ + } \ + BENCHMARK(BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + +#define BM_RELUX(N, C, H, W, TYPE) \ + BM_RELUX_MACRO(N, C, H, W, TYPE, CPU); \ + BM_RELUX_MACRO(N, C, H, W, TYPE, OPENCL); + +BM_RELUX(1, 1, 512, 512, float); +BM_RELUX(1, 3, 128, 128, float); +BM_RELUX(1, 3, 512, 512, float); +BM_RELUX(1, 32, 112, 112, float); +BM_RELUX(1, 64, 256, 256, float); + +template +static void PreluBenchmark( + int iters, int batch, int channels, int height, int width) { + mace::testing::StopTiming(); + + OpsTestNet net; + + // Add input data + net.AddRandomInput("Input", {batch, height, width, channels}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", + kernels::BufferType::IN_OUT); + + OpDefBuilder("Activation", "PreluBM") + .Input("InputImage") + .Output("Output") + .AddStringArg("activation", "PRELU") + .AddFloatArg("alpha", 2.0) + .Finalize(net.NewOperatorDef()); + } else { + OpDefBuilder("Activation", "PreluBM") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "PRELU") + .AddFloatArg("alpha", 2.0) + .Finalize(net.NewOperatorDef()); + } + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + } + net.Sync(); +} + +#define BM_PRELU_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::ItemsProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + PreluBenchmark(iters, N, C, H, W); \ + } \ + BENCHMARK(BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + +#define BM_PRELU(N, C, H, W, TYPE) \ + BM_PRELU_MACRO(N, C, H, W, TYPE, CPU); \ + BM_PRELU_MACRO(N, C, H, W, TYPE, OPENCL); + +BM_PRELU(1, 1, 512, 512, float); +BM_PRELU(1, 3, 128, 128, float); +BM_PRELU(1, 3, 512, 512, float); +BM_PRELU(1, 32, 112, 112, float); +BM_PRELU(1, 64, 256, 256, float); + +template +static void TanhBenchmark( + int iters, int batch, int channels, int height, int width) { + mace::testing::StopTiming(); + + OpsTestNet net; + + // Add input data + net.AddRandomInput("Input", {batch, height, width, channels}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", + kernels::BufferType::IN_OUT); + + OpDefBuilder("Activation", "TanhBM") + .Input("InputImage") + .Output("Output") + .AddStringArg("activation", "TANH") + .Finalize(net.NewOperatorDef()); + } else { + OpDefBuilder("Activation", "TanhBM") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "TANH") + .Finalize(net.NewOperatorDef()); + } + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + } + net.Sync(); +} + +#define BM_TANH_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void 
BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::ItemsProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + TanhBenchmark(iters, N, C, H, W); \ + } \ + BENCHMARK(BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + +#define BM_TANH(N, C, H, W, TYPE) \ + BM_TANH_MACRO(N, C, H, W, TYPE, CPU); \ + BM_TANH_MACRO(N, C, H, W, TYPE, OPENCL); + +BM_TANH(1, 1, 512, 512, float); +BM_TANH(1, 3, 128, 128, float); +BM_TANH(1, 3, 512, 512, float); +BM_TANH(1, 32, 112, 112, float); +BM_TANH(1, 64, 256, 256, float); + +template +static void SigmoidBenchmark( + int iters, int batch, int channels, int height, int width) { + mace::testing::StopTiming(); + + OpsTestNet net; + + // Add input data + net.AddRandomInput("Input", {batch, height, width, channels}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", + kernels::BufferType::IN_OUT); + + OpDefBuilder("Activation", "SigmoidBM") + .Input("InputImage") + .Output("Output") + .AddStringArg("activation", "SIGMOID") + .Finalize(net.NewOperatorDef()); + } else { + OpDefBuilder("Activation", "SigmoidBM") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "SIGMOID") + .Finalize(net.NewOperatorDef()); + } + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + } + net.Sync(); +} + +#define BM_SIGMOID_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::ItemsProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + SigmoidBenchmark(iters, N, C, H, W); \ + } \ + BENCHMARK(BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + +#define BM_SIGMOID(N, C, H, W, TYPE) \ + BM_SIGMOID_MACRO(N, C, H, W, TYPE, CPU); \ + BM_SIGMOID_MACRO(N, C, H, W, TYPE, OPENCL); + +BM_SIGMOID(1, 1, 512, 512, float); +BM_SIGMOID(1, 3, 128, 128, float); +BM_SIGMOID(1, 3, 512, 512, float); +BM_SIGMOID(1, 32, 112, 112, float); +BM_SIGMOID(1, 64, 256, 256, float); + +} // namespace mace diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e99579ab4bd7be3bc9f4af17351174284ef53acd --- /dev/null +++ b/mace/ops/activation_test.cc @@ -0,0 +1,400 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. 
+// + +#include "mace/core/operator.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { + +class ActivationOpTest : public OpsTestBase {}; + +template +void TestSimpleRelu() { + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {2, 2, 2, 2}, + {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", + kernels::BufferType::IN_OUT); + + OpDefBuilder("Activation", "ReluTest") + .Input("InputImage") + .Output("OutputImage") + .AddStringArg("activation", "RELU") + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", + kernels::BufferType::IN_OUT); + } else { + OpDefBuilder("Activation", "ReluTest") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "RELU") + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + } + + auto expected = CreateTensor( + {2, 2, 2, 2}, {0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); +} + +TEST_F(ActivationOpTest, CPUSimpleRelu) { TestSimpleRelu(); } + +#if __ARM_NEON +TEST_F(ActivationOpTest, NEONSimpleRelu) { TestSimpleRelu(); } +#endif + +TEST_F(ActivationOpTest, OPENCLSimpleRelu) { + TestSimpleRelu(); +} + +template +void TestUnalignedSimpleRelu() { + OpsTestNet net; + + // Add input data + net.AddInputFromArray("Input", {1, 3, 2, 1}, {-7, 7, -6, 6, -5, 5}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", + kernels::BufferType::IN_OUT); + + OpDefBuilder("Activation", "ReluTest") + .Input("InputImage") + .Output("OutputImage") + .AddStringArg("activation", "RELU") + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", + kernels::BufferType::IN_OUT); + } else { + OpDefBuilder("Activation", "ReluTest") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "RELU") + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + } + + auto expected = CreateTensor({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); +} + +TEST_F(ActivationOpTest, CPUUnalignedSimpleRelu) { + TestUnalignedSimpleRelu(); +} + +#if __ARM_NEON +TEST_F(ActivationOpTest, NEONUnalignedSimpleRelu) { + TestUnalignedSimpleRelu(); +} +#endif + +TEST_F(ActivationOpTest, OPENCLUnalignedSimpleRelu) { + TestUnalignedSimpleRelu(); +} + +template +void TestSimpleRelux() { + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {2, 2, 2, 2}, + {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", + kernels::BufferType::IN_OUT); + + OpDefBuilder("Activation", "ReluxTest") + .Input("InputImage") + .Output("OutputImage") + .AddStringArg("activation", "RELUX") + .AddFloatArg("max_limit", 6) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", + kernels::BufferType::IN_OUT); + } else { + OpDefBuilder("Activation", "ReluxTest") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "RELUX") + .AddFloatArg("max_limit", 6) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + } + + auto expected = CreateTensor( + {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); +} + +TEST_F(ActivationOpTest, 
CPUSimple) { TestSimpleRelux(); } + +#if __ARM_NEON +TEST_F(ActivationOpTest, NEONSimple) { TestSimpleRelux(); } +#endif + +TEST_F(ActivationOpTest, OPENCLSimple) { + TestSimpleRelux(); +} + +template +void TestSimpleReluRelux() { + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {2, 2, 2, 2}, + {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", + kernels::BufferType::IN_OUT); + + OpDefBuilder("Activation", "ReluxTest") + .Input("InputImage") + .Output("OutputImage") + .AddStringArg("activation", "RELUX") + .AddFloatArg("max_limit", 6) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", + kernels::BufferType::IN_OUT); + } else { + OpDefBuilder("Activation", "ReluxTest") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "RELUX") + .AddFloatArg("max_limit", 6) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + } + + auto expected = CreateTensor( + {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); +} + +TEST_F(ActivationOpTest, CPUSimpleRelux) { + TestSimpleReluRelux(); +} + +#if __ARM_NEON +TEST_F(ActivationOpTest, NEONSimpleRelux) { + TestSimpleReluRelux(); +} +#endif + +TEST_F(ActivationOpTest, OPENCLSimpleRelux) { + TestSimpleReluRelux(); +} + +template +void TestSimplePrelu() { + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {2, 2, 2, 2}, + {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", + kernels::BufferType::IN_OUT); + + OpDefBuilder("Activation", "PreluTest") + .Input("InputImage") + .Output("OutputImage") + .AddStringArg("activation", "PRELU") + .AddFloatArg("alpha", 2.0) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", + kernels::BufferType::IN_OUT); + } else { + OpDefBuilder("Activation", "PreluTest") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "PRELU") + .AddFloatArg("alpha", 2.0) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + } + + auto expected = CreateTensor( + {2, 2, 2, 2}, {-14, 7, -12, 6, -10, 5, -8, 4, -6, 3, -4, 2, -2, 1, 0, 0}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); +} + +TEST_F(ActivationOpTest, CPUSimplePrelu) { TestSimplePrelu(); } + +#if __ARM_NEON +TEST_F(ActivationOpTest, NEONSimplePrelu) { + TestSimplePrelu(); +} +#endif + +TEST_F(ActivationOpTest, OPENCLSimplePrelu) { + TestSimplePrelu(); +} + +template +void TestSimpleTanh() { + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {2, 2, 2, 2}, + {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", + kernels::BufferType::IN_OUT); + + OpDefBuilder("Activation", "TanhTest") + .Input("InputImage") + .Output("OutputImage") + .AddStringArg("activation", "TANH") + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", + kernels::BufferType::IN_OUT); + } else { + OpDefBuilder("Activation", "TanhTest") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "TANH") + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + } + + auto expected = CreateTensor( + {2, 2, 2, 
2}, + {-0.99999834, 0.99999834, -0.99998771, 0.99998771, -0.9999092, 0.9999092, + -0.9993293, 0.9993293, -0.99505475, 0.99505475, -0.96402758, 0.96402758, + -0.76159416, 0.76159416, 0., 0.}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); +} + +TEST_F(ActivationOpTest, CPUSimpleTanh) { TestSimpleTanh(); } + +#if __ARM_NEON +TEST_F(ActivationOpTest, NEONSimpleTanh) { TestSimpleTanh(); } +#endif + +TEST_F(ActivationOpTest, OPENCLSimpleTanh) { + TestSimpleTanh(); +} + +template +void TestSimpleSigmoid() { + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {2, 2, 2, 2}, + {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", + kernels::BufferType::IN_OUT); + + OpDefBuilder("Activation", "SigmoidTest") + .Input("InputImage") + .Output("OutputImage") + .AddStringArg("activation", "SIGMOID") + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", + kernels::BufferType::IN_OUT); + } else { + OpDefBuilder("Activation", "SigmoidTest") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "SIGMOID") + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + } + + auto expected = CreateTensor( + {2, 2, 2, 2}, + {9.11051194e-04, 9.99088949e-01, 2.47262316e-03, 9.97527377e-01, + 6.69285092e-03, 9.93307149e-01, 1.79862100e-02, 9.82013790e-01, + 4.74258732e-02, 9.52574127e-01, 1.19202922e-01, 8.80797078e-01, + 2.68941421e-01, 7.31058579e-01, 5.00000000e-01, 5.00000000e-01}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); +} + +TEST_F(ActivationOpTest, CPUSimpleSigmoid) { + TestSimpleSigmoid(); +} + +#if __ARM_NEON +TEST_F(ActivationOpTest, NEONSimpleSigmoid) { + TestSimpleSigmoid(); +} +#endif + +TEST_F(ActivationOpTest, OPENCLSimpleSigmoid) { + TestSimpleSigmoid(); +} + +} // namespace mace diff --git a/mace/ops/batch_norm.h b/mace/ops/batch_norm.h index c6d2dd27f7ee36dcf08bac799b199174094f7892..a3a0c136ba12105fa121dbe18372c1d41112e75e 100644 --- a/mace/ops/batch_norm.h +++ b/mace/ops/batch_norm.h @@ -6,6 +6,7 @@ #define MACE_OPS_BATCH_NORM_H_ #include "mace/core/operator.h" +#include "mace/kernels/activation.h" #include "mace/kernels/batch_norm.h" namespace mace { @@ -14,9 +15,10 @@ template class BatchNormOp : public Operator { public: BatchNormOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), functor_(false, false) { - epsilon_ = - OperatorBase::GetSingleArgument("epsilon", static_cast(1e-4)); + : Operator(operator_def, ws), + functor_(false, kernels::ActivationType::NOOP, 0.0f, 0.0f) { + epsilon_ = OperatorBase::GetSingleArgument("epsilon", + static_cast(1e-4)); } bool Run(StatsFuture *future) override { diff --git a/mace/ops/conv_2d.h b/mace/ops/conv_2d.h index 1df020118e3e7f2c290fd5cbb6424015b9559cec..79af489401d27ec362d2d952f5df0a70e8d5aeee 100644 --- a/mace/ops/conv_2d.h +++ b/mace/ops/conv_2d.h @@ -18,9 +18,12 @@ class Conv2dOp : public ConvPool2dOpBase { public: Conv2dOp(const OperatorDef &op_def, Workspace *ws) : ConvPool2dOpBase(op_def, ws), - functor_(this->strides_.data(), this->padding_, - this->dilations_.data()) { - } + functor_(this->strides_.data(), + this->padding_, + this->dilations_.data(), + kernels::ActivationType::NOOP, + 0.0f, + 0.0f) {} bool Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/folded_batch_norm.h b/mace/ops/folded_batch_norm.h index 
390a30c7119fb20efa5d733dedade9e399b00e32..7e7c7e5d855ffcaf4a7fea7b5feb332fc8695523 100644 --- a/mace/ops/folded_batch_norm.h +++ b/mace/ops/folded_batch_norm.h @@ -15,8 +15,12 @@ class FoldedBatchNormOp : public Operator { public: FoldedBatchNormOp(const OperatorDef &operator_def, Workspace *ws) : Operator(operator_def, ws), - functor_(true, OperatorBase::GetSingleArgument("fused_relu", false)) { - } + functor_(true, + kernels::StringToActivationType( + OperatorBase::GetSingleArgument("activation", + "NOOP")), + OperatorBase::GetSingleArgument("max_limit", 0.0f), + OperatorBase::GetSingleArgument("alpha", 0.0f)) {} bool Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/fused_conv_2d.h b/mace/ops/fused_conv_2d.h index 8ad36d12b019f7c1373725254853be043f86437c..c11cb7784af93a9ca17b26df99db27f96b94d282 100644 --- a/mace/ops/fused_conv_2d.h +++ b/mace/ops/fused_conv_2d.h @@ -8,7 +8,7 @@ #include #include "mace/core/operator.h" -#include "mace/kernels/fused_conv_2d.h" +#include "mace/kernels/conv_2d.h" #include "mace/ops/conv_pool_2d_base.h" namespace mace { @@ -18,9 +18,14 @@ class FusedConv2dOp : public ConvPool2dOpBase { public: FusedConv2dOp(const OperatorDef &op_def, Workspace *ws) : ConvPool2dOpBase(op_def, ws), - functor_(this->strides_.data(), this->padding_, - this->dilations_.data()) { - } + functor_(this->strides_.data(), + this->padding_, + this->dilations_.data(), + kernels::StringToActivationType( + OperatorBase::GetSingleArgument("activation", + "NOOP")), + OperatorBase::GetSingleArgument("max_limit", 0.0f), + OperatorBase::GetSingleArgument("alpha", 0.0f)) {} bool Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); @@ -34,7 +39,7 @@ class FusedConv2dOp : public ConvPool2dOpBase { } private: - kernels::FusedConv2dFunctor functor_; + kernels::Conv2dFunctor functor_; protected: OP_INPUT_TAGS(INPUT, FILTER, BIAS); diff --git a/mace/ops/global_avg_pooling_benchmark.cc b/mace/ops/global_avg_pooling_benchmark.cc index 523ea924d692ead03169a633c119afbc5291f63f..f4decad281a2aab66612fc93e5e13fc83b74e9da 100644 --- a/mace/ops/global_avg_pooling_benchmark.cc +++ b/mace/ops/global_avg_pooling_benchmark.cc @@ -47,9 +47,9 @@ static void GlobalAvgPooling( BENCHMARK(BM_GLOBAL_AVG_POOLING_##N##_##C##_##H##_##W##_##DEVICE) #define BM_GLOBAL_AVG_POOLING(N, C, H, W) \ - BM_GLOBAL_AVG_POOLING_MACRO(N, C, H, W, CPU); \ - BM_GLOBAL_AVG_POOLING_MACRO(N, C, H, W, NEON); + BM_GLOBAL_AVG_POOLING_MACRO(N, C, H, W, CPU); +// BM_GLOBAL_AVG_POOLING_MACRO(N, C, H, W, NEON); BM_GLOBAL_AVG_POOLING(1, 3, 7, 7); BM_GLOBAL_AVG_POOLING(1, 3, 64, 64); -BM_GLOBAL_AVG_POOLING(1, 3, 256, 256); \ No newline at end of file +BM_GLOBAL_AVG_POOLING(1, 3, 256, 256); diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc index 2a6580d82a0c20764438bc38d206377a7cd8cf1c..1a4d1925a8d6ef8fa3a2c58900bae999f62bb8ca 100644 --- a/mace/ops/pooling_benchmark.cc +++ b/mace/ops/pooling_benchmark.cc @@ -63,8 +63,8 @@ static void Pooling(int iters, BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_##DEVICE) #define BM_POOLING(N, C, H, W, K, S, PA, PO) \ - BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU); \ - BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, NEON); + BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU); +// BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, NEON); BM_POOLING(1, 3, 129, 129, 2, 2, SAME, MAX); BM_POOLING(1, 3, 257, 257, 2, 2, SAME, MAX); diff --git a/mace/ops/relu.h b/mace/ops/relu.h deleted file mode 100644 index 
489e21d12f6f5ebb79db6581d0c902427c49ef4b..0000000000000000000000000000000000000000 --- a/mace/ops/relu.h +++ /dev/null @@ -1,36 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#ifndef MACE_OPS_RELU_H_ -#define MACE_OPS_RELU_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/relu.h" - -namespace mace { - -template -class ReluOp : public Operator { - public: - ReluOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws) { - functor_.max_limit_ = - OperatorBase::GetSingleArgument("max_limit", static_cast(-1)); - } - bool Run(StatsFuture *future) override { - const Tensor *input_tensor = this->inputs_[0]; - Tensor *output_tensor = this->outputs_[0]; - output_tensor->ResizeLike(input_tensor); - - functor_(input_tensor, output_tensor, future); - return true; - } - - private: - kernels::ReluFunctor functor_; -}; - -} // namespace mace - -#endif // MACE_OPS_RELU_H_ diff --git a/mace/ops/relu_benchmark.cc b/mace/ops/relu_benchmark.cc deleted file mode 100644 index 1b5d36245a027b3a9c4fe84f1c7cbc85a2050415..0000000000000000000000000000000000000000 --- a/mace/ops/relu_benchmark.cc +++ /dev/null @@ -1,68 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#include -#include "mace/core/operator.h" -#include "mace/core/testing/test_benchmark.h" -#include "mace/ops/ops_test_util.h" - -namespace mace { -template -static void ReluBenchmark( - int iters, int batch, int channels, int height, int width) { - mace::testing::StopTiming(); - - OpsTestNet net; - - // Add input data - net.AddRandomInput("Input", {batch, height, width, channels}); - - if (D == DeviceType::OPENCL) { - BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); - - OpDefBuilder("Relu", "ReluBM") - .Input("InputImage") - .Output("Output") - .Finalize(net.NewOperatorDef()); - } else { - OpDefBuilder("Relu", "ReluBM") - .Input("Input") - .Output("Output") - .Finalize(net.NewOperatorDef()); - } - - // Warm-up - for (int i = 0; i < 5; ++i) { - net.RunOp(D); - } - net.Sync(); - - mace::testing::StartTiming(); - while (iters--) { - net.RunOp(D); - } - net.Sync(); -} - -#define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE) \ - static void BM_RELU_##N##C##H##W##_##TYPE##_##DEVICE(int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::ItemsProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - ReluBenchmark(iters, N, C, H, W); \ - } \ - BENCHMARK(BM_RELU_##N##C##H##W##_##TYPE##_##DEVICE) - -#define BM_RELU(N, C, H, W, TYPE) \ - BM_RELU_MACRO(N, C, H, W, TYPE, CPU); \ - BM_RELU_MACRO(N, C, H, W, TYPE, NEON); \ - BM_RELU_MACRO(N, C, H, W, TYPE, OPENCL); - -BM_RELU(1, 1, 512, 512, float); -BM_RELU(1, 3, 128, 128, float); -BM_RELU(1, 3, 512, 512, float); -BM_RELU(1, 32, 112, 112, float); -BM_RELU(1, 64, 256, 256, float); -} // namespace mace diff --git a/mace/ops/relu_test.cc b/mace/ops/relu_test.cc deleted file mode 100644 index e2a59a231472bdd872d6d3dabdd254e51717db7f..0000000000000000000000000000000000000000 --- a/mace/ops/relu_test.cc +++ /dev/null @@ -1,215 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. 
-// - -#include "mace/core/operator.h" -#include "mace/ops/ops_test_util.h" - -namespace mace { - -class ReluOpTest : public OpsTestBase {}; - -template -void TestSimple() { - OpsTestNet net; - - // Add input data - net.AddInputFromArray( - "Input", {2, 2, 2, 2}, - {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); - - if (D == DeviceType::OPENCL) { - BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); - - OpDefBuilder("Relu", "ReluTest") - .Input("InputImage") - .Output("OutputImage") - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - - // Transfer output - ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); - } else { - OpDefBuilder("Relu", "ReluTest") - .Input("Input") - .Output("Output") - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - } - - auto expected = CreateTensor( - {2, 2, 2, 2}, {0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); - - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); -} - -TEST_F(ReluOpTest, CPUSimple) { TestSimple(); } - -#if __ARM_NEON -TEST_F(ReluOpTest, NEONSimple) { TestSimple(); } -#endif - -TEST_F(ReluOpTest, OPENCLSimple) { TestSimple(); } - -template -void TestUnalignedSimple() { - OpsTestNet net; - - // Add input data - net.AddInputFromArray("Input", {1, 3, 2, 1}, {-7, 7, -6, 6, -5, 5}); - - if (D == DeviceType::OPENCL) { - BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); - - OpDefBuilder("Relu", "ReluTest") - .Input("InputImage") - .Output("OutputImage") - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - - // Transfer output - ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); - } else { - OpDefBuilder("Relu", "ReluTest") - .Input("Input") - .Output("Output") - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - } - - auto expected = CreateTensor({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5}); - - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); -} - -TEST_F(ReluOpTest, CPUUnalignedSimple) { - TestUnalignedSimple(); -} - -#if __ARM_NEON -TEST_F(ReluOpTest, NEONUnalignedSimple) { - TestUnalignedSimple(); -} -#endif - -TEST_F(ReluOpTest, OPENCLUnalignedSimple) { - TestUnalignedSimple(); -} - -template -void TestSimpleReluX() { - OpsTestNet net; - - // Add input data - net.AddInputFromArray( - "Input", {2, 2, 2, 2}, - {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); - - if (D == DeviceType::OPENCL) { - BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); - - OpDefBuilder("Relu", "ReluTest") - .Input("InputImage") - .Output("OutputImage") - .AddFloatArg("max_limit", 6) - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - - // Transfer output - ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); - } else { - OpDefBuilder("Relu", "ReluTest") - .Input("Input") - .Output("Output") - .AddFloatArg("max_limit", 6) - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - } - - auto expected = CreateTensor( - {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); - - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); -} - -TEST_F(ReluOpTest, CPUSimpleReluX) { TestSimpleReluX(); } - -#if __ARM_NEON -TEST_F(ReluOpTest, NEONSimpleReluX) { TestSimpleReluX(); } -#endif - -TEST_F(ReluOpTest, OPENCLSimpleReluX) { TestSimpleReluX(); } - -template -void TestUnalignedSimpleReluX() { - OpsTestNet net; - - // Add input data - net.AddInputFromArray("Input", {1, 1, 7, 1}, - {-7, 7, -6, 6, -5, 5, -4}); - - if (D == 
DeviceType::OPENCL) { - BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); - - OpDefBuilder("Relu", "ReluTest") - .Input("InputImage") - .Output("OutputImage") - .AddFloatArg("max_limit", 6) - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - - // Transfer output - ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); - } else { - OpDefBuilder("Relu", "ReluTest") - .Input("Input") - .Output("Output") - .AddFloatArg("max_limit", 6) - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - } - - auto expected = CreateTensor({1, 1, 7, 1}, {0, 6, 0, 6, 0, 5, 0}); - - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); -} - -TEST_F(ReluOpTest, CPUUnalignedSimpleReluX) { - TestUnalignedSimpleReluX(); -} - -#if __ARM_NEON -TEST_F(ReluOpTest, NEONUnalignedSimpleReluX) { - TestUnalignedSimpleReluX(); -} -#endif - -TEST_F(ReluOpTest, OPENCLUnalignedSimpleReluX) { - TestUnalignedSimpleReluX(); -} - -} // namespace mace
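The general conv2d path now derives its auto-tuning key from the variadic Concat() helper added in mace/kernels/opencl/helper.h instead of hand-written stringstream code. The following standalone sketch (illustrative only, not part of the patch) mirrors the recursive AppendToStream/Concat pair and shows the underscore-joined key format it produces:

    // Standalone sketch of the Concat() tuning-key helper from
    // mace/kernels/opencl/helper.h: arguments of mixed types are streamed
    // into a stringstream with "_" between consecutive items.
    #include <iostream>
    #include <sstream>
    #include <string>

    template <typename T>
    void AppendToStream(std::stringstream *ss, const std::string &delimiter,
                        T v) {
      (*ss) << v;  // last element: no trailing delimiter
    }

    template <typename T, typename... Args>
    void AppendToStream(std::stringstream *ss, const std::string &delimiter,
                        T first, Args... args) {
      (*ss) << first << delimiter;
      AppendToStream(ss, delimiter, args...);
    }

    template <typename... Args>
    std::string Concat(Args... args) {
      std::stringstream ss;
      AppendToStream(&ss, "_", args...);
      return ss.str();
    }

    int main() {
      // Mirrors the key built in Conv2dOpencl; here 1 stands for the RELU
      // enum value and {1, 32, 32, 64} for a hypothetical output shape.
      std::cout << Concat("conv2d_general_opencl_kernel_", 1, 1, 32, 32, 64)
                << "\n";  // conv2d_general_opencl_kernel__1_1_32_32_64
      return 0;
    }

Note the doubled underscore in the produced key: the prefix literal already ends in "_", and Concat() inserts another delimiter before the first numeric argument.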
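SetFuture() in helper.h factors out the wait_fn boilerplate that conv_2d_opencl.cc and the deleted relu_opencl.cc previously wrote out per kernel. Below is a simplified, OpenCL-free sketch of the pattern; CallStats and StatsFuture are reduced to minimal stand-ins for the MACE types, and the timing values are fabricated placeholders:

    // Sketch of the SetFuture() pattern: the kernel enqueues asynchronously
    // and hands the caller a wait function that blocks and optionally fills
    // in timing stats. Real code calls event.wait() and
    // OpenCLRuntime::Global()->GetCallStats(event, stats) inside the lambda.
    #include <cstdio>
    #include <functional>

    struct CallStats { double start_micros = 0, end_micros = 0; };

    struct StatsFuture {
      std::function<void(CallStats *)> wait_fn;
    };

    void SetFuture(StatsFuture *future, double start, double end) {
      if (future != nullptr) {  // callers may pass nullptr to skip stats
        future->wait_fn = [=](CallStats *stats) {
          if (stats != nullptr) {
            stats->start_micros = start;  // placeholder timing values
            stats->end_micros = end;
          }
        };
      }
    }

    int main() {
      StatsFuture future;
      SetFuture(&future, 10.0, 42.0);
      CallStats stats;
      future.wait_fn(&stats);
      std::printf("kernel took %.1f us\n",
                  stats.end_micros - stats.start_micros);
      return 0;
    }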
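DoActivation() in mace/kernels/activation.h evaluates TANH as (1 - exp(-2x)) / (1 + exp(-2x)) and SIGMOID as 1 / (1 + exp(-x)). A quick self-contained numerical check of those identities against the standard library:

    // Verifies the activation formulas used by DoActivation():
    // (1 - exp(-2x)) / (1 + exp(-2x)) == tanh(x), and
    // 1 / (1 + exp(-x)) == (tanh(x / 2) + 1) / 2 (the usual cross-check).
    #include <cassert>
    #include <cmath>
    #include <cstdio>

    int main() {
      for (float x = -7.0f; x <= 7.0f; x += 0.5f) {
        float in_exp = std::exp(-2 * x);
        float tanh_val = (1 - in_exp) / (1 + in_exp);
        assert(std::fabs(tanh_val - std::tanh(x)) < 1e-5f);

        float sigmoid_val = 1 / (1 + std::exp(-x));
        assert(std::fabs(sigmoid_val - (std::tanh(x / 2) + 1) / 2) < 1e-5f);
      }
      std::printf("activation identities hold\n");
      return 0;
    }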
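Likewise, the RELUX branch clamps to [0, max_limit] and the PRELU branch scales negative inputs by alpha, which is exactly how the expected tensors in activation_test.cc are derived (max_limit = 6, alpha = 2 on inputs such as {-7, 7, -6, 6, ...}). A scalar sketch of the two branches:

    // Scalar form of the RELUX and PRELU branches of DoActivation(),
    // checked against the expected values used in activation_test.cc.
    #include <algorithm>
    #include <cassert>

    float Relux(float x, float max_limit) {
      return std::min(std::max(x, 0.0f), max_limit);
    }

    float Prelu(float x, float alpha) {
      return x < 0 ? x * alpha : x;
    }

    int main() {
      assert(Relux(7.0f, 6.0f) == 6.0f);     // clamped at max_limit
      assert(Relux(-7.0f, 6.0f) == 0.0f);    // negative -> 0
      assert(Prelu(-7.0f, 2.0f) == -14.0f);  // negative scaled by alpha
      assert(Prelu(7.0f, 2.0f) == 7.0f);     // positive passes through
      return 0;
    }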
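The new benchmark macros in activation_benchmark.cc paste "_" between the shape tokens (BM_RELU_##N##_##C##_##H##_##W##_...), whereas the deleted relu_benchmark.cc pasted them back to back (BM_RELU_##N##C##H##W##_...); without separators, distinct shapes can collapse into one benchmark symbol. A minimal demonstration, using hypothetical NAME_OLD/NAME_NEW macros in place of the full benchmark machinery:

    // Shows the token-pasting difference between the old and new
    // benchmark-name macros. With no separators, (1, 3, 128, 128) and
    // (13, 1, 28, 128) both paste to BM_RELU_13128128.
    #include <cstdio>

    #define NAME_OLD(N, C, H, W) BM_RELU_##N##C##H##W
    #define NAME_NEW(N, C, H, W) BM_RELU_##N##_##C##_##H##_##W

    #define STR_IMPL(x) #x
    #define STR(x) STR_IMPL(x)  // expand, then stringize

    int main() {
      std::printf("%s\n", STR(NAME_OLD(1, 3, 128, 128)));  // BM_RELU_13128128
      std::printf("%s\n", STR(NAME_OLD(13, 1, 28, 128)));  // BM_RELU_13128128
      std::printf("%s\n", STR(NAME_NEW(1, 3, 128, 128)));  // BM_RELU_1_3_128_128
      return 0;
    }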