From f07dd5163e94af61be16e13bf0b1869292cf0d2c Mon Sep 17 00:00:00 2001 From: liuqi Date: Thu, 16 Nov 2017 14:49:55 +0800 Subject: [PATCH] Refactor the opencl kernel build logic. --- mace/core/runtime/opencl/opencl_runtime.cc | 108 ++++--------- mace/core/runtime/opencl/opencl_runtime.h | 14 +- mace/core/types.cc | 29 ++++ mace/core/types.h | 3 + mace/kernels/batch_norm.h | 6 +- mace/kernels/opencl/addn.cc | 6 +- mace/kernels/opencl/batch_norm_opencl.cc | 10 +- mace/kernels/opencl/cl/addn.cl | 13 +- mace/kernels/opencl/cl/batch_norm.cl | 26 ++-- mace/kernels/opencl/cl/common.h | 3 + mace/kernels/opencl/cl/conv_2d_1x1.cl | 131 ++++++++-------- mace/kernels/opencl/cl/conv_2d_3x3.cl | 120 +++++++++++---- mace/kernels/opencl/cl/conv_helper.cl | 41 ----- mace/kernels/opencl/cl/conv_helper.h | 15 -- mace/kernels/opencl/cl/depthwise_conv_3x3.cl | 123 +++++++++++---- mace/kernels/opencl/cl/pooling.cl | 145 +++++++++--------- mace/kernels/opencl/cl/relu.cl | 21 +-- mace/kernels/opencl/cl/resize_bilinear.cl | 23 +-- mace/kernels/opencl/cl/space_to_batch.cl | 25 +-- mace/kernels/opencl/conv_2d_opencl_1x1.cc | 12 +- mace/kernels/opencl/conv_2d_opencl_3x3.cc | 13 +- .../opencl/depthwise_conv_opencl_3x3.cc | 13 +- mace/kernels/opencl/pooling_opencl.cc | 30 ++-- mace/kernels/opencl/relu_opencl.cc | 7 +- mace/kernels/opencl/resize_bilinear_opencl.cc | 5 +- mace/kernels/opencl/space_to_batch_opecl.cc | 6 +- mace/proto/mace.proto | 1 + 27 files changed, 509 insertions(+), 440 deletions(-) delete mode 100644 mace/kernels/opencl/cl/conv_helper.cl delete mode 100644 mace/kernels/opencl/cl/conv_helper.h diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 02ce4415..32a7bcc2 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -7,8 +7,6 @@ #include #include -#include - #include "mace/core/logging.h" #include "mace/core/runtime/opencl/opencl_runtime.h" @@ -32,55 +30,6 @@ bool ReadSourceFile(const std::string &filename, std::string *content) { return true; } -bool BuildProgram(OpenCLRuntime *runtime, - const std::string &path, - cl::Program *program) { - MACE_CHECK_NOTNULL(program); - - auto closer = [](DIR *d) { - if (d != nullptr) closedir(d); - }; - std::unique_ptr dir(opendir(path.c_str()), closer); - MACE_CHECK_NOTNULL(dir.get()); - - const std::string kSourceSuffix = ".cl"; - cl::Program::Sources sources; - errno = 0; - dirent *entry = readdir(dir.get()); - MACE_CHECK(errno == 0); - while (entry != nullptr) { - if (entry->d_type == DT_REG) { - std::string d_name(entry->d_name); - if (d_name.size() > kSourceSuffix.size() && - d_name.compare(d_name.size() - kSourceSuffix.size(), - kSourceSuffix.size(), kSourceSuffix) == 0) { - std::string filename = path + d_name; - std::string kernel_source; - MACE_CHECK(ReadSourceFile(filename, &kernel_source)); - sources.push_back({kernel_source.c_str(), kernel_source.length()}); - } - } - entry = readdir(dir.get()); - MACE_CHECK(errno == 0); - }; - - *program = cl::Program(runtime->context(), sources); - std::string build_options = "-Werror -cl-mad-enable -cl-fast-relaxed-math -I" + path; - // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math - cl_int ret = program->build({runtime->device()}, build_options.c_str()); - if (ret != CL_SUCCESS) { - if (program->getBuildInfo(runtime->device()) == - CL_BUILD_ERROR) { - std::string build_log = - program->getBuildInfo(runtime->device()); - LOG(INFO) << "Program build log: " << build_log; - } - LOG(FATAL) << "Build program failed: " << ret; - } - - return true; -} - } // namespace @@ -156,58 +105,57 @@ cl::CommandQueue &OpenCLRuntime::command_queue() { return command_queue_; } cl::Program &OpenCLRuntime::program() { // TODO(heliangliang) Support binary format - static const char *kernel_path = getenv("MACE_KERNEL_PATH"); - std::string path(kernel_path == nullptr ? "" : kernel_path); - - std::call_once(build_flag_, [this, &path]() { - MACE_CHECK(BuildProgram(this, path, &program_)); - }); - return program_; } - -const std::unodered_map - OpenCLRuntime::kernel_program_map_ = { - {"BatchNorm", "batch_norm.cl"} +const std::map + OpenCLRuntime::program_map_ = { + {"addn", "addn.cl"}, + {"batch_norm", "batch_norm.cl"}, + {"conv_2d_1x1", "conv_2d_1x1.cl"}, + {"conv_2d_3x3", "conv_2d_3x3.cl"}, + {"depthwise_conv_3x3", "depthwise_conv_3x3.cl"}, + {"pooling", "pooling.cl"}, + {"relu", "relu.cl"}, + {"resize_bilinear", "resize_bilinear.cl"}, + {"space_to_batch", "space_to_batch.cl"}, }; -bool OpenCLRuntime::BuildProgram(const std::string &kernel_name, +void OpenCLRuntime::BuildProgram(const std::string &program_file_name, const std::string &build_options, cl::Program *program) { MACE_CHECK_NOTNULL(program); - cl::Program::Sources sources; - std::string filename = kernel_path_ + kernel_name; + std::string filename = kernel_path_ + program_file_name; std::string kernel_source; MACE_CHECK(ReadSourceFile(filename, &kernel_source)); sources.push_back({kernel_source.c_str(), kernel_source.length()}); *program = cl::Program(this->context(), sources); - build_options += " -Werror -cl-mad-enable -cl-fast-relaxed-math -I" + path; + std::string build_options_str = build_options + + " -Werror -cl-mad-enable -cl-fast-relaxed-math -I" + kernel_path_; // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math - cl_int ret = program->build({runtime->device()}, build_options.c_str()); + cl_int ret = program->build({device()}, build_options_str.c_str()); if (ret != CL_SUCCESS) { - if (program->getBuildInfo(runtime->device()) == + if (program->getBuildInfo(device()) == CL_BUILD_ERROR) { std::string build_log = - program->getBuildInfo(runtime->device()); + program->getBuildInfo(device()); LOG(INFO) << "Program build log: " << build_log; } LOG(FATAL) << "Build program failed: " << ret; } - - return true; } -cl::Kernel OpenCLRuntime::BuildKernel(const std::string &kernel_name, - const std::set &build_options) { - auto kernel_program_it = kernel_program_map_.find(kernel_name); - if (kernel_program_it == kernel_program_map_.end()) { - MACE_CHECK(false, kernel_name, " opencl kernel doesn't exist."); +cl::Kernel OpenCLRuntime::BuildKernel(const std::string &program_name, + const std::string &kernel_name, + const std::set &build_options) { + auto kernel_program_it = program_map_.find(program_name); + if (kernel_program_it == program_map_.end()) { + MACE_CHECK(false, program_name, " opencl kernel doesn't exist."); } - std::string program_name = kernel_program_it->second; + std::string program_file_name = kernel_program_it->second; std::string build_options_str; for(auto &option : build_options) { build_options_str += " " + option; @@ -219,10 +167,10 @@ cl::Kernel OpenCLRuntime::BuildKernel(const std::string &kernel_name, if (built_program_it != built_program_map_.end()) { program = built_program_it->second; } else { - this->BuildProgram(kernel_name, build_options_str, &program); - built_program_map_.emplace(built_program_key, std::move(program)); + this->BuildProgram(program_file_name, build_options_str, &program); + built_program_map_.emplace(built_program_key, program); } - return cl::Kernel(kernel_name, program); + return cl::Kernel(program, kernel_name.c_str()); } uint32_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() { diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index cdbd5d46..cd62ca8d 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -7,7 +7,7 @@ #include #include -#include +#include #include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_wrapper.h" @@ -25,7 +25,8 @@ class OpenCLRuntime { uint32_t GetDeviceMaxWorkGroupSize(); uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel& kernel); - cl::Kernel BuildKernel(const std::string &kernel_name, + cl::Kernel BuildKernel(const std::string &program_name, + const std::string &kernel_name, const std::set &build_options); private: OpenCLRuntime(cl::Context context, @@ -35,7 +36,7 @@ class OpenCLRuntime { OpenCLRuntime(const OpenCLRuntime&) = delete; OpenCLRuntime &operator=(const OpenCLRuntime&) = delete; - bool BuildProgram(const std::string &kernel_name, + void BuildProgram(const std::string &kernel_name, const std::string &build_options, cl::Program *program); @@ -44,11 +45,10 @@ class OpenCLRuntime { cl::Device device_; cl::CommandQueue command_queue_; cl::Program program_; - std::once_flag build_flag_; std::string kernel_path_; - static const std::unordered_map kernel_program_map_; - mutable std::unordered_map program_map_; + mutable std::map built_program_map_; }; diff --git a/mace/core/types.cc b/mace/core/types.cc index 8ad8fba9..18b5d6a8 100644 --- a/mace/core/types.cc +++ b/mace/core/types.cc @@ -12,6 +12,7 @@ bool DataTypeCanUseMemcpy(DataType dt) { case DT_DOUBLE: case DT_INT32: case DT_INT64: + case DT_UINT32: case DT_UINT16: case DT_UINT8: case DT_INT16: @@ -23,4 +24,32 @@ bool DataTypeCanUseMemcpy(DataType dt) { } } +std::string DataTypeToCLType(const DataType dt) { + switch (dt) { + case DT_FLOAT: + return "float"; + case DT_HALF: + return "half"; + case DT_UINT8: + return "uchar"; + case DT_INT8: + return "char"; + case DT_DOUBLE: + return "double"; + case DT_INT32: + return "int"; + case DT_UINT32: + return "int"; + case DT_UINT16: + return "ushort"; + case DT_INT16: + return "short"; + case DT_INT64: + return "long"; + default: + LOG(FATAL) << "Unsupported data type"; + return ""; + } +} + } // namespace mace \ No newline at end of file diff --git a/mace/core/types.h b/mace/core/types.h index 2f354a14..a592ad9d 100644 --- a/mace/core/types.h +++ b/mace/core/types.h @@ -12,6 +12,8 @@ namespace mace { bool DataTypeCanUseMemcpy(DataType dt); +std::string DataTypeToCLType(const DataType dt); + template struct IsValidDataType; @@ -50,6 +52,7 @@ MATCH_TYPE_AND_ENUM(int16_t, DT_INT16); MATCH_TYPE_AND_ENUM(int8_t, DT_INT8); MATCH_TYPE_AND_ENUM(string, DT_STRING); MATCH_TYPE_AND_ENUM(int64_t, DT_INT64); +MATCH_TYPE_AND_ENUM(uint32_t, DT_UINT32); MATCH_TYPE_AND_ENUM(bool, DT_BOOL); static const int32_t kint32_tmax = ((int32_t)0x7FFFFFFF); diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h index d860dcd8..b95d4895 100644 --- a/mace/kernels/batch_norm.h +++ b/mace/kernels/batch_norm.h @@ -76,9 +76,8 @@ void BatchNormFunctor::operator()( const Tensor *epsilon, Tensor *output); -template -struct BatchNormFunctor { - void operator()( +template <> +void BatchNormFunctor::operator()( const Tensor *input, const Tensor *scale, const Tensor *offset, @@ -86,7 +85,6 @@ struct BatchNormFunctor { const Tensor *var, const Tensor *epsilon, Tensor *output); -}; } // namepsace kernels } // namespace mace diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index 24d084ca..b906c92d 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -15,9 +15,9 @@ static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) { const uint32_t gws = blocks; auto runtime = OpenCLRuntime::Get(); - auto program = runtime->program(); - - auto addn_kernel = cl::Kernel(program, "add2"); + std::set built_options; + built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(output->dtype())); + auto addn_kernel = runtime->BuildKernel("addn", "add2", built_options); const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel); diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index 0c8cf342..badb3e7e 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -10,8 +10,8 @@ namespace mace { namespace kernels { -template -void BatchNormFunctor::operator()( +template <> +void BatchNormFunctor::operator()( const Tensor *input, const Tensor *scale, const Tensor *offset, @@ -29,8 +29,8 @@ void BatchNormFunctor::operator()( auto runtime = OpenCLRuntime::Get(); std::set built_options; - built_options.emplace("-DDataType=" + GetDataTypeFromEnum(input->dtype())); - auto bm_kernel = runtime->CreateKernel("batch_norm"); + built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); + auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel); const std::vector lws = {1, 1, kwg_size}; @@ -63,7 +63,7 @@ void BatchNormFunctor::operator()( cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(params[0], params[1], params[2])); - MACE_CHECK(error == CL_SUCCESS); + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; return error; }; std::stringstream ss; diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/kernels/opencl/cl/addn.cl index eb1be1ca..55c8d0bf 100644 --- a/mace/kernels/opencl/cl/addn.cl +++ b/mace/kernels/opencl/cl/addn.cl @@ -1,7 +1,10 @@ -__kernel void add2(__global const float *input0, - __global const float *input1, +#include + +// Supported data type: half/float +__kernel void add2(__global const DATA_TYPE *input0, + __global const DATA_TYPE *input1, __private const int size, - __global float *output) { + __global DATA_TYPE *output) { int idx = get_global_id(0); if (idx + 4 > size) { @@ -9,8 +12,8 @@ __kernel void add2(__global const float *input0, *(output+idx) = *(input0+idx) + *(input1+idx); } } else { - float4 in_data0 = vload4(idx, input0); - float4 in_data1 = vload4(idx, input1); + VEC_DATA_TYPE(DATA_TYPE,4) in_data0 = vload4(idx, input0); + VEC_DATA_TYPE(DATA_TYPE,4) in_data1 = vload4(idx, input1); vstore4(in_data0+in_data1, idx, output); } } diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl index 6cc2a2e0..e6a52d49 100644 --- a/mace/kernels/opencl/cl/batch_norm.cl +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -1,13 +1,15 @@ -void kernel batch_norm(global const float *input, - global const float *scale, - global const float *offset, - global const float *mean, - global const float *var, - global const float *epsilon, +#include +// Supported data types: half/float +void kernel batch_norm(global const DATA_TYPE *input, + global const DATA_TYPE *scale, + global const DATA_TYPE *offset, + global const DATA_TYPE *mean, + global const DATA_TYPE *var, + global const DATA_TYPE *epsilon, private const int pixels, - global float *output, - __local float4 *new_scale, - __local float4 *new_offset) { + global DATA_TYPE *output, + __local VEC_DATA_TYPE(DATA_TYPE, 4) *new_scale, + __local VEC_DATA_TYPE(DATA_TYPE, 4) *new_offset) { const int batch = get_global_id(0); const int channel = get_global_id(1); const int channels = get_global_size(1); @@ -23,8 +25,8 @@ void kernel batch_norm(global const float *input, barrier(CLK_LOCAL_MEM_FENCE); const int image_offset = (batch * channels + channel) * pixels + pixel_offset*4; - const float *input_ptr = input + image_offset; - float *output_ptr = output + image_offset; + const DATA_TYPE *input_ptr = input + image_offset; + DATA_TYPE *output_ptr = output + image_offset; const int end = (batch * channels + channel + 1) * pixels; if ((image_offset+4) > end) { for (int i = image_offset; i < end; ++i) { @@ -33,7 +35,7 @@ void kernel batch_norm(global const float *input, ++output_ptr; } } else { - float4 values = vload4(0, input_ptr); + VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input_ptr); values = values * new_scale[local_channel] + new_offset[local_channel]; vstore4(values, 0, output_ptr); } diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h index 74c5b67a..963ff740 100644 --- a/mace/kernels/opencl/cl/common.h +++ b/mace/kernels/opencl/cl/common.h @@ -8,4 +8,7 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable +#define VEC_DATA_TYPE_STR(data_type, size) data_type##size +#define VEC_DATA_TYPE(data_type, size) VEC_DATA_TYPE_STR(data_type, size) + #endif // MACE_KERNELS_OPENCL_CL_COMMON_H_ diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index 8025074f..6fcc863a 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -25,31 +25,31 @@ __kernel void conv_2d_1x1_naive(__global const float *input, /* n, c, h, w */ } #define vec_conv_2d_1x1_s1 \ - float4 in0 = vload4(0, input_ptr); \ - float4 in1 = vload4(0, input_ptr + in_pixel); \ - float4 in2 = vload4(0, input_ptr + 2 * in_pixel); \ - float4 in3 = vload4(0, input_ptr + 3 * in_pixel); + VEC_DATA_TYPE(DATA_TYPE,4) in0 = vload4(0, input_ptr); \ + VEC_DATA_TYPE(DATA_TYPE,4) in1 = vload4(0, input_ptr + in_pixel); \ + VEC_DATA_TYPE(DATA_TYPE,4) in2 = vload4(0, input_ptr + 2 * in_pixel); \ + VEC_DATA_TYPE(DATA_TYPE,4) in3 = vload4(0, input_ptr + 3 * in_pixel); #define vec_conv_2d_1x1_s2 \ - float4 in00 = vload4(0, input_ptr); \ - float3 in01 = vload3(0, input_ptr + 4); \ - float4 in10 = vload4(0, input_ptr + in_pixel); \ - float3 in11 = vload3(0, input_ptr + in_pixel + 4); \ - float4 in20 = vload4(0, input_ptr + 2 * in_pixel); \ - float3 in21 = vload3(0, input_ptr + 2 * in_pixel + 4);\ - float4 in30 = vload4(0, input_ptr + 3 * in_pixel); \ - float3 in31 = vload3(0, input_ptr + 3 * in_pixel + 4); \ - float4 in0 = (float4)(in00.s02, in01.s02); \ - float4 in1 = (float4)(in10.s02, in11.s02); \ - float4 in2 = (float4)(in20.s02, in21.s02); \ - float4 in3 = (float4)(in30.s02, in31.s02); + VEC_DATA_TYPE(DATA_TYPE,4) in00 = vload4(0, input_ptr); \ + VEC_DATA_TYPE(DATA_TYPE,3) in01 = vload3(0, input_ptr + 4); \ + VEC_DATA_TYPE(DATA_TYPE,4) in10 = vload4(0, input_ptr + in_pixel); \ + VEC_DATA_TYPE(DATA_TYPE,3) in11 = vload3(0, input_ptr + in_pixel + 4); \ + VEC_DATA_TYPE(DATA_TYPE,4) in20 = vload4(0, input_ptr + 2 * in_pixel); \ + VEC_DATA_TYPE(DATA_TYPE,3) in21 = vload3(0, input_ptr + 2 * in_pixel + 4);\ + VEC_DATA_TYPE(DATA_TYPE,4) in30 = vload4(0, input_ptr + 3 * in_pixel); \ + VEC_DATA_TYPE(DATA_TYPE,3) in31 = vload3(0, input_ptr + 3 * in_pixel + 4); \ + VEC_DATA_TYPE(DATA_TYPE,4) in0 = (VEC_DATA_TYPE(DATA_TYPE,4))(in00.s02, in01.s02); \ + VEC_DATA_TYPE(DATA_TYPE,4) in1 = (VEC_DATA_TYPE(DATA_TYPE,4))(in10.s02, in11.s02); \ + VEC_DATA_TYPE(DATA_TYPE,4) in2 = (VEC_DATA_TYPE(DATA_TYPE,4))(in20.s02, in21.s02); \ + VEC_DATA_TYPE(DATA_TYPE,4) in3 = (VEC_DATA_TYPE(DATA_TYPE,4))(in30.s02, in31.s02); #define vec_conv_2d_1x1_compute_loop \ for (int oc = 0; oc < 4; ++oc) { \ - float4 weights = vload4(0, filter_ptr + oc * in_chan_num); \ - float4 out = vload4(0, output_ptr + oc * out_pixel); \ + VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr + oc * in_chan_num); \ + VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr + oc * out_pixel); \ out += in0 * weights.x; \ out += in1 * weights.y; \ out += in2 * weights.z; \ @@ -58,25 +58,27 @@ __kernel void conv_2d_1x1_naive(__global const float *input, /* n, c, h, w */ } #define vec_conv_2d_1x1_compute \ - float4 weights = vload4(0, filter_ptr); \ - float4 out = vload4(0, output_ptr); \ + VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr); \ + VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr); \ out += in0 * weights.x; \ out += in1 * weights.y; \ out += in2 * weights.z; \ out += in3 * weights.w; \ vstore4(out, 0, output_ptr); -__kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */ - __global const float *filter, /* o, i, kh, kw */ - __global const float *bias, /* o */ - __global float *output, /* n, c, h, w */ +// Supported data type: half/float +__kernel void conv_2d_1x1_v2(__global const DATA_TYPE *input, /* n, c, h, w */ + __global const DATA_TYPE *filter, /* o, i, kh, kw */ +#ifdef BIAS + __global const DATA_TYPE *bias, /* o */ +#endif /* defined(BIAS) */ + __global DATA_TYPE *output, /* n, c, h, w */ __private const int in_chan_num, __private const int out_chan_num, __private const int in_height, __private const int in_width, __private const int out_height, - __private const int out_width, - __private const int stride) { + __private const int out_width) { int batch = get_global_id(0); int out_chan_blk = get_global_id(1); int out_pixel_blk = get_global_id(2); @@ -92,20 +94,30 @@ __kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */ const int out_chan_end = min(out_chan_begin + 4, out_chan_num); const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); + +#ifdef STRIDE_1 + const int stride = 1; +#else + const int stride = 2; +#endif const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4; const int in_offset = batch * in_chan_num * in_pixel; const int out_offset = batch * out_chan_num * out_pixel; - const float *input_base = input + in_offset + in_pixel_begin; - float *output_base = output + out_offset + out_pixel_begin; + const DATA_TYPE *input_base = input + in_offset + in_pixel_begin; + DATA_TYPE *output_base = output + out_offset + out_pixel_begin; int out_chan_len = out_chan_end - out_chan_begin; int pixel_len = out_pixel_end - out_pixel_begin; for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) { - float *output_ptr = output_base + out_chan * out_pixel; - float bias_value = bias == NULL ? 0 : bias[out_chan]; + DATA_TYPE *output_ptr = output_base + out_chan * out_pixel; +#ifdef BIAS + DATA_TYPE bias_value = bias[out_chan]; +#else + DATA_TYPE bias_value = 0; +#endif for (int p = 0; p < pixel_len; ++p) { output_ptr[p] = bias_value; } @@ -113,48 +125,37 @@ __kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */ int in_chan = 0; if (pixel_len == 4) { - if (stride == 1) { - for (; in_chan + 3 < in_chan_num; in_chan += 4) { - const float *input_ptr = input_base + in_chan * in_pixel; - int out_chan = out_chan_begin; - for (; out_chan + 3 < out_chan_end; out_chan += 4) { - const float* filter_ptr = filter + out_chan * in_chan_num + in_chan; - float *output_ptr = output_base + out_chan * out_pixel; - vec_conv_2d_1x1_s1; - vec_conv_2d_1x1_compute_loop; - } - for (; out_chan < out_chan_end; ++out_chan) { - const float* filter_ptr = filter + out_chan * in_chan_num + in_chan; - float *output_ptr = output_base + out_chan * out_pixel; - vec_conv_2d_1x1_s1; - vec_conv_2d_1x1_compute; - } + for (; in_chan + 3 < in_chan_num; in_chan += 4) { + const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel; + int out_chan = out_chan_begin; + for (; out_chan + 3 < out_chan_end; out_chan += 4) { + const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan; + DATA_TYPE *output_ptr = output_base + out_chan * out_pixel; +#ifdef STRIDE_1 + vec_conv_2d_1x1_s1; +#else + vec_conv_2d_1x1_s2; +#endif + vec_conv_2d_1x1_compute_loop; } - } else if (stride == 2) { - for (; in_chan + 3 < in_chan_num; in_chan += 4) { - const float *input_ptr = input_base + in_chan * in_pixel; - int out_chan = out_chan_begin; - for (; out_chan + 3 < out_chan_end; out_chan += 4) { - const float* filter_ptr = filter + out_chan * in_chan_num + in_chan; - float *output_ptr = output_base + out_chan * out_pixel; - vec_conv_2d_1x1_s2; - vec_conv_2d_1x1_compute_loop; - } - for (; out_chan < out_chan_end; ++out_chan) { - const float* filter_ptr = filter + out_chan * in_chan_num + in_chan; - float *output_ptr = output_base + out_chan * out_pixel; - vec_conv_2d_1x1_s2; - vec_conv_2d_1x1_compute; - } + for (; out_chan < out_chan_end; ++out_chan) { + const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan; + DATA_TYPE *output_ptr = output_base + out_chan * out_pixel; +#ifdef STRIDE_1 + vec_conv_2d_1x1_s1; +#else + vec_conv_2d_1x1_s2; +#endif + vec_conv_2d_1x1_compute; } } } for (; in_chan < in_chan_num; ++in_chan) { - const float *input_ptr = input_base + in_chan * in_pixel; + const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel; for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) { - float weights = filter[out_chan * in_chan_num + in_chan]; - float *output_ptr = output_base + out_chan * out_pixel; + DATA_TYPE weights = filter[out_chan * in_chan_num + in_chan]; + DATA_TYPE *output_ptr = output_base + out_chan * out_pixel; for (int p = 0; p < pixel_len; ++p) { float in = input_ptr[p*stride]; diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index 317daaaf..8962fdad 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -1,16 +1,60 @@ -#include -void kernel conv_2d_3x3(global const float *input, - global const float *filter, - global const float *bias, - global float *output, +#include + +VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s1(const DATA_TYPE *input_ptr, + const DATA_TYPE *filter_ptr) { + VEC_DATA_TYPE(DATA_TYPE,4) row0 = vload4(0, input_ptr); + VEC_DATA_TYPE(DATA_TYPE,2) input1 = vload2(0, input_ptr+4); + VEC_DATA_TYPE(DATA_TYPE,4) row1 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input1.s0); + VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s23, input1.s01); + VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr); + return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 + + (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 + + (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2; +} + +VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s2(const DATA_TYPE *input_ptr, + const DATA_TYPE *filter_ptr) { + VEC_DATA_TYPE(DATA_TYPE,8) input = vload8(0, input_ptr); + VEC_DATA_TYPE(DATA_TYPE,4) row0 = input.even; + VEC_DATA_TYPE(DATA_TYPE,4) row1 = input.odd; + VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input_ptr[8]); + VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr); + return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 + + (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 + + (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2; +} + +// Supported data type: half/float +DATA_TYPE conv3x3(const DATA_TYPE *input_ptr, + const DATA_TYPE *filter_ptr, + const int row_width) { + VEC_DATA_TYPE(DATA_TYPE,3) input_value = vload3(0, input_ptr); + VEC_DATA_TYPE(DATA_TYPE,3) filter_value = vload3(0, filter_ptr); + VEC_DATA_TYPE(DATA_TYPE,3) res = input_value * filter_value; + input_ptr += row_width; + input_value = vload3(0, input_ptr); + filter_value = vload3(1, filter_ptr); + res += input_value * filter_value; + input_ptr += row_width; + input_value = vload3(0, input_ptr); + filter_value = vload3(2, filter_ptr); + res += input_value * filter_value; + + return res.s0 + res.s1 + res.s2; +} + +void kernel conv_2d_3x3(global const DATA_TYPE *input, + global const DATA_TYPE *filter, +#ifdef BIAS + global const DATA_TYPE *bias, +#endif + global DATA_TYPE *output, private const int in_chan_num, private const int out_chan_num, private const int in_height, private const int in_width, private const int out_height, - private const int out_width, - private const int stride_h, - private const int stride_w) { + private const int out_width) { int batch = get_global_id(0); int out_chan_blk = get_global_id(1); int out_pixel_blk = get_global_id(2); @@ -26,46 +70,54 @@ void kernel conv_2d_3x3(global const float *input, const int out_chan_end = min(out_chan_begin + 4, out_chan_num); const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); - const int in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4; +#ifdef STRIDE_1 + const int stride = 1; +#else + const int stride = 2; +#endif + const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4; const int in_offset = batch * in_chan_num * in_pixel; const int out_offset = batch * out_chan_num * out_pixel; - const float *input_base = input + in_offset + in_pixel_begin; - float *output_base = output + out_offset + out_pixel_begin; + const DATA_TYPE *input_base = input + in_offset + in_pixel_begin; + DATA_TYPE *output_base = output + out_offset + out_pixel_begin; const int pixels = out_pixel_end - out_pixel_begin; for (int i = out_chan_begin; i < out_chan_end; ++i) { - float *output_ptr = output_base + i * out_pixel; - const float *filter_base = filter + i * in_chan_num * 9; + DATA_TYPE *output_ptr = output_base + i * out_pixel; + const DATA_TYPE *filter_base = filter + i * in_chan_num * 9; if (pixels == 4) { +#ifdef BIAS + VEC_DATA_TYPE(DATA_TYPE, 4) res = (VEC_DATA_TYPE(DATA_TYPE, 4))bias[i]; +#else + VEC_DATA_TYPE(DATA_TYPE, 4) res = 0; +#endif - float4 res = bias == NULL ? 0 : (float4)bias[i]; - - if (stride_w == 1) { - for (int in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) { - const float* input_ptr = input_base + in_chan_idx * in_pixel; - const float* filter_ptr = filter_base + in_chan_idx * 9; - res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3); - res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3); - res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3); - } - } else { - for (int in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) { - const float* input_ptr = input_base + in_chan_idx * in_pixel; - const float* filter_ptr = filter_base + in_chan_idx * 9; - res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3); - res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3); - res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3); - } + for (int in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) { + const DATA_TYPE *input_ptr = input_base + in_chan_idx * in_pixel; + const DATA_TYPE *filter_ptr = filter_base + in_chan_idx * 9; +#ifdef STRIDE_1 + res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3); + res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3); + res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3); +#else + res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3); + res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3); + res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3); +#endif } vstore4(res, 0, output_ptr); } else { for (int p = 0; p < pixels; ++p) { - float res = bias == NULL ? 0 : bias[i]; +#ifdef BIAS + DATA_TYPE res = bias[i]; +#else + DATA_TYPE res = 0; +#endif for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) { - const float* input_ptr = input_base + in_chan_idx * in_pixel + p * stride_w; - const float* filter_ptr = filter_base + in_chan_idx * 9; + const DATA_TYPE *input_ptr = input_base + in_chan_idx * in_pixel + p * stride; + const DATA_TYPE *filter_ptr = filter_base + in_chan_idx * 9; res += conv3x3(input_ptr, filter_ptr, in_width); } output_ptr[p] = res; diff --git a/mace/kernels/opencl/cl/conv_helper.cl b/mace/kernels/opencl/cl/conv_helper.cl deleted file mode 100644 index f569ef3a..00000000 --- a/mace/kernels/opencl/cl/conv_helper.cl +++ /dev/null @@ -1,41 +0,0 @@ -float4 conv1x3_s1(const float *input_ptr, - const float *filter_ptr) { - float4 row0 = vload4(0, input_ptr); - float2 input1 = vload2(0, input_ptr+4); - float4 row1 = (float4)(row0.s123, input1.s0); - float4 row2 = (float4)(row0.s23, input1.s01); - float3 filter_values = vload3(0, filter_ptr); - return (float4)filter_values.s0 * row0 + - (float4)filter_values.s1 * row1 + - (float4)filter_values.s2 * row2; -} - -float4 conv1x3_s2(const float *input_ptr, - const float *filter_ptr) { - float8 input = vload8(0, input_ptr); - float4 row0 = input.even; - float4 row1 = input.odd; - float4 row2 = (float4)(row0.s123, input_ptr[8]); - float3 filter_values = vload3(0, filter_ptr); - return (float4)filter_values.s0 * row0 + - (float4)filter_values.s1 * row1 + - (float4)filter_values.s2 * row2; -} - -float conv3x3(const float *input_ptr, - const float *filter_ptr, - const int row_width) { - float3 input_value = vload3(0, input_ptr); - float3 filter_value = vload3(0, filter_ptr); - float3 res = input_value * filter_value; - input_ptr += row_width; - input_value = vload3(0, input_ptr); - filter_value = vload3(1, filter_ptr); - res += input_value * filter_value; - input_ptr += row_width; - input_value = vload3(0, input_ptr); - filter_value = vload3(2, filter_ptr); - res += input_value * filter_value; - - return res.s0 + res.s1 + res.s2; -} diff --git a/mace/kernels/opencl/cl/conv_helper.h b/mace/kernels/opencl/cl/conv_helper.h deleted file mode 100644 index 553af09d..00000000 --- a/mace/kernels/opencl/cl/conv_helper.h +++ /dev/null @@ -1,15 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#ifndef MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_ -#define MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_ - -float4 conv1x3_s1(const float *input_ptr, - const float *filter_ptr); -float4 conv1x3_s2(const float *input_ptr, - const float *filter_ptr); -float conv3x3(const float *input_ptr, - const float *filter_ptr, - const int row_width); -#endif // MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_ diff --git a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl b/mace/kernels/opencl/cl/depthwise_conv_3x3.cl index 9f9a6fc4..29dbc340 100644 --- a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl +++ b/mace/kernels/opencl/cl/depthwise_conv_3x3.cl @@ -1,17 +1,60 @@ -#include +#include + +VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s1(const DATA_TYPE *input_ptr, + const DATA_TYPE *filter_ptr) { + VEC_DATA_TYPE(DATA_TYPE,4) row0 = vload4(0, input_ptr); + VEC_DATA_TYPE(DATA_TYPE,2) input1 = vload2(0, input_ptr+4); + VEC_DATA_TYPE(DATA_TYPE,4) row1 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input1.s0); + VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s23, input1.s01); + VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr); + return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 + + (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 + + (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2; +} + +VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s2(const DATA_TYPE *input_ptr, + const DATA_TYPE *filter_ptr) { + VEC_DATA_TYPE(DATA_TYPE,8) input = vload8(0, input_ptr); + VEC_DATA_TYPE(DATA_TYPE,4) row0 = input.even; + VEC_DATA_TYPE(DATA_TYPE,4) row1 = input.odd; + VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input_ptr[8]); + VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr); + return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 + + (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 + + (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2; +} + +// Supported data type: half/float +DATA_TYPE conv3x3(const DATA_TYPE *input_ptr, + const DATA_TYPE *filter_ptr, + const int row_width) { + VEC_DATA_TYPE(DATA_TYPE,3) input_value = vload3(0, input_ptr); + VEC_DATA_TYPE(DATA_TYPE,3) filter_value = vload3(0, filter_ptr); + VEC_DATA_TYPE(DATA_TYPE,3) res = input_value * filter_value; + input_ptr += row_width; + input_value = vload3(0, input_ptr); + filter_value = vload3(1, filter_ptr); + res += input_value * filter_value; + input_ptr += row_width; + input_value = vload3(0, input_ptr); + filter_value = vload3(2, filter_ptr); + res += input_value * filter_value; + + return res.s0 + res.s1 + res.s2; +} //TODO merge the depthwise with conv 3x3 to remove duplicate code. -void kernel depthwise_conv_3x3(global const float *input, /* n, c, h, w */ - global const float *filter, /* m, i, kh, kw */ - global const float *bias, /* o */ - global float *output, /* n, c, h, w */ - private const int in_chan_num, - private const int out_chan_num, - private const int in_height, - private const int in_width, - private const int out_height, - private const int out_width, - private const int stride_h, - private const int stride_w) { +__kernel void depthwise_conv_3x3(__global const DATA_TYPE *input, /* n, c, h, w */ + __global const DATA_TYPE *filter, /* m, i, kh, kw */ +#ifdef BIAS + __global const DATA_TYPE *bias, /* o */ +#endif + __global DATA_TYPE *output, /* n, c, h, w */ + __private const int in_chan_num, + __private const int out_chan_num, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width) { int batch = get_global_id(0); int out_chan_blk = get_global_id(1); int out_pixel_blk = get_global_id(2); @@ -28,38 +71,54 @@ void kernel depthwise_conv_3x3(global const float *input, /* n, c, h, w */ const int out_chan_end = min(out_chan_begin + 4, out_chan_num); const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); - const int in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4; +#ifdef STRIDE_1 + const int in_pixel_begin = out_pixel_height * in_width + out_pixel_width * 4; +#else + const int in_pixel_begin = out_pixel_height * 2 * in_width + out_pixel_width * 2 * 4; +#endif const int in_offset = batch * in_chan_num * in_pixel; const int out_offset = batch * out_chan_num * out_pixel; - const float *input_base = input + in_offset + in_pixel_begin; - float *output_base = output + out_offset + out_pixel_begin; + const DATA_TYPE *input_base = input + in_offset + in_pixel_begin; + DATA_TYPE *output_base = output + out_offset + out_pixel_begin; const int pixels = out_pixel_end - out_pixel_begin; for (int i = out_chan_begin; i < out_chan_end; ++i) { - float bias_value = bias[i]; - const float *input_ptr = input_base + (i / multiplier) * in_pixel; - const float *filter_ptr = filter + i * 9; - float *output_ptr = output_base + i * out_pixel; + const DATA_TYPE *input_ptr = input_base + (i / multiplier) * in_pixel; + const DATA_TYPE *filter_ptr = filter + i * 9; + DATA_TYPE *output_ptr = output_base + i * out_pixel; if (pixels == 4) { - float4 res = (float4)bias[i]; - if (stride_w == 1) { - res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3); - res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3); - res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3); - } else { - res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3); - res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3); - res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3); - } +#ifdef BIAS + VEC_DATA_TYPE(DATA_TYPE,4) res = (VEC_DATA_TYPE(DATA_TYPE,4))bias[i]; +#else + VEC_DATA_TYPE(DATA_TYPE,4) res = 0; +#endif /* defined(BIAS) */ + +#ifdef STRIDE_1 + res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3); + res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3); + res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3); +#else + res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3); + res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3); + res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3); +#endif vstore4(res, 0, output_ptr); } else { for (int p = 0; p < pixels; ++p) { - float res = bias[i]; +#ifdef BIAS + DATA_TYPE res = bias[i]; +#else + DATA_TYPE res = 0; +#endif res += conv3x3(input_ptr, filter_ptr, in_width); output_ptr[p] = res; - input_ptr += stride_w; +#ifdef STRIDE_1 + input_ptr += 1; +#else + input_ptr += 2; +#endif } } } diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl index 9f9e38d4..bc987ddd 100644 --- a/mace/kernels/opencl/cl/pooling.cl +++ b/mace/kernels/opencl/cl/pooling.cl @@ -1,69 +1,75 @@ -float4 vec_pooling_3_s1(const float *input_ptr, const int in_width) { - float4 row00 = vload4(0, input_ptr); - float2 row01 = vload2(0, input_ptr + 4); - float4 row10 = vload4(0, input_ptr + in_width); - float2 row11 = vload2(0, input_ptr + in_width + 4); - float4 row20 = vload4(0, input_ptr + in_width * 2); - float2 row21 = vload2(0, input_ptr + in_width * 2 + 4); - - float8 data00 = (float8)(row00.s01212323); - float4 data01 = (float4)(row01.s0, row00.s3, row01.s01); - float8 data10 = (float8)(row10.s01212323); - float4 data11 = (float4)(row11.s0, row10.s3, row11.s01); - float8 data20 = (float8)(row20.s01212323); - float4 data21 = (float4)(row21.s0, row20.s3, row21.s01); - - float8 left = fmax(fmax(data00, data10), data20); - float4 right = fmax(fmax(data01, data11), data21); - - float4 res = fmax((float4)(left.s036, right.s1), (float4)(left.s147, right.s2)); - res = fmax(res, (float4)(left.s25, right.s03)); +#include + +VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s1(const DATA_TYPE *input_ptr, const int in_width) { + VEC_DATA_TYPE(DATA_TYPE,4) row00 = vload4(0, input_ptr); + VEC_DATA_TYPE(DATA_TYPE,2) row01 = vload2(0, input_ptr + 4); + VEC_DATA_TYPE(DATA_TYPE,4) row10 = vload4(0, input_ptr + in_width); + VEC_DATA_TYPE(DATA_TYPE,2) row11 = vload2(0, input_ptr + in_width + 4); + VEC_DATA_TYPE(DATA_TYPE,4) row20 = vload4(0, input_ptr + in_width * 2); + VEC_DATA_TYPE(DATA_TYPE,2) row21 = vload2(0, input_ptr + in_width * 2 + 4); + + VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01212323); + VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row01.s0, row00.s3, row01.s01); + VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01212323); + VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row11.s0, row10.s3, row11.s01); + VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01212323); + VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row21.s0, row20.s3, row21.s01); + + VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20); + VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21); + + VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1), + (VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2)); + res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03)); return res; } -float4 vec_pooling_3_s2(const float *input_ptr, const int in_width) { - float8 row00 = vload8(0, input_ptr); - float row01 = *(input_ptr + 8); - float8 row10 = vload8(0, input_ptr + in_width); - float row11 = *(input_ptr + in_width + 8); - float8 row20 = vload8(0, input_ptr + in_width * 2); - float row21 = *(input_ptr + in_width * 2 + 8); - - float8 data00 = (float8)(row00.s01223445); - float4 data01 = (float4)(row00.s667, row01); - float8 data10 = (float8)(row10.s01223445); - float4 data11 = (float4)(row10.s667, row11); - float8 data20 = (float8)(row20.s01223445); - float4 data21 = (float4)(row20.s667, row21); - - float8 left = fmax(fmax(data00, data10), data20); - float4 right = fmax(fmax(data01, data11), data21); - - float4 res = fmax((float4)(left.s036, right.s1), (float4)(left.s147, right.s2)); - res = fmax(res, (float4)(left.s25, right.s03)); + +VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s2(const DATA_TYPE *input_ptr, const int in_width) { + VEC_DATA_TYPE(DATA_TYPE,8) row00 = vload8(0, input_ptr); + DATA_TYPE row01 = *(input_ptr + 8); + VEC_DATA_TYPE(DATA_TYPE,8) row10 = vload8(0, input_ptr + in_width); + DATA_TYPE row11 = *(input_ptr + in_width + 8); + VEC_DATA_TYPE(DATA_TYPE,8) row20 = vload8(0, input_ptr + in_width * 2); + DATA_TYPE row21 = *(input_ptr + in_width * 2 + 8); + + VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01223445); + VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row00.s667, row01); + VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01223445); + VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row10.s667, row11); + VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01223445); + VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row20.s667, row21); + + VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20); + VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21); + + VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1), + (VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2)); + res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03)); return res; } -float inner_pooling_3(const float *input_ptr, const int in_width) { - float3 row0 = vload3(0, input_ptr); - float3 row1 = vload3(0, input_ptr + in_width); - float3 row2 = vload3(0, input_ptr + in_width * 2); +DATA_TYPE inner_pooling_3(const DATA_TYPE *input_ptr, const int in_width) { + VEC_DATA_TYPE(DATA_TYPE,3) row0 = vload3(0, input_ptr); + VEC_DATA_TYPE(DATA_TYPE,3) row1 = vload3(0, input_ptr + in_width); + VEC_DATA_TYPE(DATA_TYPE,3) row2 = vload3(0, input_ptr + in_width * 2); - float3 data = fmax(fmax(row0, row1), row2); + VEC_DATA_TYPE(DATA_TYPE,3) data = fmax(fmax(row0, row1), row2); - float res = fmax(fmax(data.s0, data.s1), data.s2); + DATA_TYPE res = fmax(fmax(data.s0, data.s1), data.s2); return res; } -__kernel void pooling3(__global const float *input, /* n, c, h, w */ +// Supported data type: half/float +__kernel void pooling3(__global const DATA_TYPE *input, /* n, c, h, w */ __private const int in_height, __private const int in_width, __private const int out_chan_num, __private const int out_height, __private const int out_width, __private const int stride, - __global float *output) { + __global DATA_TYPE *output) { int batch = get_global_id(0); int out_chan_blk = get_global_id(1); int out_pixel_blk = get_global_id(2); @@ -83,21 +89,21 @@ __kernel void pooling3(__global const float *input, /* n, c, h, w */ const int in_offset = batch * out_chan_num * in_pixel; const int out_offset = batch * out_chan_num * out_pixel; - const float *input_base = input + in_offset + in_pixel_begin; - float *output_base = output + out_offset + out_pixel_begin; + const DATA_TYPE *input_base = input + in_offset + in_pixel_begin; + DATA_TYPE *output_base = output + out_offset + out_pixel_begin; const int pixels = out_pixel_end - out_pixel_begin; for (int i = out_chan_begin; i < out_chan_end; ++i) { - const float *input_ptr = input_base + i * in_pixel; - float *output_ptr = output_base + i * out_pixel; + const DATA_TYPE *input_ptr = input_base + i * in_pixel; + DATA_TYPE *output_ptr = output_base + i * out_pixel; if (pixels == 4) { - float4 res; - if (stride == 1) { - res = vec_pooling_3_s1(input_ptr, in_width); - } else { - res = vec_pooling_3_s2(input_ptr, in_width); - } + VEC_DATA_TYPE(DATA_TYPE,4) res; +#ifdef STRIDE_1 + res = vec_pooling_3_s1(input_ptr, in_width); +#else + res = vec_pooling_3_s2(input_ptr, in_width); +#endif vstore4(res, 0, output_ptr); } else { for (int p = 0; p < pixels; ++p) { @@ -122,7 +128,8 @@ int calculate_avg_block_size(const int pos_h, return (h_end - h_start) * (w_end - w_start); } -__kernel void poolingn(__global const float *input, /* n, c, h, w */ +// Supported data type: half/float +__kernel void poolingn(__global const DATA_TYPE *input, /* n, c, h, w */ __private const int in_height, __private const int in_width, __private const int out_chan_num, @@ -132,7 +139,7 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */ __private const int pad_h, __private const int pad_w, __private const int pooling_size, - __global float *output) { + __global DATA_TYPE *output) { int batch = get_global_id(0); int out_chan_idx = get_global_id(1); int out_pixel_idx = get_global_id(2); @@ -150,8 +157,8 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */ const int in_offset = batch * out_chan_num * in_pixel; const int out_offset = batch * out_chan_num * out_pixel; - const float *input_base = input + in_offset + in_pixel_idx; - float *output_base = output + out_offset + out_pixel_idx; + const DATA_TYPE *input_base = input + in_offset + in_pixel_idx; + DATA_TYPE *output_base = output + out_offset + out_pixel_idx; const int block_size = calculate_avg_block_size( out_pixel_height * stride, @@ -162,14 +169,14 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */ in_height - pad_h, in_width - pad_w); for (int i = out_chan_begin; i < out_chan_end; ++i) { - float8 sum8 = 0.0f; - float sum1 = 0.0f; - float *output_ptr = output_base + i * out_pixel; + VEC_DATA_TYPE(DATA_TYPE,8) sum8 = 0.0f; + DATA_TYPE sum1 = 0.0f; + DATA_TYPE *output_ptr = output_base + i * out_pixel; for (int y = 0; y < pooling_size; ++y) { - const float *input_ptr = input_base + i * in_pixel + y * in_width; + const DATA_TYPE *input_ptr = input_base + i * in_pixel + y * in_width; int x = 0; for (; x < (pooling_size-8); x += 8) { - float8 data = vload8(0, input_ptr); + VEC_DATA_TYPE(DATA_TYPE,8) data = vload8(0, input_ptr); sum8 += data; input_ptr += 8; } @@ -178,8 +185,8 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */ input_ptr++; } } - float4 sum4 = sum8.s0123 + sum8.s4567; - float2 sum2 = sum4.s01 + sum4.s23; + VEC_DATA_TYPE(DATA_TYPE,4) sum4 = sum8.s0123 + sum8.s4567; + VEC_DATA_TYPE(DATA_TYPE,2) sum2 = sum4.s01 + sum4.s23; *output_ptr = (sum2.s0 + sum2.s1 + sum1) / block_size; } diff --git a/mace/kernels/opencl/cl/relu.cl b/mace/kernels/opencl/cl/relu.cl index 390c8454..33fe65d0 100644 --- a/mace/kernels/opencl/cl/relu.cl +++ b/mace/kernels/opencl/cl/relu.cl @@ -1,6 +1,9 @@ -__kernel void relu(__global const float *input, +#include + +// Supported data type: half/float +__kernel void relu(__global const DATA_TYPE *input, __private const int size, - __global float *output) { + __global DATA_TYPE *output) { int idx = get_global_id(0); if (idx + 4 > size) { @@ -8,16 +11,16 @@ __kernel void relu(__global const float *input, *(output+idx) = fmax(*(input+idx), 0); } } else { - float4 data = vload4(idx, input); - data = fmax(data, (float4)0); + VEC_DATA_TYPE(DATA_TYPE,4) data = vload4(idx, input); + data = fmax(data, (VEC_DATA_TYPE(DATA_TYPE,4))0); vstore4(data, idx, output); } } -__kernel void relux(__global const float *input, - __private const float max_limit, +__kernel void relux(__global const DATA_TYPE *input, + __private const DATA_TYPE max_limit, __private const int size, - __global float *output) { + __global DATA_TYPE *output) { int idx = get_global_id(0); if (idx + 4 > size) { @@ -25,8 +28,8 @@ __kernel void relux(__global const float *input, *(output+idx) = clamp(*(input+idx), 0.0f, max_limit); } } else { - float4 data = vload4(idx, input); - data = clamp(data, (float4)0, (float4)max_limit); + VEC_DATA_TYPE(DATA_TYPE,4) data = vload4(idx, input); + data = clamp(data, (VEC_DATA_TYPE(DATA_TYPE,4))0, (VEC_DATA_TYPE(DATA_TYPE,4))max_limit); vstore4(data, idx, output); } } diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl index b1e987ec..f34e63cb 100644 --- a/mace/kernels/opencl/cl/resize_bilinear.cl +++ b/mace/kernels/opencl/cl/resize_bilinear.cl @@ -1,5 +1,8 @@ -__kernel void resize_bilinear_nocache(__global const float *input, /* n * c, h, w */ - __global float *output /* n * c, h, w */, +#include + +// Supported data type: half/float +__kernel void resize_bilinear_nocache(__global const DATA_TYPE *input, /* n * c, h, w */ + __global DATA_TYPE *output /* n * c, h, w */, __private const float height_scale, __private const float width_scale, __private const int in_height, @@ -21,16 +24,16 @@ __kernel void resize_bilinear_nocache(__global const float *input, /* n * c, h, const float h_lerp = h_in - h_lower; const float w_lerp = w_in - w_lower; - const float *input_base = input + c * in_height * in_width; - float *output_base = output + c * height * width; + const DATA_TYPE *input_base = input + c * in_height * in_width; + DATA_TYPE *output_base = output + c * height * width; - float top_left = input_base[h_lower * in_width + w_lower]; - float top_right = input_base[h_lower * in_width + w_upper]; - float bottom_left = input_base[h_upper * in_width + w_lower]; - float bottom_right = input_base[h_upper * in_width + w_upper]; + DATA_TYPE top_left = input_base[h_lower * in_width + w_lower]; + DATA_TYPE top_right = input_base[h_lower * in_width + w_upper]; + DATA_TYPE bottom_left = input_base[h_upper * in_width + w_lower]; + DATA_TYPE bottom_right = input_base[h_upper * in_width + w_upper]; - const float top = top_left + (top_right - top_left) * w_lerp; - const float bottom = bottom_left + (bottom_right - bottom_left) * w_lerp; + const DATA_TYPE top = top_left + (top_right - top_left) * w_lerp; + const DATA_TYPE bottom = bottom_left + (bottom_right - bottom_left) * w_lerp; output_base[h * width + w] = top + (bottom - top) * h_lerp; } diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/kernels/opencl/cl/space_to_batch.cl index 921a3bf8..5d098d86 100644 --- a/mace/kernels/opencl/cl/space_to_batch.cl +++ b/mace/kernels/opencl/cl/space_to_batch.cl @@ -1,14 +1,17 @@ -void kernel space_to_batch(global float *space_data_ptr, - global const int *block_shape_ptr, - global const int *paddings_ptr, - private const int space_batch, - private const int space_channel, - private const int space_height, - private const int space_width, - private const int batch_height, - private const int batch_width, - private const int b2s, - global float* batch_data_ptr) { +#include + +// Supported data type: all +__kernel void space_to_batch(__global DATA_TYPE *space_data_ptr, + __global const int *block_shape_ptr, + __global const int *paddings_ptr, + __private const int space_batch, + __private const int space_channel, + __private const int space_height, + __private const int space_width, + __private const int batch_height, + __private const int batch_width, + __private const int b2s, + __global DATA_TYPE* batch_data_ptr) { int batch_idx = get_global_id(0); int batch_channel_idx = get_global_id(1); int batch_pixel_idx = get_global_id(2); diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index 0c043b8c..1d89519e 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -61,16 +61,19 @@ void Conv1x1V2(const Tensor *input, // TODO KernelFunctor has an extra clReleaseCommandQueue due to a copy // TODO check wired clReleaseCommandQueue latency // The KernelFunctor can cause segment faults in cb_retain_event - auto conv_2d_kernel = cl::Kernel(program, "conv_2d_1x1_v2"); + std::set built_options; + built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); + built_options.emplace(stride == 1 ? "-DSTRIDE_1" : ""); + built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", "conv_2d_1x1_v2", built_options); + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); uint32_t idx = 0; conv_2d_kernel.setArg(idx++, *(static_cast(input->buffer()))); conv_2d_kernel.setArg(idx++, *(static_cast(filter->buffer()))); - if (bias == NULL) { - conv_2d_kernel.setArg(idx++, NULL); - } else { + if (bias != nullptr) { conv_2d_kernel.setArg(idx++, *(static_cast(bias->buffer()))); } @@ -81,7 +84,6 @@ void Conv1x1V2(const Tensor *input, conv_2d_kernel.setArg(idx++, static_cast(input->dim(3))); conv_2d_kernel.setArg(idx++, static_cast(height)); conv_2d_kernel.setArg(idx++, static_cast(width)); - conv_2d_kernel.setArg(idx++, stride); auto command_queue = runtime->command_queue(); cl_int error = command_queue.enqueueNDRangeKernel( diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index a0224484..452f46fd 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -22,14 +22,17 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter, auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); - auto conv_kernel = cl::Kernel(program, "conv_2d_3x3"); + + std::set built_options; + built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); + built_options.emplace(stride == 1 ? "-DSTRIDE_1" : ""); + built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + auto conv_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options); uint32_t idx = 0; conv_kernel.setArg(idx++, *(static_cast(input->buffer()))); conv_kernel.setArg(idx++, *(static_cast(filter->buffer()))); - if (bias == nullptr) { - conv_kernel.setArg(idx++, NULL); - } else { + if (bias != nullptr) { conv_kernel.setArg(idx++, *(static_cast(bias->buffer()))); } conv_kernel.setArg(idx++, *(static_cast(output->buffer()))); @@ -39,8 +42,6 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter, conv_kernel.setArg(idx++, static_cast(input->dim(3))); conv_kernel.setArg(idx++, static_cast(height)); conv_kernel.setArg(idx++, static_cast(width)); - conv_kernel.setArg(idx++, stride); - conv_kernel.setArg(idx++, stride); const uint32_t gws[3] = {static_cast(output->dim(0)), static_cast(channel_blocks), static_cast(pixel_blocks)}; diff --git a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc index c37fe77f..84b73071 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc +++ b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc @@ -30,13 +30,18 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input, const index_t pixel_blocks = (width + 3) / 4 * height; auto runtime = OpenCLRuntime::Get(); - auto program = runtime->program(); - auto conv_kernel = cl::Kernel(program, "depthwise_conv_3x3"); + std::set built_options; + built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); + built_options.emplace(stride == 1 ? "-DSTRIDE_1" : ""); + built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + auto conv_kernel = runtime->BuildKernel("depthwise_conv_3x3", "depthwise_conv_3x3", built_options); uint32_t idx = 0; conv_kernel.setArg(idx++, *(static_cast(input->buffer()))); conv_kernel.setArg(idx++, *(static_cast(filter->buffer()))); - conv_kernel.setArg(idx++, *(static_cast(bias->buffer()))); + if (bias != nullptr) { + conv_kernel.setArg(idx++, *(static_cast(bias->buffer()))); + } conv_kernel.setArg(idx++, *(static_cast(output->buffer()))); conv_kernel.setArg(idx++, static_cast(input->dim(1))); conv_kernel.setArg(idx++, static_cast(channels)); @@ -44,8 +49,6 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input, conv_kernel.setArg(idx++, static_cast(input->dim(3))); conv_kernel.setArg(idx++, static_cast(height)); conv_kernel.setArg(idx++, static_cast(width)); - conv_kernel.setArg(idx++, stride); - conv_kernel.setArg(idx++, stride); const uint32_t gws[3] = {static_cast(output->dim(0)), static_cast(channel_blocks), diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index 8daa78f6..f3fb6812 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -30,24 +30,26 @@ static void Pooling3(const Tensor *input, }; auto runtime = OpenCLRuntime::Get(); - auto program = runtime->program(); + std::set built_options; + built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); + built_options.emplace(stride[0] == 1 ? "-DSTRIDE_1" : ""); + auto pooling_kernel = runtime->BuildKernel("pooling", "pooling3", built_options); - auto max_pooling_kernel = cl::Kernel(program, "pooling3"); const uint32_t lws[3] = {1, 8, 128}; uint32_t idx = 0; - max_pooling_kernel.setArg(idx++, *(static_cast(input->buffer()))); - max_pooling_kernel.setArg(idx++, static_cast(input->dim(2))); - max_pooling_kernel.setArg(idx++, static_cast(input->dim(3))); - max_pooling_kernel.setArg(idx++, static_cast(channels)); - max_pooling_kernel.setArg(idx++, static_cast(out_height)); - max_pooling_kernel.setArg(idx++, static_cast(out_width)); - max_pooling_kernel.setArg(idx++, stride[0]); - max_pooling_kernel.setArg(idx++, *(static_cast(output->buffer()))); + pooling_kernel.setArg(idx++, *(static_cast(input->buffer()))); + pooling_kernel.setArg(idx++, static_cast(input->dim(2))); + pooling_kernel.setArg(idx++, static_cast(input->dim(3))); + pooling_kernel.setArg(idx++, static_cast(channels)); + pooling_kernel.setArg(idx++, static_cast(out_height)); + pooling_kernel.setArg(idx++, static_cast(out_width)); + pooling_kernel.setArg(idx++, stride[0]); + pooling_kernel.setArg(idx++, *(static_cast(output->buffer()))); cl_int error = runtime->command_queue().enqueueNDRangeKernel( - max_pooling_kernel, cl::NullRange, + pooling_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(lws[0], lws[1], lws[2])); MACE_CHECK(error == CL_SUCCESS); @@ -75,9 +77,9 @@ static void PoolingN(const Tensor *input, }; auto runtime = OpenCLRuntime::Get(); - auto program = runtime->program(); - - auto pooling_kernel = cl::Kernel(program, "poolingn"); + std::set built_options; + built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); + auto pooling_kernel = runtime->BuildKernel("pooling", "poolingn", built_options); const uint32_t lws[3] = {1, 8, 128}; diff --git a/mace/kernels/opencl/relu_opencl.cc b/mace/kernels/opencl/relu_opencl.cc index c4d22ae8..ed562d23 100644 --- a/mace/kernels/opencl/relu_opencl.cc +++ b/mace/kernels/opencl/relu_opencl.cc @@ -21,9 +21,10 @@ void ReluFunctor::operator()(const Tensor *input, auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); + std::set built_options; + built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); if (max_limit_ < 0) { - auto relu_kernel = cl::Kernel(program, "relu"); - + auto relu_kernel = runtime->BuildKernel("relu", "relu", built_options); const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel); uint32_t idx = 0; @@ -37,7 +38,7 @@ void ReluFunctor::operator()(const Tensor *input, cl::NDRange(lws)); MACE_CHECK(error == CL_SUCCESS); } else { - auto relu_kernel = cl::Kernel(program, "relux"); + auto relu_kernel = runtime->BuildKernel("relu", "relux", built_options); const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel); diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc index 50d717ef..bf603d94 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear_opencl.cc @@ -29,9 +29,10 @@ void ResizeBilinearFunctor::operator()( float width_scale = CalculateResizeScale(in_width, out_width, align_corners_); auto runtime = OpenCLRuntime::Get(); - auto program = runtime->program(); + std::set built_options; + built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); + auto rb_kernel = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options); - auto rb_kernel = cl::Kernel(program, "resize_bilinear_nocache"); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel); uint32_t idx = 0; rb_kernel.setArg(idx++, *(static_cast(input->buffer()))); diff --git a/mace/kernels/opencl/space_to_batch_opecl.cc b/mace/kernels/opencl/space_to_batch_opecl.cc index 84601492..a4ec2694 100644 --- a/mace/kernels/opencl/space_to_batch_opecl.cc +++ b/mace/kernels/opencl/space_to_batch_opecl.cc @@ -18,9 +18,9 @@ void SpaceToBatchFunctor::operator()(Tensor *space_te const Tensor *paddings_tensor, Tensor *batch_tensor) { auto runtime = OpenCLRuntime::Get(); - auto program = runtime->program(); - auto s2b_kernel = cl::Kernel(program, "space_to_batch"); - + std::set built_options; + built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(space_tensor->dtype())); + auto s2b_kernel = runtime->BuildKernel("space_to_batch", "space_to_batch", built_options); uint32_t idx = 0; s2b_kernel.setArg(idx++, *(static_cast(space_tensor->buffer()))); diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto index d70318c3..8e680821 100644 --- a/mace/proto/mace.proto +++ b/mace/proto/mace.proto @@ -24,6 +24,7 @@ enum DataType { DT_UINT16 = 9; DT_BOOL = 10; DT_HALF = 19; + DT_UINT32 = 22; } message TensorProto { -- GitLab