diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9d7a093951b0d295a0e3822973204026640d3a97..aa3993589ffca7f33b304cdddd037873e3e76f39 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -8,6 +8,7 @@ stages: - ops_test - api_test - python_tools_tests + - model_tests - build_android_demo - ops_benchmark - extra_tests @@ -113,6 +114,18 @@ python_tools_tests: python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a --model_graph_format=file --model_data_format=file || exit 1; python tools/converter.py run --config=${CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file || exit 1; python tools/converter.py run --config=${CONF_FILE} --example --target_abis=armeabi-v7a --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; + +model_tests: + stage: model_tests + script: + - pwd + - rm -rf mace-models + - GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git + - CONF_FILE=mace-models/mobilenet-v1/mobilenet-v1.yml + - > + python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1; + python tools/converter.py run --config=${CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py run --config=${CONF_FILE} --example --target_abis=armeabi-v7a --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; - CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml - > python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1; diff --git a/mace/core/device.cc b/mace/core/device.cc index aa0d1663cfd244353a77a4866eb2614a586dd510..35e8c7af6bda7ba27faa768cedc0cbdbfecef7f7 100644 --- a/mace/core/device.cc +++ b/mace/core/device.cc @@ 
-14,6 +14,8 @@ #include "mace/core/device.h" +#include "mace/core/buffer.h" + namespace mace { CPUDevice::CPUDevice(const int num_threads, @@ -21,7 +23,8 @@ CPUDevice::CPUDevice(const int num_threads, const bool use_gemmlowp) : cpu_runtime_(new CPURuntime(num_threads, policy, - use_gemmlowp)) {} + use_gemmlowp)), + scratch_buffer_(new ScratchBuffer(GetCPUAllocator())) {} CPUDevice::~CPUDevice() = default; @@ -31,6 +34,7 @@ CPURuntime *CPUDevice::cpu_runtime() { #ifdef MACE_ENABLE_OPENCL OpenCLRuntime *CPUDevice::opencl_runtime() { + LOG(FATAL) << "CPU device should not call OpenCL Runtime"; return nullptr; } #endif @@ -43,4 +47,8 @@ DeviceType CPUDevice::device_type() const { return DeviceType::CPU; } +ScratchBuffer *CPUDevice::scratch_buffer() { + return scratch_buffer_.get(); +} + } // namespace mace diff --git a/mace/core/device.h b/mace/core/device.h index ec1c7e6ada3ae0dafce11cd9100b1551673152fa..bfa00b02f95c3fe9ab5af78dcc264f79ecc679df 100644 --- a/mace/core/device.h +++ b/mace/core/device.h @@ -26,6 +26,8 @@ namespace mace { +class ScratchBuffer; + class Device { public: virtual ~Device() {} @@ -37,6 +39,7 @@ class Device { virtual Allocator *allocator() = 0; virtual DeviceType device_type() const = 0; + virtual ScratchBuffer *scratch_buffer() = 0; }; class CPUDevice : public Device { @@ -53,9 +56,11 @@ class CPUDevice : public Device { Allocator *allocator() override; DeviceType device_type() const override; + ScratchBuffer *scratch_buffer() override; private: std::unique_ptr cpu_runtime_; + std::unique_ptr scratch_buffer_; }; } // namespace mace diff --git a/mace/core/future.h b/mace/core/future.h index b5bf30dceb25616bad892b0681767fe056c6045c..f5807f54daabc9c1bba6e4ed29b1d5cfb8e0861b 100644 --- a/mace/core/future.h +++ b/mace/core/future.h @@ -15,7 +15,9 @@ #ifndef MACE_CORE_FUTURE_H_ #define MACE_CORE_FUTURE_H_ +#include #include +#include #include "mace/utils/logging.h" @@ -25,9 +27,7 @@ struct CallStats; // Wait the call to finish and get the stats if 
param is not nullptr struct StatsFuture { - std::function wait_fn = [](CallStats *) { - LOG(FATAL) << "wait_fn must be properly set"; - }; + std::function wait_fn; }; inline void SetFutureDefaultWaitFn(StatsFuture *future) { @@ -41,6 +41,29 @@ inline void SetFutureDefaultWaitFn(StatsFuture *future) { } } +inline void MergeMultipleFutureWaitFn( + const std::vector &org_futures, + StatsFuture *dst_future) { + if (dst_future != nullptr) { + dst_future->wait_fn = [org_futures](CallStats *stats) { + if (stats != nullptr) { + stats->start_micros = INT64_MAX; + stats->end_micros = 0; + for (auto &org_future : org_futures) { + CallStats tmp_stats; + if (org_future.wait_fn != nullptr) { + org_future.wait_fn(&tmp_stats); + stats->start_micros = std::min(stats->start_micros, + tmp_stats.start_micros); + stats->end_micros += tmp_stats.end_micros - tmp_stats.start_micros; + } + } + stats->end_micros += stats->start_micros; + } + }; + } +} + } // namespace mace #endif // MACE_CORE_FUTURE_H_ diff --git a/mace/core/runtime/opencl/gpu_device.cc b/mace/core/runtime/opencl/gpu_device.cc index 65686f831dfe9265a299368eff22eec4ac55bd07..1c85273e372a1de1c7e6214c29fe58d8b0a3a063 100644 --- a/mace/core/runtime/opencl/gpu_device.cc +++ b/mace/core/runtime/opencl/gpu_device.cc @@ -14,6 +14,8 @@ #include "mace/core/runtime/opencl/gpu_device.h" +#include "mace/core/buffer.h" + namespace mace { GPUDevice::GPUDevice(Tuner *tuner, @@ -27,7 +29,8 @@ GPUDevice::GPUDevice(Tuner *tuner, CPUDevice(num_threads, cpu_affinity_policy, use_gemmlowp), runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf, opencl_binary_storage, tuner)), - allocator_(new OpenCLAllocator(runtime_.get())) {} + allocator_(new OpenCLAllocator(runtime_.get())), + scratch_buffer_(new ScratchBuffer(allocator_.get())) {} GPUDevice::~GPUDevice() = default; @@ -43,4 +46,8 @@ DeviceType GPUDevice::device_type() const { return DeviceType::GPU; } +ScratchBuffer *GPUDevice::scratch_buffer() { + return scratch_buffer_.get(); +} + 
} // namespace mace diff --git a/mace/core/runtime/opencl/gpu_device.h b/mace/core/runtime/opencl/gpu_device.h index 350d53c8e43e654621db35574630741875ad2f5c..64a2d5e34f5fa69b9d456782d4538adf6dca4edd 100644 --- a/mace/core/runtime/opencl/gpu_device.h +++ b/mace/core/runtime/opencl/gpu_device.h @@ -37,9 +37,11 @@ class GPUDevice : public CPUDevice { OpenCLRuntime *opencl_runtime() override; Allocator *allocator() override; DeviceType device_type() const override; + ScratchBuffer *scratch_buffer() override; private: std::unique_ptr runtime_; std::unique_ptr allocator_; + std::unique_ptr scratch_buffer_; }; } // namespace mace diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 1fe145508fa22f65cd7444f5b7279c534c97a0bd..97da0e45b3b8f5430e5a487eb74b26e68d3f0df5 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -31,8 +31,6 @@ namespace mace { -std::string kOpenCLParameterPath; // NOLINT(runtime/string) - extern const std::map> kEncryptedProgramMap; @@ -286,7 +284,8 @@ OpenCLRuntime::OpenCLRuntime( is_opencl_avaliable_(false), is_profiling_enabled_(false), opencl_version_(CL_VER_UNKNOWN), - gpu_type_(UNKNOWN) { + gpu_type_(UNKNOWN), + mem_type_(MemoryType::GPU_IMAGE) { std::vector all_platforms; cl::Platform::get(&all_platforms); if (all_platforms.size() == 0) { @@ -471,6 +470,14 @@ uint32_t OpenCLRuntime::device_compute_units() const { return device_compute_units_; } +bool OpenCLRuntime::UseImageMemory() { + return this->mem_type_ == MemoryType::GPU_IMAGE; +} + +void OpenCLRuntime::set_mem_type(MemoryType type) { + this->mem_type_ = type; +} + bool OpenCLRuntime::BuildProgramFromCache( const std::string &built_program_key, const std::string &build_options_str, diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index 222fe8514a4cf4b08c944959e2faf8f646bf5c29..8480848944b2aba302dcc6007f92b26d5081085d 100644 --- 
a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -25,6 +25,7 @@ #include "mace/core/file_storage.h" #include "mace/core/future.h" #include "mace/core/runtime/opencl/cl2_header.h" +#include "mace/proto/mace.pb.h" #include "mace/utils/string_util.h" #include "mace/utils/timer.h" #include "mace/utils/tuner.h" @@ -82,6 +83,9 @@ class OpenCLRuntime { uint32_t device_compute_units() const; Tuner *tuner(); bool is_opencl_avaliable(); + // TODO(liuqi): remove this function in the future, make decision at runtime. + bool UseImageMemory(); + void set_mem_type(MemoryType type); void GetCallStats(const cl::Event &event, CallStats *stats); uint64_t GetDeviceMaxWorkGroupSize(); @@ -129,6 +133,7 @@ class OpenCLRuntime { bool is_profiling_enabled_; OpenCLVersion opencl_version_; GPUType gpu_type_; + MemoryType mem_type_; // All OpenCL object must be a pointer and manually deleted before unloading // OpenCL library. std::shared_ptr context_; diff --git a/mace/core/tensor.h b/mace/core/tensor.h index b3c58152041ca0a8714717e6f8e49bf32a8d33f0..4c03f33132ef733ce0b3e5a875cce5b23fafaf26 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -101,13 +101,14 @@ enum DataFormat { NHWC = 0, NCHW = 1, HWOI = 2, OIHW = 3, HWIO = 4, OHWI = 5 }; class Tensor { public: Tensor(Allocator *alloc, DataType type, - bool is_weight = false) + bool is_weight = false, + const std::string name = "") : allocator_(alloc), dtype_(type), buffer_(nullptr), is_buffer_owner_(true), unused_(false), - name_(""), + name_(name), is_weight_(is_weight), scale_(0.f), zero_point_(0), @@ -115,12 +116,13 @@ class Tensor { maxval_(0.f) {} Tensor(BufferBase *buffer, DataType dtype, - bool is_weight = false) + bool is_weight = false, + const std::string name = "") : dtype_(dtype), buffer_(buffer), is_buffer_owner_(false), unused_(false), - name_(""), + name_(name), is_weight_(is_weight), scale_(0.f), zero_point_(0), @@ -129,12 +131,13 @@ class Tensor { Tensor(const 
BufferSlice &buffer_slice, DataType dtype, - bool is_weight = false) + bool is_weight = false, + const std::string name = "") : dtype_(dtype), buffer_slice_(buffer_slice), is_buffer_owner_(false), unused_(false), - name_(""), + name_(name), is_weight_(is_weight), scale_(0.f), zero_point_(0), @@ -152,6 +155,8 @@ class Tensor { } } + inline std::string name() const { return name_; } + inline DataType dtype() const { return dtype_; } inline void SetDtype(DataType dtype) { dtype_ = dtype; } @@ -188,11 +193,15 @@ class Tensor { shape_configured_ = shape_configured; } + inline const std::vector &buffer_shape() const { + return buffer_shape_; + } + inline index_t dim_size() const { return shape_.size(); } inline index_t dim(unsigned int index) const { - MACE_CHECK(index < shape_.size(), "Dim out of range: ", index, " >= ", - shape_.size()); + MACE_CHECK(index < shape_.size(), + name_, ": Dim out of range: ", index, " >= ", shape_.size()); return shape_[index]; } @@ -214,12 +223,12 @@ class Tensor { #ifdef MACE_ENABLE_OPENCL inline cl::Image *opencl_image() const { - MACE_CHECK(has_opencl_image(), "do not have image"); + MACE_CHECK(has_opencl_image(), name_, " do not have image"); return static_cast(buffer_->buffer()); } inline cl::Buffer *opencl_buffer() const { - MACE_CHECK(has_opencl_buffer(), "do not have opencl buffer"); + MACE_CHECK(has_opencl_buffer(), name_, " do not have opencl buffer"); return static_cast(buffer_->buffer()); } #endif @@ -268,12 +277,14 @@ class Tensor { inline MaceStatus Resize(const std::vector &shape) { shape_ = shape; + buffer_shape_ = shape; image_shape_.clear(); if (buffer_ != nullptr) { - MACE_CHECK(!has_opencl_image(), "Cannot resize image, use ResizeImage."); + MACE_CHECK(!has_opencl_image(), + name_, ": Cannot resize image, use ResizeImage."); if (raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE > buffer_->size()) { - LOG(WARNING) << "Resize buffer from size " << buffer_->size() << " to " - << raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE; + 
LOG(WARNING) << name_ << ": Resize buffer from size " << buffer_->size() + << " to " << raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE; return buffer_->Resize(raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE); } return MaceStatus::MACE_SUCCESS; @@ -296,19 +307,22 @@ class Tensor { allocator_ = other.allocator_; dtype_ = other.dtype_; shape_ = other.shape_; + buffer_shape_ = other.buffer_shape_; image_shape_ = other.image_shape_; } inline MaceStatus ResizeImage(const std::vector &shape, const std::vector &image_shape) { shape_ = shape; + buffer_shape_ = shape; image_shape_ = image_shape; if (buffer_ == nullptr) { MACE_CHECK(is_buffer_owner_); buffer_ = new Image(allocator_); return buffer_->Allocate(image_shape, dtype_); } else { - MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize."); + MACE_CHECK(has_opencl_image(), + name_, ": Cannot ResizeImage buffer, use Resize."); Image *image = dynamic_cast(buffer_); MACE_CHECK(image_shape[0] <= image->image_shape()[0] && image_shape[1] <= image->image_shape()[1], @@ -366,8 +380,6 @@ class Tensor { inline BufferBase *UnderlyingBuffer() const { return buffer_; } - inline void SetSourceOpName(const std::string name) { name_ = name; } - inline void DebugPrint() const { using namespace numerical_chars; // NOLINT(build/namespaces) std::stringstream os; @@ -459,9 +471,12 @@ class Tensor { private: Allocator *allocator_; DataType dtype_; + // the shape of buffer(logical) std::vector shape_; std::vector shape_configured_; std::vector image_shape_; + // the shape of buffer(physical storage) + std::vector buffer_shape_; BufferBase *buffer_; BufferSlice buffer_slice_; bool is_buffer_owner_; diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index 333d6e60d4bffcf319700db897206b0df55a473d..766e125e20830cfc61891aaaf9f57dfd6eef8244 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -44,8 +44,7 @@ bool HasQuantizeOp(const NetDef &net_def) { } } // namespace -Workspace::Workspace() : - host_scratch_buffer_(new 
ScratchBuffer(GetCPUAllocator())) {} +Workspace::Workspace() = default; Tensor *Workspace::CreateTensor(const std::string &name, Allocator *alloc, @@ -54,8 +53,8 @@ Tensor *Workspace::CreateTensor(const std::string &name, VLOG(3) << "Tensor " << name << " already exists. Skipping."; } else { VLOG(3) << "Creating Tensor " << name; - tensor_map_[name] = std::unique_ptr(new Tensor(alloc, type)); - tensor_map_[name]->SetSourceOpName(name); + tensor_map_[name] = std::unique_ptr(new Tensor(alloc, type, + false, name)); } return GetTensor(name); } @@ -171,7 +170,10 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, tensor_buffer_.get(), const_tensor.offset(), const_tensor.data_size() * GetEnumTypeSize(const_tensor.data_type())), - const_tensor.data_type(), true)); + const_tensor.data_type(), + true, + const_tensor.name())); + tensor->Reshape(dims); tensor->SetScale(const_tensor.scale()); tensor->SetZeroPoint(const_tensor.zero_point()); @@ -275,7 +277,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, std::unique_ptr tensor_buf( new Buffer(device->allocator())); MACE_RETURN_IF_ERROR(tensor_buf->Allocate( - mem_block.x() * GetEnumTypeSize(dtype))); + mem_block.x() * GetEnumTypeSize(dtype) + + MACE_EXTRA_BUFFER_PAD_SIZE)); preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(tensor_buf)); } @@ -301,10 +304,9 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, } std::unique_ptr tensor (new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]), - output_type)); - tensor->SetSourceOpName(op.name()); - if (device_type == DeviceType::GPU) { - VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")" + output_type, false, op.output(i))); + if (device_type == DeviceType::GPU && tensor->has_opencl_image()) { + VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")" << " Mem: " << mem_ids[i] << " Image shape: " << dynamic_cast(tensor->UnderlyingBuffer()) @@ -312,8 +314,8 @@ MaceStatus 
Workspace::CreateOutputTensorBuffer(const NetDef &net_def, << ", " << dynamic_cast(tensor->UnderlyingBuffer()) ->image_shape()[1]; - } else if (device_type == DeviceType::CPU) { - VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")" + } else { + VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")" << " Mem: " << mem_ids[i] << ", Buffer size: " << tensor->UnderlyingBuffer()->size(); } @@ -356,14 +358,6 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, return MaceStatus::MACE_SUCCESS; } -ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) { - if (device_type == CPU) { - return host_scratch_buffer_.get(); - } else { - return nullptr; - } -} - void Workspace::RemoveUnusedBuffer() { auto iter = tensor_map_.begin(); auto end_iter = tensor_map_.end(); diff --git a/mace/core/workspace.h b/mace/core/workspace.h index 71850098e03593083454acc7102743a0cc106f1b..2a8089370c01c4341d6cd94a775ee6eaf1443910 100644 --- a/mace/core/workspace.h +++ b/mace/core/workspace.h @@ -52,8 +52,6 @@ class Workspace { Device *device, const unsigned char *model_data); - ScratchBuffer *GetScratchBuffer(DeviceType device_type); - void RemoveUnusedBuffer(); void RemoveAndReloadBuffer(const NetDef &net_def, @@ -64,15 +62,12 @@ class Workspace { MaceStatus CreateOutputTensorBuffer(const NetDef &net_def, Device *device); - Device *device_; - TensorMap tensor_map_; std::unique_ptr tensor_buffer_; PreallocatedPooledAllocator preallocated_allocator_; - std::unique_ptr host_scratch_buffer_; bool fused_buffer_; MACE_DISABLE_COPY_AND_ASSIGN(Workspace); diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD index 3491b743332ca1da9b85977be554b877efab78a0..6b37cf5060376493d0925f8eeca8360e83261f84 100644 --- a/mace/kernels/BUILD +++ b/mace/kernels/BUILD @@ -32,6 +32,8 @@ cc_library( ) + if_opencl_enabled(glob( [ "opencl/*.cc", + "opencl/image/*.cc", + "opencl/buffer/*.cc", ], exclude = [ "opencl/*_test.cc", @@ -43,14 +45,16 @@ cc_library( "arm/*.h", ], 
exclude = [ - "buffer_to_image.h", - "image_to_buffer.h", + "buffer_transform.h", + "buffer_inverse_transform.h", "lstmcell.h", ], ) + if_opencl_enabled(glob([ "opencl/*.h", - "buffer_to_image.h", - "image_to_buffer.h", + "opencl/image/*.h", + "opencl/buffer/*.h", + "buffer_transform.h", + "buffer_inverse_transform.h", "lstmcell.h", ])), copts = [ diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h index 47350d913e1faeb2a2686eada01afd22078fba22..59f7edd84ba648ba7aff55a100bc723b1a0ba510 100644 --- a/mace/kernels/activation.h +++ b/mace/kernels/activation.h @@ -26,10 +26,6 @@ #include "mace/core/types.h" #include "mace/kernels/kernel.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { @@ -164,15 +160,22 @@ class ActivationFunctor : OpKernel { }; #ifdef MACE_ENABLE_OPENCL +class OpenCLActivationKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *alpha, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLActivationKernel); +}; template class ActivationFunctor : OpKernel { public: ActivationFunctor(OpKernelContext *context, ActivationType type, - T relux_max_limit) - : OpKernel(context), - activation_(type), - relux_max_limit_(static_cast(relux_max_limit)) {} + T relux_max_limit); MaceStatus operator()(const Tensor *input, const Tensor *alpha, @@ -180,13 +183,7 @@ class ActivationFunctor : OpKernel { StatsFuture *future); private: - ActivationType activation_; - T relux_max_limit_; - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::string tuning_key_prefix_; - std::vector input_shape_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h index d81f25a337410d1225f9d8e49e071e496372d79a..2fa3e21a91c48782cbcb73de4326731f6b656671 100644 --- a/mace/kernels/addn.h +++ 
b/mace/kernels/addn.h @@ -26,10 +26,6 @@ #include "mace/core/tensor.h" #include "mace/kernels/kernel.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { @@ -96,17 +92,23 @@ struct AddNFunctor : OpKernel { }; #ifdef MACE_ENABLE_OPENCL +class OpenCLAddNKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const std::vector &input_tensors, + Tensor *output_tensor, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLAddNKernel); +}; template struct AddNFunctor : OpKernel { - explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {} + explicit AddNFunctor(OpKernelContext *context); MaceStatus operator()(const std::vector &input_tensors, - Tensor *output_tensor, - StatsFuture *future); + Tensor *output_tensor, + StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h index 4c9aac3a6415fdd8bc60e1af34ca3d51e9ca9a12..75e58f937ddca72241648ae52d4df8f079bf3f39 100644 --- a/mace/kernels/batch_norm.h +++ b/mace/kernels/batch_norm.h @@ -26,41 +26,22 @@ #include "mace/kernels/activation.h" #include "mace/public/mace.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { -struct BatchNormFunctorBase : OpKernel { - BatchNormFunctorBase(OpKernelContext *context, - bool folded_constant, - const ActivationType activation, - const float relux_max_limit) - : OpKernel(context), - folded_constant_(folded_constant), - activation_(activation), - relux_max_limit_(relux_max_limit) {} - - const bool folded_constant_; - const ActivationType activation_; - const float relux_max_limit_; -}; - template struct BatchNormFunctor; template<> -struct BatchNormFunctor : 
BatchNormFunctorBase { +struct BatchNormFunctor : OpKernel { BatchNormFunctor(OpKernelContext *context, const bool folded_constant, const ActivationType activation, const float relux_max_limit) - : BatchNormFunctorBase(context, - folded_constant, - activation, - relux_max_limit) {} + : OpKernel(context), + folded_constant_(folded_constant), + activation_(activation), + relux_max_limit_(relux_max_limit) {} MaceStatus operator()(const Tensor *input, const Tensor *scale, @@ -133,31 +114,42 @@ struct BatchNormFunctor : BatchNormFunctorBase { return MACE_SUCCESS; } + + const bool folded_constant_; + const ActivationType activation_; + const float relux_max_limit_; }; #ifdef MACE_ENABLE_OPENCL +class OpenCLBatchNormKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *scale, + const Tensor *offset, + const Tensor *mean, + const Tensor *var, + const float epsilon, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBatchNormKernel); +}; template -struct BatchNormFunctor : BatchNormFunctorBase { +struct BatchNormFunctor : OpKernel { BatchNormFunctor(OpKernelContext *context, const bool folded_constant, const ActivationType activation, - const float relux_max_limit) - : BatchNormFunctorBase(context, - folded_constant, - activation, - relux_max_limit) {} + const float relux_max_limit); MaceStatus operator()(const Tensor *input, - const Tensor *scale, - const Tensor *offset, - const Tensor *mean, - const Tensor *var, - const float epsilon, - Tensor *output, - StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + const Tensor *scale, + const Tensor *offset, + const Tensor *mean, + const Tensor *var, + const float epsilon, + Tensor *output, + StatsFuture *future); + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/batch_to_space.h b/mace/kernels/batch_to_space.h index 
a88959d26fe6643cf43dc6061c9e07bad2354669..45b2ff8850f8cedef493fac1da575ec594d482be 100644 --- a/mace/kernels/batch_to_space.h +++ b/mace/kernels/batch_to_space.h @@ -24,10 +24,6 @@ #include "mace/kernels/kernel.h" #include "mace/public/mace.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { @@ -51,7 +47,8 @@ struct BatchToSpaceFunctorBase : OpKernel { void CalculateBatchToSpaceOutputShape(const Tensor *input_tensor, const DataFormat data_format, index_t *output_shape) { - MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D"); + MACE_CHECK(input_tensor->dim_size() == 4, + "Input(", input_tensor->name(), ") shape should be 4D"); index_t batch = input_tensor->dim(0); index_t channels = 0; index_t height = 0; @@ -96,8 +93,8 @@ struct BatchToSpaceFunctor : BatchToSpaceFunctorBase { const std::vector &block_shape) : BatchToSpaceFunctorBase(context, paddings, block_shape) {} - MaceStatus operator()(Tensor *space_tensor, - Tensor *batch_tensor, + MaceStatus operator()(const Tensor *batch_tensor, + Tensor *space_tensor, StatsFuture *future) { MACE_UNUSED(future); @@ -191,8 +188,8 @@ struct BatchToSpaceFunctor : BatchToSpaceFunctorBase { const std::vector &block_shape) : BatchToSpaceFunctorBase(context, paddings, block_shape) {} - MaceStatus operator()(Tensor *space_tensor, - Tensor *batch_tensor, + MaceStatus operator()(const Tensor *batch_tensor, + Tensor *space_tensor, StatsFuture *future) { MACE_UNUSED(future); @@ -272,21 +269,29 @@ struct BatchToSpaceFunctor : BatchToSpaceFunctorBase { }; #ifdef MACE_ENABLE_OPENCL +class OpenCLBatchToSpaceKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *batch_tensor, + const std::vector &paddings, + const std::vector &block_shape, + const std::vector &output_shape, + Tensor *space_tensor, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBatchToSpaceKernel); +}; 
template struct BatchToSpaceFunctor : BatchToSpaceFunctorBase { BatchToSpaceFunctor(OpKernelContext *context, const std::vector &paddings, - const std::vector &block_shape) - : BatchToSpaceFunctorBase(context, paddings, block_shape) {} + const std::vector &block_shape); - MaceStatus operator()(Tensor *space_tensor, - Tensor *batch_tensor, - StatsFuture *future); + MaceStatus operator()(const Tensor *batch_tensor, + Tensor *space_tensor, + StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector space_shape_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h index e2ea8ccfb88308e6fcfa3de731e6905fe970b13b..d58a4d93e26a6f10c324bb21f0e3dca9c111ec3a 100644 --- a/mace/kernels/bias_add.h +++ b/mace/kernels/bias_add.h @@ -24,10 +24,6 @@ #include "mace/kernels/kernel.h" #include "mace/public/mace.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { @@ -96,18 +92,26 @@ struct BiasAddFunctor : BiasAddFunctorBase { }; #ifdef MACE_ENABLE_OPENCL +class OpenCLBiasAddKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBiasAddKernel); +}; + template struct BiasAddFunctor : BiasAddFunctorBase { - BiasAddFunctor(OpKernelContext *context, const DataFormat data_format) - : BiasAddFunctorBase(context, data_format) {} + BiasAddFunctor(OpKernelContext *context, const DataFormat data_format); MaceStatus operator()(const Tensor *input, const Tensor *bias, Tensor *output, StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/buffer_inverse_transform.h 
b/mace/kernels/buffer_inverse_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..2b3e0098cc243c0b14e66e91e8b3e128d6e863a4 --- /dev/null +++ b/mace/kernels/buffer_inverse_transform.h @@ -0,0 +1,82 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_ +#define MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_ + +#include +#include + +#include "mace/core/future.h" +#include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" +#include "mace/kernels/opencl/common.h" + +namespace mace { +namespace kernels { + +struct BufferInverseTransformFunctorBase : OpKernel { + BufferInverseTransformFunctorBase(OpKernelContext *context, + const int wino_blk_size) + : OpKernel(context), + wino_blk_size_(wino_blk_size) {} + const int wino_blk_size_; +}; + +template +struct BufferInverseTransformFunctor : BufferInverseTransformFunctorBase { + explicit BufferInverseTransformFunctor(OpKernelContext *context, + const int wino_blk_size) + : BufferInverseTransformFunctorBase(context, wino_blk_size) {} + MaceStatus operator()(const Tensor *input, + const BufferType type, + Tensor *output, + StatsFuture *future) { + MACE_UNUSED(input); + MACE_UNUSED(type); + MACE_UNUSED(output); + MACE_UNUSED(future); + MACE_NOT_IMPLEMENTED; + return MACE_SUCCESS; + } +}; + +class OpenCLBufferInverseTransformKernel { + public: + virtual MaceStatus Compute(OpKernelContext 
*context, + const Tensor *input, + const BufferType type, + const int wino_blk_size, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBufferInverseTransformKernel) +}; + +template +struct BufferInverseTransformFunctor + : BufferInverseTransformFunctorBase { + explicit BufferInverseTransformFunctor(OpKernelContext *context, + const int wino_blk_size); + MaceStatus operator()(const Tensor *input, + const BufferType type, + Tensor *output, + StatsFuture *future); + + std::unique_ptr kernel_; +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_ diff --git a/mace/kernels/buffer_to_image.h b/mace/kernels/buffer_to_image.h deleted file mode 100644 index 4a2f731b0e49baee5db257998e1d82c665a0aee2..0000000000000000000000000000000000000000 --- a/mace/kernels/buffer_to_image.h +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_KERNELS_BUFFER_TO_IMAGE_H_ -#define MACE_KERNELS_BUFFER_TO_IMAGE_H_ - -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/kernels/opencl/common.h" - -namespace mace { -namespace kernels { - -struct BufferToImageFunctorBase : OpKernel { - explicit BufferToImageFunctorBase(OpKernelContext *context, - const int wino_blk_size) - : OpKernel(context), wino_blk_size_(wino_blk_size) {} - const int wino_blk_size_; -}; - -template -struct BufferToImageFunctor : BufferToImageFunctorBase { - explicit BufferToImageFunctor(OpKernelContext *context, - const int wino_blk_size) - : BufferToImageFunctorBase(context, wino_blk_size) {} - MaceStatus operator()(const Tensor *input, - const BufferType type, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(input); - MACE_UNUSED(type); - MACE_UNUSED(output); - MACE_UNUSED(future); - MACE_NOT_IMPLEMENTED; - return MACE_SUCCESS; - } -}; - -template -struct BufferToImageFunctor : BufferToImageFunctorBase { - explicit BufferToImageFunctor(OpKernelContext *context, - const int wino_blk_size) - : BufferToImageFunctorBase(context, wino_blk_size) {} - MaceStatus operator()(const Tensor *input, - const BufferType type, - Tensor *output, - StatsFuture *future); - - cl::Kernel kernel_; - std::unique_ptr kernel_error_; - std::vector input_shape_; -}; - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_BUFFER_TO_IMAGE_H_ diff --git a/mace/kernels/image_to_buffer.h b/mace/kernels/buffer_transform.h similarity index 54% rename from mace/kernels/image_to_buffer.h rename to mace/kernels/buffer_transform.h index c4394fda15e95c2c65af625ed0e711af4391be6b..8f0fd039c0b68be20373e58d7e465df3d870180b 100644 --- a/mace/kernels/image_to_buffer.h +++ b/mace/kernels/buffer_transform.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_IMAGE_TO_BUFFER_H_ -#define MACE_KERNELS_IMAGE_TO_BUFFER_H_ +#ifndef MACE_KERNELS_BUFFER_TRANSFORM_H_ +#define MACE_KERNELS_BUFFER_TRANSFORM_H_ #include #include @@ -26,18 +26,19 @@ namespace mace { namespace kernels { -struct ImageToBufferFunctorBase : OpKernel { - ImageToBufferFunctorBase(OpKernelContext *context, - const int wino_blk_size) - : OpKernel(context), - wino_blk_size_(wino_blk_size) {} +struct BufferTransformFunctorBase : OpKernel { + explicit BufferTransformFunctorBase(OpKernelContext *context, + const int wino_blk_size) + : OpKernel(context), wino_blk_size_(wino_blk_size) {} const int wino_blk_size_; }; template -struct ImageToBufferFunctor : ImageToBufferFunctorBase { - ImageToBufferFunctor(OpKernelContext *context, const int wino_blk_size) - : ImageToBufferFunctorBase(context, wino_blk_size) {} +struct BufferTransformFunctor : BufferTransformFunctorBase { + BufferTransformFunctor(OpKernelContext *context, + const int wino_blk_size) + : BufferTransformFunctorBase(context, wino_blk_size) {} + MaceStatus operator()(const Tensor *input, const BufferType type, Tensor *output, @@ -51,22 +52,30 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase { } }; +class OpenCLBufferTransformKernel { + public: + virtual MaceStatus Compute(OpKernelContext *context, + const Tensor *input, + const BufferType type, + const int wino_blk_size, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBufferTransformKernel) +}; + template -struct ImageToBufferFunctor : ImageToBufferFunctorBase { - ImageToBufferFunctor(OpKernelContext *context, - const int wino_blk_size) - : ImageToBufferFunctorBase(context, wino_blk_size) {} +struct BufferTransformFunctor : BufferTransformFunctorBase { + BufferTransformFunctor(OpKernelContext *context, const int wino_blk_size); + MaceStatus operator()(const Tensor *input, const BufferType type, Tensor *output, StatsFuture *future); - cl::Kernel kernel_; - std::unique_ptr kernel_error_; 
- std::vector input_shape_; + std::unique_ptr kernel_; }; } // namespace kernels } // namespace mace -#endif // MACE_KERNELS_IMAGE_TO_BUFFER_H_ +#endif // MACE_KERNELS_BUFFER_TRANSFORM_H_ diff --git a/mace/kernels/channel_shuffle.h b/mace/kernels/channel_shuffle.h index 029eb1c66b665baed39cacec05c9dbe9b45ca1b5..d5cf5fe0bb746918115e271f4b471cab3c1ec8b1 100644 --- a/mace/kernels/channel_shuffle.h +++ b/mace/kernels/channel_shuffle.h @@ -71,20 +71,24 @@ struct ChannelShuffleFunctor : OpKernel { }; #ifdef MACE_ENABLE_OPENCL +class OpenCLChannelShuffleKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLChannelShuffleKernel); +}; template struct ChannelShuffleFunctor : OpKernel { - ChannelShuffleFunctor(OpKernelContext *context, const int groups) - : OpKernel(context), groups_(groups) {} + ChannelShuffleFunctor(OpKernelContext *context, const int groups); MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - const int groups_; - std::vector input_shape_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h index 7cd6958fca52d3a5e9bb237d9db8ac2b0aa7d87c..14bf38cdce8d0627c2441649e93611bf9ae096b1 100644 --- a/mace/kernels/concat.h +++ b/mace/kernels/concat.h @@ -24,24 +24,13 @@ #include "mace/kernels/kernel.h" #include "mace/public/mace.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { -struct ConcatFunctorBase : OpKernel { - ConcatFunctorBase(OpKernelContext *context, const int32_t axis) - : OpKernel(context), axis_(axis) {} - - int32_t axis_; -}; - template -struct ConcatFunctor : ConcatFunctorBase { +struct ConcatFunctor : OpKernel { ConcatFunctor(OpKernelContext *context, 
const int32_t axis) - : ConcatFunctorBase(context, axis) {} + : OpKernel(context), axis_(axis) {} MaceStatus operator()(const std::vector &input_list, Tensor *output, @@ -98,21 +87,29 @@ struct ConcatFunctor : ConcatFunctorBase { return MACE_SUCCESS; } + + int32_t axis_; }; #ifdef MACE_ENABLE_OPENCL +class OpenCLConcatKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const std::vector &input_list, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLConcatKernel); +}; template -struct ConcatFunctor : ConcatFunctorBase { - ConcatFunctor(OpKernelContext *context, const int32_t axis) - : ConcatFunctorBase(context, axis) {} +struct ConcatFunctor : OpKernel { + ConcatFunctor(OpKernelContext *context, const int32_t axis); MaceStatus operator()(const std::vector &input_list, - Tensor *output, - StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + Tensor *output, + StatsFuture *future); + + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index 4e3c56ce3786e1af4cd14496ccdd4c2ca5f22905..e7b8e633e087d0e8af6ecc2d967928a8907cdc99 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -35,10 +35,6 @@ #include "mace/kernels/quantize.h" #include "mace/utils/utils.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { @@ -78,8 +74,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { const int *dilations, const ActivationType activation, const float relux_max_limit, - const bool is_filter_transformed, - ScratchBuffer *scratch) + const bool is_filter_transformed) : Conv2dFunctorBase(context, strides, padding_type, @@ -88,8 +83,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { activation, relux_max_limit), transformed_filter_(GetCPUAllocator(), DataType::DT_FLOAT), - 
is_filter_transformed_(is_filter_transformed), - scratch_(scratch) {} + is_filter_transformed_(is_filter_transformed) {} void Conv2dGeneral(const float *input, const float *filter, @@ -494,14 +488,15 @@ struct Conv2dFunctor : Conv2dFunctorBase { } // Init scratch buffer - scratch_->Rewind(); - scratch_->GrowSize(total_scratch_size); + ScratchBuffer *scratch = context_->device()->scratch_buffer(); + scratch->Rewind(); + scratch->GrowSize(total_scratch_size); Tensor - transformed_input(scratch_->Scratch(transformed_input_size), DT_FLOAT); + transformed_input(scratch->Scratch(transformed_input_size), DT_FLOAT); Tensor - transformed_output(scratch_->Scratch(transformed_output_size), DT_FLOAT); - Tensor padded_input(scratch_->Scratch(padded_input_size), DT_FLOAT); - Tensor padded_output(scratch_->Scratch(padded_output_size), DT_FLOAT); + transformed_output(scratch->Scratch(transformed_output_size), DT_FLOAT); + Tensor padded_input(scratch->Scratch(padded_input_size), DT_FLOAT); + Tensor padded_output(scratch->Scratch(padded_output_size), DT_FLOAT); const index_t extra_input_shape[4] = {batch, input_channels, extra_input_height, extra_input_width}; const index_t extra_output_shape[4] = @@ -559,7 +554,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { transformed_output_data, pad_output, &sgemm_, - scratch_); + scratch); }; } else if (use_neon_3x3_s1) { conv_func = [=](const float *pad_input, float *pad_output) { @@ -588,7 +583,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { channels, pad_output, &sgemm_, - scratch_); + scratch); }; } else if (use_neon_5x5_s1) { conv_func = [=](const float *pad_input, float *pad_output) { @@ -735,7 +730,6 @@ struct Conv2dFunctor : Conv2dFunctorBase { Tensor transformed_filter_; bool is_filter_transformed_; - ScratchBuffer *scratch_; SGemm sgemm_; }; @@ -748,16 +742,14 @@ struct Conv2dFunctor : Conv2dFunctorBase { const int *dilations, const ActivationType activation, const float relux_max_limit, - const bool is_filter_transformed, - 
ScratchBuffer *scratch) + const bool is_filter_transformed) : Conv2dFunctorBase(context, strides, padding_type, paddings, dilations, activation, - relux_max_limit), - scratch_(scratch) { + relux_max_limit) { MACE_UNUSED(is_filter_transformed); } @@ -926,13 +918,14 @@ struct Conv2dFunctor : Conv2dFunctorBase { bool im2col_required = filter_h != 1 || filter_w != 1 || stride_h != 1 || stride_w != 1; total_scratch_size += (im2col_required ? im2col_size : 0); - scratch_->Rewind(); - scratch_->GrowSize(total_scratch_size); + ScratchBuffer *scratch = context_->device()->scratch_buffer(); + scratch->Rewind(); + scratch->GrowSize(total_scratch_size); std::unique_ptr zero_bias; const int32_t *bias_data = nullptr; if (bias == nullptr) { - zero_bias.reset(new Tensor(scratch_->Scratch(zero_bias_size), DT_INT32)); + zero_bias.reset(new Tensor(scratch->Scratch(zero_bias_size), DT_INT32)); zero_bias->Reshape({channels}); zero_bias->Clear(); bias_data = zero_bias->data(); @@ -944,7 +937,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { auto gemm_input_data = input_data; if (im2col_required) { // prepare im2col - im2col.reset(new Tensor(scratch_->Scratch(im2col_size), DT_UINT8)); + im2col.reset(new Tensor(scratch->Scratch(im2col_size), DT_UINT8)); uint8_t *im2col_data = im2col->mutable_data(); Im2col(input_data, input->shape(), filter_h, filter_w, stride_h, stride_w, static_cast(input->zero_point()), @@ -976,12 +969,28 @@ struct Conv2dFunctor : Conv2dFunctorBase { return MACE_SUCCESS; } - - ScratchBuffer *scratch_; }; #ifdef MACE_ENABLE_OPENCL -template +class OpenCLConv2dKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + Tensor *output, + StatsFuture *future) = 0; + 
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLConv2dKernel); +}; + +template struct Conv2dFunctor : Conv2dFunctorBase { Conv2dFunctor(OpKernelContext *context, const int *strides, @@ -990,18 +999,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { const int *dilations, const ActivationType activation, const float relux_max_limit, - const bool is_filter_transformed, - ScratchBuffer *scratch) - : Conv2dFunctorBase(context, - strides, - padding_type, - paddings, - dilations, - activation, - relux_max_limit) { - MACE_UNUSED(is_filter_transformed); - MACE_UNUSED(scratch); - } + const bool is_filter_transformed); MaceStatus operator()(const Tensor *input, const Tensor *filter, @@ -1009,10 +1007,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { Tensor *output, StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/conv_pool_2d_util.cc b/mace/kernels/conv_pool_2d_util.cc index 801f206162f2155cd566d055692de8ddf6cbedd6..ce9fb39c77cf592f6b8a34a137af8f195e30364e 100644 --- a/mace/kernels/conv_pool_2d_util.cc +++ b/mace/kernels/conv_pool_2d_util.cc @@ -210,6 +210,20 @@ void CalcOutputSize(const index_t *input_shape, } } +void CalcNCHWInputShape(const index_t *output_shape, + const index_t *filter_shape, + const int *strides, + const int *dilations, + index_t *input_shape) { + MACE_CHECK_NOTNULL(input_shape); + input_shape[0] = output_shape[0]; + input_shape[1] = filter_shape[1]; + input_shape[2] = (output_shape[2] - 1) * strides[0] + + (filter_shape[2] - 1) * dilations[0] + 1; + input_shape[3] = (output_shape[3] - 1) * strides[1] + + (filter_shape[3] - 1) * dilations[1] + 1; +} + void CalcOutputSize(const index_t *input_shape, // NHWC const index_t *filter_shape, // OIHW const int *padding_size, @@ -234,8 +248,8 @@ void CalcNCHWOutputSize(const index_t *input_shape, // NCHW void CalPaddingSize(const index_t *input_shape, // NCHW const 
index_t *filter_shape, // OIHW - const int *dilations, const int *strides, + const int *dilations, Padding padding, int *padding_size) { MACE_CHECK(dilations[0] > 0 && dilations[1] > 0, diff --git a/mace/kernels/conv_pool_2d_util.h b/mace/kernels/conv_pool_2d_util.h index dba90bc5cdbb056c5fbce34930b5bb019b1f1955..e735a97d0e78af1b9344f0bfc2facfca05ae166e 100644 --- a/mace/kernels/conv_pool_2d_util.h +++ b/mace/kernels/conv_pool_2d_util.h @@ -84,6 +84,12 @@ void CalcNCHWOutputSize(const index_t *input_shape, const RoundType round_type, index_t *output_shape); +void CalcNCHWInputShape(const index_t *output_shape, + const index_t *filter_shape, + const int *strides, + const int *dilations, + index_t *input_shape); + void CalPaddingSize(const index_t *input_shape, // NCHW const index_t *filter_shape, // OIHW const int *dilations, @@ -91,6 +97,7 @@ void CalPaddingSize(const index_t *input_shape, // NCHW Padding padding, int *padding_size); + MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input, const int pad_top, const int pad_bottom, const int pad_left, const int pad_right, diff --git a/mace/kernels/crop.h b/mace/kernels/crop.h index 6ad9650ee406d13a8ca2b64b41fadd81ce462ca6..0838b69a91b91ae70a7ad16d4a264163c392e3a0 100644 --- a/mace/kernels/crop.h +++ b/mace/kernels/crop.h @@ -24,31 +24,17 @@ #include "mace/kernels/kernel.h" #include "mace/public/mace.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { -struct CropFunctorBase : OpKernel { - CropFunctorBase(OpKernelContext *context, - const int axis, - const std::vector &offset) - : OpKernel(context), - axis_(axis), - offset_(offset) {} - - const int axis_; - std::vector offset_; -}; - template -struct CropFunctor : CropFunctorBase { +struct CropFunctor : OpKernel { CropFunctor(OpKernelContext *context, const int axis, const std::vector &offset) - : CropFunctorBase(context, axis, offset) {} + : 
OpKernel(context), + axis_(axis), + offset_(offset) {} void crop_copy(const T* input_data, T* output_data, const std::vector &input_shape, @@ -121,23 +107,31 @@ struct CropFunctor : CropFunctorBase { return MACE_SUCCESS; } + + const int axis_; + std::vector offset_; }; #ifdef MACE_ENABLE_OPENCL +class OpenCLCropKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const std::vector &input_list, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLCropKernel); +}; template -struct CropFunctor : CropFunctorBase { +struct CropFunctor : OpKernel { CropFunctor(OpKernelContext *context, const int axis, - const std::vector &offset) - : CropFunctorBase(context, axis, offset) {} + const std::vector &offset); MaceStatus operator()(const std::vector &input_list, Tensor *output, StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h index 4bfc4d613a1454624e1a373f677ebd4df29c1db9..72edfca81b886d1624ce8828e72209c29d4c2ce7 100644 --- a/mace/kernels/deconv_2d.h +++ b/mace/kernels/deconv_2d.h @@ -28,10 +28,6 @@ #include "mace/kernels/conv_pool_2d_util.h" #include "mace/utils/utils.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { @@ -317,6 +313,22 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { }; #ifdef MACE_ENABLE_OPENCL +class OpenCLDeconv2dKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const int *padding_data, + const ActivationType activation, + const float relux_max_limit, + const std::vector &output_shape, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDeconv2dKernel); 
+}; template struct Deconv2dFunctor : Deconv2dFunctorBase { Deconv2dFunctor(OpKernelContext *context, @@ -325,14 +337,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { const std::vector &paddings, const std::vector &output_shape, const ActivationType activation, - const float relux_max_limit) - : Deconv2dFunctorBase(context, - strides, - padding_type, - paddings, - output_shape, - activation, - relux_max_limit) {} + const float relux_max_limit); MaceStatus operator()(const Tensor *input, const Tensor *filter, @@ -341,10 +346,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { Tensor *output, StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h index 601da6cc81f35daf664abcd93cc9eb1c643d8416..e73dec7660e4f61bafc5356e499e1076d5f1ef79 100644 --- a/mace/kernels/depth_to_space.h +++ b/mace/kernels/depth_to_space.h @@ -93,20 +93,24 @@ struct DepthToSpaceOpFunctor : OpKernel { }; #ifdef MACE_ENABLE_OPENCL +class OpenCLDepthToSpaceKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDepthToSpaceKernel); +}; template struct DepthToSpaceOpFunctor : OpKernel { DepthToSpaceOpFunctor(OpKernelContext *context, - const int block_size) - : OpKernel(context), block_size_(block_size) {} + const int block_size); MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future); - const int block_size_; - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h index bd98184362fe10286c23e8172cadf53f87270c84..a7765b3022f0a57ee3e0826d8c8037761c62150a 100644 
--- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.h @@ -501,6 +501,24 @@ struct DepthwiseConv2dFunctor }; #ifdef MACE_ENABLE_OPENCL +class OpenCLDepthwiseConv2dKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDepthwiseConv2dKernel); +}; + template struct DepthwiseConv2dFunctor : DepthwiseConv2dFunctorBase { @@ -510,25 +528,15 @@ struct DepthwiseConv2dFunctor const std::vector &paddings, const int *dilations, const ActivationType activation, - const float relux_max_limit) - : DepthwiseConv2dFunctorBase(context, - strides, - padding_type, - paddings, - dilations, - activation, - relux_max_limit) {} + const float relux_max_limit); MaceStatus operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future); - - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + const Tensor *filter, + const Tensor *bias, + Tensor *output, + StatsFuture *future); + + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h index 5403733b5f634b9c4025553e68cf1a32f65d4a0a..d507011a2ed74b93ce105100590d6f20a3bbc899 100644 --- a/mace/kernels/eltwise.h +++ b/mace/kernels/eltwise.h @@ -27,10 +27,6 @@ #include "mace/kernels/kernel.h" #include "mace/utils/quantize.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { @@ -805,41 +801,20 @@ inline void TensorEltwisePerChannel(const EltwiseType type, } } -struct EltwiseFunctorBase : OpKernel { - 
EltwiseFunctorBase(OpKernelContext *context, - const EltwiseType type, - const std::vector &coeff, - const float scalar_input, - const int32_t scalar_input_index, - const DataFormat data_format) - : OpKernel(context), - type_(type), - coeff_(coeff), - scalar_input_(scalar_input), - scalar_input_index_(scalar_input_index), - data_format_(data_format) {} - - EltwiseType type_; - std::vector coeff_; - float scalar_input_; - int32_t scalar_input_index_; - DataFormat data_format_; -}; - template -struct EltwiseFunctor : EltwiseFunctorBase { +struct EltwiseFunctor : OpKernel { EltwiseFunctor(OpKernelContext *context, const EltwiseType type, const std::vector &coeff, const float scalar_input, // float as it comes from arg const int32_t scalar_input_index, const DataFormat data_format) - : EltwiseFunctorBase(context, - type, - coeff, - scalar_input, - scalar_input_index, - data_format) {} + : OpKernel(context), + type_(type), + coeff_(coeff), + scalar_input_(scalar_input), + scalar_input_index_(scalar_input_index), + data_format_(data_format) {} template MaceStatus DoEltwise(const Tensor *input0, @@ -957,23 +932,28 @@ struct EltwiseFunctor : EltwiseFunctorBase { } } + EltwiseType type_; + std::vector coeff_; + float scalar_input_; + int32_t scalar_input_index_; + DataFormat data_format_; Tensor scalar_tensor_; }; template <> -struct EltwiseFunctor : EltwiseFunctorBase { +struct EltwiseFunctor : OpKernel { EltwiseFunctor(OpKernelContext *context, const EltwiseType type, const std::vector &coeff, const float scalar_input, // float as it comes from arg const int32_t scalar_input_index, const DataFormat data_format) - : EltwiseFunctorBase(context, - type, - coeff, - scalar_input, - scalar_input_index, - data_format) {} + : OpKernel(context), + type_(type), + coeff_(coeff), + scalar_input_(scalar_input), + scalar_input_index_(scalar_input_index), + data_format_(data_format) {} MaceStatus operator()(const Tensor *input0, const Tensor *input1, @@ -1093,33 +1073,41 @@ struct 
EltwiseFunctor : EltwiseFunctorBase { return MACE_SUCCESS; } + + EltwiseType type_; + std::vector coeff_; + float scalar_input_; + int32_t scalar_input_index_; + DataFormat data_format_; + Tensor scalar_tensor_; }; #ifdef MACE_ENABLE_OPENCL +class OpenCLEltwiseKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *input0, + const Tensor *input1, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLEltwiseKernel); +}; template -struct EltwiseFunctor : EltwiseFunctorBase { +struct EltwiseFunctor : OpKernel { EltwiseFunctor(OpKernelContext *context, const EltwiseType type, const std::vector &coeff, const float scalar_input, const int32_t scalar_input_index, - const DataFormat data_format) - : EltwiseFunctorBase(context, - type, - coeff, - scalar_input, - scalar_input_index, - data_format) {} + const DataFormat data_format); MaceStatus operator()(const Tensor *input0, const Tensor *input1, Tensor *output, StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h index ccbc63445f8e2d038ed73db184f96e62c79a7792..20a572cbc52fa7ae241c6d77f9e0b9bf2b42e000 100644 --- a/mace/kernels/fully_connected.h +++ b/mace/kernels/fully_connected.h @@ -151,24 +151,32 @@ struct FullyConnectedFunctor: FullyConnectedBase { }; #ifdef MACE_ENABLE_OPENCL +class OpenCLFullyConnectedKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *weight, + const Tensor *bias, + const ActivationType activation, + const float relux_max_limit, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLFullyConnectedKernel); +}; template struct FullyConnectedFunctor : FullyConnectedBase { FullyConnectedFunctor(OpKernelContext *context, const ActivationType 
activation, - const float relux_max_limit) - : FullyConnectedBase(context, activation, relux_max_limit) {} + const float relux_max_limit); MaceStatus operator()(const Tensor *input, - const Tensor *weight, - const Tensor *bias, - Tensor *output, - StatsFuture *future); - - cl::Kernel kernel_; - std::vector gws_; - std::vector lws_; - std::vector input_shape_; - std::unique_ptr kernel_error_; + const Tensor *weight, + const Tensor *bias, + Tensor *output, + StatsFuture *future); + + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/lstmcell.h b/mace/kernels/lstmcell.h index cb6b86fdd2959067b9d5c53bc69cdb325b286d2e..81a7f386a215b71511da3566e5442dec6711cb65 100644 --- a/mace/kernels/lstmcell.h +++ b/mace/kernels/lstmcell.h @@ -35,11 +35,23 @@ namespace kernels { template struct LSTMCellFunctor; +class OpenCLLSTMCellKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *pre_output, + const Tensor *weight, + const Tensor *bias, + const Tensor *pre_cell, + Tensor *cell, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLLSTMCellKernel); +}; template struct LSTMCellFunctor : OpKernel{ - LSTMCellFunctor(OpKernelContext *context, T forget_bias) - : OpKernel(context), - forget_bias_(static_cast(forget_bias)) {} + LSTMCellFunctor(OpKernelContext *context, T forget_bias); MaceStatus operator()(const Tensor *input, const Tensor *pre_output, const Tensor *weight, @@ -49,11 +61,7 @@ struct LSTMCellFunctor : OpKernel{ Tensor *output, StatsFuture *future); - T forget_bias_; - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + std::unique_ptr kernel_; }; } // namespace kernels diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h index 137c71517355e2ad8f1e559e47257864ed0460fa..5dab02c528867295df8cd7b2bf03ed1d2bcede81 100644 --- a/mace/kernels/matmul.h +++ b/mace/kernels/matmul.h @@ -34,10 +34,6 @@ 
#include "mace/kernels/gemmlowp_util.h" #include "mace/kernels/sgemm.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { @@ -89,7 +85,7 @@ struct MatMulFunctor : OpKernel { const index_t height_b = B->dim(rank - 2); const index_t width_b = B->dim(rank - 1); - auto scratch_buffer = context_->workspace()->GetScratchBuffer(D); + auto scratch_buffer = context_->device()->scratch_buffer(); scratch_buffer->Rewind(); index_t scratch_size = C->raw_max_size(); if (!A->is_weight()) { @@ -112,7 +108,7 @@ struct MatMulFunctor : OpKernel { A->is_weight(), B->is_weight(), c_ptr_base, - scratch_buffer); + context_->device()->scratch_buffer()); return MACE_SUCCESS; } @@ -218,9 +214,21 @@ struct MatMulFunctor : OpKernel { }; #ifdef MACE_ENABLE_OPENCL +class OpenCLMatMulKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *A, + const Tensor *B, + Tensor *C, + bool transpose_a, + bool transpose_b, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLMatMulKernel); +}; template struct MatMulFunctor : OpKernel { - explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {} + explicit MatMulFunctor(OpKernelContext *context); MaceStatus operator()(const Tensor *A, const Tensor *B, @@ -229,9 +237,7 @@ struct MatMulFunctor : OpKernel { bool transpose_b, StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/opencl/activation.cc b/mace/kernels/opencl/activation.cc index 7757758c379b82ccfc8238da9960d46eed50380a..14c014ba6894ac6407c989337308dff1757a6b1c 100644 --- a/mace/kernels/opencl/activation.cc +++ b/mace/kernels/opencl/activation.cc @@ -13,96 +13,31 @@ // limitations under the License. 
#include "mace/kernels/activation.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" -#include "mace/utils/utils.h" + +#include "mace/kernels/opencl/image/activation.h" namespace mace { namespace kernels { + +template +ActivationFunctor::ActivationFunctor( + OpKernelContext *context, + ActivationType type, + T relux_max_limit) : OpKernel(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset( + new opencl::image::ActivationKernel(type, relux_max_limit)); + } else { + MACE_NOT_IMPLEMENTED; + } +} template MaceStatus ActivationFunctor::operator()( const Tensor *input, const Tensor *alpha, Tensor *output, StatsFuture *future) { - const index_t batch = input->dim(0); - const index_t height = input->dim(1); - const index_t width = input->dim(2); - const index_t channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - - auto runtime = context_->device()->opencl_runtime(); - - if (kernel_.get() == nullptr) { - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); - built_options.emplace("-Dactivation=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - switch (activation_) { - case RELU: - tuning_key_prefix_ = "relu_opencl_kernel"; - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - tuning_key_prefix_ = "relux_opencl_kernel"; - built_options.emplace("-DUSE_RELUX"); - break; - case PRELU: - tuning_key_prefix_ = "prelu_opencl_kernel"; - built_options.emplace("-DUSE_PRELU"); - break; - case TANH: - tuning_key_prefix_ = "tanh_opencl_kernel"; - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - tuning_key_prefix_ = "sigmoid_opencl_kernel"; - 
built_options.emplace("-DUSE_SIGMOID"); - break; - default: - LOG(FATAL) << "Unknown activation type: " << activation_; - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - - if (!IsVecEqual(input_shape_, input->shape())) { - int idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_3D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(input->opencl_image())); - if (activation_ == PRELU) { - MACE_CHECK_NOTNULL(alpha); - kernel_.setArg(idx++, *(alpha->opencl_image())); - } - kernel_.setArg(idx++, static_cast(relux_max_limit_)); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), - output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); - - OUT_OF_RANGE_VALIDATION(kernel_error_); - return MACE_SUCCESS; + return kernel_->Compute(context_, input, alpha, output, future); } template struct ActivationFunctor; diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index 7c1c1afc669b3de85055aac01ea9f96d9cf007ec..af3d18d5453056c53be268799b73a976aa0c1083 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -13,97 +13,32 @@ // limitations under the License. 
#include "mace/kernels/addn.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" -#include "mace/utils/utils.h" + +#include "mace/kernels/opencl/image/addn.h" namespace mace { namespace kernels { +template +AddNFunctor::AddNFunctor(OpKernelContext *context) + : OpKernel(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset( + new opencl::image::AddNKernel); + } else { + MACE_NOT_IMPLEMENTED; + } +} + template MaceStatus AddNFunctor::operator()( const std::vector &input_tensors, Tensor *output_tensor, StatsFuture *future) { - size_t size = input_tensors.size(); - MACE_CHECK(size >= 2 && input_tensors[0] != nullptr); - - const index_t batch = input_tensors[0]->dim(0); - const index_t height = input_tensors[0]->dim(1); - const index_t width = input_tensors[0]->dim(2); - const index_t channels = input_tensors[0]->dim(3); - - auto runtime = context_->device()->opencl_runtime(); - - for (size_t i = 1; i < size; ++i) { - MACE_CHECK_NOTNULL(input_tensors[i]); - MACE_CHECK(batch == input_tensors[i]->dim(0)); - MACE_CHECK(height == input_tensors[i]->dim(1)); - MACE_CHECK(width == input_tensors[i]->dim(2)); - MACE_CHECK(channels == input_tensors[i]->dim(3)); - } - - if (kernel_.get() == nullptr) { - if (input_tensors.size() > 4) { - MACE_NOT_IMPLEMENTED; - } - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn"); - built_options.emplace("-Daddn=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size())); - - MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - 
static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - std::vector output_shape = input_tensors[0]->shape(); - - const index_t channel_blocks = RoundUpDiv4(channels); - const index_t width_pixels = channel_blocks * width; - const index_t batch_height_pixels = batch * height; - - const uint32_t gws[2] = {static_cast(width_pixels), - static_cast(batch_height_pixels)}; - - if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) { - std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR( - output_tensor->ResizeImage(output_shape, output_image_shape)); - - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_2D_GWS_ARGS(kernel_); - for (auto input : input_tensors) { - kernel_.setArg(idx++, *(input->opencl_image())); - } - kernel_.setArg(idx++, *(output_tensor->opencl_image())); - - input_shape_ = input_tensors[0]->shape(); - } - - const std::vector lws = {kwg_size_ / 16, 16, 0}; - std::string tuning_key = - Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1), - output_tensor->dim(2), output_tensor->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); - OUT_OF_RANGE_VALIDATION(kernel_error_); - return MACE_SUCCESS; + return kernel_->Compute(context_, input_tensors, output_tensor, future); } template struct AddNFunctor; - template struct AddNFunctor; } // namespace kernels diff --git a/mace/kernels/opencl/batch_norm.cc b/mace/kernels/opencl/batch_norm.cc index 446a26cc034bc9536d2495fc89c91d8174804f06..c09f8eb23b53c9b1a474a6b30214d6045dfa08ff 100644 --- a/mace/kernels/opencl/batch_norm.cc +++ b/mace/kernels/opencl/batch_norm.cc @@ -13,14 +13,26 @@ // limitations under the License. 
#include "mace/kernels/batch_norm.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" -#include "mace/utils/utils.h" +#include "mace/kernels/opencl/image/batch_norm.h" namespace mace { namespace kernels { +template +BatchNormFunctor::BatchNormFunctor( + OpKernelContext *context, + const bool folded_constant, + const ActivationType activation, + const float relux_max_limit) + : OpKernel(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::BatchNormKernel( + folded_constant, activation, relux_max_limit)); + } else { + MACE_NOT_IMPLEMENTED; + } +} + template MaceStatus BatchNormFunctor::operator()( const Tensor *input, @@ -31,84 +43,8 @@ MaceStatus BatchNormFunctor::operator()( const float epsilon, Tensor *output, StatsFuture *future) { - MACE_CHECK(folded_constant_ || (mean != nullptr && var != nullptr)); - - const index_t batch = input->dim(0); - const index_t height = input->dim(1); - const index_t width = input->dim(2); - const index_t channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - - auto runtime = context_->device()->opencl_runtime(); - - if (kernel_.get() == nullptr) { - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); - built_options.emplace("-Dbatch_norm=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - if (folded_constant_) { - built_options.emplace("-DFOLDED_CONSTANT"); - } - switch (activation_) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - 
built_options.emplace("-DUSE_RELUX"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - default: - LOG(FATAL) << "Unknown activation type: " << activation_; - } - - MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_3D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(scale->opencl_image())); - kernel_.setArg(idx++, *(offset->opencl_image())); - if (!folded_constant_) { - kernel_.setArg(idx++, *(mean->opencl_image())); - kernel_.setArg(idx++, *(var->opencl_image())); - kernel_.setArg(idx++, epsilon); - } - kernel_.setArg(idx++, *(output->opencl_image())); - kernel_.setArg(idx++, relux_max_limit_); - - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("batch_norm_opencl_kernel", activation_, output->dim(0), - output->dim(1), output->dim(2), output->dim(3), folded_constant_); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); - OUT_OF_RANGE_VALIDATION(kernel_error_); - return MACE_SUCCESS; + return kernel_->Compute(context_, input, scale, offset, mean, + var, epsilon, output, future); } template struct BatchNormFunctor; diff --git a/mace/kernels/opencl/batch_to_space.cc b/mace/kernels/opencl/batch_to_space.cc index ec5cf5f7a2da1ff0d7ced9df1e75eaa7bda58707..7fe533ebe168a01d6d202410b781efe1de51feac 100644 --- a/mace/kernels/opencl/batch_to_space.cc +++ b/mace/kernels/opencl/batch_to_space.cc @@ -16,84 +16,31 @@ #define MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_ #include "mace/kernels/batch_to_space.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include 
"mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" -#include "mace/utils/utils.h" +#include "mace/kernels/opencl/image/batch_to_space.h" namespace mace { namespace kernels { +template +BatchToSpaceFunctor::BatchToSpaceFunctor( + OpKernelContext *context, + const std::vector &paddings, + const std::vector &block_shape) + : BatchToSpaceFunctorBase(context, paddings, block_shape) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::BatchToSpaceKernel); + } else { + MACE_NOT_IMPLEMENTED; + } +} template MaceStatus BatchToSpaceFunctor::operator()( - Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) { + const Tensor *batch_tensor, Tensor *space_tensor, StatsFuture *future) { std::vector output_shape(4, 0); CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC, output_shape.data()); - - std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR( - space_tensor->ResizeImage(output_shape, output_image_shape)); - - const uint32_t chan_blk = - static_cast(RoundUpDiv4(batch_tensor->dim(3))); - - const uint32_t gws[3] = { - chan_blk, static_cast(batch_tensor->dim(2)), - static_cast(batch_tensor->dim(0) * batch_tensor->dim(1))}; - - auto runtime = context_->device()->opencl_runtime(); - - if (kernel_.get() == nullptr) { - const char *kernel_name = "batch_to_space"; - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - built_options.emplace(kernel_name_ss.str()); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToCLCMDDt(DataTypeToEnum::value)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space", - 
obfuscated_kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - if (!IsVecEqual(space_shape_, space_tensor->shape())) { - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_3D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(batch_tensor->opencl_image())); - kernel_.setArg(idx++, *(space_tensor->opencl_image())); - kernel_.setArg(idx++, block_shape_[0]); - kernel_.setArg(idx++, block_shape_[1]); - kernel_.setArg(idx++, paddings_[0]); - kernel_.setArg(idx++, paddings_[2]); - kernel_.setArg(idx++, static_cast(space_tensor->dim(0))); - kernel_.setArg(idx++, static_cast(space_tensor->dim(1))); - kernel_.setArg(idx++, static_cast(space_tensor->dim(2))); - kernel_.setArg(idx++, static_cast(batch_tensor->dim(1))); - kernel_.setArg(idx++, static_cast(batch_tensor->dim(2))); - - space_shape_ = space_tensor->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1), - batch_tensor->dim(2), batch_tensor->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); - - OUT_OF_RANGE_VALIDATION(kernel_error_); - return MACE_SUCCESS; + return kernel_->Compute(context_, batch_tensor, paddings_, block_shape_, + output_shape, space_tensor, future); } template struct BatchToSpaceFunctor; diff --git a/mace/kernels/opencl/bias_add.cc b/mace/kernels/opencl/bias_add.cc index eae22c0074c8205c69fc7741274b09700e94d6f1..6904eed9d113693b55578485d8d9e7d80196b5a4 100644 --- a/mace/kernels/opencl/bias_add.cc +++ b/mace/kernels/opencl/bias_add.cc @@ -13,13 +13,23 @@ // limitations under the License. 
#include "mace/kernels/bias_add.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/utils.h" +#include "mace/kernels/opencl/image/bias_add.h" namespace mace { namespace kernels { +template +BiasAddFunctor::BiasAddFunctor( + OpKernelContext *context, + const DataFormat data_format) + : BiasAddFunctorBase(context, data_format) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::BiasAddKernel); + } else { + MACE_NOT_IMPLEMENTED; + } +} + template MaceStatus BiasAddFunctor::operator()(const Tensor *input, const Tensor *bias, @@ -27,75 +37,7 @@ MaceStatus BiasAddFunctor::operator()(const Tensor *input, StatsFuture *future) { MACE_CHECK(input->dim_size() == 4 && data_format_ == NHWC, "gpu only support biasadd for 4-dimensional NHWC format tensor"); - - const index_t batch = input->dim(0); - const index_t height = input->dim(1); - const index_t width = input->dim(2); - const index_t channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - - auto runtime = context_->device()->opencl_runtime(); - - if (kernel_.get() == nullptr) { - std::set built_options; - auto dt = DataTypeToEnum::value; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); - built_options.emplace("-Dbias_add=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name, - built_options, &kernel_)); - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_3D_GWS_ARGS(kernel_); - 
kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(bias->opencl_image())); - kernel_.setArg(idx++, *(output->opencl_image())); - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - - cl::Event event; - cl_int error; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } else { - std::vector roundup_gws(lws.size()); - for (size_t i = 0; i < lws.size(); ++i) { - if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]); - } - - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } - MACE_CL_RET_STATUS(error); - OUT_OF_RANGE_VALIDATION(kernel_error_); - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } - - return MACE_SUCCESS; + return kernel_->Compute(context_, input, bias, output, future); } template struct BiasAddFunctor; diff --git a/mace/kernels/opencl/buffer/buffer_inverse_transform.h b/mace/kernels/opencl/buffer/buffer_inverse_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..93bd22a9412b7d2fd48cd1297da32cc5c7a3e371 --- /dev/null +++ b/mace/kernels/opencl/buffer/buffer_inverse_transform.h @@ -0,0 +1,71 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_ +#define MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_ + +#include "mace/kernels/buffer_inverse_transform.h" +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace buffer { + +MaceStatus BufferTypeTransform( + OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *input, + const DataType dt, + Tensor *output, + StatsFuture *future); + +template +class BufferInverseTransform: public OpenCLBufferInverseTransformKernel { + public: + MaceStatus Compute(OpKernelContext *context, + const Tensor *input, + const BufferType type, + const int wino_blk_size, + Tensor *output, + StatsFuture *future) override; + private: + cl::Kernel kernel_; +}; + +template +MaceStatus BufferInverseTransform::Compute(OpKernelContext *context, + const Tensor *input, + const BufferType type, + const int wino_blk_size, + Tensor *output, + StatsFuture *future) { + MACE_UNUSED(type); + MACE_UNUSED(wino_blk_size); + const DataType dt = DataTypeToEnum::value; + if (input->dtype() != output->dtype()) { + return BufferTypeTransform(context, &kernel_, input, dt, output, future); + } else { + SetFutureDefaultWaitFn(future); + output->ReuseTensorBuffer(*input); + return MaceStatus::MACE_SUCCESS; + } +} + +} // namespace buffer +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_ diff --git a/mace/kernels/opencl/buffer/buffer_transform.cc 
b/mace/kernels/opencl/buffer/buffer_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..73ee521cef20f273a55e71bfcb529e04eb122f5d --- /dev/null +++ b/mace/kernels/opencl/buffer/buffer_transform.cc @@ -0,0 +1,244 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/kernels/opencl/buffer/buffer_transform.h" + +#include +#include +#include + +namespace mace { +namespace kernels { +namespace opencl { +namespace buffer { + +MaceStatus TransformConv2DFilter( + OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *input, + const DataType dt, + Tensor *output, + StatsFuture *future) { + const index_t out_chan = input->dim(0); + const index_t in_chan = input->dim(1); + const index_t filter_height = input->dim(2); + const index_t filter_width = input->dim(3); + + std::vector transformed_shape = { + filter_height, filter_width, + RoundUpDiv4(out_chan), + RoundUp(in_chan, 4), + 4, + }; + uint32_t gws[3]; + gws[0] = static_cast(transformed_shape[3]); + gws[1] = static_cast(transformed_shape[2]); + gws[2] = static_cast(filter_height * filter_width); + MACE_RETURN_IF_ERROR(output->Resize(transformed_shape)); + output->Reshape(input->shape()); + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION + if (kernel->get() == nullptr) { + std::set built_options; + MACE_NON_UNIFORM_WG_CONFIG; + MACE_OUT_OF_RANGE_CONFIG; + std::string 
kernel_name = MACE_OBFUSCATE_SYMBOL("transform_conv_filter"); + built_options.emplace("-Dtransform_conv_filter=" + kernel_name); + built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", + kernel_name, + built_options, + kernel)); + } + MACE_OUT_OF_RANGE_INIT(*kernel); + + uint32_t idx = 0; + MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->UnderlyingBuffer()->size()); + MACE_SET_3D_GWS_ARGS(*kernel, gws); + kernel->setArg(idx++, *(input->opencl_buffer())); + MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0, + "buffer offset not aligned"); + kernel->setArg(idx++, + static_cast(input->buffer_offset() / + GetEnumTypeSize(input->dtype()))); + kernel->setArg(idx++, *(output->opencl_buffer())); + kernel->setArg(idx++, static_cast(out_chan)); + kernel->setArg(idx++, static_cast(in_chan)); + kernel->setArg(idx++, static_cast(filter_height)); + kernel->setArg(idx++, static_cast(filter_width)); + kernel->setArg(idx++, static_cast( + in_chan * filter_height * filter_width)); + + std::string tuning_key = + Concat("transform_conv_filter", + transformed_shape[0], + transformed_shape[1], + transformed_shape[2], + transformed_shape[3]); + std::vector lws = {4, 4, 4, 0}; + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, + gws, lws, future)); + MACE_OUT_OF_RANGE_VALIDATION + // Mark the buffer unused. 
+ const_cast(input)->MarkUnused(); + return MACE_SUCCESS; +} + +MaceStatus TransformDWConv2DFilter( + OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *input, + const DataType dt, + Tensor *output, + StatsFuture *future) { + const index_t multiplier = input->dim(0); + const index_t in_chan = input->dim(1); + const index_t filter_height = input->dim(2); + const index_t filter_width = input->dim(3); + + std::vector transformed_shape = { + multiplier, RoundUpDiv4(in_chan), + filter_height, filter_width, 4, + }; + uint32_t gws[3]; + gws[0] = static_cast(filter_width); + gws[1] = static_cast(filter_height); + gws[2] = static_cast(transformed_shape[1]); + MACE_RETURN_IF_ERROR(output->Resize(transformed_shape)); + output->Reshape(input->shape()); + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION + if (kernel->get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_dw_conv_filter"); + built_options.emplace("-Dtransform_dw_conv_filter=" + kernel_name); + built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", + kernel_name, + built_options, + kernel)); + } + + MACE_OUT_OF_RANGE_INIT(*kernel); + + uint32_t idx = 0; + MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->UnderlyingBuffer()->size()); + MACE_SET_3D_GWS_ARGS(*kernel, gws); + kernel->setArg(idx++, *(input->opencl_buffer())); + MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0, + "buffer offset not aligned"); + kernel->setArg(idx++, + static_cast(input->buffer_offset() / + GetEnumTypeSize(input->dtype()))); + kernel->setArg(idx++, *(output->opencl_buffer())); + kernel->setArg(idx++, static_cast(in_chan)); + kernel->setArg(idx++, static_cast(filter_height * filter_width)); + + std::string tuning_key = + 
Concat("transform_conv_filter", + transformed_shape[0], + transformed_shape[1], + transformed_shape[2], + transformed_shape[3]); + std::vector lws = {4, 4, 4, 0}; + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, + gws, lws, future)); + MACE_OUT_OF_RANGE_VALIDATION + // Mark the buffer unused. + const_cast(input)->MarkUnused(); + return MACE_SUCCESS; +} + +MaceStatus TransformArgument( + OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *input, + const DataType dt, + Tensor *output, + StatsFuture *future) { + const index_t size = input->dim(0); + + std::vector transformed_shape = {RoundUp(size, 4)}; + uint32_t gws = static_cast(RoundUpDiv4(transformed_shape[0])); + MACE_RETURN_IF_ERROR(output->Resize(transformed_shape)); + output->Reshape(input->shape()); + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION + if (kernel->get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_arg"); + built_options.emplace("-Dtransform_arg=" + kernel_name); + built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", + kernel_name, + built_options, + kernel)); + } + MACE_OUT_OF_RANGE_INIT(*kernel); + + uint32_t idx = 0; + MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->UnderlyingBuffer()->size()); + kernel->setArg(idx++, gws); + kernel->setArg(idx++, *(input->opencl_buffer())); + MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0, + "buffer offset not aligned"); + kernel->setArg(idx++, + static_cast(input->buffer_offset() / + GetEnumTypeSize(input->dtype()))); + kernel->setArg(idx++, *(output->opencl_buffer())); + kernel->setArg(idx++, static_cast(size)); + + const uint32_t lws = + static_cast(RoundUpDiv4(runtime->GetDeviceMaxWorkGroupSize())); + cl::Event 
event; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + *kernel, cl::NullRange, cl::NDRange(gws), + cl::NDRange(lws), nullptr, &event); + } else { + uint32_t roundup_gws = RoundUp(gws, lws); + error = runtime->command_queue().enqueueNDRangeKernel( + *kernel, cl::NullRange, cl::NDRange(roundup_gws), + cl::NDRange(lws), nullptr, &event); + } + MACE_CL_RET_STATUS(error); + MACE_OUT_OF_RANGE_VALIDATION + if (future != nullptr) { + future->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + // Mark the buffer unused. + const_cast(input)->MarkUnused(); + return MACE_SUCCESS; +} + +} // namespace buffer +} // namespace opencl +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/opencl/buffer/buffer_transform.h b/mace/kernels/opencl/buffer/buffer_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..4c56f316e70b10dfb4aaa2c62a76a778600c5ed1 --- /dev/null +++ b/mace/kernels/opencl/buffer/buffer_transform.h @@ -0,0 +1,113 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_ +#define MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_ + +#include + +#include "mace/kernels/buffer_transform.h" +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace buffer { + +MaceStatus BufferTypeTransform( + OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *input, + const DataType dt, + Tensor *output, + StatsFuture *future); + +MaceStatus TransformConv2DFilter( + OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *input, + const DataType dt, + Tensor *output, + StatsFuture *future); + +MaceStatus TransformDWConv2DFilter( + OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *input, + const DataType dt, + Tensor *output, + StatsFuture *future); + +MaceStatus TransformArgument( + OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *input, + const DataType dt, + Tensor *output, + StatsFuture *future); + + +template +class BufferTransform: public OpenCLBufferTransformKernel { + public: + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const BufferType type, + const int wino_blk_size, + Tensor *output, + StatsFuture *future) override; + + private: + cl::Kernel kernel_; + std::vector input_shape_; +}; + +template +MaceStatus BufferTransform::Compute(OpKernelContext *context, + const Tensor *input, + const BufferType type, + const int wino_blk_size, + Tensor *output, + StatsFuture *future) { + MACE_UNUSED(type); + MACE_UNUSED(wino_blk_size); + const DataType dt = DataTypeToEnum::value; + switch (type) { + case CONV2D_FILTER: + return TransformConv2DFilter(context, &kernel_, input, + dt, output, future); + case DW_CONV2D_FILTER: + return TransformDWConv2DFilter(context, &kernel_, input, + dt, output, future); + case ARGUMENT: + return TransformArgument(context, &kernel_, input, dt, output, future); + default: + if (input->dtype() != dt) { + return BufferTypeTransform(context, 
&kernel_, input, + dt, output, future); + } else { + SetFutureDefaultWaitFn(future); + output->ReuseTensorBuffer(*input); + return MaceStatus::MACE_SUCCESS; + } + } +} + +} // namespace buffer +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_ diff --git a/mace/kernels/opencl/buffer/buffer_type_transform.cc b/mace/kernels/opencl/buffer/buffer_type_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..8de6d6df02b6c3111b64d6b65b817e1eae198010 --- /dev/null +++ b/mace/kernels/opencl/buffer/buffer_type_transform.cc @@ -0,0 +1,99 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/kernels/activation.h" +#include "mace/kernels/conv_2d.h" +#include "mace/kernels/opencl/helper.h" +#include "mace/utils/tuner.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace buffer { + + +MaceStatus BufferTypeTransform( + OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *input, + const DataType dt, + Tensor *output, + StatsFuture *future) { + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION + + const uint32_t gws = + static_cast(RoundUpDiv4(output->size())); + if (kernel->get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_data_type"); + built_options.emplace("-Dtransform_data_type=" + kernel_name); + built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", + kernel_name, + built_options, + kernel)); + } + + MACE_OUT_OF_RANGE_INIT(*kernel); + uint32_t idx = 0; + MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size()); + kernel->setArg(idx++, gws); + kernel->setArg(idx++, *(input->opencl_buffer())); + MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0, + "buffer offset not aligned"); + kernel->setArg(idx++, + static_cast(input->buffer_offset() / + GetEnumTypeSize(input->dtype()))); + kernel->setArg(idx++, *(output->opencl_buffer())); + + const uint32_t lws = + static_cast(RoundUpDiv4(runtime->GetDeviceMaxWorkGroupSize())); + cl::Event event; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + *kernel, cl::NullRange, cl::NDRange(gws), + cl::NDRange(lws), nullptr, &event); + } else { + uint32_t roundup_gws = RoundUp(gws, lws); + error 
= runtime->command_queue().enqueueNDRangeKernel( + *kernel, cl::NullRange, cl::NDRange(roundup_gws), + cl::NDRange(lws), nullptr, &event); + } + MACE_CL_RET_STATUS(error); + MACE_OUT_OF_RANGE_VALIDATION + if (future != nullptr) { + future->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + // Mark the buffer unused. + const_cast(input)->MarkUnused(); + return MACE_SUCCESS; +} + +} // namespace buffer +} // namespace opencl +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/opencl/buffer/conv_2d.h b/mace/kernels/opencl/buffer/conv_2d.h new file mode 100644 index 0000000000000000000000000000000000000000..ba1983208547976dad0e472b2a250e35ee806745 --- /dev/null +++ b/mace/kernels/opencl/buffer/conv_2d.h @@ -0,0 +1,219 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#ifndef MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_ +#define MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_ + +#include "mace/kernels/conv_2d.h" + +#include +#include +#include + +#include "mace/kernels/opencl/buffer/utils.h" +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace buffer { +namespace conv2d { + +extern MaceStatus Conv2d1x1(OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *padded_input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const DataType dt, + const ActivationType activation, + const float relux_max_limit, + const bool input_changed, + Tensor *output, + StatsFuture *future); + +extern MaceStatus Conv2dGeneral(OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const int *dilations, + const DataType dt, + const ActivationType activation, + const float relux_max_limit, + const bool input_changed, + Tensor *output, + StatsFuture *future); +} // namespace conv2d + /* Conv2dKernel: buffer-based OpenCL Conv2d. kernels_[0] is handed to PadInput and kernels_[1] to the convolution routine; input_shape_ and old_scratch_size_ remember the previous call so the callees can be told whether anything changed. kwg_size_ is zero-initialised so it never holds an indeterminate value. NOTE(review): template parameter lists and standard #include names are missing in this text and look stripped by extraction -- confirm against the original patch. */ +template +class Conv2dKernel : public OpenCLConv2dKernel { + public: + Conv2dKernel() : old_scratch_size_(0), kwg_size_(0) {} + + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + Tensor *output, + StatsFuture *future) override; + + private: + index_t old_scratch_size_; + cl::Kernel kernels_[2]; + uint32_t kwg_size_; + std::vector input_shape_; +}; + /* Compute: derives the paddings and output shape (from padding_type when padding_data is empty, otherwise from padding_data), resizes `output`, pads `input` into the device scratch buffer when the padded shape differs from the input shape, then runs Conv2d1x1 for 1x1 filters or Conv2dGeneral otherwise. `future` receives the merged wait functions of the pad and convolution steps. Returns the first failing status seen by MACE_RETURN_IF_ERROR, otherwise MACE_SUCCESS. */ +template +MaceStatus Conv2dKernel::Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + Tensor *output, +
StatsFuture *future) { + StatsFuture pad_future, conv_future; + index_t filter_h = filter->dim(2); + index_t filter_w = filter->dim(3); + // Reshape output + std::vector output_shape(4); + std::vector paddings(2); + if (padding_data.empty()) { + kernels::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter->shape().data(), dilations, strides, + padding_type, output_shape.data(), paddings.data()); + } else { + paddings = padding_data; + CalcOutputSize(input->shape().data(), filter->shape().data(), + padding_data.data(), dilations, strides, RoundType::FLOOR, + output_shape.data()); + } + + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + // calculate padded input shape + index_t width = output_shape[2]; + index_t channels = output_shape[3]; + + index_t input_height = input->dim(1); + index_t input_width = input->dim(2); + index_t input_channels = input->dim(3); + + /* Half of each total padding goes to the top / left edge. */ int pad_top = paddings[0] >> 1; + int pad_left = paddings[1] >> 1; + + MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels); + MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ", + input_channels); + + std::function conv_func; + + // Mark whether input changed or not + bool input_changed = !IsVecEqual(input_shape_, input->shape()); + input_shape_ = input->shape(); + + bool use_1x1 = filter_h == 1 && filter_w == 1; + + /* Output width is rounded up to tile_w (2 for 1x1 filters, 4 otherwise); input channels are rounded up to tile_c below. */ std::vector padded_output_shape = output_shape; + index_t tile_w, tile_c = 4; + if (use_1x1) { + tile_w = 2; + } else { + tile_w = 4; + } + padded_output_shape[2] = RoundUp(width, tile_w); + + /* Padded input: height grows by paddings[0]; width is whatever the rounded-up output width needs for the given stride and dilation. */ std::vector padded_input_shape = input->shape(); + padded_input_shape[1] = input_height + paddings[0]; + padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] + + (filter_w - 1) * dilations[1] + 1; + padded_input_shape[3] = RoundUp(input_channels, tile_c); + + const Tensor *padded_input_ptr = input; + // pad input + std::unique_ptr padded_input; + if (padded_input_shape[1] != input_height || + padded_input_shape[2] != input_width || +
padded_input_shape[3] != input_channels) { + // decide scratch size before allocate it + index_t total_scratch_size = 0; + index_t padded_input_size = 0; + + /* std::accumulate folds in the type of its initial value, so the seed is an index_t; a plain `1` would push the element count through int. */ padded_input_size = + std::accumulate(padded_input_shape.begin(), + padded_input_shape.end(), + static_cast<index_t>(1), + std::multiplies()) + * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE; + total_scratch_size += padded_input_size; + + // Init scratch buffer + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + scratch->GrowSize(total_scratch_size); + /* A different scratch size than last time is reported to the callees as a changed input. */ if (old_scratch_size_ != scratch->size()) { + input_changed = true; + old_scratch_size_ = scratch->size(); + } + + padded_input.reset(new Tensor(scratch->Scratch(padded_input_size), + input->dtype())); + + /* Propagate a failed resize, as is done for `output` above. */ MACE_RETURN_IF_ERROR(padded_input->Resize(padded_input_shape)); + /* NOTE(review): PadInput's result is not checked; if it returns a MaceStatus it should probably be propagated too -- confirm against buffer/utils.h. */ PadInput(context, &kernels_[0], input, pad_top, pad_left, + input_changed, padded_input.get(), &pad_future); + padded_input_ptr = padded_input.get(); + } + + if (use_1x1) { + conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus { + return conv2d::Conv2d1x1( + context, &kernels_[1], pad_input, filter, bias, strides, + DataTypeToEnum::v(), activation, relux_max_limit, + input_changed, output, &conv_future); + }; + } else { + conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus { + return conv2d::Conv2dGeneral( + context, &kernels_[1], pad_input, filter, bias, strides, dilations, + DataTypeToEnum::v(), activation, relux_max_limit, + input_changed, output, &conv_future); + }; + } + MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output)); + MergeMultipleFutureWaitFn({pad_future, conv_future}, future); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace buffer +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_ diff --git a/mace/kernels/opencl/buffer/conv_2d_1x1.cc b/mace/kernels/opencl/buffer/conv_2d_1x1.cc new file mode 100644 index
0000000000000000000000000000000000000000..97854cf47cadcc93069aea6bb72b2b9e747dd342 --- /dev/null +++ b/mace/kernels/opencl/buffer/conv_2d_1x1.cc @@ -0,0 +1,127 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/kernels/activation.h" +#include "mace/kernels/conv_2d.h" +#include "mace/kernels/opencl/helper.h" +#include "mace/utils/tuner.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace buffer { +namespace conv2d { + /* Conv2d1x1: builds the "conv2d" kernel of the "conv_2d_1x1_buffer" program on first use (data types, optional -DBIAS and the activation are compile-time options) and runs it through TuningOrRun2DKernel. All sizes come from `output` and `padded_input` dims. Kernel arguments are bound only when `input_changed` is true. Returns the failing status of BuildKernel / TuningOrRun2DKernel, otherwise MACE_SUCCESS. NOTE(review): the static_cast target types and the std::set / std::vector element types are missing in this text and look stripped by extraction -- confirm against the original patch. */ +MaceStatus Conv2d1x1(OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *padded_input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const DataType dt, + const ActivationType activation, + const float relux_max_limit, + const bool input_changed, + Tensor *output, + StatsFuture *future) { + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channel = output->dim(3); + + const index_t in_height = padded_input->dim(1); + const index_t in_width = padded_input->dim(2); + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + /* A null cl::Kernel handle means the kernel has not been built yet. */ if (kernel->get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d"); + built_options.emplace("-Dconv2d=" + kernel_name); +
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype())); + built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + /* Without a bias an empty string is added to the option set. */ built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + /* The activation is selected with a compile-time define; NOOP adds none and unknown values are fatal. */ switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation; + } + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1_buffer", + kernel_name, + built_options, kernel)); + } + + /* 2-D global size: (channel blocks of 4) x (width blocks of 2), by height * batch. */ const uint32_t gws[2] = {static_cast( + RoundUpDiv4(channel) * + RoundUpDiv(width, 2)), + static_cast(height * batch)}; + + MACE_OUT_OF_RANGE_INIT(*kernel); + /* Arguments are bound only when the caller reports a change; otherwise no setArg call is made here. */ if (input_changed) { + uint32_t idx = 0; + MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size()); + MACE_SET_2D_GWS_ARGS(*kernel, gws); + kernel->setArg(idx++, *(padded_input->opencl_buffer())); + kernel->setArg(idx++, *(filter->opencl_buffer())); + /* The bias buffer is passed only when a bias exists, mirroring the -DBIAS option above. */ if (bias != nullptr) { + kernel->setArg(idx++, *(bias->opencl_buffer())); + } + kernel->setArg(idx++, static_cast(in_height)); + kernel->setArg(idx++, static_cast(in_width)); + kernel->setArg(idx++, static_cast(padded_input->dim(3))); + kernel->setArg(idx++, + static_cast(filter->buffer_shape()[3])); + kernel->setArg(idx++, static_cast(height)); + kernel->setArg(idx++, static_cast(width)); + kernel->setArg(idx++, static_cast(channel)); + kernel->setArg(idx++, strides[0]); + kernel->setArg(idx++, strides[1]); + kernel->setArg(idx++, relux_max_limit); + kernel->setArg(idx++, *(output->opencl_buffer())); + } + + /* The tuning key is built from the output dims only. */ std::string tuning_key = + Concat("conv2d_1x1_buffer", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + std::vector lws = {16, 4, 0}; + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel,
tuning_key, gws, + lws, future)); + MACE_OUT_OF_RANGE_VALIDATION; + return MACE_SUCCESS; +} + +} // namespace conv2d +} // namespace buffer +} // namespace opencl +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/opencl/buffer/conv_2d_general.cc b/mace/kernels/opencl/buffer/conv_2d_general.cc new file mode 100644 index 0000000000000000000000000000000000000000..67feecdf79a383f4971f0448d7a71ac85c430650 --- /dev/null +++ b/mace/kernels/opencl/buffer/conv_2d_general.cc @@ -0,0 +1,141 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/kernels/activation.h" +#include "mace/kernels/conv_2d.h" +#include "mace/kernels/opencl/helper.h" +#include "mace/utils/tuner.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace buffer { +namespace conv2d { + /* Conv2dGeneral: builds the "conv2d" kernel of the "conv_2d_buffer" program on first use (data types, optional -DBIAS and the activation are compile-time options) and runs it through TuningOrRun2DKernel. Filter height/width are read from filter dims 2 and 3. Kernel arguments are bound only when `input_changed` is true. Returns the failing status of BuildKernel / TuningOrRun2DKernel, otherwise MACE_SUCCESS. NOTE(review): the static_cast target types and the std::set / std::vector element types are missing in this text and look stripped by extraction -- confirm against the original patch. */ +MaceStatus Conv2dGeneral(OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *padded_input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const int *dilations, + const DataType dt, + const ActivationType activation, + const float relux_max_limit, + const bool input_changed, + Tensor *output, + StatsFuture *future) { + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channel = output->dim(3); + + const index_t in_height = padded_input->dim(1); + const index_t in_width = padded_input->dim(2); + const index_t in_channel = padded_input->dim(3); + + const index_t filter_height = filter->dim(2); + const index_t filter_width = filter->dim(3); + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + /* A null cl::Kernel handle means the kernel has not been built yet. */ if (kernel->get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d"); + built_options.emplace("-Dconv2d=" + kernel_name); + built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype())); + built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + /* Without a bias an empty string is added to the option set. */ built_options.emplace(bias != nullptr ?
"-DBIAS" : ""); + /* The activation is selected with a compile-time define; NOOP adds none and unknown values are fatal. */ switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation; + } + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_buffer", + kernel_name, + built_options, kernel)); + } + + /* 2-D global size: (channel blocks of 4) x (width blocks of 4), by height * batch. */ const uint32_t gws[2] = {static_cast( + RoundUpDiv4(channel) * RoundUpDiv4(width)), + static_cast(height * batch)}; + + MACE_OUT_OF_RANGE_INIT(*kernel); + /* Arguments are bound only when the caller reports a change; otherwise no setArg call is made here. */ if (input_changed) { + auto filter_buffer_shape = filter->buffer_shape(); + uint32_t idx = 0; + MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size()); + MACE_SET_2D_GWS_ARGS(*kernel, gws) + kernel->setArg(idx++, *(padded_input->opencl_buffer())); + kernel->setArg(idx++, *(filter->opencl_buffer())); + /* The bias buffer is passed only when a bias exists, mirroring the -DBIAS option above. */ if (bias != nullptr) { + kernel->setArg(idx++, *(bias->opencl_buffer())); + } + kernel->setArg(idx++, static_cast(in_height)); + kernel->setArg(idx++, static_cast(in_width)); + kernel->setArg(idx++, static_cast(padded_input->dim(3))); + kernel->setArg(idx++, static_cast(filter_height)); + kernel->setArg(idx++, static_cast(filter_width)); + kernel->setArg(idx++, + static_cast(filter_buffer_shape[3])); + /* Product of filter buffer dims 2, 3 and 4 -- indexes element [4], so buffer_shape() is assumed to have at least five entries here (TODO confirm). */ kernel->setArg(idx++, static_cast( + filter_buffer_shape[2] * filter_buffer_shape[3] + * filter_buffer_shape[4])); + kernel->setArg(idx++, static_cast(height)); + kernel->setArg(idx++, static_cast(width)); + kernel->setArg(idx++, static_cast(channel)); + kernel->setArg(idx++, strides[0]); + kernel->setArg(idx++, strides[1]); + /* Dilations are passed pre-multiplied by in_width * in_channel and by in_channel respectively. */ kernel->setArg(idx++, static_cast( + dilations[0] * in_width * in_channel)); + kernel->setArg(idx++, static_cast( + dilations[1] * in_channel)); + kernel->setArg(idx++, relux_max_limit); + kernel->setArg(idx++, *(output->opencl_buffer())); + } + + /* The tuning key is built from the output dims plus the filter height and width. */ std::string tuning_key = +
Concat("conv2d_general_buffer", output->dim(0), output->dim(1), + output->dim(2), output->dim(3), filter_height, filter_width); + std::vector lws = {16, 4, 0}; + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, gws, + lws, future)); + MACE_OUT_OF_RANGE_VALIDATION + return MACE_SUCCESS; +} + +} // namespace conv2d +} // namespace buffer +} // namespace opencl +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/opencl/buffer/depthwise_conv2d.cc b/mace/kernels/opencl/buffer/depthwise_conv2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..bcd36bba706cd5affb40caf8c587b3577aac298e --- /dev/null +++ b/mace/kernels/opencl/buffer/depthwise_conv2d.cc @@ -0,0 +1,137 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "mace/kernels/opencl/buffer/depthwise_conv2d.h" + +#include +#include + +namespace mace { +namespace kernels { +namespace opencl { +namespace buffer { +namespace depthwise { + /* DepthwiseConv2d: builds the "depthwise_conv2d" kernel of the "depthwise_conv2d_buffer" program on first use (data types, optional -DBIAS and the activation are compile-time options) and runs it through TuningOrRun2DKernel. Kernel arguments are bound only when `input_changed` is true. Returns the failing status of BuildKernel / TuningOrRun2DKernel, otherwise MACE_SUCCESS. NOTE(review): the static_cast target types, the std::set / std::vector element types and the standard #include names are missing in this text and look stripped by extraction -- confirm against the original patch. */ +MaceStatus DepthwiseConv2d(OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *padded_input, // NHWC + const Tensor *filter, // HWIM per the original note, but dims 2 and 3 are read as filter height/width below -- TODO confirm layout + const Tensor *bias, + const int *strides, + const int *dilations, + const DataType dt, + const ActivationType activation, + const float relux_max_limit, + const bool input_changed, + Tensor *output, + StatsFuture *future) { + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channel = output->dim(3); + + const index_t in_height = padded_input->dim(1); + const index_t in_width = padded_input->dim(2); + const index_t in_channel = padded_input->dim(3); + + const index_t filter_height = filter->dim(2); + const index_t filter_width = filter->dim(3); + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION + + /* A null cl::Kernel handle means the kernel has not been built yet. */ if (kernel->get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); + built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); + built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype())); + built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + /* Without a bias an empty string is added to the option set. */ built_options.emplace(bias != nullptr ?
"-DBIAS" : ""); + /* The activation is selected with a compile-time define; NOOP adds none and unknown values are fatal. */ switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation; + } + + MACE_RETURN_IF_ERROR( + runtime->BuildKernel("depthwise_conv2d_buffer", kernel_name, + built_options, kernel)); + } + + /* 2-D global size: (channel blocks of 4) x (width blocks of 4), by height * batch. */ const uint32_t gws[2] = { + static_cast(RoundUpDiv4(channel) * RoundUpDiv4(width)), + static_cast(height * batch) + }; + + MACE_OUT_OF_RANGE_INIT(*kernel); + /* Arguments are bound only when the caller reports a change; otherwise no setArg call is made here. */ if (input_changed) { + uint32_t idx = 0; + MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size()); + MACE_SET_2D_GWS_ARGS(*kernel, gws); + kernel->setArg(idx++, *(padded_input->opencl_buffer())); + kernel->setArg(idx++, *(filter->opencl_buffer())); + /* The bias buffer is passed only when a bias exists, mirroring the -DBIAS option above. */ if (bias != nullptr) { + kernel->setArg(idx++, *(bias->opencl_buffer())); + } + kernel->setArg(idx++, static_cast(in_height)); + kernel->setArg(idx++, static_cast(in_width)); + kernel->setArg(idx++, static_cast(in_channel)); + kernel->setArg(idx++, static_cast(filter_height)); + kernel->setArg(idx++, static_cast(filter_width)); + kernel->setArg(idx++, static_cast(filter_height * filter_width)); + kernel->setArg(idx++, static_cast(height)); + kernel->setArg(idx++, static_cast(width)); + kernel->setArg(idx++, static_cast(channel)); + kernel->setArg(idx++, static_cast(strides[0])); + kernel->setArg(idx++, static_cast(strides[1])); + /* Dilations are passed pre-multiplied by in_width * in_channel and by in_channel respectively. */ kernel->setArg(idx++, static_cast( + dilations[0] * in_width * in_channel)); + kernel->setArg(idx++, static_cast( + dilations[1] * in_channel)); + kernel->setArg(idx++, relux_max_limit); + kernel->setArg(idx++, *(output->opencl_buffer())); + } + + std::vector lws = {16, 4, 0}; + /* The tuning key is built from the padded input dims, the filter size and the output channel count. */ std::string tuning_key = + Concat("depthwise_conv2d_buffer_kernel", in_height, in_width, in_channel, + filter_height, filter_width, channel); +
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, + gws, lws, future)); + + MACE_OUT_OF_RANGE_VALIDATION + return MACE_SUCCESS; +} + +} // namespace depthwise +} // namespace buffer +} // namespace opencl +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/opencl/buffer/depthwise_conv2d.h b/mace/kernels/opencl/buffer/depthwise_conv2d.h new file mode 100644 index 0000000000000000000000000000000000000000..23fddf0e3e7ac5eb4fa88ef22c548c677d1bfeba --- /dev/null +++ b/mace/kernels/opencl/buffer/depthwise_conv2d.h @@ -0,0 +1,190 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+#ifndef MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_ +#define MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_ + +#include "mace/kernels/depthwise_conv2d.h" + +#include +#include +#include + +#include "mace/kernels/opencl/buffer/utils.h" +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace buffer { +namespace depthwise { + +MaceStatus DepthwiseConv2d(OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *padded_input, // NHWC + const Tensor *filter, // MIHW: Compute below checks dim 0 as the multiplier and dim 1 as the input channels + const Tensor *bias, + const int *strides, + const int *dilations, + const DataType dt, + const ActivationType activation, + const float relux_max_limit, + const bool input_changed, + Tensor *output, + StatsFuture *future); +} // namespace depthwise + + /* DepthwiseConv2dKernel: buffer-based OpenCL depthwise Conv2d. kernels_[0] is handed to PadInput and kernels_[1] to depthwise::DepthwiseConv2d; input_shape_ and old_scratch_size_ remember the previous call so the callees can be told whether anything changed. kwg_size_ is zero-initialised so it never holds an indeterminate value. NOTE(review): template parameter lists and standard #include names are missing in this text and look stripped by extraction -- confirm against the original patch. */ +template +class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { + public: + DepthwiseConv2dKernel() : old_scratch_size_(0), kwg_size_(0) {} + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + Tensor *output, + StatsFuture *future) override; + + private: + index_t old_scratch_size_; + cl::Kernel kernels_[2]; + uint32_t kwg_size_; + std::vector input_shape_; +}; + /* Compute: derives the paddings and output shape through a conv2d-shaped stand-in filter, resizes `output`, pads `input` into the device scratch buffer when the padded shape differs from the input shape, then runs depthwise::DepthwiseConv2d. Only a multiplier of 1 is accepted. `future` receives the merged wait functions of the pad and convolution steps. Returns the first failing status seen by MACE_RETURN_IF_ERROR, otherwise MACE_SUCCESS. */ +template +MaceStatus DepthwiseConv2dKernel::Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + Tensor *output, + StatsFuture *future) { + StatsFuture pad_future, dw_conv_future; + index_t filter_w = filter->dim(3); + + // Create a fake conv_2d filter to calculate the paddings and output size + std::vector fake_filter_shape(4); + fake_filter_shape[0] =
filter->dim(0) * filter->dim(1); + fake_filter_shape[1] = filter->dim(1); + fake_filter_shape[2] = filter->dim(2); + fake_filter_shape[3] = filter->dim(3); + + std::vector output_shape(4); + std::vector paddings(2); + if (padding_data.empty()) { + kernels::CalcNHWCPaddingAndOutputSize( + input->shape().data(), fake_filter_shape.data(), dilations, strides, + padding_type, output_shape.data(), paddings.data()); + } else { + paddings = padding_data; + CalcOutputSize(input->shape().data(), fake_filter_shape.data(), + padding_data.data(), dilations, strides, RoundType::FLOOR, + output_shape.data()); + } + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + // calculate padded input shape + index_t width = output_shape[2]; + index_t channels = output_shape[3]; + + index_t input_height = input->dim(1); + index_t input_width = input->dim(2); + index_t input_channels = input->dim(3); + + /* Half of each total padding goes to the top / left edge. */ int pad_top = paddings[0] >> 1; + int pad_left = paddings[1] >> 1; + + MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported"); + MACE_CHECK(filter->dim(0) * input_channels == channels); + MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ", + input_channels); + + // Mark whether input changed or not + bool input_changed = !IsVecEqual(input_shape_, input->shape()); + input_shape_ = input->shape(); + + /* Output width is rounded up to tile_w; input channels are rounded up to tile_c below. */ std::vector padded_output_shape = output_shape; + index_t tile_w = 4, tile_c = 4; + padded_output_shape[2] = RoundUp(width, tile_w); + + /* Padded input: height grows by paddings[0]; width is whatever the rounded-up output width needs for the given stride and dilation. */ std::vector padded_input_shape = input->shape(); + padded_input_shape[1] = input_height + paddings[0]; + padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] + + (filter_w - 1) * dilations[1] + 1; + padded_input_shape[3] = RoundUp(input_channels, tile_c); + + const Tensor *padded_input_ptr = input; + // pad input + std::unique_ptr padded_input; + if (padded_input_shape[1] != input_height || + padded_input_shape[2] != input_width || + padded_input_shape[3] != input_channels) { + index_t total_scratch_size = 0; + index_t
padded_input_size = 0; + + /* std::accumulate folds in the type of its initial value, so the seed is an index_t; a plain `1` would push the element count through int. */ padded_input_size = + std::accumulate(padded_input_shape.begin(), + padded_input_shape.end(), + static_cast<index_t>(1), + std::multiplies()) + * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE; + total_scratch_size += padded_input_size; + + // Init scratch buffer + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + scratch->GrowSize(total_scratch_size); + /* A different scratch size than last time is reported to the callees as a changed input. */ if (old_scratch_size_ != scratch->size()) { + input_changed = true; + old_scratch_size_ = scratch->size(); + } + + padded_input.reset(new Tensor(scratch->Scratch(padded_input_size), + input->dtype())); + + /* Propagate a failed resize, as is done for `output` above. */ MACE_RETURN_IF_ERROR(padded_input->Resize(padded_input_shape)); + /* NOTE(review): PadInput's result is not checked; if it returns a MaceStatus it should probably be propagated too -- confirm against buffer/utils.h. */ PadInput(context, &kernels_[0], input, pad_top, pad_left, + input_changed, padded_input.get(), &pad_future); + padded_input_ptr = padded_input.get(); + } + + MACE_RETURN_IF_ERROR( + depthwise::DepthwiseConv2d( + context, &kernels_[1], padded_input_ptr, filter, bias, strides, + dilations, DataTypeToEnum::v(), activation, relux_max_limit, + input_changed, output, &dw_conv_future)); + MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, future); + return MaceStatus::MACE_SUCCESS; +} + +} // namespace buffer +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_ diff --git a/mace/kernels/opencl/buffer/pooling.h b/mace/kernels/opencl/buffer/pooling.h new file mode 100644 index 0000000000000000000000000000000000000000..ef4ee4472f4a254768d49abd7839a453365587ee --- /dev/null +++ b/mace/kernels/opencl/buffer/pooling.h @@ -0,0 +1,213 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef MACE_KERNELS_OPENCL_BUFFER_POOLING_H_ +#define MACE_KERNELS_OPENCL_BUFFER_POOLING_H_ + +#include "mace/kernels/pooling.h" + +#include +#include +#include +#include +#include + +#include "mace/kernels/opencl/buffer/utils.h" +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace buffer { + /* PoolingKernel: buffer-based OpenCL pooling. kernels_[0] is handed to PadInput and kernels_[1] is the "pooling" kernel; input_shape_ and old_scratch_size_ remember the previous call so kernel arguments are bound only when something changed. kwg_size_ is zero-initialised so it never holds an indeterminate value before the kernel is built. NOTE(review): template parameter lists, static_cast target types and standard #include names are missing in this text and look stripped by extraction -- confirm against the original patch. */ +template +class PoolingKernel : public OpenCLPoolingKernel { + public: + PoolingKernel() : old_scratch_size_(0), kwg_size_(0) {} + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const PoolingType pooling_type, + const int *kernels, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + Tensor *output, + StatsFuture *future) override; + + private: + index_t old_scratch_size_; + cl::Kernel kernels_[2]; + uint32_t kwg_size_; + std::vector input_shape_; +}; + /* Compute: rejects dilations other than 1, derives the paddings and output shape, resizes `output`, pads the input channels up to a multiple of 4 in the device scratch buffer when needed, builds the "pooling" kernel of the "pooling_buffer" program on first use and runs it through TuningOrRun3DKernel. `future` receives the merged wait functions of the pad and pooling steps. Returns the first failing status seen by MACE_RETURN_IF_ERROR, otherwise MACE_SUCCESS. */ +template +MaceStatus PoolingKernel::Compute( + OpKernelContext *context, + const Tensor *input, + const PoolingType pooling_type, + const int *kernels, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + Tensor *output, + StatsFuture *future) { + MACE_CHECK(dilations[0] == 1 && dilations[1] == 1) + << "Pooling opencl kernel not support dilation yet"; + + StatsFuture pad_future, pooling_future; + + index_t input_channels = input->dim(3); + + std::vector output_shape(4); + /* Stand-in filter shape (channels, channels, kernel h, kernel w) so the convolution shape helpers can be reused. */ std::vector filter_shape = {input->dim(3), input->dim(3), + kernels[0], kernels[1]}; + + std::vector paddings(2); + if (padding_data.empty()) { +
kernels::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter_shape.data(), dilations, strides, + padding_type, output_shape.data(), paddings.data()); + } else { + paddings = padding_data; + CalcOutputSize(input->shape().data(), filter_shape.data(), + padding_data.data(), dilations, strides, RoundType::CEIL, + output_shape.data()); + } + + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + // Mark whether input changed or not + bool input_changed = !IsVecEqual(input_shape_, input->shape()); + input_shape_ = input->shape(); + + auto runtime = context->device()->opencl_runtime(); + + // pad input + std::vector padded_input_shape = input->shape(); + padded_input_shape[3] = RoundUp(input_channels, 4); + + const Tensor *padded_input_ptr = input; + // pad input + std::unique_ptr padded_input; + if (padded_input_shape[3] != input_channels) { + index_t total_scratch_size = 0; + index_t padded_input_size = 0; + + /* std::accumulate folds in the type of its initial value, so the seed is an index_t; a plain `1` would push the element count through int. */ padded_input_size = + std::accumulate(padded_input_shape.begin(), + padded_input_shape.end(), + static_cast<index_t>(1), + std::multiplies()) + * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE; + total_scratch_size += padded_input_size; + + // Init scratch buffer + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + scratch->GrowSize(total_scratch_size); + /* A different scratch size than last time forces the kernel arguments to be bound again below. */ if (old_scratch_size_ != scratch->size()) { + input_changed = true; + old_scratch_size_ = scratch->size(); + } + + padded_input.reset(new Tensor(scratch->Scratch(padded_input_size), + input->dtype())); + + /* Propagate a failed resize, as is done for `output` above. */ MACE_RETURN_IF_ERROR(padded_input->Resize(padded_input_shape)); + /* NOTE(review): PadInput's result is not checked; if it returns a MaceStatus it should probably be propagated too -- confirm against buffer/utils.h. */ PadInput(context, &kernels_[0], input, 0, 0, + input_changed, padded_input.get(), &pad_future); + padded_input_ptr = padded_input.get(); + } + + cl::Kernel *kernel = &kernels_[1]; + MACE_OUT_OF_RANGE_DEFINITION + + /* A null cl::Kernel handle means the kernel has not been built yet. */ if (kernel->get() == nullptr) { + const DataType dt = DataTypeToEnum::value; + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string
kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); + built_options.emplace("-Dpooling=" + kernel_name); + + /* Both branches set the same IN/OUT data types; only DATA_TYPE differs (DtToCLDt for MAX pooling with equal input/output dtypes, DtToUpCompatibleCLDt otherwise). */ if (pooling_type == MAX && input->dtype() == output->dtype()) { + built_options.emplace("-DIN_DATA_TYPE=" + + DtToCLDt(input->dtype())); + built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + } else { + built_options.emplace("-DIN_DATA_TYPE=" + + DtToCLDt(input->dtype())); + built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + } + if (pooling_type == AVG) { + built_options.emplace("-DPOOL_AVG"); + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer", + kernel_name, + built_options, + kernel)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); + } + + /* 3-D global size: channel blocks of 4, output width, output batch * height. */ const uint32_t gws[3] = { + static_cast(RoundUpDiv4(output->dim(3))), + static_cast(output->dim(2)), + static_cast(output->dim(0) * output->dim(1)), + }; + + MACE_OUT_OF_RANGE_INIT(*kernel); + /* Arguments are bound only when the input shape or the scratch size changed. */ if (input_changed) { + uint32_t idx = 0; + MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size()); + MACE_SET_3D_GWS_ARGS(*kernel, gws); + kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer())); + kernel->setArg(idx++, static_cast(padded_input_ptr->dim(1))); + kernel->setArg(idx++, static_cast(padded_input_ptr->dim(2))); + kernel->setArg(idx++, static_cast(padded_input_ptr->dim(3))); + kernel->setArg(idx++, static_cast(output->dim(1))); + kernel->setArg(idx++, static_cast(output->dim(3))); + kernel->setArg(idx++, paddings[0] / 2); + kernel->setArg(idx++, paddings[1] / 2); + kernel->setArg(idx++, strides[0]); + kernel->setArg(idx++, strides[1]); + kernel->setArg(idx++, kernels[0]); + kernel->setArg(idx++, kernels[1]); + kernel->setArg(idx++, *(output->opencl_buffer())); + } + + const std::vector lws = {4, 4, 4, 0}; + /* The tuning key is built from the output dims only. */ std::string tuning_key = + Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); +
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, + gws, lws, &pooling_future)); + MACE_OUT_OF_RANGE_VALIDATION + MergeMultipleFutureWaitFn({pad_future, pooling_future}, future); + + return MACE_SUCCESS; +} + +} // namespace buffer +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_BUFFER_POOLING_H_ diff --git a/mace/kernels/opencl/buffer/softmax.h b/mace/kernels/opencl/buffer/softmax.h new file mode 100644 index 0000000000000000000000000000000000000000..59bb8d26dd25bf7313a0882c348e62cdc68c2a57 --- /dev/null +++ b/mace/kernels/opencl/buffer/softmax.h @@ -0,0 +1,125 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+#ifndef MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_ +#define MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_ + +#include "mace/kernels/softmax.h" + +#include +#include +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace buffer { +template +class SoftmaxKernel : public OpenCLSoftmaxKernel { + public: + MaceStatus Compute( + OpKernelContext *context, + const Tensor *logits, + Tensor *output, + StatsFuture *future) override; + + private: + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; + +template +MaceStatus SoftmaxKernel::Compute( + OpKernelContext *context, + const Tensor *logits, + Tensor *output, + StatsFuture *future) { + index_t batch = 0; + index_t height = 0; + index_t width = 0; + index_t channels = 0; + + if (logits->dim_size() == 2) { + batch = logits->dim(0); + height = 1; + width = 1; + channels = logits->dim(1); + + } else if (logits->dim_size() == 4) { + batch = logits->dim(0); + height = logits->dim(1); + width = logits->dim(2); + channels = logits->dim(3); + } else { + MACE_NOT_IMPLEMENTED; + } + + const index_t channel_blocks = RoundUpDiv4(channels); + const int remain_channels = channel_blocks * 4 - channels; + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); + built_options.emplace("-Dsoftmax=" + kernel_name); + auto dt = DataTypeToEnum::value; + built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype())); + built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name, + built_options, 
&kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, logits->shape())) { + uint32_t idx = 0; + MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size()); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(logits->opencl_buffer())); + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(channels)); + kernel_.setArg(idx++, remain_channels); + kernel_.setArg(idx++, *(output->opencl_buffer())); + + input_shape_ = logits->shape(); + } + + std::vector lws = {4, 4, 4, 0}; + std::string tuning_key = + Concat("softmax_opencl_kernel", batch, height, width, channels); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); + MACE_OUT_OF_RANGE_VALIDATION + return MACE_SUCCESS; +} + +} // namespace buffer +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_ diff --git a/mace/kernels/opencl/buffer/utils.cc b/mace/kernels/opencl/buffer/utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..abc06ca8538fb66151b636bf913f036c22302905 --- /dev/null +++ b/mace/kernels/opencl/buffer/utils.cc @@ -0,0 +1,97 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/kernels/opencl/buffer/utils.h" + +#include +#include +#include + +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace buffer { + +MaceStatus PadInput(OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *input, + const int pad_top, + const int pad_left, + const bool input_changed, + Tensor *padded_input, + StatsFuture *future) { + const index_t batch = input->dim(0); + const index_t in_height = input->dim(1); + const index_t in_width = input->dim(2); + const index_t in_channel = input->dim(3); + const index_t padded_height = padded_input->dim(1); + const index_t padded_width = padded_input->dim(2); + const index_t padded_channel = padded_input->dim(3); + + const uint32_t gws[2] = { + static_cast(padded_width * RoundUpDiv4(padded_channel)), + static_cast(padded_height * batch) + }; + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel->get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("Dpad_input"); + built_options.emplace("-Dpad_input=" + kernel_name); + built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input->dtype())); + MACE_RETURN_IF_ERROR(runtime->BuildKernel( + "buffer_transform", + kernel_name, + built_options, + kernel)); + } + + MACE_OUT_OF_RANGE_INIT(*kernel); + if (input_changed) { + uint32_t idx = 0; + MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, padded_input->size()); + MACE_SET_2D_GWS_ARGS(*kernel, gws) + kernel->setArg(idx++, *(input->opencl_buffer())); + kernel->setArg(idx++, static_cast(in_height)); + kernel->setArg(idx++, static_cast(in_width)); + kernel->setArg(idx++, static_cast(in_channel)); + kernel->setArg(idx++, static_cast(padded_height)); + kernel->setArg(idx++, 
static_cast(padded_width)); + kernel->setArg(idx++, static_cast(padded_channel)); + kernel->setArg(idx++, pad_top); + kernel->setArg(idx++, pad_left); + kernel->setArg(idx++, *(padded_input->opencl_buffer())); + } + std::string tuning_key = + Concat("pad_input", batch, in_height, in_width, in_channel, + padded_height, padded_width, padded_channel); + std::vector lws = {8, 4, 0}; + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, + gws, lws, future)); + MACE_OUT_OF_RANGE_VALIDATION + return MACE_SUCCESS; +} + +} // namespace buffer +} // namespace opencl +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/opencl/buffer/utils.h b/mace/kernels/opencl/buffer/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..f19a8210da90a7e0303bdf53d7e0d1687f920210 --- /dev/null +++ b/mace/kernels/opencl/buffer/utils.h @@ -0,0 +1,41 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#ifndef MACE_KERNELS_OPENCL_BUFFER_UTILS_H_
+#define MACE_KERNELS_OPENCL_BUFFER_UTILS_H_
+
+#include "mace/core/future.h"
+#include "mace/core/op_kernel_context.h"
+#include "mace/core/tensor.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace kernels {
+namespace opencl {
+namespace buffer {
+
+// Copies `input` into the already-shaped `padded_input` on the OpenCL
+// device, offset by `pad_top` rows and `pad_left` columns.
+//   kernel         caller-owned kernel slot; built on first use if empty.
+//   input_changed  when true, kernel arguments are bound again before launch.
+//   future         passed on to the kernel launch helper.
+MaceStatus PadInput(OpKernelContext *context,
+                    cl::Kernel *kernel,
+                    const Tensor *input,
+                    const int pad_top,
+                    const int pad_left,
+                    const bool input_changed,
+                    Tensor *padded_input,
+                    StatsFuture *future);
+
+}  // namespace buffer
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace mace
+#endif  // MACE_KERNELS_OPENCL_BUFFER_UTILS_H_
diff --git a/mace/kernels/opencl/buffer_inverse_transform.cc b/mace/kernels/opencl/buffer_inverse_transform.cc
new file mode 100644
index 0000000000000000000000000000000000000000..352fbed7cdf1193426ce2ad6c8a414fd126456b1
--- /dev/null
+++ b/mace/kernels/opencl/buffer_inverse_transform.cc
@@ -0,0 +1,49 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "mace/kernels/buffer_inverse_transform.h" +#include "mace/kernels/opencl/image/image_to_buffer.h" +#include "mace/kernels/opencl/buffer/buffer_inverse_transform.h" + +namespace mace { +namespace kernels { + +template +BufferInverseTransformFunctor< + DeviceType::GPU, T>::BufferInverseTransformFunctor( + OpKernelContext *context, + const int wino_blk_size) + : BufferInverseTransformFunctorBase(context, wino_blk_size) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::ImageToBuffer); + } else { + kernel_.reset(new opencl::buffer::BufferInverseTransform); + } +} + +template +MaceStatus BufferInverseTransformFunctor::operator()( + const Tensor *input, + const BufferType type, + Tensor *output, + StatsFuture *future) { + return kernel_->Compute(context_, input, type, + wino_blk_size_, output, future); +} + +template struct BufferInverseTransformFunctor; +template struct BufferInverseTransformFunctor; + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/opencl/buffer_transform.cc b/mace/kernels/opencl/buffer_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..55854753af8bbb96aae9fb9e9582a2ea57afa56d --- /dev/null +++ b/mace/kernels/opencl/buffer_transform.cc @@ -0,0 +1,48 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/kernels/buffer_transform.h" +#include "mace/kernels/opencl/image/buffer_to_image.h" +#include "mace/kernels/opencl/buffer/buffer_transform.h" + +namespace mace { +namespace kernels { + +template +BufferTransformFunctor::BufferTransformFunctor( + OpKernelContext *context, + const int wino_blk_size) + : BufferTransformFunctorBase(context, wino_blk_size) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::BufferToImage); + } else { + kernel_.reset(new opencl::buffer::BufferTransform); + } +} + +template +MaceStatus BufferTransformFunctor::operator()( + const Tensor *input, + const BufferType type, + Tensor *output, + StatsFuture *future) { + return kernel_->Compute(context_, input, type, + wino_blk_size_, output, future); +} + +template struct BufferTransformFunctor; +template struct BufferTransformFunctor; + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc index 64de09c2d597b5fe5bd2bf4923c442426c43ce8c..7d8365038250265a43bc09569d8710cc0cabf75a 100644 --- a/mace/kernels/opencl/channel_shuffle.cc +++ b/mace/kernels/opencl/channel_shuffle.cc @@ -13,73 +13,26 @@ // limitations under the License. 
#include "mace/kernels/channel_shuffle.h" -#include "mace/core/runtime/opencl/cl2_header.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" -#include "mace/utils/utils.h" +#include "mace/kernels/opencl/image/channel_shuffle.h" namespace mace { namespace kernels { template -MaceStatus ChannelShuffleFunctor::operator()( - const Tensor *input, Tensor *output, StatsFuture *future) { - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - - const index_t batch = input->dim(0); - const index_t height = input->dim(1); - const index_t width = input->dim(2); - const index_t channels = input->dim(3); - const index_t channels_per_group = channels / groups_; - MACE_CHECK(channels_per_group % 4 == 0, - "channels per group must be multiple of 4"); - MACE_CHECK(groups_ % 4 == 0, "groups must be multiple of 4"); - const index_t group_channel_blocks = RoundUpDiv4(channels_per_group); - - const uint32_t gws[3] = {static_cast(group_channel_blocks), - static_cast(width), - static_cast(height * batch)}; - - auto runtime = context_->device()->opencl_runtime(); - - if (kernel_.get() == nullptr) { - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); - built_options.emplace("-Dchannel_shuffle=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - MACE_RETURN_IF_ERROR( - runtime->BuildKernel("channel_shuffle", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_3D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, groups_); - kernel_.setArg(idx++, 
static_cast(channels_per_group)); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input->shape(); +ChannelShuffleFunctor::ChannelShuffleFunctor( + OpKernelContext *context, + const int groups) : OpKernel(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::ChannelShuffleKernel(groups)); + } else { + MACE_NOT_IMPLEMENTED; } +} - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); - OUT_OF_RANGE_VALIDATION(kernel_error_); - return MACE_SUCCESS; +template +MaceStatus ChannelShuffleFunctor::operator()( + const Tensor *input, Tensor *output, StatsFuture *future) { + return kernel_->Compute(context_, input, output, future); } template struct ChannelShuffleFunctor; diff --git a/mace/kernels/opencl/cl/activation.cl b/mace/kernels/opencl/cl/activation.cl index 6436b82ae8fff1be4994f1cad6f11ba084bba367..62978d88abdb8c0949111dd6678f38e4ffc9950c 100644 --- a/mace/kernels/opencl/cl/activation.cl +++ b/mace/kernels/opencl/cl/activation.cl @@ -1,6 +1,6 @@ #include -__kernel void activation(KERNEL_ERROR_PARAMS +__kernel void activation(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, #ifdef USE_PRELU diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/kernels/opencl/cl/addn.cl index 1e8616f3825e0eebff7e09dae414af05e1d5fb32..758a0c7de7c270bec968ed37411f95b5c1703723 100644 --- a/mace/kernels/opencl/cl/addn.cl +++ b/mace/kernels/opencl/cl/addn.cl @@ -1,6 +1,6 @@ #include -__kernel void addn(KERNEL_ERROR_PARAMS +__kernel void addn(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input1, diff --git a/mace/kernels/opencl/cl/batch_norm.cl 
b/mace/kernels/opencl/cl/batch_norm.cl index 2da41eedf8a4cad86c048d1b4cd450fd6a7ca593..cf1f18c7144c5a20a8643857d7f1f788280311f0 100644 --- a/mace/kernels/opencl/cl/batch_norm.cl +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -1,6 +1,6 @@ #include // Supported data types: half/float -__kernel void batch_norm(KERNEL_ERROR_PARAMS +__kernel void batch_norm(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __read_only image2d_t scale, diff --git a/mace/kernels/opencl/cl/batch_to_space.cl b/mace/kernels/opencl/cl/batch_to_space.cl index 7ed0abb8f15b3ad4d636a075f6e8aeb254e56ead..de59bb86e2f4586fb223bbb44c1262ec9809ca90 100644 --- a/mace/kernels/opencl/cl/batch_to_space.cl +++ b/mace/kernels/opencl/cl/batch_to_space.cl @@ -1,6 +1,6 @@ #include -__kernel void batch_to_space(KERNEL_ERROR_PARAMS +__kernel void batch_to_space(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t batch_data, __write_only image2d_t space_data, diff --git a/mace/kernels/opencl/cl/bias_add.cl b/mace/kernels/opencl/cl/bias_add.cl index 31d11be70eb18242b0d21d12bbb96b1eb117cee0..7ae00846a09305ada5ada9d057bae2f8662c6b1d 100644 --- a/mace/kernels/opencl/cl/bias_add.cl +++ b/mace/kernels/opencl/cl/bias_add.cl @@ -1,6 +1,6 @@ #include // Supported data types: half/float -__kernel void bias_add(KERNEL_ERROR_PARAMS +__kernel void bias_add(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __read_only image2d_t bias, diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl index 0ab39219c6068544037289b5b9a24d60069f5bd2..ac0fb7788f0f3e95cbf00cc04623b3ef015f2038 100644 --- a/mace/kernels/opencl/cl/buffer_to_image.cl +++ b/mace/kernels/opencl/cl/buffer_to_image.cl @@ -1,6 +1,6 @@ #include -__kernel void filter_buffer_to_image(KERNEL_ERROR_PARAMS +__kernel void filter_buffer_to_image(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* OIHW */ __private const int 
input_offset, @@ -52,7 +52,7 @@ __kernel void filter_buffer_to_image(KERNEL_ERROR_PARAMS WRITE_IMAGET(output, coord, values); } -__kernel void filter_image_to_buffer(KERNEL_ERROR_PARAMS +__kernel void filter_image_to_buffer(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, /* OIHW */ __private const int out_channel, @@ -102,7 +102,7 @@ __kernel void filter_image_to_buffer(KERNEL_ERROR_PARAMS } // TODO(liuqi): Support multiplier > 1 -__kernel void dw_filter_buffer_to_image(KERNEL_ERROR_PARAMS +__kernel void dw_filter_buffer_to_image(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* MIHW */ __private const int input_offset, @@ -154,7 +154,7 @@ __kernel void dw_filter_buffer_to_image(KERNEL_ERROR_PARAMS WRITE_IMAGET(output, coord, values); } -__kernel void in_out_buffer_to_image(KERNEL_ERROR_PARAMS +__kernel void in_out_buffer_to_image(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* nhwc */ __private const int input_offset, @@ -196,7 +196,7 @@ __kernel void in_out_buffer_to_image(KERNEL_ERROR_PARAMS WRITE_IMAGET(output, coord, values); } -__kernel void in_out_image_to_buffer(KERNEL_ERROR_PARAMS +__kernel void in_out_image_to_buffer(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, /* nhwc */ __private const int height, @@ -236,7 +236,7 @@ __kernel void in_out_image_to_buffer(KERNEL_ERROR_PARAMS } } -__kernel void arg_buffer_to_image(KERNEL_ERROR_PARAMS +__kernel void arg_buffer_to_image(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, __private const int input_offset, @@ -272,7 +272,7 @@ __kernel void arg_buffer_to_image(KERNEL_ERROR_PARAMS WRITE_IMAGET(output, coord, values); } -__kernel void arg_image_to_buffer(KERNEL_ERROR_PARAMS +__kernel void arg_image_to_buffer(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, __private const int count, @@ -306,7 +306,7 @@ __kernel void 
arg_image_to_buffer(KERNEL_ERROR_PARAMS } -__kernel void in_out_height_buffer_to_image(KERNEL_ERROR_PARAMS +__kernel void in_out_height_buffer_to_image(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, //nhwc __private const int input_offset, @@ -349,7 +349,7 @@ __kernel void in_out_height_buffer_to_image(KERNEL_ERROR_PARAMS WRITE_IMAGET(output, coord, values); } -__kernel void in_out_height_image_to_buffer(KERNEL_ERROR_PARAMS +__kernel void in_out_height_image_to_buffer(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, //nhwc __private const int height, @@ -387,7 +387,7 @@ __kernel void in_out_height_image_to_buffer(KERNEL_ERROR_PARAMS output[offset] = values.w; } -__kernel void in_out_width_buffer_to_image(KERNEL_ERROR_PARAMS +__kernel void in_out_width_buffer_to_image(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* nhwc */ __private const int input_offset, @@ -430,7 +430,7 @@ __kernel void in_out_width_buffer_to_image(KERNEL_ERROR_PARAMS WRITE_IMAGET(output, coord, values); } -__kernel void weight_height_buffer_to_image(KERNEL_ERROR_PARAMS +__kernel void weight_height_buffer_to_image(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, // OIHW __private const int input_offset, @@ -475,7 +475,7 @@ __kernel void weight_height_buffer_to_image(KERNEL_ERROR_PARAMS WRITE_IMAGET(output, coord, values); } -__kernel void weight_height_image_to_buffer(KERNEL_ERROR_PARAMS +__kernel void weight_height_image_to_buffer(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, //OIHW __private const int out_channels, @@ -517,7 +517,7 @@ __kernel void weight_height_image_to_buffer(KERNEL_ERROR_PARAMS } -__kernel void weight_width_buffer_to_image(KERNEL_ERROR_PARAMS +__kernel void weight_width_buffer_to_image(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, // OIHW __private const int input_offset, @@ -565,7 
+565,7 @@ __kernel void weight_width_buffer_to_image(KERNEL_ERROR_PARAMS WRITE_IMAGET(output, coord, values); } -__kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS +__kernel void weight_width_image_to_buffer(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, // OIHW __private const int in_channels, @@ -609,7 +609,7 @@ __kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS } // only support 3x3 now -__kernel void winograd_filter_buffer_to_image_2x2(KERNEL_ERROR_PARAMS +__kernel void winograd_filter_buffer_to_image_2x2(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, //Oc, Ic, H, W __private const int input_offset, @@ -714,7 +714,7 @@ __kernel void winograd_filter_buffer_to_image_2x2(KERNEL_ERROR_PARAMS } // only support 3x3 now -__kernel void winograd_filter_image_to_buffer_2x2(KERNEL_ERROR_PARAMS +__kernel void winograd_filter_image_to_buffer_2x2(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, //Oc, Ic, H, W __private const int height, @@ -757,7 +757,7 @@ __kernel void winograd_filter_image_to_buffer_2x2(KERNEL_ERROR_PARAMS } // only support 3x3 now -__kernel void winograd_filter_buffer_to_image_6x6(KERNEL_ERROR_PARAMS +__kernel void winograd_filter_buffer_to_image_6x6(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, //Oc, Ic, H, W __private const int input_offset, @@ -891,7 +891,7 @@ PROCESS(7); #undef PROCESS } -__kernel void winograd_filter_image_to_buffer_6x6(KERNEL_ERROR_PARAMS +__kernel void winograd_filter_image_to_buffer_6x6(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, //Oc, Ic, H, W __private const int height, @@ -933,7 +933,7 @@ __kernel void winograd_filter_image_to_buffer_6x6(KERNEL_ERROR_PARAMS } // only support 3x3 now -__kernel void winograd_filter_buffer_to_image_4x4(KERNEL_ERROR_PARAMS +__kernel void winograd_filter_buffer_to_image_4x4(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 
__global const DATA_TYPE *input, //Oc, Ic, H, W
                                                   __private const int input_offset,
@@ -1040,7 +1040,7 @@ __kernel void winograd_filter_buffer_to_image_4x4(KERNEL_ERROR_PARAMS
 #undef PROCESS
 }
 
-__kernel void winograd_filter_image_to_buffer_4x4(KERNEL_ERROR_PARAMS
+__kernel void winograd_filter_image_to_buffer_4x4(OUT_OF_RANGE_PARAMS
                                                   GLOBAL_WORK_GROUP_SIZE_DIM2
                                                   __global DATA_TYPE *output, //Oc, Ic, H, W
                                                   __private const int height,
diff --git a/mace/kernels/opencl/cl/buffer_transform.cl b/mace/kernels/opencl/cl/buffer_transform.cl
new file mode 100644
index 0000000000000000000000000000000000000000..d5f00e31b19aed182e63e3405322c0184c6c2f17
--- /dev/null
+++ b/mace/kernels/opencl/cl/buffer_transform.cl
@@ -0,0 +1,217 @@
+#include <common.h>
+
+// Copies an NHWC buffer into a larger NHWC buffer, shifted by (pad_top,
+// pad_left). Positions outside the input, and channels past in_chan, are
+// written as zero. Each work-item stores 4 channels; switch cases fall through.
+__kernel void pad_input(BUFFER_OUT_OF_RANGE_PARAMS
+                        GLOBAL_WORK_GROUP_SIZE_DIM2
+                        __global IN_DATA_TYPE *input,
+                        __private const int in_height,
+                        __private const int in_width,
+                        __private const int in_chan,
+                        __private const int padded_height,
+                        __private const int padded_width,
+                        __private const int padded_chan,
+                        __private const int pad_top,
+                        __private const int pad_left,
+                        __global DATA_TYPE *output) {
+  const int padded_wc_blk_idx = get_global_id(0);
+  const int padded_hb_idx = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (padded_wc_blk_idx >= global_size_dim0 ||
+      padded_hb_idx >= global_size_dim1) {
+    return;
+  }
+#endif
+  const int padded_chan_blk = (padded_chan + 3) >> 2;
+  const int padded_width_idx = padded_wc_blk_idx / padded_chan_blk;
+  const int padded_chan_blk_idx = padded_wc_blk_idx % padded_chan_blk;
+  const int batch_idx = padded_hb_idx / padded_height;
+  const int padded_height_idx = padded_hb_idx % padded_height;
+  const int padded_chan_idx = padded_chan_blk_idx << 2;
+  const int in_height_idx = padded_height_idx - pad_top;
+  const int in_width_idx = padded_width_idx - pad_left;
+  const int padded_offset = mad24(mad24(mad24(batch_idx, padded_height, padded_height_idx),
+      padded_width, padded_width_idx), padded_chan, padded_chan_idx);
+  const int in_offset = mad24(mad24(mad24(batch_idx, in_height, in_height_idx),
+      in_width, in_width_idx), in_chan, padded_chan_idx);
+
+  DATA_TYPE4 value = 0;
+  if (0 <= in_height_idx && in_height_idx < in_height &&
+      0 <= in_width_idx && in_width_idx < in_width) {
+    const int remain_chan = in_chan - padded_chan_idx;
+    if (remain_chan < 4) {
+      switch (remain_chan) {
+        case 3:
+          value.z = CONVERT(input[in_offset + 2]);
+        case 2:
+          value.y = CONVERT(input[in_offset + 1]);
+        case 1:
+          value.x = CONVERT(input[in_offset]);
+      }
+    } else {
+      value = CONVERT4(vload4(0, input + in_offset));
+    }
+  }
+  VSTORE4(value, output, padded_offset);
+}
+
+// OIHW -> [H, W, (O+3) / 4, I, 4]
+// Entries past out_chan or in_chan are zero; switch cases fall through.
+__kernel void transform_conv_filter(BUFFER_OUT_OF_RANGE_PARAMS
+                                    GLOBAL_WORK_GROUP_SIZE_DIM3
+                                    __global IN_DATA_TYPE *input, // OIHW
+                                    __private const int input_offset,
+                                    __global DATA_TYPE *output,
+                                    __private const int out_chan,
+                                    __private const int in_chan,
+                                    __private const int height,
+                                    __private const int width,
+                                    __private const int inner_size) {
+  const int in_chan_idx = get_global_id(0);
+  const int out_chan_blk_idx = get_global_id(1);
+  const int hw_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (in_chan_idx >= global_size_dim0 ||
+      out_chan_blk_idx >= global_size_dim1 ||
+      hw_idx >= global_size_dim2) {
+    return;
+  }
+#endif
+  const int t_in_chan = global_size_dim0;
+  const int out_chan_blk = global_size_dim1;
+
+  const int h_idx = hw_idx / width;
+  const int w_idx = hw_idx % width;
+  const int out_chan_idx = out_chan_blk_idx << 2;
+  const int in_offset = mad24(mad24(mad24(out_chan_idx, in_chan, in_chan_idx),
+      height, h_idx), width, w_idx) + input_offset;
+  const int out_offset = (mad24(mad24(mad24(h_idx, width, w_idx),
+      out_chan_blk, out_chan_blk_idx), t_in_chan, in_chan_idx) << 2);
+
+  DATA_TYPE4 value = 0;
+  if (in_chan_idx < in_chan) {
+    if (out_chan_idx + 3 < out_chan) {
+      value.x = CONVERT(input[in_offset]);
+      value.y = CONVERT(input[in_offset + inner_size]);
+      value.z = CONVERT(input[in_offset + 2 * inner_size]);
+      value.w = CONVERT(input[in_offset + 3 * inner_size]);
+    } else {
+      const int diff = out_chan - out_chan_idx;
+      switch(diff) {
+        case 3:
+          value.z = CONVERT(input[in_offset + 2 * inner_size]);
+        case 2:
+          value.y = CONVERT(input[in_offset + inner_size]);
+        case 1:
+          value.x = CONVERT(input[in_offset]);
+      }
+    }
+  }
+  VSTORE4(value, output, out_offset);
+}
+
+// MIHW -> [M, (I+3) / 4, H, W, 4]
+// Channels past in_chan in the last block are zero; switch cases fall through.
+__kernel void transform_dw_conv_filter(BUFFER_OUT_OF_RANGE_PARAMS
+                                       GLOBAL_WORK_GROUP_SIZE_DIM3
+                                       __global IN_DATA_TYPE *input, // MIHW
+                                       __private const int input_offset,
+                                       __global DATA_TYPE *output,
+                                       __private const int in_chan,
+                                       __private const int in_hw) {
+  const int width_idx = get_global_id(0);
+  const int height_idx = get_global_id(1);
+  const int in_chan_blk_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (width_idx >= global_size_dim0 ||
+      height_idx >= global_size_dim1 ||
+      in_chan_blk_idx >= global_size_dim2) {
+    return;
+  }
+#endif
+  const int width = global_size_dim0;
+  const int height = global_size_dim1;
+  const int in_chan_idx = in_chan_blk_idx << 2;
+
+  const int in_offset = mad24(in_chan_idx, in_hw,
+      mad24(height_idx, width, width_idx)) + input_offset;
+  const int out_offset = mad24(in_chan_blk_idx, in_hw,
+      mad24(height_idx, width, width_idx)) << 2;
+
+  DATA_TYPE4 value = 0;
+  if (in_chan_idx + 3 < in_chan) {
+    value.x = CONVERT(input[in_offset]);
+    value.y = CONVERT(input[in_offset + in_hw]);
+    value.z = CONVERT(input[in_offset + (in_hw << 1)]);
+    value.w = CONVERT(input[in_offset + in_hw + (in_hw << 1)]);
+  } else {
+    const int diff = in_chan - in_chan_idx;
+    switch(diff) {
+      case 3:
+        value.z = CONVERT(input[in_offset + (in_hw << 1)]);
+      case 2:
+        value.y = CONVERT(input[in_offset + in_hw]);
+      case 1:
+        value.x = CONVERT(input[in_offset]);
+    }
+  }
+  VSTORE4(value, output, out_offset);
+}
+
+// Copies `size` elements starting at input_offset in blocks of 4, converting
+// to DATA_TYPE. The unused tail of the last block is written as zero.
+__kernel void transform_arg(BUFFER_OUT_OF_RANGE_PARAMS
+                            __private const int global_size_dim0,
+                            __global IN_DATA_TYPE *input,
+                            __private const int input_offset,
+                            __global DATA_TYPE *output,
+                            __private int size) {
+  const int blk_idx = get_global_id(0);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (blk_idx >= global_size_dim0) {
+    return;
+  }
+#endif
+  const int idx = blk_idx << 2;
+  const int diff = size - idx;
+  const int in_idx = idx + input_offset;
+  DATA_TYPE4 value = 0;
+  if (diff < 4) {
+    switch (diff) {
+      case 3:
+        value.z = CONVERT(input[in_idx + 2]);
+      case 2:
+        value.y = CONVERT(input[in_idx + 1]);
+      case 1:
+        value.x = CONVERT(input[in_idx]);
+    }
+  } else {
+    value = CONVERT4(vload4(0, input + in_idx));
+  }
+
+  VSTORE4(value, output, idx);
+}
+
+// Converts 4 elements per work-item from IN_DATA_TYPE to DATA_TYPE.
+// NOTE(review): no tail handling or range check here; confirm host sizing.
+__kernel void transform_data_type(BUFFER_OUT_OF_RANGE_PARAMS
+                                  __private const int global_size_dim0,
+                                  __global IN_DATA_TYPE *input,
+                                  __private const int input_offset,
+                                  __global DATA_TYPE *output) {
+  const int out_idx = get_global_id(0);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_idx >= global_size_dim0) {
+    return;
+  }
+#endif
+
+  DATA_TYPE4 input_value = CONVERT4(vload4(out_idx, input + input_offset));
+  vstore4(input_value, out_idx, output);
+}
diff --git a/mace/kernels/opencl/cl/channel_shuffle.cl b/mace/kernels/opencl/cl/channel_shuffle.cl
index 6563c7a8b79db82f9829cde65bcb5e558553ec0e..556de82ebcc4701cd236ad2732403ca16d89ad1c 100644
--- a/mace/kernels/opencl/cl/channel_shuffle.cl
+++ b/mace/kernels/opencl/cl/channel_shuffle.cl
@@ -1,7 +1,7 @@
 #include <common.h>
 
 // assume channes_per_group mod 4 = 0 && groups mod 4 == 0
-__kernel void channel_shuffle(KERNEL_ERROR_PARAMS
+__kernel void channel_shuffle(OUT_OF_RANGE_PARAMS
                               GLOBAL_WORK_GROUP_SIZE_DIM3
                               __read_only image2d_t input,
                               __private const int groups,
diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h
index df34fee96c19909397ae96557f4cad3e0b9f9745..abfdd9785339a0d6664fb610771ba25b19c3266f 100644
--- a/mace/kernels/opencl/cl/common.h
+++ b/mace/kernels/opencl/cl/common.h @@ -24,12 +24,42 @@ #define CMD_TYPE(cmd, type) CMD_TYPE_STR(cmd, type) #define DATA_TYPE4 VEC_DATA_TYPE(DATA_TYPE, 4) +#define OUT_DATA_TYPE4 VEC_DATA_TYPE(OUT_DATA_TYPE, 4) +#define CONVERT_STR(value, type) convert_##type((value)) + +#define CONVERT_TO(value, type) CONVERT_STR(value, type) +#define CONVERT(value) CONVERT_TO(value, DATA_TYPE) +#define CONVERT4(value) CONVERT_TO(value, DATA_TYPE4) + +#define GLOBAL_WORK_GROUP_SIZE_DIM2 \ + __private const int global_size_dim0, \ + __private const int global_size_dim1, + +#define GLOBAL_WORK_GROUP_SIZE_DIM3 \ + __private const int global_size_dim0, \ + __private const int global_size_dim1, \ + __private const int global_size_dim2, + +// oorc for 'Out Of Range Check' #ifdef OUT_OF_RANGE_CHECK +#define OUT_OF_RANGE_PARAMS \ + __global int *oorc_flag, + +#define BUFFER_OUT_OF_RANGE_PARAMS \ + __global int *oorc_flag, \ + __private const int oorc_output_length, + #define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \ - check_out_of_range_for_image2d(image, (coord).x, (coord).y, kernel_error); + check_out_of_range_for_image2d(image, (coord).x, (coord).y, oorc_flag); + +#define CHECK_OUT_OF_RANGE_FOR_BUFFER(idx) \ + check_out_of_range_for_buffer(oorc_output_length, (idx), oorc_flag); #else +#define OUT_OF_RANGE_PARAMS +#define BUFFER_OUT_OF_RANGE_PARAMS #define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) +#define CHECK_OUT_OF_RANGE_FOR_BUFFER(idx) #endif #define READ_IMAGET(image, sampler, coord) \ @@ -38,25 +68,10 @@ CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \ CMD_TYPE(write_image, CMD_DATA_TYPE)(image, coord, value); -#define GLOBAL_WORK_GROUP_SIZE_DIM2 \ - __private const int global_size_dim0, \ - __private const int global_size_dim1, - -#define GLOBAL_WORK_GROUP_SIZE_DIM3 \ - __private const int global_size_dim0, \ - __private const int global_size_dim1, \ - __private const int global_size_dim2, - -#ifdef OUT_OF_RANGE_CHECK - -#define KERNEL_ERROR_PARAMS \ - __global char 
*kernel_error, - -#else +#define VSTORE4(data, output, offset) \ + CHECK_OUT_OF_RANGE_FOR_BUFFER((offset) + 3) \ + vstore4(data, 0, output + (offset)); -#define KERNEL_ERROR_PARAMS - -#endif __constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; @@ -66,6 +81,7 @@ inline float4 do_sigmoid(float4 in) { return native_recip(1.0f + native_exp(-in)); } +#ifdef DATA_TYPE inline DATA_TYPE4 do_activation(DATA_TYPE4 in, #ifdef USE_PRELU DATA_TYPE4 prelu_alpha, @@ -89,17 +105,25 @@ inline DATA_TYPE4 do_activation(DATA_TYPE4 in, #endif return out; } +#endif inline void check_out_of_range_for_image2d(__write_only image2d_t image, __private const int x, __private const int y, - global char *kernel_error) { -#ifdef OUT_OF_RANGE_CHECK + __global int *oorc_flag) { int2 image_dim = get_image_dim(image); if (x >= image_dim.x || y >= image_dim.y) { - *kernel_error = 1; + *oorc_flag = 1; + } +} + +inline void check_out_of_range_for_buffer(__private const int length, + __private const int idx, + __global int *oorc_flag) { + if (idx >= length) { + *oorc_flag = idx - length + 1; } -#endif } + #endif // MACE_KERNELS_OPENCL_CL_COMMON_H_ diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl index e656109cef25a5b41ffaa8d166f0f90a8579af66..7f36c5b4525c98897e247fb3909e4ad388d465d4 100644 --- a/mace/kernels/opencl/cl/concat.cl +++ b/mace/kernels/opencl/cl/concat.cl @@ -22,7 +22,7 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left, } // Supported data type: half/float -__kernel void concat_channel(KERNEL_ERROR_PARAMS +__kernel void concat_channel(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input0, __read_only image2d_t input1, @@ -84,7 +84,7 @@ __kernel void concat_channel(KERNEL_ERROR_PARAMS } // Required: All input channels are divisible by 4 -__kernel void concat_channel_multi(KERNEL_ERROR_PARAMS +__kernel void concat_channel_multi(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t 
input, __private const int chan_blk_offset, diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl index b645502c48797d92a8c45691a466d7f67f1bb704..b5ec1b150eaffa022274e30b08d2685dc14f4620 100644 --- a/mace/kernels/opencl/cl/conv_2d.cl +++ b/mace/kernels/opencl/cl/conv_2d.cl @@ -1,6 +1,6 @@ #include -__kernel void conv_2d(KERNEL_ERROR_PARAMS +__kernel void conv_2d(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */ diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index b9b387e1926304a4431c5e63e096d8fac8592430..ffb8a434ef033075382a3d639cf2097ce92e4407 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -1,6 +1,6 @@ #include -__kernel void conv_2d_1x1(KERNEL_ERROR_PARAMS +__kernel void conv_2d_1x1(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * cin, cout/4 */ diff --git a/mace/kernels/opencl/cl/conv_2d_1x1_buffer.cl b/mace/kernels/opencl/cl/conv_2d_1x1_buffer.cl new file mode 100644 index 0000000000000000000000000000000000000000..d49895fa695f95f8c3f033675cd6b9090cb52d45 --- /dev/null +++ b/mace/kernels/opencl/cl/conv_2d_1x1_buffer.cl @@ -0,0 +1,111 @@ +#include + +__kernel void conv2d(BUFFER_OUT_OF_RANGE_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 + __global IN_DATA_TYPE *padded_input, + __global IN_DATA_TYPE *filter, +#ifdef BIAS + __global IN_DATA_TYPE *bias, +#endif + __private const int in_height, + __private const int in_width, + __private const int in_chan, + __private const int filter_in_chan, + __private const int out_height, + __private const int out_width, + __private const int out_chan, + __private const int stride_h, + __private const int stride_w, + __private const float relux_max_limit, + __global OUT_DATA_TYPE *output) { + const 
int out_wc_blk_idx = get_global_id(0); + const int out_hb_idx = get_global_id(1); + +#ifndef NON_UNIFORM_WORK_GROUP + if (out_wc_blk_idx >= global_size_dim0 || + out_hb_idx >= global_size_dim1) { + return; + } +#endif + const int out_chan_blk = (out_chan + 3) >> 2; + + const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk; + const int out_chan_blk_idx = out_wc_blk_idx % out_chan_blk; + + const int batch_idx = out_hb_idx / out_height; + const int out_height_idx = out_hb_idx % out_height; + const int out_width_idx = out_width_blk_idx << 1; + const int out_chan_idx = out_chan_blk_idx << 2; + + const int in_height_idx = mul24(out_height_idx, stride_h); + const int in_width_idx = mul24(out_width_idx, stride_w); + const int strided_chan = mul24(in_chan, stride_w); + +#ifdef BIAS + DATA_TYPE4 out0 = CONVERT4(vload4(0, bias + out_chan_idx)); + DATA_TYPE4 out1 = out0; +#else + DATA_TYPE4 out0 = 0; + DATA_TYPE4 out1 = 0; +#endif + + int in_offset = mul24(mad24(mad24(batch_idx, in_height, in_height_idx), + in_width, in_width_idx), in_chan); + int filter_offset = mul24(out_chan_blk_idx, filter_in_chan) << 2; + DATA_TYPE4 in0, in1; + DATA_TYPE4 w0, w1, w2, w3; + + for (int in_chan_idx = 0; in_chan_idx < in_chan; in_chan_idx += 4) { + w0 = CONVERT4(vload4(0, filter + filter_offset)); + w1 = CONVERT4(vload4(0, filter + filter_offset + 4)); + w2 = CONVERT4(vload4(0, filter + filter_offset + 8)); + w3 = CONVERT4(vload4(0, filter + filter_offset + 12)); + + in0 = CONVERT4(vload4(0, padded_input + in_offset)); + in1 = CONVERT4(vload4(0, padded_input + in_offset + strided_chan)); + + out0 = mad((DATA_TYPE4)(in0.x), w0, out0); + out0 = mad((DATA_TYPE4)(in0.y), w1, out0); + out0 = mad((DATA_TYPE4)(in0.z), w2, out0); + out0 = mad((DATA_TYPE4)(in0.w), w3, out0); + + out1 = mad((DATA_TYPE4)(in1.x), w0, out1); + out1 = mad((DATA_TYPE4)(in1.y), w1, out1); + out1 = mad((DATA_TYPE4)(in1.z), w2, out1); + out1 = mad((DATA_TYPE4)(in1.w), w3, out1); + + filter_offset += 16; + in_offset += 4; 
+ } + +#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID) + out0 = do_activation(out0, relux_max_limit); + out1 = do_activation(out1, relux_max_limit); +#endif + + int out_offset = mad24(mad24(mad24(batch_idx, out_height, out_height_idx), + out_width, out_width_idx), out_chan, out_chan_idx); + +#define WRITE_OUTPUT(i) \ + if (out_chan_idx + 4 > out_chan) { \ + const int diff = out_chan - out_chan_idx; \ + switch(diff) { \ + case 3: \ + output[out_offset + 2] = CONVERT_TO(out##i.z, OUT_DATA_TYPE); \ + case 2: \ + output[out_offset + 1] = CONVERT_TO(out##i.y, OUT_DATA_TYPE); \ + case 1: \ + output[out_offset] = CONVERT_TO(out##i.x, OUT_DATA_TYPE); \ + } \ + CHECK_OUT_OF_RANGE_FOR_BUFFER(out_offset + diff - 1); \ + } else { \ + VSTORE4(CONVERT_TO(out##i, OUT_DATA_TYPE4), output, out_offset); \ + } + + WRITE_OUTPUT(0); + if (out_width_idx + 1 >= out_width) return; + out_offset += out_chan; + WRITE_OUTPUT(1); +#undef WRITE_OUTPUT + +} diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index 076032879a53b572a989f1b59d7a4ea171c11b18..f4172e59984f73e3787b5e1fd74af66a98f99753 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -1,6 +1,6 @@ #include -__kernel void conv_2d_3x3(KERNEL_ERROR_PARAMS +__kernel void conv_2d_3x3(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */ diff --git a/mace/kernels/opencl/cl/conv_2d_buffer.cl b/mace/kernels/opencl/cl/conv_2d_buffer.cl new file mode 100644 index 0000000000000000000000000000000000000000..e0c0c56412e7a0d02b5aa723ae2fd4dece421983 --- /dev/null +++ b/mace/kernels/opencl/cl/conv_2d_buffer.cl @@ -0,0 +1,147 @@ +#include + +__kernel void conv2d(BUFFER_OUT_OF_RANGE_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 + __global IN_DATA_TYPE *padded_input, + __global IN_DATA_TYPE *filter, +#ifdef BIAS + 
__global IN_DATA_TYPE *bias, +#endif + __private const int in_height, + __private const int in_width, + __private const int in_chan, + __private const int filter_height, + __private const int filter_width, + __private const int filter_in_chan, + __private const int filter_chan_size, + __private const int out_height, + __private const int out_width, + __private const int out_chan, + __private const int stride_h, + __private const int stride_w, + __private const int dilated_h_offset, + __private const int dilated_w_offset, + __private const float relux_max_limit, + __global OUT_DATA_TYPE *output) { + const int out_wc_blk_idx = get_global_id(0); + const int out_hb_idx = get_global_id(1); + +#ifndef NON_UNIFORM_WORK_GROUP + if (out_wc_blk_idx >= global_size_dim0 || + out_hb_idx >= global_size_dim1) { + return; + } +#endif + const int out_chan_blk = (out_chan + 3) >> 2; + + const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk; + const int out_chan_blk_idx = out_wc_blk_idx % out_chan_blk; + + const int batch_idx = out_hb_idx / out_height; + const int out_height_idx = out_hb_idx % out_height; + const int out_width_idx = out_width_blk_idx << 2; + const int out_chan_idx = out_chan_blk_idx << 2; + + const int in_height_idx = mul24(out_height_idx, stride_h); + const int in_width_idx = mul24(out_width_idx, stride_w); + const int strided_chan = mul24(in_chan, stride_w); + +#ifdef BIAS + DATA_TYPE4 out0 = CONVERT4(vload4(0, bias + out_chan_idx)); + DATA_TYPE4 out1 = out0; + DATA_TYPE4 out2 = out0; + DATA_TYPE4 out3 = out0; +#else + DATA_TYPE4 out0 = 0; + DATA_TYPE4 out1 = 0; + DATA_TYPE4 out2 = 0; + DATA_TYPE4 out3 = 0; +#endif + + const int in_offset_base = mul24(mad24(mad24(batch_idx, in_height, in_height_idx), + in_width, in_width_idx), in_chan); + int filter_offset_base = mul24(out_chan_blk_idx, filter_in_chan) << 2; + DATA_TYPE4 in0, in1, in2, in3; + DATA_TYPE4 w0, w1, w2, w3; + + for (int filter_h_idx = 0; filter_h_idx < filter_height; ++filter_h_idx) { + int 
in_height_offset = mad24(filter_h_idx, dilated_h_offset, in_offset_base); + for (int filter_w_idx = 0; filter_w_idx < filter_width; ++filter_w_idx) { + int filter_offset = filter_offset_base; + int in_offset = mad24(filter_w_idx, dilated_w_offset, in_height_offset); + for (int in_chan_idx = 0; in_chan_idx < in_chan; in_chan_idx += 4) { + w0 = CONVERT4(vload4(0, filter + filter_offset)); + w1 = CONVERT4(vload4(0, filter + filter_offset + 4)); + w2 = CONVERT4(vload4(0, filter + filter_offset + 8)); + w3 = CONVERT4(vload4(0, filter + filter_offset + 12)); + + in0 = CONVERT4(vload4(0, padded_input + in_offset)); + in1 = CONVERT4(vload4(0, padded_input + in_offset + strided_chan)); + in2 = CONVERT4(vload4(0, padded_input + in_offset + (strided_chan << 1))); + in3 = CONVERT4(vload4(0, padded_input + in_offset + strided_chan + (strided_chan << 1))); + + out0 = mad((DATA_TYPE4)(in0.x), w0, out0); + out0 = mad((DATA_TYPE4)(in0.y), w1, out0); + out0 = mad((DATA_TYPE4)(in0.z), w2, out0); + out0 = mad((DATA_TYPE4)(in0.w), w3, out0); + + out1 = mad((DATA_TYPE4)(in1.x), w0, out1); + out1 = mad((DATA_TYPE4)(in1.y), w1, out1); + out1 = mad((DATA_TYPE4)(in1.z), w2, out1); + out1 = mad((DATA_TYPE4)(in1.w), w3, out1); + + out2 = mad((DATA_TYPE4)(in2.x), w0, out2); + out2 = mad((DATA_TYPE4)(in2.y), w1, out2); + out2 = mad((DATA_TYPE4)(in2.z), w2, out2); + out2 = mad((DATA_TYPE4)(in2.w), w3, out2); + + out3 = mad((DATA_TYPE4)(in3.x), w0, out3); + out3 = mad((DATA_TYPE4)(in3.y), w1, out3); + out3 = mad((DATA_TYPE4)(in3.z), w2, out3); + out3 = mad((DATA_TYPE4)(in3.w), w3, out3); + filter_offset += 16; + in_offset += 4; + } + filter_offset_base += filter_chan_size; + } + } + +#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID) + out0 = do_activation(out0, relux_max_limit); + out1 = do_activation(out1, relux_max_limit); + out2 = do_activation(out2, relux_max_limit); + out3 = do_activation(out3, relux_max_limit); +#endif + + int out_offset = 
mad24(mad24(mad24(batch_idx, out_height, out_height_idx), + out_width, out_width_idx), out_chan, out_chan_idx); + +#define WRITE_OUTPUT(i) \ + if (out_chan_idx + 4 > out_chan) { \ + const int diff = out_chan - out_chan_idx; \ + switch(diff) { \ + case 3: \ + output[out_offset + 2] = CONVERT_TO(out##i.z, OUT_DATA_TYPE); \ + case 2: \ + output[out_offset + 1] = CONVERT_TO(out##i.y, OUT_DATA_TYPE); \ + case 1: \ + output[out_offset] = CONVERT_TO(out##i.x, OUT_DATA_TYPE); \ + } \ + CHECK_OUT_OF_RANGE_FOR_BUFFER(out_offset + diff - 1); \ + } else { \ + VSTORE4(CONVERT_TO(out##i, OUT_DATA_TYPE4), output, out_offset); \ + } + + WRITE_OUTPUT(0); + if (out_width_idx + 1 >= out_width) return; + out_offset += out_chan; + WRITE_OUTPUT(1); + if (out_width_idx + 2 >= out_width) return; + out_offset += out_chan; + WRITE_OUTPUT(2); + if (out_width_idx + 3 >= out_width) return; + out_offset += out_chan; + WRITE_OUTPUT(3); +#undef WRITE_OUTPUT + +} diff --git a/mace/kernels/opencl/cl/crop.cl b/mace/kernels/opencl/cl/crop.cl index 609398feecdd537cb1e49e4ef7a6f6e6f472b95b..3145b2c41a323e385f2162bfcd80d9e520885d6c 100644 --- a/mace/kernels/opencl/cl/crop.cl +++ b/mace/kernels/opencl/cl/crop.cl @@ -1,6 +1,6 @@ #include -__kernel void crop(KERNEL_ERROR_PARAMS +__kernel void crop(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int offset_b, diff --git a/mace/kernels/opencl/cl/deconv_2d.cl b/mace/kernels/opencl/cl/deconv_2d.cl index d7728f3e15e0a5b474d20aa48f12e6d4f0d7a6db..ae08b4098f759fe16a703af61ad75aa254ff3692 100644 --- a/mace/kernels/opencl/cl/deconv_2d.cl +++ b/mace/kernels/opencl/cl/deconv_2d.cl @@ -1,6 +1,6 @@ #include -__kernel void deconv_2d(KERNEL_ERROR_PARAMS +__kernel void deconv_2d(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __read_only image2d_t weights, diff --git a/mace/kernels/opencl/cl/depth_to_space.cl b/mace/kernels/opencl/cl/depth_to_space.cl index 
7267c61e1c7f9a3a88f04c5a798682ffb5325876..8ac80a8fbfa7d88911cdbb8410114fb2e67e000e 100644 --- a/mace/kernels/opencl/cl/depth_to_space.cl +++ b/mace/kernels/opencl/cl/depth_to_space.cl @@ -1,6 +1,6 @@ #include -__kernel void depth_to_space(KERNEL_ERROR_PARAMS +__kernel void depth_to_space(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int block_size, diff --git a/mace/kernels/opencl/cl/depthwise_conv2d.cl b/mace/kernels/opencl/cl/depthwise_conv2d.cl index 979c882b2565f42b7579a92b4d1553e93b9b602b..59761ee59d17bdf190fe11a4cc9fa297221fc08e 100644 --- a/mace/kernels/opencl/cl/depthwise_conv2d.cl +++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl @@ -1,7 +1,7 @@ #include // Only multiplier = 1 is supported -__kernel void depthwise_conv2d(KERNEL_ERROR_PARAMS +__kernel void depthwise_conv2d(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ @@ -136,7 +136,7 @@ __kernel void depthwise_conv2d(KERNEL_ERROR_PARAMS WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3); } -__kernel void depthwise_conv2d_s1(KERNEL_ERROR_PARAMS +__kernel void depthwise_conv2d_s1(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ diff --git a/mace/kernels/opencl/cl/depthwise_conv2d_buffer.cl b/mace/kernels/opencl/cl/depthwise_conv2d_buffer.cl new file mode 100644 index 0000000000000000000000000000000000000000..31f2460b2aa8a20718c8d4440d44749e8b787b3f --- /dev/null +++ b/mace/kernels/opencl/cl/depthwise_conv2d_buffer.cl @@ -0,0 +1,126 @@ +#include + +#define BLOCK_SIZE 4 +__kernel void depthwise_conv2d(BUFFER_OUT_OF_RANGE_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 + __global IN_DATA_TYPE *padded_input, + __global IN_DATA_TYPE *filter, +#ifdef BIAS + __global IN_DATA_TYPE *bias, +#endif + __private const int in_height, + 
__private const int in_width, + __private const int in_chan, + __private const int filter_height, + __private const int filter_width, + __private const int filter_hw, + __private const int out_height, + __private const int out_width, + __private const int out_chan, + __private const int stride_h, + __private const int stride_w, + __private const int dilated_h_offset, + __private const int dilated_w_offset, + __private const float relux_max_limit, + __global OUT_DATA_TYPE *output) { + const int out_wc_blk_idx = get_global_id(0); + const int out_hb_idx = get_global_id(1); + +#ifndef NON_UNIFORM_WORK_GROUP + if (out_wc_blk_idx >= global_size_dim0 || + out_hb_idx >= global_size_dim1) { + return; + } +#endif + const int out_chan_blk = (out_chan + 3) >> 2; + + const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk; + const int out_chan_blk_idx = out_wc_blk_idx % out_chan_blk; + + const int batch_idx = out_hb_idx / out_height; + const int out_height_idx = out_hb_idx % out_height; + const int out_width_idx = out_width_blk_idx << 2; + const int out_chan_idx = out_chan_blk_idx << 2; + const int in_chan_idx = out_chan_idx; + + const int in_height_idx = mul24(out_height_idx, stride_h); + const int in_width_idx = mul24(out_width_idx, stride_w); + const int strided_chan = mul24(in_chan, stride_w); + +#ifdef BIAS + DATA_TYPE4 out0 = CONVERT4(vload4(0, bias + out_chan_idx)); + DATA_TYPE4 out1 = out0; + DATA_TYPE4 out2 = out0; + DATA_TYPE4 out3 = out0; +#else + DATA_TYPE4 out0 = 0; + DATA_TYPE4 out1 = 0; + DATA_TYPE4 out2 = 0; + DATA_TYPE4 out3 = 0; +#endif + + const int in_offset_base = mad24(mad24(mad24(batch_idx, in_height, in_height_idx), + in_width, in_width_idx), in_chan, in_chan_idx); + int filter_offset = mul24(out_chan_blk_idx, filter_hw) << 2; + DATA_TYPE4 in0, in1, in2, in3; + DATA_TYPE4 w; + + for (int filter_h_idx = 0; filter_h_idx < filter_height; ++filter_h_idx) { + int in_offset = mad24(filter_h_idx, dilated_h_offset, in_offset_base); + for (int filter_w_idx = 
0; filter_w_idx < filter_width; ++filter_w_idx) { + w = CONVERT4(vload4(0, filter + filter_offset)); + + in0 = CONVERT4(vload4(0, padded_input + in_offset)); + in1 = CONVERT4(vload4(0, padded_input + in_offset + strided_chan)); + in2 = CONVERT4(vload4(0, padded_input + in_offset + (strided_chan << 1))); + in3 = CONVERT4(vload4(0, padded_input + in_offset + strided_chan + (strided_chan << 1))); + + out0 = mad(in0, w, out0); + out1 = mad(in1, w, out1); + out2 = mad(in2, w, out2); + out3 = mad(in3, w, out3); + + filter_offset += 4; + in_offset += dilated_w_offset; + } + } + +#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID) + out0 = do_activation(out0, relux_max_limit); + out1 = do_activation(out1, relux_max_limit); + out2 = do_activation(out2, relux_max_limit); + out3 = do_activation(out3, relux_max_limit); +#endif + + int out_offset = mad24(mad24(mad24(batch_idx, out_height, out_height_idx), + out_width, out_width_idx), out_chan, out_chan_idx); + +#define WRITE_OUTPUT(i) \ + if (out_chan_idx + 4 > out_chan) { \ + const int diff = out_chan - out_chan_idx; \ + switch(diff) { \ + case 3: \ + output[out_offset + 2] = CONVERT_TO(out##i.z, OUT_DATA_TYPE); \ + case 2: \ + output[out_offset + 1] = CONVERT_TO(out##i.y, OUT_DATA_TYPE); \ + case 1: \ + output[out_offset] = CONVERT_TO(out##i.x, OUT_DATA_TYPE); \ + } \ + CHECK_OUT_OF_RANGE_FOR_BUFFER(out_offset + diff - 1); \ + } else { \ + VSTORE4(CONVERT_TO(out##i, OUT_DATA_TYPE4), output, out_offset); \ + } + + WRITE_OUTPUT(0); + if (out_width_idx + 1 >= out_width) return; + out_offset += out_chan; + WRITE_OUTPUT(1); + if (out_width_idx + 2 >= out_width) return; + out_offset += out_chan; + WRITE_OUTPUT(2); + if (out_width_idx + 3 >= out_width) return; + out_offset += out_chan; + WRITE_OUTPUT(3); +#undef WRITE_OUTPUT + +} diff --git a/mace/kernels/opencl/cl/eltwise.cl b/mace/kernels/opencl/cl/eltwise.cl index 
52ee65eb0f95fa144e82ea6fce7a4ee928615613..9de68bc7a54ec4bcfb4a27929aeabbea01c5f2e1 100644 --- a/mace/kernels/opencl/cl/eltwise.cl +++ b/mace/kernels/opencl/cl/eltwise.cl @@ -1,6 +1,6 @@ #include -__kernel void eltwise(KERNEL_ERROR_PARAMS +__kernel void eltwise(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input0, #if INPUT_TYPE == 1 diff --git a/mace/kernels/opencl/cl/fully_connected.cl b/mace/kernels/opencl/cl/fully_connected.cl index 0aea2ee56e99f3213b178f8ff60e6963bda2d052..9a76dfed2dabe1d4f099741064bfc618cf154120 100644 --- a/mace/kernels/opencl/cl/fully_connected.cl +++ b/mace/kernels/opencl/cl/fully_connected.cl @@ -1,7 +1,7 @@ #include // output = weight * input + bias -__kernel void fully_connected(KERNEL_ERROR_PARAMS +__kernel void fully_connected(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input, __read_only image2d_t weight, @@ -64,7 +64,7 @@ __kernel void fully_connected(KERNEL_ERROR_PARAMS } // output = weight * input + bias -__kernel void fully_connected_width(KERNEL_ERROR_PARAMS +__kernel void fully_connected_width(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __read_only image2d_t weight, diff --git a/mace/kernels/opencl/cl/lstmcell.cl b/mace/kernels/opencl/cl/lstmcell.cl index c020eb56c7c296cc1a8e0a62cb098924f6223cff..909c63d0453d90e10baa71cc327b6ac2a8a0596e 100644 --- a/mace/kernels/opencl/cl/lstmcell.cl +++ b/mace/kernels/opencl/cl/lstmcell.cl @@ -1,6 +1,6 @@ #include -__kernel void lstmcell(KERNEL_ERROR_PARAMS +__kernel void lstmcell(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input, __read_only image2d_t pre_output, diff --git a/mace/kernels/opencl/cl/matmul.cl b/mace/kernels/opencl/cl/matmul.cl index 0509159cc2022325fd8af40fa914c3a9423636f2..c272e040e9802fa52b3dd9c7cf1e4e313013e127 100644 --- a/mace/kernels/opencl/cl/matmul.cl +++ b/mace/kernels/opencl/cl/matmul.cl @@ -1,7 +1,7 @@ #include // C = A * B -__kernel void 
matmul(KERNEL_ERROR_PARAMS +__kernel void matmul(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t A, __read_only image2d_t B, diff --git a/mace/kernels/opencl/cl/pad.cl b/mace/kernels/opencl/cl/pad.cl index 8e102d60c76f1acd7b8f2551e8021e45bdb3256e..ad323b76c32c15469d70002b45c579ba831ebc9c 100644 --- a/mace/kernels/opencl/cl/pad.cl +++ b/mace/kernels/opencl/cl/pad.cl @@ -1,6 +1,6 @@ #include -__kernel void pad(KERNEL_ERROR_PARAMS +__kernel void pad(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __write_only image2d_t output, diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl index 7b1b4607c6b03ed3f1b98d3dd7e39cec4e2e0a55..28987d3ca28c45e7c79c137e0c8a2973f6e8c4e4 100644 --- a/mace/kernels/opencl/cl/pooling.cl +++ b/mace/kernels/opencl/cl/pooling.cl @@ -16,7 +16,7 @@ inline int calculate_avg_block_size(const int pool_size_h, } // Supported data type: half/float -__kernel void pooling(KERNEL_ERROR_PARAMS +__kernel void pooling(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int in_height, diff --git a/mace/kernels/opencl/cl/pooling_buffer.cl b/mace/kernels/opencl/cl/pooling_buffer.cl new file mode 100644 index 0000000000000000000000000000000000000000..c4ecff9e19dfc737b57df69e24abc47add54bd6c --- /dev/null +++ b/mace/kernels/opencl/cl/pooling_buffer.cl @@ -0,0 +1,114 @@ +#include + +#define MIN_VALUE -FLT_MAX + +inline int calculate_avg_block_size(const int pool_size_h, + const int pool_size_w, + const int pos_h, + const int pos_w, + const int h_size, + const int w_size) { + const int h_start = max(0, pos_h); + const int w_start = max(0, pos_w); + const int h_end = min(pos_h + pool_size_h, h_size); + const int w_end = min(pos_w + pool_size_w, w_size); + return mul24((h_end - h_start), (w_end - w_start)); +} + +// Supported data type: half/float +__kernel void pooling(BUFFER_OUT_OF_RANGE_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 + __global 
IN_DATA_TYPE *input, + __private const int in_height, + __private const int in_width, + __private const int in_chan, + __private const int out_height, + __private const int out_chan, + __private const int pad_top, + __private const int pad_left, + __private const int stride_h, + __private const int stride_w, + __private const int kernel_h, + __private const int kernel_w, + __global OUT_DATA_TYPE *output) { + + const int out_chan_blk_idx = get_global_id(0); + const int out_width_idx = get_global_id(1); + const int out_hb_idx = get_global_id(2); + +#ifndef NON_UNIFORM_WORK_GROUP + if (out_chan_blk_idx >= global_size_dim0 || + out_width_idx >= global_size_dim1 || + out_hb_idx >= global_size_dim2) { + return; + } +#endif + const int out_width = global_size_dim1; + const int in_wc_size = mul24(in_width, in_chan); + + const int batch_idx = out_hb_idx / out_height; + const int out_height_idx = out_hb_idx % out_height; + const int chan_idx = out_chan_blk_idx << 2; + const int in_height_start = mul24(out_height_idx, stride_h) - pad_top; + const int in_width_start = mul24(out_width_idx, stride_w) - pad_left; + int in_offset_base = mad24(mad24(mad24(batch_idx, in_height, in_height_start), + in_width, in_width_start), in_chan, chan_idx); + +#ifdef POOL_AVG + DATA_TYPE4 res = 0; + for (int height = 0; height < kernel_h; ++height) { + int in_height_idx = in_height_start + height; + if (0 <= in_height_idx && in_height_idx < in_height) { + int in_offset = mad24(height, in_wc_size, in_offset_base); + for (int width = 0; width < kernel_w; ++width) { + int in_width_idx = in_width_start + width; + if (0 <= in_width_idx && in_width_idx < in_width) { + DATA_TYPE4 in = CONVERT4(vload4(0, input + in_offset)); + res = res + in; + } + in_offset += in_chan; + } + } + } + const int block_size = calculate_avg_block_size(kernel_h, + kernel_w, + in_height_start, + in_width_start, + in_height, + in_width); + res /= block_size; +#else + DATA_TYPE4 res = (DATA_TYPE4)(MIN_VALUE); + for (int height = 
0; height < kernel_h; ++height) { + int in_height_idx = in_height_start + height; + if (0 <= in_height_idx && in_height_idx < in_height) { + int in_offset = mad24(height, in_wc_size, in_offset_base); + for (int width = 0; width < kernel_w; ++width) { + int in_width_idx = in_width_start + width; + if (0 <= in_width_idx && in_width_idx < in_width) { + DATA_TYPE4 in = CONVERT4(vload4(0, input + in_offset)); + res = fmax(res, in); + } + in_offset += in_chan; + } + } + } +#endif + + const int out_offset = mad24(mad24(mad24(batch_idx, out_height, out_height_idx), + out_width, out_width_idx), out_chan, chan_idx); + int remain_chan = out_chan - chan_idx; + if (remain_chan < 4) { + switch(remain_chan) { + case 3: + output[out_offset + 2] = res.z; + case 2: + output[out_offset + 1] = res.y; + case 1: + output[out_offset] = res.x; + } + CHECK_OUT_OF_RANGE_FOR_BUFFER(out_offset + remain_chan - 1); + } else { + VSTORE4(CONVERT_TO(res, OUT_DATA_TYPE4), output, out_offset); + } +} diff --git a/mace/kernels/opencl/cl/reduce_mean.cl b/mace/kernels/opencl/cl/reduce_mean.cl index 5a23d1051930ee5a0b5c010938ab46b35eca5766..674c1c649a05261b1012c25e3324e682d4fbf56b 100644 --- a/mace/kernels/opencl/cl/reduce_mean.cl +++ b/mace/kernels/opencl/cl/reduce_mean.cl @@ -1,6 +1,6 @@ #include -__kernel void reduce_mean(KERNEL_ERROR_PARAMS +__kernel void reduce_mean(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __local DATA_TYPE4 *group_sum, diff --git a/mace/kernels/opencl/cl/resize_bicubic.cl b/mace/kernels/opencl/cl/resize_bicubic.cl index 767ee3911c3c2fdbc62e8c2c485e6c790e9fdeda..a2863a47f21e59bc91eed3eb91a089088002aa24 100644 --- a/mace/kernels/opencl/cl/resize_bicubic.cl +++ b/mace/kernels/opencl/cl/resize_bicubic.cl @@ -10,7 +10,7 @@ inline float coeff_odd(float i) { return ((-0.75f * x + 3.75f) * x - 6.0f) * x + 3.0f; } -__kernel void resize_bicubic_nocache(KERNEL_ERROR_PARAMS +__kernel void resize_bicubic_nocache(OUT_OF_RANGE_PARAMS 
GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __write_only image2d_t output, diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl index 8e1fb1e6ecdf987a822e2b8c2656d4162cbd984e..4aa3af9f3b66bcdc68c9b3c555f7ac2201f55147 100644 --- a/mace/kernels/opencl/cl/resize_bilinear.cl +++ b/mace/kernels/opencl/cl/resize_bilinear.cl @@ -1,6 +1,6 @@ #include -__kernel void resize_bilinear_nocache(KERNEL_ERROR_PARAMS +__kernel void resize_bilinear_nocache(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __write_only image2d_t output, diff --git a/mace/kernels/opencl/cl/softmax.cl b/mace/kernels/opencl/cl/softmax.cl index 361ea263cc13a79f561f633bc17112e1b6f0c2d7..39f8c89fe6e7b94bcfb803e1e6da1c562c86f320 100644 --- a/mace/kernels/opencl/cl/softmax.cl +++ b/mace/kernels/opencl/cl/softmax.cl @@ -1,6 +1,6 @@ #include -__kernel void softmax(KERNEL_ERROR_PARAMS +__kernel void softmax(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int channels, diff --git a/mace/kernels/opencl/cl/softmax_buffer.cl b/mace/kernels/opencl/cl/softmax_buffer.cl new file mode 100644 index 0000000000000000000000000000000000000000..8ec5d84c084d2c59811299aba70ad9bf9f4a05ef --- /dev/null +++ b/mace/kernels/opencl/cl/softmax_buffer.cl @@ -0,0 +1,88 @@ +#include + +__kernel void softmax(BUFFER_OUT_OF_RANGE_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 + __global IN_DATA_TYPE *input, + __private const int height, + __private const int channels, + __private const int remain_channels, + __global OUT_DATA_TYPE *output) { + const int chan_blk_idx = get_global_id(0); + const int width_idx = get_global_id(1); + const int hb_idx = get_global_id(2); + +#ifndef NON_UNIFORM_WORK_GROUP + if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 + || hb_idx >= global_size_dim2) { + return; + } +#endif + const int chan_blks = global_size_dim0 - 1; + const int width = 
global_size_dim1; + const int batch_idx = hb_idx / height; + const int height_idx = hb_idx % height; + const int chan_idx = chan_blk_idx << 2; + + const int offset_base = mul24(mad24(mad24(batch_idx, height, height_idx), + width, width_idx), channels); + int in_offset = offset_base; + DATA_TYPE max_value = -FLT_MAX; + DATA_TYPE sum = 0; + DATA_TYPE4 data; + for (short i = 0; i < chan_blks; ++i) { + data = CONVERT4(vload4(0, input + in_offset)); + max_value = max(max_value, data.x); + max_value = max(max_value, data.y); + max_value = max(max_value, data.z); + max_value = max(max_value, data.w); + in_offset += 4; + } + switch(remain_channels) { + case 0: + max_value = max(max_value, CONVERT(input[in_offset + 3])); + case 1: + max_value = max(max_value, CONVERT(input[in_offset + 2])); + case 2: + max_value = max(max_value, CONVERT(input[in_offset + 1])); + case 3: + max_value = max(max_value, CONVERT(input[in_offset])); + } + + in_offset = offset_base; + for (short i = 0; i < chan_blks; ++i) { + data = CONVERT4(vload4(0, input + in_offset)); + data = native_exp(data - max_value); + sum += data.x; + sum += data.y; + sum += data.z; + sum += data.w; + in_offset += 4; + } + switch(remain_channels) { + case 0: + sum += native_exp(CONVERT(input[in_offset + 3]) - max_value); + case 1: + sum += native_exp(CONVERT(input[in_offset + 2]) - max_value); + case 2: + sum += native_exp(CONVERT(input[in_offset + 1]) - max_value); + case 3: + sum += native_exp(CONVERT(input[in_offset]) - max_value); + } + + int remain_chan = channels - chan_idx; + int offset = offset_base + chan_idx; + if (remain_chan < 4) { + switch(remain_chan) { + case 3: + output[offset + 2] = native_exp(CONVERT(input[offset + 2]) - max_value) / sum; + case 2: + output[offset + 1] = native_exp(CONVERT(input[offset + 1]) - max_value) / sum; + case 1: + output[offset] = native_exp(CONVERT(input[offset]) - max_value) / sum; + } + } else { + data = CONVERT4(vload4(0, input + offset)); + data = native_exp(data - 
max_value) / sum; + VSTORE4(CONVERT_TO(data, OUT_DATA_TYPE4), output, offset); + } +} diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/kernels/opencl/cl/space_to_batch.cl index d6f325d8e8f7277bd11fecbc8d4a167b6ebe369a..dbdcea205be29ee904b17e5b3c1fe66de10f7584 100644 --- a/mace/kernels/opencl/cl/space_to_batch.cl +++ b/mace/kernels/opencl/cl/space_to_batch.cl @@ -1,6 +1,6 @@ #include -__kernel void space_to_batch(KERNEL_ERROR_PARAMS +__kernel void space_to_batch(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t space_data, __write_only image2d_t batch_data, diff --git a/mace/kernels/opencl/cl/space_to_depth.cl b/mace/kernels/opencl/cl/space_to_depth.cl index 604d665128288c7593a8106d652a609362e8a882..94dd38839406b269e02d3f6e69f04d20b1a5a72b 100644 --- a/mace/kernels/opencl/cl/space_to_depth.cl +++ b/mace/kernels/opencl/cl/space_to_depth.cl @@ -1,6 +1,6 @@ #include -__kernel void space_to_depth(KERNEL_ERROR_PARAMS +__kernel void space_to_depth(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int block_size, diff --git a/mace/kernels/opencl/cl/split.cl b/mace/kernels/opencl/cl/split.cl index 8f93742ec552e294d29f19310989e96e62bf4d54..3c648c40644e350600348ac8768c381db7de4079 100644 --- a/mace/kernels/opencl/cl/split.cl +++ b/mace/kernels/opencl/cl/split.cl @@ -1,6 +1,6 @@ #include -__kernel void split(KERNEL_ERROR_PARAMS +__kernel void split(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int chan_blk_offset, diff --git a/mace/kernels/opencl/cl/winograd_transform.cl b/mace/kernels/opencl/cl/winograd_transform.cl index 5e0a467fda8277532d66d787afbffa22626da443..018bede58561cd049b059f1cb7c7eca84a119923 100644 --- a/mace/kernels/opencl/cl/winograd_transform.cl +++ b/mace/kernels/opencl/cl/winograd_transform.cl @@ -1,6 +1,6 @@ #include -__kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS +__kernel void winograd_transform_2x2(OUT_OF_RANGE_PARAMS 
GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input, __write_only image2d_t output, @@ -118,7 +118,7 @@ __kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS } } -__kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS +__kernel void winograd_inverse_transform_2x2(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input, #ifdef BIAS @@ -231,7 +231,7 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS } -__kernel void winograd_transform_4x4(KERNEL_ERROR_PARAMS +__kernel void winograd_transform_4x4(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input, __write_only image2d_t output, @@ -390,7 +390,7 @@ __kernel void winograd_transform_4x4(KERNEL_ERROR_PARAMS } } -__kernel void winograd_inverse_transform_4x4(KERNEL_ERROR_PARAMS +__kernel void winograd_inverse_transform_4x4(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input, #ifdef BIAS diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index 6fa4ba8fedf2abc4365a2b83b16c0d90a51aeae0..12ba334f241965371e69e23102ab23f91c9de20d 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -13,191 +13,21 @@ // limitations under the License. 
#include "mace/kernels/concat.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" -#include "mace/utils/utils.h" +#include "mace/kernels/opencl/image/concat.h" namespace mace { namespace kernels { -namespace { -std::vector LocalWS(OpenCLRuntime *runtime, - const uint32_t *gws, - const uint32_t kwg_size) { - std::vector lws(4, 0); - if (kwg_size == 0) { - lws[0] = lws[1] = lws[2] = 1; +template +ConcatFunctor::ConcatFunctor( + OpKernelContext *context, + const int32_t axis) + : OpKernel(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::ConcatKernel(axis)); } else { - uint64_t - cache_size = runtime->device_global_mem_cache_size(); - uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - lws[0] = std::min(base, kwg_size / lws[1]); - const uint32_t lws_size = lws[0] * lws[1]; - lws[2] = - std::max(std::min(base, kwg_size / lws_size), 1); + MACE_NOT_IMPLEMENTED; } - return lws; -} - -} // namespace - -static MaceStatus Concat2(OpKernelContext *context, - cl::Kernel *kernel, - const Tensor *input0, - const Tensor *input1, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - StatsFuture *future, - uint32_t *kwg_size, - std::unique_ptr *kernel_error) { - const index_t batch = output->dim(0); - const index_t height = output->dim(1); - const index_t width = output->dim(2); - const index_t channel = output->dim(3); - - const int channel_blk = RoundUpDiv4(channel); - const uint32_t gws[3] = { - static_cast(channel_blk), static_cast(width), - static_cast(batch * height), - }; - - auto runtime = context->device()->opencl_runtime(); - - if (kernel->get() == nullptr) { - std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error, context); - NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); - built_options.emplace("-Dconcat_channel=" 
+ kernel_name); - if (input0->dtype() == output->dtype()) { - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - } else { - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - } - if (input0->dim(3) % 4 == 0) { - built_options.emplace("-DDIVISIBLE_FOUR"); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name, - built_options, kernel)); - - *kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - } - if (!IsVecEqual(*prev_input_shape, input0->shape())) { - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG_PTR; - SET_3D_GWS_ARGS_PTR(kernel, gws); - kernel->setArg(idx++, - *(static_cast(input0->opencl_image()))); - kernel->setArg(idx++, - *(static_cast(input1->opencl_image()))); - kernel->setArg(idx++, static_cast(input0->dim(3))); - kernel->setArg(idx++, - *(static_cast(output->opencl_image()))); - - *prev_input_shape = input0->shape(); - } - - const std::vector lws = LocalWS(runtime, gws, *kwg_size); - std::string tuning_key = - Concat("concat_opencl_kernel", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, - gws, lws, future)); - OUT_OF_RANGE_VALIDATION(*kernel_error); - return MACE_SUCCESS; -} - -static MaceStatus ConcatN(OpKernelContext *context, - cl::Kernel *kernel, - const std::vector &input_list, - const DataType dt, - Tensor *output, - StatsFuture *future, - uint32_t *kwg_size, - std::unique_ptr *kernel_error) { - const index_t batch = output->dim(0); - const index_t height = output->dim(1); - const index_t width = output->dim(2); - - auto runtime = context->device()->opencl_runtime(); - - if (kernel->get() == nullptr) { - std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error, context); - NON_UNIFORM_WG_CONFIG; - std::string kernel_name = 
MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); - built_options.emplace("-Dconcat_channel_multi=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name, - built_options, kernel)); - *kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - } - - const int inputs_count = input_list.size(); - index_t chan_blk_offset = 0; - cl::Event event; - CallStats call_stats{INT64_MAX, 0}; - for (int i = 0; i < inputs_count; ++i) { - const Tensor *input = input_list[i]; - index_t input_channel_blk = input->dim(3) / 4; - const uint32_t gws[3] = { - static_cast(input_channel_blk), static_cast(width), - static_cast(batch * height), - }; - const std::vector lws = LocalWS(runtime, gws, *kwg_size); - - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG_PTR; - SET_3D_GWS_ARGS_PTR(kernel, gws); - kernel->setArg(idx++, *(input->opencl_image())); - kernel->setArg(idx++, static_cast(chan_blk_offset)); - kernel->setArg(idx++, *(output->opencl_image())); - - chan_blk_offset += input_channel_blk; - cl_int error; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - *kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } else { - std::vector roundup_gws(lws.size()); - for (size_t j = 0; j < 3; ++j) { - roundup_gws[j] = RoundUp(gws[j], lws[j]); - } - error = runtime->command_queue().enqueueNDRangeKernel( - *kernel, cl::NullRange, - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } - MACE_CL_RET_STATUS(error); - OUT_OF_RANGE_VALIDATION(*kernel_error); - if (future != nullptr && runtime->is_profiling_enabled()) { - event.wait(); - CallStats tmp_stats; - runtime->GetCallStats(event, &tmp_stats); - call_stats.start_micros = - std::min(tmp_stats.start_micros, 
call_stats.start_micros); - call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros; - } - } - if (future != nullptr) { - future->wait_fn = [call_stats](CallStats *stats) { - if (stats != nullptr) { - stats->start_micros = call_stats.start_micros; - stats->end_micros = stats->start_micros + call_stats.end_micros; - } - }; - } - - return MACE_SUCCESS; } template @@ -205,52 +35,7 @@ MaceStatus ConcatFunctor::operator()( const std::vector &input_list, Tensor *output, StatsFuture *future) { - const int inputs_count = input_list.size(); - MACE_CHECK(inputs_count >= 2 && axis_ == 3) - << "Concat opencl kernel only support >=2 elements with axis == 3"; - - const Tensor *input0 = input_list[0]; - bool divisible_four = input0->dim(axis_) % 4 == 0; - - std::vector output_shape(input0->shape()); - for (int i = 1; i < inputs_count; ++i) { - const Tensor *input = input_list[i]; - MACE_CHECK(input->dim_size() == input0->dim_size(), - "Ranks of all input tensors must be same."); - divisible_four &= input->dim(axis_) % 4 == 0; - for (int j = 0; j < input->dim_size(); ++j) { - if (j == axis_) { - continue; - } - MACE_CHECK(input->dim(j) == input0->dim(j), - "Dimensions of inputs should equal except axis."); - } - output_shape[axis_] += input->dim(axis_); - } - MACE_CHECK( - inputs_count == 2 || divisible_four, - "Dimensions of inputs should be divisible by 4 when inputs_count > 2."); - std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); - - switch (inputs_count) { - case 2: - return Concat2(context_, - &kernel_, input_list[0], input_list[1], - DataTypeToEnum::value, &input_shape_, output, future, - &kwg_size_, &kernel_error_); - default: - if (divisible_four) { - return ConcatN(context_, - &kernel_, input_list, DataTypeToEnum::value, output, - future, &kwg_size_, &kernel_error_); - } else { - MACE_NOT_IMPLEMENTED; - } - } - - return MACE_SUCCESS; + 
return kernel_->Compute(context_, input_list, output, future); } template struct ConcatFunctor; diff --git a/mace/kernels/opencl/conv_2d.cc b/mace/kernels/opencl/conv_2d.cc index bc8538b77e9f9de56a6e51cdbdbcd905ff8f2a50..e21c4744603b2f0208b2f759ef95befb573c4e09 100644 --- a/mace/kernels/opencl/conv_2d.cc +++ b/mace/kernels/opencl/conv_2d.cc @@ -13,61 +13,37 @@ // limitations under the License. #include "mace/kernels/conv_2d.h" -#include "mace/kernels/opencl/helper.h" +#include "mace/kernels/opencl/image/conv_2d.h" +#include "mace/kernels/opencl/buffer/conv_2d.h" namespace mace { namespace kernels { -extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *runtime, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - StatsFuture *future, - uint32_t *kwg_size, - std::unique_ptr *kernel_error); +template +Conv2dFunctor::Conv2dFunctor( + OpKernelContext *context, + const int *strides, + const Padding &padding_type, + const std::vector &paddings, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const bool is_filter_transformed) + : Conv2dFunctorBase(context, + strides, + padding_type, + paddings, + dilations, + activation, + relux_max_limit) { + MACE_UNUSED(is_filter_transformed); -extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *runtime, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - StatsFuture *future, - uint32_t *kwg_size, - std::unique_ptr *kernel_error); - -extern MaceStatus Conv2dOpencl(OpKernelContext *runtime, - cl::Kernel *kernel, - 
const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - StatsFuture *future, - uint32_t *kwg_size, - std::unique_ptr *kernel_error); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::Conv2dKernel); + } else { + kernel_.reset(new opencl::buffer::Conv2dKernel); + } +} template MaceStatus Conv2dFunctor::operator()(const Tensor *input, @@ -75,61 +51,11 @@ MaceStatus Conv2dFunctor::operator()(const Tensor *input, const Tensor *bias, Tensor *output, StatsFuture *future) { - typedef MaceStatus (*Conv2dOpenclFunction)( - OpKernelContext *runtime, cl::Kernel * kernel, const Tensor *input, - const Tensor *filter, const Tensor *bias, const int stride, - const int *padding, const int *dilations, - const ActivationType activation, - const float relux_max_limit, const DataType dt, - std::vector *input_shape, Tensor *output, StatsFuture *future, - uint32_t *kwg_size, std::unique_ptr *kernel_error); - // Selection matrix: kernel_size x stride_size - static const Conv2dOpenclFunction selector[3] = { - Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3}; - - index_t kernel_h = filter->dim(2); - index_t kernel_w = filter->dim(3); - if (strides_[0] != strides_[1] || - (dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) { - LOG(WARNING) << "OpenCL conv2d kernel with " - << "filter" << kernel_h << "x" << kernel_w << "," - << " stride " << strides_[0] << "x" << strides_[1] - << ",dilations " << dilations_[0] << "x" << dilations_[1] - << " is not implemented yet."; - MACE_NOT_IMPLEMENTED; - } - - std::vector output_shape(4); - std::vector paddings(2); - if (paddings_.empty()) { - kernels::CalcNHWCPaddingAndOutputSize( - input->shape().data(), filter->shape().data(), dilations_, strides_, - padding_type_, output_shape.data(), 
paddings.data()); - } else { - paddings = paddings_; - CalcOutputSize(input->shape().data(), filter->shape().data(), - paddings_.data(), dilations_, strides_, RoundType::FLOOR, - output_shape.data()); - } - - std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - if (kernel_h == kernel_w && kernel_h <= 3 && - selector[kernel_h - 1] != nullptr) { - auto conv2d_func = selector[kernel_h - 1]; - return conv2d_func(context_, - &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, - activation_, relux_max_limit_, DataTypeToEnum::value, &input_shape_, - output, future, &kwg_size_, &kernel_error_); - } else { - return Conv2dOpencl(context_, - &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, - activation_, relux_max_limit_, DataTypeToEnum::value, &input_shape_, - output, future, &kwg_size_, &kernel_error_); - } + // Compute + return kernel_->Compute(context_, input, filter, bias, + strides_, padding_type_, paddings_, + dilations_, activation_, relux_max_limit_, + output, future); } template struct Conv2dFunctor; diff --git a/mace/kernels/opencl/crop.cc b/mace/kernels/opencl/crop.cc index fce91d2be483d62570fec85bd91515f8bb89e8d5..720b2c8cdb49bd2ea664b7e4fe1cae516527189e 100644 --- a/mace/kernels/opencl/crop.cc +++ b/mace/kernels/opencl/crop.cc @@ -13,170 +13,29 @@ // limitations under the License. 
#include "mace/kernels/crop.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" -#include "mace/utils/utils.h" +#include "mace/kernels/opencl/image/crop.h" namespace mace { namespace kernels { -namespace { -std::vector LocalWS(OpenCLRuntime *runtime, - const uint32_t *gws, - const uint32_t kwg_size) { - std::vector lws(4, 0); - if (kwg_size == 0) { - lws[0] = lws[1] = lws[2] = 1; +template +CropFunctor::CropFunctor(OpKernelContext *context, + const int axis, + const std::vector &offset) + : OpKernel(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::CropKernel(axis, offset)); } else { - uint64_t - cache_size = runtime->device_global_mem_cache_size(); - uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - lws[0] = std::min(base, kwg_size / lws[1]); - const uint32_t lws_size = lws[0] * lws[1]; - lws[2] = - std::max(std::min(base, kwg_size / lws_size), 1); + MACE_NOT_IMPLEMENTED; } - return lws; } -} // namespace - - template MaceStatus CropFunctor::operator()( const std::vector &input_list, Tensor *output, StatsFuture *future) { - MACE_UNUSED(future); - - const int32_t inputs_count = static_cast(input_list.size()); - MACE_CHECK(inputs_count >= 2) - << "Crop opencl kernel only support 2 elements input"; - const Tensor *input0 = input_list[0]; - const Tensor *input1 = input_list[1]; - const uint32_t in0_dims = static_cast(input0->dim_size()); - const uint32_t in1_dims = static_cast(input0->dim_size()); - MACE_CHECK(in0_dims == 4 && in1_dims == 4, - "Crop op only supports 4-dims inputs now."); - - std::vector offsets(4, 0); - - std::vector output_shape(input0->shape()); - switch (axis_) { - case 0: - if (offset_.size() == 1) { - offsets[0] = offset_[0]; - offsets[1] = offset_[0]; - offsets[2] = offset_[0]; - offsets[3] = offset_[0]; - } else if (offset_.size() == 4) { - offsets[0] = 
offset_[0]; - offsets[1] = offset_[2]; - offsets[2] = offset_[3]; - offsets[3] = offset_[1]; - } - for (int i = 0; i < 4; ++i) { - output_shape[i] = input1->dim(i); - } - break; - case 1: - if (offset_.size() == 1) { - offsets[1] = offset_[0]; - offsets[2] = offset_[0]; - offsets[3] = offset_[0]; - } else if (offset_.size() == 3) { - offsets[1] = offset_[1]; - offsets[2] = offset_[2]; - offsets[3] = offset_[0]; - } - for (int i = 1; i < 4; ++i) { - output_shape[i] = input1->dim(i); - } - break; - case 2: - if (offset_.size() == 1) { - offsets[1] = offset_[0]; - offsets[2] = offset_[0]; - } else if (offset_.size() == 2) { - offsets[1] = offset_[0]; - offsets[2] = offset_[1]; - } - output_shape[1] = input1->dim(1); - output_shape[2] = input1->dim(2); - break; - case 3: - if (offset_.size() == 1) { - offsets[2] = offset_[0]; - } - output_shape[2] = input1->dim(2); - break; - default: - MACE_CHECK(axis_ >= 0 && axis_ < 4, "axis is out of boundary."); - break; - } - MACE_CHECK(offsets[3] % 4 == 0, - "MACE opencl only supports cropping channel offset divisible by 4."); - for (index_t i = 0; i < 4; ++i) { - MACE_CHECK(input0->dim(i) - offsets[i] >= input1->dim(i)) - << "the crop for dimension" << i << "is out of bound with size" - << input1->dim(i) << "and offset" << offsets[i]; - } - std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); - - const index_t offset_chan_blk = RoundUpDiv4(offsets[3]); - const index_t channel_blk = RoundUpDiv4(output->dim(3)); - const uint32_t gws[3] = { - static_cast(channel_blk), static_cast(output->dim(2)), - static_cast(output->dim(0) * output->dim(1)) - }; - - auto runtime = context_->device()->opencl_runtime(); - - if (kernel_.get() == nullptr) { - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop"); - 
built_options.emplace("-Dcrop=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - if (!IsVecEqual(input_shape_, input0->shape())) { - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_3D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(input0->opencl_image())); - kernel_.setArg(idx++, static_cast(offsets[0])); - kernel_.setArg(idx++, static_cast(offsets[1])); - kernel_.setArg(idx++, static_cast(offsets[2])); - kernel_.setArg(idx++, static_cast(offset_chan_blk)); - kernel_.setArg(idx++, static_cast(input0->dim(1))); - kernel_.setArg(idx++, static_cast(input0->dim(2))); - kernel_.setArg(idx++, static_cast(output->dim(1))); - kernel_.setArg(idx++, static_cast(output->dim(2))); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input0->shape(); - } - - const std::vector lws = LocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("crop_opencl_kernel", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); - OUT_OF_RANGE_VALIDATION(kernel_error_); - return MACE_SUCCESS; + return kernel_->Compute(context_, input_list, output, future); } template struct CropFunctor; diff --git a/mace/kernels/opencl/deconv_2d.cc b/mace/kernels/opencl/deconv_2d.cc index 197b305e7e80b10d121c883d417c59a71d2abd9e..4911e26beb36f53aba64f6451d854066d2ceb614 100644 --- a/mace/kernels/opencl/deconv_2d.cc +++ b/mace/kernels/opencl/deconv_2d.cc @@ -13,140 +13,34 @@ // limitations under the License. 
#include "mace/kernels/deconv_2d.h" -#include "mace/kernels/opencl/helper.h" +#include "mace/kernels/opencl/image/deconv_2d.h" namespace mace { namespace kernels { -namespace { - -MaceStatus Deconv2dOpencl(OpKernelContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *strides, - const int *paddings, - const ActivationType activation, - const float relux_max_limit, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - StatsFuture *future, - uint32_t *kwg_size, - std::unique_ptr *kernel_error) { - const index_t batch = output->dim(0); - const index_t height = output->dim(1); - const index_t width = output->dim(2); - const index_t channels = output->dim(3); - const index_t input_channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - const index_t input_channel_blocks = RoundUpDiv4(input_channels); - const int stride_h = strides[0]; - const int stride_w = strides[1]; - MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0."); -#define MACE_WIDTH_BLK 5 - const index_t n_strides = (width + stride_w - 1) / stride_w; - const index_t width_blocks = - ((n_strides + MACE_WIDTH_BLK - 1) / MACE_WIDTH_BLK) * stride_w; - const float stride_h_r = 1.f / static_cast(stride_h); - const float stride_w_r = 1.f / static_cast(stride_w); - const int padding_h = (paddings[0] + 1) >> 1; - const int padding_w = (paddings[1] + 1) >> 1; - - const int align_h = stride_h - 1 - padding_h; - const int align_w = stride_w - 1 - padding_w; - const int kernel_size = filter->dim(2) * filter->dim(3); - - auto runtime = context->device()->opencl_runtime(); - - if (kernel->get() == nullptr) { - std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error, context); - NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d"); - built_options.emplace("-Ddeconv_2d=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - 
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - built_options.emplace(bias != nullptr ? "-DBIAS" : ""); - switch (activation) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - default: - LOG(FATAL) << "Unknown activation type: " << activation; - } - - MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name, - built_options, kernel)); - - *kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - } - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width_blocks), - static_cast(height * batch)}; - - if (!IsVecEqual(*prev_input_shape, input->shape())) { - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG_PTR; - SET_3D_GWS_ARGS_PTR(kernel, gws); - kernel->setArg(idx++, *(input->opencl_image())); - kernel->setArg(idx++, *(filter->opencl_image())); - if (bias != nullptr) { - kernel->setArg(idx++, *(bias->opencl_image())); - } - kernel->setArg(idx++, *(output->opencl_image())); - kernel->setArg(idx++, relux_max_limit); - kernel->setArg(idx++, static_cast(input->dim(1))); - kernel->setArg(idx++, static_cast(input->dim(2))); - kernel->setArg(idx++, static_cast(input->dim(3))); - kernel->setArg(idx++, static_cast(height)); - kernel->setArg(idx++, static_cast(width)); - kernel->setArg(idx++, static_cast(channels)); - kernel->setArg(idx++, static_cast(stride_h)); - kernel->setArg(idx++, static_cast(stride_w)); - kernel->setArg(idx++, stride_h_r); - kernel->setArg(idx++, stride_w_r); - kernel->setArg(idx++, static_cast(align_h)); - kernel->setArg(idx++, static_cast(align_w)); - kernel->setArg(idx++, static_cast(padding_h)); - kernel->setArg(idx++, static_cast(padding_w)); - kernel->setArg(idx++, static_cast(filter->dim(2))); - kernel->setArg(idx++, static_cast(filter->dim(3))); - 
kernel->setArg(idx++, static_cast(kernel_size)); - kernel->setArg(idx++, static_cast(input_channel_blocks)); - kernel->setArg(idx++, static_cast(channel_blocks)); - - *prev_input_shape = input->shape(); +template +Deconv2dFunctor::Deconv2dFunctor( + OpKernelContext *context, + const std::vector &strides, + const Padding &padding_type, + const std::vector &paddings, + const std::vector &output_shape, + const ActivationType activation, + const float relux_max_limit) + : Deconv2dFunctorBase(context, + strides, + padding_type, + paddings, + output_shape, + activation, + relux_max_limit) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::Deconv2dKernel); + } else { + MACE_NOT_IMPLEMENTED; } - - const std::vector lws = Default3DLocalWS(runtime, gws, *kwg_size); - std::string tuning_key = - Concat("deconv2d_opencl_kernel_", activation, output->dim(0), - output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, - gws, lws, future)); - - OUT_OF_RANGE_VALIDATION(*kernel_error); - return MACE_SUCCESS; } -} // namespace - template MaceStatus Deconv2dFunctor::operator()( const Tensor *input, @@ -188,16 +82,10 @@ MaceStatus Deconv2dFunctor::operator()( output_shape.data(), paddings.data()); } - std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - return Deconv2dOpencl(context_, &kernel_, input, filter, bias, - strides_.data(), paddings.data(), activation_, - relux_max_limit_, DataTypeToEnum::value, - &input_shape_, output, future, - &kwg_size_, &kernel_error_); + return kernel_->Compute(context_, input, filter, bias, + strides_.data(), paddings.data(), activation_, + relux_max_limit_, output_shape, output, future); } template struct Deconv2dFunctor; diff --git a/mace/kernels/opencl/depth_to_space.cc 
b/mace/kernels/opencl/depth_to_space.cc index d6d8ba82c1a72c854f8548bf38aa7b324f938541..2ab670d771266d2f7dc7d6a6e64587c074bce6ae 100644 --- a/mace/kernels/opencl/depth_to_space.cc +++ b/mace/kernels/opencl/depth_to_space.cc @@ -13,98 +13,26 @@ // limitations under the License. #include "mace/kernels/depth_to_space.h" -#include "mace/core/runtime/opencl/cl2_header.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" -#include "mace/utils/utils.h" +#include "mace/kernels/opencl/image/depth_to_space.h" namespace mace { namespace kernels { +template +DepthToSpaceOpFunctor::DepthToSpaceOpFunctor( + OpKernelContext *context, + const int block_size) + : OpKernel(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::DepthToSpaceKernel(block_size)); + } else { + MACE_NOT_IMPLEMENTED; + } +} template MaceStatus DepthToSpaceOpFunctor::operator()( const Tensor *input, Tensor *output, StatsFuture *future) { - const index_t batch = input->dim(0); - const index_t input_height = input->dim(1); - const index_t input_width = input->dim(2); - const index_t input_depth = input->dim(3); - - MACE_CHECK(input_depth % (block_size_ * block_size_) == 0, - "input depth should be dividable by block_size * block_size", - input_depth); - MACE_CHECK((input_depth % 4) == 0, - "input channel should be dividable by 4"); - - const index_t output_height = input_height * block_size_; - const index_t output_width = input_width * block_size_; - const index_t output_depth = input_depth / (block_size_ * block_size_); - MACE_CHECK(output_depth % 4 == 0, "output channel not support:") - << output_depth; - - const index_t input_depth_blocks = RoundUpDiv4(input_depth); - const index_t output_depth_blocks = RoundUpDiv4(output_depth); - - std::vector output_shape = {batch, - output_height, - output_width, - output_depth}; - std::vector image_shape; - CalImage2DShape(output_shape, 
BufferType::IN_OUT_CHANNEL, &image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); - - const uint32_t gws[3] = { - static_cast(RoundUpDiv4(output_depth)), - static_cast(output_width), - static_cast(output_height * batch) - }; - auto runtime = context_->device()->opencl_runtime(); - - if (kernel_.get() == nullptr) { - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - const char *kernel_name = kernel_name = "depth_to_space"; - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - built_options.emplace(kernel_name_ss.str()); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space", - obfuscated_kernel_name, - built_options, - &kernel_)); - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_3D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, static_cast(block_size_)); - kernel_.setArg(idx++, static_cast(input_height * batch)); - kernel_.setArg(idx++, static_cast(input_width)); - kernel_.setArg(idx++, static_cast(input_depth_blocks)); - kernel_.setArg(idx++, static_cast(output_width)); - kernel_.setArg(idx++, static_cast(output_depth_blocks)); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input->shape(); - } - - std::string tuning_key = Concat("depth_to_space_opencl_kernel", - batch, output_height, - output_width, output_depth); - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); - - 
OUT_OF_RANGE_VALIDATION(kernel_error_); - return MACE_SUCCESS; + return kernel_->Compute(context_, input, output, future); } template struct DepthToSpaceOpFunctor; diff --git a/mace/kernels/opencl/depthwise_conv2d.cc b/mace/kernels/opencl/depthwise_conv2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..29f028764a64404de7ab6691d426aa54c7b1672f --- /dev/null +++ b/mace/kernels/opencl/depthwise_conv2d.cc @@ -0,0 +1,61 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/kernels/depthwise_conv2d.h" +#include "mace/kernels/opencl/buffer/depthwise_conv2d.h" +#include "mace/kernels/opencl/image/depthwise_conv2d.h" + +namespace mace { +namespace kernels { +template +DepthwiseConv2dFunctor::DepthwiseConv2dFunctor( + OpKernelContext *context, + const int *strides, + const Padding padding_type, + const std::vector &paddings, + const int *dilations, + const ActivationType activation, + const float relux_max_limit) + : DepthwiseConv2dFunctorBase(context, + strides, + padding_type, + paddings, + dilations, + activation, + relux_max_limit) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::DepthwiseConv2dKernel); + } else { + kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel); + } +} + +template +MaceStatus DepthwiseConv2dFunctor::operator()( + const Tensor *input, + const Tensor *filter, /* MIHW */ + const Tensor *bias, + Tensor *output, + StatsFuture *future) { + return kernel_->Compute(context_, input, filter, bias, + strides_, padding_type_, paddings_, + dilations_, activation_, relux_max_limit_, + output, future); +} + +template struct DepthwiseConv2dFunctor; +template struct DepthwiseConv2dFunctor; + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/opencl/eltwise.cc b/mace/kernels/opencl/eltwise.cc index 201639e31bad24abc3c61053596b89d5fc7a25d7..e43e21987dbf9d8dfd79f836b31e8692fea4fe3b 100644 --- a/mace/kernels/opencl/eltwise.cc +++ b/mace/kernels/opencl/eltwise.cc @@ -13,125 +13,33 @@ // limitations under the License. 
#include "mace/kernels/eltwise.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" +#include "mace/kernels/opencl/image/eltwise.h" namespace mace { namespace kernels { +template +EltwiseFunctor::EltwiseFunctor( + OpKernelContext *context, + const EltwiseType type, + const std::vector &coeff, + const float scalar_input, + const int32_t scalar_input_index, + const DataFormat data_format) : OpKernel(context) { + MACE_UNUSED(data_format); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::EltwiseKernel( + type, coeff, scalar_input, scalar_input_index)); + } else { + MACE_NOT_IMPLEMENTED; + } +} template MaceStatus EltwiseFunctor::operator()(const Tensor *input0, const Tensor *input1, Tensor *output, StatsFuture *future) { - MACE_UNUSED(future); - bool swapped = false; - if (input1 != nullptr) { - MACE_CHECK(input0->dim_size() == input1->dim_size() || - input0->dim_size() == 1 || input1->dim_size() == 1) - << "Inputs of Eltwise op must be same shape"; - if (input0->size() != input1->size()) { - if (input0->size() < input1->size()) { - std::swap(input0, input1); - swapped = true; - } - if (input1->dim_size() == 1) { - MACE_CHECK(input0->dim(3) == input1->dim(0)) - << "Element-Wise op only support channel dimension broadcast"; - } else { - MACE_CHECK((input0->dim(0) == input1->dim(0) || input1->dim(0) == 1) && - input0->dim(3) == input1->dim(3) && input1->dim(1) == 1 && - input1->dim(2) == 1) - << "Element-Wise op only support channel dimension broadcast"; - } - } - } - - if (scalar_input_index_ == 0) { - swapped = !swapped; - } - - std::vector output_shape(4); - output_shape[0] = input0->dim(0); - output_shape[1] = input0->dim(1); - output_shape[2] = input0->dim(2); - output_shape[3] = input0->dim(3); - - std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); - 
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - const index_t batch = output->dim(0); - const index_t height = output->dim(1); - const index_t width = output->dim(2); - const index_t channels = output->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - const index_t batch_height_pixels = batch * height; - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(batch_height_pixels)}; - - auto runtime = context_->device()->opencl_runtime(); - if (kernel_.get() == nullptr) { - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); - built_options.emplace("-Deltwise=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); - if (input1 == nullptr) { - built_options.emplace("-DINPUT_TYPE=1"); - } else if (input0->size() != input1->size()) { - if (input1->dim(0) == 1 || input1->dim_size() == 1) - built_options.emplace("-DINPUT_TYPE=3"); - else - built_options.emplace("-DINPUT_TYPE=2"); - if (swapped) built_options.emplace("-DSWAPPED"); - } - if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - if (!IsVecEqual(input_shape_, input0->shape())) { - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_3D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(input0->opencl_image())); - if (input1 == nullptr) { - kernel_.setArg(idx++, scalar_input_); - } else { - kernel_.setArg(idx++, *(input1->opencl_image())); - } - kernel_.setArg(idx++, static_cast(height)); - kernel_.setArg(idx++, static_cast(width)); - 
kernel_.setArg(idx++, static_cast(channels)); - if (!coeff_.empty()) { - kernel_.setArg(idx++, coeff_[0]); - kernel_.setArg(idx++, coeff_[1]); - } - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input0->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); - OUT_OF_RANGE_VALIDATION(kernel_error_); - return MACE_SUCCESS; + return kernel_->Compute(context_, input0, input1, output, future); } template struct EltwiseFunctor; diff --git a/mace/kernels/opencl/fully_connected.cc b/mace/kernels/opencl/fully_connected.cc index 2af592c761976cad0414d2b16288b09b99a48d49..3dd0db4c622f05181c8bd538a54a566df7072189 100644 --- a/mace/kernels/opencl/fully_connected.cc +++ b/mace/kernels/opencl/fully_connected.cc @@ -13,239 +13,23 @@ // limitations under the License. 
#include "mace/kernels/fully_connected.h" - -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" +#include "mace/kernels/opencl/image/fully_connected.h" namespace mace { namespace kernels { -namespace { template -MaceStatus FCWXKernel(OpKernelContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *weight, - const Tensor *bias, - std::vector *prev_input_shape, - Tensor *output, - const ActivationType activation, - std::vector *gws, - std::vector *lws, - const float relux_max_limit, - StatsFuture *future, - std::unique_ptr *kernel_error) { - MACE_CHECK_NOTNULL(gws); - MACE_CHECK_NOTNULL(lws); - auto runtime = context->device()->opencl_runtime(); - - if (kernel->get() == nullptr) { - const index_t batch = output->dim(0); - const index_t output_size = output->dim(3); - const index_t output_blocks = RoundUpDiv4(output_size); - - std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error, context); - NON_UNIFORM_WG_CONFIG; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width"); - built_options.emplace("-Dfully_connected_width=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - if (bias != nullptr) { - built_options.emplace("-DBIAS"); - } - switch (activation) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - default: - LOG(FATAL) << "Unknown activation type: " << activation; - } - if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { - built_options.emplace("-DNON_QUALCOMM_ADRENO"); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name, - built_options, kernel)); - - if (runtime->gpu_type() == 
GPUType::QUALCOMM_ADRENO) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - const uint32_t wave_size = - static_cast(runtime->GetKernelWaveSize(*kernel)); - - *gws = {4, (wave_size / 4), static_cast(batch * output_blocks)}; - - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]); - *lws = {(*gws)[0], (*gws)[1], inter_local_blks}; - } else { - *gws = {4, 8, static_cast(batch * output_blocks)}; - - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]); - *lws = {(*gws)[0], (*gws)[1], inter_local_blks}; - } - } - if (!IsVecEqual(*prev_input_shape, input->shape())) { - const index_t batch = output->dim(0); - const index_t output_blocks = RoundUpDiv4(output->dim(3)); - (*gws)[2] = static_cast(batch * output_blocks); - - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG_PTR; - SET_3D_GWS_ARGS_PTR(kernel, *gws); - kernel->setArg(idx++, *(input->opencl_image())); - kernel->setArg(idx++, *(weight->opencl_image())); - if (bias != nullptr) { - kernel->setArg(idx++, *(bias->opencl_image())); - } - kernel->setArg(idx++, *(output->opencl_image())); - kernel->setArg(idx++, ((*lws)[0] * (*lws)[1] * (*lws)[2] * sizeof(float)), - nullptr); - kernel->setArg(idx++, static_cast(input->dim(1))); - kernel->setArg(idx++, static_cast(input->dim(2))); - kernel->setArg(idx++, static_cast(RoundUpDiv4(input->dim(3)))); - kernel->setArg(idx++, static_cast(output_blocks)); - kernel->setArg(idx++, relux_max_limit); - - *prev_input_shape = input->shape(); - } - cl::Event event; - cl_int error; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - *kernel, cl::NullRange, cl::NDRange((*gws)[0], (*gws)[1], (*gws)[2]), - cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event); +FullyConnectedFunctor::FullyConnectedFunctor( + 
OpKernelContext *context, + const ActivationType activation, + const float relux_max_limit) + : FullyConnectedBase(context, activation, relux_max_limit) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::FullyConnectedKernel); } else { - std::vector roundup_gws(lws->size()); - for (size_t i = 0; i < lws->size(); ++i) { - roundup_gws[i] = RoundUp((*gws)[i], (*lws)[i]); - } - error = runtime->command_queue().enqueueNDRangeKernel( - *kernel, cl::NullRange, - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), - cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event); + MACE_NOT_IMPLEMENTED; } - OUT_OF_RANGE_VALIDATION(*kernel_error); - MACE_CL_RET_STATUS(error); - - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } - - return MACE_SUCCESS; } - -template -MaceStatus FCWTXKernel(OpKernelContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *weight, - const Tensor *bias, - std::vector *prev_input_shape, - Tensor *output, - const ActivationType activation, - std::vector *gws, - std::vector *lws, - const float relux_max_limit, - StatsFuture *future, - std::unique_ptr *kernel_error) { - MACE_CHECK_NOTNULL(gws); - MACE_CHECK_NOTNULL(lws); - auto runtime = context->device()->opencl_runtime(); - if (kernel->get() == nullptr) { - std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error, context); - NON_UNIFORM_WG_CONFIG; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected"); - built_options.emplace("-Dfully_connected=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - if (bias != nullptr) { - built_options.emplace("-DBIAS"); - } - switch (activation) { - case NOOP: - break; - case RELU: - 
built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - default: - LOG(FATAL) << "Unknown activation type: " << activation; - } - MACE_RETURN_IF_ERROR( - runtime->BuildKernel("fully_connected", kernel_name, - built_options, kernel)); - - uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - *lws = {16, kwg_size / 16, 0}; - } - if (!IsVecEqual(*prev_input_shape, input->shape())) { - const index_t batch = output->dim(0); - const index_t output_blocks = RoundUpDiv4(output->dim(3)); - - *gws = { - static_cast(batch), static_cast(output_blocks), - }; - - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG_PTR; - SET_2D_GWS_ARGS_PTR(kernel, *gws); - kernel->setArg(idx++, *(input->opencl_image())); - kernel->setArg(idx++, *(weight->opencl_image())); - if (bias != nullptr) { - kernel->setArg(idx++, *(bias->opencl_image())); - } - kernel->setArg(idx++, *(output->opencl_image())); - kernel->setArg(idx++, static_cast(input->dim(1))); - kernel->setArg(idx++, static_cast(input->dim(2))); - kernel->setArg(idx++, static_cast(input->dim(3))); - // FIXME handle flexable data type: half not supported - kernel->setArg(idx++, relux_max_limit); - - *prev_input_shape = input->shape(); - } - - std::string tuning_key = - Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), - output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, - gws->data(), *lws, future)); - - OUT_OF_RANGE_VALIDATION(*kernel_error); - return MACE_SUCCESS; -} -} // namespace - template MaceStatus FullyConnectedFunctor::operator()( const Tensor *input, @@ -253,16 +37,9 @@ MaceStatus FullyConnectedFunctor::operator()( const Tensor *bias, Tensor *output, StatsFuture *future) { - std::vector output_shape = {input->dim(0), 1, 1, weight->dim(0)}; - std::vector 
output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - return FCWXKernel(context_, - &kernel_, input, weight, bias, &input_shape_, output, - activation_, &gws_, &lws_, relux_max_limit_, future, - &kernel_error_); + return kernel_->Compute( + context_, input, weight, bias, activation_, relux_max_limit_, + output, future); } template struct FullyConnectedFunctor; diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index d9e309bc2c19045ffcd9eb4f373fb9dc7b208f61..0126d2ac2cab143c85b59016b0d7c305a944d9bd 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -30,60 +30,61 @@ namespace mace { namespace kernels { +// oorc for 'Out Of Range Check' +#define MACE_OUT_OF_RANGE_DEFINITION \ + std::shared_ptr oorc_flag; -#define OUT_OF_RANGE_CONFIG(kernel_error, context) \ - if (runtime->IsOutOfRangeCheckEnabled()) { \ - built_options.emplace("-DOUT_OF_RANGE_CHECK"); \ - (kernel_error) = std::move(std::unique_ptr( \ - new Buffer((context)->device()->allocator()))); \ - MACE_RETURN_IF_ERROR((kernel_error)->Allocate(1)); \ - (kernel_error)->Map(nullptr); \ - *((kernel_error)->mutable_data()) = 0; \ - (kernel_error)->UnMap(); \ +#define MACE_OUT_OF_RANGE_CONFIG \ + if (runtime->IsOutOfRangeCheckEnabled()) { \ + built_options.emplace("-DOUT_OF_RANGE_CHECK"); \ } -#define OUT_OF_RANGE_SET_ARG \ - if (runtime->IsOutOfRangeCheckEnabled()) { \ - kernel_.setArg(idx++, \ - *(static_cast(kernel_error_->buffer()))); \ +#define MACE_OUT_OF_RANGE_INIT(kernel) \ + if (runtime->IsOutOfRangeCheckEnabled()) { \ + oorc_flag = std::move(std::unique_ptr( \ + new Buffer((context)->device()->allocator()))); \ + MACE_RETURN_IF_ERROR((oorc_flag)->Allocate(sizeof(int))); \ + oorc_flag->Map(nullptr); \ + *(oorc_flag->mutable_data()) = 0; \ + oorc_flag->UnMap(); \ + (kernel).setArg(0, \ + *(static_cast(oorc_flag->buffer())));\ } 
-#define OUT_OF_RANGE_SET_ARG_PTR \ - if (runtime->IsOutOfRangeCheckEnabled()) { \ - kernel->setArg(idx++, \ - *(static_cast((*kernel_error)->buffer()))); \ +#define MACE_OUT_OF_RANGE_SET_ARGS(kernel) \ + if (runtime->IsOutOfRangeCheckEnabled()) { \ + (kernel).setArg(idx++, \ + *(static_cast(oorc_flag->buffer())));\ } -#define OUT_OF_RANGE_VALIDATION(kernel_error) \ - if (runtime->IsOutOfRangeCheckEnabled()) { \ - (kernel_error)->Map(nullptr); \ - char *kerror_code = (kernel_error)->mutable_data(); \ - MACE_CHECK(*kerror_code == 0, "Kernel error code: ", *kerror_code);\ - (kernel_error)->UnMap(); \ +#define MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel, size) \ + if (runtime->IsOutOfRangeCheckEnabled()) { \ + (kernel).setArg(idx++, \ + *(static_cast(oorc_flag->buffer()))); \ + (kernel).setArg(idx++, static_cast(size)); \ } -#define NON_UNIFORM_WG_CONFIG \ +#define MACE_OUT_OF_RANGE_VALIDATION \ + if (runtime->IsOutOfRangeCheckEnabled()) { \ + oorc_flag->Map(nullptr); \ + int *kerror_code = oorc_flag->mutable_data(); \ + MACE_CHECK(*kerror_code == 0, "Kernel error code: ", *kerror_code); \ + oorc_flag->UnMap(); \ + } + +#define MACE_NON_UNIFORM_WG_CONFIG \ if (runtime->IsNonUniformWorkgroupsSupported()) { \ built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); \ } -#define SET_3D_GWS_ARGS(kernel) \ - kernel.setArg(idx++, gws[0]); \ - kernel.setArg(idx++, gws[1]); \ - kernel.setArg(idx++, gws[2]); - -#define SET_2D_GWS_ARGS(kernel) \ - kernel.setArg(idx++, gws[0]); \ - kernel.setArg(idx++, gws[1]); - -#define SET_3D_GWS_ARGS_PTR(kernel, gws) \ - kernel->setArg(idx++, (gws)[0]); \ - kernel->setArg(idx++, (gws)[1]); \ - kernel->setArg(idx++, (gws)[2]); +#define MACE_SET_3D_GWS_ARGS(kernel, gws) \ + (kernel).setArg(idx++, (gws)[0]); \ + (kernel).setArg(idx++, (gws)[1]); \ + (kernel).setArg(idx++, (gws)[2]); -#define SET_2D_GWS_ARGS_PTR(kernel, gws) \ - kernel->setArg(idx++, (gws)[0]); \ - kernel->setArg(idx++, (gws)[1]); +#define MACE_SET_2D_GWS_ARGS(kernel, gws) \ + 
(kernel).setArg(idx++, (gws)[0]); \ + (kernel).setArg(idx++, (gws)[1]); // Max execution time of OpenCL kernel for tuning to prevent UI stuck. const float kMaxKernelExecTime = 1000.0; // microseconds @@ -114,6 +115,10 @@ std::string DtToCLDt(const DataType dt); // e.g. half -> float std::string DtToUpCompatibleCLDt(const DataType dt); +// CPU data type to OpenCL condition data type used in select +// e.g. half -> float +std::string DtToCLCondDt(const DataType dt); + // Tuning or Run OpenCL kernel with 3D work group size MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime, const cl::Kernel &kernel, @@ -167,6 +172,7 @@ std::string Concat(Args... args) { std::vector Default3DLocalWS(OpenCLRuntime *runtime, const uint32_t *gws, const uint32_t kwg_size); + } // namespace kernels } // namespace mace #endif // MACE_KERNELS_OPENCL_HELPER_H_ diff --git a/mace/kernels/opencl/image/activation.h b/mace/kernels/opencl/image/activation.h new file mode 100644 index 0000000000000000000000000000000000000000..5ddf00ac24d14953cd56a75a3e097c95adee0869 --- /dev/null +++ b/mace/kernels/opencl/image/activation.h @@ -0,0 +1,147 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#ifndef MACE_KERNELS_OPENCL_IMAGE_ACTIVATION_H_ +#define MACE_KERNELS_OPENCL_IMAGE_ACTIVATION_H_ + +#include "mace/kernels/activation.h" + +#include +#include +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { + +template +class ActivationKernel : public OpenCLActivationKernel { + public: + ActivationKernel(ActivationType type, + T relux_max_limit) + : activation_(type), relux_max_limit_(relux_max_limit) {} + + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *alpha, + Tensor *output, + StatsFuture *future) override; + + private: + ActivationType activation_; + T relux_max_limit_; + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; + std::string tuning_key_prefix_; +}; + +template +MaceStatus ActivationKernel::Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *alpha, + Tensor *output, + StatsFuture *future) { + const index_t batch = input->dim(0); + const index_t height = input->dim(1); + const index_t width = input->dim(2); + const index_t channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); + built_options.emplace("-Dactivation=" + kernel_name); + auto dt = DataTypeToEnum::value; + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + switch (activation_) { + case RELU: + tuning_key_prefix_ = "relu_opencl_kernel"; + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + tuning_key_prefix_ = "relux_opencl_kernel"; + built_options.emplace("-DUSE_RELUX"); + break; + case PRELU: + tuning_key_prefix_ = 
"prelu_opencl_kernel"; + built_options.emplace("-DUSE_PRELU"); + break; + case TANH: + tuning_key_prefix_ = "tanh_opencl_kernel"; + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + tuning_key_prefix_ = "sigmoid_opencl_kernel"; + built_options.emplace("-DUSE_SIGMOID"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation_; + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + int idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + if (activation_ == PRELU) { + MACE_CHECK_NOTNULL(alpha); + kernel_.setArg(idx++, *(alpha->opencl_image())); + } + kernel_.setArg(idx++, static_cast(relux_max_limit_)); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), + output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); + + MACE_OUT_OF_RANGE_VALIDATION; + return MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_ACTIVATION_H_ diff --git a/mace/kernels/opencl/image/addn.h b/mace/kernels/opencl/image/addn.h new file mode 100644 index 0000000000000000000000000000000000000000..49721c0938d3427e44db708ad4f6656645ea13ba --- /dev/null +++ b/mace/kernels/opencl/image/addn.h @@ -0,0 +1,135 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef MACE_KERNELS_OPENCL_IMAGE_ADDN_H_ +#define MACE_KERNELS_OPENCL_IMAGE_ADDN_H_ + +#include "mace/kernels/addn.h" + +#include +#include +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { + +template +class AddNKernel : public OpenCLAddNKernel { + public: + MaceStatus Compute( + OpKernelContext *context, + const std::vector &input_tensors, + Tensor *output_tensor, + StatsFuture *future) override; + + private: + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; + +template +MaceStatus AddNKernel::Compute( + OpKernelContext *context, + const std::vector &input_tensors, + Tensor *output_tensor, + StatsFuture *future) { + size_t size = input_tensors.size(); + MACE_CHECK(size >= 2 && input_tensors[0] != nullptr); + + const index_t batch = input_tensors[0]->dim(0); + const index_t height = input_tensors[0]->dim(1); + const index_t width = input_tensors[0]->dim(2); + const index_t channels = input_tensors[0]->dim(3); + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + for (size_t i = 1; i < size; ++i) { + MACE_CHECK_NOTNULL(input_tensors[i]); + MACE_CHECK(batch == input_tensors[i]->dim(0)); + MACE_CHECK(height == input_tensors[i]->dim(1)); + MACE_CHECK(width == input_tensors[i]->dim(2)); + MACE_CHECK(channels == input_tensors[i]->dim(3)); + } + + if 
(kernel_.get() == nullptr) { + if (input_tensors.size() > 4) { + MACE_NOT_IMPLEMENTED; + } + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + auto dt = DataTypeToEnum::value; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn"); + built_options.emplace("-Daddn=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size())); + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + std::vector output_shape = input_tensors[0]->shape(); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t width_pixels = channel_blocks * width; + const index_t batch_height_pixels = batch * height; + + const uint32_t gws[2] = {static_cast(width_pixels), + static_cast(batch_height_pixels)}; + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) { + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR( + output_tensor->ResizeImage(output_shape, output_image_shape)); + + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_2D_GWS_ARGS(kernel_, gws); + for (auto input : input_tensors) { + kernel_.setArg(idx++, *(input->opencl_image())); + } + kernel_.setArg(idx++, *(output_tensor->opencl_image())); + + input_shape_ = input_tensors[0]->shape(); + } + + const std::vector lws = {kwg_size_ / 16, 16, 0}; + std::string tuning_key = + Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1), + output_tensor->dim(2), output_tensor->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); + MACE_OUT_OF_RANGE_VALIDATION; + return MACE_SUCCESS; +} + 
+} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_ADDN_H_ diff --git a/mace/kernels/opencl/image/batch_norm.h b/mace/kernels/opencl/image/batch_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..7b16015473ae4e374fb57362da729b7033549a08 --- /dev/null +++ b/mace/kernels/opencl/image/batch_norm.h @@ -0,0 +1,163 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#ifndef MACE_KERNELS_OPENCL_IMAGE_BATCH_NORM_H_ +#define MACE_KERNELS_OPENCL_IMAGE_BATCH_NORM_H_ + +#include "mace/kernels/batch_norm.h" + +#include +#include +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { + +template +class BatchNormKernel : public OpenCLBatchNormKernel { + public: + BatchNormKernel( + const bool folded_constant, + const ActivationType activation, + const float relux_max_limit); + MaceStatus Compute(OpKernelContext *context, + const Tensor *input, + const Tensor *scale, + const Tensor *offset, + const Tensor *mean, + const Tensor *var, + const float epsilon, + Tensor *output, + StatsFuture *future) override; + + private: + const bool folded_constant_; + const ActivationType activation_; + const float relux_max_limit_; + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; + +template +BatchNormKernel::BatchNormKernel(const bool folded_constant, + const ActivationType activation, + const float relux_max_limit) + : folded_constant_(folded_constant), + activation_(activation), + relux_max_limit_(relux_max_limit) {} + +template +MaceStatus BatchNormKernel::Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *scale, + const Tensor *offset, + const Tensor *mean, + const Tensor *var, + const float epsilon, + Tensor *output, + StatsFuture *future) { + MACE_CHECK(folded_constant_ || (mean != nullptr && var != nullptr)); + + const index_t batch = input->dim(0); + const index_t height = input->dim(1); + const index_t width = input->dim(2); + const index_t channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + 
MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + auto dt = DataTypeToEnum::value; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); + built_options.emplace("-Dbatch_norm=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + if (folded_constant_) { + built_options.emplace("-DFOLDED_CONSTANT"); + } + switch (activation_) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation_; + } + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(scale->opencl_image())); + kernel_.setArg(idx++, *(offset->opencl_image())); + if (!folded_constant_) { + kernel_.setArg(idx++, *(mean->opencl_image())); + kernel_.setArg(idx++, *(var->opencl_image())); + kernel_.setArg(idx++, epsilon); + } + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, relux_max_limit_); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("batch_norm_opencl_kernel", activation_, output->dim(0), + output->dim(1), output->dim(2), output->dim(3), folded_constant_); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); + MACE_OUT_OF_RANGE_VALIDATION; + return 
MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_BATCH_NORM_H_ diff --git a/mace/kernels/opencl/image/batch_to_space.h b/mace/kernels/opencl/image/batch_to_space.h new file mode 100644 index 0000000000000000000000000000000000000000..f3c4bf8cf7beabf9141f8b4b9d1cea4ff700297d --- /dev/null +++ b/mace/kernels/opencl/image/batch_to_space.h @@ -0,0 +1,130 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#ifndef MACE_KERNELS_OPENCL_IMAGE_BATCH_TO_SPACE_H_ +#define MACE_KERNELS_OPENCL_IMAGE_BATCH_TO_SPACE_H_ + +#include "mace/kernels/batch_to_space.h" + +#include +#include +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { + +template +class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel { + public: + MaceStatus Compute( + OpKernelContext *context, + const Tensor *batch_tensor, + const std::vector &paddings, + const std::vector &block_shape, + const std::vector &output_shape, + Tensor *space_tensor, + StatsFuture *future) override; + + private: + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; + +template +MaceStatus BatchToSpaceKernel::Compute( + OpKernelContext *context, + const Tensor *batch_tensor, + const std::vector &paddings, + const std::vector &block_shape, + const std::vector &output_shape, + Tensor *space_tensor, + StatsFuture *future) { + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR( + space_tensor->ResizeImage(output_shape, output_image_shape)); + + const uint32_t chan_blk = + static_cast(RoundUpDiv4(batch_tensor->dim(3))); + + const uint32_t gws[3] = { + chan_blk, static_cast(batch_tensor->dim(2)), + static_cast(batch_tensor->dim(0) * batch_tensor->dim(1))}; + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + const char *kernel_name = "batch_to_space"; + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + built_options.emplace(kernel_name_ss.str()); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); + 
built_options.emplace("-DCMD_DATA_TYPE=" + + DtToCLCMDDt(DataTypeToEnum::value)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space", + obfuscated_kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, batch_tensor->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(batch_tensor->opencl_image())); + kernel_.setArg(idx++, *(space_tensor->opencl_image())); + kernel_.setArg(idx++, block_shape[0]); + kernel_.setArg(idx++, block_shape[1]); + kernel_.setArg(idx++, paddings[0]); + kernel_.setArg(idx++, paddings[2]); + kernel_.setArg(idx++, static_cast(space_tensor->dim(0))); + kernel_.setArg(idx++, static_cast(space_tensor->dim(1))); + kernel_.setArg(idx++, static_cast(space_tensor->dim(2))); + kernel_.setArg(idx++, static_cast(batch_tensor->dim(1))); + kernel_.setArg(idx++, static_cast(batch_tensor->dim(2))); + + input_shape_ = batch_tensor->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1), + batch_tensor->dim(2), batch_tensor->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); + + MACE_OUT_OF_RANGE_VALIDATION; + return MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_BATCH_TO_SPACE_H_ diff --git a/mace/kernels/opencl/image/bias_add.h b/mace/kernels/opencl/image/bias_add.h new file mode 100644 index 0000000000000000000000000000000000000000..3a84cbceefe6fdcce5d4573a5e8d42c5c3108992 --- /dev/null +++ b/mace/kernels/opencl/image/bias_add.h @@ -0,0 +1,131 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef MACE_KERNELS_OPENCL_IMAGE_BIAS_ADD_H_ +#define MACE_KERNELS_OPENCL_IMAGE_BIAS_ADD_H_ + +#include "mace/kernels/bias_add.h" + +#include +#include +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { + +template +class BiasAddKernel : public OpenCLBiasAddKernel { + public: + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output, + StatsFuture *future) override; + + private: + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; + +template +MaceStatus BiasAddKernel::Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output, + StatsFuture *future) { + const index_t batch = input->dim(0); + const index_t height = input->dim(1); + const index_t width = input->dim(2); + const index_t channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + auto dt = DataTypeToEnum::value; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); + 
built_options.emplace("-Dbias_add=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name, + built_options, &kernel_)); + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(bias->opencl_image())); + kernel_.setArg(idx++, *(output->opencl_image())); + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + + cl::Event event; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } else { + std::vector roundup_gws(lws.size()); + for (size_t i = 0; i < lws.size(); ++i) { + if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]); + } + + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } + MACE_CL_RET_STATUS(error); + MACE_OUT_OF_RANGE_VALIDATION; + if (future != nullptr) { + future->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + + return MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_BIAS_ADD_H_ diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/image/buffer_to_image.h similarity index 60% rename from 
mace/kernels/opencl/buffer_to_image.cc rename to mace/kernels/opencl/image/buffer_to_image.h index 75d0c4f542a11feda4e615ff025d0d771931008b..a791c064d4b5242aa3d0b58be6fc7bdcbcaf44fe 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/image/buffer_to_image.h @@ -12,29 +12,55 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/kernels/buffer_to_image.h" +#ifndef MACE_KERNELS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_ +#define MACE_KERNELS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_ +#include +#include +#include + +#include "mace/kernels/buffer_transform.h" #include "mace/kernels/opencl/helper.h" namespace mace { namespace kernels { +namespace opencl { +namespace image { + +template +class BufferToImage : public OpenCLBufferTransformKernel { + public: + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const BufferType type, + const int wino_blk_size, + Tensor *output, + StatsFuture *future) override; + + private: + cl::Kernel kernel_; + std::vector input_shape_; +}; template -MaceStatus BufferToImageFunctor::operator()( - const Tensor *buffer, +MaceStatus BufferToImage::Compute( + OpKernelContext *context, + const Tensor *input, const BufferType type, - Tensor *image, + const int wino_blk_size, + Tensor *output, StatsFuture *future) { - auto formatted_buffer_shape = FormatBufferShape(buffer->shape(), type); + auto formatted_buffer_shape = FormatBufferShape(input->shape(), type); std::vector image_shape; - CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size_); + CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size); if (type == WINOGRAD_FILTER) { std::vector new_shape = - {(wino_blk_size_ + 2) * (wino_blk_size_ + 2), - buffer->dim(0), buffer->dim(1)}; - MACE_RETURN_IF_ERROR(image->ResizeImage(new_shape, image_shape)); + {(wino_blk_size + 2) * (wino_blk_size + 2), + input->dim(0), input->dim(1)}; + 
MACE_RETURN_IF_ERROR(output->ResizeImage(new_shape, image_shape)); } else { - MACE_RETURN_IF_ERROR(image->ResizeImage(buffer->shape(), image_shape)); + MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape)); } uint32_t gws[2] = {static_cast(image_shape[0]), @@ -67,25 +93,26 @@ MaceStatus BufferToImageFunctor::operator()( break; case WINOGRAD_FILTER: { std::stringstream ss_tmp; - gws[1] /= (wino_blk_size_ + 2) * (wino_blk_size_ + 2); + gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2); ss_tmp << "winograd_filter_buffer_to_image_" - << wino_blk_size_ << "x" << wino_blk_size_; + << wino_blk_size << "x" << wino_blk_size; kernel_name = ss_tmp.str(); break; } } - auto runtime = context_->device()->opencl_runtime(); + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; built_options.emplace(kernel_name_ss.str()); - if (buffer->dtype() == image->dtype()) { + if (input->dtype() == output->dtype()) { built_options.emplace( "-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + @@ -100,40 +127,41 @@ MaceStatus BufferToImageFunctor::operator()( "buffer_to_image", obfuscated_kernel_name, built_options, &kernel_)); } - if (!IsVecEqual(input_shape_, buffer->shape())) { + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_2D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(buffer->opencl_buffer())); - MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_2D_GWS_ARGS(kernel_, gws); + 
kernel_.setArg(idx++, *(input->opencl_buffer())); + MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0, "buffer offset not aligned"); kernel_.setArg(idx++, - static_cast(buffer->buffer_offset() / - GetEnumTypeSize(buffer->dtype()))); + static_cast(input->buffer_offset() / + GetEnumTypeSize(input->dtype()))); if (type == CONV2D_FILTER) { const index_t - inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3); - kernel_.setArg(idx++, static_cast(buffer->dim(0))); - kernel_.setArg(idx++, static_cast(buffer->dim(2))); - kernel_.setArg(idx++, static_cast(buffer->dim(3))); + inner_size = input->dim(1) * input->dim(2) * input->dim(3); + kernel_.setArg(idx++, static_cast(input->dim(0))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(input->dim(3))); kernel_.setArg(idx++, static_cast(inner_size)); } else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) { - kernel_.setArg(idx++, static_cast(buffer->dim(0))); - kernel_.setArg(idx++, static_cast(buffer->dim(1))); - kernel_.setArg(idx++, static_cast(buffer->dim(2))); - kernel_.setArg(idx++, static_cast(buffer->dim(3))); + kernel_.setArg(idx++, static_cast(input->dim(0))); + kernel_.setArg(idx++, static_cast(input->dim(1))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(input->dim(3))); } else if (type == ARGUMENT) { - kernel_.setArg(idx++, static_cast(buffer->dim(0))); + kernel_.setArg(idx++, static_cast(input->dim(0))); } else { kernel_.setArg(idx++, - static_cast(formatted_buffer_shape[1])); + static_cast(formatted_buffer_shape[1])); kernel_.setArg(idx++, - static_cast(formatted_buffer_shape[2])); + static_cast(formatted_buffer_shape[2])); kernel_.setArg(idx++, - static_cast(formatted_buffer_shape[3])); + static_cast(formatted_buffer_shape[3])); } - kernel_.setArg(idx++, *(image->opencl_image())); - input_shape_ = buffer->shape(); + kernel_.setArg(idx++, *(output->opencl_image())); + input_shape_ = 
input->shape(); } const uint32_t kwg_size = @@ -157,7 +185,7 @@ MaceStatus BufferToImageFunctor::operator()( cl::NDRange(lws[0], lws[1]), nullptr, &event); } MACE_CL_RET_STATUS(error); - OUT_OF_RANGE_VALIDATION(kernel_error_); + MACE_OUT_OF_RANGE_VALIDATION; if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { event.wait(); @@ -168,13 +196,14 @@ MaceStatus BufferToImageFunctor::operator()( } // Mark the buffer unused. - const_cast(buffer)->MarkUnused(); + const_cast(input)->MarkUnused(); return MACE_SUCCESS; } -template struct BufferToImageFunctor; -template struct BufferToImageFunctor; - +} // namespace image +} // namespace opencl } // namespace kernels } // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_ diff --git a/mace/kernels/opencl/image/channel_shuffle.h b/mace/kernels/opencl/image/channel_shuffle.h new file mode 100644 index 0000000000000000000000000000000000000000..5034f56917307984025b7d5d6d08fcb9facb391b --- /dev/null +++ b/mace/kernels/opencl/image/channel_shuffle.h @@ -0,0 +1,118 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#ifndef MACE_KERNELS_OPENCL_IMAGE_CHANNEL_SHUFFLE_H_ +#define MACE_KERNELS_OPENCL_IMAGE_CHANNEL_SHUFFLE_H_ + +#include "mace/kernels/channel_shuffle.h" + +#include +#include +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { + +template +class ChannelShuffleKernel : public OpenCLChannelShuffleKernel { + public: + explicit ChannelShuffleKernel(const int groups) : groups_(groups) {} + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) override; + + private: + const int groups_; + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; + +template +MaceStatus ChannelShuffleKernel::Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) { + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + const index_t batch = input->dim(0); + const index_t height = input->dim(1); + const index_t width = input->dim(2); + const index_t channels = input->dim(3); + const index_t channels_per_group = channels / groups_; + MACE_CHECK(channels_per_group % 4 == 0, + "channels per group must be multiple of 4"); + MACE_CHECK(groups_ % 4 == 0, "groups must be multiple of 4"); + const index_t group_channel_blocks = RoundUpDiv4(channels_per_group); + + const uint32_t gws[3] = {static_cast(group_channel_blocks), + static_cast(width), + static_cast(height * batch)}; + + auto runtime = context->device()->opencl_runtime(); + + MACE_OUT_OF_RANGE_DEFINITION; + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); + built_options.emplace("-Dchannel_shuffle=" + kernel_name); + auto dt = DataTypeToEnum::value; + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + 
MACE_RETURN_IF_ERROR( + runtime->BuildKernel("channel_shuffle", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, groups_); + kernel_.setArg(idx++, static_cast(channels_per_group)); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); + MACE_OUT_OF_RANGE_VALIDATION; + return MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_CHANNEL_SHUFFLE_H_ diff --git a/mace/kernels/opencl/image/concat.cc b/mace/kernels/opencl/image/concat.cc new file mode 100644 index 0000000000000000000000000000000000000000..60144d558107418bd27a55a49b2f629fb7c64712 --- /dev/null +++ b/mace/kernels/opencl/image/concat.cc @@ -0,0 +1,213 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/kernels/opencl/image/concat.h" + +#include +#include +#include + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { +namespace concat { +namespace { +std::vector LocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(4, 0); + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; + } else { + uint64_t + cache_size = runtime->device_global_mem_cache_size(); + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + lws[0] = std::min(base, kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = + std::max(std::min(base, kwg_size / lws_size), 1); + } + return lws; +} + +} // namespace + + +MaceStatus Concat2(OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *input0, + const Tensor *input1, + const DataType dt, + std::vector *prev_input_shape, + Tensor *output, + StatsFuture *future, + uint32_t *kwg_size) { + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channel = output->dim(3); + + const int channel_blk = RoundUpDiv4(channel); + const uint32_t gws[3] = { + static_cast(channel_blk), static_cast(width), + static_cast(batch * height), + }; + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel->get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); + built_options.emplace("-Dconcat_channel=" + kernel_name); + if (input0->dtype() == output->dtype()) { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + } else { + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + 
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + } + if (input0->dim(3) % 4 == 0) { + built_options.emplace("-DDIVISIBLE_FOUR"); + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name, + built_options, kernel)); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); + } + MACE_OUT_OF_RANGE_INIT(*kernel); + if (!IsVecEqual(*prev_input_shape, input0->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(*kernel); + MACE_SET_3D_GWS_ARGS(*kernel, gws); + kernel->setArg(idx++, + *(static_cast(input0->opencl_image()))); + kernel->setArg(idx++, + *(static_cast(input1->opencl_image()))); + kernel->setArg(idx++, static_cast(input0->dim(3))); + kernel->setArg(idx++, + *(static_cast(output->opencl_image()))); + + *prev_input_shape = input0->shape(); + } + + const std::vector lws = LocalWS(runtime, gws, *kwg_size); + std::string tuning_key = + Concat("concat_opencl_kernel", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, + gws, lws, future)); + MACE_OUT_OF_RANGE_VALIDATION; + return MACE_SUCCESS; +} + +MaceStatus ConcatN(OpKernelContext *context, + cl::Kernel *kernel, + const std::vector &input_list, + const DataType dt, + Tensor *output, + StatsFuture *future, + uint32_t *kwg_size) { + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel->get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); + built_options.emplace("-Dconcat_channel_multi=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", 
kernel_name, + built_options, kernel)); + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); + } + + const int inputs_count = input_list.size(); + index_t chan_blk_offset = 0; + cl::Event event; + CallStats call_stats{INT64_MAX, 0}; + + MACE_OUT_OF_RANGE_INIT(*kernel); + for (int i = 0; i < inputs_count; ++i) { + const Tensor *input = input_list[i]; + index_t input_channel_blk = input->dim(3) / 4; + const uint32_t gws[3] = { + static_cast(input_channel_blk), static_cast(width), + static_cast(batch * height), + }; + const std::vector lws = LocalWS(runtime, gws, *kwg_size); + + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(*kernel); + MACE_SET_3D_GWS_ARGS(*kernel, gws); + kernel->setArg(idx++, *(input->opencl_image())); + kernel->setArg(idx++, static_cast(chan_blk_offset)); + kernel->setArg(idx++, *(output->opencl_image())); + + chan_blk_offset += input_channel_blk; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + *kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } else { + std::vector roundup_gws(lws.size()); + for (size_t j = 0; j < 3; ++j) { + roundup_gws[j] = RoundUp(gws[j], lws[j]); + } + error = runtime->command_queue().enqueueNDRangeKernel( + *kernel, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } + MACE_CL_RET_STATUS(error); + MACE_OUT_OF_RANGE_VALIDATION; + if (future != nullptr && runtime->is_profiling_enabled()) { + event.wait(); + CallStats tmp_stats; + runtime->GetCallStats(event, &tmp_stats); + call_stats.start_micros = + std::min(tmp_stats.start_micros, call_stats.start_micros); + call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros; + } + } + if (future != nullptr) { + future->wait_fn = [call_stats](CallStats *stats) { + if (stats != nullptr) { + stats->start_micros = 
call_stats.start_micros; + stats->end_micros = stats->start_micros + call_stats.end_micros; + } + }; + } + + return MACE_SUCCESS; +} + +} // namespace concat +} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/opencl/image/concat.h b/mace/kernels/opencl/image/concat.h new file mode 100644 index 0000000000000000000000000000000000000000..6289a000f78d253ba864bfb9918baa7f398ecd47 --- /dev/null +++ b/mace/kernels/opencl/image/concat.h @@ -0,0 +1,123 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#ifndef MACE_KERNELS_OPENCL_IMAGE_CONCAT_H_ +#define MACE_KERNELS_OPENCL_IMAGE_CONCAT_H_ + +#include "mace/kernels/concat.h" + +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { +namespace concat { +MaceStatus Concat2(OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *input0, + const Tensor *input1, + const DataType dt, + std::vector *prev_input_shape, + Tensor *output, + StatsFuture *future, + uint32_t *kwg_size); + +MaceStatus ConcatN(OpKernelContext *context, + cl::Kernel *kernel, + const std::vector &input_list, + const DataType dt, + Tensor *output, + StatsFuture *future, + uint32_t *kwg_size); +} // namespace concat + +template +class ConcatKernel : public OpenCLConcatKernel { + public: + explicit ConcatKernel(const int32_t axis) : axis_(axis) {} + MaceStatus Compute( + OpKernelContext *context, + const std::vector &input_list, + Tensor *output, + StatsFuture *future) override; + + private: + int32_t axis_; + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; + +template +MaceStatus ConcatKernel::Compute( + OpKernelContext *context, + const std::vector &input_list, + Tensor *output, + StatsFuture *future) { + const int inputs_count = input_list.size(); + MACE_CHECK(inputs_count >= 2 && axis_ == 3) + << "Concat opencl kernel only support >=2 elements with axis == 3"; + + const Tensor *input0 = input_list[0]; + bool divisible_four = input0->dim(axis_) % 4 == 0; + + std::vector output_shape(input0->shape()); + for (int i = 1; i < inputs_count; ++i) { + const Tensor *input = input_list[i]; + MACE_CHECK(input->dim_size() == input0->dim_size(), + "Ranks of all input tensors must be same."); + divisible_four &= input->dim(axis_) % 4 == 0; + for (int j = 0; j < input->dim_size(); ++j) { + if (j == axis_) { + continue; + } + MACE_CHECK(input->dim(j) == input0->dim(j), + "Dimensions of inputs should equal except axis."); + } + 
output_shape[axis_] += input->dim(axis_); + } + MACE_CHECK( + inputs_count == 2 || divisible_four, + "Dimensions of inputs should be divisible by 4 when inputs_count > 2."); + std::vector image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); + + switch (inputs_count) { + case 2: + return concat::Concat2( + context, &kernel_, input_list[0], input_list[1], + DataTypeToEnum::value, &input_shape_, output, future, &kwg_size_); + default: + if (divisible_four) { + return concat::ConcatN(context, &kernel_, input_list, + DataTypeToEnum::value, output, future, + &kwg_size_); + } else { + MACE_NOT_IMPLEMENTED; + } + } + + return MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_CONCAT_H_ diff --git a/mace/kernels/opencl/image/conv_2d.h b/mace/kernels/opencl/image/conv_2d.h new file mode 100644 index 0000000000000000000000000000000000000000..05ee6a0750d129df06d97ce2bbf4f8aa9d70cf7e --- /dev/null +++ b/mace/kernels/opencl/image/conv_2d.h @@ -0,0 +1,179 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#ifndef MACE_KERNELS_OPENCL_IMAGE_CONV_2D_H_ +#define MACE_KERNELS_OPENCL_IMAGE_CONV_2D_H_ + +#include "mace/kernels/conv_2d.h" + +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { + +extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + std::vector *prev_input_shape, + Tensor *output, + StatsFuture *future, + uint32_t *kwg_size); + +extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + std::vector *prev_input_shape, + Tensor *output, + StatsFuture *future, + uint32_t *kwg_size); + +extern MaceStatus Conv2dOpencl(OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + std::vector *prev_input_shape, + Tensor *output, + StatsFuture *future, + uint32_t *kwg_size); + + +template +class Conv2dKernel : public OpenCLConv2dKernel { + public: + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + Tensor *output, + StatsFuture *future) override; + + private: + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; + 
+template +MaceStatus Conv2dKernel::Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + Tensor *output, + StatsFuture *future) { + typedef MaceStatus (*Conv2dOpenclFunction)( + OpKernelContext *context, + cl::Kernel * kernel, const Tensor *input, const Tensor *filter, + const Tensor *bias, const int stride, const int *padding, + const int *dilations, const ActivationType activation, + const float relux_max_limit, const DataType dt, + std::vector *input_shape, Tensor *output, StatsFuture *future, + uint32_t *kwg_size); + // Indexed by kernel_size - 1; a nullptr entry selects the general kernel. + static const Conv2dOpenclFunction selector[3] = { + Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3}; + + index_t kernel_h = filter->dim(2); + index_t kernel_w = filter->dim(3); + if (strides[0] != strides[1] || + (dilations[0] > 1 && (strides[0] > 1 || kernel_h == 1))) { + LOG(WARNING) << "OpenCL conv2d kernel with " + << "filter" << kernel_h << "x" << kernel_w << "," + << " stride " << strides[0] << "x" << strides[1] + << ",dilations " << dilations[0] << "x" << dilations[1] + << " is not implemented yet."; + MACE_NOT_IMPLEMENTED; + } + + // Derive output shape and paddings, then resize the output image. 
+ std::vector output_shape(4); + std::vector paddings(2); + if (padding_data.empty()) { + kernels::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter->shape().data(), dilations, strides, + padding_type, output_shape.data(), paddings.data()); + } else { + paddings = padding_data; + CalcOutputSize(input->shape().data(), filter->shape().data(), + padding_data.data(), dilations, strides, RoundType::FLOOR, + output_shape.data()); + } + + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, 
output_image_shape)); + + if (kernel_h == kernel_w && kernel_h <= 3 && + selector[kernel_h - 1] != nullptr) { + auto conv2d_func = selector[kernel_h - 1]; + return conv2d_func(context, + &kernel_, input, filter, bias, strides[0], paddings.data(), dilations, + activation, relux_max_limit, DataTypeToEnum::value, &input_shape_, + output, future, &kwg_size_); + } else { + return Conv2dOpencl( + context, &kernel_, input, filter, bias, + strides[0], paddings.data(), dilations, + activation, relux_max_limit, DataTypeToEnum::value, &input_shape_, + output, future, &kwg_size_); + } +} + +} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_CONV_2D_H_ diff --git a/mace/kernels/opencl/conv_2d_1x1.cc b/mace/kernels/opencl/image/conv_2d_1x1.cc similarity index 94% rename from mace/kernels/opencl/conv_2d_1x1.cc rename to mace/kernels/opencl/image/conv_2d_1x1.cc index c43c045019ba1a2e9cde11cd9288159f36ed45ec..2460afe5a2cd16510aee171a6f1447a7bacaa95d 100644 --- a/mace/kernels/opencl/conv_2d_1x1.cc +++ b/mace/kernels/opencl/image/conv_2d_1x1.cc @@ -19,6 +19,8 @@ namespace mace { namespace kernels { +namespace opencl { +namespace image { namespace { // (inputs + weights + outputs) * array_size * sizeof(float) @@ -78,8 +80,7 @@ extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context, std::vector *prev_input_shape, Tensor *output, StatsFuture *future, - uint32_t *kwg_size, - std::unique_ptr *kernel_error) { + uint32_t *kwg_size) { MACE_UNUSED(padding); MACE_UNUSED(dilations); const index_t batch = output->dim(0); @@ -96,13 +97,13 @@ extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context, const index_t input_channel_blocks = RoundUpDiv4(input_channels); auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; if (kernel->get() == nullptr) { MACE_CHECK(input_batch == batch); - std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error, context); - NON_UNIFORM_WG_CONFIG; + 
MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1"); built_options.emplace("-Dconv_2d_1x1=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); @@ -139,12 +140,13 @@ extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width_blocks), static_cast(height * batch)}; + MACE_OUT_OF_RANGE_INIT(*kernel); // Support different input size if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG_PTR; - SET_3D_GWS_ARGS_PTR(kernel, gws); + MACE_OUT_OF_RANGE_SET_ARGS(*kernel); + MACE_SET_3D_GWS_ARGS(*kernel, gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image())); if (bias != nullptr) { @@ -169,9 +171,11 @@ extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context, output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, gws, lws, future)); - OUT_OF_RANGE_VALIDATION(*kernel_error); + MACE_OUT_OF_RANGE_VALIDATION; return MACE_SUCCESS; } +} // namespace image +} // namespace opencl } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/conv_2d_3x3.cc b/mace/kernels/opencl/image/conv_2d_3x3.cc similarity index 94% rename from mace/kernels/opencl/conv_2d_3x3.cc rename to mace/kernels/opencl/image/conv_2d_3x3.cc index c0362831658ccd327ec0407bdc9f4ff05d40cf1c..900cd6102c50ea5a1704901189f8124d726e0fdf 100644 --- a/mace/kernels/opencl/conv_2d_3x3.cc +++ b/mace/kernels/opencl/image/conv_2d_3x3.cc @@ -21,6 +21,9 @@ namespace mace { namespace kernels { +namespace opencl { +namespace image { + namespace { // (inputs + weights + outputs) * array_size * sizeof(float) const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4; @@ -71,8 +74,7 @@ extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context, std::vector *prev_input_shape, Tensor *output, StatsFuture *future, - 
uint32_t *kwg_size, - std::unique_ptr *kernel_error) { + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -84,11 +86,12 @@ extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context, const index_t width_blocks = RoundUpDiv(width); auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; if (kernel->get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error, context); - NON_UNIFORM_WG_CONFIG; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); built_options.emplace("-Dconv_2d_3x3=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); @@ -123,12 +126,13 @@ extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width_blocks), static_cast(height * batch)}; + MACE_OUT_OF_RANGE_INIT(*kernel); // Support different input size if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG_PTR; - SET_3D_GWS_ARGS_PTR(kernel, gws); + MACE_OUT_OF_RANGE_SET_ARGS(*kernel); + MACE_SET_3D_GWS_ARGS(*kernel, gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image())); if (bias != nullptr) { @@ -149,16 +153,17 @@ extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context, *prev_input_shape = input->shape(); } - std::vector lws = LocalWS(runtime, gws, *kwg_size); std::string tuning_key = Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, gws, lws, future)); - OUT_OF_RANGE_VALIDATION(*kernel_error); + MACE_OUT_OF_RANGE_VALIDATION; return MACE_SUCCESS; } +} // namespace image +} // namespace opencl } // namespace kernels } // namespace mace diff --git 
a/mace/kernels/opencl/conv_2d_general.cc b/mace/kernels/opencl/image/conv_2d_general.cc similarity index 94% rename from mace/kernels/opencl/conv_2d_general.cc rename to mace/kernels/opencl/image/conv_2d_general.cc index bac1da8f40e0c8ad2a75d328730ee9f0f495319b..0286edf7346837f8d8ea45939e075611b2059c1b 100644 --- a/mace/kernels/opencl/conv_2d_general.cc +++ b/mace/kernels/opencl/image/conv_2d_general.cc @@ -21,6 +21,9 @@ namespace mace { namespace kernels { +namespace opencl { +namespace image { + namespace { // (inputs + weights + outputs) * array_size * sizeof(float) const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4; @@ -79,8 +82,7 @@ extern MaceStatus Conv2dOpencl(OpKernelContext *context, std::vector *prev_input_shape, Tensor *output, StatsFuture *future, - uint32_t *kwg_size, - std::unique_ptr *kernel_error) { + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -92,11 +94,12 @@ extern MaceStatus Conv2dOpencl(OpKernelContext *context, const index_t width_blocks = RoundUpDiv4(width); auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; if (kernel->get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error, context); - NON_UNIFORM_WG_CONFIG; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); built_options.emplace("-Dconv_2d=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); @@ -131,12 +134,13 @@ extern MaceStatus Conv2dOpencl(OpKernelContext *context, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width_blocks), static_cast(height * batch)}; + MACE_OUT_OF_RANGE_INIT(*kernel); // Support different input size if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG_PTR; - SET_3D_GWS_ARGS_PTR(kernel, gws); + MACE_OUT_OF_RANGE_SET_ARGS(*kernel); + 
MACE_SET_3D_GWS_ARGS(*kernel, gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image())); if (bias != nullptr) { @@ -168,9 +172,11 @@ extern MaceStatus Conv2dOpencl(OpKernelContext *context, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, gws, lws, future)); - OUT_OF_RANGE_VALIDATION(*kernel_error); + MACE_OUT_OF_RANGE_VALIDATION; return MACE_SUCCESS; } +} // namespace image +} // namespace opencl } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/image/crop.h b/mace/kernels/opencl/image/crop.h new file mode 100644 index 0000000000000000000000000000000000000000..10aa6ecbf8abdfe173ee43cb6ed2773947d8bcb7 --- /dev/null +++ b/mace/kernels/opencl/image/crop.h @@ -0,0 +1,194 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#ifndef MACE_KERNELS_OPENCL_IMAGE_CROP_H_ +#define MACE_KERNELS_OPENCL_IMAGE_CROP_H_ + +#include "mace/kernels/crop.h" + +#include +#include +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { + +template +class CropKernel : public OpenCLCropKernel { + public: + explicit CropKernel( + const int axis, + const std::vector &offset) + : axis_(axis), offset_(offset) {} + MaceStatus Compute( + OpKernelContext *context, + const std::vector &input_list, + Tensor *output, + StatsFuture *future) override; + + private: + const int axis_; + std::vector offset_; + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; +// Crops input_list[0] by offset_; cropped dims take input_list[1]'s extent. +template +MaceStatus CropKernel::Compute( + OpKernelContext *context, + const std::vector &input_list, + Tensor *output, + StatsFuture *future) { + const int32_t inputs_count = static_cast(input_list.size()); + MACE_CHECK(inputs_count >= 2) + << "Crop opencl kernel only support 2 elements input"; + const Tensor *input0 = input_list[0]; + const Tensor *input1 = input_list[1]; + const uint32_t in0_dims = static_cast(input0->dim_size()); + const uint32_t in1_dims = static_cast(input1->dim_size()); + MACE_CHECK(in0_dims == 4 && in1_dims == 4, + "Crop op only supports 4-dims inputs now."); + + std::vector offsets(4, 0); + + std::vector output_shape(input0->shape()); + switch (axis_) { + case 0: + if (offset_.size() == 1) { + offsets[0] = offset_[0]; + offsets[1] = offset_[0]; + offsets[2] = offset_[0]; + offsets[3] = offset_[0]; + } else if (offset_.size() == 4) { + offsets[0] = offset_[0]; + offsets[1] = offset_[2]; + offsets[2] = offset_[3]; + offsets[3] = offset_[1]; + } + for (int i = 0; i < 4; ++i) { + output_shape[i] = input1->dim(i); + } + break; + case 1: + if (offset_.size() == 1) { + offsets[1] = offset_[0]; + offsets[2] = offset_[0]; + offsets[3] = offset_[0]; + } else if (offset_.size() == 3) { + offsets[1] = offset_[1]; + offsets[2] = 
offset_[2]; + offsets[3] = offset_[0]; + } + for (int i = 1; i < 4; ++i) { + output_shape[i] = input1->dim(i); + } + break; + case 2: + if (offset_.size() == 1) { + offsets[1] = offset_[0]; + offsets[2] = offset_[0]; + } else if (offset_.size() == 2) { + offsets[1] = offset_[0]; + offsets[2] = offset_[1]; + } + output_shape[1] = input1->dim(1); + output_shape[2] = input1->dim(2); + break; + case 3: + if (offset_.size() == 1) { + offsets[2] = offset_[0]; + } + output_shape[2] = input1->dim(2); + break; + default: + MACE_CHECK(axis_ >= 0 && axis_ < 4, "axis is out of boundary."); + break; + } + MACE_CHECK(offsets[3] % 4 == 0, + "MACE opencl only supports cropping channel" + " offset divisible by 4."); + for (index_t i = 0; i < 4; ++i) { + MACE_CHECK(input0->dim(i) - offsets[i] >= input1->dim(i)) + << "the crop for dimension" << i << "is out of bound with size" + << input1->dim(i) << "and offset" << offsets[i]; + } + std::vector image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); + + const index_t offset_chan_blk = RoundUpDiv4(offsets[3]); + const index_t channel_blk = RoundUpDiv4(output->dim(3)); + const uint32_t gws[3] = { + static_cast(channel_blk), static_cast(output->dim(2)), + static_cast(output->dim(0) * output->dim(1)) + }; + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop"); + built_options.emplace("-Dcrop=" + kernel_name); + auto dt = DataTypeToEnum::value; + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + 
} + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input0->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input0->opencl_image())); + kernel_.setArg(idx++, static_cast(offsets[0])); + kernel_.setArg(idx++, static_cast(offsets[1])); + kernel_.setArg(idx++, static_cast(offsets[2])); + kernel_.setArg(idx++, static_cast(offset_chan_blk)); + kernel_.setArg(idx++, static_cast(input0->dim(1))); + kernel_.setArg(idx++, static_cast(input0->dim(2))); + kernel_.setArg(idx++, static_cast(output->dim(1))); + kernel_.setArg(idx++, static_cast(output->dim(2))); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input0->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("crop_opencl_kernel", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); + MACE_OUT_OF_RANGE_VALIDATION; + return MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_CROP_H_ diff --git a/mace/kernels/opencl/image/deconv_2d.h b/mace/kernels/opencl/image/deconv_2d.h new file mode 100644 index 0000000000000000000000000000000000000000..f1ce71c80af3ac4f2d36b88bb993cbe0ff65aac3 --- /dev/null +++ b/mace/kernels/opencl/image/deconv_2d.h @@ -0,0 +1,188 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef MACE_KERNELS_OPENCL_IMAGE_DECONV_2D_H_ +#define MACE_KERNELS_OPENCL_IMAGE_DECONV_2D_H_ + +#include "mace/kernels/deconv_2d.h" + +#include +#include +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { +// Image deconv2d: Compute() builds kernel_ when unset, sets args, and runs it. +template +class Deconv2dKernel : public OpenCLDeconv2dKernel { + public: + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const int *padding_data, + const ActivationType activation, + const float relux_max_limit, + const std::vector &output_shape, + Tensor *output, + StatsFuture *future) override; + + private: + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; +// NOTE(review): MACE_WIDTH_BLK is #defined in Compute() and never #undef'd. +template +MaceStatus Deconv2dKernel::Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const int *padding_data, + const ActivationType activation, + const float relux_max_limit, + const std::vector &output_shape, + Tensor *output, + StatsFuture *future) { + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + const DataType dt = DataTypeToEnum::value; + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channels = output->dim(3); + const index_t input_channels = 
input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t input_channel_blocks = RoundUpDiv4(input_channels); + const int stride_h = strides[0]; + const int stride_w = strides[1]; + MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0."); +#define MACE_WIDTH_BLK 5 + const index_t n_strides = (width + stride_w - 1) / stride_w; + const index_t width_blocks = + ((n_strides + MACE_WIDTH_BLK - 1) / MACE_WIDTH_BLK) * stride_w; + const float stride_h_r = 1.f / static_cast(stride_h); + const float stride_w_r = 1.f / static_cast(stride_w); + const int padding_h = (padding_data[0] + 1) >> 1; + const int padding_w = (padding_data[1] + 1) >> 1; + + const int align_h = stride_h - 1 - padding_h; + const int align_w = stride_w - 1 - padding_w; + const int kernel_size = filter->dim(2) * filter->dim(3); + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d"); + built_options.emplace("-Ddeconv_2d=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); + switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation; + } + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width_blocks), + static_cast(height * batch)}; + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(filter->opencl_image())); + if (bias != nullptr) { + kernel_.setArg(idx++, *(bias->opencl_image())); + } + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, relux_max_limit); + kernel_.setArg(idx++, static_cast(input->dim(1))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(input->dim(3))); + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(channels)); + kernel_.setArg(idx++, static_cast(stride_h)); + kernel_.setArg(idx++, static_cast(stride_w)); + kernel_.setArg(idx++, stride_h_r); + kernel_.setArg(idx++, stride_w_r); + kernel_.setArg(idx++, static_cast(align_h)); + kernel_.setArg(idx++, static_cast(align_w)); + kernel_.setArg(idx++, static_cast(padding_h)); + kernel_.setArg(idx++, static_cast(padding_w)); + kernel_.setArg(idx++, static_cast(filter->dim(2))); + kernel_.setArg(idx++, static_cast(filter->dim(3))); + kernel_.setArg(idx++, static_cast(kernel_size)); + kernel_.setArg(idx++, 
static_cast(input_channel_blocks)); + kernel_.setArg(idx++, static_cast(channel_blocks)); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("deconv2d_opencl_kernel_", activation, output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); + + MACE_OUT_OF_RANGE_VALIDATION; + return MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_DECONV_2D_H_ diff --git a/mace/kernels/opencl/image/depth_to_space.h b/mace/kernels/opencl/image/depth_to_space.h new file mode 100644 index 0000000000000000000000000000000000000000..280cdaa66703c9ea7a0d6bc4599cc11b0e725547 --- /dev/null +++ b/mace/kernels/opencl/image/depth_to_space.h @@ -0,0 +1,144 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#ifndef MACE_KERNELS_OPENCL_IMAGE_DEPTH_TO_SPACE_H_ +#define MACE_KERNELS_OPENCL_IMAGE_DEPTH_TO_SPACE_H_ + +#include "mace/kernels/depth_to_space.h" + +#include +#include +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { + +template +class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel { + public: + explicit DepthToSpaceKernel(const int block_size) + : block_size_(block_size) {} + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) override; + + private: + const int block_size_; + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; +// Output is [N, H*block_size_, W*block_size_, C/(block_size_*block_size_)]. +template +MaceStatus DepthToSpaceKernel::Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) { + const index_t batch = input->dim(0); + const index_t input_height = input->dim(1); + const index_t input_width = input->dim(2); + const index_t input_depth = input->dim(3); + + MACE_CHECK(input_depth % (block_size_ * block_size_) == 0, + "input depth should be dividable by block_size * block_size", + input_depth); + MACE_CHECK((input_depth % 4) == 0, + "input channel should be dividable by 4"); + + const index_t output_height = input_height * block_size_; + const index_t output_width = input_width * block_size_; + const index_t output_depth = input_depth / (block_size_ * block_size_); + MACE_CHECK(output_depth % 4 == 0, "output channel not support:") + << output_depth; + + const index_t input_depth_blocks = RoundUpDiv4(input_depth); + const index_t output_depth_blocks = RoundUpDiv4(output_depth); + + std::vector output_shape = {batch, + output_height, + output_width, + output_depth}; + std::vector image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); + + const uint32_t gws[3] = { + 
static_cast(RoundUpDiv4(output_depth)), + static_cast(output_width), + static_cast(output_height * batch) + }; + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + const char *kernel_name = "depth_to_space"; + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + built_options.emplace(kernel_name_ss.str()); + auto dt = DataTypeToEnum::value; + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space", + obfuscated_kernel_name, + built_options, + &kernel_)); + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, static_cast(block_size_)); + kernel_.setArg(idx++, static_cast(input_height * batch)); + kernel_.setArg(idx++, static_cast(input_width)); + kernel_.setArg(idx++, static_cast(input_depth_blocks)); + kernel_.setArg(idx++, static_cast(output_width)); + kernel_.setArg(idx++, static_cast(output_depth_blocks)); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + std::string tuning_key = Concat("depth_to_space_opencl_kernel", + batch, output_height, + output_width, output_depth); + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); + + MACE_OUT_OF_RANGE_VALIDATION; + return MACE_SUCCESS; +} + +} // namespace image +} // 
namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_DEPTH_TO_SPACE_H_ diff --git a/mace/kernels/opencl/depthwise_conv.cc b/mace/kernels/opencl/image/depthwise_conv2d.cc similarity index 64% rename from mace/kernels/opencl/depthwise_conv.cc rename to mace/kernels/opencl/image/depthwise_conv2d.cc index 1bc910fdabc5551ff48e431f193ba42346830759..00f0102ec10bab243c29b3f6e67355f49635ad22 100644 --- a/mace/kernels/opencl/depthwise_conv.cc +++ b/mace/kernels/opencl/image/depthwise_conv2d.cc @@ -12,14 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/activation.h" -#include "mace/kernels/depthwise_conv2d.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" +#include "mace/kernels/opencl/image/depthwise_conv2d.h" + +#include +#include +#include namespace mace { namespace kernels { +namespace opencl { +namespace image { +namespace depthwise { namespace { // (inputs + weights + outputs) * array_size * sizeof(float) @@ -60,22 +63,21 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -static MaceStatus DepthwiseConv2d(OpKernelContext *context, - cl::Kernel *kernel, - const Tensor *input, // NHWC - const Tensor *filter, // HWIM - const Tensor *bias, - const int stride, - const int *paddings, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - StatsFuture *future, - uint32_t *kwg_size, - std::unique_ptr *kernel_error) { +MaceStatus DepthwiseConv2d(OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *input, // NHWC + const Tensor *filter, // HWIM + const Tensor *bias, + const int stride, + const int *paddings, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + std::vector 
*prev_input_shape, + Tensor *output, + StatsFuture *future, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -93,11 +95,12 @@ static MaceStatus DepthwiseConv2d(OpKernelContext *context, static_cast(height * batch)}; auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; if (kernel->get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error, context); - NON_UNIFORM_WG_CONFIG; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) { kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d_s1"); @@ -135,6 +138,7 @@ static MaceStatus DepthwiseConv2d(OpKernelContext *context, *kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } + MACE_OUT_OF_RANGE_INIT(*kernel); if (!IsVecEqual(*prev_input_shape, input->shape())) { const index_t input_height = input->dim(1); const index_t input_width = input->dim(2); @@ -147,8 +151,8 @@ static MaceStatus DepthwiseConv2d(OpKernelContext *context, input_channels); uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG_PTR; - SET_3D_GWS_ARGS_PTR(kernel, gws); + MACE_OUT_OF_RANGE_SET_ARGS(*kernel); + MACE_SET_3D_GWS_ARGS(*kernel, gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image())); if (bias != nullptr) { @@ -179,60 +183,12 @@ static MaceStatus DepthwiseConv2d(OpKernelContext *context, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, gws, lws, future)); - OUT_OF_RANGE_VALIDATION(*kernel_error); + MACE_OUT_OF_RANGE_VALIDATION; return MACE_SUCCESS; } -template -MaceStatus DepthwiseConv2dFunctor::operator()( - const Tensor *input, - const Tensor *filter, /* MIHW */ - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - index_t kernel_h = filter->dim(2); - index_t kernel_w = filter->dim(3); 
- if (strides_[0] != strides_[1]) { - LOG(FATAL) << "GPU depthwise conv2d kernel with " - << "filter" << kernel_h << "x" << kernel_w << "," - << " stride " << strides_[0] << "x" << strides_[1] - << " is not implemented yet."; - } - - // Create a fake conv_2d filter to calculate the paddings and output size - std::vector fake_filter_shape(4); - fake_filter_shape[0] = filter->dim(0) * filter->dim(1); - fake_filter_shape[1] = filter->dim(1); - fake_filter_shape[2] = filter->dim(2); - fake_filter_shape[3] = filter->dim(3); - - std::vector output_shape(4); - std::vector paddings(2); - if (paddings_.empty()) { - kernels::CalcNHWCPaddingAndOutputSize( - input->shape().data(), fake_filter_shape.data(), dilations_, strides_, - padding_type_, output_shape.data(), paddings.data()); - } else { - paddings = paddings_; - CalcOutputSize(input->shape().data(), fake_filter_shape.data(), - paddings_.data(), dilations_, strides_, RoundType::FLOOR, - output_shape.data()); - } - - std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - return DepthwiseConv2d( - context_, - &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, - activation_, relux_max_limit_, DataTypeToEnum::value, &input_shape_, - output, future, &kwg_size_, &kernel_error_); -} - -template struct DepthwiseConv2dFunctor; -template struct DepthwiseConv2dFunctor; - +} // namespace depthwise +} // namespace image +} // namespace opencl } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/image/depthwise_conv2d.h b/mace/kernels/opencl/image/depthwise_conv2d.h new file mode 100644 index 0000000000000000000000000000000000000000..8b5568f5ae16ef901df72ebb09d6457e4f4aa08a --- /dev/null +++ b/mace/kernels/opencl/image/depthwise_conv2d.h @@ -0,0 +1,131 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef MACE_KERNELS_OPENCL_IMAGE_DEPTHWISE_CONV2D_H_ +#define MACE_KERNELS_OPENCL_IMAGE_DEPTHWISE_CONV2D_H_ + +#include "mace/kernels/depthwise_conv2d.h" + +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { +namespace depthwise { + +MaceStatus DepthwiseConv2d(OpKernelContext *context, + cl::Kernel *kernel, + const Tensor *input, // NHWC + const Tensor *filter, // HWIM + const Tensor *bias, + const int stride, + const int *paddings, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + std::vector *prev_input_shape, + Tensor *output, + StatsFuture *future, + uint32_t *kwg_size); +} // namespace depthwise + + +template +class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { + public: + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + Tensor *output, + StatsFuture *future) override; + + private: + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; + +template +MaceStatus DepthwiseConv2dKernel::Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *filter, + const 
Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + Tensor *output, + StatsFuture *future) { + index_t kernel_h = filter->dim(2); + index_t kernel_w = filter->dim(3); + if (strides[0] != strides[1]) { + LOG(WARNING) << "OpenCL depthwise conv2d kernel with " + << "filter" << kernel_h << "x" << kernel_w << "," + << " stride " << strides[0] << "x" << strides[1] + << " is not implemented yet, using slow version"; + MACE_NOT_IMPLEMENTED; + } + + // Create a fake conv_2d filter to calculate the paddings and output size + std::vector fake_filter_shape(4); + fake_filter_shape[0] = filter->dim(0) * filter->dim(1); + fake_filter_shape[1] = filter->dim(1); + fake_filter_shape[2] = filter->dim(2); + fake_filter_shape[3] = filter->dim(3); + + std::vector output_shape(4); + std::vector paddings(2); + if (padding_data.empty()) { + kernels::CalcNHWCPaddingAndOutputSize( + input->shape().data(), fake_filter_shape.data(), dilations, strides, + padding_type, output_shape.data(), paddings.data()); + } else { + paddings = padding_data; + CalcOutputSize(input->shape().data(), fake_filter_shape.data(), + padding_data.data(), dilations, strides, RoundType::FLOOR, + output_shape.data()); + } + + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + return depthwise::DepthwiseConv2d( + context, &kernel_, input, filter, bias, strides[0], paddings.data(), + dilations, activation, relux_max_limit, DataTypeToEnum::value, + &input_shape_, output, future, &kwg_size_); +} + +} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_DEPTHWISE_CONV2D_H_ diff --git a/mace/kernels/opencl/image/eltwise.h b/mace/kernels/opencl/image/eltwise.h new file 
mode 100644 index 0000000000000000000000000000000000000000..2a18cbef204e374cf6332dac7a8b1e996d2f72f5 --- /dev/null +++ b/mace/kernels/opencl/image/eltwise.h @@ -0,0 +1,183 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef MACE_KERNELS_OPENCL_IMAGE_ELTWISE_H_ +#define MACE_KERNELS_OPENCL_IMAGE_ELTWISE_H_ + +#include "mace/kernels/eltwise.h" + +#include +#include +#include +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { + +template +class EltwiseKernel : public OpenCLEltwiseKernel { + public: + explicit EltwiseKernel( + const EltwiseType type, + const std::vector &coeff, + const float scalar_input, + const int32_t scalar_input_index) + : type_(type), + coeff_(coeff), + scalar_input_(scalar_input), + scalar_input_index_(scalar_input_index) {} + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input0, + const Tensor *input1, + Tensor *output, + StatsFuture *future) override; + + private: + EltwiseType type_; + std::vector coeff_; + float scalar_input_; + int32_t scalar_input_index_; + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; + +template +MaceStatus EltwiseKernel::Compute( + OpKernelContext *context, + const Tensor *input0, + const Tensor *input1, + Tensor *output, + StatsFuture *future) { + bool swapped = false; + if (input1 != nullptr) { + 
MACE_CHECK(input0->dim_size() == input1->dim_size() || + input0->dim_size() == 1 || input1->dim_size() == 1) + << "Inputs of Eltwise op must be same shape"; + if (input0->size() != input1->size()) { + if (input0->size() < input1->size()) { + std::swap(input0, input1); + swapped = true; + } + if (input1->dim_size() == 1) { + MACE_CHECK(input0->dim(3) == input1->dim(0)) + << "Element-Wise op only support channel dimension broadcast"; + } else { + MACE_CHECK((input0->dim(0) == input1->dim(0) || input1->dim(0) == 1) && + input0->dim(3) == input1->dim(3) && input1->dim(1) == 1 && + input1->dim(2) == 1) + << "Element-Wise op only support channel dimension broadcast"; + } + } + } + + if (scalar_input_index_ == 0) { + swapped = !swapped; + } + + std::vector output_shape(4); + output_shape[0] = input0->dim(0); + output_shape[1] = input0->dim(1); + output_shape[2] = input0->dim(2); + output_shape[3] = input0->dim(3); + + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channels = output->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t batch_height_pixels = batch * height; + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(batch_height_pixels)}; + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + auto dt = DataTypeToEnum::value; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); + built_options.emplace("-Deltwise=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + 
DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); + if (input1 == nullptr) { + built_options.emplace("-DINPUT_TYPE=1"); + } else if (input0->size() != input1->size()) { + if (input1->dim(0) == 1 || input1->dim_size() == 1) + built_options.emplace("-DINPUT_TYPE=3"); + else + built_options.emplace("-DINPUT_TYPE=2"); + if (swapped) built_options.emplace("-DSWAPPED"); + } + if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input0->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input0->opencl_image())); + if (input1 == nullptr) { + kernel_.setArg(idx++, scalar_input_); + } else { + kernel_.setArg(idx++, *(input1->opencl_image())); + } + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(channels)); + if (!coeff_.empty()) { + kernel_.setArg(idx++, coeff_[0]); + kernel_.setArg(idx++, coeff_[1]); + } + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input0->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); + MACE_OUT_OF_RANGE_VALIDATION; + return MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_ELTWISE_H_ diff --git a/mace/kernels/opencl/image/fully_connected.h b/mace/kernels/opencl/image/fully_connected.h new file mode 100644 index 
0000000000000000000000000000000000000000..d0d921d87021968a95d2a6359534719ca1183b56 --- /dev/null +++ b/mace/kernels/opencl/image/fully_connected.h @@ -0,0 +1,190 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef MACE_KERNELS_OPENCL_IMAGE_FULLY_CONNECTED_H_ +#define MACE_KERNELS_OPENCL_IMAGE_FULLY_CONNECTED_H_ + +#include "mace/kernels/fully_connected.h" + +#include +#include +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { + +template +class FullyConnectedKernel : public OpenCLFullyConnectedKernel { + public: + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *weight, + const Tensor *bias, + const ActivationType activation, + const float relux_max_limit, + Tensor *output, + StatsFuture *future) override; + + private: + cl::Kernel kernel_; + std::vector gws_; + std::vector lws_; + std::vector input_shape_; +}; + +template +MaceStatus FullyConnectedKernel::Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *weight, + const Tensor *bias, + const ActivationType activation, + const float relux_max_limit, + Tensor *output, + StatsFuture *future) { + std::vector output_shape = {input->dim(0), 1, 1, weight->dim(0)}; + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, + &output_image_shape); + 
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + const index_t batch = output->dim(0); + const index_t output_size = output->dim(3); + const index_t output_blocks = RoundUpDiv4(output_size); + + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + auto dt = DataTypeToEnum::value; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width"); + built_options.emplace("-Dfully_connected_width=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + if (bias != nullptr) { + built_options.emplace("-DBIAS"); + } + switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation; + } + if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { + built_options.emplace("-DNON_QUALCOMM_ADRENO"); + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name, + built_options, &kernel_)); + + const uint32_t kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + + if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + const uint32_t wave_size = + static_cast(runtime->GetKernelWaveSize(kernel_)); + + gws_ = {4, (wave_size / 4), static_cast(batch * output_blocks)}; + + const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]); + lws_ = {gws_[0], gws_[1], inter_local_blks}; + } else { + gws_ = {4, 8, static_cast(batch * output_blocks)}; + + const uint32_t inter_local_blks = kwg_size / (gws_[0] * 
gws_[1]); + lws_ = {gws_[0], gws_[1], inter_local_blks}; + } + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + const index_t batch = output->dim(0); + const index_t output_blocks = RoundUpDiv4(output->dim(3)); + gws_[2] = static_cast(batch * output_blocks); + + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws_); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(weight->opencl_image())); + if (bias != nullptr) { + kernel_.setArg(idx++, *(bias->opencl_image())); + } + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, (lws_[0] * lws_[1] * lws_[2] * sizeof(float)), + nullptr); + kernel_.setArg(idx++, static_cast(input->dim(1))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(RoundUpDiv4(input->dim(3)))); + kernel_.setArg(idx++, static_cast(output_blocks)); + kernel_.setArg(idx++, relux_max_limit); + + input_shape_ = input->shape(); + } + cl::Event event; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws_[0], gws_[1], gws_[2]), + cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event); + } else { + std::vector roundup_gws(lws_.size()); + for (size_t i = 0; i < lws_.size(); ++i) { + roundup_gws[i] = RoundUp(gws_[i], lws_[i]); + } + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event); + } + MACE_OUT_OF_RANGE_VALIDATION; + MACE_CL_RET_STATUS(error); + + if (future != nullptr) { + future->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + + return MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace kernels +} // 
namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_FULLY_CONNECTED_H_ diff --git a/mace/kernels/opencl/image_to_buffer.cc b/mace/kernels/opencl/image/image_to_buffer.h similarity index 60% rename from mace/kernels/opencl/image_to_buffer.cc rename to mace/kernels/opencl/image/image_to_buffer.h index b98e9fb2ac77ee785ac21f17360fa998b37f537f..0a345bf584a58c79d4b5a706181dd70aefeb8d7a 100644 --- a/mace/kernels/opencl/image_to_buffer.cc +++ b/mace/kernels/opencl/image/image_to_buffer.h @@ -12,24 +12,47 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/kernels/image_to_buffer.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" +#ifndef MACE_KERNELS_OPENCL_IMAGE_IMAGE_TO_BUFFER_H_ +#define MACE_KERNELS_OPENCL_IMAGE_IMAGE_TO_BUFFER_H_ +#include +#include +#include + +#include "mace/kernels/buffer_inverse_transform.h" #include "mace/kernels/opencl/helper.h" namespace mace { namespace kernels { +namespace opencl { +namespace image { + +template +class ImageToBuffer : public OpenCLBufferInverseTransformKernel { + public: + MaceStatus Compute(OpKernelContext *context, + const Tensor *input, + const BufferType type, + const int wino_blk_size, + Tensor *output, + StatsFuture *future) override; + + private: + cl::Kernel kernel_; + std::vector input_shape_; +}; template -MaceStatus ImageToBufferFunctor::operator()( - const Tensor *image, - const BufferType type, - Tensor *buffer, - StatsFuture *future) { - auto formatted_buffer_shape = FormatBufferShape(image->shape(), type); +MaceStatus ImageToBuffer::Compute(OpKernelContext *context, + const Tensor *input, + const BufferType type, + const int wino_blk_size, + Tensor *output, + StatsFuture *future) { + auto formatted_buffer_shape = FormatBufferShape(input->shape(), type); std::vector image_shape; - CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size_); - MACE_RETURN_IF_ERROR(buffer->Resize(image->shape())); + 
CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size); + MACE_RETURN_IF_ERROR(output->Resize(input->shape())); uint32_t gws[2] = {static_cast(image_shape[0]), static_cast(image_shape[1])}; @@ -49,9 +72,9 @@ MaceStatus ImageToBufferFunctor::operator()( break; case WINOGRAD_FILTER: { std::stringstream ss_tmp; - gws[1] /= (wino_blk_size_ + 2) * (wino_blk_size_ + 2); + gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2); ss_tmp << "winograd_filter_image_to_buffer_" - << wino_blk_size_ << "x" << wino_blk_size_; + << wino_blk_size << "x" << wino_blk_size; kernel_name = ss_tmp.str(); break; } @@ -67,17 +90,18 @@ MaceStatus ImageToBufferFunctor::operator()( break; } - auto runtime = context_->device()->opencl_runtime(); + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; built_options.emplace(kernel_name_ss.str()); - if (buffer->dtype() == image->dtype()) { + if (output->dtype() == input->dtype()) { built_options.emplace( "-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + @@ -94,35 +118,36 @@ MaceStatus ImageToBufferFunctor::operator()( &kernel_)); } - if (!IsVecEqual(input_shape_, image->shape())) { + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_2D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(buffer->opencl_buffer())); + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_2D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(output->opencl_buffer())); if (type == CONV2D_FILTER) { const index_t - inner_size = buffer->dim(1) * 
buffer->dim(2) * buffer->dim(3); - kernel_.setArg(idx++, static_cast(buffer->dim(0))); - kernel_.setArg(idx++, static_cast(buffer->dim(2))); - kernel_.setArg(idx++, static_cast(buffer->dim(3))); + inner_size = output->dim(1) * output->dim(2) * output->dim(3); + kernel_.setArg(idx++, static_cast(output->dim(0))); + kernel_.setArg(idx++, static_cast(output->dim(2))); + kernel_.setArg(idx++, static_cast(output->dim(3))); kernel_.setArg(idx++, static_cast(inner_size)); } else if (type == ARGUMENT) { - kernel_.setArg(idx++, static_cast(buffer->dim(0))); + kernel_.setArg(idx++, static_cast(output->dim(0))); } else if (type == WEIGHT_HEIGHT) { - kernel_.setArg(idx++, static_cast(buffer->dim(0))); - kernel_.setArg(idx++, static_cast(buffer->dim(1))); - kernel_.setArg(idx++, static_cast(buffer->dim(2))); - kernel_.setArg(idx++, static_cast(buffer->dim(3))); + kernel_.setArg(idx++, static_cast(output->dim(0))); + kernel_.setArg(idx++, static_cast(output->dim(1))); + kernel_.setArg(idx++, static_cast(output->dim(2))); + kernel_.setArg(idx++, static_cast(output->dim(3))); } else { kernel_.setArg(idx++, - static_cast(formatted_buffer_shape[1])); + static_cast(formatted_buffer_shape[1])); kernel_.setArg(idx++, - static_cast(formatted_buffer_shape[2])); + static_cast(formatted_buffer_shape[2])); kernel_.setArg(idx++, - static_cast(formatted_buffer_shape[3])); + static_cast(formatted_buffer_shape[3])); } - kernel_.setArg(idx++, *(image->opencl_image())); - input_shape_ = image->shape(); + kernel_.setArg(idx++, *(input->opencl_image())); + input_shape_ = input->shape(); } const uint32_t kwg_size = @@ -146,7 +171,7 @@ MaceStatus ImageToBufferFunctor::operator()( cl::NDRange(lws[0], lws[1]), nullptr, &event); } MACE_CL_RET_STATUS(error); - OUT_OF_RANGE_VALIDATION(kernel_error_); + MACE_OUT_OF_RANGE_VALIDATION; if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { event.wait(); @@ -159,8 +184,9 @@ MaceStatus ImageToBufferFunctor::operator()( return 
MACE_SUCCESS; } -template struct ImageToBufferFunctor; -template struct ImageToBufferFunctor; - +} // namespace image +} // namespace opencl } // namespace kernels } // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_IMAGE_TO_BUFFER_H_ diff --git a/mace/kernels/opencl/image/lstm_cell.h b/mace/kernels/opencl/image/lstm_cell.h new file mode 100644 index 0000000000000000000000000000000000000000..2b7d41d48f5f86415415525d5644df4e9531b1e2 --- /dev/null +++ b/mace/kernels/opencl/image/lstm_cell.h @@ -0,0 +1,140 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#ifndef MACE_KERNELS_OPENCL_IMAGE_LSTM_CELL_H_ +#define MACE_KERNELS_OPENCL_IMAGE_LSTM_CELL_H_ + +#include +#include +#include +#include + +#include "mace/kernels/lstmcell.h" +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { + +template +class LSTMCellKernel : public OpenCLLSTMCellKernel { + public: + explicit LSTMCellKernel( + const T forget_bias) + : forget_bias_(forget_bias) {} + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *pre_output, + const Tensor *weight, + const Tensor *bias, + const Tensor *pre_cell, + Tensor *cell, + Tensor *output, + StatsFuture *future) override; + + private: + T forget_bias_; + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; + +template +MaceStatus LSTMCellKernel::Compute( + OpKernelContext *context, + const Tensor *input, + const Tensor *pre_output, + const Tensor *weight, + const Tensor *bias, + const Tensor *pre_cell, + Tensor *cell, + Tensor *output, + StatsFuture *future) { + MACE_CHECK(pre_output->dim_size() == 2 && pre_output->dim(1) % 4 == 0, + "LSTM hidden units should be a multiple of 4"); + + const index_t height = input->dim(0); + const index_t width = input->dim(1); + const index_t hidden_units = pre_output->dim(1); + const index_t w_blocks = hidden_units >> 2; + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + auto dt = DataTypeToEnum::value; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell"); + built_options.emplace("-Dlstmcell=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("lstmcell", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + 
static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[2] = {static_cast(w_blocks), + static_cast(height)}; + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + std::vector output_shape_padded = {height, 1, 1, hidden_units}; + std::vector output_image_shape; + CalImage2DShape(output_shape_padded, BufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(pre_output->shape(), + output_image_shape)); + MACE_RETURN_IF_ERROR(cell->ResizeImage(pre_cell->shape(), + output_image_shape)); + + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_2D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(pre_output->opencl_image())); + kernel_.setArg(idx++, *(weight->opencl_image())); + kernel_.setArg(idx++, *(bias->opencl_image())); + kernel_.setArg(idx++, *(pre_cell->opencl_image())); + kernel_.setArg(idx++, static_cast(forget_bias_)); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(hidden_units)); + kernel_.setArg(idx++, static_cast(RoundUpDiv4(width))); + kernel_.setArg(idx++, *(cell->opencl_image())); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + const std::vector lws = {kwg_size_ / 16, 16, 0}; + std::string tuning_key = + Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1)); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); + MACE_OUT_OF_RANGE_VALIDATION; + + return MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_LSTM_CELL_H_ diff --git a/mace/kernels/opencl/image/matmul.h b/mace/kernels/opencl/image/matmul.h new file mode 100644 index 0000000000000000000000000000000000000000..751887529c47fca09dc1f58a8beafc7708539b98 --- /dev/null +++ b/mace/kernels/opencl/image/matmul.h @@ 
-0,0 +1,129 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef MACE_KERNELS_OPENCL_IMAGE_MATMUL_H_ +#define MACE_KERNELS_OPENCL_IMAGE_MATMUL_H_ + +#include "mace/kernels/matmul.h" + +#include +#include +#include +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { + +template +class MatMulKernel : public OpenCLMatMulKernel { + public: + MaceStatus Compute( + OpKernelContext *context, + const Tensor *A, + const Tensor *B, + Tensor *C, + bool transpose_a, + bool transpose_b, + StatsFuture *future) override; + + private: + cl::Kernel kernel_; + uint32_t kwg_size_; +}; + +template +MaceStatus MatMulKernel::Compute( + OpKernelContext *context, + const Tensor *A, + const Tensor *B, + Tensor *C, + bool transpose_a, + bool transpose_b, + StatsFuture *future) { + MACE_UNUSED(future); + MACE_CHECK(!transpose_a && !transpose_b, + "GPU does not support transpose matmul"); + + index_t rank = A->dim_size(); + index_t height = A->dim(rank - 2); + index_t K = A->dim(rank - 1); + index_t width = B->dim(rank - 1); + index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1, + std::multiplies()); + + std::vector c_shape = A->shape(); + c_shape[rank - 2] = height; + c_shape[rank - 1] = width; + std::vector c_image_shape; + std::vector padded_c_shape = {batch, height, width, 1}; + 
CalImage2DShape(padded_c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape); + MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape)); + + const index_t height_blocks = RoundUpDiv4(height); + const index_t width_blocks = RoundUpDiv4(width); + const uint32_t gws[2] = { + static_cast(width_blocks), + static_cast(height_blocks * batch), + }; + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + auto dt = DataTypeToEnum::value; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); + built_options.emplace("-Dmatmul=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_2D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(A->opencl_image())); + kernel_.setArg(idx++, *(B->opencl_image())); + kernel_.setArg(idx++, *(C->opencl_image())); + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(K)); + kernel_.setArg(idx++, static_cast(height_blocks)); + kernel_.setArg(idx++, static_cast(RoundUpDiv4(K))); + + const std::vector lws = {kwg_size_ / 64, 64, 0}; + std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); + + MACE_OUT_OF_RANGE_VALIDATION; + return MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_MATMUL_H_ diff --git 
a/mace/kernels/opencl/image/pad.h b/mace/kernels/opencl/image/pad.h new file mode 100644 index 0000000000000000000000000000000000000000..1533b6d2b8ae850f1144cb5f2e5f78bf6120de0f --- /dev/null +++ b/mace/kernels/opencl/image/pad.h @@ -0,0 +1,136 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef MACE_KERNELS_OPENCL_IMAGE_PAD_H_ +#define MACE_KERNELS_OPENCL_IMAGE_PAD_H_ + +#include "mace/kernels/pad.h" + +#include +#include +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { + +template +class PadKernel : public OpenCLPadKernel { + public: + PadKernel(const std::vector &paddings, + const float constant_value) + : paddings_(paddings), constant_value_(constant_value) {} + + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) override; + + private: + std::vector paddings_; + float constant_value_; + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; + +template +MaceStatus PadKernel::Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) { + MACE_CHECK(this->paddings_.size() == + static_cast((input->dim_size() * 2))); + MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) && + (this->paddings_[6] == 0) && (this->paddings_[7] == 0)) + << "Mace only support height/width 
dimension now"; + auto input_shape = input->shape(); + std::vector output_shape = { + input_shape[0] + this->paddings_[0] + this->paddings_[1], + input_shape[1] + this->paddings_[2] + this->paddings_[3], + input_shape[2] + this->paddings_[4] + this->paddings_[5], + input_shape[3] + this->paddings_[6] + this->paddings_[7]}; + + std::vector image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); + + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channels = output->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad"); + built_options.emplace("-Dpad=" + kernel_name); + auto dt = DataTypeToEnum::value; + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + MACE_OUT_OF_RANGE_INIT(kernel_); + + if (!IsVecEqual(input_shape_, input->shape())) { + int idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, this->constant_value_); + kernel_.setArg(idx++, static_cast(input_shape[1])); + kernel_.setArg(idx++, static_cast(input_shape[2])); + kernel_.setArg(idx++, static_cast(output_shape[1])); + kernel_.setArg(idx++, 
this->paddings_[2]); + kernel_.setArg(idx++, this->paddings_[4]); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = Concat("pad", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); + + MACE_OUT_OF_RANGE_VALIDATION; + return MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_PAD_H_ diff --git a/mace/kernels/opencl/image/pooling.h b/mace/kernels/opencl/image/pooling.h new file mode 100644 index 0000000000000000000000000000000000000000..8b11475e7131fa80fbd39e48a570f761c5c15c53 --- /dev/null +++ b/mace/kernels/opencl/image/pooling.h @@ -0,0 +1,187 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#ifndef MACE_KERNELS_OPENCL_IMAGE_POOLING_H_ +#define MACE_KERNELS_OPENCL_IMAGE_POOLING_H_ + +#include "mace/kernels/pooling.h" + +#include +#include +#include +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { +namespace pooling { +inline std::vector LocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(4, 0); + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; + } else { + uint64_t + cache_size = runtime->device_global_mem_cache_size(); + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + lws[2] = + std::min(std::min(gws[2], base), kwg_size / lws[1]); + const uint32_t lws_size = lws[1] * lws[2]; + lws[0] = gws[0] / 4; + if (lws[0] == 0) { + lws[0] = gws[0]; + } + lws[0] = std::max(std::min(lws[0], kwg_size / lws_size), + 1); + } + return lws; +} +} // namespace pooling + + +template +class PoolingKernel : public OpenCLPoolingKernel { + public: + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const PoolingType pooling_type, + const int *kernels, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + Tensor *output, + StatsFuture *future) override; + + private: + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; + +template +MaceStatus PoolingKernel::Compute( + OpKernelContext *context, + const Tensor *input, + const PoolingType pooling_type, + const int *kernels, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + Tensor *output, + StatsFuture *future) { + MACE_CHECK(dilations[0] == 1 && dilations[1] == 1) + << "Pooling opencl kernel not support dilation yet"; + + std::vector output_shape(4); + std::vector filter_shape = {input->dim(3), input->dim(3), + kernels[0], kernels[1]}; + + std::vector 
paddings(2); + if (padding_data.empty()) { + kernels::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter_shape.data(), dilations, strides, + padding_type, output_shape.data(), paddings.data()); + } else { + paddings = padding_data; + CalcOutputSize(input->shape().data(), filter_shape.data(), + padding_data.data(), dilations, strides, RoundType::CEIL, + output_shape.data()); + } + + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + const DataType dt = DataTypeToEnum::value; + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); + built_options.emplace("-Dpooling=" + kernel_name); + + if (pooling_type == MAX && input->dtype() == output->dtype()) { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + } else { + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + } + if (pooling_type == AVG) { + built_options.emplace("-DPOOL_AVG"); + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling", + kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[3] = { + static_cast(RoundUpDiv4(output->dim(3))), + static_cast(output->dim(2)), + static_cast(output->dim(0) * output->dim(1)), + }; + MACE_OUT_OF_RANGE_INIT(kernel_); + + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, 
static_cast(input->dim(1))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(output->dim(1))); + kernel_.setArg(idx++, paddings[0] / 2); + kernel_.setArg(idx++, paddings[1] / 2); + kernel_.setArg(idx++, strides[0]); + kernel_.setArg(idx++, strides[1]); + kernel_.setArg(idx++, kernels[0]); + kernel_.setArg(idx++, kernels[1]); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + const std::vector lws = pooling::LocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); + + MACE_OUT_OF_RANGE_VALIDATION; + return MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_POOLING_H_ diff --git a/mace/kernels/opencl/image/reduce_mean.h b/mace/kernels/opencl/image/reduce_mean.h new file mode 100644 index 0000000000000000000000000000000000000000..3c826540ff026315fec84980114d6ad0eaf19cec --- /dev/null +++ b/mace/kernels/opencl/image/reduce_mean.h @@ -0,0 +1,177 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#ifndef MACE_KERNELS_OPENCL_IMAGE_REDUCE_MEAN_H_ +#define MACE_KERNELS_OPENCL_IMAGE_REDUCE_MEAN_H_ + +#include "mace/kernels/reduce_mean.h" + +#include +#include +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { + +template +class ReduceMeanKernel : public OpenCLReduceMeanKernel { + public: + ReduceMeanKernel(const std::vector axis, + const bool keep_dims) + : axis_(axis), keep_dims_(keep_dims) {} + + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) override; + + private: + const std::vector axis_; + bool keep_dims_; + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; + +template +MaceStatus ReduceMeanKernel::Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) { + MACE_CHECK_NOTNULL(input); +// MACE_CHECK(keep_dims_, "reduce mean gpu only support keep dims."); + MACE_CHECK(input->dim_size() == 4, + "reduce mean gpu only support 4-dim input"); + MACE_CHECK(axis_.size() == 2 && axis_[0] == 1 && axis_[1] == 2, + "reduce mean gpu only support 1,2-axis reduce"); + index_t batch = input->dim(0); + const index_t in_height = input->dim(1); + const index_t in_width = input->dim(2); + const index_t channels = input->dim(3); + const index_t channel_blocks = RoundUpDiv4(channels); + const uint32_t image_size = static_cast(in_height * in_width); + + std::vector gws(3); + std::vector lws(3); + std::vector output_shape{batch, 1, 1, channels}; + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + const DataType dt = DataTypeToEnum::value; + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; 
+ MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean"); + built_options.emplace("-Dreduce_mean=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { + built_options.emplace("-DNON_QUALCOMM_ADRENO"); + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce_mean", + kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) { + const uint32_t wave_size = + static_cast(runtime->GetKernelWaveSize(kernel_)); + gws = {4, (wave_size / 4), static_cast(batch * channel_blocks)}; + } else { + gws = {4, 16, static_cast(batch * channel_blocks)}; + } + lws = {gws[0], gws[1], 1}; + const int group_size = lws[0] * lws[1] * lws[2]; + const int partial_len = (image_size + group_size - 1) / group_size; + const int remain_index = image_size % group_size; + const float in_width_reciprocal = 1.f / in_width; + const float img_size_reciprocal = 1.f / (in_width * in_height); + const float channel_blk_reciprocal = 1.f / channel_blocks; + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, (group_size * 4 * sizeof(T)), + nullptr); + kernel_.setArg(idx++, static_cast(group_size)); + kernel_.setArg(idx++, static_cast(partial_len)); + kernel_.setArg(idx++, static_cast(remain_index)); + kernel_.setArg(idx++, static_cast(batch)); + kernel_.setArg(idx++, static_cast(in_height)); + kernel_.setArg(idx++, static_cast(in_width)); + kernel_.setArg(idx++, img_size_reciprocal); + kernel_.setArg(idx++, in_width_reciprocal); + kernel_.setArg(idx++, static_cast(channel_blocks)); + 
kernel_.setArg(idx++, channel_blk_reciprocal); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + cl::Event event; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } else { + std::vector roundup_gws(lws.size()); + for (size_t i = 0; i < lws.size(); ++i) { + roundup_gws[i] = RoundUp(gws[i], lws[i]); + } + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } + MACE_CL_RET_STATUS(error); + MACE_OUT_OF_RANGE_VALIDATION; + + if (future != nullptr) { + future->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + + return MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_REDUCE_MEAN_H_ diff --git a/mace/kernels/opencl/image/resize_bicubic.h b/mace/kernels/opencl/image/resize_bicubic.h new file mode 100644 index 0000000000000000000000000000000000000000..669f644539d6e5cd97a1561003068c69cec13f20 --- /dev/null +++ b/mace/kernels/opencl/image/resize_bicubic.h @@ -0,0 +1,173 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef MACE_KERNELS_OPENCL_IMAGE_RESIZE_BICUBIC_H_ +#define MACE_KERNELS_OPENCL_IMAGE_RESIZE_BICUBIC_H_ + +#include "mace/kernels/resize_bicubic.h" + +#include +#include +#include +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { +namespace resize_bicubic { +inline std::vector LocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(4, 0); + uint64_t cache_size = runtime->device_global_mem_cache_size(); + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + if (lws[1] >= base) { + lws[0] = std::min(gws[0], base); + } else { + lws[0] = gws[0] / 8; + if (lws[0] == 0) { + lws[0] = gws[0]; + } + } + lws[0] = std::min(lws[0], kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = gws[2] / 8; + if (lws[2] == 0) { + lws[2] = gws[2]; + } + lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), + 1); + return lws; +} + +} // namespace resize_bicubic + +template +class ResizeBicubicKernel : public OpenCLResizeBicubicKernel { + public: + ResizeBicubicKernel(bool align_corners, + const index_t out_height, + const index_t out_width) + : align_corners_(align_corners), + out_height_(out_height), + out_width_(out_width) {} + + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) override; + + private: + bool align_corners_; + index_t out_height_; + index_t out_width_; + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; + +template +MaceStatus ResizeBicubicKernel::Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) { + const index_t batch = input->dim(0); + const index_t in_height = input->dim(1); + const index_t in_width = 
input->dim(2); + const index_t channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t out_height = out_height_; + const index_t out_width = out_width_; + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(out_width), + static_cast(out_height * batch)}; + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + auto dt = DataTypeToEnum::value; + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache"); + built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace(MakeString("-DTABLE_SIZE=", kTableSize)); + MACE_RETURN_IF_ERROR( + runtime->BuildKernel("resize_bicubic", + kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + MACE_CHECK(out_height > 0 && out_width > 0); + std::vector output_shape{batch, out_height, out_width, channels}; + + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + float height_scale = + CalculateResizeScale(in_height, out_height, align_corners_); + float width_scale = + CalculateResizeScale(in_width, out_width, align_corners_); + + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, height_scale); + kernel_.setArg(idx++, width_scale); + kernel_.setArg(idx++, 
static_cast(in_height)); + kernel_.setArg(idx++, static_cast(in_width)); + kernel_.setArg(idx++, static_cast(out_height)); + + input_shape_ = input->shape(); + } + + const std::vector + lws = resize_bicubic::LocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); + + MACE_OUT_OF_RANGE_VALIDATION; + return MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_RESIZE_BICUBIC_H_ diff --git a/mace/kernels/opencl/image/resize_bilinear.h b/mace/kernels/opencl/image/resize_bilinear.h new file mode 100644 index 0000000000000000000000000000000000000000..459babc9f4aacff9e049f38cd1dd2fc5f62343df --- /dev/null +++ b/mace/kernels/opencl/image/resize_bilinear.h @@ -0,0 +1,176 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#ifndef MACE_KERNELS_OPENCL_IMAGE_RESIZE_BILINEAR_H_ +#define MACE_KERNELS_OPENCL_IMAGE_RESIZE_BILINEAR_H_ + +#include "mace/kernels/resize_bilinear.h" + +#include +#include +#include +#include +#include + +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { +namespace opencl { +namespace image { +namespace resize_bilinear { +inline std::vector LocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(4, 0); + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; + } else { + uint64_t + cache_size = runtime->device_global_mem_cache_size(); + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + if (lws[1] >= base) { + lws[0] = std::min(gws[0], base); + } else { + lws[0] = gws[0] / 8; + if (lws[0] == 0) { + lws[0] = gws[0]; + } + } + lws[0] = std::min(lws[0], kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = gws[2] / 8; + if (lws[2] == 0) { + lws[2] = gws[2]; + } + lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), + 1); + } + return lws; +} + +} // namespace resize_bilinear + +template +class ResizeBilinearKernel : public OpenCLResizeBilinearKernel { + public: + ResizeBilinearKernel(bool align_corners, + const index_t out_height, + const index_t out_width) + : align_corners_(align_corners), + out_height_(out_height), + out_width_(out_width) {} + + MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) override; + + private: + bool align_corners_; + index_t out_height_; + index_t out_width_; + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; + +template +MaceStatus ResizeBilinearKernel::Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) { + const index_t batch = input->dim(0); + const index_t in_height = input->dim(1); + const index_t in_width = input->dim(2); + const index_t 
channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t out_height = out_height_; + const index_t out_width = out_width_; + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(out_width), + static_cast(out_height * batch)}; + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); + built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); + auto dt = DataTypeToEnum::value; + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + MACE_RETURN_IF_ERROR( + runtime->BuildKernel("resize_bilinear", + kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + MACE_CHECK(out_height > 0 && out_width > 0); + std::vector output_shape{batch, out_height, out_width, channels}; + + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + float height_scale = + CalculateResizeScale(in_height, out_height, align_corners_); + float width_scale = + CalculateResizeScale(in_width, out_width, align_corners_); + + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, height_scale); + kernel_.setArg(idx++, width_scale); + kernel_.setArg(idx++, static_cast(in_height)); + kernel_.setArg(idx++, static_cast(in_width)); + kernel_.setArg(idx++, 
static_cast(out_height)); + + input_shape_ = input->shape(); + } + + const std::vector + lws = resize_bilinear::LocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); + + MACE_OUT_OF_RANGE_VALIDATION; + return MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_IMAGE_RESIZE_BILINEAR_H_ diff --git a/mace/kernels/opencl/image/softmax.h b/mace/kernels/opencl/image/softmax.h new file mode 100644 index 0000000000000000000000000000000000000000..0c3aa61862a39c3d52a1db06406a7c3e46fbfd2f --- /dev/null +++ b/mace/kernels/opencl/image/softmax.h @@ -0,0 +1,151 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#ifndef MACE_KERNELS_OPENCL_IMAGE_SOFTMAX_H_
+#define MACE_KERNELS_OPENCL_IMAGE_SOFTMAX_H_
+
+#include "mace/kernels/softmax.h"
+
+#include <algorithm>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "mace/kernels/opencl/helper.h"
+
+namespace mace {
+namespace kernels {
+namespace opencl {
+namespace image {
+namespace softmax {
+
+// Chooses the local work-group size used to launch the softmax kernel.
+//
+// gws points at the three global work sizes and kwg_size is the kernel's
+// maximum work-group size. The returned vector has four entries; only the
+// first three are assigned, the fourth stays 0. When kwg_size is 0 every
+// assigned entry is 1.
+inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                                     const uint32_t *gws,
+                                     const uint32_t kwg_size) {
+  std::vector<uint32_t> lws(4, 0);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    const uint64_t cache_size = runtime->device_global_mem_cache_size();
+    const uint32_t base =
+        std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    // lws[1] and lws[0] are used as divisors below, so both are clamped to
+    // at least 1; a zero entry in gws would otherwise divide by zero.
+    lws[1] = std::max<uint32_t>(std::min<uint32_t>(gws[1], kwg_size), 1);
+    if (gws[0] < base) {
+      lws[0] = gws[0];
+    } else {
+      lws[0] = gws[0] / base;
+    }
+    lws[0] = std::max<uint32_t>(
+        std::min<uint32_t>(lws[0], kwg_size / lws[1]), 1);
+    lws[2] = std::max<uint32_t>(
+        std::min<uint32_t>(gws[2], kwg_size / (lws[0] * lws[1])), 1);
+  }
+  return lws;
+}
+}  // namespace softmax
+
+// Image-memory softmax kernel. The OpenCL kernel is built on the first call
+// to Compute() and its arguments are set again only when the shape of the
+// logits tensor changes.
+template <typename T>
+class SoftmaxKernel : public OpenCLSoftmaxKernel {
+ public:
+  MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *logits,
+      Tensor *output,
+      StatsFuture *future) override;
+
+ private:
+  cl::Kernel kernel_;
+  uint32_t kwg_size_;                 // assigned when kernel_ is built
+  std::vector<index_t> input_shape_;  // logits shape the args were set for
+};
+
+// Runs the "softmax" kernel on logits and writes into output.
+//
+// logits must be 2-D [batch, channels] or 4-D [batch, height, width,
+// channels]; any other rank hits MACE_NOT_IMPLEMENTED. output is not resized
+// here. Returns MACE_SUCCESS, or the status propagated from building or
+// running the kernel.
+template <typename T>
+MaceStatus SoftmaxKernel<T>::Compute(
+    OpKernelContext *context,
+    const Tensor *logits,
+    Tensor *output,
+    StatsFuture *future) {
+  index_t batch = 0;
+  index_t height = 0;
+  index_t width = 0;
+  index_t channels = 0;
+
+  if (logits->dim_size() == 2) {
+    batch = logits->dim(0);
+    height = 1;
+    width = 1;
+    channels = logits->dim(1);
+
+  } else if (logits->dim_size() == 4) {
+    batch = logits->dim(0);
+    height = logits->dim(1);
+    width = logits->dim(2);
+    channels = logits->dim(3);
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+
+  const index_t channel_blocks = RoundUpDiv4(channels);
+  // Number of padding channels in the last block of four.
+  const int remain_channels = channel_blocks * 4 - channels;
+
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+
+  auto runtime = context->device()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION;
+
+  if (kernel_.get() == nullptr) {
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
+    built_options.emplace("-Dsoftmax=" + kernel_name);
+    auto dt = DataTypeToEnum<T>::value;
+    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
+                                              built_options, &kernel_));
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
+  }
+  MACE_OUT_OF_RANGE_INIT(kernel_);
+  if (!IsVecEqual(input_shape_, logits->shape())) {
+    uint32_t idx = 0;
+    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
+    MACE_SET_3D_GWS_ARGS(kernel_, gws);
+    kernel_.setArg(idx++, *(logits->opencl_image()));
+    kernel_.setArg(idx++, static_cast<int>(channels));
+    kernel_.setArg(idx++, remain_channels);
+    kernel_.setArg(idx++, *(output->opencl_image()));
+
+    input_shape_ = logits->shape();
+  }
+
+  std::vector<uint32_t> lws = softmax::LocalWS(runtime, gws, kwg_size_);
+  std::string tuning_key =
+      Concat("softmax_opencl_kernel", batch, height, width, channels);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
+                                           gws, lws, future));
+
+  MACE_OUT_OF_RANGE_VALIDATION;
+  return MACE_SUCCESS;
+}
+
+}  // namespace image
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_OPENCL_IMAGE_SOFTMAX_H_
diff --git a/mace/kernels/opencl/image/space_to_batch.h b/mace/kernels/opencl/image/space_to_batch.h
new file mode 100644
index 0000000000000000000000000000000000000000..89bcdf6ab4e53ca6ba1d31cd36882a4f36b5949c
--- /dev/null
+++ b/mace/kernels/opencl/image/space_to_batch.h
@@ -0,0 +1,128 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_BATCH_H_
+#define MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_BATCH_H_
+
+#include "mace/kernels/space_to_batch.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "mace/kernels/opencl/helper.h"
+
+namespace mace {
+namespace kernels {
+namespace opencl {
+namespace image {
+
+// Image-memory space_to_batch kernel. The OpenCL kernel is built on the first
+// call to Compute() and its arguments are set again only when the shape of
+// space_tensor changes.
+template <typename T>
+class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel {
+ public:
+  MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *space_tensor,
+      const std::vector<int> &paddings,
+      const std::vector<int> &block_shape,
+      const std::vector<index_t> &output_shape,
+      Tensor *batch_tensor,
+      StatsFuture *future) override;
+
+ private:
+  cl::Kernel kernel_;
+  uint32_t kwg_size_;                 // assigned when kernel_ is built
+  std::vector<index_t> input_shape_;  // input shape the args were set for
+};
+
+// Resizes batch_tensor to output_shape and runs the "space_to_batch" kernel.
+//
+// block_shape[0], block_shape[1], paddings[0] and paddings[2] are forwarded
+// to the kernel, so block_shape needs at least two entries and paddings at
+// least three. Returns MACE_SUCCESS, or the status propagated from resizing
+// the output or from building or running the kernel.
+template <typename T>
+MaceStatus SpaceToBatchKernel<T>::Compute(
+    OpKernelContext *context,
+    const Tensor *space_tensor,
+    const std::vector<int> &paddings,
+    const std::vector<int> &block_shape,
+    const std::vector<index_t> &output_shape,
+    Tensor *batch_tensor,
+    StatsFuture *future) {
+  // Fail loudly instead of indexing past the end of either vector below.
+  MACE_CHECK(block_shape.size() >= 2 && paddings.size() >= 3,
+             "space_to_batch needs 2 block_shape and 3 paddings entries");
+  std::vector<size_t> output_image_shape;
+  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
+                  &output_image_shape);
+  MACE_RETURN_IF_ERROR(
+      batch_tensor->ResizeImage(output_shape, output_image_shape));
+  const char *kernel_name = "space_to_batch";
+  const uint32_t chan_blk = RoundUpDiv4(batch_tensor->dim(3));
+  const uint32_t gws[3] = {
+      chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
+      static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
+
+  auto runtime = context->device()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION;
+
+  if (kernel_.get() == nullptr) {
+    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    std::stringstream kernel_name_ss;
+    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
+    built_options.emplace(kernel_name_ss.str());
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" +
+                          DtToCLCMDDt(DataTypeToEnum<T>::value));
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch",
+                                              obfuscated_kernel_name,
+                                              built_options,
+                                              &kernel_));
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
+  }
+  MACE_OUT_OF_RANGE_INIT(kernel_);
+  if (!IsVecEqual(input_shape_, space_tensor->shape())) {
+    uint32_t idx = 0;
+    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
+    MACE_SET_3D_GWS_ARGS(kernel_, gws);
+
+    kernel_.setArg(idx++, *(space_tensor->opencl_image()));
+    kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
+    kernel_.setArg(idx++, block_shape[0]);
+    kernel_.setArg(idx++, block_shape[1]);
+    kernel_.setArg(idx++, paddings[0]);
+    kernel_.setArg(idx++, paddings[2]);
+    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
+    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
+    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
+    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
+    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
+
+    input_shape_ = space_tensor->shape();
+  }
+
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
+  std::string tuning_key =
+      Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
+             batch_tensor->dim(2), batch_tensor->dim(3));
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
+                                           gws, lws, future));
+
+  MACE_OUT_OF_RANGE_VALIDATION;
+  return MACE_SUCCESS;
+}
+
+}  // namespace image
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_BATCH_H_
diff --git a/mace/kernels/opencl/image/space_to_depth.h b/mace/kernels/opencl/image/space_to_depth.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1247dc31c5f5364d0431d0c7763a247ff5285c5
--- /dev/null
+++ b/mace/kernels/opencl/image/space_to_depth.h
@@ -0,0 +1,138 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_DEPTH_H_
+#define MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_DEPTH_H_
+
+#include "mace/kernels/space_to_depth.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "mace/kernels/opencl/helper.h"
+
+namespace mace {
+namespace kernels {
+namespace opencl {
+namespace image {
+
+// Image-memory space_to_depth kernel. The OpenCL kernel is built on the first
+// call to Compute() and its arguments are set again only when the input
+// shape changes.
+template <typename T>
+class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel {
+ public:
+  explicit SpaceToDepthKernel(const int block_size)
+      : block_size_(block_size) {}
+  MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      Tensor *output,
+      StatsFuture *future) override;
+
+ private:
+  const int block_size_;
+  cl::Kernel kernel_;
+  uint32_t kwg_size_;                 // assigned when kernel_ is built
+  std::vector<index_t> input_shape_;  // input shape the args were set for
+};
+
+// Resizes output to [batch, height / block_size, width / block_size,
+// depth * block_size * block_size] and runs the "space_to_depth" kernel.
+//
+// The input depth must be a multiple of 4 and the input height and width
+// multiples of block_size. Returns MACE_SUCCESS, or the status propagated
+// from resizing the output or from building or running the kernel.
+template <typename T>
+MaceStatus SpaceToDepthKernel<T>::Compute(
+    OpKernelContext *context,
+    const Tensor *input,
+    Tensor *output,
+    StatsFuture *future) {
+  const index_t batch = input->dim(0);
+  const index_t input_height = input->dim(1);
+  const index_t input_width = input->dim(2);
+  const index_t input_depth = input->dim(3);
+
+  // block_size_ is a divisor below; reject 0 and negatives up front.
+  MACE_CHECK(block_size_ > 0, "block_size should be positive");
+  MACE_CHECK((input_depth % 4) == 0,
+             "input channel should be dividable by 4");
+  MACE_CHECK(
+      (input_width % block_size_ == 0) && (input_height % block_size_ == 0),
+      "input width and height should be dividable by block_size");
+
+  const index_t output_height = input_height / block_size_;
+  const index_t output_width = input_width / block_size_;
+  const index_t output_depth = input_depth * block_size_ * block_size_;
+
+  const index_t input_depth_blocks = RoundUpDiv4(input_depth);
+  const index_t output_depth_blocks = RoundUpDiv4(output_depth);
+
+  std::vector<index_t> output_shape = {batch, output_height, output_width,
+                                       output_depth};
+
+  std::vector<size_t> image_shape;
+  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
+  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
+
+  auto runtime = context->device()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION;
+
+  if (kernel_.get() == nullptr) {
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    const char *kernel_name = "space_to_depth";
+    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
+    std::stringstream kernel_name_ss;
+    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
+    built_options.emplace(kernel_name_ss.str());
+    auto dt = DataTypeToEnum<T>::value;
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_depth",
+                                              obfuscated_kernel_name,
+                                              built_options,
+                                              &kernel_));
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
+  }
+
+  // The launch grid is sized from the input, not the output.
+  const uint32_t gws[3] = {static_cast<uint32_t>(input_depth_blocks),
+                           static_cast<uint32_t>(input_width),
+                           static_cast<uint32_t>(input_height * batch)};
+  MACE_OUT_OF_RANGE_INIT(kernel_);
+  if (!IsVecEqual(input_shape_, input->shape())) {
+    uint32_t idx = 0;
+    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
+    MACE_SET_3D_GWS_ARGS(kernel_, gws);
+    kernel_.setArg(idx++, *(input->opencl_image()));
+    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
+    kernel_.setArg(idx++, static_cast<int32_t>(input_width));
+    kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
+    kernel_.setArg(idx++, static_cast<int32_t>(output_height * batch));
+    kernel_.setArg(idx++, static_cast<int32_t>(output_width));
+    kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
+    kernel_.setArg(idx++, *(output->opencl_image()));
+
+    input_shape_ = input->shape();
+  }
+
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
+  std::string tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0),
+                                  input->dim(1), input->dim(2), input->dim(3));
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
+                                           gws, lws, future));
+
+  MACE_OUT_OF_RANGE_VALIDATION;
+  return MACE_SUCCESS;
+}
+
+}  // namespace image
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_DEPTH_H_
diff
--git a/mace/kernels/opencl/image/split.h b/mace/kernels/opencl/image/split.h
new file mode 100644
index 0000000000000000000000000000000000000000..a75642a8317dd084234d175d8f29ea3e49836a78
--- /dev/null
+++ b/mace/kernels/opencl/image/split.h
@@ -0,0 +1,163 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef MACE_KERNELS_OPENCL_IMAGE_SPLIT_H_
+#define MACE_KERNELS_OPENCL_IMAGE_SPLIT_H_
+
+#include "mace/kernels/split.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "mace/kernels/opencl/helper.h"
+
+namespace mace {
+namespace kernels {
+namespace opencl {
+namespace image {
+
+// Image-memory split kernel: launches the "split" OpenCL kernel once per
+// output tensor, each launch reading a different channel-block offset.
+template <typename T>
+class SplitKernel : public OpenCLSplitKernel {
+ public:
+  explicit SplitKernel(const int32_t axis) : axis_(axis) {}
+  MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const std::vector<Tensor *> &output_list,
+      StatsFuture *future) override;
+
+ private:
+  int32_t axis_;  // stored but not read by Compute(), which splits dim(3)
+  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+};
+
+// Resizes every output to [dim0, dim1, dim2, channels / outputs_count] and
+// enqueues the "split" kernel once per output. When profiling is enabled
+// and future is non-null, future->wait_fn reports the earliest start time
+// and the summed duration of those launches.
+template <typename T>
+MaceStatus SplitKernel<T>::Compute(
+    OpKernelContext *context,
+    const Tensor *input,
+    const std::vector<Tensor *> &output_list,
+    StatsFuture *future) {
+  const index_t input_channels = input->dim(3);
+  const size_t outputs_count = output_list.size();
+  // Guard the division below against an empty output list.
+  MACE_CHECK(outputs_count > 0) << "split op requires at least one output";
+  const index_t output_channels = input_channels / outputs_count;
+  MACE_CHECK(output_channels % 4 == 0)
+      << "output channels of split op must be divisible by 4";
+  std::vector<index_t> output_shape(
+      {input->dim(0), input->dim(1), input->dim(2), output_channels});
+
+  std::vector<size_t> image_shape;
+  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
+  for (size_t i = 0; i < outputs_count; ++i) {
+    MACE_RETURN_IF_ERROR(
+        output_list[i]->ResizeImage(output_shape, image_shape));
+  }
+
+  auto runtime = context->device()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION;
+
+  if (kernel_.get() == nullptr) {
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split");
+    built_options.emplace("-Dsplit=" + kernel_name);
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" +
+                          DtToCLCMDDt(DataTypeToEnum<T>::value));
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("split",
+                                              kernel_name,
+                                              built_options,
+                                              &kernel_));
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
+  }
+  const index_t channel_blk = RoundUpDiv4(output_channels);
+
+  const uint32_t gws[3] = {
+      static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(input->dim(2)),
+      static_cast<uint32_t>(input->dim(0) * input->dim(1)),
+  };
+  MACE_OUT_OF_RANGE_INIT(kernel_);
+
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
+  cl::Event event;
+  // start_micros keeps the minimum start; end_micros accumulates durations.
+  CallStats call_stats{INT64_MAX, 0};
+  for (size_t i = 0; i < outputs_count; ++i) {
+    uint32_t idx = 0;
+    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
+    MACE_SET_3D_GWS_ARGS(kernel_, gws);
+    kernel_.setArg(idx++, *(input->opencl_image()));
+    // Channel-block offset of output i within the input.
+    kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
+    kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
+
+    cl_int error;
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+    } else {
+      // Round each global size up to a multiple of its local size.
+      std::vector<uint32_t> roundup_gws(lws.size());
+      for (size_t j = 0; j < 3; ++j) {
+        roundup_gws[j] = RoundUp(gws[j], lws[j]);
+      }
+
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel_, cl::NullRange,
+          cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
+          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+    }
+    MACE_CL_RET_STATUS(error);
+    MACE_OUT_OF_RANGE_VALIDATION;
+    if (future != nullptr && runtime->is_profiling_enabled()) {
+      event.wait();
+      CallStats tmp_stats;
+      runtime->GetCallStats(event, &tmp_stats);
+      call_stats.start_micros =
+          std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
+      call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
+    }
+  }
+  if (future != nullptr) {
+    future->wait_fn = [call_stats](CallStats *stats) {
+      if (stats != nullptr) {
+        stats->start_micros = call_stats.start_micros;
+        stats->end_micros = stats->start_micros + call_stats.end_micros;
+      }
+    };
+  }
+
+  return MACE_SUCCESS;
+}
+
+}  // namespace image
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_OPENCL_IMAGE_SPLIT_H_
diff --git a/mace/kernels/opencl/image/winograd_transform.h b/mace/kernels/opencl/image/winograd_transform.h
new file mode 100644
index 0000000000000000000000000000000000000000..107c8dc0cf3f29b4003de29516fc880d995cfe38
--- /dev/null
+++ b/mace/kernels/opencl/image/winograd_transform.h
@@ -0,0 +1,325 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef MACE_KERNELS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_
+#define MACE_KERNELS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_
+
+#include "mace/kernels/winograd_transform.h"
+
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "mace/kernels/opencl/helper.h"
+
+namespace mace {
+namespace kernels {
+namespace opencl {
+namespace image {
+
+// Image-memory Winograd input transform for block sizes 2 and 4. Strides and
+// dilations are fixed at {1, 1} and the filter is taken to be 3x3 when the
+// output size is computed.
+template <typename T>
+class WinogradTransformKernel : public OpenCLWinogradTransformKernel {
+ public:
+  WinogradTransformKernel(
+      const Padding &padding_type,
+      const std::vector<int> &paddings,
+      const int block_size)
+      : strides_({1, 1}),
+        dilations_({1, 1}),
+        padding_type_(padding_type),
+        paddings_(paddings),
+        wino_blk_size_(block_size) {}
+  MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input_tensor,
+      Tensor *output_tensor,
+      StatsFuture *future) override;
+
+ private:
+  const std::vector<int> strides_;    // [stride_h, stride_w]
+  const std::vector<int> dilations_;  // [dilation_h, dilation_w]
+  Padding padding_type_;
+  std::vector<int> paddings_;  // explicit paddings; empty => use padding_type_
+  const int wino_blk_size_;
+  cl::Kernel kernel_;
+  uint32_t kwg_size_;                 // assigned when kernel_ is built
+  std::vector<index_t> input_shape_;  // input shape the args were set for
+};
+
+// Builds (on first use) and runs the winograd_transform_{2x2,4x4} kernel.
+//
+// On a change of input shape the output is resized to
+// [(block + 2)^2, channels, batch * round_h * round_w] and the kernel
+// arguments are set again. Any block size other than 2 or 4 fails the
+// MACE_CHECK below. Returns MACE_SUCCESS, or the status propagated from
+// resizing the output or from building or running the kernel.
+template <typename T>
+MaceStatus WinogradTransformKernel<T>::Compute(
+    OpKernelContext *context,
+    const Tensor *input_tensor,
+    Tensor *output_tensor,
+    StatsFuture *future) {
+  auto runtime = context->device()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION;
+
+  if (kernel_.get() == nullptr) {
+    std::string obfuscated_kernel_name;
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    if (wino_blk_size_ == 4) {
+      obfuscated_kernel_name =
+          MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4");
+      built_options.emplace("-Dwinograd_transform_4x4="
+                            + obfuscated_kernel_name);
+    } else if (wino_blk_size_ == 2) {
+      obfuscated_kernel_name =
+          MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
+      built_options.emplace("-Dwinograd_transform_2x2="
+                            + obfuscated_kernel_name);
+    } else {
+      MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd.");
+      return MACE_SUCCESS;
+    }
+    built_options.emplace("-DDATA_TYPE=" +
+                          DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" +
+                          DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
+                                              obfuscated_kernel_name,
+                                              built_options,
+                                              &kernel_));
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
+  }
+  std::vector<index_t> output_shape(4);
+  std::vector<index_t> filter_shape = {1, input_tensor->dim(3), 3, 3};
+  std::vector<int> paddings(2);
+  if (paddings_.empty()) {
+    kernels::CalcNHWCPaddingAndOutputSize(
+        input_tensor->shape().data(), filter_shape.data(), dilations_.data(),
+        strides_.data(), padding_type_, output_shape.data(), paddings.data());
+  } else {
+    paddings = paddings_;
+    CalcOutputSize(input_tensor->shape().data(), filter_shape.data(),
+                   paddings_.data(), dilations_.data(), strides_.data(),
+                   RoundType::FLOOR, output_shape.data());
+  }
+  // Number of Winograd tiles along height and width, rounded up.
+  const index_t round_h =
+      (output_shape[1] + wino_blk_size_ - 1) / wino_blk_size_;
+  const index_t round_w =
+      (output_shape[2] + wino_blk_size_ - 1) / wino_blk_size_;
+  const index_t out_width = input_tensor->dim(0) * round_h * round_w;
+
+  // Reciprocals are passed to the kernel alongside the integer counts.
+  const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
+  const float round_w_r = 1.f / static_cast<float>(round_w);
+  const index_t blk_sqr = (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
+
+  const uint32_t gws[2] = {
+      static_cast<uint32_t>(out_width),
+      static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))
+  };
+  MACE_OUT_OF_RANGE_INIT(kernel_);
+  if (!IsVecEqual(input_shape_, input_tensor->shape())) {
+    output_shape = {blk_sqr, input_tensor->dim(3), out_width};
+    std::vector<index_t> padded_output_shape = {
+        output_shape[0], output_shape[1], output_shape[2], 1
+    };
+    std::vector<size_t> image_shape;
+    CalImage2DShape(padded_output_shape,
+                    BufferType::IN_OUT_HEIGHT,
+                    &image_shape);
+    // remove unused last dimension
+    MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
+
+    uint32_t idx = 0;
+    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
+    MACE_SET_2D_GWS_ARGS(kernel_, gws);
+    kernel_.setArg(idx++, *(input_tensor->opencl_image()));
+    kernel_.setArg(idx++, *(output_tensor->opencl_image()));
+    kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(1)));
+    kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(2)));
+    kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(3)));
+    kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
+    kernel_.setArg(idx++, round_hw_r);
+    kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
+    kernel_.setArg(idx++, round_w_r);
+    kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
+    kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
+
+    input_shape_ = input_tensor->shape();
+  }
+
+
+  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
+  std::string tuning_key = Concat("winograd_transform_kernel",
+                                  output_tensor->dim(0),
+                                  output_tensor->dim(1),
+                                  output_tensor->dim(2));
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
+                                           gws, lws, future));
+
+  MACE_OUT_OF_RANGE_VALIDATION;
+  return MACE_SUCCESS;
+}
+
+// Image-memory Winograd inverse transform for block sizes 2 and 4, with an
+// optional bias input and a fused activation selected at kernel build time.
+template <typename T>
+class WinogradInverseTransformKernel
+    : public OpenCLWinogradInverseTransformKernel {
+ public:
+  WinogradInverseTransformKernel(
+      const ActivationType activation,
+      const float relux_max_limit,
+      const int block_size)
+      : wino_blk_size_(block_size),
+        activation_(activation),
+        relux_max_limit_(relux_max_limit) {}
+  MaceStatus Compute(
+      OpKernelContext *context,
+      const std::vector<const Tensor*> &inputs,
+      Tensor *output_tensor,
+      StatsFuture *future) override;
+
+ private:
+  const int wino_blk_size_;
+  const ActivationType activation_;
+  const float relux_max_limit_;
+  cl::Kernel kernel_;
+  uint32_t kwg_size_;                 // assigned when kernel_ is built
+  std::vector<index_t> input_shape_;  // input shape the args were set for
+};
+
+// Builds (on first use) and runs the winograd_inverse_transform_{2x2,4x4}
+// kernel.
+//
+// inputs[0] is the transformed tensor and inputs[1] holds the int32 output
+// shape, of which the first three entries are read as batch, height and
+// width. inputs[2] is used as the bias only when exactly three inputs are
+// given. Returns MACE_SUCCESS, or the status propagated from resizing the
+// output or from building or running the kernel.
+template <typename T>
+MaceStatus WinogradInverseTransformKernel<T>::Compute(
+    OpKernelContext *context,
+    const std::vector<const Tensor*> &inputs,
+    Tensor *output_tensor,
+    StatsFuture *future) {
+  auto runtime = context->device()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION;
+
+  // inputs[0] and inputs[1] are read unconditionally below.
+  MACE_CHECK(inputs.size() >= 2,
+             "winograd inverse transform needs at least 2 inputs");
+  const Tensor *input_tensor = inputs[0];
+  const Tensor *bias = inputs.size() == 3 ? inputs[2] : nullptr;
+
+  if (kernel_.get() == nullptr) {
+    std::string obfuscated_kernel_name;
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    if (wino_blk_size_ == 4) {
+      obfuscated_kernel_name =
+          MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4");
+      built_options.emplace("-Dwinograd_inverse_transform_4x4="
+                            + obfuscated_kernel_name);
+    } else if (wino_blk_size_ == 2) {
+      obfuscated_kernel_name =
+          MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
+      built_options.emplace("-Dwinograd_inverse_transform_2x2="
+                            + obfuscated_kernel_name);
+    } else {
+      MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd.");
+      return MACE_SUCCESS;
+    }
+
+    built_options.emplace("-DDATA_TYPE=" +
+                          DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" +
+                          DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
+    // Without a bias this inserts an empty option string.
+    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+    switch (activation_) {
+      case NOOP:
+        break;
+      case RELU:
+        built_options.emplace("-DUSE_RELU");
+        break;
+      case RELUX:
+        built_options.emplace("-DUSE_RELUX");
+        break;
+      case PRELU:
+        built_options.emplace("-DUSE_PRELU");
+        break;
+      case TANH:
+        built_options.emplace("-DUSE_TANH");
+        break;
+      case SIGMOID:
+        built_options.emplace("-DUSE_SIGMOID");
+        break;
+      default:
+        LOG(FATAL) << "Unknown activation type: " << activation_;
+    }
+
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
+                                              obfuscated_kernel_name,
+                                              built_options,
+                                              &kernel_));
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
+  }
+
+  Tensor::MappingGuard output_shape_guard(inputs[1]);
+  const int32_t *output_shape_data = inputs[1]->data<int32_t>();
+  const index_t batch = output_shape_data[0];
+  const index_t height = output_shape_data[1];
+  const index_t width = output_shape_data[2];
+  const uint32_t gws[2] = {
+      static_cast<uint32_t>(input_tensor->dim(2)),
+      static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))};
+  MACE_OUT_OF_RANGE_INIT(kernel_);
+  if (!IsVecEqual(input_shape_, input_tensor->shape())) {
+    std::vector<index_t> output_shape = {batch, height, width,
+                                         input_tensor->dim(1)};
+    std::vector<size_t> image_shape;
+    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
+    MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
+
+    // Number of Winograd tiles along height and width, rounded up.
+    const index_t round_h = (height + wino_blk_size_ - 1) / wino_blk_size_;
+    const index_t round_w = (width + wino_blk_size_ - 1) / wino_blk_size_;
+
+    const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
+    const float round_w_r = 1.f / static_cast<float>(round_w);
+
+    uint32_t idx = 0;
+    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
+    MACE_SET_2D_GWS_ARGS(kernel_, gws);
+    kernel_.setArg(
+        idx++,
+        *(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
+    if (bias != nullptr) {
+      kernel_.setArg(idx++,
+                     *(static_cast<const cl::Image2D *>(bias->opencl_image())));
+    }
+    kernel_.setArg(
+        idx++, *(static_cast<cl::Image2D *>(output_tensor->opencl_image())));
+    kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
+    kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
+    kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
+    kernel_.setArg(idx++, round_hw_r);
+    kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
+    kernel_.setArg(idx++, round_w_r);
+    kernel_.setArg(idx++, relux_max_limit_);
+
+    input_shape_ = input_tensor->shape();
+  }
+  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
+  std::string tuning_key =
+      Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
+             output_tensor->dim(1), output_tensor->dim(2),
+             output_tensor->dim(3), input_tensor->dim(2));
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
+                                           gws, lws, future));
+
+  MACE_OUT_OF_RANGE_VALIDATION;
+  return MACE_SUCCESS;
+}
+}  // namespace image
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_
diff --git a/mace/kernels/opencl/lstmcell.cc
b/mace/kernels/opencl/lstmcell.cc index df351539834d6e8ea1849d6b512c7cd63e9297e9..e210ee582bfc8a9cac401919d97fa3784954c467 100644 --- a/mace/kernels/opencl/lstmcell.cc +++ b/mace/kernels/opencl/lstmcell.cc @@ -13,14 +13,23 @@ // limitations under the License. #include "mace/kernels/lstmcell.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" -#include "mace/utils/utils.h" +#include "mace/kernels/opencl/image/lstm_cell.h" namespace mace { namespace kernels { +template +LSTMCellFunctor::LSTMCellFunctor( + OpKernelContext *context, + T forget_bias) + : OpKernel(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::LSTMCellKernel(forget_bias)); + } else { + MACE_NOT_IMPLEMENTED; + } +} + template MaceStatus LSTMCellFunctor::operator()( const Tensor *input, @@ -31,76 +40,11 @@ MaceStatus LSTMCellFunctor::operator()( Tensor *cell, Tensor *output, StatsFuture *future) { - MACE_CHECK(pre_output->dim_size() == 2 && pre_output->dim(1) % 4 == 0, - "LSTM hidden units should be a multiple of 4"); - - const index_t height = input->dim(0); - const index_t width = input->dim(1); - const index_t hidden_units = pre_output->dim(1); - const index_t w_blocks = hidden_units >> 2; - - auto runtime = context_->device()->opencl_runtime(); - - if (kernel_.get() == nullptr) { - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell"); - built_options.emplace("-Dlstmcell=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - - MACE_RETURN_IF_ERROR(runtime->BuildKernel("lstmcell", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - const uint32_t 
gws[2] = {static_cast(w_blocks), - static_cast(height)}; - - if (!IsVecEqual(input_shape_, input->shape())) { - std::vector output_shape_padded = {height, 1, 1, hidden_units}; - std::vector output_image_shape; - CalImage2DShape(output_shape_padded, BufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(pre_output->shape(), - output_image_shape)); - MACE_RETURN_IF_ERROR(cell->ResizeImage(pre_cell->shape(), - output_image_shape)); - - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_2D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(pre_output->opencl_image())); - kernel_.setArg(idx++, *(weight->opencl_image())); - kernel_.setArg(idx++, *(bias->opencl_image())); - kernel_.setArg(idx++, *(pre_cell->opencl_image())); - kernel_.setArg(idx++, static_cast(forget_bias_)); - kernel_.setArg(idx++, static_cast(width)); - kernel_.setArg(idx++, static_cast(hidden_units)); - kernel_.setArg(idx++, static_cast(RoundUpDiv4(width))); - kernel_.setArg(idx++, *(cell->opencl_image())); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input->shape(); - } - - const std::vector lws = {kwg_size_ / 16, 16, 0}; - std::string tuning_key = - Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); - OUT_OF_RANGE_VALIDATION(kernel_error_); - - return MACE_SUCCESS; + return kernel_->Compute(context_, input, pre_output, weight, bias, + pre_cell, cell, output, future); } template struct LSTMCellFunctor; - template struct LSTMCellFunctor; } // namespace kernels diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index 407b455d106ad765f4731f98caa948527d1a2129..b8ddc1c84e8039a907bad6ade8ce444a20012228 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -13,13 +13,21 @@ // limitations under the License. 
#include "mace/kernels/matmul.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" +#include "mace/kernels/opencl/image/matmul.h" namespace mace { namespace kernels { +template +MatMulFunctor::MatMulFunctor(OpKernelContext *context) + : OpKernel(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::MatMulKernel); + } else { + MACE_NOT_IMPLEMENTED; + } +} + template MaceStatus MatMulFunctor::operator()(const Tensor *A, const Tensor *B, @@ -27,68 +35,7 @@ MaceStatus MatMulFunctor::operator()(const Tensor *A, bool transpose_a, bool transpose_b, StatsFuture *future) { - MACE_UNUSED(future); - MACE_CHECK(!transpose_a && !transpose_b, - "GPU does not support transpose matmul"); - - index_t rank = A->dim_size(); - index_t height = A->dim(rank - 2); - index_t K = A->dim(rank - 1); - index_t width = B->dim(rank - 1); - index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1, - std::multiplies()); - - std::vector c_shape = A->shape(); - c_shape[rank - 2] = height; - c_shape[rank - 1] = width; - std::vector c_image_shape; - std::vector padded_c_shape = {batch, height, width, 1}; - CalImage2DShape(padded_c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape); - MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape)); - - const index_t height_blocks = RoundUpDiv4(height); - const index_t width_blocks = RoundUpDiv4(width); - const uint32_t gws[2] = { - static_cast(width_blocks), - static_cast(height_blocks * batch), - }; - - auto runtime = context_->device()->opencl_runtime(); - - if (kernel_.get() == nullptr) { - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); - built_options.emplace("-Dmatmul=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - 
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_2D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(A->opencl_image())); - kernel_.setArg(idx++, *(B->opencl_image())); - kernel_.setArg(idx++, *(C->opencl_image())); - kernel_.setArg(idx++, static_cast(height)); - kernel_.setArg(idx++, static_cast(width)); - kernel_.setArg(idx++, static_cast(K)); - kernel_.setArg(idx++, static_cast(height_blocks)); - kernel_.setArg(idx++, static_cast(RoundUpDiv4(K))); - - const std::vector lws = {kwg_size_ / 64, 64, 0}; - std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); - - OUT_OF_RANGE_VALIDATION(kernel_error_); - return MACE_SUCCESS; + return kernel_->Compute(context_, A, B, C, transpose_a, transpose_b, future); } template struct MatMulFunctor; diff --git a/mace/kernels/opencl/out_of_range_check_test.cc b/mace/kernels/opencl/out_of_range_check_test.cc index 03f05ca5f5711edb327fd9a14e5de79a21eb074c..f61b9e87cfe0a91a59478167fcccbce565cbe793 100644 --- a/mace/kernels/opencl/out_of_range_check_test.cc +++ b/mace/kernels/opencl/out_of_range_check_test.cc @@ -31,7 +31,7 @@ bool BufferToImageOpImpl(OpKernelContext *context, Tensor *buffer, Tensor *image, const std::vector &image_shape) { - std::unique_ptr kernel_error; + std::unique_ptr oorc_flag; uint32_t gws[2] = {static_cast(image_shape[0]), static_cast(image_shape[1])}; @@ -43,8 +43,8 @@ bool BufferToImageOpImpl(OpKernelContext *context, std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; built_options.emplace(kernel_name_ss.str()); - OUT_OF_RANGE_CONFIG(kernel_error, context); - NON_UNIFORM_WG_CONFIG; + 
MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; if (buffer->dtype() == image->dtype()) { built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); @@ -67,12 +67,13 @@ bool BufferToImageOpImpl(OpKernelContext *context, return false; } + MACE_OUT_OF_RANGE_INIT(kernel); uint32_t idx = 0; if (runtime->IsOutOfRangeCheckEnabled()) { kernel.setArg(idx++, - *(static_cast(kernel_error->buffer()))); + *(static_cast(oorc_flag->buffer()))); } - SET_2D_GWS_ARGS(kernel); + MACE_SET_2D_GWS_ARGS(kernel, gws); kernel.setArg(idx++, *(buffer->opencl_buffer())); MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, "buffer offset not aligned"); @@ -110,9 +111,9 @@ bool BufferToImageOpImpl(OpKernelContext *context, runtime->command_queue().finish(); bool is_out_of_range = false; if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error->Map(nullptr); - is_out_of_range = *(kernel_error->mutable_data()) == 1 ? true : false; - kernel_error->UnMap(); + oorc_flag->Map(nullptr); + is_out_of_range = *(oorc_flag->mutable_data()) == 1 ? true : false; + oorc_flag->UnMap(); } return is_out_of_range; } diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc index a3f4cfaa53c7b21f95cfcbea219c4fe3853d6a72..759b9219ce06d601716b36b90bc03513de51de9b 100644 --- a/mace/kernels/opencl/pad.cc +++ b/mace/kernels/opencl/pad.cc @@ -13,86 +13,29 @@ // limitations under the License. 
#include "mace/kernels/pad.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" +#include "mace/kernels/opencl/image/pad.h" namespace mace { namespace kernels { +template +PadFunctor::PadFunctor( + OpKernelContext *context, + const std::vector &paddings, + const float constant_value) + : OpKernel(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::PadKernel(paddings, constant_value)); + } else { + MACE_NOT_IMPLEMENTED; + } +} + template MaceStatus PadFunctor::operator()(const Tensor *input, Tensor *output, StatsFuture *future) { - MACE_CHECK(this->paddings_.size() == - static_cast((input->dim_size() * 2))); - MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) && - (this->paddings_[6] == 0) && (this->paddings_[7] == 0)) - << "Mace only support height/width dimension now"; - auto input_shape = input->shape(); - std::vector output_shape = { - input_shape[0] + this->paddings_[0] + this->paddings_[1], - input_shape[1] + this->paddings_[2] + this->paddings_[3], - input_shape[2] + this->paddings_[4] + this->paddings_[5], - input_shape[3] + this->paddings_[6] + this->paddings_[7]}; - - std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); - - const index_t batch = output->dim(0); - const index_t height = output->dim(1); - const index_t width = output->dim(2); - const index_t channels = output->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - - auto runtime = context_->device()->opencl_runtime(); - - if (kernel_.get() == nullptr) { - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad"); - built_options.emplace("-Dpad=" + kernel_name); - auto dt = DataTypeToEnum::value; - 
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - - if (!IsVecEqual(input_shape_, input->shape())) { - int idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_3D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(output->opencl_image())); - kernel_.setArg(idx++, this->constant_value_); - kernel_.setArg(idx++, static_cast(input_shape[1])); - kernel_.setArg(idx++, static_cast(input_shape[2])); - kernel_.setArg(idx++, static_cast(output_shape[1])); - kernel_.setArg(idx++, this->paddings_[2]); - kernel_.setArg(idx++, this->paddings_[4]); - - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = Concat("pad", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); - - OUT_OF_RANGE_VALIDATION(kernel_error_); - return MACE_SUCCESS; + return kernel_->Compute(context_, input, output, future); } template struct PadFunctor; diff --git a/mace/kernels/opencl/pooling.cc b/mace/kernels/opencl/pooling.cc index c6743750a3b381e5fcbb632980df50340fa872d2..aab536643ca9808ca3db22247bfacd3fa8fab916 100644 --- a/mace/kernels/opencl/pooling.cc +++ b/mace/kernels/opencl/pooling.cc @@ -13,153 +13,45 @@ // limitations under the License. 
#include "mace/kernels/pooling.h" -#include "mace/core/runtime/opencl/cl2_header.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" + +#include "mace/kernels/opencl/buffer/pooling.h" #include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" +#include "mace/kernels/opencl/image/pooling.h" namespace mace { namespace kernels { -namespace { - -std::vector LocalWS(OpenCLRuntime *runtime, - const uint32_t *gws, - const uint32_t kwg_size) { - std::vector lws(4, 0); - if (kwg_size == 0) { - lws[0] = lws[1] = lws[2] = 1; +template +PoolingFunctor::PoolingFunctor( + OpKernelContext *context, + const PoolingType pooling_type, + const int *kernels, + const int *strides, + const Padding padding_type, + const std::vector &paddings, + const int *dilations) + : PoolingFunctorBase(context, + pooling_type, + kernels, + strides, + padding_type, + paddings, + dilations) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::PoolingKernel); } else { - uint64_t - cache_size = runtime->device_global_mem_cache_size(); - uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - lws[2] = - std::min(std::min(gws[2], base), kwg_size / lws[1]); - const uint32_t lws_size = lws[1] * lws[2]; - lws[0] = gws[0] / 4; - if (lws[0] == 0) { - lws[0] = gws[0]; - } - lws[0] = std::max(std::min(lws[0], kwg_size / lws_size), - 1); + kernel_.reset(new opencl::buffer::PoolingKernel); } - return lws; } -} // namespace - template -MaceStatus PoolingFunctor::operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) - << "Pooling opencl kernel not support dilation yet"; - - auto runtime = context_->device()->opencl_runtime(); - - if (kernel_.get() == nullptr) { - const DataType dt = DataTypeToEnum::value; - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - std::string kernel_name = 
MACE_OBFUSCATE_SYMBOL("pooling"); - built_options.emplace("-Dpooling=" + kernel_name); - - if (pooling_type_ == MAX && input->dtype() == output->dtype()) { - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - built_options.emplace(dt == DT_HALF ? "-DFP16" : ""); - } else { - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - } - if (pooling_type_ == AVG) { - built_options.emplace("-DPOOL_AVG"); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling", - kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - std::vector gws; - if (!IsVecEqual(input_shape_, input->shape())) { - std::vector output_shape(4); - std::vector filter_shape = {input->dim(3), input->dim(3), - kernels_[0], kernels_[1]}; - - std::vector paddings(2); - if (paddings_.empty()) { - kernels::CalcNHWCPaddingAndOutputSize( - input->shape().data(), filter_shape.data(), dilations_, strides_, - padding_type_, output_shape.data(), paddings.data()); - } else { - paddings = paddings_; - CalcOutputSize(input->shape().data(), filter_shape.data(), - paddings_.data(), dilations_, strides_, RoundType::CEIL, - output_shape.data()); - } - - std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - index_t batch = output->dim(0); - index_t out_height = output->dim(1); - index_t out_width = output->dim(2); - index_t channels = output->dim(3); - - index_t channel_blocks = (channels + 3) / 4; - - gws = { - static_cast(channel_blocks), static_cast(out_width), - static_cast(batch * out_height), - }; - - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_3D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, 
static_cast(input->dim(1))); - kernel_.setArg(idx++, static_cast(input->dim(2))); - kernel_.setArg(idx++, static_cast(output->dim(1))); - kernel_.setArg(idx++, paddings[0] / 2); - kernel_.setArg(idx++, paddings[1] / 2); - kernel_.setArg(idx++, strides_[0]); - kernel_.setArg(idx++, strides_[1]); - kernel_.setArg(idx++, kernels_[0]); - kernel_.setArg(idx++, kernels_[1]); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input->shape(); - } else { - index_t batch = output->dim(0); - index_t out_height = output->dim(1); - index_t out_width = output->dim(2); - index_t channels = output->dim(3); - - index_t channel_blocks = (channels + 3) / 4; - - gws = { - static_cast(channel_blocks), static_cast(out_width), - static_cast(batch * out_height), - }; - } - - const std::vector lws = LocalWS(runtime, gws.data(), kwg_size_); - std::string tuning_key = - Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws.data(), lws, future)); - - OUT_OF_RANGE_VALIDATION(kernel_error_); - return MACE_SUCCESS; +MaceStatus PoolingFunctor::operator()( + const Tensor *input, + Tensor *output, + StatsFuture *future) { + return kernel_->Compute(context_, input, pooling_type_, kernels_, strides_, + padding_type_, paddings_, dilations_, + output, future); } template struct PoolingFunctor; diff --git a/mace/kernels/opencl/reduce_mean.cc b/mace/kernels/opencl/reduce_mean.cc index a6a45f764a6e78628a98410ba20a423c58e0c6fd..b504334afb9eb1c9b87305c473548cc43cdeae1b 100644 --- a/mace/kernels/opencl/reduce_mean.cc +++ b/mace/kernels/opencl/reduce_mean.cc @@ -13,127 +13,29 @@ // limitations under the License. 
#include "mace/kernels/reduce_mean.h" -#include "mace/core/runtime/opencl/cl2_header.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" +#include "mace/kernels/opencl/image/reduce_mean.h" namespace mace { namespace kernels { +template +ReduceMeanFunctor::ReduceMeanFunctor( + OpKernelContext *context, + const std::vector &axis, + const bool keep_dims) : OpKernel(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::ReduceMeanKernel(axis, keep_dims)); + } else { + MACE_NOT_IMPLEMENTED; + } +} + template MaceStatus ReduceMeanFunctor::operator()( const Tensor *input, Tensor *output, StatsFuture *future) { - MACE_CHECK_NOTNULL(input); -// MACE_CHECK(keep_dims_, "reduce mean gpu only support keep dims."); - MACE_CHECK(input->dim_size() == 4, - "reduce mean gpu only support 4-dim input"); - MACE_CHECK(axis_.size() == 2 && axis_[0] == 1 && axis_[1] == 2, - "reduce mean gpu only support 1,2-axis reduce"); - index_t batch = input->dim(0); - const index_t in_height = input->dim(1); - const index_t in_width = input->dim(2); - const index_t channels = input->dim(3); - const index_t channel_blocks = RoundUpDiv4(channels); - const uint32_t image_size = static_cast(in_height * in_width); - - auto runtime = context_->device()->opencl_runtime(); - std::vector gws(3); - std::vector lws(3); - std::vector output_shape{batch, 1, 1, channels}; - std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - if (kernel_.get() == nullptr) { - const DataType dt = DataTypeToEnum::value; - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean"); - built_options.emplace("-Dreduce_mean=" + kernel_name); - 
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { - built_options.emplace("-DNON_QUALCOMM_ADRENO"); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce_mean", - kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) { - const uint32_t wave_size = - static_cast(runtime->GetKernelWaveSize(kernel_)); - gws = {4, (wave_size / 4), static_cast(batch * channel_blocks)}; - } else { - gws = {4, 16, static_cast(batch * channel_blocks)}; - } - lws = {gws[0], gws[1], 1}; - const int group_size = lws[0] * lws[1] * lws[2]; - const int partial_len = (image_size + group_size - 1) / group_size; - const int remain_index = image_size % group_size; - const float in_width_reciprocal = 1.f / in_width; - const float img_size_reciprocal = 1.f / (in_width * in_height); - const float channel_blk_reciprocal = 1.f / channel_blocks; - - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_3D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, (group_size * 4 * sizeof(T)), - nullptr); - kernel_.setArg(idx++, static_cast(group_size)); - kernel_.setArg(idx++, static_cast(partial_len)); - kernel_.setArg(idx++, static_cast(remain_index)); - kernel_.setArg(idx++, static_cast(batch)); - kernel_.setArg(idx++, static_cast(in_height)); - kernel_.setArg(idx++, static_cast(in_width)); - kernel_.setArg(idx++, img_size_reciprocal); - kernel_.setArg(idx++, in_width_reciprocal); - kernel_.setArg(idx++, static_cast(channel_blocks)); - kernel_.setArg(idx++, channel_blk_reciprocal); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input->shape(); - } - - cl::Event event; - cl_int error; - if 
(runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } else { - std::vector roundup_gws(lws.size()); - for (size_t i = 0; i < lws.size(); ++i) { - roundup_gws[i] = RoundUp(gws[i], lws[i]); - } - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } - MACE_CL_RET_STATUS(error); - OUT_OF_RANGE_VALIDATION(kernel_error_); - - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } - - return MACE_SUCCESS; + return kernel_->Compute(context_, input, output, future); } template struct ReduceMeanFunctor; diff --git a/mace/kernels/opencl/resize_bicubic.cc b/mace/kernels/opencl/resize_bicubic.cc index 6fc26e52d8d7d1dc2a2e0e1229541fd2800dd358..e45ced4bb72e499f2e6a98dcb7d984c4524246fa 100644 --- a/mace/kernels/opencl/resize_bicubic.cc +++ b/mace/kernels/opencl/resize_bicubic.cc @@ -13,119 +13,31 @@ // limitations under the License. 
#include "mace/kernels/resize_bicubic.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/core/tensor.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" -#include "mace/utils/utils.h" +#include "mace/kernels/opencl/image/resize_bicubic.h" namespace mace { namespace kernels { -namespace { -std::vector LocalWS(OpenCLRuntime *runtime, - const uint32_t *gws, - const uint32_t kwg_size) { - std::vector lws(4, 0); - uint64_t cache_size = runtime->device_global_mem_cache_size(); - uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - if (lws[1] >= base) { - lws[0] = std::min(gws[0], base); +template +ResizeBicubicFunctor::ResizeBicubicFunctor( + OpKernelContext *context, + bool align_corners, + const std::vector &size) + : OpKernel(context) { + MACE_CHECK(size.size() == 2); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::ResizeBicubicKernel(align_corners, + size[0], + size[1])); } else { - lws[0] = gws[0] / 8; - if (lws[0] == 0) { - lws[0] = gws[0]; - } - } - lws[0] = std::min(lws[0], kwg_size / lws[1]); - const uint32_t lws_size = lws[0] * lws[1]; - lws[2] = gws[2] / 8; - if (lws[2] == 0) { - lws[2] = gws[2]; + MACE_NOT_IMPLEMENTED; } - lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), - 1); - return lws; } -} // namespace - template MaceStatus ResizeBicubicFunctor::operator()( const Tensor *input, Tensor *output, StatsFuture *future) { - const index_t batch = input->dim(0); - const index_t in_height = input->dim(1); - const index_t in_width = input->dim(2); - const index_t channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - const index_t out_height = out_height_; - const index_t out_width = out_width_; - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(out_width), - static_cast(out_height * batch)}; - - auto runtime = context_->device()->opencl_runtime(); - - if 
(kernel_.get() == nullptr) { - auto dt = DataTypeToEnum::value; - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache"); - built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - built_options.emplace(MakeString("-DTABLE_SIZE=", kTableSize)); - MACE_RETURN_IF_ERROR( - runtime->BuildKernel("resize_bicubic", - kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - if (!IsVecEqual(input_shape_, input->shape())) { - MACE_CHECK(out_height > 0 && out_width > 0); - std::vector output_shape{batch, out_height, out_width, channels}; - - std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - float height_scale = - CalculateResizeScale(in_height, out_height, align_corners_); - float width_scale = - CalculateResizeScale(in_width, out_width, align_corners_); - - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_3D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(output->opencl_image())); - kernel_.setArg(idx++, height_scale); - kernel_.setArg(idx++, width_scale); - kernel_.setArg(idx++, static_cast(in_height)); - kernel_.setArg(idx++, static_cast(in_width)); - kernel_.setArg(idx++, static_cast(out_height)); - - input_shape_ = input->shape(); - } - - const std::vector lws = LocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); - - 
OUT_OF_RANGE_VALIDATION(kernel_error_); - return MACE_SUCCESS; + return kernel_->Compute(context_, input, output, future); } template struct ResizeBicubicFunctor; diff --git a/mace/kernels/opencl/resize_bilinear.cc b/mace/kernels/opencl/resize_bilinear.cc index 23e5db1c102979c9f3dbea869016d56ccc359d62..585cab767ad05ff3d6666d72db69d38d91db1ec6 100644 --- a/mace/kernels/opencl/resize_bilinear.cc +++ b/mace/kernels/opencl/resize_bilinear.cc @@ -13,122 +13,29 @@ // limitations under the License. #include "mace/kernels/resize_bilinear.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/core/tensor.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" -#include "mace/utils/utils.h" +#include "mace/kernels/opencl/image/resize_bilinear.h" namespace mace { namespace kernels { -namespace { -std::vector LocalWS(OpenCLRuntime *runtime, - const uint32_t *gws, - const uint32_t kwg_size) { - std::vector lws(4, 0); - if (kwg_size == 0) { - lws[0] = lws[1] = lws[2] = 1; +template +ResizeBilinearFunctor::ResizeBilinearFunctor( + OpKernelContext *context, + const std::vector &size, + bool align_corners) : OpKernel(context) { + MACE_CHECK(size.size() == 2); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::ResizeBilinearKernel(align_corners, + size[0], + size[1])); } else { - uint64_t - cache_size = runtime->device_global_mem_cache_size(); - uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - if (lws[1] >= base) { - lws[0] = std::min(gws[0], base); - } else { - lws[0] = gws[0] / 8; - if (lws[0] == 0) { - lws[0] = gws[0]; - } - } - lws[0] = std::min(lws[0], kwg_size / lws[1]); - const uint32_t lws_size = lws[0] * lws[1]; - lws[2] = gws[2] / 8; - if (lws[2] == 0) { - lws[2] = gws[2]; - } - lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), - 1); + MACE_NOT_IMPLEMENTED; } - return lws; } - -} // namespace - template MaceStatus 
ResizeBilinearFunctor::operator()( const Tensor *input, Tensor *output, StatsFuture *future) { - const index_t batch = input->dim(0); - const index_t in_height = input->dim(1); - const index_t in_width = input->dim(2); - const index_t channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - const index_t out_height = out_height_; - const index_t out_width = out_width_; - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(out_width), - static_cast(out_height * batch)}; - - auto runtime = context_->device()->opencl_runtime(); - - if (kernel_.get() == nullptr) { - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); - built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - MACE_RETURN_IF_ERROR( - runtime->BuildKernel("resize_bilinear", - kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - if (!IsVecEqual(input_shape_, input->shape())) { - MACE_CHECK(out_height > 0 && out_width > 0); - std::vector output_shape{batch, out_height, out_width, channels}; - - std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - float height_scale = - CalculateResizeScale(in_height, out_height, align_corners_); - float width_scale = - CalculateResizeScale(in_width, out_width, align_corners_); - - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_3D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(output->opencl_image())); - kernel_.setArg(idx++, height_scale); - 
kernel_.setArg(idx++, width_scale); - kernel_.setArg(idx++, static_cast(in_height)); - kernel_.setArg(idx++, static_cast(in_width)); - kernel_.setArg(idx++, static_cast(out_height)); - - input_shape_ = input->shape(); - } - - const std::vector lws = LocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); - - OUT_OF_RANGE_VALIDATION(kernel_error_); - return MACE_SUCCESS; + return kernel_->Compute(context_, input, output, future); } template struct ResizeBilinearFunctor; diff --git a/mace/kernels/opencl/softmax.cc b/mace/kernels/opencl/softmax.cc index e84ec7312d6d0e2e5cce33b1253b0ff948af19d5..bad5f1fad75b0b0fd8276e4baebf549393d370d1 100644 --- a/mace/kernels/opencl/softmax.cc +++ b/mace/kernels/opencl/softmax.cc @@ -13,110 +13,28 @@ // limitations under the License. #include "mace/kernels/softmax.h" -#include "mace/core/runtime/opencl/cl2_header.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" + +#include "mace/kernels/opencl/buffer/softmax.h" #include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" -#include "mace/utils/utils.h" +#include "mace/kernels/opencl/image/softmax.h" namespace mace { namespace kernels { -namespace { - -std::vector LocalWS(OpenCLRuntime *runtime, - const uint32_t *gws, - const uint32_t kwg_size) { - std::vector lws(4, 0); - if (kwg_size == 0) { - lws[0] = lws[1] = lws[2] = 1; +template +SoftmaxFunctor::SoftmaxFunctor(OpKernelContext *context) + : OpKernel(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::SoftmaxKernel); } else { - uint64_t - cache_size = runtime->device_global_mem_cache_size(); - uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - if (gws[0] < base) { - lws[0] = gws[0]; - } else { - 
lws[0] = gws[0] / base; - } - lws[0] = std::min(lws[0], kwg_size / lws[1]); - lws[2] = std::max(std::min( - gws[2], kwg_size / (lws[0] * lws[1])), 1); + kernel_.reset(new opencl::buffer::SoftmaxKernel); } - return lws; } - -} // namespace - template MaceStatus SoftmaxFunctor::operator()(const Tensor *logits, Tensor *output, StatsFuture *future) { - index_t batch = 0; - index_t height = 0; - index_t width = 0; - index_t channels = 0; - - if (logits->dim_size() == 2) { - batch = logits->dim(0); - height = 1; - width = 1; - channels = logits->dim(1); - - } else if (logits->dim_size() == 4) { - batch = logits->dim(0); - height = logits->dim(1); - width = logits->dim(2); - channels = logits->dim(3); - } else { - MACE_NOT_IMPLEMENTED; - } - - const index_t channel_blocks = RoundUpDiv4(channels); - const int remain_channels = channel_blocks * 4 - channels; - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - - auto runtime = context_->device()->opencl_runtime(); - - if (kernel_.get() == nullptr) { - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); - built_options.emplace("-Dsoftmax=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - if (!IsVecEqual(input_shape_, logits->shape())) { - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_3D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(logits->opencl_image())); - kernel_.setArg(idx++, static_cast(channels)); - kernel_.setArg(idx++, remain_channels); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = logits->shape(); - } - - 
std::vector lws = LocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("softmax_opencl_kernel", batch, height, width, channels); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); - - OUT_OF_RANGE_VALIDATION(kernel_error_); - return MACE_SUCCESS; + return kernel_->Compute(context_, logits, output, future); } template struct SoftmaxFunctor; diff --git a/mace/kernels/opencl/space_to_batch.cc b/mace/kernels/opencl/space_to_batch.cc index d015a99e3fc4a83c3f679854b59478d9d8668a79..c69db85c73b5c33857f8b7806f2237c16fa4d337 100644 --- a/mace/kernels/opencl/space_to_batch.cc +++ b/mace/kernels/opencl/space_to_batch.cc @@ -16,81 +16,32 @@ #define MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_ #include "mace/kernels/space_to_batch.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" -#include "mace/utils/utils.h" +#include "mace/kernels/opencl/image/space_to_batch.h" namespace mace { namespace kernels { +template +SpaceToBatchFunctor::SpaceToBatchFunctor( + OpKernelContext *context, + const std::vector &paddings, + const std::vector &block_shape) + : SpaceToBatchFunctorBase(context, paddings, block_shape) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::SpaceToBatchKernel); + } else { + MACE_NOT_IMPLEMENTED; + } +} + template MaceStatus SpaceToBatchFunctor::operator()( - Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) { + const Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) { std::vector output_shape(4, 0); CalculateSpaceToBatchOutputShape(space_tensor, DataFormat::NHWC, output_shape.data()); - std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR( - batch_tensor->ResizeImage(output_shape, output_image_shape)); - const char *kernel_name = "space_to_batch"; - const uint32_t chan_blk 
= RoundUpDiv4(batch_tensor->dim(3)); - const uint32_t gws[3] = { - chan_blk, static_cast(batch_tensor->dim(2)), - static_cast(batch_tensor->dim(0) * batch_tensor->dim(1))}; - - auto runtime = context_->device()->opencl_runtime(); - - if (kernel_.get() == nullptr) { - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - built_options.emplace(kernel_name_ss.str()); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToCLCMDDt(DataTypeToEnum::value)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch", - obfuscated_kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - if (!IsVecEqual(space_shape_, space_tensor->shape())) { - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_3D_GWS_ARGS(kernel_); - - kernel_.setArg(idx++, *(space_tensor->opencl_image())); - kernel_.setArg(idx++, *(batch_tensor->opencl_image())); - kernel_.setArg(idx++, block_shape_[0]); - kernel_.setArg(idx++, block_shape_[1]); - kernel_.setArg(idx++, paddings_[0]); - kernel_.setArg(idx++, paddings_[2]); - kernel_.setArg(idx++, static_cast(space_tensor->dim(0))); - kernel_.setArg(idx++, static_cast(space_tensor->dim(1))); - kernel_.setArg(idx++, static_cast(space_tensor->dim(2))); - kernel_.setArg(idx++, static_cast(batch_tensor->dim(1))); - kernel_.setArg(idx++, static_cast(batch_tensor->dim(2))); - - space_shape_ = space_tensor->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1), - batch_tensor->dim(2), batch_tensor->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - 
gws, lws, future)); - - OUT_OF_RANGE_VALIDATION(kernel_error_); - return MACE_SUCCESS; + return kernel_->Compute(context_, space_tensor, paddings_, block_shape_, + output_shape, batch_tensor, future); } template struct SpaceToBatchFunctor; diff --git a/mace/kernels/opencl/space_to_depth.cc b/mace/kernels/opencl/space_to_depth.cc index 1135c79cdadae53a8c151673fbbd8567d25dd29a..3e14047b9eb4833d500cce289c51197608fc82f4 100644 --- a/mace/kernels/opencl/space_to_depth.cc +++ b/mace/kernels/opencl/space_to_depth.cc @@ -13,91 +13,27 @@ // limitations under the License. #include "mace/kernels/space_to_depth.h" -#include "mace/core/runtime/opencl/cl2_header.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" -#include "mace/utils/utils.h" +#include "mace/kernels/opencl/image/space_to_depth.h" namespace mace { namespace kernels { template -MaceStatus SpaceToDepthOpFunctor::operator()( - const Tensor *input, Tensor *output, StatsFuture *future) { - const index_t batch = input->dim(0); - const index_t input_height = input->dim(1); - const index_t input_width = input->dim(2); - const index_t input_depth = input->dim(3); - - MACE_CHECK((input_depth % 4) == 0, - "input channel should be dividable by 4"); - MACE_CHECK( - (input_width % block_size_ == 0) && (input_height % block_size_ == 0), - "input width and height should be dividable by block_size"); - - const index_t output_height = input_height / block_size_; - const index_t output_width = input_width / block_size_; - const index_t output_depth = input_depth * block_size_ * block_size_; - - const index_t input_depth_blocks = RoundUpDiv4(input_depth); - const index_t output_depth_blocks = RoundUpDiv4(output_depth); - - std::vector output_shape = {batch, output_height, output_width, - output_depth}; - - std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); - 
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); - - auto runtime = context_->device()->opencl_runtime(); - if (kernel_.get() == nullptr) { - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - const char *kernel_name = "space_to_depth"; - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - built_options.emplace(kernel_name_ss.str()); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_depth", - obfuscated_kernel_name, - built_options, - &kernel_)); - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); +SpaceToDepthOpFunctor::SpaceToDepthOpFunctor( + OpKernelContext *context, + const int block_size) + : OpKernel(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::SpaceToDepthKernel(block_size)); + } else { + MACE_NOT_IMPLEMENTED; } +} - const uint32_t gws[3] = {static_cast(input_depth_blocks), - static_cast(input_width), - static_cast(input_height * batch)}; - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_3D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, static_cast(block_size_)); - kernel_.setArg(idx++, static_cast(input_width)); - kernel_.setArg(idx++, static_cast(input_depth_blocks)); - kernel_.setArg(idx++, static_cast(output_height * batch)); - kernel_.setArg(idx++, static_cast(output_width)); - kernel_.setArg(idx++, static_cast(output_depth_blocks)); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = 
Concat("space_to_depth_opencl_kernel", input->dim(0), - input->dim(1), input->dim(2), input->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); - - OUT_OF_RANGE_VALIDATION(kernel_error_); - return MACE_SUCCESS; +template +MaceStatus SpaceToDepthOpFunctor::operator()( + const Tensor *input, Tensor *output, StatsFuture *future) { + return kernel_->Compute(context_, input, output, future); } template struct SpaceToDepthOpFunctor; diff --git a/mace/kernels/opencl/split.cc b/mace/kernels/opencl/split.cc index c445b783564095e5ef27ecfc486fbe79b0cc1548..2f2a046ec0c7cc1929c3c3e3c3fd2642d6685bc0 100644 --- a/mace/kernels/opencl/split.cc +++ b/mace/kernels/opencl/split.cc @@ -13,107 +13,28 @@ // limitations under the License. #include "mace/kernels/split.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" +#include "mace/kernels/opencl/image/split.h" namespace mace { namespace kernels { +template +SplitFunctor::SplitFunctor(OpKernelContext *context, + const int32_t axis) + : OpKernel(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::SplitKernel(axis)); + } else { + MACE_NOT_IMPLEMENTED; + } +} + template MaceStatus SplitFunctor::operator()( const Tensor *input, const std::vector &output_list, StatsFuture *future) { - const index_t input_channels = input->dim(3); - const size_t outputs_count = output_list.size(); - const index_t output_channels = input_channels / outputs_count; - MACE_CHECK(output_channels % 4 == 0) - << "output channels of split op must be divisible by 4"; - std::vector output_shape( - {input->dim(0), input->dim(1), input->dim(2), output_channels}); - - std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); - for (size_t i = 0; i < outputs_count; ++i) { - MACE_RETURN_IF_ERROR( - output_list[i]->ResizeImage(output_shape, 
image_shape)); - } - - auto runtime = context_->device()->opencl_runtime(); - - if (kernel_.get() == nullptr) { - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split"); - built_options.emplace("-Dsplit=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToCLCMDDt(DataTypeToEnum::value)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("split", - kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - const index_t channel_blk = RoundUpDiv4(output_channels); - - const uint32_t gws[3] = { - static_cast(channel_blk), static_cast(input->dim(2)), - static_cast(input->dim(0) * input->dim(1)), - }; - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - cl::Event event; - CallStats call_stats{INT64_MAX, 0}; - for (size_t i = 0; i < outputs_count; ++i) { - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_3D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, static_cast(channel_blk * i)); - kernel_.setArg(idx++, *(output_list[i]->opencl_image())); - - cl_int error; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } else { - std::vector roundup_gws(lws.size()); - for (size_t j = 0; j < 3; ++j) { - roundup_gws[j] = RoundUp(gws[j], lws[j]); - } - - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } - MACE_CL_RET_STATUS(error); - OUT_OF_RANGE_VALIDATION(kernel_error_); - if (future != nullptr && runtime->is_profiling_enabled()) { - 
event.wait(); - CallStats tmp_stats; - runtime->GetCallStats(event, &tmp_stats); - call_stats.start_micros = - std::min(tmp_stats.start_micros, call_stats.start_micros); - call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros; - } - } - if (future != nullptr) { - future->wait_fn = [runtime, call_stats](CallStats *stats) { - if (stats != nullptr) { - stats->start_micros = call_stats.start_micros; - stats->end_micros = stats->start_micros + call_stats.end_micros; - } - }; - } - - return MACE_SUCCESS; + return kernel_->Compute(context_, input, output_list, future); } template struct SplitFunctor; diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index 43210171a743bdd7dd5640ccaf2415c23bacd553..f64945a4eb1cd14d1f142a042f861e0d825deb18 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -13,239 +13,49 @@ // limitations under the License. #include "mace/kernels/winograd_transform.h" -#include "mace/core/runtime/opencl/cl2_header.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" +#include "mace/kernels/opencl/image/winograd_transform.h" namespace mace { namespace kernels { +template +WinogradTransformFunctor::WinogradTransformFunctor( + OpKernelContext *context, + const Padding &padding_type, + const std::vector &paddings, + const int block_size) : OpKernel(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::WinogradTransformKernel( + padding_type, paddings, block_size)); + } else { + MACE_NOT_IMPLEMENTED; + } +} template MaceStatus WinogradTransformFunctor::operator()( const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) { - auto runtime = context_->device()->opencl_runtime(); - - if (kernel_.get() == nullptr) { - std::string obfuscated_kernel_name; - std::set built_options; - 
OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - if (wino_blk_size_ == 4) { - obfuscated_kernel_name = - MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4"); - built_options.emplace("-Dwinograd_transform_4x4=" - + obfuscated_kernel_name); - } else if (wino_blk_size_ == 2) { - obfuscated_kernel_name = - MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2"); - built_options.emplace("-Dwinograd_transform_2x2=" - + obfuscated_kernel_name); - } else { - MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd."); - return MACE_SUCCESS; - } - built_options.emplace("-DDATA_TYPE=" + - DtToUpCompatibleCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", - obfuscated_kernel_name, - built_options, - &kernel_)); + return kernel_->Compute(context_, input_tensor, output_tensor, future); +} - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - std::vector output_shape(4); - std::vector filter_shape = {1, input_tensor->dim(3), 3, 3}; - std::vector paddings(2); - if (paddings_.empty()) { - kernels::CalcNHWCPaddingAndOutputSize( - input_tensor->shape().data(), filter_shape.data(), dilations_.data(), - strides_.data(), padding_type_, output_shape.data(), paddings.data()); +template +WinogradInverseTransformFunctor::WinogradInverseTransformFunctor( // NOLINT(whitespace/line_length) + OpKernelContext *context, + const ActivationType activation, + const float relux_max_limit, + const int block_size) : OpKernel(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::WinogradInverseTransformKernel( + activation, relux_max_limit, block_size)); } else { - paddings = paddings_; - CalcOutputSize(input_tensor->shape().data(), filter_shape.data(), - paddings_.data(), dilations_.data(), strides_.data(), - RoundType::FLOOR, output_shape.data()); + 
MACE_NOT_IMPLEMENTED; } - const index_t round_h = - (output_shape[1] + wino_blk_size_ - 1) / wino_blk_size_; - const index_t round_w = - (output_shape[2] + wino_blk_size_ - 1) / wino_blk_size_; - const index_t out_width = input_tensor->dim(0) * round_h * round_w; - - const float round_hw_r = 1.f / static_cast(round_h * round_w); - const float round_w_r = 1.f / static_cast(round_w); - const index_t blk_sqr = (wino_blk_size_ + 2) * (wino_blk_size_ + 2); - - const uint32_t gws[2] = { - static_cast(out_width), - static_cast(RoundUpDiv4(input_tensor->dim(3))) - }; - if (!IsVecEqual(input_shape_, input_tensor->shape())) { - output_shape = {blk_sqr, input_tensor->dim(3), out_width}; - std::vector padded_output_shape = { - output_shape[0], output_shape[1], output_shape[2], 1 - }; - std::vector image_shape; - CalImage2DShape(padded_output_shape, - BufferType::IN_OUT_HEIGHT, - &image_shape); - // remove unused last dimension - MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape)); - - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_2D_GWS_ARGS(kernel_); - kernel_.setArg(idx++, *(input_tensor->opencl_image())); - kernel_.setArg(idx++, *(output_tensor->opencl_image())); - kernel_.setArg(idx++, static_cast(input_tensor->dim(1))); - kernel_.setArg(idx++, static_cast(input_tensor->dim(2))); - kernel_.setArg(idx++, static_cast(input_tensor->dim(3))); - kernel_.setArg(idx++, static_cast(round_h * round_w)); - kernel_.setArg(idx++, round_hw_r); - kernel_.setArg(idx++, static_cast(round_w)); - kernel_.setArg(idx++, round_w_r); - kernel_.setArg(idx++, static_cast(paddings[0] / 2)); - kernel_.setArg(idx++, static_cast(paddings[1] / 2)); - - input_shape_ = input_tensor->shape(); - } - - - const std::vector lws = {kwg_size_ / 8, 8, 0}; - std::string tuning_key = Concat("winograd_transform_kernel", - output_tensor->dim(0), - output_tensor->dim(1), - output_tensor->dim(2)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); 
- - OUT_OF_RANGE_VALIDATION(kernel_error_); - return MACE_SUCCESS; } - template MaceStatus WinogradInverseTransformFunctor::operator()( const std::vector &inputs, Tensor *output_tensor, StatsFuture *future) { - auto runtime = context_->device()->opencl_runtime(); - - const Tensor *input_tensor = inputs[0]; - const Tensor *bias = inputs.size() == 3 ? inputs[2] : nullptr; - - if (kernel_.get() == nullptr) { - std::string obfuscated_kernel_name; - std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_, context_); - NON_UNIFORM_WG_CONFIG; - if (wino_blk_size_ == 4) { - obfuscated_kernel_name = - MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4"); - built_options.emplace("-Dwinograd_inverse_transform_4x4=" - + obfuscated_kernel_name); - } else if (wino_blk_size_ == 2) { - obfuscated_kernel_name = - MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); - built_options.emplace("-Dwinograd_inverse_transform_2x2=" - + obfuscated_kernel_name); - } else { - MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd."); - return MACE_SUCCESS; - } - - built_options.emplace("-DDATA_TYPE=" + - DtToUpCompatibleCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); - built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); - switch (activation_) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case PRELU: - built_options.emplace("-DUSE_PRELU"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - default: - LOG(FATAL) << "Unknown activation type: " << activation_; - } - - MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", - obfuscated_kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - Tensor::MappingGuard output_shape_guard(inputs[1]); - const int32_t *output_shape_data = inputs[1]->data(); - const index_t batch = output_shape_data[0]; - const index_t height = output_shape_data[1]; - const index_t width = output_shape_data[2]; - const uint32_t gws[2] = { - static_cast(input_tensor->dim(2)), - static_cast(RoundUpDiv4(input_tensor->dim(1)))}; - if (!IsVecEqual(input_shape_, input_tensor->shape())) { - std::vector output_shape = {batch, height, width, - input_tensor->dim(1)}; - std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); - MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape)); - - const index_t round_h = (height + wino_blk_size_ - 1) / wino_blk_size_; - const index_t round_w = (width + wino_blk_size_ - 1) / wino_blk_size_; - - const float round_hw_r = 1.f / static_cast(round_h * round_w); - const float round_w_r = 1.f / static_cast(round_w); - - uint32_t idx = 0; - OUT_OF_RANGE_SET_ARG; - SET_2D_GWS_ARGS(kernel_); - kernel_.setArg( - idx++, - *(static_cast(input_tensor->opencl_image()))); - if (bias != nullptr) { - kernel_.setArg(idx++, - *(static_cast(bias->opencl_image()))); - } - kernel_.setArg( - idx++, *(static_cast(output_tensor->opencl_image()))); - kernel_.setArg(idx++, static_cast(output_shape[1])); - 
kernel_.setArg(idx++, static_cast(output_shape[2])); - kernel_.setArg(idx++, static_cast(round_h * round_w)); - kernel_.setArg(idx++, round_hw_r); - kernel_.setArg(idx++, static_cast(round_w)); - kernel_.setArg(idx++, round_w_r); - kernel_.setArg(idx++, relux_max_limit_); - - input_shape_ = input_tensor->shape(); - } - const std::vector lws = {kwg_size_ / 8, 8, 0}; - std::string tuning_key = - Concat("winograd_inverse_transform_kernel", output_tensor->dim(0), - output_tensor->dim(1), output_tensor->dim(2), - output_tensor->dim(3), input_tensor->dim(2)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); - - OUT_OF_RANGE_VALIDATION(kernel_error_); - return MACE_SUCCESS; + return kernel_->Compute(context_, inputs, output_tensor, future); } template struct WinogradTransformFunctor; diff --git a/mace/kernels/pad.h b/mace/kernels/pad.h index 14a4c8d6f4b7438709f1af05d776bec7cb273883..23d60bf439ba4c6ae4ec0b7da4178ec19250c10f 100644 --- a/mace/kernels/pad.h +++ b/mace/kernels/pad.h @@ -23,31 +23,17 @@ #include "mace/core/tensor.h" #include "mace/kernels/kernel.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { -struct PadFunctorBase : OpKernel { - PadFunctorBase(OpKernelContext *context, - const std::vector &paddings, - const float constant_value) - : OpKernel(context), - paddings_(paddings), - constant_value_(constant_value) {} - - std::vector paddings_; - float constant_value_; -}; - template -struct PadFunctor : public PadFunctorBase { +struct PadFunctor : OpKernel { PadFunctor(OpKernelContext *context, const std::vector &paddings, const float constant_value) - : PadFunctorBase(context, paddings, constant_value) {} + : OpKernel(context), + paddings_(paddings), + constant_value_(constant_value) {} MaceStatus operator()(const Tensor *input, Tensor *output, @@ -93,24 +79,32 @@ struct PadFunctor : public PadFunctorBase { return 
MACE_SUCCESS; } + + std::vector paddings_; + float constant_value_; }; #ifdef MACE_ENABLE_OPENCL +class OpenCLPadKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLPadKernel); +}; template -struct PadFunctor : PadFunctorBase { +struct PadFunctor : OpKernel { PadFunctor(OpKernelContext *context, const std::vector &paddings, - const float constant_value) - : PadFunctorBase(context, paddings, constant_value) {} + const float constant_value); MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future); + Tensor *output, + StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h index c61745284b2288278be0d9c95076a9ab76af45cb..590ea11758c8f8cbc1591c348784a01afb342db7 100644 --- a/mace/kernels/pooling.h +++ b/mace/kernels/pooling.h @@ -29,10 +29,6 @@ #include #endif -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { enum PoolingType { @@ -84,8 +80,7 @@ struct PoolingFunctor: PoolingFunctorBase { strides, padding_type, paddings, - dilations) { - } + dilations) {} void MaxPooling(const float *input, const index_t *in_shape, @@ -455,6 +450,21 @@ struct PoolingFunctor: PoolingFunctorBase { }; #ifdef MACE_ENABLE_OPENCL +class OpenCLPoolingKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const PoolingType pooling_type, + const int *kernels, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLPoolingKernel); +}; template struct PoolingFunctor : PoolingFunctorBase { 
PoolingFunctor(OpKernelContext *context, @@ -463,23 +473,13 @@ struct PoolingFunctor : PoolingFunctorBase { const int *strides, const Padding padding_type, const std::vector &paddings, - const int *dilations) - : PoolingFunctorBase(context, - pooling_type, - kernels, - strides, - padding_type, - paddings, - dilations) { - } + const int *dilations); + MaceStatus operator()(const Tensor *input_tensor, - Tensor *output_tensor, - StatsFuture *future); + Tensor *output_tensor, + StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/reduce_mean.h b/mace/kernels/reduce_mean.h index 38d822e5730a5be345ba074523383dbe287bd2c0..db00fd41e0c8c33a32a8389bec29c61bdea124db 100644 --- a/mace/kernels/reduce_mean.h +++ b/mace/kernels/reduce_mean.h @@ -25,33 +25,15 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/kernels/kernel.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif - namespace mace { namespace kernels { -struct ReduceFunctorBase : OpKernel { - ReduceFunctorBase(OpKernelContext *context, - const std::vector &axis, - const bool keep_dims) - : OpKernel(context), - keep_dims_(keep_dims), - axis_(axis) {} - bool keep_dims_; - bool reduce_first_axis_; - const std::vector axis_; - std::vector data_reshape_; - std::vector out_shape_; -}; - template -struct ReduceMeanFunctor : ReduceFunctorBase { +struct ReduceMeanFunctor : OpKernel { ReduceMeanFunctor(OpKernelContext *context, const std::vector &axis, const bool keep_dims) - : ReduceFunctorBase(context, axis, keep_dims) {} + : OpKernel(context), axis_(axis), keep_dims_(keep_dims) {} void Simplify(const Tensor *input) { std::vector bitmap(static_cast(input->dim_size()), false); @@ -209,33 +191,43 @@ struct ReduceMeanFunctor : ReduceFunctorBase { } MaceStatus operator()(const Tensor *input, - Tensor *output, - 
StatsFuture *future) { + Tensor *output, + StatsFuture *future) { MACE_UNUSED(future); Simplify(input); output->Resize(out_shape_); Compute(input, output); return MACE_SUCCESS; } + + const std::vector axis_; + bool keep_dims_; + bool reduce_first_axis_; + std::vector data_reshape_; + std::vector out_shape_; }; #ifdef MACE_ENABLE_OPENCL +class OpenCLReduceMeanKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLReduceMeanKernel); +}; template -struct ReduceMeanFunctor - : ReduceFunctorBase { +struct ReduceMeanFunctor : OpKernel { ReduceMeanFunctor(OpKernelContext *context, - const std::vector axis, - const bool keep_dims) - : ReduceFunctorBase(context, axis, keep_dims) {} + const std::vector &axis, + const bool keep_dims); MaceStatus operator()(const Tensor *input, - Tensor *output_tensor, + Tensor *output, StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + std::unique_ptr kernel_; }; #endif diff --git a/mace/kernels/resize_bicubic.h b/mace/kernels/resize_bicubic.h index 52f0ee34d1c9b4ed6e298b888868711df20eaf1b..a33e0549fe73035bc866af5384609f72781c297b 100644 --- a/mace/kernels/resize_bicubic.h +++ b/mace/kernels/resize_bicubic.h @@ -25,10 +25,6 @@ #include "mace/kernels/kernel.h" #include "mace/utils/logging.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { @@ -141,32 +137,20 @@ inline void ResizeImage(const float *images, } } -struct ResizeBicubicFunctorBase : OpKernel { - ResizeBicubicFunctorBase(OpKernelContext *context, - const std::vector &size, - bool align_corners) - : OpKernel(context), align_corners_(align_corners) { - MACE_CHECK(size.size() == 2); - out_height_ = size[0]; - out_width_ = size[1]; - } - - protected: - bool align_corners_; - index_t 
out_height_; - index_t out_width_; -}; - template struct ResizeBicubicFunctor; template<> -struct ResizeBicubicFunctor - : ResizeBicubicFunctorBase { +struct ResizeBicubicFunctor : OpKernel { ResizeBicubicFunctor(OpKernelContext *context, - const std::vector &size, - bool align_corners) - : ResizeBicubicFunctorBase(context, size, align_corners) {} + const bool align_corners, + const std::vector &size) + : OpKernel(context), + align_corners_(align_corners) { + MACE_CHECK(size.size() == 2); + out_height_ = size[0]; + out_width_ = size[1]; + } MaceStatus operator()(const Tensor *input, Tensor *output, @@ -205,25 +189,34 @@ struct ResizeBicubicFunctor return MACE_SUCCESS; } + + bool align_corners_; + index_t out_height_; + index_t out_width_; }; #ifdef MACE_ENABLE_OPENCL +class OpenCLResizeBicubicKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLResizeBicubicKernel); +}; template struct ResizeBicubicFunctor - : ResizeBicubicFunctorBase { + : OpKernel { ResizeBicubicFunctor(OpKernelContext *context, - const std::vector &size, - bool align_corners) - : ResizeBicubicFunctorBase(context, size, align_corners) {} + bool align_corners, + const std::vector &size); MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h index b4ed109a45b3010bc60f4ad873243a4a52e85514..ea3f7aa35b9f3848f93802f71c245898991a9b55 100644 --- a/mace/kernels/resize_bilinear.h +++ b/mace/kernels/resize_bilinear.h @@ -24,10 +24,6 @@ #include "mace/kernels/kernel.h" #include "mace/utils/quantize.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - 
namespace mace { namespace kernels { @@ -179,29 +175,16 @@ inline void ResizeImageNHWC(const T *images, } } -struct ResizeBilinearFunctorBase : OpKernel { - ResizeBilinearFunctorBase(OpKernelContext *context, - const std::vector &size, - bool align_corners) - : OpKernel(context), - align_corners_(align_corners) { - MACE_CHECK(size.size() == 2); - out_height_ = size[0]; - out_width_ = size[1]; - } - - protected: - bool align_corners_; - index_t out_height_; - index_t out_width_; -}; - template -struct ResizeBilinearFunctor : ResizeBilinearFunctorBase { +struct ResizeBilinearFunctor : OpKernel { ResizeBilinearFunctor(OpKernelContext *context, const std::vector &size, bool align_corners) - : ResizeBilinearFunctorBase(context, size, align_corners) {} + : OpKernel(context), align_corners_(align_corners) { + MACE_CHECK(size.size() == 2); + out_height_ = size[0]; + out_width_ = size[1]; + } MaceStatus operator()(const Tensor *input, Tensor *output, @@ -255,14 +238,22 @@ struct ResizeBilinearFunctor : ResizeBilinearFunctorBase { return MACE_SUCCESS; } + + bool align_corners_; + index_t out_height_; + index_t out_width_; }; template -struct ResizeBilinearFunctor : ResizeBilinearFunctorBase { +struct ResizeBilinearFunctor : OpKernel { ResizeBilinearFunctor(OpKernelContext *context, const std::vector &size, bool align_corners) - : ResizeBilinearFunctorBase(context, size, align_corners) {} + : OpKernel(context), align_corners_(align_corners) { + MACE_CHECK(size.size() == 2); + out_height_ = size[0]; + out_width_ = size[1]; + } MaceStatus operator()(const Tensor *input, Tensor *output, @@ -316,25 +307,34 @@ struct ResizeBilinearFunctor : ResizeBilinearFunctorBase { return MACE_SUCCESS; } + + bool align_corners_; + index_t out_height_; + index_t out_width_; }; #ifdef MACE_ENABLE_OPENCL +class OpenCLResizeBilinearKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) = 0; + 
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLResizeBilinearKernel); +}; template struct ResizeBilinearFunctor - : ResizeBilinearFunctorBase { + : OpKernel { ResizeBilinearFunctor(OpKernelContext *context, const std::vector &size, - bool align_corners) - : ResizeBilinearFunctorBase(context, size, align_corners) {} + bool align_corners); MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h index 0c2c91268f4d904daddfe401a166ae8b21a0e7eb..6afca75ac4ec758e2e8cf88309e9f99e9d95698c 100644 --- a/mace/kernels/softmax.h +++ b/mace/kernels/softmax.h @@ -30,10 +30,6 @@ #include "mace/kernels/kernel.h" #include "mace/kernels/quantize.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { @@ -356,17 +352,23 @@ struct SoftmaxFunctor : OpKernel { }; #ifdef MACE_ENABLE_OPENCL +class OpenCLSoftmaxKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *logits, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSoftmaxKernel); +}; template struct SoftmaxFunctor : OpKernel { - explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {} + explicit SoftmaxFunctor(OpKernelContext *context); MaceStatus operator()(const Tensor *logits, Tensor *output, StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h index 86982963644821f2f980a23d5a270c5a79396636..337baefcca9d1ea3ff231e3652e467473b89578f 100644 --- a/mace/kernels/space_to_batch.h +++ b/mace/kernels/space_to_batch.h @@ 
-23,10 +23,6 @@ #include "mace/core/tensor.h" #include "mace/kernels/kernel.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { @@ -102,7 +98,7 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { const std::vector &block_shape) : SpaceToBatchFunctorBase(context, paddings, block_shape) {} - MaceStatus operator()(Tensor *space_tensor, + MaceStatus operator()(const Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) { MACE_UNUSED(future); @@ -212,7 +208,7 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { const std::vector &block_shape) : SpaceToBatchFunctorBase(context, paddings, block_shape) {} - MaceStatus operator()(Tensor *space_tensor, + MaceStatus operator()(const Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) { MACE_UNUSED(future); @@ -311,21 +307,29 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { }; #ifdef MACE_ENABLE_OPENCL +class OpenCLSpaceToBatchKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *space_tensor, + const std::vector &paddings, + const std::vector &block_shape, + const std::vector &output_shape, + Tensor *batch_tensor, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSpaceToBatchKernel); +}; template struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { SpaceToBatchFunctor(OpKernelContext *context, const std::vector &paddings, - const std::vector &block_shape) - : SpaceToBatchFunctorBase(context, paddings, block_shape) {} + const std::vector &block_shape); - MaceStatus operator()(Tensor *space_tensor, - Tensor *batch_tensor, - StatsFuture *future); + MaceStatus operator()(const Tensor *space_tensor, + Tensor *batch_tensor, + StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector space_shape_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git 
a/mace/kernels/space_to_depth.h b/mace/kernels/space_to_depth.h index 48dc60781d92a2f58df42bebe30bbf8059887517..2f379bbf8bf34cf59a62d57826bca92b42ffe4bb 100644 --- a/mace/kernels/space_to_depth.h +++ b/mace/kernels/space_to_depth.h @@ -22,10 +22,6 @@ #include "mace/public/mace.h" #include "mace/kernels/kernel.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { @@ -91,20 +87,24 @@ struct SpaceToDepthOpFunctor : OpKernel { }; #ifdef MACE_ENABLE_OPENCL +class OpenCLSpaceToDepthKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSpaceToDepthKernel); +}; template struct SpaceToDepthOpFunctor : OpKernel { explicit SpaceToDepthOpFunctor(OpKernelContext *context, - const int block_size) - : OpKernel(context), block_size_(block_size) {} + const int block_size); MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future); - const int block_size_; - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/split.h b/mace/kernels/split.h index 899e74dac04f9de7c11eb2c3e94f01706b464828..ffef9699ccbeb0fd0404d951e0e6de85c0aeb0d4 100644 --- a/mace/kernels/split.h +++ b/mace/kernels/split.h @@ -25,28 +25,17 @@ #include "mace/kernels/kernel.h" #include "mace/public/mace.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { -struct SplitFunctorBase : OpKernel { - SplitFunctorBase(OpKernelContext *context, const int32_t axis) - : OpKernel(context), axis_(axis) {} - - int32_t axis_; -}; - template -struct SplitFunctor : SplitFunctorBase { +struct SplitFunctor : OpKernel { SplitFunctor(OpKernelContext *context, const 
int32_t axis) - : SplitFunctorBase(context, axis) {} + : OpKernel(context), axis_(axis) {} MaceStatus operator()(const Tensor *input, - const std::vector &output_list, - StatsFuture *future) { + const std::vector &output_list, + StatsFuture *future) { MACE_UNUSED(future); const index_t input_channels = input->dim(axis_); const size_t outputs_count = output_list.size(); @@ -88,20 +77,28 @@ struct SplitFunctor : SplitFunctorBase { return MACE_SUCCESS; } + + int32_t axis_; }; #ifdef MACE_ENABLE_OPENCL +class OpenCLSplitKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + const std::vector &output_list, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSplitKernel); +}; template -struct SplitFunctor : SplitFunctorBase { - SplitFunctor(OpKernelContext *context, const int32_t axis) - : SplitFunctorBase(context, axis) {} +struct SplitFunctor : OpKernel { + SplitFunctor(OpKernelContext *context, const int32_t axis); MaceStatus operator()(const Tensor *input, const std::vector &output_list, StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h index c2e267c480f59118c33380aaf342d04ae37f3b3d..313645980160fa6a34e8144eb1a14d4491e244f2 100644 --- a/mace/kernels/winograd_transform.h +++ b/mace/kernels/winograd_transform.h @@ -23,132 +23,63 @@ #include "mace/kernels/activation.h" #include "mace/kernels/conv_pool_2d_util.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { -struct WinogradTransformFunctorBase : OpKernel { - WinogradTransformFunctorBase(OpKernelContext *context, - const Padding &padding_type, - const std::vector &paddings, - const int block_size) - : OpKernel(context), - strides_({1, 1}), - dilations_({1, 1}), - 
padding_type_(padding_type), - paddings_(paddings), - wino_blk_size_(block_size) {} - - const std::vector strides_; // [stride_h, stride_w] - const std::vector dilations_; // [dilation_h, dilation_w] - Padding padding_type_; - std::vector paddings_; - const int wino_blk_size_; -}; - -template -struct WinogradTransformFunctor : WinogradTransformFunctorBase { - WinogradTransformFunctor(OpKernelContext *context, - const Padding &padding_type, - const std::vector &paddings, - const int block_size) - : WinogradTransformFunctorBase(context, - padding_type, - paddings, - block_size) {} - - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(input); - MACE_UNUSED(output); - MACE_UNUSED(future); - MACE_NOT_IMPLEMENTED; - return MACE_SUCCESS; - } -}; +template +struct WinogradTransformFunctor; #ifdef MACE_ENABLE_OPENCL +class OpenCLWinogradTransformKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const Tensor *input, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLWinogradTransformKernel); +}; template -struct WinogradTransformFunctor - : WinogradTransformFunctorBase { +struct WinogradTransformFunctor : OpKernel { WinogradTransformFunctor(OpKernelContext *context, const Padding &padding_type, const std::vector &paddings, - const int block_size) - : WinogradTransformFunctorBase(context, - padding_type, - paddings, - block_size) {} + const int block_size); MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL -struct WinogradInverseTransformFunctorBase : OpKernel { - WinogradInverseTransformFunctorBase(OpKernelContext *context, - const ActivationType activation, - const float relux_max_limit, - const int block_size) - : OpKernel(context), - wino_blk_size_(block_size), - 
activation_(activation), - relux_max_limit_(relux_max_limit) {} - - const int wino_blk_size_; - const ActivationType activation_; - const float relux_max_limit_; -}; template -struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { - WinogradInverseTransformFunctor(OpKernelContext *context, - const ActivationType activation, - const float relux_max_limit, - const int block_size) - : WinogradInverseTransformFunctorBase( - context, activation, relux_max_limit, block_size) {} - - MaceStatus operator()(const std::vector &inputs, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(inputs); - MACE_UNUSED(output); - MACE_UNUSED(future); - MACE_NOT_IMPLEMENTED; - return MACE_SUCCESS; - } -}; +struct WinogradInverseTransformFunctor; #ifdef MACE_ENABLE_OPENCL +class OpenCLWinogradInverseTransformKernel { + public: + virtual MaceStatus Compute( + OpKernelContext *context, + const std::vector &inputs, + Tensor *output, + StatsFuture *future) = 0; + MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLWinogradInverseTransformKernel); +}; template -struct WinogradInverseTransformFunctor - : WinogradInverseTransformFunctorBase { +struct WinogradInverseTransformFunctor : OpKernel { WinogradInverseTransformFunctor(OpKernelContext *context, const ActivationType activation, const float relux_max_limit, - const int block_size) - : WinogradInverseTransformFunctorBase( - context, activation, relux_max_limit, block_size) {} + const int block_size); - MaceStatus operator()(const std::vector &inputs, - Tensor *output, - StatsFuture *future); + MaceStatus operator()(const std::vector &inputs, + Tensor *output, + StatsFuture *future); - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; + std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index b9ae1497a70292a4c57c9e278aef1bb96f574b3a..6e1b44d8be5b7844bc5466c66a8299913c9b280e 100644 --- a/mace/libmace/mace.cc +++ 
b/mace/libmace/mace.cc @@ -77,26 +77,36 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) { return MaceStatus::MACE_INVALID_ARGS; } - if (!runtime->IsImageSupport()) { - return MaceStatus::MACE_OUT_OF_RESOURCES; - } + const int mem_type_i = + ProtoArgHelper::GetOptionalArg( + *net_def, "opencl_mem_type", + static_cast(MemoryType::GPU_IMAGE)); + const MemoryType mem_type = static_cast(mem_type_i); + + runtime->set_mem_type(mem_type); + if (mem_type == MemoryType::GPU_IMAGE) { + if (!runtime->IsImageSupport()) { + return MaceStatus::MACE_OUT_OF_RESOURCES; + } - auto opencl_max_image_size = runtime->GetMaxImage2DSize(); - if (opencl_max_image_size.empty()) { - return MaceStatus::MACE_OUT_OF_RESOURCES; - } + auto opencl_max_image_size = runtime->GetMaxImage2DSize(); + if (opencl_max_image_size.empty()) { + return MaceStatus::MACE_OUT_OF_RESOURCES; + } - const std::vector net_max_image_size = - ProtoArgHelper::GetRepeatedArgs( - *net_def, "opencl_max_image_size", {0, 0}); + const std::vector net_max_image_size = + ProtoArgHelper::GetRepeatedArgs( + *net_def, "opencl_max_image_size", {0, 0}); - if (static_cast(net_max_image_size[0]) > opencl_max_image_size[0] - || static_cast(net_max_image_size[1]) - > opencl_max_image_size[1]) { - LOG(INFO) << "opencl max image size " << MakeString(opencl_max_image_size) - << " vs " << MakeString(net_max_image_size); - return MaceStatus::MACE_OUT_OF_RESOURCES; + if (static_cast(net_max_image_size[0]) > opencl_max_image_size[0] + || static_cast(net_max_image_size[1]) + > opencl_max_image_size[1]) { + LOG(INFO) << "opencl max image size " << MakeString(opencl_max_image_size) + << " vs " << MakeString(net_max_image_size); + return MaceStatus::MACE_OUT_OF_RESOURCES; + } } + return MaceStatus::MACE_SUCCESS; } #endif diff --git a/mace/ops/BUILD b/mace/ops/BUILD index 342dbc0d4c6b8ce6728b3d276e61bdec00f6a134..54a885aba5011dcf80f847540b8c262f691e8718 100644 --- a/mace/ops/BUILD +++ b/mace/ops/BUILD @@ -54,14 +54,14 @@ 
cc_library( "*_test.cc", "*_benchmark.cc", "ops_test_util.cc", - "buffer_to_image.cc", - "image_to_buffer.cc", + "buffer_transform.cc", + "buffer_inverse_transform.cc", "lstmcell.cc", ], ) + if_opencl_enabled( [ - "buffer_to_image.cc", - "image_to_buffer.cc", + "buffer_transform.cc", + "buffer_inverse_transform.cc", "lstmcell.cc", ], ), diff --git a/mace/ops/batch_to_space.h b/mace/ops/batch_to_space.h index 3f56e8d94cb1b447ae5ce0203255e3a59813a34b..458db28481f82860372dea3ba7a55ce8ea50f404 100644 --- a/mace/ops/batch_to_space.h +++ b/mace/ops/batch_to_space.h @@ -36,7 +36,7 @@ class BatchToSpaceNDOp : public Operator { MaceStatus Run(StatsFuture *future) override { const Tensor *batch_tensor = this->Input(INPUT); Tensor *space_tensor = this->Output(OUTPUT); - return functor_(space_tensor, const_cast(batch_tensor), future); + return functor_(batch_tensor, space_tensor, future); } private: diff --git a/mace/ops/buffer_to_image.cc b/mace/ops/buffer_inverse_transform.cc similarity index 71% rename from mace/ops/buffer_to_image.cc rename to mace/ops/buffer_inverse_transform.cc index 83569ba3546e0b6a640f199b565d987f0486368e..af52d482c95d4185c6ab5ee5cf9f7aa0bc52c688 100644 --- a/mace/ops/buffer_to_image.cc +++ b/mace/ops/buffer_inverse_transform.cc @@ -12,23 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/ops/buffer_to_image.h" +#include "mace/ops/buffer_inverse_transform.h" namespace mace { namespace ops { -void Register_BufferToImage(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") +void Register_BufferInverseTransform(OperatorRegistryBase *op_registry) { + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferInverseTransform") .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - BufferToImageOp); + BufferInverseTransformOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferInverseTransform") .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - BufferToImageOp); + BufferInverseTransformOp); } } // namespace ops diff --git a/mace/ops/image_to_buffer.h b/mace/ops/buffer_inverse_transform.h similarity index 78% rename from mace/ops/image_to_buffer.h rename to mace/ops/buffer_inverse_transform.h index fc259a01b9c2d7c5ac01cc05762bbe1d12abe2b5..9eefb0f0be2e6f3fbbdbe7253fd56a67fefb1b1c 100644 --- a/mace/ops/image_to_buffer.h +++ b/mace/ops/buffer_inverse_transform.h @@ -12,19 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_OPS_IMAGE_TO_BUFFER_H_ -#define MACE_OPS_IMAGE_TO_BUFFER_H_ +#ifndef MACE_OPS_BUFFER_INVERSE_TRANSFORM_H_ +#define MACE_OPS_BUFFER_INVERSE_TRANSFORM_H_ #include "mace/core/operator.h" -#include "mace/kernels/image_to_buffer.h" +#include "mace/kernels/buffer_inverse_transform.h" namespace mace { namespace ops { template -class ImageToBufferOp : public Operator { +class BufferInverseTransformOp : public Operator { public: - ImageToBufferOp(const OperatorDef &op_def, OpKernelContext *context) + BufferInverseTransformOp(const OperatorDef &op_def, OpKernelContext *context) : Operator(op_def, context), functor_(context, OperatorBase::GetOptionalArg("wino_block_size", 2)) {} @@ -40,7 +40,7 @@ class ImageToBufferOp : public Operator { } private: - kernels::ImageToBufferFunctor functor_; + kernels::BufferInverseTransformFunctor functor_; protected: MACE_OP_INPUT_TAGS(INPUT); @@ -50,4 +50,4 @@ class ImageToBufferOp : public Operator { } // namespace ops } // namespace mace -#endif // MACE_OPS_IMAGE_TO_BUFFER_H_ +#endif // MACE_OPS_BUFFER_INVERSE_TRANSFORM_H_ diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc index 6b2325b725177ff203deecda7b5e55bc74d47b6a..040e666e7c2ce5647bb39d022819bb2a977f5f31 100644 --- a/mace/ops/buffer_to_image_test.cc +++ b/mace/ops/buffer_to_image_test.cc @@ -24,7 +24,7 @@ template void TestBidirectionTransform(const int type, const std::vector &input_shape) { OpsTestNet net; - OpDefBuilder("BufferToImage", "BufferToImageTest") + OpDefBuilder("BufferTransform", "BufferTransformTest") .Input("Input") .Output("B2IOutput") .AddIntArg("buffer_type", type) @@ -37,7 +37,7 @@ void TestBidirectionTransform(const int type, // Run net.RunOp(D); - OpDefBuilder("ImageToBuffer", "ImageToBufferTest") + OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") .Input("B2IOutput") .Output("I2BOutput") .AddIntArg("buffer_type", type) @@ -159,7 +159,7 @@ template void TestDiffTypeBidirectionTransform(const int 
type, const std::vector &input_shape) { OpsTestNet net; - OpDefBuilder("BufferToImage", "BufferToImageTest") + OpDefBuilder("BufferTransform", "BufferTransformTest") .Input("Input") .Output("B2IOutput") .AddIntArg("buffer_type", type) @@ -172,7 +172,7 @@ void TestDiffTypeBidirectionTransform(const int type, // Run net.RunOp(D); - OpDefBuilder("ImageToBuffer", "ImageToBufferTest") + OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") .Input("B2IOutput") .Output("I2BOutput") .AddIntArg("buffer_type", type) @@ -198,7 +198,7 @@ void TestStringHalfBidirectionTransform(const int type, const std::vector &input_shape, const unsigned char *input_data) { OpsTestNet net; - OpDefBuilder("BufferToImage", "BufferToImageTest") + OpDefBuilder("BufferTransform", "BufferTransformTest") .Input("Input") .Output("B2IOutput") .AddIntArg("buffer_type", type) @@ -213,7 +213,7 @@ void TestStringHalfBidirectionTransform(const int type, // Run net.RunOp(D); - OpDefBuilder("ImageToBuffer", "ImageToBufferTest") + OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") .Input("B2IOutput") .Output("I2BOutput") .AddIntArg("buffer_type", type) diff --git a/mace/ops/image_to_buffer.cc b/mace/ops/buffer_transform.cc similarity index 73% rename from mace/ops/image_to_buffer.cc rename to mace/ops/buffer_transform.cc index cc60d146417069b03afa72a8ac8ea0b656212ba9..bab1b894d4955f9269c905ef57f970537d3d837b 100644 --- a/mace/ops/image_to_buffer.cc +++ b/mace/ops/buffer_transform.cc @@ -12,23 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/ops/image_to_buffer.h" +#include "mace/ops/buffer_transform.h" namespace mace { namespace ops { -void Register_ImageToBuffer(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer") +void Register_BufferTransform(OperatorRegistryBase *op_registry) { + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferTransform") .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - ImageToBufferOp); + BufferTransformOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer") + MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferTransform") .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), - ImageToBufferOp); + BufferTransformOp); } } // namespace ops diff --git a/mace/ops/buffer_to_image.h b/mace/ops/buffer_transform.h similarity index 80% rename from mace/ops/buffer_to_image.h rename to mace/ops/buffer_transform.h index 0fa34c30f52a339de00e5f1d5efd28fe844a433b..94a4779f94ae752cdb779b50c234fa1e679f790c 100644 --- a/mace/ops/buffer_to_image.h +++ b/mace/ops/buffer_transform.h @@ -12,19 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_OPS_BUFFER_TO_IMAGE_H_ -#define MACE_OPS_BUFFER_TO_IMAGE_H_ +#ifndef MACE_OPS_BUFFER_TRANSFORM_H_ +#define MACE_OPS_BUFFER_TRANSFORM_H_ #include "mace/core/operator.h" -#include "mace/kernels/buffer_to_image.h" +#include "mace/kernels/buffer_transform.h" namespace mace { namespace ops { template -class BufferToImageOp : public Operator { +class BufferTransformOp : public Operator { public: - BufferToImageOp(const OperatorDef &op_def, OpKernelContext *context) + BufferTransformOp(const OperatorDef &op_def, OpKernelContext *context) : Operator(op_def, context), functor_(context, OperatorBase::GetOptionalArg("wino_block_size", 2)) {} @@ -41,7 +41,7 @@ class BufferToImageOp : public Operator { } private: - kernels::BufferToImageFunctor functor_; + kernels::BufferTransformFunctor functor_; protected: MACE_OP_INPUT_TAGS(INPUT); @@ -50,4 +50,4 @@ class BufferToImageOp : public Operator { } // namespace ops } // namespace mace -#endif // MACE_OPS_BUFFER_TO_IMAGE_H_ +#endif // MACE_OPS_BUFFER_TRANSFORM_H_ diff --git a/mace/ops/buffer_transform_test.cc b/mace/ops/buffer_transform_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..aff6855f6df39d50568a9c722d60e732694fc75b --- /dev/null +++ b/mace/ops/buffer_transform_test.cc @@ -0,0 +1,115 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "gtest/gtest.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +class BufferTransformTest : public OpsTestBase { + protected: + virtual void SetUp() { + OpTestContext::Get()->SetOCLBufferTestFlag(); + } +}; + +namespace { +template +void TestBidirectionTransform(const int type, + const std::vector &input_shape) { + OpsTestNet net; + OpDefBuilder("BufferTransform", "BufferTransformTest") + .Input("Input") + .Output("TransformedOutput") + .AddIntArg("buffer_type", type) + .AddIntArg("T", DataTypeToEnum::value) + .Finalize(net.NewOperatorDef()); + + // Add input data + net.AddRandomInput("Input", input_shape); + + // Run + net.RunOp(DeviceType::GPU); + + OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") + .Input("TransformedOutput") + .Output("Output") + .AddIntArg("buffer_type", type) + .AddIntArg("T", DataTypeToEnum::value) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(DeviceType::GPU); + + if (DataTypeToEnum::value == DataTypeToEnum::value) { + EXPECT_EQ(net.GetOutput("Input")->UnderlyingBuffer(), + net.GetOutput("Output")->UnderlyingBuffer()); + } else { + // Check + ExpectTensorNear(*net.GetOutput("Input"), + *net.GetOutput("Output"), + 1e-3, 1e-4); + } +} +} // namespace + +TEST_F(BufferTransformTest, FloatToHalf) { + TestBidirectionTransform(kernels::BufferType::IN_OUT_CHANNEL, + {1, 2, 3, 4}); +} + +TEST_F(BufferTransformTest, HalfToHalf) { + TestBidirectionTransform(kernels::BufferType::IN_OUT_CHANNEL, + {1, 2, 3, 4}); +} + +namespace { +template +void TestArgumentTransform(const index_t input_size) { + OpsTestNet net; + OpDefBuilder("BufferTransform", "BufferTransformTest") + .Input("Input") + .Output("Output") + .AddIntArg("buffer_type", kernels::BufferType::ARGUMENT) + .AddIntArg("T", DataTypeToEnum::value) + .Finalize(net.NewOperatorDef()); + + // Add input data + net.AddRandomInput("Input", {input_size}); + + // Run + net.RunOp(DeviceType::GPU); + + auto 
output_tensor = net.GetOutput("Output"); + index_t expected_size = RoundUp(input_size, 4); + EXPECT_EQ(expected_size, output_tensor->buffer_shape()[0]); + + // Check + ExpectTensorNear(*net.GetTensor("Input"), *output_tensor, + 1e-3, 1e-4); +} +} // namespace + +TEST_F(BufferTransformTest, Argument) { + TestArgumentTransform(30); + TestArgumentTransform(32); +} + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/conv_2d.h b/mace/ops/conv_2d.h index 5864e1edb0ad1bc4eed4c9db9d1411ea1a6499c2..9d2c24262e1f27c623ea6758d55d63c40791825b 100644 --- a/mace/ops/conv_2d.h +++ b/mace/ops/conv_2d.h @@ -40,8 +40,7 @@ class Conv2dOp : public ConvPool2dOpBase { "NOOP")), OperatorBase::GetOptionalArg("max_limit", 0.0f), static_cast(OperatorBase::GetOptionalArg( - "is_filter_transformed", false)), - context->workspace()->GetScratchBuffer(D)) {} + "is_filter_transformed", false))) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index 750d64ef2feba2ca855e7dfe7e4c067d213b35ae..354b1935edff27fe2b917409eccc9573bb5f2b53 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -23,21 +23,26 @@ namespace mace { namespace ops { namespace test { -class Conv2dOpTest : public OpsTestBase {}; +class Conv2dOpTest : public OpsTestBase { + protected: + virtual void SetUp() { + OpTestContext::Get()->SetOCLImageTestFlag(); + } +}; namespace { template void TestNHWCSimple3x3VALID() { OpsTestNet net; // Add input data - net.AddInputFromArray( + net.AddInputFromArray( "Input", {1, 3, 3, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - net.AddInputFromArray( + net.AddInputFromArray( "Filter", {1, 2, 3, 3}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); - net.AddInputFromArray("Bias", {1}, {0.1f}); + net.AddInputFromArray("Bias", {1}, {0.1f}); if (D == DeviceType::CPU) { 
net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -50,7 +55,6 @@ void TestNHWCSimple3x3VALID() { .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); @@ -77,15 +81,19 @@ void TestNHWCSimple3x3VALID() { net.RunOp(D); // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - kernels::BufferType::IN_OUT_CHANNEL); + ImageToBuffer(&net, "OutputImage", "Output", + kernels::BufferType::IN_OUT_CHANNEL); } else { MACE_NOT_IMPLEMENTED; } auto expected = net.CreateTensor({1, 1, 1, 1}, {18.1f}); - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); + if (DataTypeToEnum::value == DataType::DT_FLOAT) { + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); + } else { + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-3, 1e-3); + } } template @@ -93,14 +101,14 @@ void TestNHWCSimple3x3SAME() { OpsTestNet net; // Add input data - net.AddInputFromArray( + net.AddInputFromArray( "Input", {1, 3, 3, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - net.AddInputFromArray( + net.AddInputFromArray( "Filter", {1, 2, 3, 3}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); - net.AddInputFromArray("Bias", {1}, {0.1f}); + net.AddInputFromArray("Bias", {1}, {0.1f}); if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -113,7 +121,6 @@ void TestNHWCSimple3x3SAME() { .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); @@ -140,8 +147,8 @@ void TestNHWCSimple3x3SAME() { net.RunOp(D); // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - kernels::BufferType::IN_OUT_CHANNEL); + ImageToBuffer(&net, "OutputImage", "Output", + 
kernels::BufferType::IN_OUT_CHANNEL); } else { MACE_NOT_IMPLEMENTED; @@ -151,7 +158,11 @@ void TestNHWCSimple3x3SAME() { {1, 3, 3, 1}, {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f}); - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); + if (DataTypeToEnum::value == DataType::DT_FLOAT) { + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); + } else { + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-3, 1e-3); + } } } // namespace @@ -165,6 +176,11 @@ TEST_F(Conv2dOpTest, OPENCLSimple) { TestNHWCSimple3x3SAME(); } +TEST_F(Conv2dOpTest, OPENCLHalfSimple) { + TestNHWCSimple3x3VALID(); + TestNHWCSimple3x3SAME(); +} + namespace { template void TestNHWCSimple3x3WithoutBias() { @@ -638,7 +654,7 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, auto func = [&](int stride_h, int stride_w, Padding padding) { // generate random input - index_t batch = 3; + index_t batch = 1; index_t height = input_shape[0]; index_t width = input_shape[1]; index_t kernel_h = filter_shape[0]; @@ -713,7 +729,7 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, - 1e-1); + 1e-2); }; func(1, 1, VALID); diff --git a/mace/ops/core_test.cc b/mace/ops/core_test.cc index 8eecd77dca5d2ae149b3a3711b5c1d3cd631d6ad..6c1b25de265619aaaa665ad153271b53fc567a01 100644 --- a/mace/ops/core_test.cc +++ b/mace/ops/core_test.cc @@ -26,7 +26,7 @@ TEST(CoreTest, INIT_MODE) { Workspace ws; op_defs.emplace_back(OperatorDef()); - OpDefBuilder("BufferToImage", "BufferToImageTest") + OpDefBuilder("BufferTransform", "BufferTransformTest") .Input("Input") .Output("B2IOutput") .AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER) @@ -43,7 +43,7 @@ TEST(CoreTest, INIT_MODE) { } op_defs.emplace_back(OperatorDef()); - OpDefBuilder("ImageToBuffer", "ImageToBufferTest") + OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") 
.Input("B2IOutput") .Output("Output") .AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER) diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index 7f7e1c17e7d25ef7c5813cdaf596f8dbb21ae6c1..39dd69449b91b88fa167cb51e1b8a8e9655a70ee 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -250,19 +250,19 @@ void TestNxNS12(const index_t height, const index_t width) { Padding type) { // generate random input static unsigned int seed = time(NULL); - index_t batch = 1 + rand_r(&seed) % 5; - index_t input_channels = 3 + rand_r(&seed) % 16; + index_t batch = 1; + index_t channel = 32; index_t multiplier = 1; // Construct graph OpsTestNet net; // Add input data net.AddRandomInput( - "Input", {batch, height, width, input_channels}); + "Input", {batch, height, width, channel}); net.AddRandomInput( - "Filter", {multiplier, input_channels, kernel_h, kernel_w}); + "Filter", {multiplier, channel, kernel_h, kernel_w}); net.AddRandomInput("Bias", - {multiplier * input_channels}); + {multiplier * channel}); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -275,6 +275,8 @@ void TestNxNS12(const index_t height, const index_t width) { .AddIntArg("padding", type) .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddStringArg("activation", "RELUX") + .AddFloatArg("max_limit", 6.0) .Finalize(net.NewOperatorDef()); // Run on cpu @@ -302,6 +304,8 @@ void TestNxNS12(const index_t height, const index_t width) { .AddIntArg("padding", type) .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddStringArg("activation", "RELUX") + .AddFloatArg("max_limit", 6.0) .Finalize(net.NewOperatorDef()); net.RunOp(DeviceType::GPU); diff --git a/mace/ops/ops_register.cc b/mace/ops/ops_register.cc index dddc032614e37a43cc4968e969ee43bafa1b30f8..7fda59bc6d0aa4ae92acc05c5aa83b7fe91c62a1 100644 --- a/mace/ops/ops_register.cc +++ 
b/mace/ops/ops_register.cc @@ -67,8 +67,8 @@ extern void Register_WinogradInverseTransform(OperatorRegistryBase *op_registry) extern void Register_WinogradTransform(OperatorRegistryBase *op_registry); #ifdef MACE_ENABLE_OPENCL -extern void Register_BufferToImage(OperatorRegistryBase *op_registry); -extern void Register_ImageToBuffer(OperatorRegistryBase *op_registry); +extern void Register_BufferTransform(OperatorRegistryBase *op_registry); +extern void Register_BufferInverseTransform(OperatorRegistryBase *op_registry); extern void Register_LSTMCell(OperatorRegistryBase *op_registry); #endif // MACE_ENABLE_OPENCL } // namespace ops @@ -125,8 +125,8 @@ OperatorRegistry::OperatorRegistry() : OperatorRegistryBase() { ops::Register_WinogradTransform(this); #ifdef MACE_ENABLE_OPENCL - ops::Register_BufferToImage(this); - ops::Register_ImageToBuffer(this); + ops::Register_BufferTransform(this); + ops::Register_BufferInverseTransform(this); ops::Register_LSTMCell(this); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc index 5e94c3a60735bc058467739f148e08153355db6d..a41e5b0947b59718a9ef275b9971eb71726e2f17 100644 --- a/mace/ops/ops_test_util.cc +++ b/mace/ops/ops_test_util.cc @@ -27,18 +27,11 @@ OpTestContext *OpTestContext::Get(int num_threads, return &instance; } -std::shared_ptr OpTestContext::gpu_context() const { - return gpu_context_; -} - -Device *OpTestContext::GetDevice(DeviceType device_type) { - return device_map_[device_type].get(); -} - OpTestContext::OpTestContext(int num_threads, CPUAffinityPolicy cpu_affinity_policy, bool use_gemmlowp) - : gpu_context_(new GPUContext()) { + : gpu_context_(new GPUContext()), + opencl_mem_types_({MemoryType::GPU_IMAGE}) { device_map_[DeviceType::CPU] = std::unique_ptr( new CPUDevice(num_threads, cpu_affinity_policy, @@ -50,6 +43,30 @@ OpTestContext::OpTestContext(int num_threads, GPUPriorityHint::PRIORITY_NORMAL)); } +std::shared_ptr OpTestContext::gpu_context() const { + 
return gpu_context_; +} + +Device *OpTestContext::GetDevice(DeviceType device_type) { + return device_map_[device_type].get(); +} + +std::vector OpTestContext::opencl_mem_types() { + return opencl_mem_types_; +} + +void OpTestContext::SetOCLBufferTestFlag() { + opencl_mem_types_ = {MemoryType::GPU_BUFFER}; +} + +void OpTestContext::SetOCLImageTestFlag() { + opencl_mem_types_ = {MemoryType::GPU_IMAGE}; +} + +void OpTestContext::SetOCLImageAndBufferTestFlag() { + opencl_mem_types_ = {MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER}; +} + } // namespace test } // namespace ops } // namespace mace diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index b9b4cc4d76b8c71d5066a172eafe65ce3b203098..296fa3b9f2811a6d9987ec141b8130a4ba2cb151 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -120,7 +120,10 @@ class OpTestContext { bool use_gemmlowp = true); std::shared_ptr gpu_context() const; Device *GetDevice(DeviceType device_type); - + std::vector opencl_mem_types(); + void SetOCLBufferTestFlag(); + void SetOCLImageTestFlag(); + void SetOCLImageAndBufferTestFlag(); private: OpTestContext(int num_threads, CPUAffinityPolicy cpu_affinity_policy, @@ -128,6 +131,7 @@ class OpTestContext { MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext); std::shared_ptr gpu_context_; + std::vector opencl_mem_types_; std::map> device_map_; }; @@ -459,8 +463,19 @@ class OpsTestNet { // Test and benchmark should setup model once and run multiple times. // Setup time should not be counted during benchmark. 
MaceStatus RunOp(DeviceType device) { - Setup(device); - return Run(); + if (device == DeviceType::GPU) { + auto opencl_mem_types = OpTestContext::Get()->opencl_mem_types(); + for (auto type : opencl_mem_types) { + OpTestContext::Get()->GetDevice(device) + ->opencl_runtime()->set_mem_type(type); + Setup(device); + MACE_RETURN_IF_ERROR(Run()); + } + return MACE_SUCCESS; + } else { + Setup(device); + return Run(); + } } // DEPRECATED(liyin): @@ -512,6 +527,7 @@ class OpsTestBase : public ::testing::Test { } virtual void TearDown() { + OpTestContext::Get()->SetOCLImageTestFlag(); } }; @@ -747,7 +763,7 @@ void BufferToImage(OpsTestNet *net, const int wino_block_size = 2) { MACE_CHECK_NOTNULL(net); - OpDefBuilder("BufferToImage", "BufferToImageTest") + OpDefBuilder("BufferTransform", "BufferTransformTest") .Input(input_name) .Output(output_name) .AddIntArg("buffer_type", type) @@ -755,7 +771,7 @@ void BufferToImage(OpsTestNet *net, .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net->NewOperatorDef()); - // Run + // TODO(liuqi): Use AddNewOperatorDef, and run all ops with same NetDef. 
net->RunOp(D); net->Sync(); @@ -769,7 +785,7 @@ void ImageToBuffer(OpsTestNet *net, const int wino_block_size = 2) { MACE_CHECK_NOTNULL(net); - OpDefBuilder("ImageToBuffer", "ImageToBufferTest") + OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") .Input(input_name) .Output(output_name) .AddIntArg("buffer_type", type) diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc index 082ee4d0cfa710ddf38ade8a6cd3c046dc551e74..36b9d607acaf7d1da094e30055fc81963e4f9018 100644 --- a/mace/ops/pooling_benchmark.cc +++ b/mace/ops/pooling_benchmark.cc @@ -111,6 +111,7 @@ void Pooling(int iters, #define MACE_BM_POOLING(N, C, H, W, K, S, PA, PO) \ MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, float, CPU); \ MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, float, GPU); \ + MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, half, GPU); \ MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, uint8_t, CPU); diff --git a/mace/ops/resize_bicubic.h b/mace/ops/resize_bicubic.h index 23b4c116b660ae814e9c8085a7cbf90712861c02..df9fc11c269ad6eed0dedc9e99b9bf4c98af3ebe 100644 --- a/mace/ops/resize_bicubic.h +++ b/mace/ops/resize_bicubic.h @@ -27,8 +27,8 @@ class ResizeBicubicOp : public Operator { ResizeBicubicOp(const OperatorDef &operator_def, OpKernelContext *context) : Operator(operator_def, context), functor_(context, - OperatorBase::GetRepeatedArgs("size", {-1, -1}), - OperatorBase::GetOptionalArg("align_corners", false)) {} + OperatorBase::GetOptionalArg("align_corners", false), + OperatorBase::GetRepeatedArgs("size", {-1, -1})) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(0); diff --git a/mace/ops/space_to_batch.h b/mace/ops/space_to_batch.h index 39b90b1afbb4fd06c0825e9a4274b9f8f4fa23b7..fabd7bb235197f9b0101b46e77709b2d86977346 100644 --- a/mace/ops/space_to_batch.h +++ b/mace/ops/space_to_batch.h @@ -36,7 +36,7 @@ class SpaceToBatchNDOp : public Operator { MaceStatus Run(StatsFuture *future) override { const Tensor 
*space_tensor = this->Input(INPUT); Tensor *batch_tensor = this->Output(OUTPUT); - return functor_(const_cast(space_tensor), batch_tensor, future); + return functor_(space_tensor, batch_tensor, future); } private: diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py index 18b4cea403066395aaf2473226c1acabec65ac5b..bcbb743918454a1639c2b9eef125fcfa500bb4a2 100644 --- a/mace/python/tools/converter.py +++ b/mace/python/tools/converter.py @@ -106,6 +106,7 @@ def main(unused_args): option.winograd = FLAGS.winograd option.quantize = FLAGS.quantize option.quantize_range_file = FLAGS.quantize_range_file + option.cl_mem_type = FLAGS.cl_mem_type input_node_names = FLAGS.input_node.split(',') input_node_shapes = FLAGS.input_shape.split(':') @@ -323,6 +324,11 @@ def parse_args(): type=str, default="", help="file path of quantize range for each tensor") + parser.add_argument( + "--cl_mem_type", + type=str, + default="image", + help="which memory type to use.[image|buffer]") return parser.parse_known_args() diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index 7587f64738fb26ed47d8b45e17218b241ec36a70..b7974a245113537cb604fcfcd1c62f864863406f 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -131,8 +131,8 @@ class MaceKeyword(object): mace_output_node_name = 'mace_output_node' mace_buffer_type = 'buffer_type' mace_mode = 'mode' - mace_buffer_to_image = 'BufferToImage' - mace_image_to_buffer = 'ImageToBuffer' + mace_buffer_transform = 'BufferTransform' + mace_buffer_inverse_transform = 'BufferInverseTransform' # arg related str mace_padding_str = 'padding' mace_padding_values_str = 'padding_values' @@ -175,6 +175,7 @@ class MaceKeyword(object): mace_opencl_max_image_size = "opencl_max_image_size" mace_seperate_buffer_str = 'seperate_buffer' mace_scalar_input_index_str = 'scalar_input_index' + mace_opencl_mem_type = 
"opencl_mem_type" class TransformerRule(Enum): @@ -194,7 +195,7 @@ class TransformerRule(Enum): RESHAPE_FC_WEIGHT = 14 TRANSPOSE_DATA_FORMAT = 15 TRANSFORM_GLOBAL_CONV_TO_FC = 16 - TRANSFORM_BUFFER_IMAGE = 17 + ADD_BUFFER_TRANSFORM = 17 ADD_DEVICE = 18 SORT_BY_EXECUTION = 19 ADD_IN_OUT_TENSOR_INFO = 20 @@ -208,6 +209,7 @@ class TransformerRule(Enum): TRANSFORM_FAKE_QUANTIZE = 28 CHECK_QUANTIZE_INFO = 29 REARRANGE_BATCH_TO_SPACE = 30 + ADD_OPENCL_INFORMATIONS = 31 class ConverterInterface(object): @@ -265,6 +267,7 @@ class ConverterOption(object): self._quantize = False self._quantize_range_file = "" self._transformer_option = None + self._cl_mem_type = "" @property def input_nodes(self): @@ -298,6 +301,10 @@ class ConverterOption(object): def transformer_option(self): return self._transformer_option + @property + def cl_mem_type(self): + return self._cl_mem_type + @input_nodes.setter def input_nodes(self, input_nodes): for node in input_nodes: @@ -338,6 +345,10 @@ class ConverterOption(object): def transformer_option(self, transformer_option): self._transformer_option = transformer_option + @cl_mem_type.setter + def cl_mem_type(self, cl_mem_type): + self._cl_mem_type = cl_mem_type + def disable_transpose_filters(self): if TransformerRule.TRANSPOSE_FILTERS in self._transformer_option: self._transformer_option.remove(TransformerRule.TRANSPOSE_FILTERS) @@ -377,11 +388,12 @@ class ConverterOption(object): # Mace model structure related transformation TransformerRule.ADD_IN_OUT_TENSOR_INFO, # Device related transformation - TransformerRule.TRANSFORM_BUFFER_IMAGE, + TransformerRule.ADD_BUFFER_TRANSFORM, TransformerRule.ADD_DEVICE, # Data type related transformation TransformerRule.UPDATE_FLOAT_OP_DATA_TYPE, # Transform finalization + TransformerRule.ADD_OPENCL_INFORMATIONS, TransformerRule.ADD_MACE_INPUT_AND_OUTPUT_NODES, # for quantization entropy calibration use TransformerRule.SORT_BY_EXECUTION, diff --git a/mace/python/tools/converter_tool/transformer.py 
b/mace/python/tools/converter_tool/transformer.py index 5760c76fbca15d3f898983525d499accaac94814..982ab5083c474ac0e6358841cdc49be4f3a4a701 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ -80,8 +80,8 @@ class Transformer(base_converter.ConverterInterface): TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC: self.transform_global_conv_to_fc, TransformerRule.RESHAPE_FC_WEIGHT: self.reshape_fc_weight, - TransformerRule.TRANSFORM_BUFFER_IMAGE: - self.transform_buffer_image, + TransformerRule.ADD_BUFFER_TRANSFORM: + self.add_buffer_transform, TransformerRule.QUANTIZE_NODES: self.quantize_nodes, TransformerRule.ADD_QUANTIZE_TENSOR_RANGE: @@ -94,6 +94,8 @@ class Transformer(base_converter.ConverterInterface): self.update_float_op_data_type, TransformerRule.ADD_MACE_INPUT_AND_OUTPUT_NODES: self.add_mace_input_and_output_nodes, + TransformerRule.ADD_OPENCL_INFORMATIONS: + self.add_opencl_informations, TransformerRule.SORT_BY_EXECUTION: self.sort_by_execution, TransformerRule.CHECK_QUANTIZE_INFO: self.check_quantize_info, @@ -1269,13 +1271,13 @@ class Transformer(base_converter.ConverterInterface): return False - def buffer_to_image(self, op, input_idx, input_type): + def buffer_transform(self, op, input_idx, input_type): net = self._model input_name = op.input[input_idx] op_def = net.op.add() op_def.name = input_name.replace(':', '_') + "_b2i" output_name = op_def.name - op_def.type = MaceKeyword.mace_buffer_to_image + op_def.type = MaceKeyword.mace_buffer_transform op_def.input.extend([input_name]) op_def.output.extend([output_name]) @@ -1307,68 +1309,69 @@ class Transformer(base_converter.ConverterInterface): self._opencl_max_image_size[1] = max(self._opencl_max_image_size[1], img_shape[1]) - def transform_buffer_image(self): + def add_buffer_transform(self): if self._option.device != DeviceType.GPU.value: return False - print("Transform buffer to image") + print("Add buffer transform op") net = self._model for 
op in net.op: if op.type == MaceOp.Conv2D.name \ or op.type == MaceOp.Deconv2D.name: - self.buffer_to_image(op, 1, OpenCLBufferType.CONV2D_FILTER) + self.buffer_transform(op, 1, OpenCLBufferType.CONV2D_FILTER) if len(op.input) >= 3 and op.type == MaceOp.Conv2D.name: - self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT) + self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) elif len(op.input) >= 4 and op.type == MaceOp.Deconv2D.name: - self.buffer_to_image(op, 3, OpenCLBufferType.ARGUMENT) + self.buffer_transform(op, 3, OpenCLBufferType.ARGUMENT) elif op.type == MaceOp.DepthwiseConv2d.name: - self.buffer_to_image(op, 1, OpenCLBufferType.DW_CONV2D_FILTER) + self.buffer_transform(op, 1, OpenCLBufferType.DW_CONV2D_FILTER) if len(op.input) >= 3: - self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT) + self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) elif op.type == MaceOp.BiasAdd.name: - self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT) + self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT) elif op.type == MaceOp.Eltwise.name and len(op.input) == 2: if op.input[0] in self._consts \ and len(self._consts[op.input[0]].dims) == 1: - self.buffer_to_image(op, 0, OpenCLBufferType.ARGUMENT) + self.buffer_transform(op, 0, OpenCLBufferType.ARGUMENT) if op.input[1] in self._consts \ and len(self._consts[op.input[1]].dims) == 1: - self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT) + self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT) elif op.type == MaceOp.FoldedBatchNorm.name: - self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT) - self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT) + self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT) + self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) if len(op.input) >= 4: - self.buffer_to_image(op, 3, OpenCLBufferType.ARGUMENT) + self.buffer_transform(op, 3, OpenCLBufferType.ARGUMENT) elif op.type == MaceOp.MatMul.name and \ ConverterUtil.get_arg(op, MaceKeyword.mace_winograd_filter_transformed) 
is not None: # noqa - self.buffer_to_image(op, 0, OpenCLBufferType.WINOGRAD_FILTER) + self.buffer_transform(op, 0, OpenCLBufferType.WINOGRAD_FILTER) elif op.type == MaceOp.WinogradInverseTransform.name \ and len(op.input) >= 3: - self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT) + self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) elif op.type == MaceOp.FullyConnected.name: - self.buffer_to_image(op, 1, OpenCLBufferType.WEIGHT_WIDTH) + self.buffer_transform(op, 1, OpenCLBufferType.WEIGHT_WIDTH) if len(op.input) >= 3: - self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT) + self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) elif op.type == MaceOp.Activation.name: if ConverterUtil.get_arg(op, MaceKeyword.mace_activation_type_str).s == ActivationType.PRELU.name: # noqa - self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT) + self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT) elif op.type == MaceOp.LSTMCell.name: if op.input[1] in self._consts: - self.buffer_to_image(op, 1, - OpenCLBufferType.IN_OUT_CHANNEL) - self.buffer_to_image(op, 2, OpenCLBufferType.IN_OUT_CHANNEL) - self.buffer_to_image(op, 3, OpenCLBufferType.ARGUMENT) + self.buffer_transform(op, 1, + OpenCLBufferType.IN_OUT_CHANNEL) + self.buffer_transform(op, 2, OpenCLBufferType.IN_OUT_CHANNEL) + self.buffer_transform(op, 3, OpenCLBufferType.ARGUMENT) if op.input[4] in self._consts: - self.buffer_to_image(op, 4, - OpenCLBufferType.IN_OUT_CHANNEL) + self.buffer_transform(op, 4, + OpenCLBufferType.IN_OUT_CHANNEL) # Add OpenCL max image size - arg = net.arg.add() - arg.name = MaceKeyword.mace_opencl_max_image_size - arg.ints.extend(self._opencl_max_image_size) + if self._option.cl_mem_type == "image": + arg = net.arg.add() + arg.name = MaceKeyword.mace_opencl_max_image_size + arg.ints.extend(self._opencl_max_image_size) for input_node in self._option.input_nodes.values(): new_input_name = MaceKeyword.mace_input_node_name \ @@ -1376,7 +1379,7 @@ class 
Transformer(base_converter.ConverterInterface): op_def = self._model.op.add() op_def.name = self.normalize_op_name(input_node.name) - op_def.type = MaceKeyword.mace_buffer_to_image + op_def.type = MaceKeyword.mace_buffer_transform op_def.input.extend([new_input_name]) op_def.output.extend([input_node.name]) output_shape = op_def.output_shape.add() @@ -1394,7 +1397,7 @@ class Transformer(base_converter.ConverterInterface): + '_' + output_node.name op_def = self._model.op.add() op_def.name = self.normalize_op_name(output_name) - op_def.type = MaceKeyword.mace_image_to_buffer + op_def.type = MaceKeyword.mace_buffer_inverse_transform op_def.input.extend([output_node.name]) op_def.output.extend([output_name]) if output_node.shape: @@ -1920,3 +1923,16 @@ class Transformer(base_converter.ConverterInterface): and op.type != MaceOp.Dequantize.name): # noqa mace_check(len(op.output) == len(op.quantize_info), "missing quantize info: %s" % op) + + def add_opencl_informations(self): + if self._option.device != DeviceType.GPU.value: + return False + + print("Add OpenCL informations") + + net = self._model + + arg = net.arg.add() + arg.name = MaceKeyword.mace_opencl_mem_type + arg.i = mace_pb2.GPU_IMAGE if self._option.cl_mem_type == "image"\ + else mace_pb2.GPU_BUFFER diff --git a/mace/python/tools/memory_optimizer.py b/mace/python/tools/memory_optimizer.py index 10a0ab5029722e75c4226556ea7b9620e93f469c..e12d2a6cf7e2745f3e8f5b07a3da526bdc9c4bd5 100644 --- a/mace/python/tools/memory_optimizer.py +++ b/mace/python/tools/memory_optimizer.py @@ -18,6 +18,8 @@ from mace.proto import mace_pb2 from mace.python.tools.converter_tool import base_converter as cvt from mace.python.tools.converter_tool.base_converter import DeviceType +from mace.python.tools.converter_tool.base_converter import ConverterUtil +from mace.python.tools.converter_tool.base_converter import MaceKeyword from mace.python.tools.convert_util import calculate_image_shape from mace.python.tools.convert_util import 
OpenCLBufferType @@ -56,6 +58,10 @@ class MemoryOptimizer(object): self.total_mem_count = 0 self.input_ref_counter = {} self.mem_ref_counter = {} + ocl_mem_type_arg = ConverterUtil.get_arg( + net_def, MaceKeyword.mace_opencl_mem_type) + self.cl_mem_type = ocl_mem_type_arg.i if ocl_mem_type_arg is not None\ + else None consumers = {} for op in net_def.op: @@ -223,13 +229,13 @@ class MemoryOptimizer(object): class GPUMemoryOptimizer(MemoryOptimizer): def op_need_optimize_memory(self, op): - if op.type == 'BufferToImage': + if op.type == MaceKeyword.mace_buffer_transform: for arg in op.arg: if arg.name == 'mode' and arg.i == 0: return False - return op.type != 'ImageToBuffer' + return op.type != MaceKeyword.mace_buffer_inverse_transform - def get_op_mem_block(self, op_type, output_shape, output_type): + def get_op_image_mem_block(self, op_type, output_shape): if op_type == 'WinogradTransform' or op_type == 'MatMul': buffer_shape = list(output_shape) + [1] mem_block = MemoryBlock( @@ -264,6 +270,16 @@ class GPUMemoryOptimizer(MemoryOptimizer): buffer_shape)) return mem_block + def get_op_buffer_mem_block(self, output_shape): + return MemoryBlock(mace_pb2.GPU_BUFFER, + [reduce(operator.mul, output_shape, 1), 1]) + + def get_op_mem_block(self, op_type, output_shape, output_type): + if self.cl_mem_type == mace_pb2.GPU_IMAGE: + return self.get_op_image_mem_block(op_type, output_shape) + else: + return self.get_op_buffer_mem_block(output_shape) + def mem_size(self, memory_block): if memory_block.mem_type == mace_pb2.GPU_IMAGE: return memory_block.block[0] * memory_block.block[1] * 4 @@ -295,21 +311,22 @@ class GPUMemoryOptimizer(MemoryOptimizer): max_image_size_x = max(max_image_size_x, block.x) max_image_size_y = max(max_image_size_y, block.y) - # Update OpenCL max image size - net_ocl_max_img_size_arg = None - for arg in self.net_def.arg: - if arg.name == cvt.MaceKeyword.mace_opencl_max_image_size: - net_ocl_max_img_size_arg = arg - max_image_size_x = max(arg.ints[0], 
max_image_size_x) - max_image_size_y = max(arg.ints[1], max_image_size_y) - break - if net_ocl_max_img_size_arg is None: - net_ocl_max_img_size_arg = self.net_def.arg.add() - net_ocl_max_img_size_arg.name = \ - cvt.MaceKeyword.mace_opencl_max_image_size - - net_ocl_max_img_size_arg.ints[:] = [max_image_size_x, - max_image_size_y] + if self.cl_mem_type == mace_pb2.GPU_IMAGE: + # Update OpenCL max image size + net_ocl_max_img_size_arg = None + for arg in self.net_def.arg: + if arg.name == cvt.MaceKeyword.mace_opencl_max_image_size: + net_ocl_max_img_size_arg = arg + max_image_size_x = max(arg.ints[0], max_image_size_x) + max_image_size_y = max(arg.ints[1], max_image_size_y) + break + if net_ocl_max_img_size_arg is None: + net_ocl_max_img_size_arg = self.net_def.arg.add() + net_ocl_max_img_size_arg.name = \ + cvt.MaceKeyword.mace_opencl_max_image_size + + net_ocl_max_img_size_arg.ints[:] = [max_image_size_x, + max_image_size_y] def optimize_gpu_memory(net_def): diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc index 6d554bbe3dbfbd88f338e2602c77ec6f86a2317d..7f768adc7e2420c7163e862b6be244983b0a1791 100644 --- a/mace/test/mace_api_mt_test.cc +++ b/mace/test/mace_api_mt_test.cc @@ -69,7 +69,7 @@ void BufferToImage(const std::string &input_name, const int mode = NetMode::NORMAL) { OperatorDef operator_def; - ops::test::OpDefBuilder("BufferToImage", "BufferToImageOp") + ops::test::OpDefBuilder("BufferTransform", "BufferTransformOp") .Input(input_name) .Output(output_name) .AddIntArg("buffer_type", buffer_type) @@ -93,7 +93,7 @@ void ImageToBuffer(const std::string &input_name, NetDef *net_def) { OperatorDef operator_def; - ops::test::OpDefBuilder("ImageToBuffer", "ImageToBufferOp") + ops::test::OpDefBuilder("BufferInverseTransform", "BufferInverseTransformOp") .Input(input_name) .Output(output_name) .AddIntArg("buffer_type", buffer_type) diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc index 
83d3b33dfb1894a486197af41c7344608bff6e9a..945758b947c71baa1a6550dc1cf7a076ffebad65 100644 --- a/mace/test/mace_api_test.cc +++ b/mace/test/mace_api_test.cc @@ -70,7 +70,7 @@ void BufferToImage(const std::string &input_name, const int mode = NetMode::NORMAL) { OperatorDef operator_def; - ops::test::OpDefBuilder("BufferToImage", "BufferToImageOp") + ops::test::OpDefBuilder("BufferTransform", "BufferTransformOp") .Input(input_name) .Output(output_name) .AddIntArg("buffer_type", buffer_type) @@ -95,7 +95,7 @@ void ImageToBuffer(const std::string &input_name, NetDef *net_def) { OperatorDef operator_def; - ops::test::OpDefBuilder("ImageToBuffer", "ImageToBufferOp") + ops::test::OpDefBuilder("BufferInverseTransform", "BufferInverseTransformOp") .Input(input_name) .Output(output_name) .AddIntArg("buffer_type", buffer_type) diff --git a/mace/utils/utils.h b/mace/utils/utils.h index c9e561b51e04e5c7c0ae7cf310897e4c5aeea8ec..12138cad8ecdc2fba4d9742faa8079c5519416bc 100644 --- a/mace/utils/utils.h +++ b/mace/utils/utils.h @@ -33,6 +33,12 @@ namespace mace { CLASSNAME &operator=(const CLASSNAME &) = delete #endif +#ifndef MACE_VIRTUAL_EMPTY_DESTRUCTOR +#define MACE_VIRTUAL_EMPTY_DESTRUCTOR(CLASSNAME) \ + public: \ + virtual ~CLASSNAME() {} +#endif + template Integer RoundUp(Integer i, Integer factor) { return (i + factor - 1) / factor * factor; diff --git a/repository/opencl-kernel/opencl_kernel_configure.bzl b/repository/opencl-kernel/opencl_kernel_configure.bzl index 6fd1d056071e8e40295246c4b33c16a0f5817324..0fe17b05af040f5d5faedd4e61586c4678e83600 100644 --- a/repository/opencl-kernel/opencl_kernel_configure.bzl +++ b/repository/opencl-kernel/opencl_kernel_configure.bzl @@ -23,30 +23,38 @@ def _opencl_encrypt_kernel_impl(repository_ctx): unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/activation.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/addn.cl")) unused_var = 
repository_ctx.path(Label("//:mace/kernels/opencl/cl/batch_norm.cl")) + unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/batch_to_space.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/bias_add.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/buffer_to_image.cl")) + unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/buffer_transform.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/channel_shuffle.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/common.h")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/concat.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d_1x1.cl")) + unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d_1x1_buffer.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d_3x3.cl")) + unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d_buffer.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/crop.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/deconv_2d.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/depth_to_space.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/depthwise_conv2d.cl")) + unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/depthwise_conv2d_buffer.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/eltwise.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/fully_connected.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/lstmcell.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/matmul.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/pad.cl")) unused_var = 
repository_ctx.path(Label("//:mace/kernels/opencl/cl/pooling.cl")) + unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/pooling_buffer.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/reduce_mean.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/resize_bicubic.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/resize_bilinear.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/split.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/softmax.cl")) + unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/softmax_buffer.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/space_to_batch.cl")) + unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/space_to_depth.cl")) unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/winograd_transform.cl")) python_bin_path = repository_ctx.which("python") diff --git a/tools/converter.py b/tools/converter.py index 2a9aa30b2af353fc7189e419c2de1327e89af403..0e4e789b3622fc9a9a0758cdf5304fe80aae124c 100644 --- a/tools/converter.py +++ b/tools/converter.py @@ -203,6 +203,7 @@ class YAMLKeyword(object): validation_inputs_data = 'validation_inputs_data' validation_threshold = 'validation_threshold' graph_optimize_options = 'graph_optimize_options' # internal use for now + cl_mem_type = 'cl_mem_type' class ModuleName(object): @@ -692,7 +693,7 @@ def get_model_files(model_file_path, return model_file, weight_file -def convert_model(configs): +def convert_model(configs, cl_mem_type): # Remove previous output dirs library_name = configs[YAMLKeyword.library_name] if not os.path.exists(BUILD_OUTPUT_DIR): @@ -735,6 +736,10 @@ def convert_model(configs): StringFormatter.block("Convert %s model" % model_name)) model_config = configs[YAMLKeyword.models][model_name] runtime = model_config[YAMLKeyword.runtime] + if cl_mem_type: + model_config[YAMLKeyword.cl_mem_type] = 
cl_mem_type + else: + model_config[YAMLKeyword.cl_mem_type] = "image" model_file_path, weight_file_path = get_model_files( model_config[YAMLKeyword.model_file_path], @@ -769,6 +774,7 @@ def convert_model(configs): model_config[YAMLKeyword.obfuscate], configs[YAMLKeyword.model_graph_format], data_type, + model_config[YAMLKeyword.cl_mem_type], ",".join(model_config.get(YAMLKeyword.graph_optimize_options, []))) if configs[YAMLKeyword.model_graph_format] == ModelFormat.file: @@ -844,7 +850,7 @@ def convert_func(flags): print_configuration(configs) - convert_model(configs) + convert_model(configs, flags.cl_mem_type) if configs[YAMLKeyword.model_graph_format] == ModelFormat.code: build_model_lib(configs, flags.address_sanitizer) @@ -1683,6 +1689,11 @@ def parse_args(): 'convert', parents=[all_type_parent_parser, convert_run_parent_parser], help='convert to mace model (file or code)') + convert.add_argument( + "--cl_mem_type", + type=str, + default=None, + help="Which type of OpenCL memory type to use [image | buffer].") convert.set_defaults(func=convert_func) run = subparsers.add_parser( 'run', diff --git a/tools/sh_commands.py b/tools/sh_commands.py index 8a266ee884c7d62289131cf819a411e0a306ebf7..a5e0dfa7da9e4163cf87c803001cd76563addb64 100644 --- a/tools/sh_commands.py +++ b/tools/sh_commands.py @@ -340,7 +340,7 @@ def bazel_build(target, enable_neon=True, enable_opencl=True, address_sanitizer=False, - symbol_hidden=False, + symbol_hidden=True, extra_args=""): print("* Build %s with ABI %s" % (target, abi)) if abi == "host": @@ -560,6 +560,7 @@ def gen_model_code(model_codegen_dir, obfuscate, model_graph_format, data_type, + cl_mem_type, graph_optimize_options): bazel_build_common("//mace/python/tools:converter") @@ -591,6 +592,7 @@ def gen_model_code(model_codegen_dir, "--model_graph_format=%s" % model_graph_format, "--data_type=%s" % data_type, "--graph_optimize_options=%s" % graph_optimize_options, + "--cl_mem_type=%s" % cl_mem_type, _fg=True)