Commit e2a40a03 authored by 刘琦

Merge branch 'gpu-buffer' into 'master'

Refactor OpenCL kernels to support buffer memory.

See merge request !803
@@ -8,6 +8,7 @@ stages:
   - ops_test
   - api_test
   - python_tools_tests
+  - model_tests
   - build_android_demo
   - ops_benchmark
   - extra_tests
@@ -113,6 +114,18 @@ python_tools_tests:
       python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a --model_graph_format=file --model_data_format=file || exit 1;
       python tools/converter.py run --config=${CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file || exit 1;
       python tools/converter.py run --config=${CONF_FILE} --example --target_abis=armeabi-v7a --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
+
+model_tests:
+  stage: model_tests
+  script:
+    - pwd
+    - rm -rf mace-models
+    - GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git
+    - CONF_FILE=mace-models/mobilenet-v1/mobilenet-v1.yml
+    - >
+      python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --example --target_abis=armeabi-v7a --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
     - CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml
     - >
       python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1;
......
@@ -14,6 +14,8 @@
 #include "mace/core/device.h"
 
+#include "mace/core/buffer.h"
+
 namespace mace {
 
 CPUDevice::CPUDevice(const int num_threads,
@@ -21,7 +23,8 @@ CPUDevice::CPUDevice(const int num_threads,
                      const bool use_gemmlowp)
     : cpu_runtime_(new CPURuntime(num_threads,
                                   policy,
-                                  use_gemmlowp)) {}
+                                  use_gemmlowp)),
+      scratch_buffer_(new ScratchBuffer(GetCPUAllocator())) {}
 
 CPUDevice::~CPUDevice() = default;
@@ -31,6 +34,7 @@ CPURuntime *CPUDevice::cpu_runtime() {
 #ifdef MACE_ENABLE_OPENCL
 OpenCLRuntime *CPUDevice::opencl_runtime() {
+  LOG(FATAL) << "CPU device should not call OpenCL Runtime";
   return nullptr;
 }
 #endif
@@ -43,4 +47,8 @@ DeviceType CPUDevice::device_type() const {
   return DeviceType::CPU;
 }
 
+ScratchBuffer *CPUDevice::scratch_buffer() {
+  return scratch_buffer_.get();
+}
+
 }  // namespace mace
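
With this hunk every Device owns its own ScratchBuffer, replacing the single host-side buffer the Workspace used to hold (see the workspace.cc changes below). A minimal sketch of how a kernel consumes it, assembled from the conv_2d.h changes later in this diff; the size variables are illustrative:

    // Per-device scratch arena, as used by the CPU conv2d functor below.
    ScratchBuffer *scratch = context_->device()->scratch_buffer();
    scratch->Rewind();                      // reuse the arena from offset 0
    scratch->GrowSize(total_scratch_size);  // grow once to the worst-case size
    // Scratch(n) hands out a BufferSlice that a Tensor can wrap without owning:
    Tensor transformed_input(scratch->Scratch(transformed_input_size), DT_FLOAT);
    Tensor padded_input(scratch->Scratch(padded_input_size), DT_FLOAT);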
@@ -26,6 +26,8 @@
 namespace mace {
 
+class ScratchBuffer;
+
 class Device {
  public:
   virtual ~Device() {}
@@ -37,6 +39,7 @@ class Device {
   virtual Allocator *allocator() = 0;
   virtual DeviceType device_type() const = 0;
+  virtual ScratchBuffer *scratch_buffer() = 0;
 };
 
 class CPUDevice : public Device {
@@ -53,9 +56,11 @@ class CPUDevice : public Device {
   Allocator *allocator() override;
   DeviceType device_type() const override;
+  ScratchBuffer *scratch_buffer() override;
 
  private:
   std::unique_ptr<CPURuntime> cpu_runtime_;
+  std::unique_ptr<ScratchBuffer> scratch_buffer_;
 };
 
 }  // namespace mace
......
@@ -15,7 +15,9 @@
 #ifndef MACE_CORE_FUTURE_H_
 #define MACE_CORE_FUTURE_H_
 
+#include <algorithm>
 #include <functional>
+#include <vector>
 
 #include "mace/utils/logging.h"
@@ -25,9 +27,7 @@ struct CallStats;
 
 // Wait the call to finish and get the stats if param is not nullptr
 struct StatsFuture {
-  std::function<void(CallStats *)> wait_fn = [](CallStats *) {
-    LOG(FATAL) << "wait_fn must be properly set";
-  };
+  std::function<void(CallStats *)> wait_fn;
 };
 
 inline void SetFutureDefaultWaitFn(StatsFuture *future) {
@@ -41,6 +41,29 @@ inline void SetFutureDefaultWaitFn(StatsFuture *future) {
   }
 }
 
+inline void MergeMultipleFutureWaitFn(
+    const std::vector<StatsFuture> &org_futures,
+    StatsFuture *dst_future) {
+  if (dst_future != nullptr) {
+    dst_future->wait_fn = [org_futures](CallStats *stats) {
+      if (stats != nullptr) {
+        stats->start_micros = INT64_MAX;
+        stats->end_micros = 0;
+        for (auto &org_future : org_futures) {
+          CallStats tmp_stats;
+          if (org_future.wait_fn != nullptr) {
+            org_future.wait_fn(&tmp_stats);
+            stats->start_micros = std::min(stats->start_micros,
+                                           tmp_stats.start_micros);
+            stats->end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
+          }
+        }
+        stats->end_micros += stats->start_micros;
+      }
+    };
+  }
+}
+
 }  // namespace mace
 
 #endif  // MACE_CORE_FUTURE_H_
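
MergeMultipleFutureWaitFn lets an op that issues several OpenCL commands hand back a single future; note the merged stats report the earliest start plus the accumulated per-call durations, not a wall-clock end time. A hedged usage sketch (the LaunchKernel loop is hypothetical, not from this diff):

    std::vector<StatsFuture> futures(num_launches);
    for (size_t i = 0; i < num_launches; ++i) {
      LaunchKernel(i, &futures[i]);  // hypothetical: each launch fills wait_fn
    }
    StatsFuture merged;
    MergeMultipleFutureWaitFn(futures, &merged);
    CallStats stats;
    // wait_fn now defaults to empty instead of a fatal stub,
    // so callers must check before invoking:
    if (merged.wait_fn != nullptr) merged.wait_fn(&stats);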
@@ -14,6 +14,8 @@
 #include "mace/core/runtime/opencl/gpu_device.h"
 
+#include "mace/core/buffer.h"
+
 namespace mace {
 
 GPUDevice::GPUDevice(Tuner<uint32_t> *tuner,
@@ -27,7 +29,8 @@ GPUDevice::GPUDevice(Tuner<uint32_t> *tuner,
       CPUDevice(num_threads, cpu_affinity_policy, use_gemmlowp),
       runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf,
                                  opencl_binary_storage, tuner)),
-      allocator_(new OpenCLAllocator(runtime_.get())) {}
+      allocator_(new OpenCLAllocator(runtime_.get())),
+      scratch_buffer_(new ScratchBuffer(allocator_.get())) {}
 
 GPUDevice::~GPUDevice() = default;
@@ -43,4 +46,8 @@ DeviceType GPUDevice::device_type() const {
   return DeviceType::GPU;
 }
 
+ScratchBuffer *GPUDevice::scratch_buffer() {
+  return scratch_buffer_.get();
+}
+
 }  // namespace mace
@@ -37,9 +37,11 @@ class GPUDevice : public CPUDevice {
   OpenCLRuntime *opencl_runtime() override;
   Allocator *allocator() override;
   DeviceType device_type() const override;
+  ScratchBuffer *scratch_buffer() override;
 
  private:
   std::unique_ptr<OpenCLRuntime> runtime_;
   std::unique_ptr<OpenCLAllocator> allocator_;
+  std::unique_ptr<ScratchBuffer> scratch_buffer_;
 };
 
 }  // namespace mace
......
@@ -31,8 +31,6 @@
 namespace mace {
 
-std::string kOpenCLParameterPath;  // NOLINT(runtime/string)
-
 extern const std::map<std::string, std::vector<unsigned char>>
     kEncryptedProgramMap;
@@ -286,7 +284,8 @@ OpenCLRuntime::OpenCLRuntime(
       is_opencl_avaliable_(false),
       is_profiling_enabled_(false),
       opencl_version_(CL_VER_UNKNOWN),
-      gpu_type_(UNKNOWN) {
+      gpu_type_(UNKNOWN),
+      mem_type_(MemoryType::GPU_IMAGE) {
   std::vector<cl::Platform> all_platforms;
   cl::Platform::get(&all_platforms);
   if (all_platforms.size() == 0) {
@@ -471,6 +470,14 @@ uint32_t OpenCLRuntime::device_compute_units() const {
   return device_compute_units_;
 }
 
+bool OpenCLRuntime::UseImageMemory() {
+  return this->mem_type_ == MemoryType::GPU_IMAGE;
+}
+
+void OpenCLRuntime::set_mem_type(MemoryType type) {
+  this->mem_type_ = type;
+}
+
 bool OpenCLRuntime::BuildProgramFromCache(
     const std::string &built_program_key,
     const std::string &build_options_str,
......
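With mem_type_ defaulting to GPU_IMAGE, existing image-based models keep working unchanged; the converter's new --cl_mem_type=buffer flag (exercised by the model_tests CI job above) flips it at initialization. A sketch of the intended call pattern; GPU_BUFFER is an assumed MemoryType value from mace.pb.h, since only GPU_IMAGE is visible in this diff:

    OpenCLRuntime *runtime = device->opencl_runtime();
    runtime->set_mem_type(MemoryType::GPU_BUFFER);  // assumed enum value
    if (runtime->UseImageMemory()) {
      // allocate cl::Image2D-backed tensors and pick the image kernels
    } else {
      // allocate plain cl::Buffer-backed tensors and pick the buffer kernels
    }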
@@ -25,6 +25,7 @@
 #include "mace/core/file_storage.h"
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
+#include "mace/proto/mace.pb.h"
 #include "mace/utils/string_util.h"
 #include "mace/utils/timer.h"
 #include "mace/utils/tuner.h"
@@ -82,6 +83,9 @@ class OpenCLRuntime {
   uint32_t device_compute_units() const;
   Tuner<uint32_t> *tuner();
   bool is_opencl_avaliable();
+  // TODO(liuqi): remove this function in the future, make decision at runtime.
+  bool UseImageMemory();
+  void set_mem_type(MemoryType type);
 
   void GetCallStats(const cl::Event &event, CallStats *stats);
   uint64_t GetDeviceMaxWorkGroupSize();
@@ -129,6 +133,7 @@ class OpenCLRuntime {
   bool is_profiling_enabled_;
   OpenCLVersion opencl_version_;
   GPUType gpu_type_;
+  MemoryType mem_type_;
   // All OpenCL object must be a pointer and manually deleted before unloading
   // OpenCL library.
   std::shared_ptr<cl::Context> context_;
......
@@ -101,13 +101,14 @@ enum DataFormat { NHWC = 0, NCHW = 1, HWOI = 2, OIHW = 3, HWIO = 4, OHWI = 5 };
 class Tensor {
  public:
   Tensor(Allocator *alloc, DataType type,
-         bool is_weight = false)
+         bool is_weight = false,
+         const std::string name = "")
       : allocator_(alloc),
         dtype_(type),
         buffer_(nullptr),
         is_buffer_owner_(true),
         unused_(false),
-        name_(""),
+        name_(name),
         is_weight_(is_weight),
         scale_(0.f),
         zero_point_(0),
@@ -115,12 +116,13 @@ class Tensor {
         maxval_(0.f) {}
 
   Tensor(BufferBase *buffer, DataType dtype,
-         bool is_weight = false)
+         bool is_weight = false,
+         const std::string name = "")
       : dtype_(dtype),
         buffer_(buffer),
         is_buffer_owner_(false),
         unused_(false),
-        name_(""),
+        name_(name),
         is_weight_(is_weight),
         scale_(0.f),
         zero_point_(0),
@@ -129,12 +131,13 @@ class Tensor {
   Tensor(const BufferSlice &buffer_slice,
          DataType dtype,
-         bool is_weight = false)
+         bool is_weight = false,
+         const std::string name = "")
       : dtype_(dtype),
         buffer_slice_(buffer_slice),
         is_buffer_owner_(false),
         unused_(false),
-        name_(""),
+        name_(name),
         is_weight_(is_weight),
         scale_(0.f),
         zero_point_(0),
@@ -152,6 +155,8 @@ class Tensor {
     }
   }
 
+  inline std::string name() const { return name_; }
+
   inline DataType dtype() const { return dtype_; }
   inline void SetDtype(DataType dtype) { dtype_ = dtype; }
@@ -188,11 +193,15 @@ class Tensor {
     shape_configured_ = shape_configured;
   }
 
+  inline const std::vector<index_t> &buffer_shape() const {
+    return buffer_shape_;
+  }
+
   inline index_t dim_size() const { return shape_.size(); }
 
   inline index_t dim(unsigned int index) const {
-    MACE_CHECK(index < shape_.size(), "Dim out of range: ", index, " >= ",
-               shape_.size());
+    MACE_CHECK(index < shape_.size(),
               name_, ": Dim out of range: ", index, " >= ", shape_.size());
     return shape_[index];
   }
@@ -214,12 +223,12 @@ class Tensor {
 #ifdef MACE_ENABLE_OPENCL
   inline cl::Image *opencl_image() const {
-    MACE_CHECK(has_opencl_image(), "do not have image");
+    MACE_CHECK(has_opencl_image(), name_, " do not have image");
     return static_cast<cl::Image *>(buffer_->buffer());
   }
 
   inline cl::Buffer *opencl_buffer() const {
-    MACE_CHECK(has_opencl_buffer(), "do not have opencl buffer");
+    MACE_CHECK(has_opencl_buffer(), name_, " do not have opencl buffer");
     return static_cast<cl::Buffer *>(buffer_->buffer());
   }
 #endif
@@ -268,12 +277,14 @@ class Tensor {
   inline MaceStatus Resize(const std::vector<index_t> &shape) {
     shape_ = shape;
+    buffer_shape_ = shape;
     image_shape_.clear();
     if (buffer_ != nullptr) {
-      MACE_CHECK(!has_opencl_image(), "Cannot resize image, use ResizeImage.");
+      MACE_CHECK(!has_opencl_image(),
+                 name_, ": Cannot resize image, use ResizeImage.");
       if (raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE > buffer_->size()) {
-        LOG(WARNING) << "Resize buffer from size " << buffer_->size() << " to "
-                     << raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE;
+        LOG(WARNING) << name_ << ": Resize buffer from size " << buffer_->size()
+                     << " to " << raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE;
         return buffer_->Resize(raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE);
       }
       return MaceStatus::MACE_SUCCESS;
@@ -296,19 +307,22 @@ class Tensor {
     allocator_ = other.allocator_;
     dtype_ = other.dtype_;
     shape_ = other.shape_;
+    buffer_shape_ = other.buffer_shape_;
     image_shape_ = other.image_shape_;
   }
 
   inline MaceStatus ResizeImage(const std::vector<index_t> &shape,
                                 const std::vector<size_t> &image_shape) {
     shape_ = shape;
+    buffer_shape_ = shape;
     image_shape_ = image_shape;
     if (buffer_ == nullptr) {
       MACE_CHECK(is_buffer_owner_);
       buffer_ = new Image(allocator_);
       return buffer_->Allocate(image_shape, dtype_);
     } else {
-      MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");
+      MACE_CHECK(has_opencl_image(),
+                 name_, ": Cannot ResizeImage buffer, use Resize.");
       Image *image = dynamic_cast<Image *>(buffer_);
       MACE_CHECK(image_shape[0] <= image->image_shape()[0] &&
                      image_shape[1] <= image->image_shape()[1],
@@ -366,8 +380,6 @@ class Tensor {
   inline BufferBase *UnderlyingBuffer() const { return buffer_; }
 
-  inline void SetSourceOpName(const std::string name) { name_ = name; }
-
   inline void DebugPrint() const {
     using namespace numerical_chars;  // NOLINT(build/namespaces)
     std::stringstream os;
@@ -459,9 +471,12 @@ class Tensor {
  private:
   Allocator *allocator_;
   DataType dtype_;
+  // the shape of buffer (logical)
   std::vector<index_t> shape_;
   std::vector<index_t> shape_configured_;
   std::vector<size_t> image_shape_;
+  // the shape of buffer (physical storage)
+  std::vector<index_t> buffer_shape_;
   BufferBase *buffer_;
   BufferSlice buffer_slice_;
   bool is_buffer_owner_;
......
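Two threads run through this tensor.h change: every constructor now takes the tensor's name, so MACE_CHECK failures identify the offending tensor, and buffer_shape_ records the physical storage shape set by the last Resize/ResizeImage, separate from the logical shape_. A small sketch of the named-constructor effect:

    // The name now flows into every check message on this tensor.
    Tensor t(GetCPUAllocator(), DT_FLOAT, /*is_weight=*/false, "conv1_output");
    t.Resize({1, 3, 32, 32});  // sets both shape_ and buffer_shape_
    t.dim(4);  // fails with "conv1_output: Dim out of range: 4 >= 4"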
@@ -44,8 +44,7 @@ bool HasQuantizeOp(const NetDef &net_def) {
 }
 }  // namespace
 
-Workspace::Workspace() :
-    host_scratch_buffer_(new ScratchBuffer(GetCPUAllocator())) {}
+Workspace::Workspace() = default;
 
 Tensor *Workspace::CreateTensor(const std::string &name,
                                 Allocator *alloc,
@@ -54,8 +53,8 @@ Tensor *Workspace::CreateTensor(const std::string &name,
     VLOG(3) << "Tensor " << name << " already exists. Skipping.";
   } else {
     VLOG(3) << "Creating Tensor " << name;
-    tensor_map_[name] = std::unique_ptr<Tensor>(new Tensor(alloc, type));
-    tensor_map_[name]->SetSourceOpName(name);
+    tensor_map_[name] = std::unique_ptr<Tensor>(new Tensor(alloc, type,
+                                                           false, name));
   }
   return GetTensor(name);
 }
@@ -171,7 +170,10 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
             tensor_buffer_.get(), const_tensor.offset(),
             const_tensor.data_size() *
                 GetEnumTypeSize(const_tensor.data_type())),
-        const_tensor.data_type(), true));
+        const_tensor.data_type(),
+        true,
+        const_tensor.name()));
 
     tensor->Reshape(dims);
     tensor->SetScale(const_tensor.scale());
     tensor->SetZeroPoint(const_tensor.zero_point());
@@ -275,7 +277,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
         std::unique_ptr<BufferBase> tensor_buf(
             new Buffer(device->allocator()));
         MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
-            mem_block.x() * GetEnumTypeSize(dtype)));
+            mem_block.x() * GetEnumTypeSize(dtype)
+            + MACE_EXTRA_BUFFER_PAD_SIZE));
         preallocated_allocator_.SetBuffer(mem_block.mem_id(),
                                           std::move(tensor_buf));
       }
@@ -301,10 +304,9 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
       }
       std::unique_ptr<Tensor> tensor
          (new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]),
-                      output_type));
-      tensor->SetSourceOpName(op.name());
-      if (device_type == DeviceType::GPU) {
-        VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
+                      output_type, false, op.output(i)));
+      if (device_type == DeviceType::GPU && tensor->has_opencl_image()) {
+        VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")"
                 << " Mem: " << mem_ids[i]
                 << " Image shape: "
                 << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
@@ -312,8 +314,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
                 << ", "
                 << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
                        ->image_shape()[1];
-      } else if (device_type == DeviceType::CPU) {
-        VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
+      } else {
+        VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")"
                 << " Mem: " << mem_ids[i]
                 << ", Buffer size: " << tensor->UnderlyingBuffer()->size();
       }
@@ -356,14 +358,6 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
   return MaceStatus::MACE_SUCCESS;
 }
 
-ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) {
-  if (device_type == CPU) {
-    return host_scratch_buffer_.get();
-  } else {
-    return nullptr;
-  }
-}
-
 void Workspace::RemoveUnusedBuffer() {
   auto iter = tensor_map_.begin();
   auto end_iter = tensor_map_.end();
......
@@ -52,8 +52,6 @@ class Workspace {
                              Device *device,
                              const unsigned char *model_data);
 
-  ScratchBuffer *GetScratchBuffer(DeviceType device_type);
-
   void RemoveUnusedBuffer();
 
   void RemoveAndReloadBuffer(const NetDef &net_def,
@@ -64,15 +62,12 @@ class Workspace {
   MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
                                       Device *device);
 
-  Device *device_;
   TensorMap tensor_map_;
 
   std::unique_ptr<BufferBase> tensor_buffer_;
 
   PreallocatedPooledAllocator preallocated_allocator_;
 
-  std::unique_ptr<ScratchBuffer> host_scratch_buffer_;
-
   bool fused_buffer_;
 
   MACE_DISABLE_COPY_AND_ASSIGN(Workspace);
......
@@ -32,6 +32,8 @@ cc_library(
     ) + if_opencl_enabled(glob(
         [
             "opencl/*.cc",
+            "opencl/image/*.cc",
+            "opencl/buffer/*.cc",
         ],
         exclude = [
            "opencl/*_test.cc",
@@ -43,14 +45,16 @@ cc_library(
            "arm/*.h",
        ],
        exclude = [
-            "buffer_to_image.h",
-            "image_to_buffer.h",
+            "buffer_transform.h",
+            "buffer_inverse_transform.h",
            "lstmcell.h",
        ],
    ) + if_opencl_enabled(glob([
        "opencl/*.h",
-        "buffer_to_image.h",
-        "image_to_buffer.h",
+        "opencl/image/*.h",
+        "opencl/buffer/*.h",
+        "buffer_transform.h",
+        "buffer_inverse_transform.h",
        "lstmcell.h",
    ])),
    copts = [
......
@@ -26,10 +26,6 @@
 #include "mace/core/types.h"
 #include "mace/kernels/kernel.h"
 
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {
@@ -164,15 +160,22 @@ class ActivationFunctor<DeviceType::CPU, float> : OpKernel {
 };
 
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLActivationKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const Tensor *alpha,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLActivationKernel);
+};
+
 template <typename T>
 class ActivationFunctor<DeviceType::GPU, T> : OpKernel {
  public:
   ActivationFunctor(OpKernelContext *context,
                     ActivationType type,
-                    T relux_max_limit)
-      : OpKernel(context),
-        activation_(type),
-        relux_max_limit_(static_cast<T>(relux_max_limit)) {}
+                    T relux_max_limit);
 
   MaceStatus operator()(const Tensor *input,
                         const Tensor *alpha,
@@ -180,13 +183,7 @@ class ActivationFunctor<DeviceType::GPU, T> : OpKernel {
                         StatsFuture *future);
 
  private:
-  ActivationType activation_;
-  T relux_max_limit_;
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::string tuning_key_prefix_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLActivationKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
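This is the recurring pattern of the whole refactor: each GPU functor drops its raw cl::Kernel state and delegates to an abstract OpenCL*Kernel, so an image-memory and a buffer-memory implementation (under kernels/opencl/image/ and kernels/opencl/buffer/, per the BUILD change above) can sit behind one interface. A self-contained sketch of the shape of that pattern, with stand-in types where this diff does not show MACE's own:

    #include <memory>

    struct Ctx;     // stand-ins for OpKernelContext, Tensor, MaceStatus
    struct Tensor;
    enum class Status { OK };

    // Abstract kernel interface, mirroring OpenCLActivationKernel above.
    class ActKernel {
     public:
      virtual ~ActKernel() = default;
      virtual Status Compute(Ctx *ctx, const Tensor *in, Tensor *out) = 0;
    };

    class ImageActKernel : public ActKernel {   // cl::Image2D-backed storage
     public:
      Status Compute(Ctx *, const Tensor *, Tensor *) override {
        return Status::OK;
      }
    };

    class BufferActKernel : public ActKernel {  // plain cl::Buffer storage
     public:
      Status Compute(Ctx *, const Tensor *, Tensor *) override {
        return Status::OK;
      }
    };

    // The functor constructor, now defined out of line in a .cc file, picks
    // the implementation once, presumably via OpenCLRuntime::UseImageMemory().
    class ActFunctor {
     public:
      explicit ActFunctor(bool use_image)
          : kernel_(use_image ? static_cast<ActKernel *>(new ImageActKernel)
                              : new BufferActKernel) {}
      Status operator()(Ctx *ctx, const Tensor *in, Tensor *out) {
        return kernel_->Compute(ctx, in, out);
      }
     private:
      std::unique_ptr<ActKernel> kernel_;
    };

The same interface-plus-unique_ptr shape repeats below for AddN, BatchNorm, BatchToSpace, BiasAdd, the buffer transforms, ChannelShuffle, Concat, and Conv2d.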
@@ -26,10 +26,6 @@
 #include "mace/core/tensor.h"
 #include "mace/kernels/kernel.h"
 
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {
@@ -96,17 +92,23 @@ struct AddNFunctor : OpKernel {
 };
 
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLAddNKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const std::vector<const Tensor *> &input_tensors,
+      Tensor *output_tensor,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLAddNKernel);
+};
+
 template <typename T>
 struct AddNFunctor<DeviceType::GPU, T> : OpKernel {
-  explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {}
+  explicit AddNFunctor(OpKernelContext *context);
 
   MaceStatus operator()(const std::vector<const Tensor *> &input_tensors,
                         Tensor *output_tensor,
                         StatsFuture *future);
 
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLAddNKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -26,41 +26,22 @@
 #include "mace/kernels/activation.h"
 #include "mace/public/mace.h"
 
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {
 
-struct BatchNormFunctorBase : OpKernel {
-  BatchNormFunctorBase(OpKernelContext *context,
-                       bool folded_constant,
-                       const ActivationType activation,
-                       const float relux_max_limit)
-      : OpKernel(context),
-        folded_constant_(folded_constant),
-        activation_(activation),
-        relux_max_limit_(relux_max_limit) {}
-
-  const bool folded_constant_;
-  const ActivationType activation_;
-  const float relux_max_limit_;
-};
-
 template<DeviceType D, typename T>
 struct BatchNormFunctor;
 
 template<>
-struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
+struct BatchNormFunctor<DeviceType::CPU, float> : OpKernel {
   BatchNormFunctor(OpKernelContext *context,
                    const bool folded_constant,
                    const ActivationType activation,
                    const float relux_max_limit)
-      : BatchNormFunctorBase(context,
-                             folded_constant,
-                             activation,
-                             relux_max_limit) {}
+      : OpKernel(context),
+        folded_constant_(folded_constant),
+        activation_(activation),
+        relux_max_limit_(relux_max_limit) {}
 
   MaceStatus operator()(const Tensor *input,
                         const Tensor *scale,
@@ -133,19 +114,33 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
     return MACE_SUCCESS;
   }
 
+  const bool folded_constant_;
+  const ActivationType activation_;
+  const float relux_max_limit_;
 };
 
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLBatchNormKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const Tensor *scale,
+      const Tensor *offset,
+      const Tensor *mean,
+      const Tensor *var,
+      const float epsilon,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBatchNormKernel);
+};
+
 template<typename T>
-struct BatchNormFunctor<DeviceType::GPU, T> : BatchNormFunctorBase {
+struct BatchNormFunctor<DeviceType::GPU, T> : OpKernel {
   BatchNormFunctor(OpKernelContext *context,
                    const bool folded_constant,
                    const ActivationType activation,
-                   const float relux_max_limit)
-      : BatchNormFunctorBase(context,
-                             folded_constant,
-                             activation,
-                             relux_max_limit) {}
+                   const float relux_max_limit);
 
   MaceStatus operator()(const Tensor *input,
                         const Tensor *scale,
                         const Tensor *offset,
@@ -154,10 +149,7 @@ struct BatchNormFunctor<DeviceType::GPU, T> : BatchNormFunctorBase {
                         const float epsilon,
                         Tensor *output,
                         StatsFuture *future);
 
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLBatchNormKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -24,10 +24,6 @@
 #include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"
 
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {
@@ -51,7 +47,8 @@ struct BatchToSpaceFunctorBase : OpKernel {
   void CalculateBatchToSpaceOutputShape(const Tensor *input_tensor,
                                         const DataFormat data_format,
                                         index_t *output_shape) {
-    MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D");
+    MACE_CHECK(input_tensor->dim_size() == 4,
+               "Input(", input_tensor->name(), ") shape should be 4D");
     index_t batch = input_tensor->dim(0);
     index_t channels = 0;
     index_t height = 0;
@@ -96,8 +93,8 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
                      const std::vector<int> &block_shape)
      : BatchToSpaceFunctorBase(context, paddings, block_shape) {}
 
-  MaceStatus operator()(Tensor *space_tensor,
-                        Tensor *batch_tensor,
+  MaceStatus operator()(const Tensor *batch_tensor,
+                        Tensor *space_tensor,
                         StatsFuture *future) {
     MACE_UNUSED(future);
@@ -191,8 +188,8 @@ struct BatchToSpaceFunctor<CPU, uint8_t> : BatchToSpaceFunctorBase {
                      const std::vector<int> &block_shape)
      : BatchToSpaceFunctorBase(context, paddings, block_shape) {}
 
-  MaceStatus operator()(Tensor *space_tensor,
-                        Tensor *batch_tensor,
+  MaceStatus operator()(const Tensor *batch_tensor,
+                        Tensor *space_tensor,
                         StatsFuture *future) {
     MACE_UNUSED(future);
@@ -272,21 +269,29 @@ struct BatchToSpaceFunctor<CPU, uint8_t> : BatchToSpaceFunctorBase {
 };
 
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLBatchToSpaceKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *batch_tensor,
+      const std::vector<int> &paddings,
+      const std::vector<int> &block_shape,
+      const std::vector<index_t> &output_shape,
+      Tensor *space_tensor,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBatchToSpaceKernel);
+};
+
 template <typename T>
 struct BatchToSpaceFunctor<DeviceType::GPU, T> : BatchToSpaceFunctorBase {
   BatchToSpaceFunctor(OpKernelContext *context,
                       const std::vector<int> &paddings,
-                      const std::vector<int> &block_shape)
-      : BatchToSpaceFunctorBase(context, paddings, block_shape) {}
+                      const std::vector<int> &block_shape);
 
-  MaceStatus operator()(Tensor *space_tensor,
-                        Tensor *batch_tensor,
+  MaceStatus operator()(const Tensor *batch_tensor,
+                        Tensor *space_tensor,
                         StatsFuture *future);
 
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> space_shape_;
+  std::unique_ptr<OpenCLBatchToSpaceKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -24,10 +24,6 @@
 #include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"
 
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {
@@ -96,18 +92,26 @@ struct BiasAddFunctor<DeviceType::CPU, float> : BiasAddFunctorBase {
 };
 
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLBiasAddKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const Tensor *bias,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBiasAddKernel);
+};
+
 template <typename T>
 struct BiasAddFunctor<DeviceType::GPU, T> : BiasAddFunctorBase {
-  BiasAddFunctor(OpKernelContext *context, const DataFormat data_format)
-      : BiasAddFunctorBase(context, data_format) {}
+  BiasAddFunctor(OpKernelContext *context, const DataFormat data_format);
 
   MaceStatus operator()(const Tensor *input,
                         const Tensor *bias,
                         Tensor *output,
                         StatsFuture *future);
 
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLBiasAddKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_BUFFER_TO_IMAGE_H_
-#define MACE_KERNELS_BUFFER_TO_IMAGE_H_
+#ifndef MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_
+#define MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_
 
 #include <memory>
 #include <vector>
@@ -26,18 +26,19 @@
 namespace mace {
 namespace kernels {
 
-struct BufferToImageFunctorBase : OpKernel {
-  explicit BufferToImageFunctorBase(OpKernelContext *context,
-                                    const int wino_blk_size)
-      : OpKernel(context), wino_blk_size_(wino_blk_size) {}
+struct BufferInverseTransformFunctorBase : OpKernel {
+  BufferInverseTransformFunctorBase(OpKernelContext *context,
+                                    const int wino_blk_size)
+      : OpKernel(context),
+        wino_blk_size_(wino_blk_size) {}
   const int wino_blk_size_;
 };
 
 template <DeviceType D, typename T>
-struct BufferToImageFunctor : BufferToImageFunctorBase {
-  explicit BufferToImageFunctor(OpKernelContext *context,
-                                const int wino_blk_size)
-      : BufferToImageFunctorBase(context, wino_blk_size) {}
+struct BufferInverseTransformFunctor : BufferInverseTransformFunctorBase {
+  explicit BufferInverseTransformFunctor(OpKernelContext *context,
+                                         const int wino_blk_size)
+      : BufferInverseTransformFunctorBase(context, wino_blk_size) {}
   MaceStatus operator()(const Tensor *input,
                         const BufferType type,
                         Tensor *output,
@@ -51,22 +52,31 @@ struct BufferInverseTransformFunctor : BufferInverseTransformFunctorBase {
   }
 };
 
+class OpenCLBufferInverseTransformKernel {
+ public:
+  virtual MaceStatus Compute(OpKernelContext *context,
+                             const Tensor *input,
+                             const BufferType type,
+                             const int wino_blk_size,
+                             Tensor *output,
+                             StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBufferInverseTransformKernel)
+};
+
 template <typename T>
-struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase {
-  explicit BufferToImageFunctor(OpKernelContext *context,
-                                const int wino_blk_size)
-      : BufferToImageFunctorBase(context, wino_blk_size) {}
+struct BufferInverseTransformFunctor<DeviceType::GPU, T>
+    : BufferInverseTransformFunctorBase {
+  explicit BufferInverseTransformFunctor(OpKernelContext *context,
+                                         const int wino_blk_size);
   MaceStatus operator()(const Tensor *input,
                         const BufferType type,
                         Tensor *output,
                         StatsFuture *future);
 
-  cl::Kernel kernel_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLBufferInverseTransformKernel> kernel_;
 };
 
 }  // namespace kernels
 }  // namespace mace
 
-#endif  // MACE_KERNELS_BUFFER_TO_IMAGE_H_
+#endif  // MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_IMAGE_TO_BUFFER_H_
-#define MACE_KERNELS_IMAGE_TO_BUFFER_H_
+#ifndef MACE_KERNELS_BUFFER_TRANSFORM_H_
+#define MACE_KERNELS_BUFFER_TRANSFORM_H_
 
 #include <memory>
 #include <vector>
@@ -26,18 +26,19 @@
 namespace mace {
 namespace kernels {
 
-struct ImageToBufferFunctorBase : OpKernel {
-  ImageToBufferFunctorBase(OpKernelContext *context,
-                           const int wino_blk_size)
-      : OpKernel(context),
-        wino_blk_size_(wino_blk_size) {}
+struct BufferTransformFunctorBase : OpKernel {
+  explicit BufferTransformFunctorBase(OpKernelContext *context,
+                                      const int wino_blk_size)
+      : OpKernel(context), wino_blk_size_(wino_blk_size) {}
   const int wino_blk_size_;
 };
 
 template <DeviceType D, typename T>
-struct ImageToBufferFunctor : ImageToBufferFunctorBase {
-  ImageToBufferFunctor(OpKernelContext *context, const int wino_blk_size)
-      : ImageToBufferFunctorBase(context, wino_blk_size) {}
+struct BufferTransformFunctor : BufferTransformFunctorBase {
+  BufferTransformFunctor(OpKernelContext *context,
+                         const int wino_blk_size)
+      : BufferTransformFunctorBase(context, wino_blk_size) {}
   MaceStatus operator()(const Tensor *input,
                         const BufferType type,
                         Tensor *output,
@@ -51,22 +52,30 @@ struct BufferTransformFunctor : BufferTransformFunctorBase {
   }
 };
 
+class OpenCLBufferTransformKernel {
+ public:
+  virtual MaceStatus Compute(OpKernelContext *context,
+                             const Tensor *input,
+                             const BufferType type,
+                             const int wino_blk_size,
+                             Tensor *output,
+                             StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBufferTransformKernel)
+};
+
 template <typename T>
-struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase {
-  ImageToBufferFunctor(OpKernelContext *context,
-                       const int wino_blk_size)
-      : ImageToBufferFunctorBase(context, wino_blk_size) {}
+struct BufferTransformFunctor<DeviceType::GPU, T> : BufferTransformFunctorBase {
+  BufferTransformFunctor(OpKernelContext *context, const int wino_blk_size);
   MaceStatus operator()(const Tensor *input,
                         const BufferType type,
                         Tensor *output,
                        StatsFuture *future);
 
-  cl::Kernel kernel_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLBufferTransformKernel> kernel_;
 };
 
 }  // namespace kernels
 }  // namespace mace
 
-#endif  // MACE_KERNELS_IMAGE_TO_BUFFER_H_
+#endif  // MACE_KERNELS_BUFFER_TRANSFORM_H_
@@ -71,20 +71,24 @@ struct ChannelShuffleFunctor : OpKernel {
 };
 
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLChannelShuffleKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLChannelShuffleKernel);
+};
+
 template<typename T>
 struct ChannelShuffleFunctor<DeviceType::GPU, T> : OpKernel {
-  ChannelShuffleFunctor(OpKernelContext *context, const int groups)
-      : OpKernel(context), groups_(groups) {}
+  ChannelShuffleFunctor(OpKernelContext *context, const int groups);
 
   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
                         StatsFuture *future);
 
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  const int groups_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLChannelShuffleKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -24,24 +24,13 @@
 #include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"
 
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {
 
-struct ConcatFunctorBase : OpKernel {
-  ConcatFunctorBase(OpKernelContext *context, const int32_t axis)
-      : OpKernel(context), axis_(axis) {}
-
-  int32_t axis_;
-};
-
 template <DeviceType D, typename T>
-struct ConcatFunctor : ConcatFunctorBase {
+struct ConcatFunctor : OpKernel {
   ConcatFunctor(OpKernelContext *context, const int32_t axis)
-      : ConcatFunctorBase(context, axis) {}
+      : OpKernel(context), axis_(axis) {}
 
   MaceStatus operator()(const std::vector<const Tensor *> &input_list,
                         Tensor *output,
@@ -98,21 +87,29 @@ struct ConcatFunctor : OpKernel {
     return MACE_SUCCESS;
   }
 
+  int32_t axis_;
 };
 
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLConcatKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const std::vector<const Tensor *> &input_list,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLConcatKernel);
+};
+
 template <typename T>
-struct ConcatFunctor<DeviceType::GPU, T> : ConcatFunctorBase {
-  ConcatFunctor(OpKernelContext *context, const int32_t axis)
-      : ConcatFunctorBase(context, axis) {}
+struct ConcatFunctor<DeviceType::GPU, T> : OpKernel {
+  ConcatFunctor(OpKernelContext *context, const int32_t axis);
 
   MaceStatus operator()(const std::vector<const Tensor *> &input_list,
                         Tensor *output,
                         StatsFuture *future);
 
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLConcatKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -35,10 +35,6 @@
 #include "mace/kernels/quantize.h"
 #include "mace/utils/utils.h"
 
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {
@@ -78,8 +74,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                 const int *dilations,
                 const ActivationType activation,
                 const float relux_max_limit,
-                const bool is_filter_transformed,
-                ScratchBuffer *scratch)
+                const bool is_filter_transformed)
       : Conv2dFunctorBase(context,
                           strides,
                           padding_type,
@@ -88,8 +83,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                           activation,
                           relux_max_limit),
         transformed_filter_(GetCPUAllocator(), DataType::DT_FLOAT),
-        is_filter_transformed_(is_filter_transformed),
-        scratch_(scratch) {}
+        is_filter_transformed_(is_filter_transformed) {}
 
   void Conv2dGeneral(const float *input,
                      const float *filter,
@@ -494,14 +488,15 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
     }
 
     // Init scratch buffer
-    scratch_->Rewind();
-    scratch_->GrowSize(total_scratch_size);
+    ScratchBuffer *scratch = context_->device()->scratch_buffer();
+    scratch->Rewind();
+    scratch->GrowSize(total_scratch_size);
 
     Tensor
-        transformed_input(scratch_->Scratch(transformed_input_size), DT_FLOAT);
+        transformed_input(scratch->Scratch(transformed_input_size), DT_FLOAT);
     Tensor
-        transformed_output(scratch_->Scratch(transformed_output_size), DT_FLOAT);
-    Tensor padded_input(scratch_->Scratch(padded_input_size), DT_FLOAT);
-    Tensor padded_output(scratch_->Scratch(padded_output_size), DT_FLOAT);
+        transformed_output(scratch->Scratch(transformed_output_size), DT_FLOAT);
+    Tensor padded_input(scratch->Scratch(padded_input_size), DT_FLOAT);
+    Tensor padded_output(scratch->Scratch(padded_output_size), DT_FLOAT);
     const index_t extra_input_shape[4] =
         {batch, input_channels, extra_input_height, extra_input_width};
     const index_t extra_output_shape[4] =
@@ -559,7 +554,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                            transformed_output_data,
                            pad_output,
                            &sgemm_,
-                           scratch_);
+                           scratch);
      };
    } else if (use_neon_3x3_s1) {
      conv_func = [=](const float *pad_input, float *pad_output) {
@@ -588,7 +583,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                    channels,
                    pad_output,
                    &sgemm_,
-                    scratch_);
+                    scratch);
      };
    } else if (use_neon_5x5_s1) {
      conv_func = [=](const float *pad_input, float *pad_output) {
@@ -735,7 +730,6 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
 
   Tensor transformed_filter_;
   bool is_filter_transformed_;
-  ScratchBuffer *scratch_;
   SGemm sgemm_;
 };
@@ -748,16 +742,14 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
                 const int *dilations,
                 const ActivationType activation,
                 const float relux_max_limit,
-                const bool is_filter_transformed,
-                ScratchBuffer *scratch)
+                const bool is_filter_transformed)
      : Conv2dFunctorBase(context,
                          strides,
                          padding_type,
                          paddings,
                          dilations,
                          activation,
-                          relux_max_limit),
-        scratch_(scratch) {
+                          relux_max_limit) {
    MACE_UNUSED(is_filter_transformed);
  }
@@ -926,13 +918,14 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
    bool im2col_required =
        filter_h != 1 || filter_w != 1 || stride_h != 1 || stride_w != 1;
    total_scratch_size += (im2col_required ? im2col_size : 0);
-    scratch_->Rewind();
-    scratch_->GrowSize(total_scratch_size);
+    ScratchBuffer *scratch = context_->device()->scratch_buffer();
+    scratch->Rewind();
+    scratch->GrowSize(total_scratch_size);
 
    std::unique_ptr<Tensor> zero_bias;
    const int32_t *bias_data = nullptr;
    if (bias == nullptr) {
-      zero_bias.reset(new Tensor(scratch_->Scratch(zero_bias_size), DT_INT32));
+      zero_bias.reset(new Tensor(scratch->Scratch(zero_bias_size), DT_INT32));
      zero_bias->Reshape({channels});
      zero_bias->Clear();
      bias_data = zero_bias->data<int32_t>();
@@ -944,7 +937,7 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
    auto gemm_input_data = input_data;
    if (im2col_required) {
      // prepare im2col
-      im2col.reset(new Tensor(scratch_->Scratch(im2col_size), DT_UINT8));
+      im2col.reset(new Tensor(scratch->Scratch(im2col_size), DT_UINT8));
      uint8_t *im2col_data = im2col->mutable_data<uint8_t>();
      Im2col(input_data, input->shape(), filter_h, filter_w, stride_h,
             stride_w, static_cast<uint8_t>(input->zero_point()),
@@ -976,12 +969,28 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
    return MACE_SUCCESS;
  }
 
-  ScratchBuffer *scratch_;
 };
 
 #ifdef MACE_ENABLE_OPENCL
-template<typename T>
+class OpenCLConv2dKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *bias,
+      const int *strides,
+      const Padding &padding_type,
+      const std::vector<int> &padding_data,
+      const int *dilations,
+      const ActivationType activation,
+      const float relux_max_limit,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLConv2dKernel);
+};
+
+template <typename T>
 struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
   Conv2dFunctor(OpKernelContext *context,
                 const int *strides,
@@ -990,18 +999,7 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
                 const int *dilations,
                 const ActivationType activation,
                 const float relux_max_limit,
-                const bool is_filter_transformed,
-                ScratchBuffer *scratch)
-      : Conv2dFunctorBase(context,
-                          strides,
-                          padding_type,
-                          paddings,
-                          dilations,
-                          activation,
-                          relux_max_limit) {
-    MACE_UNUSED(is_filter_transformed);
-    MACE_UNUSED(scratch);
-  }
+                const bool is_filter_transformed);
 
   MaceStatus operator()(const Tensor *input,
                         const Tensor *filter,
@@ -1009,10 +1007,7 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
                         Tensor *output,
                         StatsFuture *future);
 
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLConv2dKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
...@@ -210,6 +210,20 @@ void CalcOutputSize(const index_t *input_shape, ...@@ -210,6 +210,20 @@ void CalcOutputSize(const index_t *input_shape,
} }
} }
void CalcNCHWInputShape(const index_t *output_shape,
const index_t *filter_shape,
const int *strides,
const int *dilations,
index_t *input_shape) {
MACE_CHECK_NOTNULL(input_shape);
input_shape[0] = output_shape[0];
input_shape[1] = filter_shape[1];
input_shape[2] = (output_shape[2] - 1) * strides[0] +
(filter_shape[2] - 1) * dilations[0] + 1;
input_shape[3] = (output_shape[3] - 1) * strides[1] +
(filter_shape[3] - 1) * dilations[1] + 1;
}
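CalcNCHWInputShape inverts the standard convolution output-size relation: each spatial extent is recovered as (out - 1) * stride + (filter - 1) * dilation + 1, and the channel count comes from the filter's input-channel dimension. A quick worked check with hypothetical values:

// Hypothetical shapes: output NCHW {1, 32, 8, 8}, filter OIHW {32, 16, 3, 3}.
index_t output_shape[4] = {1, 32, 8, 8};
index_t filter_shape[4] = {32, 16, 3, 3};
int strides[2] = {2, 2};
int dilations[2] = {1, 1};
index_t input_shape[4];
CalcNCHWInputShape(output_shape, filter_shape, strides, dilations, input_shape);
// input_shape == {1, 16, 17, 17}:
//   N = 1, C = filter_shape[1] = 16,
//   H = W = (8 - 1) * 2 + (3 - 1) * 1 + 1 = 17.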
void CalcOutputSize(const index_t *input_shape, // NHWC void CalcOutputSize(const index_t *input_shape, // NHWC
const index_t *filter_shape, // OIHW const index_t *filter_shape, // OIHW
const int *padding_size, const int *padding_size,
...@@ -234,8 +248,8 @@ void CalcNCHWOutputSize(const index_t *input_shape, // NCHW ...@@ -234,8 +248,8 @@ void CalcNCHWOutputSize(const index_t *input_shape, // NCHW
void CalPaddingSize(const index_t *input_shape, // NCHW void CalPaddingSize(const index_t *input_shape, // NCHW
const index_t *filter_shape, // OIHW const index_t *filter_shape, // OIHW
const int *dilations,
const int *strides, const int *strides,
const int *dilations,
Padding padding, Padding padding,
int *padding_size) { int *padding_size) {
MACE_CHECK(dilations[0] > 0 && dilations[1] > 0, MACE_CHECK(dilations[0] > 0 && dilations[1] > 0,
......
...@@ -84,6 +84,12 @@ void CalcNCHWOutputSize(const index_t *input_shape, ...@@ -84,6 +84,12 @@ void CalcNCHWOutputSize(const index_t *input_shape,
const RoundType round_type, const RoundType round_type,
index_t *output_shape); index_t *output_shape);
void CalcNCHWInputShape(const index_t *output_shape,
const index_t *filter_shape,
const int *strides,
const int *dilations,
index_t *input_shape);
void CalPaddingSize(const index_t *input_shape, // NCHW void CalPaddingSize(const index_t *input_shape, // NCHW
const index_t *filter_shape, // OIHW const index_t *filter_shape, // OIHW
const int *dilations, const int *dilations,
...@@ -91,6 +97,7 @@ void CalPaddingSize(const index_t *input_shape, // NCHW ...@@ -91,6 +97,7 @@ void CalPaddingSize(const index_t *input_shape, // NCHW
Padding padding, Padding padding,
int *padding_size); int *padding_size);
MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input, MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input,
const int pad_top, const int pad_bottom, const int pad_top, const int pad_bottom,
const int pad_left, const int pad_right, const int pad_left, const int pad_right,
......
...@@ -24,32 +24,18 @@ ...@@ -24,32 +24,18 @@
#include "mace/kernels/kernel.h" #include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct CropFunctorBase : OpKernel { template <DeviceType D, typename T>
CropFunctorBase(OpKernelContext *context, struct CropFunctor : OpKernel {
CropFunctor(OpKernelContext *context,
const int axis, const int axis,
const std::vector<int> &offset) const std::vector<int> &offset)
: OpKernel(context), : OpKernel(context),
axis_(axis), axis_(axis),
offset_(offset) {} offset_(offset) {}
const int axis_;
std::vector<int> offset_;
};
template <DeviceType D, typename T>
struct CropFunctor : CropFunctorBase {
CropFunctor(OpKernelContext *context,
const int axis,
const std::vector<int> &offset)
: CropFunctorBase(context, axis, offset) {}
void crop_copy(const T* input_data, T* output_data, void crop_copy(const T* input_data, T* output_data,
const std::vector<index_t> &input_shape, const std::vector<index_t> &input_shape,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
...@@ -121,23 +107,31 @@ struct CropFunctor : CropFunctorBase { ...@@ -121,23 +107,31 @@ struct CropFunctor : CropFunctorBase {
return MACE_SUCCESS; return MACE_SUCCESS;
} }
const int axis_;
std::vector<int> offset_;
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
class OpenCLCropKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLCropKernel);
};
template <typename T> template <typename T>
struct CropFunctor<DeviceType::GPU, T> : CropFunctorBase { struct CropFunctor<DeviceType::GPU, T> : OpKernel {
CropFunctor(OpKernelContext *context, CropFunctor(OpKernelContext *context,
const int axis, const int axis,
const std::vector<int> &offset) const std::vector<int> &offset);
: CropFunctorBase(context, axis, offset) {}
MaceStatus operator()(const std::vector<const Tensor *> &input_list, MaceStatus operator()(const std::vector<const Tensor *> &input_list,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; std::unique_ptr<OpenCLCropKernel> kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -28,10 +28,6 @@ ...@@ -28,10 +28,6 @@
#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/conv_pool_2d_util.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -317,6 +313,22 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { ...@@ -317,6 +313,22 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
class OpenCLDeconv2dKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *padding_data,
const ActivationType activation,
const float relux_max_limit,
const std::vector<index_t> &output_shape,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDeconv2dKernel);
};
template <typename T> template <typename T>
struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase { struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase {
Deconv2dFunctor(OpKernelContext *context, Deconv2dFunctor(OpKernelContext *context,
...@@ -325,14 +337,7 @@ struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase { ...@@ -325,14 +337,7 @@ struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase {
const std::vector<int> &paddings, const std::vector<int> &paddings,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit);
: Deconv2dFunctorBase(context,
strides,
padding_type,
paddings,
output_shape,
activation,
relux_max_limit) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
...@@ -341,10 +346,7 @@ struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase { ...@@ -341,10 +346,7 @@ struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase {
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; std::unique_ptr<OpenCLDeconv2dKernel> kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -93,20 +93,24 @@ struct DepthToSpaceOpFunctor : OpKernel { ...@@ -93,20 +93,24 @@ struct DepthToSpaceOpFunctor : OpKernel {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
class OpenCLDepthToSpaceKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDepthToSpaceKernel);
};
template<typename T> template<typename T>
struct DepthToSpaceOpFunctor<DeviceType::GPU, T> : OpKernel { struct DepthToSpaceOpFunctor<DeviceType::GPU, T> : OpKernel {
DepthToSpaceOpFunctor(OpKernelContext *context, DepthToSpaceOpFunctor(OpKernelContext *context,
const int block_size) const int block_size);
: OpKernel(context), block_size_(block_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
const int block_size_; std::unique_ptr<OpenCLDepthToSpaceKernel> kernel_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -501,6 +501,24 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t> ...@@ -501,6 +501,24 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t>
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
class OpenCLDepthwiseConv2dKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDepthwiseConv2dKernel);
};
template<typename T> template<typename T>
struct DepthwiseConv2dFunctor<DeviceType::GPU, T> struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
: DepthwiseConv2dFunctorBase { : DepthwiseConv2dFunctorBase {
...@@ -510,14 +528,7 @@ struct DepthwiseConv2dFunctor<DeviceType::GPU, T> ...@@ -510,14 +528,7 @@ struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit);
: DepthwiseConv2dFunctorBase(context,
strides,
padding_type,
paddings,
dilations,
activation,
relux_max_limit) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
...@@ -525,10 +536,7 @@ struct DepthwiseConv2dFunctor<DeviceType::GPU, T> ...@@ -525,10 +536,7 @@ struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; std::unique_ptr<OpenCLDepthwiseConv2dKernel> kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -27,10 +27,6 @@ ...@@ -27,10 +27,6 @@
#include "mace/kernels/kernel.h" #include "mace/kernels/kernel.h"
#include "mace/utils/quantize.h" #include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -805,11 +801,12 @@ inline void TensorEltwisePerChannel(const EltwiseType type, ...@@ -805,11 +801,12 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
} }
} }
struct EltwiseFunctorBase : OpKernel { template <DeviceType D, typename T>
EltwiseFunctorBase(OpKernelContext *context, struct EltwiseFunctor : OpKernel {
EltwiseFunctor(OpKernelContext *context,
const EltwiseType type, const EltwiseType type,
const std::vector<float> &coeff, const std::vector<float> &coeff,
const float scalar_input, const float scalar_input, // float as it comes from arg
const int32_t scalar_input_index, const int32_t scalar_input_index,
const DataFormat data_format) const DataFormat data_format)
: OpKernel(context), : OpKernel(context),
...@@ -819,28 +816,6 @@ struct EltwiseFunctorBase : OpKernel { ...@@ -819,28 +816,6 @@ struct EltwiseFunctorBase : OpKernel {
scalar_input_index_(scalar_input_index), scalar_input_index_(scalar_input_index),
data_format_(data_format) {} data_format_(data_format) {}
EltwiseType type_;
std::vector<float> coeff_;
float scalar_input_;
int32_t scalar_input_index_;
DataFormat data_format_;
};
template <DeviceType D, typename T>
struct EltwiseFunctor : EltwiseFunctorBase {
EltwiseFunctor(OpKernelContext *context,
const EltwiseType type,
const std::vector<float> &coeff,
const float scalar_input, // float as it comes from arg
const int32_t scalar_input_index,
const DataFormat data_format)
: EltwiseFunctorBase(context,
type,
coeff,
scalar_input,
scalar_input_index,
data_format) {}
template <typename DstType> template <typename DstType>
MaceStatus DoEltwise(const Tensor *input0, MaceStatus DoEltwise(const Tensor *input0,
const Tensor *input1, const Tensor *input1,
...@@ -957,23 +932,28 @@ struct EltwiseFunctor : EltwiseFunctorBase { ...@@ -957,23 +932,28 @@ struct EltwiseFunctor : EltwiseFunctorBase {
} }
} }
EltwiseType type_;
std::vector<float> coeff_;
float scalar_input_;
int32_t scalar_input_index_;
DataFormat data_format_;
Tensor scalar_tensor_; Tensor scalar_tensor_;
}; };
template <> template <>
struct EltwiseFunctor<DeviceType::CPU, uint8_t> : EltwiseFunctorBase { struct EltwiseFunctor<DeviceType::CPU, uint8_t> : OpKernel {
EltwiseFunctor(OpKernelContext *context, EltwiseFunctor(OpKernelContext *context,
const EltwiseType type, const EltwiseType type,
const std::vector<float> &coeff, const std::vector<float> &coeff,
const float scalar_input, // float as it comes from arg const float scalar_input, // float as it comes from arg
const int32_t scalar_input_index, const int32_t scalar_input_index,
const DataFormat data_format) const DataFormat data_format)
: EltwiseFunctorBase(context, : OpKernel(context),
type, type_(type),
coeff, coeff_(coeff),
scalar_input, scalar_input_(scalar_input),
scalar_input_index, scalar_input_index_(scalar_input_index),
data_format) {} data_format_(data_format) {}
MaceStatus operator()(const Tensor *input0, MaceStatus operator()(const Tensor *input0,
const Tensor *input1, const Tensor *input1,
...@@ -1093,33 +1073,41 @@ struct EltwiseFunctor<DeviceType::CPU, uint8_t> : EltwiseFunctorBase { ...@@ -1093,33 +1073,41 @@ struct EltwiseFunctor<DeviceType::CPU, uint8_t> : EltwiseFunctorBase {
return MACE_SUCCESS; return MACE_SUCCESS;
} }
EltwiseType type_;
std::vector<float> coeff_;
float scalar_input_;
int32_t scalar_input_index_;
DataFormat data_format_;
Tensor scalar_tensor_;
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
class OpenCLEltwiseKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input0,
const Tensor *input1,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLEltwiseKernel);
};
template <typename T> template <typename T>
struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase { struct EltwiseFunctor<DeviceType::GPU, T> : OpKernel {
EltwiseFunctor(OpKernelContext *context, EltwiseFunctor(OpKernelContext *context,
const EltwiseType type, const EltwiseType type,
const std::vector<float> &coeff, const std::vector<float> &coeff,
const float scalar_input, const float scalar_input,
const int32_t scalar_input_index, const int32_t scalar_input_index,
const DataFormat data_format) const DataFormat data_format);
: EltwiseFunctorBase(context,
type,
coeff,
scalar_input,
scalar_input_index,
data_format) {}
MaceStatus operator()(const Tensor *input0, MaceStatus operator()(const Tensor *input0,
const Tensor *input1, const Tensor *input1,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; std::unique_ptr<OpenCLEltwiseKernel> kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -151,12 +151,24 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase { ...@@ -151,12 +151,24 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
class OpenCLFullyConnectedKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLFullyConnectedKernel);
};
template <typename T> template <typename T>
struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase { struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
FullyConnectedFunctor(OpKernelContext *context, FullyConnectedFunctor(OpKernelContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit);
: FullyConnectedBase(context, activation, relux_max_limit) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *weight, const Tensor *weight,
...@@ -164,11 +176,7 @@ struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase { ...@@ -164,11 +176,7 @@ struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; std::unique_ptr<OpenCLFullyConnectedKernel> kernel_;
std::vector<uint32_t> gws_;
std::vector<uint32_t> lws_;
std::vector<index_t> input_shape_;
std::unique_ptr<BufferBase> kernel_error_;
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -35,11 +35,23 @@ namespace kernels { ...@@ -35,11 +35,23 @@ namespace kernels {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct LSTMCellFunctor; struct LSTMCellFunctor;
class OpenCLLSTMCellKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *pre_output,
const Tensor *weight,
const Tensor *bias,
const Tensor *pre_cell,
Tensor *cell,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLLSTMCellKernel);
};
template <typename T> template <typename T>
struct LSTMCellFunctor<DeviceType::GPU, T> : OpKernel { struct LSTMCellFunctor<DeviceType::GPU, T> : OpKernel {
LSTMCellFunctor(OpKernelContext *context, T forget_bias) LSTMCellFunctor(OpKernelContext *context, T forget_bias);
: OpKernel(context),
forget_bias_(static_cast<T>(forget_bias)) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *pre_output, const Tensor *pre_output,
const Tensor *weight, const Tensor *weight,
...@@ -49,11 +61,7 @@ struct LSTMCellFunctor<DeviceType::GPU, T> : OpKernel{ ...@@ -49,11 +61,7 @@ struct LSTMCellFunctor<DeviceType::GPU, T> : OpKernel{
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
T forget_bias_; std::unique_ptr<OpenCLLSTMCellKernel> kernel_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -34,10 +34,6 @@ ...@@ -34,10 +34,6 @@
#include "mace/kernels/gemmlowp_util.h" #include "mace/kernels/gemmlowp_util.h"
#include "mace/kernels/sgemm.h" #include "mace/kernels/sgemm.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -89,7 +85,7 @@ struct MatMulFunctor : OpKernel { ...@@ -89,7 +85,7 @@ struct MatMulFunctor : OpKernel {
const index_t height_b = B->dim(rank - 2); const index_t height_b = B->dim(rank - 2);
const index_t width_b = B->dim(rank - 1); const index_t width_b = B->dim(rank - 1);
auto scratch_buffer = context_->workspace()->GetScratchBuffer(D); auto scratch_buffer = context_->device()->scratch_buffer();
scratch_buffer->Rewind(); scratch_buffer->Rewind();
index_t scratch_size = C->raw_max_size(); index_t scratch_size = C->raw_max_size();
if (!A->is_weight()) { if (!A->is_weight()) {
...@@ -112,7 +108,7 @@ struct MatMulFunctor : OpKernel { ...@@ -112,7 +108,7 @@ struct MatMulFunctor : OpKernel {
A->is_weight(), A->is_weight(),
B->is_weight(), B->is_weight(),
c_ptr_base, c_ptr_base,
scratch_buffer); context_->device()->scratch_buffer());
return MACE_SUCCESS; return MACE_SUCCESS;
} }
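The first MatMul hunk moves scratch allocation from the workspace to the device-owned buffer introduced by this change. Based only on the calls visible here, the usage pattern is: Rewind() resets the slab's offset at the start of an op, and each subsequent Scratch(size) hands back a non-overlapping slice that a Tensor can wrap. A hedged sketch (tmp and tmp_size are caller-supplied placeholders, and the slab is assumed to have been grown to fit beforehand):

ScratchBuffer *scratch = context_->device()->scratch_buffer();
scratch->Rewind();  // restart slice allocation for this invocation
// Wrap a slice in a Tensor, as the quantized conv code above does;
// successive Scratch() calls advance an internal offset, so slices
// stay disjoint until the next Rewind().
std::unique_ptr<Tensor> tmp(
    new Tensor(scratch->Scratch(tmp_size), DT_UINT8));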
...@@ -218,9 +214,21 @@ struct MatMulFunctor<CPU, uint8_t> : OpKernel { ...@@ -218,9 +214,21 @@ struct MatMulFunctor<CPU, uint8_t> : OpKernel {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
class OpenCLMatMulKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *A,
const Tensor *B,
Tensor *C,
bool transpose_a,
bool transpose_b,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLMatMulKernel);
};
template <typename T> template <typename T>
struct MatMulFunctor<DeviceType::GPU, T> : OpKernel { struct MatMulFunctor<DeviceType::GPU, T> : OpKernel {
explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {} explicit MatMulFunctor(OpKernelContext *context);
MaceStatus operator()(const Tensor *A, MaceStatus operator()(const Tensor *A,
const Tensor *B, const Tensor *B,
...@@ -229,9 +237,7 @@ struct MatMulFunctor<DeviceType::GPU, T> : OpKernel { ...@@ -229,9 +237,7 @@ struct MatMulFunctor<DeviceType::GPU, T> : OpKernel {
bool transpose_b, bool transpose_b,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; std::unique_ptr<OpenCLMatMulKernel> kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -13,96 +13,31 @@ ...@@ -13,96 +13,31 @@
// limitations under the License. // limitations under the License.
#include "mace/kernels/activation.h" #include "mace/kernels/activation.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/image/activation.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <typename T>
ActivationFunctor<DeviceType::GPU, T>::ActivationFunctor(
OpKernelContext *context,
ActivationType type,
T relux_max_limit) : OpKernel(context) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(
new opencl::image::ActivationKernel<T>(type, relux_max_limit));
} else {
MACE_NOT_IMPLEMENTED;
}
}
template <typename T> template <typename T>
MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()( MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, const Tensor *input,
const Tensor *alpha, const Tensor *alpha,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
const index_t batch = input->dim(0); return kernel_->Compute(context_, input, alpha, output, future);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
switch (activation_) {
case RELU:
tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX");
break;
case PRELU:
tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU");
break;
case TANH:
tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
kernel_.setArg(idx++, *(alpha->opencl_image()));
}
kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
} }
template struct ActivationFunctor<DeviceType::GPU, float>; template struct ActivationFunctor<DeviceType::GPU, float>;
......
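The deleted activation body also documents how the 3-D global work size is derived on the image path: channels are packed four per image pixel, so the first gws dimension is RoundUpDiv4(channels). A worked example with hypothetical dimensions:

// Hypothetical NHWC input: {batch = 1, height = 32, width = 32, channels = 19}.
const index_t channel_blocks = RoundUpDiv4(19);  // ceil(19 / 4) = 5
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),  // 5
                         32,                                     // width
                         32 * 1};                                // height * batch
// One work-item per 4-channel pixel: 5 * 32 * 32 = 5120 items in total.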
...@@ -13,97 +13,32 @@ ...@@ -13,97 +13,32 @@
// limitations under the License. // limitations under the License.
#include "mace/kernels/addn.h" #include "mace/kernels/addn.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/image/addn.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <typename T>
AddNFunctor<DeviceType::GPU, T>::AddNFunctor(OpKernelContext *context)
: OpKernel(context) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(
new opencl::image::AddNKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
}
template <typename T> template <typename T>
MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()( MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
const std::vector<const Tensor *> &input_tensors, const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future) { StatsFuture *future) {
size_t size = input_tensors.size(); return kernel_->Compute(context_, input_tensors, output_tensor, future);
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
const index_t batch = input_tensors[0]->dim(0);
const index_t height = input_tensors[0]->dim(1);
const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3);
auto runtime = context_->device()->opencl_runtime();
for (size_t i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]);
MACE_CHECK(batch == input_tensors[i]->dim(0));
MACE_CHECK(height == input_tensors[i]->dim(1));
MACE_CHECK(width == input_tensors[i]->dim(2));
MACE_CHECK(channels == input_tensors[i]->dim(3));
}
if (kernel_.get() == nullptr) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
built_options.emplace("-Daddn=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)};
if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
output_tensor->ResizeImage(output_shape, output_image_shape));
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_2D_GWS_ARGS(kernel_);
for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image()));
}
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
input_shape_ = input_tensors[0]->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
} }
template struct AddNFunctor<DeviceType::GPU, float>; template struct AddNFunctor<DeviceType::GPU, float>;
template struct AddNFunctor<DeviceType::GPU, half>; template struct AddNFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
......
...@@ -13,14 +13,26 @@ ...@@ -13,14 +13,26 @@
// limitations under the License. // limitations under the License.
#include "mace/kernels/batch_norm.h" #include "mace/kernels/batch_norm.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/image/batch_norm.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <typename T>
BatchNormFunctor<DeviceType::GPU, T>::BatchNormFunctor(
OpKernelContext *context,
const bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: OpKernel(context) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::BatchNormKernel<T>(
folded_constant, activation, relux_max_limit));
} else {
MACE_NOT_IMPLEMENTED;
}
}
template <typename T> template <typename T>
MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()( MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, const Tensor *input,
...@@ -31,84 +43,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()( ...@@ -31,84 +43,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
const float epsilon, const float epsilon,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
MACE_CHECK(folded_constant_ || (mean != nullptr && var != nullptr)); return kernel_->Compute(context_, input, scale, offset, mean,
var, epsilon, output, future);
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (folded_constant_) {
built_options.emplace("-DFOLDED_CONSTANT");
}
switch (activation_) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++, *(offset->opencl_image()));
if (!folded_constant_) {
kernel_.setArg(idx++, *(mean->opencl_image()));
kernel_.setArg(idx++, *(var->opencl_image()));
kernel_.setArg(idx++, epsilon);
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3), folded_constant_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
} }
template struct BatchNormFunctor<DeviceType::GPU, float>; template struct BatchNormFunctor<DeviceType::GPU, float>;
......
...@@ -16,84 +16,31 @@ ...@@ -16,84 +16,31 @@
#define MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_ #define MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_
#include "mace/kernels/batch_to_space.h" #include "mace/kernels/batch_to_space.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/image/batch_to_space.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <typename T>
BatchToSpaceFunctor<DeviceType::GPU, T>::BatchToSpaceFunctor(
OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: BatchToSpaceFunctorBase(context, paddings, block_shape) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::BatchToSpaceKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
}
template <typename T> template <typename T>
MaceStatus BatchToSpaceFunctor<DeviceType::GPU, T>::operator()( MaceStatus BatchToSpaceFunctor<DeviceType::GPU, T>::operator()(
Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) { const Tensor *batch_tensor, Tensor *space_tensor, StatsFuture *future) {
std::vector<index_t> output_shape(4, 0); std::vector<index_t> output_shape(4, 0);
CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC, CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC,
output_shape.data()); output_shape.data());
return kernel_->Compute(context_, batch_tensor, paddings_, block_shape_,
std::vector<size_t> output_image_shape; output_shape, space_tensor, future);
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
space_tensor->ResizeImage(output_shape, output_image_shape));
const uint32_t chan_blk =
static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
const char *kernel_name = "batch_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(space_shape_, space_tensor->shape())) {
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape_[0]);
kernel_.setArg(idx++, block_shape_[1]);
kernel_.setArg(idx++, paddings_[0]);
kernel_.setArg(idx++, paddings_[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
space_shape_ = space_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
} }
template struct BatchToSpaceFunctor<DeviceType::GPU, float>; template struct BatchToSpaceFunctor<DeviceType::GPU, float>;
......
...@@ -13,13 +13,23 @@ ...@@ -13,13 +13,23 @@
// limitations under the License. // limitations under the License.
#include "mace/kernels/bias_add.h" #include "mace/kernels/bias_add.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/image/bias_add.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <typename T>
BiasAddFunctor<DeviceType::GPU, T>::BiasAddFunctor(
OpKernelContext *context,
const DataFormat data_format)
: BiasAddFunctorBase(context, data_format) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::BiasAddKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
}
template <typename T> template <typename T>
MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
...@@ -27,75 +37,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -27,75 +37,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
StatsFuture *future) { StatsFuture *future) {
MACE_CHECK(input->dim_size() == 4 && data_format_ == NHWC, MACE_CHECK(input->dim_size() == 4 && data_format_ == NHWC,
"gpu only support biasadd for 4-dimensional NHWC format tensor"); "gpu only support biasadd for 4-dimensional NHWC format tensor");
return kernel_->Compute(context_, input, bias, output, future);
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
OUT_OF_RANGE_VALIDATION(kernel_error_);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MACE_SUCCESS;
} }
template struct BiasAddFunctor<DeviceType::GPU, float>; template struct BiasAddFunctor<DeviceType::GPU, float>;
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
#define MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
#include "mace/kernels/buffer_inverse_transform.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
MaceStatus BufferTypeTransform(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future);
template <typename T>
class BufferInverseTransform: public OpenCLBufferInverseTransformKernel {
public:
MaceStatus Compute(OpKernelContext *context,
const Tensor *input,
const BufferType type,
const int wino_blk_size,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
};
template <typename T>
MaceStatus BufferInverseTransform<T>::Compute(OpKernelContext *context,
const Tensor *input,
const BufferType type,
const int wino_blk_size,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(type);
MACE_UNUSED(wino_blk_size);
const DataType dt = DataTypeToEnum<T>::value;
if (input->dtype() != output->dtype()) {
return BufferTypeTransform(context, &kernel_, input, dt, output, future);
} else {
SetFutureDefaultWaitFn(future);
output->ReuseTensorBuffer(*input);
return MaceStatus::MACE_SUCCESS;
}
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/opencl/buffer/buffer_transform.h"
#include <vector>
#include <set>
#include <string>
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
MaceStatus TransformConv2DFilter(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future) {
const index_t out_chan = input->dim(0);
const index_t in_chan = input->dim(1);
const index_t filter_height = input->dim(2);
const index_t filter_width = input->dim(3);
std::vector<index_t> transformed_shape = {
filter_height, filter_width,
RoundUpDiv4(out_chan),
RoundUp<index_t>(in_chan, 4),
4,
};
uint32_t gws[3];
gws[0] = static_cast<uint32_t>(transformed_shape[3]);
gws[1] = static_cast<uint32_t>(transformed_shape[2]);
gws[2] = static_cast<uint32_t>(filter_height * filter_width);
MACE_RETURN_IF_ERROR(output->Resize(transformed_shape));
output->Reshape(input->shape());
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_NON_UNIFORM_WG_CONFIG;
MACE_OUT_OF_RANGE_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_conv_filter");
built_options.emplace("-Dtransform_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
kernel));
}
MACE_OUT_OF_RANGE_INIT(*kernel);
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->UnderlyingBuffer()->size());
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel->setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
kernel->setArg(idx++, *(output->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(out_chan));
kernel->setArg(idx++, static_cast<int32_t>(in_chan));
kernel->setArg(idx++, static_cast<int32_t>(filter_height));
kernel->setArg(idx++, static_cast<int32_t>(filter_width));
kernel->setArg(idx++, static_cast<int32_t>(
in_chan * filter_height * filter_width));
std::string tuning_key =
Concat("transform_conv_filter",
transformed_shape[0],
transformed_shape[1],
transformed_shape[2],
transformed_shape[3]);
std::vector<uint32_t> lws = {4, 4, 4, 0};
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION
// Mark the buffer unused.
const_cast<Tensor *>(input)->MarkUnused();
return MACE_SUCCESS;
}
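TransformConv2DFilter repacks an OIHW filter into a five-dimensional buffer layout whose innermost axis groups four output channels, with the input-channel axis padded to a multiple of four. A worked example with hypothetical dimensions:

// Hypothetical OIHW filter: out_chan = 32, in_chan = 19, h = w = 3.
// transformed_shape = {h, w, RoundUpDiv4(out_chan), RoundUp(in_chan, 4), 4}
//                   = {3, 3, 8, 20, 4}
// That is 3 * 3 * 8 * 20 * 4 = 5760 elements versus 32 * 19 * 3 * 3 = 5472
// originally; the difference is zero padding along the input-channel axis.
// The transform kernel is then launched over
//   gws = {transformed_shape[3], transformed_shape[2], h * w} = {20, 8, 9}.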
MaceStatus TransformDWConv2DFilter(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future) {
const index_t multiplier = input->dim(0);
const index_t in_chan = input->dim(1);
const index_t filter_height = input->dim(2);
const index_t filter_width = input->dim(3);
std::vector<index_t> transformed_shape = {
multiplier, RoundUpDiv4(in_chan),
filter_height, filter_width, 4,
};
uint32_t gws[3];
gws[0] = static_cast<uint32_t>(filter_width);
gws[1] = static_cast<uint32_t>(filter_height);
gws[2] = static_cast<uint32_t>(transformed_shape[1]);
MACE_RETURN_IF_ERROR(output->Resize(transformed_shape));
output->Reshape(input->shape());
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_dw_conv_filter");
built_options.emplace("-Dtransform_dw_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
kernel));
}
MACE_OUT_OF_RANGE_INIT(*kernel);
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->UnderlyingBuffer()->size());
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel->setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
kernel->setArg(idx++, *(output->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(in_chan));
kernel->setArg(idx++, static_cast<int32_t>(filter_height * filter_width));
// Key the depthwise transform separately so tuned work-group sizes are
// not shared with the dense conv-filter transform above.
std::string tuning_key =
    Concat("transform_dw_conv_filter",
transformed_shape[0],
transformed_shape[1],
transformed_shape[2],
transformed_shape[3]);
std::vector<uint32_t> lws = {4, 4, 4, 0};
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION
// Mark the buffer unused.
const_cast<Tensor *>(input)->MarkUnused();
return MACE_SUCCESS;
}
MaceStatus TransformArgument(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future) {
const index_t size = input->dim(0);
std::vector<index_t> transformed_shape = {RoundUp<index_t>(size, 4)};
uint32_t gws = static_cast<uint32_t>(RoundUpDiv4(transformed_shape[0]));
MACE_RETURN_IF_ERROR(output->Resize(transformed_shape));
output->Reshape(input->shape());
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_arg");
built_options.emplace("-Dtransform_arg=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
kernel));
}
MACE_OUT_OF_RANGE_INIT(*kernel);
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->UnderlyingBuffer()->size());
kernel->setArg(idx++, gws);
kernel->setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel->setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
kernel->setArg(idx++, *(output->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(size));
const uint32_t lws =
static_cast<uint32_t>(RoundUpDiv4(runtime->GetDeviceMaxWorkGroupSize()));
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, cl::NDRange(gws),
cl::NDRange(lws), nullptr, &event);
} else {
uint32_t roundup_gws = RoundUp(gws, lws);
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, cl::NDRange(roundup_gws),
cl::NDRange(lws), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
// Mark the buffer unused.
const_cast<Tensor *>(input)->MarkUnused();
return MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
#define MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
#include <vector>
#include "mace/kernels/buffer_transform.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
MaceStatus BufferTypeTransform(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future);
MaceStatus TransformConv2DFilter(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future);
MaceStatus TransformDWConv2DFilter(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future);
MaceStatus TransformArgument(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future);
template <typename T>
class BufferTransform: public OpenCLBufferTransformKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const BufferType type,
const int wino_blk_size,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus BufferTransform<T>::Compute(OpKernelContext *context,
const Tensor *input,
const BufferType type,
const int wino_blk_size,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(type);
MACE_UNUSED(wino_blk_size);
const DataType dt = DataTypeToEnum<T>::value;
switch (type) {
case CONV2D_FILTER:
return TransformConv2DFilter(context, &kernel_, input,
dt, output, future);
case DW_CONV2D_FILTER:
return TransformDWConv2DFilter(context, &kernel_, input,
dt, output, future);
case ARGUMENT:
return TransformArgument(context, &kernel_, input, dt, output, future);
default:
if (input->dtype() != dt) {
return BufferTypeTransform(context, &kernel_, input,
dt, output, future);
} else {
SetFutureDefaultWaitFn(future);
output->ReuseTensorBuffer(*input);
return MaceStatus::MACE_SUCCESS;
}
}
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
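BufferTransform<T>::Compute dispatches on BufferType: conv and depthwise filters and 1-D arguments get dedicated packing kernels, while everything else is either a plain data-type conversion or a zero-copy buffer reuse. A hedged usage sketch; the Tensor constructor form and the caller-provided context, filter, and future are assumptions:

// Sketch: packing a conv filter for the buffer path.
BufferTransform<half> transform;
Tensor packed(context->device()->allocator(), DT_HALF);  // assumed ctor form
MACE_RETURN_IF_ERROR(transform.Compute(context, filter, CONV2D_FILTER,
                                       /*wino_blk_size=*/0, &packed, future));
// For a BufferType without a dedicated packer, Compute falls back to
// BufferTypeTransform when dtypes differ; otherwise the output simply
// reuses the input's buffer and no copy is enqueued.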
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
MaceStatus BufferTypeTransform(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
const uint32_t gws =
static_cast<uint32_t>(RoundUpDiv4(output->size()));
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_data_type");
built_options.emplace("-Dtransform_data_type=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
kernel));
}
MACE_OUT_OF_RANGE_INIT(*kernel);
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
kernel->setArg(idx++, gws);
kernel->setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel->setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
kernel->setArg(idx++, *(output->opencl_buffer()));
const uint32_t lws =
static_cast<uint32_t>(RoundUpDiv4(runtime->GetDeviceMaxWorkGroupSize()));
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, cl::NDRange(gws),
cl::NDRange(lws), nullptr, &event);
} else {
uint32_t roundup_gws = RoundUp(gws, lws);
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, cl::NDRange(roundup_gws),
cl::NDRange(lws), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
// Mark the buffer unused.
const_cast<Tensor *>(input)->MarkUnused();
return MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
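The enqueue branch above is the standard workaround for devices without non-uniform work-group support: pad the global size up to a multiple of the local size and let the kernel's bound check (the out-of-range machinery) idle the overshoot. A worked example with hypothetical sizes:

uint32_t gws = 100;       // e.g. RoundUpDiv4 of a 400-element output
const uint32_t lws = 64;  // e.g. RoundUpDiv4 of a 256 max work-group size
if (!runtime->IsNonUniformWorkgroupsSupported()) {
  gws = RoundUp(gws, lws);  // 128: two full groups; items 100..127 do no work
}
// enqueueNDRangeKernel(..., cl::NDRange(gws), cl::NDRange(lws), ...)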
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_
#define MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_
#include "mace/kernels/conv_2d.h"
#include <functional>
#include <memory>
#include <vector>
#include "mace/kernels/opencl/buffer/utils.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
namespace conv2d {
extern MaceStatus Conv2d1x1(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *padded_input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const bool input_changed,
Tensor *output,
StatsFuture *future);
extern MaceStatus Conv2dGeneral(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const bool input_changed,
Tensor *output,
StatsFuture *future);
} // namespace conv2d
template <typename T>
class Conv2dKernel : public OpenCLConv2dKernel {
public:
Conv2dKernel() : old_scratch_size_(0) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) override;
private:
index_t old_scratch_size_;
cl::Kernel kernels_[2];
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus Conv2dKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) {
StatsFuture pad_future, conv_future;
index_t filter_h = filter->dim(2);
index_t filter_w = filter->dim(3);
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
std::function<MaceStatus(const Tensor *input, Tensor *output)> conv_func;
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
bool use_1x1 = filter_h == 1 && filter_w == 1;
std::vector<index_t> padded_output_shape = output_shape;
  index_t tile_w;
  const index_t tile_c = 4;
  if (use_1x1) {
    tile_w = 2;
  } else {
    tile_w = 4;
  }
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
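  // Width is padded to a multiple of tile_w so each work-item can write a
  // full tile, and channels to a multiple of tile_c so the kernels can use
  // 4-wide vector loads/stores.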
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
// decide scratch size before allocate it
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
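    // MACE_EXTRA_BUFFER_PAD_SIZE leaves guard bytes so vectorized reads near
    // the end of the buffer stay in range.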
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
    if (old_scratch_size_ != scratch->size()) {
      input_changed = true;
      old_scratch_size_ = scratch->size();
    }
padded_input.reset(new Tensor(scratch->Scratch(padded_input_size),
input->dtype()));
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
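  // Worked example (illustrative): a 3x3 stride-1 convolution of a 1x32x32x3
  // input with SAME padding yields paddings = {2, 2}; with tile_w = 4 and
  // tile_c = 4 the padded input allocated above is 1x34x34x4.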
if (use_1x1) {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2d1x1(
context, &kernels_[1], pad_input, filter, bias, strides,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
input_changed, output, &conv_future);
};
} else {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2dGeneral(
context, &kernels_[1], pad_input, filter, bias, strides, dilations,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
input_changed, output, &conv_future);
};
}
MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output));
MergeMultipleFutureWaitFn({pad_future, conv_future}, future);
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
namespace conv2d {
MaceStatus Conv2d1x1(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *padded_input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const bool input_changed,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channel = output->dim(3);
const index_t in_height = padded_input->dim(1);
const index_t in_width = padded_input->dim(2);
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1_buffer",
kernel_name,
built_options, kernel));
}
const uint32_t gws[2] = {static_cast<uint32_t>(
RoundUpDiv4(channel) *
RoundUpDiv<index_t>(width, 2)),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_2D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input->opencl_buffer()));
kernel->setArg(idx++, *(filter->opencl_buffer()));
if (bias != nullptr) {
kernel->setArg(idx++, *(bias->opencl_buffer()));
}
kernel->setArg(idx++, static_cast<int32_t>(in_height));
kernel->setArg(idx++, static_cast<int32_t>(in_width));
kernel->setArg(idx++, static_cast<int32_t>(padded_input->dim(3)));
kernel->setArg(idx++,
static_cast<int32_t>(filter->buffer_shape()[3]));
kernel->setArg(idx++, static_cast<int32_t>(height));
kernel->setArg(idx++, static_cast<int32_t>(width));
kernel->setArg(idx++, static_cast<int32_t>(channel));
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
std::string tuning_key =
Concat("conv2d_1x1_buffer", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
std::vector<uint32_t> lws = {16, 4, 0};
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, gws,
lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace conv2d
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
namespace conv2d {
MaceStatus Conv2dGeneral(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *padded_input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const bool input_changed,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channel = output->dim(3);
const index_t in_height = padded_input->dim(1);
const index_t in_width = padded_input->dim(2);
const index_t in_channel = padded_input->dim(3);
const index_t filter_height = filter->dim(2);
const index_t filter_width = filter->dim(3);
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_buffer",
kernel_name,
built_options, kernel));
}
const uint32_t gws[2] = {static_cast<uint32_t>(
RoundUpDiv4(channel) * RoundUpDiv4(width)),
static_cast<uint32_t>(height * batch)};
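  // 2-D dispatch: dim0 covers channel-blocks (channel / 4) times width-tiles
  // (width / 4); dim1 covers batch * height.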
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
auto filter_buffer_shape = filter->buffer_shape();
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
    MACE_SET_2D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input->opencl_buffer()));
kernel->setArg(idx++, *(filter->opencl_buffer()));
if (bias != nullptr) {
kernel->setArg(idx++, *(bias->opencl_buffer()));
}
kernel->setArg(idx++, static_cast<int32_t>(in_height));
kernel->setArg(idx++, static_cast<int32_t>(in_width));
kernel->setArg(idx++, static_cast<int32_t>(padded_input->dim(3)));
kernel->setArg(idx++, static_cast<int32_t>(filter_height));
kernel->setArg(idx++, static_cast<int32_t>(filter_width));
kernel->setArg(idx++,
static_cast<int32_t>(filter_buffer_shape[3]));
kernel->setArg(idx++, static_cast<int32_t>(
filter_buffer_shape[2] * filter_buffer_shape[3]
* filter_buffer_shape[4]));
kernel->setArg(idx++, static_cast<int32_t>(height));
kernel->setArg(idx++, static_cast<int32_t>(width));
kernel->setArg(idx++, static_cast<int32_t>(channel));
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, static_cast<int32_t>(
dilations[0] * in_width * in_channel));
kernel->setArg(idx++, static_cast<int32_t>(
dilations[1] * in_channel));
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
std::string tuning_key =
Concat("conv2d_general_buffer", output->dim(0), output->dim(1),
output->dim(2), output->dim(3), filter_height, filter_width);
std::vector<uint32_t> lws = {16, 4, 0};
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, gws,
lws, future));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace conv2d
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/opencl/buffer/depthwise_conv2d.h"
#include <set>
#include <string>
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
namespace depthwise {
MaceStatus DepthwiseConv2d(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *padded_input, // NHWC
const Tensor *filter, // HWIM
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const bool input_changed,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channel = output->dim(3);
const index_t in_height = padded_input->dim(1);
const index_t in_width = padded_input->dim(2);
const index_t in_channel = padded_input->dim(3);
const index_t filter_height = filter->dim(2);
const index_t filter_width = filter->dim(3);
auto runtime = context->device()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("depthwise_conv2d_buffer", kernel_name,
built_options, kernel));
}
const uint32_t gws[2] = {
static_cast<uint32_t>(RoundUpDiv4(channel) * RoundUpDiv4(width)),
static_cast<uint32_t>(height * batch)
};
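  // Same 2-D dispatch as the general conv kernel: channel-blocks * width-tiles
  // by batch * height.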
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_2D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input->opencl_buffer()));
kernel->setArg(idx++, *(filter->opencl_buffer()));
if (bias != nullptr) {
kernel->setArg(idx++, *(bias->opencl_buffer()));
}
kernel->setArg(idx++, static_cast<uint32_t>(in_height));
kernel->setArg(idx++, static_cast<uint32_t>(in_width));
kernel->setArg(idx++, static_cast<uint32_t>(in_channel));
kernel->setArg(idx++, static_cast<uint32_t>(filter_height));
kernel->setArg(idx++, static_cast<uint32_t>(filter_width));
kernel->setArg(idx++, static_cast<uint32_t>(filter_height * filter_width));
kernel->setArg(idx++, static_cast<uint32_t>(height));
kernel->setArg(idx++, static_cast<uint32_t>(width));
kernel->setArg(idx++, static_cast<uint32_t>(channel));
kernel->setArg(idx++, static_cast<uint32_t>(strides[0]));
kernel->setArg(idx++, static_cast<uint32_t>(strides[1]));
kernel->setArg(idx++, static_cast<int32_t>(
dilations[0] * in_width * in_channel));
kernel->setArg(idx++, static_cast<int32_t>(
dilations[1] * in_channel));
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
std::vector<uint32_t> lws = {16, 4, 0};
std::string tuning_key =
Concat("depthwise_conv2d_buffer_kernel", in_height, in_width, in_channel,
filter_height, filter_width, channel);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace depthwise
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_
#define MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_
#include "mace/kernels/depthwise_conv2d.h"
#include <functional>
#include <memory>
#include <vector>
#include "mace/kernels/opencl/buffer/utils.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
namespace depthwise {
MaceStatus DepthwiseConv2d(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *padded_input, // NHWC
const Tensor *filter, // HWIM
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const bool input_changed,
Tensor *output,
StatsFuture *future);
} // namespace depthwise
template <typename T>
class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
public:
DepthwiseConv2dKernel() : old_scratch_size_(0) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) override;
private:
index_t old_scratch_size_;
cl::Kernel kernels_[2];
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus DepthwiseConv2dKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) {
StatsFuture pad_future, dw_conv_future;
index_t filter_w = filter->dim(3);
// Create a fake conv_2d filter to calculate the paddings and output size
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
fake_filter_shape[1] = filter->dim(1);
fake_filter_shape[2] = filter->dim(2);
fake_filter_shape[3] = filter->dim(3);
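  // fake_filter_shape[0] = multiplier * in_channels, which equals the
  // depthwise output channel count.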
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported");
MACE_CHECK(filter->dim(0) * input_channels == channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w = 4, tile_c = 4;
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
    if (old_scratch_size_ != scratch->size()) {
      input_changed = true;
      old_scratch_size_ = scratch->size();
    }
padded_input.reset(new Tensor(scratch->Scratch(padded_input_size),
input->dtype()));
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
MACE_RETURN_IF_ERROR(
depthwise::DepthwiseConv2d(
context, &kernels_[1], padded_input_ptr, filter, bias, strides,
dilations, DataTypeToEnum<T>::v(), activation, relux_max_limit,
input_changed, output, &dw_conv_future));
MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, future);
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_POOLING_H_
#define MACE_KERNELS_OPENCL_BUFFER_POOLING_H_
#include "mace/kernels/pooling.h"
#include <functional>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/buffer/utils.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
template <typename T>
class PoolingKernel : public OpenCLPoolingKernel {
public:
PoolingKernel() : old_scratch_size_(0) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
Tensor *output,
StatsFuture *future) override;
private:
index_t old_scratch_size_;
cl::Kernel kernels_[2];
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus PoolingKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
Tensor *output,
StatsFuture *future) {
  MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
      << "The pooling OpenCL kernel does not support dilation yet";
StatsFuture pad_future, pooling_future;
index_t input_channels = input->dim(3);
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
kernels[0], kernels[1]};
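  // Reuse the conv shape inference by treating the pooling window as a filter
  // with out_channels == in_channels.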
std::vector<int> paddings(2);
if (padding_data.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::CEIL,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
auto runtime = context->device()->opencl_runtime();
  // calculate padded input shape
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[3] = RoundUp<index_t>(input_channels, 4);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
    if (old_scratch_size_ != scratch->size()) {
      input_changed = true;
      old_scratch_size_ = scratch->size();
    }
padded_input.reset(new Tensor(scratch->Scratch(padded_input_size),
input->dtype()));
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, 0, 0,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
cl::Kernel *kernel = &kernels_[1];
  MACE_OUT_OF_RANGE_DEFINITION;
if (kernel->get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
if (pooling_type == MAX && input->dtype() == output->dtype()) {
built_options.emplace("-DIN_DATA_TYPE=" +
DtToCLDt(input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
} else {
built_options.emplace("-DIN_DATA_TYPE=" +
DtToCLDt(input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
}
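    // MAX pooling with matching input/output types can compute directly in
    // that type; other cases compute in an up-compatible type (presumably
    // float for half) to preserve precision.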
if (pooling_type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer",
kernel_name,
built_options,
kernel));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(2)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(3)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(3)));
kernel->setArg(idx++, paddings[0] / 2);
kernel->setArg(idx++, paddings[1] / 2);
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, kernels[0]);
kernel->setArg(idx++, kernels[1]);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
const std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, &pooling_future));
  MACE_OUT_OF_RANGE_VALIDATION;
MergeMultipleFutureWaitFn({pad_future, pooling_future}, future);
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_POOLING_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_
#define MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_
#include "mace/kernels/softmax.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
template <typename T>
class SoftmaxKernel : public OpenCLSoftmaxKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *logits,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus SoftmaxKernel<T>::Compute(
OpKernelContext *context,
const Tensor *logits,
Tensor *output,
StatsFuture *future) {
index_t batch = 0;
index_t height = 0;
index_t width = 0;
index_t channels = 0;
if (logits->dim_size() == 2) {
batch = logits->dim(0);
height = 1;
width = 1;
channels = logits->dim(1);
} else if (logits->dim_size() == 4) {
batch = logits->dim(0);
height = logits->dim(1);
width = logits->dim(2);
channels = logits->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
}
const index_t channel_blocks = RoundUpDiv4(channels);
const int remain_channels = channel_blocks * 4 - channels;
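  // remain_channels counts the padding lanes in the last channel block so the
  // kernel can exclude them from the reduction.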
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size());
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(logits->opencl_buffer()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_buffer()));
input_shape_ = logits->shape();
}
std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/opencl/buffer/utils.h"
#include <set>
#include <string>
#include <vector>
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
MaceStatus PadInput(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const int pad_top,
const int pad_left,
const bool input_changed,
Tensor *padded_input,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t in_channel = input->dim(3);
const index_t padded_height = padded_input->dim(1);
const index_t padded_width = padded_input->dim(2);
const index_t padded_channel = padded_input->dim(3);
const uint32_t gws[2] = {
static_cast<uint32_t>(padded_width * RoundUpDiv4(padded_channel)),
static_cast<uint32_t>(padded_height * batch)
};
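  // dim0 walks padded width * padded channel-blocks; dim1 walks padded height
  // * batch, matching the index decomposition in the pad_input kernel below.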
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad_input");
built_options.emplace("-Dpad_input=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input->dtype()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel(
"buffer_transform",
kernel_name,
built_options,
kernel));
}
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, padded_input->size());
    MACE_SET_2D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(input->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(in_height));
kernel->setArg(idx++, static_cast<int32_t>(in_width));
kernel->setArg(idx++, static_cast<int32_t>(in_channel));
kernel->setArg(idx++, static_cast<int32_t>(padded_height));
kernel->setArg(idx++, static_cast<int32_t>(padded_width));
kernel->setArg(idx++, static_cast<int32_t>(padded_channel));
kernel->setArg(idx++, pad_top);
kernel->setArg(idx++, pad_left);
kernel->setArg(idx++, *(padded_input->opencl_buffer()));
}
std::string tuning_key =
Concat("pad_input", batch, in_height, in_width, in_channel,
padded_height, padded_width, padded_channel);
std::vector<uint32_t> lws = {8, 4, 0};
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_UTILS_H_
#define MACE_KERNELS_OPENCL_BUFFER_UTILS_H_
#include "mace/core/future.h"
#include "mace/core/op_kernel_context.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
MaceStatus PadInput(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const int pad_top,
const int pad_left,
const bool input_changed,
Tensor *padded_input,
StatsFuture *future);
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_UTILS_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/buffer_inverse_transform.h"
#include "mace/kernels/opencl/image/image_to_buffer.h"
#include "mace/kernels/opencl/buffer/buffer_inverse_transform.h"
namespace mace {
namespace kernels {
template<typename T>
BufferInverseTransformFunctor<
DeviceType::GPU, T>::BufferInverseTransformFunctor(
OpKernelContext *context,
const int wino_blk_size)
: BufferInverseTransformFunctorBase(context, wino_blk_size) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::ImageToBuffer<T>);
} else {
kernel_.reset(new opencl::buffer::BufferInverseTransform<T>);
}
}
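// UseImageMemory() decides once, at construction, whether the image or the
// buffer implementation backs this functor; callers never branch on the
// memory type themselves.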
template <typename T>
MaceStatus BufferInverseTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
const BufferType type,
Tensor *output,
StatsFuture *future) {
return kernel_->Compute(context_, input, type,
wino_blk_size_, output, future);
}
template struct BufferInverseTransformFunctor<DeviceType::GPU, float>;
template struct BufferInverseTransformFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/buffer_transform.h"
#include "mace/kernels/opencl/image/buffer_to_image.h"
#include "mace/kernels/opencl/buffer/buffer_transform.h"
namespace mace {
namespace kernels {
template<typename T>
BufferTransformFunctor<DeviceType::GPU, T>::BufferTransformFunctor(
OpKernelContext *context,
const int wino_blk_size)
: BufferTransformFunctorBase(context, wino_blk_size) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::BufferToImage<T>);
} else {
kernel_.reset(new opencl::buffer::BufferTransform<T>);
}
}
template <typename T>
MaceStatus BufferTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
const BufferType type,
Tensor *output,
StatsFuture *future) {
return kernel_->Compute(context_, input, type,
wino_blk_size_, output, future);
}
template struct BufferTransformFunctor<DeviceType::GPU, float>;
template struct BufferTransformFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
...@@ -13,73 +13,26 @@
// limitations under the License.

#include "mace/kernels/channel_shuffle.h"
#include "mace/kernels/opencl/image/channel_shuffle.h"

namespace mace {
namespace kernels {

template <typename T>
ChannelShuffleFunctor<DeviceType::GPU, T>::ChannelShuffleFunctor(
    OpKernelContext *context,
    const int groups) : OpKernel(context) {
  if (context->device()->opencl_runtime()->UseImageMemory()) {
    kernel_.reset(new opencl::image::ChannelShuffleKernel<T>(groups));
  } else {
    MACE_NOT_IMPLEMENTED;
  }
}

template <typename T>
MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
    const Tensor *input, Tensor *output, StatsFuture *future) {
  return kernel_->Compute(context_, input, output, future);
}

template struct ChannelShuffleFunctor<DeviceType::GPU, float>;
...
#include <common.h>

__kernel void activation(OUT_OF_RANGE_PARAMS
                         GLOBAL_WORK_GROUP_SIZE_DIM3
                         __read_only image2d_t input,
#ifdef USE_PRELU
...
#include <common.h>

__kernel void addn(OUT_OF_RANGE_PARAMS
                   GLOBAL_WORK_GROUP_SIZE_DIM2
                   __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                   __read_only image2d_t input1,
...
#include <common.h>

// Supported data types: half/float
__kernel void batch_norm(OUT_OF_RANGE_PARAMS
                         GLOBAL_WORK_GROUP_SIZE_DIM3
                         __read_only image2d_t input,
                         __read_only image2d_t scale,
...
#include <common.h>

__kernel void batch_to_space(OUT_OF_RANGE_PARAMS
                             GLOBAL_WORK_GROUP_SIZE_DIM3
                             __read_only image2d_t batch_data,
                             __write_only image2d_t space_data,
...
#include <common.h>

// Supported data types: half/float
__kernel void bias_add(OUT_OF_RANGE_PARAMS
                       GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input,
                       __read_only image2d_t bias,
...
#include <common.h>

__kernel void filter_buffer_to_image(OUT_OF_RANGE_PARAMS
                                     GLOBAL_WORK_GROUP_SIZE_DIM2
                                     __global const DATA_TYPE *input, /* OIHW */
                                     __private const int input_offset,
...@@ -52,7 +52,7 @@ __kernel void filter_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
}

__kernel void filter_image_to_buffer(OUT_OF_RANGE_PARAMS
                                     GLOBAL_WORK_GROUP_SIZE_DIM2
                                     __global DATA_TYPE *output, /* OIHW */
                                     __private const int out_channel,
...@@ -102,7 +102,7 @@ __kernel void filter_image_to_buffer(KERNEL_ERROR_PARAMS
}

// TODO(liuqi): Support multiplier > 1
__kernel void dw_filter_buffer_to_image(OUT_OF_RANGE_PARAMS
                                        GLOBAL_WORK_GROUP_SIZE_DIM2
                                        __global const DATA_TYPE *input, /* MIHW */
                                        __private const int input_offset,
...@@ -154,7 +154,7 @@ __kernel void dw_filter_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
}

__kernel void in_out_buffer_to_image(OUT_OF_RANGE_PARAMS
                                     GLOBAL_WORK_GROUP_SIZE_DIM2
                                     __global const DATA_TYPE *input, /* nhwc */
                                     __private const int input_offset,
...@@ -196,7 +196,7 @@ __kernel void in_out_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
}

__kernel void in_out_image_to_buffer(OUT_OF_RANGE_PARAMS
                                     GLOBAL_WORK_GROUP_SIZE_DIM2
                                     __global DATA_TYPE *output, /* nhwc */
                                     __private const int height,
...@@ -236,7 +236,7 @@ __kernel void in_out_image_to_buffer(KERNEL_ERROR_PARAMS
  }
}

__kernel void arg_buffer_to_image(OUT_OF_RANGE_PARAMS
                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                  __global const DATA_TYPE *input,
                                  __private const int input_offset,
...@@ -272,7 +272,7 @@ __kernel void arg_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
}

__kernel void arg_image_to_buffer(OUT_OF_RANGE_PARAMS
                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                  __global DATA_TYPE *output,
                                  __private const int count,
...@@ -306,7 +306,7 @@ __kernel void arg_image_to_buffer(KERNEL_ERROR_PARAMS
}

__kernel void in_out_height_buffer_to_image(OUT_OF_RANGE_PARAMS
                                            GLOBAL_WORK_GROUP_SIZE_DIM2
                                            __global const DATA_TYPE *input, //nhwc
                                            __private const int input_offset,
...@@ -349,7 +349,7 @@ __kernel void in_out_height_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
}

__kernel void in_out_height_image_to_buffer(OUT_OF_RANGE_PARAMS
                                            GLOBAL_WORK_GROUP_SIZE_DIM2
                                            __global DATA_TYPE *output, //nhwc
                                            __private const int height,
...@@ -387,7 +387,7 @@ __kernel void in_out_height_image_to_buffer(KERNEL_ERROR_PARAMS
  output[offset] = values.w;
}

__kernel void in_out_width_buffer_to_image(OUT_OF_RANGE_PARAMS
                                           GLOBAL_WORK_GROUP_SIZE_DIM2
                                           __global const DATA_TYPE *input, /* nhwc */
                                           __private const int input_offset,
...@@ -430,7 +430,7 @@ __kernel void in_out_width_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
}

__kernel void weight_height_buffer_to_image(OUT_OF_RANGE_PARAMS
                                            GLOBAL_WORK_GROUP_SIZE_DIM2
                                            __global const DATA_TYPE *input, // OIHW
                                            __private const int input_offset,
...@@ -475,7 +475,7 @@ __kernel void weight_height_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
}

__kernel void weight_height_image_to_buffer(OUT_OF_RANGE_PARAMS
                                            GLOBAL_WORK_GROUP_SIZE_DIM2
                                            __global DATA_TYPE *output, //OIHW
                                            __private const int out_channels,
...@@ -517,7 +517,7 @@ __kernel void weight_height_image_to_buffer(KERNEL_ERROR_PARAMS
}

__kernel void weight_width_buffer_to_image(OUT_OF_RANGE_PARAMS
                                           GLOBAL_WORK_GROUP_SIZE_DIM2
                                           __global const DATA_TYPE *input, // OIHW
                                           __private const int input_offset,
...@@ -565,7 +565,7 @@ __kernel void weight_width_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
}

__kernel void weight_width_image_to_buffer(OUT_OF_RANGE_PARAMS
                                           GLOBAL_WORK_GROUP_SIZE_DIM2
                                           __global DATA_TYPE *output, // OIHW
                                           __private const int in_channels,
...@@ -609,7 +609,7 @@ __kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS
}

// only support 3x3 now
__kernel void winograd_filter_buffer_to_image_2x2(OUT_OF_RANGE_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global const DATA_TYPE *input, //Oc, Ic, H, W
                                                  __private const int input_offset,
...@@ -714,7 +714,7 @@ __kernel void winograd_filter_buffer_to_image_2x2(KERNEL_ERROR_PARAMS
}

// only support 3x3 now
__kernel void winograd_filter_image_to_buffer_2x2(OUT_OF_RANGE_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global DATA_TYPE *output, //Oc, Ic, H, W
                                                  __private const int height,
...@@ -757,7 +757,7 @@ __kernel void winograd_filter_image_to_buffer_2x2(KERNEL_ERROR_PARAMS
}

// only support 3x3 now
__kernel void winograd_filter_buffer_to_image_6x6(OUT_OF_RANGE_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global const DATA_TYPE *input, //Oc, Ic, H, W
                                                  __private const int input_offset,
...@@ -891,7 +891,7 @@ PROCESS(7);
#undef PROCESS
}

__kernel void winograd_filter_image_to_buffer_6x6(OUT_OF_RANGE_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global DATA_TYPE *output, //Oc, Ic, H, W
                                                  __private const int height,
...@@ -933,7 +933,7 @@ __kernel void winograd_filter_image_to_buffer_6x6(KERNEL_ERROR_PARAMS
}

// only support 3x3 now
__kernel void winograd_filter_buffer_to_image_4x4(OUT_OF_RANGE_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global const DATA_TYPE *input, //Oc, Ic, H, W
                                                  __private const int input_offset,
...@@ -1040,7 +1040,7 @@ __kernel void winograd_filter_buffer_to_image_4x4(KERNEL_ERROR_PARAMS
#undef PROCESS
}

__kernel void winograd_filter_image_to_buffer_4x4(OUT_OF_RANGE_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global DATA_TYPE *output, //Oc, Ic, H, W
                                                  __private const int height,
...
#include <common.h>
__kernel void pad_input(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global IN_DATA_TYPE *input,
__private const int in_height,
__private const int in_width,
__private const int in_chan,
__private const int padded_height,
__private const int padded_width,
__private const int padded_chan,
__private const int pad_top,
__private const int pad_left,
__global DATA_TYPE *output) {
const int padded_wc_blk_idx = get_global_id(0);
const int padded_hb_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (padded_wc_blk_idx >= global_size_dim0 ||
padded_hb_idx >= global_size_dim1) {
return;
}
#endif
const int padded_chan_blk = (padded_chan + 3) >> 2;
const int padded_width_idx = padded_wc_blk_idx / padded_chan_blk;
const int padded_chan_blk_idx = padded_wc_blk_idx % padded_chan_blk;
const int batch_idx = padded_hb_idx / padded_height;
const int padded_height_idx = padded_hb_idx % padded_height;
const int padded_chan_idx = padded_chan_blk_idx << 2;
const int in_height_idx = padded_height_idx - pad_top;
const int in_width_idx = padded_width_idx - pad_left;
const int padded_offset = mad24(mad24(mad24(batch_idx, padded_height, padded_height_idx),
padded_width, padded_width_idx), padded_chan, padded_chan_idx);
const int in_offset = mad24(mad24(mad24(batch_idx, in_height, in_height_idx),
in_width, in_width_idx), in_chan, padded_chan_idx);
DATA_TYPE4 value = 0;
if (0 <= in_height_idx && in_height_idx < in_height &&
0 <= in_width_idx && in_width_idx < in_width) {
const int remain_chan = in_chan - padded_chan_idx;
if (remain_chan < 4) {
switch (remain_chan) {
case 3:
value.z = CONVERT(input[in_offset + 2]);
case 2:
value.y = CONVERT(input[in_offset + 1]);
case 1:
value.x = CONVERT(input[in_offset]);
}
} else {
value = CONVERT4(vload4(0, input + in_offset));
}
}
vstore4(value, 0, output + padded_offset);
CHECK_OUT_OF_RANGE_FOR_BUFFER(padded_offset + 3);
}
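// Layout note: pad_input reads tightly packed NHWC input and writes NHWC
// output whose channel count is rounded up to a multiple of 4, with zero
// padding on the spatial borders, so downstream kernels can vload4 along
// channels without bounds checks. Illustrative numbers (not from the diff):
// with in_chan = 6 and padded_chan_idx = 4, remain_chan = 2, so the switch
// enters case 2 and falls through to case 1, filling only value.x/value.y;
// the other two lanes stay zero in the padded output.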
// OIHW -> [H, W, (O+3) / 4, I, 4]
__kernel void transform_conv_filter(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__global IN_DATA_TYPE *input, // OIHW
__private const int input_offset,
__global DATA_TYPE *output,
__private const int out_chan,
__private const int in_chan,
__private const int height,
__private const int width,
__private const int inner_size) {
const int in_chan_idx = get_global_id(0);
const int out_chan_blk_idx = get_global_id(1);
const int hw_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (in_chan_idx >= global_size_dim0 ||
out_chan_blk_idx >= global_size_dim1 ||
hw_idx >= global_size_dim2) {
return;
}
#endif
const int t_in_chan = global_size_dim0;
const int out_chan_blk = global_size_dim1;
const int h_idx = hw_idx / width;
const int w_idx = hw_idx % width;
const int out_chan_idx = out_chan_blk_idx << 2;
const int in_offset = mad24(mad24(mad24(out_chan_idx, in_chan, in_chan_idx),
height, h_idx), width, w_idx) + input_offset;
const int out_offset = (mad24(mad24(mad24(h_idx, width, w_idx),
out_chan_blk, out_chan_blk_idx), t_in_chan, in_chan_idx) << 2);
DATA_TYPE4 value = 0;
if (in_chan_idx < in_chan) {
if (out_chan_idx + 3 < out_chan) {
value.x = CONVERT(input[in_offset]);
value.y = CONVERT(input[in_offset + inner_size]);
value.z = CONVERT(input[in_offset + 2 * inner_size]);
value.w = CONVERT(input[in_offset + 3 * inner_size]);
} else {
const int diff = out_chan - out_chan_idx;
switch(diff) {
case 3:
value.z = CONVERT(input[in_offset + 2 * inner_size]);
case 2:
value.y = CONVERT(input[in_offset + inner_size]);
case 1:
value.x = CONVERT(input[in_offset]);
}
}
}
VSTORE4(value, output, out_offset);
}
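// Offset check for the OIHW -> [H, W, (O+3)/4, I, 4] packing above, with
// illustrative numbers (assuming the launch uses t_in_chan == in_chan):
// for a 3x3 filter, out_chan = 10, in_chan = 8 and (h_idx, w_idx,
// out_chan_blk_idx, in_chan_idx) = (1, 2, 2, 5), out_offset =
// (((1*3 + 2)*3 + 2)*8 + 5) * 4 = 564, and diff = 10 - 8 = 2 makes the
// switch fill only the first two output-channel lanes.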
// MIHW -> [M, (I+3) / 4, H, W, 4]
__kernel void transform_dw_conv_filter(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__global IN_DATA_TYPE *input, // MIHW
__private const int input_offset,
__global DATA_TYPE *output,
__private const int in_chan,
__private const int in_hw) {
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int in_chan_blk_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (width_idx >= global_size_dim0 ||
height_idx >= global_size_dim1 ||
in_chan_blk_idx >= global_size_dim2) {
return;
}
#endif
const int width = global_size_dim0;
const int height = global_size_dim1;
const int in_chan_idx = in_chan_blk_idx << 2;
const int in_offset = mad24(in_chan_idx, in_hw,
mad24(height_idx, width, width_idx)) + input_offset;
const int out_offset = mad24(in_chan_blk_idx, in_hw,
mad24(height_idx, width, width_idx)) << 2;
DATA_TYPE4 value = 0;
if (in_chan_idx + 3 < in_chan) {
value.x = CONVERT(input[in_offset]);
value.y = CONVERT(input[in_offset + in_hw]);
value.z = CONVERT(input[in_offset + (in_hw << 1)]);
value.w = CONVERT(input[in_offset + in_hw + (in_hw << 1)]);
} else {
const int diff = in_chan - in_chan_idx;
switch(diff) {
case 3:
value.z = CONVERT(input[in_offset + (in_hw << 1)]);
case 2:
value.y = CONVERT(input[in_offset + in_hw]);
case 1:
value.x = CONVERT(input[in_offset]);
}
}
VSTORE4(value, output, out_offset);
}
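// In other words: depthwise filter element (c, h, w) of MIHW (multiplier
// M = 1 here) moves to channel block c/4, spatial position (h, w), lane
// c%4, which is exactly the in_offset/out_offset arithmetic above with
// in_hw = height * width.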
__kernel void transform_arg(BUFFER_OUT_OF_RANGE_PARAMS
__private const int global_size_dim0,
__global IN_DATA_TYPE *input,
__private const int input_offset,
__global DATA_TYPE *output,
__private int size) {
const int blk_idx = get_global_id(0);
#ifndef NON_UNIFORM_WORK_GROUP
if (blk_idx >= global_size_dim0) {
return;
}
#endif
const int idx = blk_idx << 2;
const int diff = size - idx;
const int in_idx = idx + input_offset;
DATA_TYPE4 value = 0;
if (diff < 4) {
switch (diff) {
case 3:
value.z = CONVERT(input[in_idx + 2]);
case 2:
value.y = CONVERT(input[in_idx + 1]);
case 1:
value.x = CONVERT(input[in_idx]);
}
} else {
value = CONVERT4(vload4(0, input + in_idx));
}
VSTORE4(value, output, idx);
}
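// Each work-item converts one group of four argument values; the switch on
// diff = size - idx handles a trailing, partially filled group, leaving the
// unused lanes zero so VSTORE4 can still write a full vector.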
__kernel void transform_data_type(BUFFER_OUT_OF_RANGE_PARAMS
__private const int global_size_dim0,
__global IN_DATA_TYPE *input,
__private const int input_offset,
__global DATA_TYPE *output) {
const int out_idx = get_global_id(0);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_idx >= global_size_dim0) {
return;
}
#endif
DATA_TYPE4 input_value = CONVERT4(vload4(out_idx, input + input_offset));
vstore4(input_value, out_idx, output);
}
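// vload4/vstore4 take a group index, so work-item i moves elements
// [4*i, 4*i + 3]. Unlike transform_arg there is no tail switch here, which
// presumably relies on the buffers being allocated in whole 4-element
// groups.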
 #include <common.h>

 // assume channels_per_group mod 4 = 0 && groups mod 4 == 0
-__kernel void channel_shuffle(KERNEL_ERROR_PARAMS
+__kernel void channel_shuffle(OUT_OF_RANGE_PARAMS
                               GLOBAL_WORK_GROUP_SIZE_DIM3
                               __read_only image2d_t input,
                               __private const int groups,
......
@@ -24,19 +24,13 @@
 #define CMD_TYPE(cmd, type) CMD_TYPE_STR(cmd, type)
 #define DATA_TYPE4 VEC_DATA_TYPE(DATA_TYPE, 4)
+#define OUT_DATA_TYPE4 VEC_DATA_TYPE(OUT_DATA_TYPE, 4)

-#ifdef OUT_OF_RANGE_CHECK
-#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \
-  check_out_of_range_for_image2d(image, (coord).x, (coord).y, kernel_error);
-#else
-#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord)
-#endif
-
-#define READ_IMAGET(image, sampler, coord) \
-  CMD_TYPE(read_image, CMD_DATA_TYPE)(image, sampler, coord)
-
-#define WRITE_IMAGET(image, coord, value) \
-  CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(image, coord, value);
+#define CONVERT_STR(value, type) convert_##type((value))
+#define CONVERT_TO(value, type) CONVERT_STR(value, type)
+#define CONVERT(value) CONVERT_TO(value, DATA_TYPE)
+#define CONVERT4(value) CONVERT_TO(value, DATA_TYPE4)

 #define GLOBAL_WORK_GROUP_SIZE_DIM2 \
     __private const int global_size_dim0, \
@@ -47,16 +41,37 @@
     __private const int global_size_dim1, \
     __private const int global_size_dim2,

+// oorc for 'Out Of Range Check'
 #ifdef OUT_OF_RANGE_CHECK
-#define KERNEL_ERROR_PARAMS \
-    __global char *kernel_error,
+#define OUT_OF_RANGE_PARAMS \
+    __global int *oorc_flag,
+#define BUFFER_OUT_OF_RANGE_PARAMS \
+    __global int *oorc_flag, \
+    __private const int oorc_output_length,
+#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \
+  check_out_of_range_for_image2d(image, (coord).x, (coord).y, oorc_flag);
+#define CHECK_OUT_OF_RANGE_FOR_BUFFER(idx) \
+  check_out_of_range_for_buffer(oorc_output_length, (idx), oorc_flag);
 #else
-#define KERNEL_ERROR_PARAMS
+#define OUT_OF_RANGE_PARAMS
+#define BUFFER_OUT_OF_RANGE_PARAMS
+#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord)
+#define CHECK_OUT_OF_RANGE_FOR_BUFFER(idx)
 #endif

+#define READ_IMAGET(image, sampler, coord) \
+  CMD_TYPE(read_image, CMD_DATA_TYPE)(image, sampler, coord)
+
+#define WRITE_IMAGET(image, coord, value) \
+  CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \
+  CMD_TYPE(write_image, CMD_DATA_TYPE)(image, coord, value);
+
+#define VSTORE4(data, output, offset) \
+  CHECK_OUT_OF_RANGE_FOR_BUFFER((offset) + 3) \
+  vstore4(data, 0, output + (offset));
+
 __constant sampler_t SAMPLER =
     CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
@@ -66,6 +81,7 @@ inline float4 do_sigmoid(float4 in) {
   return native_recip(1.0f + native_exp(-in));
 }

+#ifdef DATA_TYPE
 inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
 #ifdef USE_PRELU
                                 DATA_TYPE4 prelu_alpha,
@@ -89,17 +105,25 @@ inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
 #endif
   return out;
 }
+#endif

 inline void check_out_of_range_for_image2d(__write_only image2d_t image,
                                            __private const int x,
                                            __private const int y,
-                                           global char *kernel_error) {
-#ifdef OUT_OF_RANGE_CHECK
+                                           __global int *oorc_flag) {
   int2 image_dim = get_image_dim(image);
   if (x >= image_dim.x || y >= image_dim.y) {
-    *kernel_error = 1;
+    *oorc_flag = 1;
   }
-#endif
 }

+inline void check_out_of_range_for_buffer(__private const int length,
+                                          __private const int idx,
+                                          __global int *oorc_flag) {
+  if (idx >= length) {
+    *oorc_flag = idx - length + 1;
+  }
+}
+
 #endif  // MACE_KERNELS_OPENCL_CL_COMMON_H_
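// With OUT_OF_RANGE_CHECK defined, the parameter macros splice the check
// arguments into kernel signatures, e.g.
//   __kernel void foo(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 ...)
// expands to
//   __kernel void foo(__global int *oorc_flag,
//                     __private const int global_size_dim0,
//                     __private const int global_size_dim1, ...).
// BUFFER_OUT_OF_RANGE_PARAMS additionally passes oorc_output_length so that
// check_out_of_range_for_buffer can record how far a write would run past
// the end: the flag stores idx - length + 1, the overflow amount, rather
// than just 1 as in the image2d case.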
@@ -22,7 +22,7 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
 }

 // Supported data type: half/float
-__kernel void concat_channel(KERNEL_ERROR_PARAMS
+__kernel void concat_channel(OUT_OF_RANGE_PARAMS
                              GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t input0,
                              __read_only image2d_t input1,
@@ -84,7 +84,7 @@ __kernel void concat_channel(KERNEL_ERROR_PARAMS
 }

 // Required: All input channels are divisible by 4
-__kernel void concat_channel_multi(KERNEL_ERROR_PARAMS
+__kernel void concat_channel_multi(OUT_OF_RANGE_PARAMS
                                    GLOBAL_WORK_GROUP_SIZE_DIM3
                                    __read_only image2d_t input,
                                    __private const int chan_blk_offset,
......
 #include <common.h>

-__kernel void conv_2d(KERNEL_ERROR_PARAMS
+__kernel void conv_2d(OUT_OF_RANGE_PARAMS
                       GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                       __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
......
 #include <common.h>

-__kernel void conv_2d_1x1(KERNEL_ERROR_PARAMS
+__kernel void conv_2d_1x1(OUT_OF_RANGE_PARAMS
                           GLOBAL_WORK_GROUP_SIZE_DIM3
                           __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
......
#include <common.h>
__kernel void conv2d(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global IN_DATA_TYPE *padded_input,
__global IN_DATA_TYPE *filter,
#ifdef BIAS
__global IN_DATA_TYPE *bias,
#endif
__private const int in_height,
__private const int in_width,
__private const int in_chan,
__private const int filter_in_chan,
__private const int out_height,
__private const int out_width,
__private const int out_chan,
__private const int stride_h,
__private const int stride_w,
__private const float relux_max_limit,
__global OUT_DATA_TYPE *output) {
const int out_wc_blk_idx = get_global_id(0);
const int out_hb_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_wc_blk_idx >= global_size_dim0 ||
out_hb_idx >= global_size_dim1) {
return;
}
#endif
const int out_chan_blk = (out_chan + 3) >> 2;
const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk;
const int out_chan_blk_idx = out_wc_blk_idx % out_chan_blk;
const int batch_idx = out_hb_idx / out_height;
const int out_height_idx = out_hb_idx % out_height;
const int out_width_idx = out_width_blk_idx << 1;
const int out_chan_idx = out_chan_blk_idx << 2;
const int in_height_idx = mul24(out_height_idx, stride_h);
const int in_width_idx = mul24(out_width_idx, stride_w);
const int strided_chan = mul24(in_chan, stride_w);
#ifdef BIAS
DATA_TYPE4 out0 = CONVERT4(vload4(0, bias + out_chan_idx));
DATA_TYPE4 out1 = out0;
#else
DATA_TYPE4 out0 = 0;
DATA_TYPE4 out1 = 0;
#endif
int in_offset = mul24(mad24(mad24(batch_idx, in_height, in_height_idx),
in_width, in_width_idx), in_chan);
int filter_offset = mul24(out_chan_blk_idx, filter_in_chan) << 2;
DATA_TYPE4 in0, in1;
DATA_TYPE4 w0, w1, w2, w3;
for (int in_chan_idx = 0; in_chan_idx < in_chan; in_chan_idx += 4) {
w0 = CONVERT4(vload4(0, filter + filter_offset));
w1 = CONVERT4(vload4(0, filter + filter_offset + 4));
w2 = CONVERT4(vload4(0, filter + filter_offset + 8));
w3 = CONVERT4(vload4(0, filter + filter_offset + 12));
in0 = CONVERT4(vload4(0, padded_input + in_offset));
in1 = CONVERT4(vload4(0, padded_input + in_offset + strided_chan));
out0 = mad((DATA_TYPE4)(in0.x), w0, out0);
out0 = mad((DATA_TYPE4)(in0.y), w1, out0);
out0 = mad((DATA_TYPE4)(in0.z), w2, out0);
out0 = mad((DATA_TYPE4)(in0.w), w3, out0);
out1 = mad((DATA_TYPE4)(in1.x), w0, out1);
out1 = mad((DATA_TYPE4)(in1.y), w1, out1);
out1 = mad((DATA_TYPE4)(in1.z), w2, out1);
out1 = mad((DATA_TYPE4)(in1.w), w3, out1);
filter_offset += 16;
in_offset += 4;
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit);
out1 = do_activation(out1, relux_max_limit);
#endif
int out_offset = mad24(mad24(mad24(batch_idx, out_height, out_height_idx),
out_width, out_width_idx), out_chan, out_chan_idx);
#define WRITE_OUTPUT(i) \
if (out_chan_idx + 4 > out_chan) { \
const int diff = out_chan - out_chan_idx; \
switch(diff) { \
case 3: \
output[out_offset + 2] = CONVERT_TO(out##i.z, OUT_DATA_TYPE); \
case 2: \
output[out_offset + 1] = CONVERT_TO(out##i.y, OUT_DATA_TYPE); \
case 1: \
output[out_offset] = CONVERT_TO(out##i.x, OUT_DATA_TYPE); \
} \
CHECK_OUT_OF_RANGE_FOR_BUFFER(out_offset + diff - 1); \
} else { \
VSTORE4(CONVERT_TO(out##i, OUT_DATA_TYPE4), output, out_offset); \
}
WRITE_OUTPUT(0);
if (out_width_idx + 1 >= out_width) return;
out_offset += out_chan;
WRITE_OUTPUT(1);
#undef WRITE_OUTPUT
}
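// Tiling scheme for this 1x1 (pointwise) kernel: each work-item produces a
// 2 (width) x 4 (output channel) register tile. Each inner-loop iteration
// loads 4 input channels for both pixels (in0, in1) and a 4x4 filter slice
// (w0..w3, packed by transform_conv_filter, which for a 1x1 filter reduces
// to [(O+3)/4, I, 4]), then issues eight 4-lane mads before advancing
// filter_offset by 16 and in_offset by 4.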
 #include <common.h>

-__kernel void conv_2d_3x3(KERNEL_ERROR_PARAMS
+__kernel void conv_2d_3x3(OUT_OF_RANGE_PARAMS
                           GLOBAL_WORK_GROUP_SIZE_DIM3
                           __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */
......
#include <common.h>
__kernel void conv2d(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global IN_DATA_TYPE *padded_input,
__global IN_DATA_TYPE *filter,
#ifdef BIAS
__global IN_DATA_TYPE *bias,
#endif
__private const int in_height,
__private const int in_width,
__private const int in_chan,
__private const int filter_height,
__private const int filter_width,
__private const int filter_in_chan,
__private const int filter_chan_size,
__private const int out_height,
__private const int out_width,
__private const int out_chan,
__private const int stride_h,
__private const int stride_w,
__private const int dilated_h_offset,
__private const int dilated_w_offset,
__private const float relux_max_limit,
__global OUT_DATA_TYPE *output) {
const int out_wc_blk_idx = get_global_id(0);
const int out_hb_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_wc_blk_idx >= global_size_dim0 ||
out_hb_idx >= global_size_dim1) {
return;
}
#endif
const int out_chan_blk = (out_chan + 3) >> 2;
const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk;
const int out_chan_blk_idx = out_wc_blk_idx % out_chan_blk;
const int batch_idx = out_hb_idx / out_height;
const int out_height_idx = out_hb_idx % out_height;
const int out_width_idx = out_width_blk_idx << 2;
const int out_chan_idx = out_chan_blk_idx << 2;
const int in_height_idx = mul24(out_height_idx, stride_h);
const int in_width_idx = mul24(out_width_idx, stride_w);
const int strided_chan = mul24(in_chan, stride_w);
#ifdef BIAS
DATA_TYPE4 out0 = CONVERT4(vload4(0, bias + out_chan_idx));
DATA_TYPE4 out1 = out0;
DATA_TYPE4 out2 = out0;
DATA_TYPE4 out3 = out0;
#else
DATA_TYPE4 out0 = 0;
DATA_TYPE4 out1 = 0;
DATA_TYPE4 out2 = 0;
DATA_TYPE4 out3 = 0;
#endif
const int in_offset_base = mul24(mad24(mad24(batch_idx, in_height, in_height_idx),
in_width, in_width_idx), in_chan);
int filter_offset_base = mul24(out_chan_blk_idx, filter_in_chan) << 2;
DATA_TYPE4 in0, in1, in2, in3;
DATA_TYPE4 w0, w1, w2, w3;
for (int filter_h_idx = 0; filter_h_idx < filter_height; ++filter_h_idx) {
int in_height_offset = mad24(filter_h_idx, dilated_h_offset, in_offset_base);
for (int filter_w_idx = 0; filter_w_idx < filter_width; ++filter_w_idx) {
int filter_offset = filter_offset_base;
int in_offset = mad24(filter_w_idx, dilated_w_offset, in_height_offset);
for (int in_chan_idx = 0; in_chan_idx < in_chan; in_chan_idx += 4) {
w0 = CONVERT4(vload4(0, filter + filter_offset));
w1 = CONVERT4(vload4(0, filter + filter_offset + 4));
w2 = CONVERT4(vload4(0, filter + filter_offset + 8));
w3 = CONVERT4(vload4(0, filter + filter_offset + 12));
in0 = CONVERT4(vload4(0, padded_input + in_offset));
in1 = CONVERT4(vload4(0, padded_input + in_offset + strided_chan));
in2 = CONVERT4(vload4(0, padded_input + in_offset + (strided_chan << 1)));
in3 = CONVERT4(vload4(0, padded_input + in_offset + strided_chan + (strided_chan << 1)));
out0 = mad((DATA_TYPE4)(in0.x), w0, out0);
out0 = mad((DATA_TYPE4)(in0.y), w1, out0);
out0 = mad((DATA_TYPE4)(in0.z), w2, out0);
out0 = mad((DATA_TYPE4)(in0.w), w3, out0);
out1 = mad((DATA_TYPE4)(in1.x), w0, out1);
out1 = mad((DATA_TYPE4)(in1.y), w1, out1);
out1 = mad((DATA_TYPE4)(in1.z), w2, out1);
out1 = mad((DATA_TYPE4)(in1.w), w3, out1);
out2 = mad((DATA_TYPE4)(in2.x), w0, out2);
out2 = mad((DATA_TYPE4)(in2.y), w1, out2);
out2 = mad((DATA_TYPE4)(in2.z), w2, out2);
out2 = mad((DATA_TYPE4)(in2.w), w3, out2);
out3 = mad((DATA_TYPE4)(in3.x), w0, out3);
out3 = mad((DATA_TYPE4)(in3.y), w1, out3);
out3 = mad((DATA_TYPE4)(in3.z), w2, out3);
out3 = mad((DATA_TYPE4)(in3.w), w3, out3);
filter_offset += 16;
in_offset += 4;
}
filter_offset_base += filter_chan_size;
}
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit);
out1 = do_activation(out1, relux_max_limit);
out2 = do_activation(out2, relux_max_limit);
out3 = do_activation(out3, relux_max_limit);
#endif
int out_offset = mad24(mad24(mad24(batch_idx, out_height, out_height_idx),
out_width, out_width_idx), out_chan, out_chan_idx);
#define WRITE_OUTPUT(i) \
if (out_chan_idx + 4 > out_chan) { \
const int diff = out_chan - out_chan_idx; \
switch(diff) { \
case 3: \
output[out_offset + 2] = CONVERT_TO(out##i.z, OUT_DATA_TYPE); \
case 2: \
output[out_offset + 1] = CONVERT_TO(out##i.y, OUT_DATA_TYPE); \
case 1: \
output[out_offset] = CONVERT_TO(out##i.x, OUT_DATA_TYPE); \
} \
CHECK_OUT_OF_RANGE_FOR_BUFFER(out_offset + diff - 1); \
} else { \
VSTORE4(CONVERT_TO(out##i, OUT_DATA_TYPE4), output, out_offset); \
}
WRITE_OUTPUT(0);
if (out_width_idx + 1 >= out_width) return;
out_offset += out_chan;
WRITE_OUTPUT(1);
if (out_width_idx + 2 >= out_width) return;
out_offset += out_chan;
WRITE_OUTPUT(2);
if (out_width_idx + 3 >= out_width) return;
out_offset += out_chan;
WRITE_OUTPUT(3);
#undef WRITE_OUTPUT
}
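// Same scheme as the 1x1 kernel, widened to a 4 (width) x 4 (output
// channel) tile and generalized over the filter window. dilated_h_offset
// and dilated_w_offset are host-precomputed element strides, presumably
// dilation_h * in_width * in_chan and dilation_w * in_chan, so the spatial
// loops need no multiplies; filter_chan_size presumably equals the number
// of filter elements per spatial tap, advancing filter_offset_base one tap
// at a time through the [H, W, (O+3)/4, I, 4] layout.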
 #include <common.h>

-__kernel void crop(KERNEL_ERROR_PARAMS
+__kernel void crop(OUT_OF_RANGE_PARAMS
                    GLOBAL_WORK_GROUP_SIZE_DIM3
                    __read_only image2d_t input,
                    __private const int offset_b,
......
 #include <common.h>

-__kernel void deconv_2d(KERNEL_ERROR_PARAMS
+__kernel void deconv_2d(OUT_OF_RANGE_PARAMS
                         GLOBAL_WORK_GROUP_SIZE_DIM3
                         __read_only image2d_t input,
                         __read_only image2d_t weights,
......
 #include <common.h>

-__kernel void depth_to_space(KERNEL_ERROR_PARAMS
+__kernel void depth_to_space(OUT_OF_RANGE_PARAMS
                              GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t input,
                              __private const int block_size,
......
 #include <common.h>

 // Only multiplier = 1 is supported
-__kernel void depthwise_conv2d(KERNEL_ERROR_PARAMS
+__kernel void depthwise_conv2d(OUT_OF_RANGE_PARAMS
                                GLOBAL_WORK_GROUP_SIZE_DIM3
                                __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
@@ -136,7 +136,7 @@ __kernel void depthwise_conv2d(KERNEL_ERROR_PARAMS
   WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
 }

-__kernel void depthwise_conv2d_s1(KERNEL_ERROR_PARAMS
+__kernel void depthwise_conv2d_s1(OUT_OF_RANGE_PARAMS
                                   GLOBAL_WORK_GROUP_SIZE_DIM3
                                   __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                   __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
......
#include <common.h>
#define BLOCK_SIZE 4
__kernel void depthwise_conv2d(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global IN_DATA_TYPE *padded_input,
__global IN_DATA_TYPE *filter,
#ifdef BIAS
__global IN_DATA_TYPE *bias,
#endif
__private const int in_height,
__private const int in_width,
__private const int in_chan,
__private const int filter_height,
__private const int filter_width,
__private const int filter_hw,
__private const int out_height,
__private const int out_width,
__private const int out_chan,
__private const int stride_h,
__private const int stride_w,
__private const int dilated_h_offset,
__private const int dilated_w_offset,
__private const float relux_max_limit,
__global OUT_DATA_TYPE *output) {
const int out_wc_blk_idx = get_global_id(0);
const int out_hb_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_wc_blk_idx >= global_size_dim0 ||
out_hb_idx >= global_size_dim1) {
return;
}
#endif
const int out_chan_blk = (out_chan + 3) >> 2;
const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk;
const int out_chan_blk_idx = out_wc_blk_idx % out_chan_blk;
const int batch_idx = out_hb_idx / out_height;
const int out_height_idx = out_hb_idx % out_height;
const int out_width_idx = out_width_blk_idx << 2;
const int out_chan_idx = out_chan_blk_idx << 2;
const int in_chan_idx = out_chan_idx;
const int in_height_idx = mul24(out_height_idx, stride_h);
const int in_width_idx = mul24(out_width_idx, stride_w);
const int strided_chan = mul24(in_chan, stride_w);
#ifdef BIAS
DATA_TYPE4 out0 = CONVERT4(vload4(0, bias + out_chan_idx));
DATA_TYPE4 out1 = out0;
DATA_TYPE4 out2 = out0;
DATA_TYPE4 out3 = out0;
#else
DATA_TYPE4 out0 = 0;
DATA_TYPE4 out1 = 0;
DATA_TYPE4 out2 = 0;
DATA_TYPE4 out3 = 0;
#endif
const int in_offset_base = mad24(mad24(mad24(batch_idx, in_height, in_height_idx),
in_width, in_width_idx), in_chan, in_chan_idx);
int filter_offset = mul24(out_chan_blk_idx, filter_hw) << 2;
DATA_TYPE4 in0, in1, in2, in3;
DATA_TYPE4 w;
for (int filter_h_idx = 0; filter_h_idx < filter_height; ++filter_h_idx) {
int in_offset = mad24(filter_h_idx, dilated_h_offset, in_offset_base);
for (int filter_w_idx = 0; filter_w_idx < filter_width; ++filter_w_idx) {
w = CONVERT4(vload4(0, filter + filter_offset));
in0 = CONVERT4(vload4(0, padded_input + in_offset));
in1 = CONVERT4(vload4(0, padded_input + in_offset + strided_chan));
in2 = CONVERT4(vload4(0, padded_input + in_offset + (strided_chan << 1)));
in3 = CONVERT4(vload4(0, padded_input + in_offset + strided_chan + (strided_chan << 1)));
out0 = mad(in0, w, out0);
out1 = mad(in1, w, out1);
out2 = mad(in2, w, out2);
out3 = mad(in3, w, out3);
filter_offset += 4;
in_offset += dilated_w_offset;
}
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit);
out1 = do_activation(out1, relux_max_limit);
out2 = do_activation(out2, relux_max_limit);
out3 = do_activation(out3, relux_max_limit);
#endif
int out_offset = mad24(mad24(mad24(batch_idx, out_height, out_height_idx),
out_width, out_width_idx), out_chan, out_chan_idx);
#define WRITE_OUTPUT(i) \
if (out_chan_idx + 4 > out_chan) { \
const int diff = out_chan - out_chan_idx; \
switch(diff) { \
case 3: \
output[out_offset + 2] = CONVERT_TO(out##i.z, OUT_DATA_TYPE); \
case 2: \
output[out_offset + 1] = CONVERT_TO(out##i.y, OUT_DATA_TYPE); \
case 1: \
output[out_offset] = CONVERT_TO(out##i.x, OUT_DATA_TYPE); \
} \
CHECK_OUT_OF_RANGE_FOR_BUFFER(out_offset + diff - 1); \
} else { \
VSTORE4(CONVERT_TO(out##i, OUT_DATA_TYPE4), output, out_offset); \
}
WRITE_OUTPUT(0);
if (out_width_idx + 1 >= out_width) return;
out_offset += out_chan;
WRITE_OUTPUT(1);
if (out_width_idx + 2 >= out_width) return;
out_offset += out_chan;
WRITE_OUTPUT(2);
if (out_width_idx + 3 >= out_width) return;
out_offset += out_chan;
WRITE_OUTPUT(3);
#undef WRITE_OUTPUT
}
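// Depthwise variant of the tile above: output channels coincide with input
// channels (multiplier 1), so each tap needs a single 4-lane weight vector
// and one element-wise mad per output pixel instead of a 4x4 outer product;
// the filter was packed per channel block by transform_dw_conv_filter,
// hence filter_offset advances by 4 per tap.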
 #include <common.h>

-__kernel void eltwise(KERNEL_ERROR_PARAMS
+__kernel void eltwise(OUT_OF_RANGE_PARAMS
                       GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input0,
 #if INPUT_TYPE == 1
......
 #include <common.h>

 // output = weight * input + bias
-__kernel void fully_connected(KERNEL_ERROR_PARAMS
+__kernel void fully_connected(OUT_OF_RANGE_PARAMS
                               GLOBAL_WORK_GROUP_SIZE_DIM2
                               __read_only image2d_t input,
                               __read_only image2d_t weight,
@@ -64,7 +64,7 @@ __kernel void fully_connected(KERNEL_ERROR_PARAMS
 }

 // output = weight * input + bias
-__kernel void fully_connected_width(KERNEL_ERROR_PARAMS
+__kernel void fully_connected_width(OUT_OF_RANGE_PARAMS
                                     GLOBAL_WORK_GROUP_SIZE_DIM3
                                     __read_only image2d_t input,
                                     __read_only image2d_t weight,
......
 #include <common.h>

-__kernel void lstmcell(KERNEL_ERROR_PARAMS
+__kernel void lstmcell(OUT_OF_RANGE_PARAMS
                        GLOBAL_WORK_GROUP_SIZE_DIM2
                        __read_only image2d_t input,
                        __read_only image2d_t pre_output,
......
 #include <common.h>

 // C = A * B
-__kernel void matmul(KERNEL_ERROR_PARAMS
+__kernel void matmul(OUT_OF_RANGE_PARAMS
                      GLOBAL_WORK_GROUP_SIZE_DIM2
                      __read_only image2d_t A,
                      __read_only image2d_t B,
......
 #include <common.h>

-__kernel void pad(KERNEL_ERROR_PARAMS
+__kernel void pad(OUT_OF_RANGE_PARAMS
                   GLOBAL_WORK_GROUP_SIZE_DIM3
                   __read_only image2d_t input,
                   __write_only image2d_t output,
......
@@ -16,7 +16,7 @@ inline int calculate_avg_block_size(const int pool_size_h,
 }

 // Supported data type: half/float
-__kernel void pooling(KERNEL_ERROR_PARAMS
+__kernel void pooling(OUT_OF_RANGE_PARAMS
                       GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input,
                       __private const int in_height,
......
#include <common.h>
#define MIN_VALUE -FLT_MAX
inline int calculate_avg_block_size(const int pool_size_h,
const int pool_size_w,
const int pos_h,
const int pos_w,
const int h_size,
const int w_size) {
const int h_start = max(0, pos_h);
const int w_start = max(0, pos_w);
const int h_end = min(pos_h + pool_size_h, h_size);
const int w_end = min(pos_w + pool_size_w, w_size);
return mul24((h_end - h_start), (w_end - w_start));
}
// Supported data type: half/float
__kernel void pooling(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__global IN_DATA_TYPE *input,
__private const int in_height,
__private const int in_width,
__private const int in_chan,
__private const int out_height,
__private const int out_chan,
__private const int pad_top,
__private const int pad_left,
__private const int stride_h,
__private const int stride_w,
__private const int kernel_h,
__private const int kernel_w,
__global OUT_DATA_TYPE *output) {
const int out_chan_blk_idx = get_global_id(0);
const int out_width_idx = get_global_id(1);
const int out_hb_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_chan_blk_idx >= global_size_dim0 ||
out_width_idx >= global_size_dim1 ||
out_hb_idx >= global_size_dim2) {
return;
}
#endif
const int out_width = global_size_dim1;
const int in_wc_size = mul24(in_width, in_chan);
const int batch_idx = out_hb_idx / out_height;
const int out_height_idx = out_hb_idx % out_height;
const int chan_idx = out_chan_blk_idx << 2;
const int in_height_start = mul24(out_height_idx, stride_h) - pad_top;
const int in_width_start = mul24(out_width_idx, stride_w) - pad_left;
int in_offset_base = mad24(mad24(mad24(batch_idx, in_height, in_height_start),
in_width, in_width_start), in_chan, chan_idx);
#ifdef POOL_AVG
DATA_TYPE4 res = 0;
for (int height = 0; height < kernel_h; ++height) {
int in_height_idx = in_height_start + height;
if (0 <= in_height_idx && in_height_idx < in_height) {
int in_offset = mad24(height, in_wc_size, in_offset_base);
for (int width = 0; width < kernel_w; ++width) {
int in_width_idx = in_width_start + width;
if (0 <= in_width_idx && in_width_idx < in_width) {
DATA_TYPE4 in = CONVERT4(vload4(0, input + in_offset));
res = res + in;
}
in_offset += in_chan;
}
}
}
const int block_size = calculate_avg_block_size(kernel_h,
kernel_w,
in_height_start,
in_width_start,
in_height,
in_width);
res /= block_size;
#else
DATA_TYPE4 res = (DATA_TYPE4)(MIN_VALUE);
for (int height = 0; height < kernel_h; ++height) {
int in_height_idx = in_height_start + height;
if (0 <= in_height_idx && in_height_idx < in_height) {
int in_offset = mad24(height, in_wc_size, in_offset_base);
for (int width = 0; width < kernel_w; ++width) {
int in_width_idx = in_width_start + width;
if (0 <= in_width_idx && in_width_idx < in_width) {
DATA_TYPE4 in = CONVERT4(vload4(0, input + in_offset));
res = fmax(res, in);
}
in_offset += in_chan;
}
}
}
#endif
const int out_offset = mad24(mad24(mad24(batch_idx, out_height, out_height_idx),
out_width, out_width_idx), out_chan, chan_idx);
int remain_chan = out_chan - chan_idx;
if (remain_chan < 4) {
switch(remain_chan) {
case 3:
output[out_offset + 2] = res.z;
case 2:
output[out_offset + 1] = res.y;
case 1:
output[out_offset] = res.x;
}
CHECK_OUT_OF_RANGE_FOR_BUFFER(out_offset + remain_chan - 1);
} else {
VSTORE4(CONVERT_TO(res, OUT_DATA_TYPE4), output, out_offset);
}
}
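// For average pooling the divisor comes from calculate_avg_block_size,
// which clips the window to the input borders, so padded zeros are excluded
// from the mean: e.g. a 3x3 window with pos_h = -1, pos_w = 0 on a large
// input covers only 2 * 3 = 6 valid elements, and res is divided by 6, not
// 9.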
 #include <common.h>

-__kernel void reduce_mean(KERNEL_ERROR_PARAMS
+__kernel void reduce_mean(OUT_OF_RANGE_PARAMS
                           GLOBAL_WORK_GROUP_SIZE_DIM3
                           __read_only image2d_t input,
                           __local DATA_TYPE4 *group_sum,
......
@@ -10,7 +10,7 @@ inline float coeff_odd(float i) {
   return ((-0.75f * x + 3.75f) * x - 6.0f) * x + 3.0f;
 }

-__kernel void resize_bicubic_nocache(KERNEL_ERROR_PARAMS
+__kernel void resize_bicubic_nocache(OUT_OF_RANGE_PARAMS
                                      GLOBAL_WORK_GROUP_SIZE_DIM3
                                      __read_only image2d_t input,
                                      __write_only image2d_t output,
......
 #include <common.h>

-__kernel void resize_bilinear_nocache(KERNEL_ERROR_PARAMS
+__kernel void resize_bilinear_nocache(OUT_OF_RANGE_PARAMS
                                       GLOBAL_WORK_GROUP_SIZE_DIM3
                                       __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                       __write_only image2d_t output,
......
 #include <common.h>

-__kernel void softmax(KERNEL_ERROR_PARAMS
+__kernel void softmax(OUT_OF_RANGE_PARAMS
                       GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input,
                       __private const int channels,
......
#include <common.h>
__kernel void softmax(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__global IN_DATA_TYPE *input,
__private const int height,
__private const int channels,
__private const int remain_channels,
__global OUT_DATA_TYPE *output) {
const int chan_blk_idx = get_global_id(0);
const int width_idx = get_global_id(1);
const int hb_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
|| hb_idx >= global_size_dim2) {
return;
}
#endif
const int chan_blks = global_size_dim0 - 1;
const int width = global_size_dim1;
const int batch_idx = hb_idx / height;
const int height_idx = hb_idx % height;
const int chan_idx = chan_blk_idx << 2;
const int offset_base = mul24(mad24(mad24(batch_idx, height, height_idx),
width, width_idx), channels);
int in_offset = offset_base;
DATA_TYPE max_value = -FLT_MAX;
DATA_TYPE sum = 0;
DATA_TYPE4 data;
for (short i = 0; i < chan_blks; ++i) {
data = CONVERT4(vload4(0, input + in_offset));
max_value = max(max_value, data.x);
max_value = max(max_value, data.y);
max_value = max(max_value, data.z);
max_value = max(max_value, data.w);
in_offset += 4;
}
switch(remain_channels) {
case 0:
max_value = max(max_value, CONVERT(input[in_offset + 3]));
case 1:
max_value = max(max_value, CONVERT(input[in_offset + 2]));
case 2:
max_value = max(max_value, CONVERT(input[in_offset + 1]));
case 3:
max_value = max(max_value, CONVERT(input[in_offset]));
}
in_offset = offset_base;
for (short i = 0; i < chan_blks; ++i) {
data = CONVERT4(vload4(0, input + in_offset));
data = native_exp(data - max_value);
sum += data.x;
sum += data.y;
sum += data.z;
sum += data.w;
in_offset += 4;
}
switch(remain_channels) {
case 0:
sum += native_exp(CONVERT(input[in_offset + 3]) - max_value);
case 1:
sum += native_exp(CONVERT(input[in_offset + 2]) - max_value);
case 2:
sum += native_exp(CONVERT(input[in_offset + 1]) - max_value);
case 3:
sum += native_exp(CONVERT(input[in_offset]) - max_value);
}
int remain_chan = channels - chan_idx;
int offset = offset_base + chan_idx;
if (remain_chan < 4) {
switch(remain_chan) {
case 3:
output[offset + 2] = native_exp(CONVERT(input[offset + 2]) - max_value) / sum;
case 2:
output[offset + 1] = native_exp(CONVERT(input[offset + 1]) - max_value) / sum;
case 1:
output[offset] = native_exp(CONVERT(input[offset]) - max_value) / sum;
}
} else {
data = CONVERT4(vload4(0, input + offset));
data = native_exp(data - max_value) / sum;
VSTORE4(CONVERT_TO(data, OUT_DATA_TYPE4), output, offset);
}
}
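// remain_channels counts the unused lanes of the last 4-channel block,
// i.e. (4 - channels % 4) % 4: the loops cover chan_blks =
// global_size_dim0 - 1 full blocks, and the switch falls through so that
// remain_channels = 0 reads all four tail elements while remain_channels =
// 3 reads only input[in_offset]. Note that every work-item recomputes
// max_value and sum over the full channel vector; only the final
// normalization is parallelized across channel blocks.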
 #include <common.h>

-__kernel void space_to_batch(KERNEL_ERROR_PARAMS
+__kernel void space_to_batch(OUT_OF_RANGE_PARAMS
                              GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t space_data,
                              __write_only image2d_t batch_data,
......
 #include <common.h>

-__kernel void space_to_depth(KERNEL_ERROR_PARAMS
+__kernel void space_to_depth(OUT_OF_RANGE_PARAMS
                              GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t input,
                              __private const int block_size,
......
 #include <common.h>

-__kernel void split(KERNEL_ERROR_PARAMS
+__kernel void split(OUT_OF_RANGE_PARAMS
                     GLOBAL_WORK_GROUP_SIZE_DIM3
                     __read_only image2d_t input,
                     __private const int chan_blk_offset,
......
 #include <common.h>

-__kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS
+__kernel void winograd_transform_2x2(OUT_OF_RANGE_PARAMS
                                      GLOBAL_WORK_GROUP_SIZE_DIM2
                                      __read_only image2d_t input,
                                      __write_only image2d_t output,
@@ -118,7 +118,7 @@ __kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS
   }
 }

-__kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS
+__kernel void winograd_inverse_transform_2x2(OUT_OF_RANGE_PARAMS
                                              GLOBAL_WORK_GROUP_SIZE_DIM2
                                              __read_only image2d_t input,
 #ifdef BIAS
@@ -231,7 +231,7 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS
 }

-__kernel void winograd_transform_4x4(KERNEL_ERROR_PARAMS
+__kernel void winograd_transform_4x4(OUT_OF_RANGE_PARAMS
                                      GLOBAL_WORK_GROUP_SIZE_DIM2
                                      __read_only image2d_t input,
                                      __write_only image2d_t output,
@@ -390,7 +390,7 @@ __kernel void winograd_transform_4x4(KERNEL_ERROR_PARAMS
   }
 }

-__kernel void winograd_inverse_transform_4x4(KERNEL_ERROR_PARAMS
+__kernel void winograd_inverse_transform_4x4(OUT_OF_RANGE_PARAMS
                                              GLOBAL_WORK_GROUP_SIZE_DIM2
                                              __read_only image2d_t input,
 #ifdef BIAS
......
@@ -13,191 +13,21 @@
 // limitations under the License.

 #include "mace/kernels/concat.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/concat.h"

 namespace mace {
 namespace kernels {

-namespace {
-std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
-                              const uint32_t *gws,
-                              const uint32_t kwg_size) {
-  std::vector<uint32_t> lws(4, 0);
-  if (kwg_size == 0) {
-    lws[0] = lws[1] = lws[2] = 1;
-  } else {
-    uint64_t
-        cache_size = runtime->device_global_mem_cache_size();
-    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-    lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
-    const uint32_t lws_size = lws[0] * lws[1];
-    lws[2] =
-        std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
-  }
-  return lws;
-}
-}  // namespace
-
-static MaceStatus Concat2(OpKernelContext *context,
-                          cl::Kernel *kernel,
-                          const Tensor *input0,
-                          const Tensor *input1,
-                          const DataType dt,
-                          std::vector<index_t> *prev_input_shape,
-                          Tensor *output,
-                          StatsFuture *future,
-                          uint32_t *kwg_size,
-                          std::unique_ptr<BufferBase> *kernel_error) {
-  const index_t batch = output->dim(0);
-  const index_t height = output->dim(1);
-  const index_t width = output->dim(2);
-  const index_t channel = output->dim(3);
-
-  const int channel_blk = RoundUpDiv4(channel);
-  const uint32_t gws[3] = {
-      static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(width),
-      static_cast<uint32_t>(batch * height),
-  };
-
-  auto runtime = context->device()->opencl_runtime();
-
-  if (kernel->get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error, context);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
-    built_options.emplace("-Dconcat_channel=" + kernel_name);
-    if (input0->dtype() == output->dtype()) {
-      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
-      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
-    } else {
-      built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-      built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    }
-    if (input0->dim(3) % 4 == 0) {
-      built_options.emplace("-DDIVISIBLE_FOUR");
-    }
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
-                                              built_options, kernel));
-
-    *kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-  }
-
-  if (!IsVecEqual(*prev_input_shape, input0->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG_PTR;
-    SET_3D_GWS_ARGS_PTR(kernel, gws);
-    kernel->setArg(idx++,
-                   *(static_cast<const cl::Image2D *>(input0->opencl_image())));
-    kernel->setArg(idx++,
-                   *(static_cast<const cl::Image2D *>(input1->opencl_image())));
-    kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
-    kernel->setArg(idx++,
-                   *(static_cast<cl::Image2D *>(output->opencl_image())));
-
-    *prev_input_shape = input0->shape();
-  }
-
-  const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
-  std::string tuning_key =
-      Concat("concat_opencl_kernel", output->dim(0), output->dim(1),
-             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
-                                           gws, lws, future));
-
-  OUT_OF_RANGE_VALIDATION(*kernel_error);
-  return MACE_SUCCESS;
-}
-
-static MaceStatus ConcatN(OpKernelContext *context,
-                          cl::Kernel *kernel,
-                          const std::vector<const Tensor *> &input_list,
-                          const DataType dt,
-                          Tensor *output,
-                          StatsFuture *future,
-                          uint32_t *kwg_size,
-                          std::unique_ptr<BufferBase> *kernel_error) {
-  const index_t batch = output->dim(0);
-  const index_t height = output->dim(1);
-  const index_t width = output->dim(2);
-
-  auto runtime = context->device()->opencl_runtime();
-
-  if (kernel->get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error, context);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
-    built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
-                                              built_options, kernel));
-
-    *kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-  }
-
-  const int inputs_count = input_list.size();
-  index_t chan_blk_offset = 0;
-  cl::Event event;
-  CallStats call_stats{INT64_MAX, 0};
-  for (int i = 0; i < inputs_count; ++i) {
-    const Tensor *input = input_list[i];
-    index_t input_channel_blk = input->dim(3) / 4;
-    const uint32_t gws[3] = {
-        static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
-        static_cast<uint32_t>(batch * height),
-    };
-    const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
-
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG_PTR;
-    SET_3D_GWS_ARGS_PTR(kernel, gws);
-    kernel->setArg(idx++, *(input->opencl_image()));
-    kernel->setArg(idx++, static_cast<int32_t>(chan_blk_offset));
-    kernel->setArg(idx++, *(output->opencl_image()));
-
-    chan_blk_offset += input_channel_blk;
-    cl_int error;
-    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          *kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
-    } else {
-      std::vector<uint32_t> roundup_gws(lws.size());
-      for (size_t j = 0; j < 3; ++j) {
-        roundup_gws[j] = RoundUp(gws[j], lws[j]);
-      }
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          *kernel, cl::NullRange,
-          cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
-          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
-    }
-    MACE_CL_RET_STATUS(error);
-    OUT_OF_RANGE_VALIDATION(*kernel_error);
-    if (future != nullptr && runtime->is_profiling_enabled()) {
-      event.wait();
-      CallStats tmp_stats;
-      runtime->GetCallStats(event, &tmp_stats);
-      call_stats.start_micros =
-          std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
-      call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
-    }
-  }
-  if (future != nullptr) {
-    future->wait_fn = [call_stats](CallStats *stats) {
-      if (stats != nullptr) {
-        stats->start_micros = call_stats.start_micros;
-        stats->end_micros = stats->start_micros + call_stats.end_micros;
-      }
-    };
-  }
-
-  return MACE_SUCCESS;
-}
+template <typename T>
+ConcatFunctor<DeviceType::GPU, T>::ConcatFunctor(
+    OpKernelContext *context,
+    const int32_t axis)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::ConcatKernel<T>(axis));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}

 template <typename T>
@@ -205,52 +35,7 @@ MaceStatus ConcatFunctor<DeviceType::GPU, T>::operator()(
     const std::vector<const Tensor *> &input_list,
     Tensor *output,
     StatsFuture *future) {
-  const int inputs_count = input_list.size();
-  MACE_CHECK(inputs_count >= 2 && axis_ == 3)
-      << "Concat opencl kernel only support >=2 elements with axis == 3";
-
-  const Tensor *input0 = input_list[0];
-  bool divisible_four = input0->dim(axis_) % 4 == 0;
-
-  std::vector<index_t> output_shape(input0->shape());
-  for (int i = 1; i < inputs_count; ++i) {
-    const Tensor *input = input_list[i];
-    MACE_CHECK(input->dim_size() == input0->dim_size(),
-               "Ranks of all input tensors must be same.");
-    divisible_four &= input->dim(axis_) % 4 == 0;
-    for (int j = 0; j < input->dim_size(); ++j) {
-      if (j == axis_) {
-        continue;
-      }
-      MACE_CHECK(input->dim(j) == input0->dim(j),
-                 "Dimensions of inputs should equal except axis.");
-    }
-    output_shape[axis_] += input->dim(axis_);
-  }
-  MACE_CHECK(
-      inputs_count == 2 || divisible_four,
-      "Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
-  std::vector<size_t> image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
-
-  switch (inputs_count) {
-    case 2:
-      return Concat2(context_,
-                     &kernel_, input_list[0], input_list[1],
-                     DataTypeToEnum<T>::value, &input_shape_, output, future,
-                     &kwg_size_, &kernel_error_);
-    default:
-      if (divisible_four) {
-        return ConcatN(context_,
-                       &kernel_, input_list, DataTypeToEnum<T>::value, output,
-                       future, &kwg_size_, &kernel_error_);
-      } else {
-        MACE_NOT_IMPLEMENTED;
-      }
-  }
-
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input_list, output, future);
 }

 template struct ConcatFunctor<DeviceType::GPU, float>;
......
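The refactor visible in this file replaces the inline Concat2/ConcatN
dispatch with a kernel object selected once at construction time. A minimal
sketch of the pattern, assuming an interface named OpenCLConcatKernel with a
Compute method (names inferred from the call sites above, not taken from the
actual headers):

    // Hypothetical interface; the real declaration lives in
    // mace/kernels/opencl/image/concat.h and is not shown in this diff.
    template <typename T>
    class OpenCLConcatKernel {
     public:
      virtual ~OpenCLConcatKernel() = default;
      virtual MaceStatus Compute(OpKernelContext *context,
                                 const std::vector<const Tensor *> &inputs,
                                 Tensor *output,
                                 StatsFuture *future) = 0;
    };

With this shape, the functor's constructor picks an image (or, eventually,
buffer) implementation based on the runtime's memory type, and operator()
simply forwards; the conv_2d.cc and crop.cc diffs below apply the same
pattern.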
@@ -13,61 +13,37 @@
 // limitations under the License.

 #include "mace/kernels/conv_2d.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/kernels/opencl/image/conv_2d.h"
+#include "mace/kernels/opencl/buffer/conv_2d.h"

 namespace mace {
 namespace kernels {

-extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *runtime,
-                                   cl::Kernel *kernel,
-                                   const Tensor *input,
-                                   const Tensor *filter,
-                                   const Tensor *bias,
-                                   const int stride,
-                                   const int *padding,
-                                   const int *dilations,
-                                   const ActivationType activation,
-                                   const float relux_max_limit,
-                                   const DataType dt,
-                                   std::vector<index_t> *prev_input_shape,
-                                   Tensor *output,
-                                   StatsFuture *future,
-                                   uint32_t *kwg_size,
-                                   std::unique_ptr<BufferBase> *kernel_error);
-
-extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *runtime,
-                                   cl::Kernel *kernel,
-                                   const Tensor *input,
-                                   const Tensor *filter,
-                                   const Tensor *bias,
-                                   const int stride,
-                                   const int *padding,
-                                   const int *dilations,
-                                   const ActivationType activation,
-                                   const float relux_max_limit,
-                                   const DataType dt,
-                                   std::vector<index_t> *prev_input_shape,
-                                   Tensor *output,
-                                   StatsFuture *future,
-                                   uint32_t *kwg_size,
-                                   std::unique_ptr<BufferBase> *kernel_error);
-
-extern MaceStatus Conv2dOpencl(OpKernelContext *runtime,
-                               cl::Kernel *kernel,
-                               const Tensor *input,
-                               const Tensor *filter,
-                               const Tensor *bias,
-                               const int stride,
-                               const int *padding,
-                               const int *dilations,
-                               const ActivationType activation,
-                               const float relux_max_limit,
-                               const DataType dt,
-                               std::vector<index_t> *prev_input_shape,
-                               Tensor *output,
-                               StatsFuture *future,
-                               uint32_t *kwg_size,
-                               std::unique_ptr<BufferBase> *kernel_error);
+template<typename T>
+Conv2dFunctor<DeviceType::GPU, T>::Conv2dFunctor(
+    OpKernelContext *context,
+    const int *strides,
+    const Padding &padding_type,
+    const std::vector<int> &paddings,
+    const int *dilations,
+    const ActivationType activation,
+    const float relux_max_limit,
+    const bool is_filter_transformed)
+    : Conv2dFunctorBase(context,
+                        strides,
+                        padding_type,
+                        paddings,
+                        dilations,
+                        activation,
+                        relux_max_limit) {
+  MACE_UNUSED(is_filter_transformed);
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::Conv2dKernel<T>);
+  } else {
+    kernel_.reset(new opencl::buffer::Conv2dKernel<T>);
+  }
+}

 template <typename T>
 MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
@@ -75,61 +51,11 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                          const Tensor *bias,
                                                          Tensor *output,
                                                          StatsFuture *future) {
-  typedef MaceStatus (*Conv2dOpenclFunction)(
-      OpKernelContext *runtime, cl::Kernel * kernel, const Tensor *input,
-      const Tensor *filter, const Tensor *bias, const int stride,
-      const int *padding, const int *dilations,
-      const ActivationType activation,
-      const float relux_max_limit, const DataType dt,
-      std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
-      uint32_t *kwg_size, std::unique_ptr<BufferBase> *kernel_error);
-  // Selection matrix: kernel_size x stride_size
-  static const Conv2dOpenclFunction selector[3] = {
-      Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3};
-
-  index_t kernel_h = filter->dim(2);
-  index_t kernel_w = filter->dim(3);
-  if (strides_[0] != strides_[1] ||
-      (dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) {
-    LOG(WARNING) << "OpenCL conv2d kernel with "
-                 << "filter" << kernel_h << "x" << kernel_w << ","
-                 << " stride " << strides_[0] << "x" << strides_[1]
-                 << ",dilations " << dilations_[0] << "x" << dilations_[1]
-                 << " is not implemented yet.";
-    MACE_NOT_IMPLEMENTED;
-  }
-
-  std::vector<index_t> output_shape(4);
-  std::vector<int> paddings(2);
-  if (paddings_.empty()) {
-    kernels::CalcNHWCPaddingAndOutputSize(
-        input->shape().data(), filter->shape().data(), dilations_, strides_,
-        padding_type_, output_shape.data(), paddings.data());
-  } else {
-    paddings = paddings_;
-    CalcOutputSize(input->shape().data(), filter->shape().data(),
-                   paddings_.data(), dilations_, strides_, RoundType::FLOOR,
-                   output_shape.data());
-  }
-
-  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                  &output_image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
-
-  if (kernel_h == kernel_w && kernel_h <= 3 &&
-      selector[kernel_h - 1] != nullptr) {
-    auto conv2d_func = selector[kernel_h - 1];
-    return conv2d_func(context_,
-        &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
-        activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
-        output, future, &kwg_size_, &kernel_error_);
-  } else {
-    return Conv2dOpencl(context_,
-        &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
-        activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
-        output, future, &kwg_size_, &kernel_error_);
-  }
+  // Compute
+  return kernel_->Compute(context_, input, filter, bias,
+                          strides_, padding_type_, paddings_,
+                          dilations_, activation_, relux_max_limit_,
+                          output, future);
 }

 template struct Conv2dFunctor<DeviceType::GPU, float>;
...
@@ -13,170 +13,29 @@
 // limitations under the License.
 #include "mace/kernels/crop.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/crop.h"
 namespace mace {
 namespace kernels {
-namespace {
-std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
-                              const uint32_t *gws,
-                              const uint32_t kwg_size) {
-  std::vector<uint32_t> lws(4, 0);
-  if (kwg_size == 0) {
-    lws[0] = lws[1] = lws[2] = 1;
-  } else {
-    uint64_t
-        cache_size = runtime->device_global_mem_cache_size();
-    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-    lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
-    const uint32_t lws_size = lws[0] * lws[1];
-    lws[2] =
-        std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
-  }
-  return lws;
-}
-}  // namespace
+template <typename T>
+CropFunctor<DeviceType::GPU, T>::CropFunctor(OpKernelContext *context,
+                                             const int axis,
+                                             const std::vector<int> &offset)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::CropKernel<T>(axis, offset));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
     const std::vector<const Tensor *> &input_list,
     Tensor *output,
     StatsFuture *future) {
-  MACE_UNUSED(future);
-  const int32_t inputs_count = static_cast<int32_t>(input_list.size());
-  MACE_CHECK(inputs_count >= 2)
-      << "Crop opencl kernel only support 2 elements input";
-  const Tensor *input0 = input_list[0];
-  const Tensor *input1 = input_list[1];
-  const uint32_t in0_dims = static_cast<uint32_t >(input0->dim_size());
-  const uint32_t in1_dims = static_cast<uint32_t >(input0->dim_size());
-  MACE_CHECK(in0_dims == 4 && in1_dims == 4,
-             "Crop op only supports 4-dims inputs now.");
-  std::vector<int32_t> offsets(4, 0);
-  std::vector<index_t> output_shape(input0->shape());
-  switch (axis_) {
-    case 0:
-      if (offset_.size() == 1) {
-        offsets[0] = offset_[0];
-        offsets[1] = offset_[0];
-        offsets[2] = offset_[0];
-        offsets[3] = offset_[0];
-      } else if (offset_.size() == 4) {
-        offsets[0] = offset_[0];
-        offsets[1] = offset_[2];
-        offsets[2] = offset_[3];
-        offsets[3] = offset_[1];
-      }
-      for (int i = 0; i < 4; ++i) {
-        output_shape[i] = input1->dim(i);
-      }
-      break;
-    case 1:
-      if (offset_.size() == 1) {
-        offsets[1] = offset_[0];
-        offsets[2] = offset_[0];
-        offsets[3] = offset_[0];
-      } else if (offset_.size() == 3) {
-        offsets[1] = offset_[1];
-        offsets[2] = offset_[2];
-        offsets[3] = offset_[0];
-      }
-      for (int i = 1; i < 4; ++i) {
-        output_shape[i] = input1->dim(i);
-      }
-      break;
-    case 2:
-      if (offset_.size() == 1) {
-        offsets[1] = offset_[0];
-        offsets[2] = offset_[0];
-      } else if (offset_.size() == 2) {
-        offsets[1] = offset_[0];
-        offsets[2] = offset_[1];
-      }
-      output_shape[1] = input1->dim(1);
-      output_shape[2] = input1->dim(2);
-      break;
-    case 3:
-      if (offset_.size() == 1) {
-        offsets[2] = offset_[0];
-      }
-      output_shape[2] = input1->dim(2);
-      break;
-    default:
-      MACE_CHECK(axis_ >= 0 && axis_ < 4, "axis is out of boundary.");
-      break;
-  }
-  MACE_CHECK(offsets[3] % 4 == 0,
-             "MACE opencl only supports cropping channel offset divisible by 4.");
-  for (index_t i = 0; i < 4; ++i) {
-    MACE_CHECK(input0->dim(i) - offsets[i] >= input1->dim(i))
-        << "the crop for dimension" << i << "is out of bound with size"
-        << input1->dim(i) << "and offset" << offsets[i];
-  }
-  std::vector<size_t> image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
-  const index_t offset_chan_blk = RoundUpDiv4(offsets[3]);
-  const index_t channel_blk = RoundUpDiv4(output->dim(3));
-  const uint32_t gws[3] = {
-      static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(output->dim(2)),
-      static_cast<uint32_t>(output->dim(0) * output->dim(1))
-  };
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop");
-    built_options.emplace("-Dcrop=" + kernel_name);
-    auto dt = DataTypeToEnum<T>::value;
-    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name,
-                                              built_options, &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  if (!IsVecEqual(input_shape_, input0->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input0->opencl_image()));
-    kernel_.setArg(idx++, static_cast<int>(offsets[0]));
-    kernel_.setArg(idx++, static_cast<int>(offsets[1]));
-    kernel_.setArg(idx++, static_cast<int>(offsets[2]));
-    kernel_.setArg(idx++, static_cast<int>(offset_chan_blk));
-    kernel_.setArg(idx++, static_cast<int>(input0->dim(1)));
-    kernel_.setArg(idx++, static_cast<int>(input0->dim(2)));
-    kernel_.setArg(idx++, static_cast<int>(output->dim(1)));
-    kernel_.setArg(idx++, static_cast<int>(output->dim(2)));
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    input_shape_ = input0->shape();
-  }
-  const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key =
-      Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
-             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input_list, output, future);
 }
 template struct CropFunctor<DeviceType::GPU, float>;
...
@@ -13,140 +13,34 @@
 // limitations under the License.
 #include "mace/kernels/deconv_2d.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/kernels/opencl/image/deconv_2d.h"
 namespace mace {
 namespace kernels {
-namespace {
-MaceStatus Deconv2dOpencl(OpKernelContext *context,
-                          cl::Kernel *kernel,
-                          const Tensor *input,
-                          const Tensor *filter,
-                          const Tensor *bias,
-                          const int *strides,
-                          const int *paddings,
-                          const ActivationType activation,
-                          const float relux_max_limit,
-                          const DataType dt,
-                          std::vector<index_t> *prev_input_shape,
-                          Tensor *output,
-                          StatsFuture *future,
-                          uint32_t *kwg_size,
-                          std::unique_ptr<BufferBase> *kernel_error) {
-  const index_t batch = output->dim(0);
-  const index_t height = output->dim(1);
-  const index_t width = output->dim(2);
-  const index_t channels = output->dim(3);
-  const index_t input_channels = input->dim(3);
-  const index_t channel_blocks = RoundUpDiv4(channels);
-  const index_t input_channel_blocks = RoundUpDiv4(input_channels);
-  const int stride_h = strides[0];
-  const int stride_w = strides[1];
-  MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0.");
-#define MACE_WIDTH_BLK 5
-  const index_t n_strides = (width + stride_w - 1) / stride_w;
-  const index_t width_blocks =
-      ((n_strides + MACE_WIDTH_BLK - 1) / MACE_WIDTH_BLK) * stride_w;
-  const float stride_h_r = 1.f / static_cast<float>(stride_h);
-  const float stride_w_r = 1.f / static_cast<float>(stride_w);
-  const int padding_h = (paddings[0] + 1) >> 1;
-  const int padding_w = (paddings[1] + 1) >> 1;
-  const int align_h = stride_h - 1 - padding_h;
-  const int align_w = stride_w - 1 - padding_w;
-  const int kernel_size = filter->dim(2) * filter->dim(3);
-  auto runtime = context->device()->opencl_runtime();
-  if (kernel->get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error, context);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d");
-    built_options.emplace("-Ddeconv_2d=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
-    switch (activation) {
-      case NOOP:
-        break;
-      case RELU:
-        built_options.emplace("-DUSE_RELU");
-        break;
-      case RELUX:
-        built_options.emplace("-DUSE_RELUX");
-        break;
-      case TANH:
-        built_options.emplace("-DUSE_TANH");
-        break;
-      case SIGMOID:
-        built_options.emplace("-DUSE_SIGMOID");
-        break;
-      default:
-        LOG(FATAL) << "Unknown activation type: " << activation;
-    }
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name,
-                                              built_options, kernel));
-    *kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-  }
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width_blocks),
-                           static_cast<uint32_t>(height * batch)};
-  if (!IsVecEqual(*prev_input_shape, input->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG_PTR;
-    SET_3D_GWS_ARGS_PTR(kernel, gws);
-    kernel->setArg(idx++, *(input->opencl_image()));
-    kernel->setArg(idx++, *(filter->opencl_image()));
-    if (bias != nullptr) {
-      kernel->setArg(idx++, *(bias->opencl_image()));
-    }
-    kernel->setArg(idx++, *(output->opencl_image()));
-    kernel->setArg(idx++, relux_max_limit);
-    kernel->setArg(idx++, static_cast<int32_t>(input->dim(1)));
-    kernel->setArg(idx++, static_cast<int32_t>(input->dim(2)));
-    kernel->setArg(idx++, static_cast<int32_t>(input->dim(3)));
-    kernel->setArg(idx++, static_cast<int32_t>(height));
-    kernel->setArg(idx++, static_cast<int32_t>(width));
-    kernel->setArg(idx++, static_cast<int32_t>(channels));
-    kernel->setArg(idx++, static_cast<int32_t>(stride_h));
-    kernel->setArg(idx++, static_cast<int32_t>(stride_w));
-    kernel->setArg(idx++, stride_h_r);
-    kernel->setArg(idx++, stride_w_r);
-    kernel->setArg(idx++, static_cast<int32_t>(align_h));
-    kernel->setArg(idx++, static_cast<int32_t>(align_w));
-    kernel->setArg(idx++, static_cast<int32_t>(padding_h));
-    kernel->setArg(idx++, static_cast<int32_t>(padding_w));
-    kernel->setArg(idx++, static_cast<int32_t>(filter->dim(2)));
-    kernel->setArg(idx++, static_cast<int32_t>(filter->dim(3)));
-    kernel->setArg(idx++, static_cast<int32_t>(kernel_size));
-    kernel->setArg(idx++, static_cast<int32_t>(input_channel_blocks));
-    kernel->setArg(idx++, static_cast<int32_t>(channel_blocks));
-    *prev_input_shape = input->shape();
-  }
-  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, *kwg_size);
-  std::string tuning_key =
-      Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
-             output->dim(1), output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(*kernel_error);
-  return MACE_SUCCESS;
-}
-}  // namespace
+template <typename T>
+Deconv2dFunctor<DeviceType::GPU, T>::Deconv2dFunctor(
+    OpKernelContext *context,
+    const std::vector<int> &strides,
+    const Padding &padding_type,
+    const std::vector<int> &paddings,
+    const std::vector<index_t> &output_shape,
+    const ActivationType activation,
+    const float relux_max_limit)
+    : Deconv2dFunctorBase(context,
+                          strides,
+                          padding_type,
+                          paddings,
+                          output_shape,
+                          activation,
+                          relux_max_limit) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::Deconv2dKernel<T>);
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input,
@@ -188,16 +82,10 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
                              output_shape.data(),
                              paddings.data());
   }
-  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                  &output_image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
-  return Deconv2dOpencl(context_, &kernel_, input, filter, bias,
-                        strides_.data(), paddings.data(), activation_,
-                        relux_max_limit_, DataTypeToEnum<T>::value,
-                        &input_shape_, output, future,
-                        &kwg_size_, &kernel_error_);
+  return kernel_->Compute(context_, input, filter, bias,
+                          strides_.data(), paddings.data(), activation_,
+                          relux_max_limit_, output_shape, output, future);
 }
 template struct Deconv2dFunctor<DeviceType::GPU, float>;
...
@@ -13,98 +13,26 @@
 // limitations under the License.
 #include "mace/kernels/depth_to_space.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/depth_to_space.h"
 namespace mace {
 namespace kernels {
+template <typename T>
+DepthToSpaceOpFunctor<DeviceType::GPU, T>::DepthToSpaceOpFunctor(
+    OpKernelContext *context,
+    const int block_size)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::DepthToSpaceKernel<T>(block_size));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input, Tensor *output, StatsFuture *future) {
-  const index_t batch = input->dim(0);
-  const index_t input_height = input->dim(1);
-  const index_t input_width = input->dim(2);
-  const index_t input_depth = input->dim(3);
-  MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
-             "input depth should be dividable by block_size * block_size",
-             input_depth);
-  MACE_CHECK((input_depth % 4) == 0,
-             "input channel should be dividable by 4");
-  const index_t output_height = input_height * block_size_;
-  const index_t output_width = input_width * block_size_;
-  const index_t output_depth = input_depth / (block_size_ * block_size_);
-  MACE_CHECK(output_depth % 4 == 0, "output channel not support:")
-      << output_depth;
-  const index_t input_depth_blocks = RoundUpDiv4(input_depth);
-  const index_t output_depth_blocks = RoundUpDiv4(output_depth);
-  std::vector<index_t> output_shape = {batch,
-                                       output_height,
-                                       output_width,
-                                       output_depth};
-  std::vector<size_t> image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
-  const uint32_t gws[3] = {
-      static_cast<uint32_t>(RoundUpDiv4(output_depth)),
-      static_cast<uint32_t>(output_width),
-      static_cast<uint32_t>(output_height * batch)
-  };
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    const char *kernel_name = "depth_to_space";
-    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
-    std::stringstream kernel_name_ss;
-    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
-    built_options.emplace(kernel_name_ss.str());
-    auto dt = DataTypeToEnum<T>::value;
-    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
-                                              obfuscated_kernel_name,
-                                              built_options,
-                                              &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
-    kernel_.setArg(idx++, static_cast<int32_t>(input_height * batch));
-    kernel_.setArg(idx++, static_cast<int32_t>(input_width));
-    kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
-    kernel_.setArg(idx++, static_cast<int32_t>(output_width));
-    kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    input_shape_ = input->shape();
-  }
-  std::string tuning_key = Concat("depth_to_space_opencl_kernel",
-                                  batch, output_height,
-                                  output_width, output_depth);
-  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input, output, future);
 }
 template struct DepthToSpaceOpFunctor<DeviceType::GPU, float>;
...
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/depthwise_conv2d.h"
#include "mace/kernels/opencl/buffer/depthwise_conv2d.h"
#include "mace/kernels/opencl/image/depthwise_conv2d.h"
namespace mace {
namespace kernels {
template <typename T>
DepthwiseConv2dFunctor<DeviceType::GPU, T>::DepthwiseConv2dFunctor(
OpKernelContext *context,
const int *strides,
const Padding padding_type,
const std::vector<int> &paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit)
: DepthwiseConv2dFunctorBase(context,
strides,
padding_type,
paddings,
dilations,
activation,
relux_max_limit) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::DepthwiseConv2dKernel<T>);
} else {
kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel<T>);
}
}
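// Unlike most GPU functors touched by this refactor, which still fall
// through to MACE_NOT_IMPLEMENTED on the buffer path, depthwise conv2d
// already has a working buffer-based kernel, so both memory types are
// dispatched here.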
template <typename T>
MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
const Tensor *filter, /* MIHW */
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
return kernel_->Compute(context_, input, filter, bias,
strides_, padding_type_, paddings_,
dilations_, activation_, relux_max_limit_,
output, future);
}
template struct DepthwiseConv2dFunctor<DeviceType::GPU, float>;
template struct DepthwiseConv2dFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
...
@@ -13,125 +13,33 @@
 // limitations under the License.
 #include "mace/kernels/eltwise.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
+#include "mace/kernels/opencl/image/eltwise.h"
 namespace mace {
 namespace kernels {
+template <typename T>
+EltwiseFunctor<DeviceType::GPU, T>::EltwiseFunctor(
+    OpKernelContext *context,
+    const EltwiseType type,
+    const std::vector<float> &coeff,
+    const float scalar_input,
+    const int32_t scalar_input_index,
+    const DataFormat data_format) : OpKernel(context) {
+  MACE_UNUSED(data_format);
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::EltwiseKernel<T>(
+        type, coeff, scalar_input, scalar_input_index));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
                                                           const Tensor *input1,
                                                           Tensor *output,
                                                           StatsFuture *future) {
-  MACE_UNUSED(future);
-  bool swapped = false;
-  if (input1 != nullptr) {
-    MACE_CHECK(input0->dim_size() == input1->dim_size() ||
-               input0->dim_size() == 1 || input1->dim_size() == 1)
-        << "Inputs of Eltwise op must be same shape";
-    if (input0->size() != input1->size()) {
-      if (input0->size() < input1->size()) {
-        std::swap(input0, input1);
-        swapped = true;
-      }
-      if (input1->dim_size() == 1) {
-        MACE_CHECK(input0->dim(3) == input1->dim(0))
-            << "Element-Wise op only support channel dimension broadcast";
-      } else {
-        MACE_CHECK((input0->dim(0) == input1->dim(0) || input1->dim(0) == 1) &&
-                   input0->dim(3) == input1->dim(3) && input1->dim(1) == 1 &&
-                   input1->dim(2) == 1)
-            << "Element-Wise op only support channel dimension broadcast";
-      }
-    }
-  }
-  if (scalar_input_index_ == 0) {
-    swapped = !swapped;
-  }
-  std::vector<index_t> output_shape(4);
-  output_shape[0] = input0->dim(0);
-  output_shape[1] = input0->dim(1);
-  output_shape[2] = input0->dim(2);
-  output_shape[3] = input0->dim(3);
-  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                  &output_image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
-  const index_t batch = output->dim(0);
-  const index_t height = output->dim(1);
-  const index_t width = output->dim(2);
-  const index_t channels = output->dim(3);
-  const index_t channel_blocks = RoundUpDiv4(channels);
-  const index_t batch_height_pixels = batch * height;
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(batch_height_pixels)};
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    auto dt = DataTypeToEnum<T>::value;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
-    built_options.emplace("-Deltwise=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
-    if (input1 == nullptr) {
-      built_options.emplace("-DINPUT_TYPE=1");
-    } else if (input0->size() != input1->size()) {
-      if (input1->dim(0) == 1 || input1->dim_size() == 1)
-        built_options.emplace("-DINPUT_TYPE=3");
-      else
-        built_options.emplace("-DINPUT_TYPE=2");
-      if (swapped) built_options.emplace("-DSWAPPED");
-    }
-    if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
-                                              built_options, &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  if (!IsVecEqual(input_shape_, input0->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input0->opencl_image()));
-    if (input1 == nullptr) {
-      kernel_.setArg(idx++, scalar_input_);
-    } else {
-      kernel_.setArg(idx++, *(input1->opencl_image()));
-    }
-    kernel_.setArg(idx++, static_cast<int32_t>(height));
-    kernel_.setArg(idx++, static_cast<int32_t>(width));
-    kernel_.setArg(idx++, static_cast<int32_t>(channels));
-    if (!coeff_.empty()) {
-      kernel_.setArg(idx++, coeff_[0]);
-      kernel_.setArg(idx++, coeff_[1]);
-    }
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    input_shape_ = input0->shape();
-  }
-  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key =
-      Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
-             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input0, input1, output, future);
 }
 template struct EltwiseFunctor<DeviceType::GPU, float>;
...
@@ -13,239 +13,23 @@
 // limitations under the License.
 #include "mace/kernels/fully_connected.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
+#include "mace/kernels/opencl/image/fully_connected.h"
 namespace mace {
 namespace kernels {
-namespace {
-template <typename T>
-MaceStatus FCWXKernel(OpKernelContext *context,
-                      cl::Kernel *kernel,
-                      const Tensor *input,
-                      const Tensor *weight,
-                      const Tensor *bias,
-                      std::vector<index_t> *prev_input_shape,
-                      Tensor *output,
-                      const ActivationType activation,
-                      std::vector<uint32_t> *gws,
-                      std::vector<uint32_t> *lws,
-                      const float relux_max_limit,
-                      StatsFuture *future,
-                      std::unique_ptr<BufferBase> *kernel_error) {
-  MACE_CHECK_NOTNULL(gws);
-  MACE_CHECK_NOTNULL(lws);
-  auto runtime = context->device()->opencl_runtime();
-  if (kernel->get() == nullptr) {
-    const index_t batch = output->dim(0);
-    const index_t output_size = output->dim(3);
-    const index_t output_blocks = RoundUpDiv4(output_size);
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error, context);
-    NON_UNIFORM_WG_CONFIG;
-    auto dt = DataTypeToEnum<T>::value;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
-    built_options.emplace("-Dfully_connected_width=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    if (bias != nullptr) {
-      built_options.emplace("-DBIAS");
-    }
-    switch (activation) {
-      case NOOP:
-        break;
-      case RELU:
-        built_options.emplace("-DUSE_RELU");
-        break;
-      case RELUX:
-        built_options.emplace("-DUSE_RELUX");
-        break;
-      case TANH:
-        built_options.emplace("-DUSE_TANH");
-        break;
-      case SIGMOID:
-        built_options.emplace("-DUSE_SIGMOID");
-        break;
-      default:
-        LOG(FATAL) << "Unknown activation type: " << activation;
-    }
-    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
-      built_options.emplace("-DNON_QUALCOMM_ADRENO");
-    }
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name,
-                                              built_options, kernel));
-    if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
-      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
-      const uint32_t wave_size =
-          static_cast<uint32_t>(runtime->GetKernelWaveSize(*kernel));
-      *gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
-      const uint32_t kwg_size =
-          static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-      const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]);
-      *lws = {(*gws)[0], (*gws)[1], inter_local_blks};
-    } else {
-      *gws = {4, 8, static_cast<uint32_t>(batch * output_blocks)};
-      const uint32_t kwg_size =
-          static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-      const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]);
-      *lws = {(*gws)[0], (*gws)[1], inter_local_blks};
-    }
-  }
-  if (!IsVecEqual(*prev_input_shape, input->shape())) {
-    const index_t batch = output->dim(0);
-    const index_t output_blocks = RoundUpDiv4(output->dim(3));
-    (*gws)[2] = static_cast<uint32_t>(batch * output_blocks);
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG_PTR;
-    SET_3D_GWS_ARGS_PTR(kernel, *gws);
-    kernel->setArg(idx++, *(input->opencl_image()));
-    kernel->setArg(idx++, *(weight->opencl_image()));
-    if (bias != nullptr) {
-      kernel->setArg(idx++, *(bias->opencl_image()));
-    }
-    kernel->setArg(idx++, *(output->opencl_image()));
-    kernel->setArg(idx++, ((*lws)[0] * (*lws)[1] * (*lws)[2] * sizeof(float)),
-                   nullptr);
-    kernel->setArg(idx++, static_cast<int>(input->dim(1)));
-    kernel->setArg(idx++, static_cast<int>(input->dim(2)));
-    kernel->setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
-    kernel->setArg(idx++, static_cast<int>(output_blocks));
-    kernel->setArg(idx++, relux_max_limit);
-    *prev_input_shape = input->shape();
-  }
-  cl::Event event;
-  cl_int error;
-  if (runtime->IsNonUniformWorkgroupsSupported()) {
-    error = runtime->command_queue().enqueueNDRangeKernel(
-        *kernel, cl::NullRange, cl::NDRange((*gws)[0], (*gws)[1], (*gws)[2]),
-        cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event);
-  } else {
-    std::vector<uint32_t> roundup_gws(lws->size());
-    for (size_t i = 0; i < lws->size(); ++i) {
-      roundup_gws[i] = RoundUp((*gws)[i], (*lws)[i]);
-    }
-    error = runtime->command_queue().enqueueNDRangeKernel(
-        *kernel, cl::NullRange,
-        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
-        cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event);
-  }
-  OUT_OF_RANGE_VALIDATION(*kernel_error);
-  MACE_CL_RET_STATUS(error);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
-  return MACE_SUCCESS;
-}
-template <typename T>
-MaceStatus FCWTXKernel(OpKernelContext *context,
-                       cl::Kernel *kernel,
-                       const Tensor *input,
-                       const Tensor *weight,
-                       const Tensor *bias,
-                       std::vector<index_t> *prev_input_shape,
-                       Tensor *output,
-                       const ActivationType activation,
-                       std::vector<uint32_t> *gws,
-                       std::vector<uint32_t> *lws,
-                       const float relux_max_limit,
-                       StatsFuture *future,
-                       std::unique_ptr<BufferBase> *kernel_error) {
-  MACE_CHECK_NOTNULL(gws);
-  MACE_CHECK_NOTNULL(lws);
-  auto runtime = context->device()->opencl_runtime();
-  if (kernel->get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error, context);
-    NON_UNIFORM_WG_CONFIG;
-    auto dt = DataTypeToEnum<T>::value;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected");
-    built_options.emplace("-Dfully_connected=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    if (bias != nullptr) {
-      built_options.emplace("-DBIAS");
-    }
-    switch (activation) {
-      case NOOP:
-        break;
-      case RELU:
-        built_options.emplace("-DUSE_RELU");
-        break;
-      case RELUX:
-        built_options.emplace("-DUSE_RELUX");
-        break;
-      case TANH:
-        built_options.emplace("-DUSE_TANH");
-        break;
-      case SIGMOID:
-        built_options.emplace("-DUSE_SIGMOID");
-        break;
-      default:
-        LOG(FATAL) << "Unknown activation type: " << activation;
-    }
-    MACE_RETURN_IF_ERROR(
-        runtime->BuildKernel("fully_connected", kernel_name,
-                             built_options, kernel));
-    uint32_t kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-    *lws = {16, kwg_size / 16, 0};
-  }
-  if (!IsVecEqual(*prev_input_shape, input->shape())) {
-    const index_t batch = output->dim(0);
-    const index_t output_blocks = RoundUpDiv4(output->dim(3));
-    *gws = {
-        static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
-    };
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG_PTR;
-    SET_2D_GWS_ARGS_PTR(kernel, *gws);
-    kernel->setArg(idx++, *(input->opencl_image()));
-    kernel->setArg(idx++, *(weight->opencl_image()));
-    if (bias != nullptr) {
-      kernel->setArg(idx++, *(bias->opencl_image()));
-    }
-    kernel->setArg(idx++, *(output->opencl_image()));
-    kernel->setArg(idx++, static_cast<int>(input->dim(1)));
-    kernel->setArg(idx++, static_cast<int>(input->dim(2)));
-    kernel->setArg(idx++, static_cast<int>(input->dim(3)));
-    // FIXME handle flexable data type: half not supported
-    kernel->setArg(idx++, relux_max_limit);
-    *prev_input_shape = input->shape();
-  }
-  std::string tuning_key =
-      Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2),
-             output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key,
-                                           gws->data(), *lws, future));
-  OUT_OF_RANGE_VALIDATION(*kernel_error);
-  return MACE_SUCCESS;
-}
-}  // namespace
+template <typename T>
+FullyConnectedFunctor<DeviceType::GPU, T>::FullyConnectedFunctor(
+    OpKernelContext *context,
+    const ActivationType activation,
+    const float relux_max_limit)
+    : FullyConnectedBase(context, activation, relux_max_limit) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::FullyConnectedKernel<T>);
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input,
@@ -253,16 +37,9 @@ MaceStatus FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *bias,
     Tensor *output,
     StatsFuture *future) {
-  std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
-  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                  &output_image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
-  return FCWXKernel<T>(context_,
-                       &kernel_, input, weight, bias, &input_shape_, output,
-                       activation_, &gws_, &lws_, relux_max_limit_, future,
-                       &kernel_error_);
+  return kernel_->Compute(
+      context_, input, weight, bias, activation_, relux_max_limit_,
+      output, future);
 }
 template struct FullyConnectedFunctor<DeviceType::GPU, float>;
...
@@ -30,60 +30,61 @@
 namespace mace {
 namespace kernels {
+// oorc for 'Out Of Range Check'
+#define MACE_OUT_OF_RANGE_DEFINITION \
+  std::shared_ptr<BufferBase> oorc_flag;
-#define OUT_OF_RANGE_CONFIG(kernel_error, context) \
-  if (runtime->IsOutOfRangeCheckEnabled()) { \
-    built_options.emplace("-DOUT_OF_RANGE_CHECK"); \
-    (kernel_error) = std::move(std::unique_ptr<Buffer>( \
-        new Buffer((context)->device()->allocator()))); \
-    MACE_RETURN_IF_ERROR((kernel_error)->Allocate(1)); \
-    (kernel_error)->Map(nullptr); \
-    *((kernel_error)->mutable_data<char>()) = 0; \
-    (kernel_error)->UnMap(); \
-  }
+#define MACE_OUT_OF_RANGE_CONFIG \
+  if (runtime->IsOutOfRangeCheckEnabled()) { \
+    built_options.emplace("-DOUT_OF_RANGE_CHECK"); \
+  }
+#define MACE_OUT_OF_RANGE_INIT(kernel) \
+  if (runtime->IsOutOfRangeCheckEnabled()) { \
+    oorc_flag = std::move(std::unique_ptr<Buffer>( \
+        new Buffer((context)->device()->allocator()))); \
+    MACE_RETURN_IF_ERROR((oorc_flag)->Allocate(sizeof(int))); \
+    oorc_flag->Map(nullptr); \
+    *(oorc_flag->mutable_data<int>()) = 0; \
+    oorc_flag->UnMap(); \
+    (kernel).setArg(0, \
+        *(static_cast<cl::Buffer *>(oorc_flag->buffer())));\
+  }
-#define OUT_OF_RANGE_SET_ARG \
-  if (runtime->IsOutOfRangeCheckEnabled()) { \
-    kernel_.setArg(idx++, \
-        *(static_cast<cl::Buffer *>(kernel_error_->buffer()))); \
-  }
+#define MACE_OUT_OF_RANGE_SET_ARGS(kernel) \
+  if (runtime->IsOutOfRangeCheckEnabled()) { \
+    (kernel).setArg(idx++, \
+        *(static_cast<cl::Buffer *>(oorc_flag->buffer())));\
+  }
-#define OUT_OF_RANGE_SET_ARG_PTR \
-  if (runtime->IsOutOfRangeCheckEnabled()) { \
-    kernel->setArg(idx++, \
-        *(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); \
-  }
+#define MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel, size) \
+  if (runtime->IsOutOfRangeCheckEnabled()) { \
+    (kernel).setArg(idx++, \
+        *(static_cast<cl::Buffer *>(oorc_flag->buffer()))); \
+    (kernel).setArg(idx++, static_cast<int>(size)); \
+  }
-#define OUT_OF_RANGE_VALIDATION(kernel_error) \
-  if (runtime->IsOutOfRangeCheckEnabled()) { \
-    (kernel_error)->Map(nullptr); \
-    char *kerror_code = (kernel_error)->mutable_data<char>(); \
-    MACE_CHECK(*kerror_code == 0, "Kernel error code: ", *kerror_code);\
-    (kernel_error)->UnMap(); \
-  }
+#define MACE_OUT_OF_RANGE_VALIDATION \
+  if (runtime->IsOutOfRangeCheckEnabled()) { \
+    oorc_flag->Map(nullptr); \
+    int *kerror_code = oorc_flag->mutable_data<int>(); \
+    MACE_CHECK(*kerror_code == 0, "Kernel error code: ", *kerror_code); \
+    oorc_flag->UnMap(); \
+  }
-#define NON_UNIFORM_WG_CONFIG \
-  if (runtime->IsNonUniformWorkgroupsSupported()) { \
-    built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); \
-  }
+#define MACE_NON_UNIFORM_WG_CONFIG \
+  if (runtime->IsNonUniformWorkgroupsSupported()) { \
+    built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); \
+  }
-#define SET_3D_GWS_ARGS(kernel) \
-  kernel.setArg(idx++, gws[0]); \
-  kernel.setArg(idx++, gws[1]); \
-  kernel.setArg(idx++, gws[2]);
-#define SET_2D_GWS_ARGS(kernel) \
-  kernel.setArg(idx++, gws[0]); \
-  kernel.setArg(idx++, gws[1]);
-#define SET_3D_GWS_ARGS_PTR(kernel, gws) \
-  kernel->setArg(idx++, (gws)[0]); \
-  kernel->setArg(idx++, (gws)[1]); \
-  kernel->setArg(idx++, (gws)[2]);
-#define SET_2D_GWS_ARGS_PTR(kernel, gws) \
-  kernel->setArg(idx++, (gws)[0]); \
-  kernel->setArg(idx++, (gws)[1]);
+#define MACE_SET_3D_GWS_ARGS(kernel, gws) \
+  (kernel).setArg(idx++, (gws)[0]); \
+  (kernel).setArg(idx++, (gws)[1]); \
+  (kernel).setArg(idx++, (gws)[2]);
+#define MACE_SET_2D_GWS_ARGS(kernel, gws) \
+  (kernel).setArg(idx++, (gws)[0]); \
+  (kernel).setArg(idx++, (gws)[1]);
 // Max execution time of an OpenCL kernel during tuning, to prevent the UI
 // from getting stuck.
 const float kMaxKernelExecTime = 1000.0;  // microseconds
@@ -114,6 +115,10 @@ std::string DtToCLDt(const DataType dt);
 // e.g. half -> float
 std::string DtToUpCompatibleCLDt(const DataType dt);
+// CPU data type to OpenCL condition data type used in select
+// e.g. half -> float
+std::string DtToCLCondDt(const DataType dt);
 // Tuning or Run OpenCL kernel with 3D work group size
 MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime,
                                const cl::Kernel &kernel,
@@ -167,6 +172,7 @@ std::string Concat(Args... args) {
 std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
                                        const uint32_t *gws,
                                        const uint32_t kwg_size);
+
 } // namespace kernels
 } // namespace mace
 #endif // MACE_KERNELS_OPENCL_HELPER_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_ACTIVATION_H_
#define MACE_KERNELS_OPENCL_IMAGE_ACTIVATION_H_
#include "mace/kernels/activation.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class ActivationKernel : public OpenCLActivationKernel {
public:
ActivationKernel(ActivationType type,
T relux_max_limit)
: activation_(type), relux_max_limit_(relux_max_limit) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future) override;
private:
ActivationType activation_;
T relux_max_limit_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
std::string tuning_key_prefix_;
};
template <typename T>
MaceStatus ActivationKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context->device()->opencl_runtime();
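  // The MACE_OUT_OF_RANGE_* macros used below follow a fixed protocol (see
  // mace/kernels/opencl/helper.h): DEFINITION declares the oorc_flag buffer,
  // CONFIG adds -DOUT_OF_RANGE_CHECK to the build options, INIT allocates and
  // zeroes the flag and binds it as kernel arg 0, SET_ARGS re-binds it when
  // the remaining args are (re)set, and VALIDATION maps the flag back after
  // the run and checks that it is still 0.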
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
switch (activation_) {
case RELU:
tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX");
break;
case PRELU:
tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU");
break;
case TANH:
tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
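  // Work is dispatched over 4-channel blocks: e.g. for a 1x224x224x32 NHWC
  // input, gws = {RoundUpDiv4(32), 224, 1 * 224} = {8, 224, 224}.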
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
kernel_.setArg(idx++, *(alpha->opencl_image()));
}
kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_ACTIVATION_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_ADDN_H_
#define MACE_KERNELS_OPENCL_IMAGE_ADDN_H_
#include "mace/kernels/addn.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class AddNKernel : public OpenCLAddNKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus AddNKernel<T>::Compute(
OpKernelContext *context,
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor,
StatsFuture *future) {
size_t size = input_tensors.size();
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
const index_t batch = input_tensors[0]->dim(0);
const index_t height = input_tensors[0]->dim(1);
const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3);
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
for (size_t i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]);
MACE_CHECK(batch == input_tensors[i]->dim(0));
MACE_CHECK(height == input_tensors[i]->dim(1));
MACE_CHECK(width == input_tensors[i]->dim(2));
MACE_CHECK(channels == input_tensors[i]->dim(3));
}
if (kernel_.get() == nullptr) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
built_options.emplace("-Daddn=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
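  // INPUT_NUM is baked into the program, so a separate kernel binary is
  // compiled per input count; that is why more than four inputs is rejected
  // above rather than looped over.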
std::vector<index_t> output_shape = input_tensors[0]->shape();
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)};
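  // AddN uses a 2D dispatch with channel blocks folded into the x dimension:
  // e.g. 1x32x32x8 inputs give gws = {RoundUpDiv4(8) * 32, 1 * 32} = {64, 32}.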
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
output_tensor->ResizeImage(output_shape, output_image_shape));
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image()));
}
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
input_shape_ = input_tensors[0]->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_ADDN_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_BATCH_NORM_H_
#define MACE_KERNELS_OPENCL_IMAGE_BATCH_NORM_H_
#include "mace/kernels/batch_norm.h"
#include <memory>
#include <vector>
#include <set>
#include <string>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class BatchNormKernel : public OpenCLBatchNormKernel {
public:
BatchNormKernel(
const bool folded_constant,
const ActivationType activation,
const float relux_max_limit);
MaceStatus Compute(OpKernelContext *context,
const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
const float epsilon,
Tensor *output,
StatsFuture *future) override;
private:
const bool folded_constant_;
const ActivationType activation_;
const float relux_max_limit_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
BatchNormKernel<T>::BatchNormKernel(const bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: folded_constant_(folded_constant),
activation_(activation),
relux_max_limit_(relux_max_limit) {}
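// When folded_constant_ is set, scale and offset are assumed to already have
// mean, var and epsilon folded in, so the kernel is built with
// -DFOLDED_CONSTANT and the mean/var images are never bound (they may be
// null, as the check at the top of Compute() below allows).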
template <typename T>
MaceStatus BatchNormKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
const float epsilon,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(folded_constant_ || (mean != nullptr && var != nullptr));
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (folded_constant_) {
built_options.emplace("-DFOLDED_CONSTANT");
}
switch (activation_) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++, *(offset->opencl_image()));
if (!folded_constant_) {
kernel_.setArg(idx++, *(mean->opencl_image()));
kernel_.setArg(idx++, *(var->opencl_image()));
kernel_.setArg(idx++, epsilon);
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3), folded_constant_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_BATCH_NORM_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_BATCH_TO_SPACE_H_
#define MACE_KERNELS_OPENCL_IMAGE_BATCH_TO_SPACE_H_
#include "mace/kernels/batch_to_space.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *batch_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *space_tensor,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus BatchToSpaceKernel<T>::Compute(
OpKernelContext *context,
const Tensor *batch_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *space_tensor,
StatsFuture *future) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
space_tensor->ResizeImage(output_shape, output_image_shape));
const uint32_t chan_blk =
static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
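  // The dispatch is sized by the batch tensor (the kernel's input): one work
  // item per 4-channel block per input pixel, scattering into space_tensor.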
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const char *kernel_name = "batch_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
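    // When symbol obfuscation is enabled, MACE_OBFUSCATE_SYMBOL maps the
    // kernel name to an opaque identifier; the "-D<name>=<obfuscated>" define
    // renames the kernel entry point in the .cl source to match.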
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, batch_tensor->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape[0]);
kernel_.setArg(idx++, block_shape[1]);
kernel_.setArg(idx++, paddings[0]);
kernel_.setArg(idx++, paddings[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
input_shape_ = batch_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_BATCH_TO_SPACE_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_BIAS_ADD_H_
#define MACE_KERNELS_OPENCL_IMAGE_BIAS_ADD_H_
#include "mace/kernels/bias_add.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class BiasAddKernel : public OpenCLBiasAddKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus BiasAddKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
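  // On devices without non-uniform work-group support, each gws dimension is
  // rounded up to a multiple of the corresponding lws value; the extra work
  // items are expected to be masked out inside the kernel, which is why gws
  // was passed as kernel arguments via MACE_SET_3D_GWS_ARGS above.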
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
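  // Completion is deferred: the StatsFuture lets the caller decide when to
  // block on the CL event, and profiling stats are read from the event at
  // that point.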
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_BIAS_ADD_H_
@@ -12,29 +12,55 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "mace/kernels/buffer_to_image.h"
+#ifndef MACE_KERNELS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_
+#define MACE_KERNELS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_
+#include <set>
+#include <string>
+#include <vector>
+#include "mace/kernels/buffer_transform.h"
 #include "mace/kernels/opencl/helper.h"
 namespace mace {
 namespace kernels {
+namespace opencl {
+namespace image {
 template <typename T>
-MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
-const Tensor *buffer,
+class BufferToImage : public OpenCLBufferTransformKernel {
+public:
+MaceStatus Compute(
+OpKernelContext *context,
+const Tensor *input,
 const BufferType type,
-Tensor *image,
+const int wino_blk_size,
+Tensor *output,
+StatsFuture *future) override;
+private:
+cl::Kernel kernel_;
+std::vector<index_t> input_shape_;
+};
+template <typename T>
+MaceStatus BufferToImage<T>::Compute(
+OpKernelContext *context,
+const Tensor *input,
+const BufferType type,
+const int wino_blk_size,
+Tensor *output,
 StatsFuture *future) {
-auto formatted_buffer_shape = FormatBufferShape(buffer->shape(), type);
+auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
 std::vector<size_t> image_shape;
-CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size_);
+CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size);
 if (type == WINOGRAD_FILTER) {
 std::vector<index_t> new_shape =
-{(wino_blk_size_ + 2) * (wino_blk_size_ + 2),
-buffer->dim(0), buffer->dim(1)};
-MACE_RETURN_IF_ERROR(image->ResizeImage(new_shape, image_shape));
+{(wino_blk_size + 2) * (wino_blk_size + 2),
+input->dim(0), input->dim(1)};
+MACE_RETURN_IF_ERROR(output->ResizeImage(new_shape, image_shape));
 } else {
-MACE_RETURN_IF_ERROR(image->ResizeImage(buffer->shape(), image_shape));
+MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape));
 }
 uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
@@ -67,25 +93,26 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
 break;
 case WINOGRAD_FILTER: {
 std::stringstream ss_tmp;
-gws[1] /= (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
+gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
 ss_tmp << "winograd_filter_buffer_to_image_"
-<< wino_blk_size_ << "x" << wino_blk_size_;
+<< wino_blk_size << "x" << wino_blk_size;
 kernel_name = ss_tmp.str();
 break;
 }
 }
-auto runtime = context_->device()->opencl_runtime();
+auto runtime = context->device()->opencl_runtime();
+MACE_OUT_OF_RANGE_DEFINITION;
 if (kernel_.get() == nullptr) {
 std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
 std::set<std::string> built_options;
-OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-NON_UNIFORM_WG_CONFIG;
+MACE_OUT_OF_RANGE_CONFIG;
+MACE_NON_UNIFORM_WG_CONFIG;
 std::stringstream kernel_name_ss;
 kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
 built_options.emplace(kernel_name_ss.str());
-if (buffer->dtype() == image->dtype()) {
+if (input->dtype() == output->dtype()) {
 built_options.emplace(
 "-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
 built_options.emplace("-DCMD_DATA_TYPE=" +
@@ -100,30 +127,31 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
 "buffer_to_image", obfuscated_kernel_name, built_options, &kernel_));
 }
-if (!IsVecEqual(input_shape_, buffer->shape())) {
+MACE_OUT_OF_RANGE_INIT(kernel_);
+if (!IsVecEqual(input_shape_, input->shape())) {
 uint32_t idx = 0;
-OUT_OF_RANGE_SET_ARG;
-SET_2D_GWS_ARGS(kernel_);
-kernel_.setArg(idx++, *(buffer->opencl_buffer()));
-MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
+MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
+MACE_SET_2D_GWS_ARGS(kernel_, gws);
+kernel_.setArg(idx++, *(input->opencl_buffer()));
+MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
 "buffer offset not aligned");
 kernel_.setArg(idx++,
-static_cast<uint32_t>(buffer->buffer_offset() /
-GetEnumTypeSize(buffer->dtype())));
+static_cast<uint32_t>(input->buffer_offset() /
+GetEnumTypeSize(input->dtype())));
 if (type == CONV2D_FILTER) {
 const index_t
-inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
-kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
-kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
-kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
+inner_size = input->dim(1) * input->dim(2) * input->dim(3);
+kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
+kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
+kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
 kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
 } else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) {
-kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
-kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
-kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
-kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
+kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
+kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
+kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
+kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
 } else if (type == ARGUMENT) {
-kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
+kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
 } else {
 kernel_.setArg(idx++,
 static_cast<uint32_t>(formatted_buffer_shape[1]));
@@ -132,8 +160,8 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
 kernel_.setArg(idx++,
 static_cast<uint32_t>(formatted_buffer_shape[3]));
 }
-kernel_.setArg(idx++, *(image->opencl_image()));
-input_shape_ = buffer->shape();
+kernel_.setArg(idx++, *(output->opencl_image()));
+input_shape_ = input->shape();
 }
 const uint32_t kwg_size =
@@ -157,7 +185,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
 cl::NDRange(lws[0], lws[1]), nullptr, &event);
 }
 MACE_CL_RET_STATUS(error);
-OUT_OF_RANGE_VALIDATION(kernel_error_);
+MACE_OUT_OF_RANGE_VALIDATION;
 if (future != nullptr) {
 future->wait_fn = [runtime, event](CallStats *stats) {
 event.wait();
@@ -168,13 +196,14 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
 }
 // Mark the buffer unused.
-const_cast<Tensor *>(buffer)->MarkUnused();
+const_cast<Tensor *>(input)->MarkUnused();
 return MACE_SUCCESS;
 }
-template struct BufferToImageFunctor<DeviceType::GPU, float>;
-template struct BufferToImageFunctor<DeviceType::GPU, half>;
+} // namespace image
+} // namespace opencl
 } // namespace kernels
 } // namespace mace
+#endif // MACE_KERNELS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_
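The WINOGRAD_FILTER branch in the diff above reshapes an OIHW filter into an image of logical shape [(m + 2)^2, out_channels, in_channels], where m is the Winograd block size: for F(m x m, 3 x 3), each transformed filter tile is (m + 2) x (m + 2). A small check with invented sizes:
#include <cstdio>
int main() {
  const int wino_blk_size = 2;                      // hypothetical m
  const int out_ch = 64, in_ch = 32;                // hypothetical 3x3 filter
  const int tile_area = (wino_blk_size + 2) * (wino_blk_size + 2);  // 16
  printf("winograd filter image shape: [%d, %d, %d]\n",
         tile_area, out_ch, in_ch);
  return 0;
}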
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_CHANNEL_SHUFFLE_H_
#define MACE_KERNELS_OPENCL_IMAGE_CHANNEL_SHUFFLE_H_
#include "mace/kernels/channel_shuffle.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class ChannelShuffleKernel : public OpenCLChannelShuffleKernel {
public:
explicit ChannelShuffleKernel(const int groups) : groups_(groups) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) override;
private:
const int groups_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ChannelShuffleKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channels_per_group = channels / groups_;
MACE_CHECK(channels_per_group % 4 == 0,
"channels per group must be a multiple of 4");
MACE_CHECK(groups_ % 4 == 0, "groups must be a multiple of 4");
const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("channel_shuffle", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, groups_);
kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_CHANNEL_SHUFFLE_H_
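A CPU reference of the shuffle the kernel above computes may help: with channels laid out as groups x channels_per_group, output channel o reads input channel (o % groups) * channels_per_group + o / groups. This sketch is illustrative, not MACE code:
#include <cstdio>
int main() {
  const int groups = 4, channels_per_group = 4;  // 16 channels, both % 4 == 0
  for (int o = 0; o < groups * channels_per_group; ++o) {
    const int src = (o % groups) * channels_per_group + o / groups;
    printf("out[%2d] <- in[%2d]\n", o, src);
  }
  return 0;
}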
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/opencl/image/concat.h"
#include <algorithm>
#include <set>
#include <string>
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
namespace concat {
namespace {
std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t
cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] =
std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
}
return lws;
}
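// Worked example of the heuristic above, with invented numbers: suppose
// kwg_size = 256, gws = {8, 32, 64}, and cache_size / kBaseGPUMemCacheSize
// gives base = 4. Then:
//   lws[1] = min(32, 256)              = 32
//   lws[0] = min(4, 256 / 32)          = 4
//   lws[2] = max(min(4, 256 / 128), 1) = 2
// so the 4 x 32 x 2 work-group exactly fills the 256-thread limit.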
} // namespace
MaceStatus Concat2(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input0,
const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channel = output->dim(3);
const int channel_blk = RoundUpDiv4(channel);
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height),
};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name);
if (input0->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
}
if (input0->dim(3) % 4 == 0) {
built_options.emplace("-DDIVISIBLE_FOUR");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
built_options, kernel));
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
MACE_OUT_OF_RANGE_INIT(*kernel);
if (!IsVecEqual(*prev_input_shape, input0->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(*kernel);
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++,
*(static_cast<const cl::Image2D *>(input0->opencl_image())));
kernel->setArg(idx++,
*(static_cast<const cl::Image2D *>(input1->opencl_image())));
kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
kernel->setArg(idx++,
*(static_cast<cl::Image2D *>(output->opencl_image())));
*prev_input_shape = input0->shape();
}
const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
std::string tuning_key =
Concat("concat_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
MaceStatus ConcatN(OpKernelContext *context,
cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
built_options, kernel));
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const int inputs_count = input_list.size();
index_t chan_blk_offset = 0;
cl::Event event;
CallStats call_stats{INT64_MAX, 0};
MACE_OUT_OF_RANGE_INIT(*kernel);
for (int i = 0; i < inputs_count; ++i) {
const Tensor *input = input_list[i];
index_t input_channel_blk = input->dim(3) / 4;
const uint32_t gws[3] = {
static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height),
};
const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(*kernel);
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, static_cast<int32_t>(chan_blk_offset));
kernel->setArg(idx++, *(output->opencl_image()));
chan_blk_offset += input_channel_blk;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t j = 0; j < 3; ++j) {
roundup_gws[j] = RoundUp(gws[j], lws[j]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (future != nullptr && runtime->is_profiling_enabled()) {
event.wait();
CallStats tmp_stats;
runtime->GetCallStats(event, &tmp_stats);
call_stats.start_micros =
std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
}
}
if (future != nullptr) {
future->wait_fn = [call_stats](CallStats *stats) {
if (stats != nullptr) {
stats->start_micros = call_stats.start_micros;
stats->end_micros = stats->start_micros + call_stats.end_micros;
}
};
}
return MACE_SUCCESS;
}
} // namespace concat
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
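The ConcatN loop above advances through the output image one input at a time; each input with channel count c contributes c / 4 channel blocks, and chan_blk_offset tells the kernel where those blocks land. A host-side sketch with invented inputs:
#include <cstdio>
#include <vector>
int main() {
  const std::vector<int> input_channels = {8, 16, 4};  // each % 4 == 0
  int chan_blk_offset = 0;
  for (size_t i = 0; i < input_channels.size(); ++i) {
    const int blocks = input_channels[i] / 4;
    printf("input %zu: %d block(s) written at block offset %d\n",
           i, blocks, chan_blk_offset);
    chan_blk_offset += blocks;
  }
  return 0;
}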
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_CONCAT_H_
#define MACE_KERNELS_OPENCL_IMAGE_CONCAT_H_
#include "mace/kernels/concat.h"
#include <memory>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
namespace concat {
MaceStatus Concat2(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input0,
const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size);
MaceStatus ConcatN(OpKernelContext *context,
cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size);
} // namespace concat
template <typename T>
class ConcatKernel : public OpenCLConcatKernel {
public:
explicit ConcatKernel(const int32_t axis) : axis_(axis) {}
MaceStatus Compute(
OpKernelContext *context,
const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future) override;
private:
int32_t axis_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ConcatKernel<T>::Compute(
OpKernelContext *context,
const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future) {
const int inputs_count = input_list.size();
MACE_CHECK(inputs_count >= 2 && axis_ == 3)
<< "Concat OpenCL kernel only supports two or more inputs with axis == 3";
const Tensor *input0 = input_list[0];
bool divisible_four = input0->dim(axis_) % 4 == 0;
std::vector<index_t> output_shape(input0->shape());
for (int i = 1; i < inputs_count; ++i) {
const Tensor *input = input_list[i];
MACE_CHECK(input->dim_size() == input0->dim_size(),
"Ranks of all input tensors must be the same.");
divisible_four &= input->dim(axis_) % 4 == 0;
for (int j = 0; j < input->dim_size(); ++j) {
if (j == axis_) {
continue;
}
MACE_CHECK(input->dim(j) == input0->dim(j),
"Dimensions of inputs should be equal except along the concat axis.");
}
output_shape[axis_] += input->dim(axis_);
}
MACE_CHECK(
inputs_count == 2 || divisible_four,
"Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
switch (inputs_count) {
case 2:
return concat::Concat2(
context, &kernel_, input_list[0], input_list[1],
DataTypeToEnum<T>::value, &input_shape_, output, future, &kwg_size_);
default:
if (divisible_four) {
return concat::ConcatN(context, &kernel_, input_list,
DataTypeToEnum<T>::value, output, future,
&kwg_size_);
} else {
MACE_NOT_IMPLEMENTED;
}
}
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_CONCAT_H_
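The dispatch rule in ConcatKernel::Compute above can be restated in a few lines: two inputs always take the Concat2 path, while three or more fall back to ConcatN only if every input's channel count is a multiple of 4. A sketch with invented shapes:
#include <cstdio>
#include <vector>
const char *PickPath(const std::vector<int> &channels) {
  bool divisible_four = true;
  for (int c : channels) divisible_four &= (c % 4 == 0);
  if (channels.size() == 2) return "Concat2";
  return divisible_four ? "ConcatN" : "not implemented";
}
int main() {
  printf("%s\n", PickPath({3, 5}));      // Concat2
  printf("%s\n", PickPath({8, 4, 12}));  // ConcatN
  printf("%s\n", PickPath({3, 5, 7}));   // not implemented
  return 0;
}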
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_CONV_2D_H_
#define MACE_KERNELS_OPENCL_IMAGE_CONV_2D_H_
#include "mace/kernels/conv_2d.h"
#include <memory>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size);
extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size);
extern MaceStatus Conv2dOpencl(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size);
template <typename T>
class Conv2dKernel : public OpenCLConv2dKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus Conv2dKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) {
typedef MaceStatus (*Conv2dOpenclFunction)(
OpKernelContext *context,
cl::Kernel * kernel, const Tensor *input, const Tensor *filter,
const Tensor *bias, const int stride, const int *padding,
const int *dilations, const ActivationType activation,
const float relux_max_limit, const DataType dt,
std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
uint32_t *kwg_size);
// Selection table indexed by kernel size - 1 (1x1 and 3x3 implemented)
static const Conv2dOpenclFunction selector[3] = {
Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3};
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (strides[0] != strides[1] ||
(dilations[0] > 1 && (strides[0] > 1 || kernel_h == 1))) {
LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides[0] << "x" << strides[1]
<< ",dilations " << dilations[0] << "x" << dilations[1]
<< " is not implemented yet.";
MACE_NOT_IMPLEMENTED;
}
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
if (kernel_h == kernel_w && kernel_h <= 3 &&
selector[kernel_h - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1];
return conv2d_func(context,
&kernel_, input, filter, bias, strides[0], paddings.data(), dilations,
activation, relux_max_limit, DataTypeToEnum<T>::value, &input_shape_,
output, future, &kwg_size_);
} else {
return Conv2dOpencl(
context, &kernel_, input, filter, bias,
strides[0], paddings.data(), dilations,
activation, relux_max_limit, DataTypeToEnum<T>::value, &input_shape_,
output, future, &kwg_size_);
}
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_CONV_2D_H_
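For reference, the output size that CalcNHWCPaddingAndOutputSize / CalcOutputSize above are expected to produce follows the standard floor-rounded convolution formula; this is textbook arithmetic, not MACE's exact helper:
#include <cstdio>
// out = floor((in + pad_total - (dilation * (k - 1) + 1)) / stride) + 1
int ConvOutSize(int in, int k, int stride, int dilation, int pad_total) {
  const int effective_k = dilation * (k - 1) + 1;
  return (in + pad_total - effective_k) / stride + 1;
}
int main() {
  // Hypothetical 3x3 stride-2 convolution, 224-wide input, 1+1 padding.
  printf("out width = %d\n", ConvOutSize(224, 3, 2, 1, 2));  // 112
  return 0;
}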
@@ -19,6 +19,8 @@
 namespace mace {
 namespace kernels {
+namespace opencl {
+namespace image {
 namespace {
 // (inputs + weights + outputs) * array_size * sizeof(float)
@@ -78,8 +80,7 @@ extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context,
 std::vector<index_t> *prev_input_shape,
 Tensor *output,
 StatsFuture *future,
-uint32_t *kwg_size,
-std::unique_ptr<BufferBase> *kernel_error) {
+uint32_t *kwg_size) {
 MACE_UNUSED(padding);
 MACE_UNUSED(dilations);
 const index_t batch = output->dim(0);
@@ -96,13 +97,13 @@ extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context,
 const index_t input_channel_blocks = RoundUpDiv4(input_channels);
 auto runtime = context->device()->opencl_runtime();
+MACE_OUT_OF_RANGE_DEFINITION;
 if (kernel->get() == nullptr) {
 MACE_CHECK(input_batch == batch);
 std::set<std::string> built_options;
-OUT_OF_RANGE_CONFIG(*kernel_error, context);
-NON_UNIFORM_WG_CONFIG;
+MACE_OUT_OF_RANGE_CONFIG;
+MACE_NON_UNIFORM_WG_CONFIG;
 std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1");
 built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
 built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
@@ -139,12 +140,13 @@ extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context,
 const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
 static_cast<uint32_t>(width_blocks),
 static_cast<uint32_t>(height * batch)};
+MACE_OUT_OF_RANGE_INIT(*kernel);
 // Support different input size
 if (!IsVecEqual(*prev_input_shape, input->shape())) {
 uint32_t idx = 0;
-OUT_OF_RANGE_SET_ARG_PTR;
-SET_3D_GWS_ARGS_PTR(kernel, gws);
+MACE_OUT_OF_RANGE_SET_ARGS(*kernel);
+MACE_SET_3D_GWS_ARGS(*kernel, gws);
 kernel->setArg(idx++, *(input->opencl_image()));
 kernel->setArg(idx++, *(filter->opencl_image()));
 if (bias != nullptr) {
@@ -169,9 +171,11 @@ extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context,
 output->dim(2), output->dim(3));
 MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
 gws, lws, future));
-OUT_OF_RANGE_VALIDATION(*kernel_error);
+MACE_OUT_OF_RANGE_VALIDATION;
 return MACE_SUCCESS;
 }
+} // namespace image
+} // namespace opencl
 } // namespace kernels
 } // namespace mace
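Conceptually (a standard identity, not anything MACE-specific), the 1x1 convolution handled above is a matrix product over flattened spatial positions: [batch * height * width, in_ch] times [in_ch, out_ch]. A naive reference for checking small cases:
#include <cstdio>
#include <vector>
int main() {
  const int hw = 2, in_ch = 3, out_ch = 2;               // tiny invented sizes
  std::vector<float> x(hw * in_ch, 1.f), w(in_ch * out_ch, 0.5f);
  std::vector<float> y(hw * out_ch, 0.f);
  for (int p = 0; p < hw; ++p)          // each spatial position
    for (int o = 0; o < out_ch; ++o)    // each output channel
      for (int i = 0; i < in_ch; ++i)   // reduce over input channels
        y[p * out_ch + o] += x[p * in_ch + i] * w[i * out_ch + o];
  printf("y[0] = %.1f\n", y[0]);        // 3 * 1.0 * 0.5 = 1.5
  return 0;
}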
@@ -21,6 +21,9 @@
 namespace mace {
 namespace kernels {
+namespace opencl {
+namespace image {
 namespace {
 // (inputs + weights + outputs) * array_size * sizeof(float)
 const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
@@ -71,8 +74,7 @@ extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context,
 std::vector<index_t> *prev_input_shape,
 Tensor *output,
 StatsFuture *future,
-uint32_t *kwg_size,
-std::unique_ptr<BufferBase> *kernel_error) {
+uint32_t *kwg_size) {
 const index_t batch = output->dim(0);
 const index_t height = output->dim(1);
 const index_t width = output->dim(2);
@@ -84,11 +86,12 @@ extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context,
 const index_t width_blocks = RoundUpDiv<index_t, 5>(width);
 auto runtime = context->device()->opencl_runtime();
+MACE_OUT_OF_RANGE_DEFINITION;
 if (kernel->get() == nullptr) {
 std::set<std::string> built_options;
-OUT_OF_RANGE_CONFIG(*kernel_error, context);
-NON_UNIFORM_WG_CONFIG;
+MACE_OUT_OF_RANGE_CONFIG;
+MACE_NON_UNIFORM_WG_CONFIG;
 std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
 built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
 built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
@@ -123,12 +126,13 @@ extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context,
 const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
 static_cast<uint32_t>(width_blocks),
 static_cast<uint32_t>(height * batch)};
+MACE_OUT_OF_RANGE_INIT(*kernel);
 // Support different input size
 if (!IsVecEqual(*prev_input_shape, input->shape())) {
 uint32_t idx = 0;
-OUT_OF_RANGE_SET_ARG_PTR;
-SET_3D_GWS_ARGS_PTR(kernel, gws);
+MACE_OUT_OF_RANGE_SET_ARGS(*kernel);
+MACE_SET_3D_GWS_ARGS(*kernel, gws);
 kernel->setArg(idx++, *(input->opencl_image()));
 kernel->setArg(idx++, *(filter->opencl_image()));
 if (bias != nullptr) {
@@ -149,16 +153,17 @@ extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context,
 *prev_input_shape = input->shape();
 }
 std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
 std::string tuning_key =
 Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1),
 output->dim(2), output->dim(3));
 MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
 gws, lws, future));
-OUT_OF_RANGE_VALIDATION(*kernel_error);
+MACE_OUT_OF_RANGE_VALIDATION;
 return MACE_SUCCESS;
 }
+} // namespace image
+} // namespace opencl
 } // namespace kernels
 } // namespace mace
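The width_blocks = RoundUpDiv<index_t, 5>(width) above reflects that the 3x3 kernel computes five output columns per work-item. With an invented width:
#include <cstdio>
int main() {
  const int width = 112;
  const int width_blocks = (width + 4) / 5;  // RoundUpDiv<index_t, 5>
  printf("%d columns -> %d work-items per row (last one covers %d)\n",
         width, width_blocks, width - (width_blocks - 1) * 5);
  return 0;
}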
@@ -21,6 +21,9 @@
 namespace mace {
 namespace kernels {
+namespace opencl {
+namespace image {
 namespace {
 // (inputs + weights + outputs) * array_size * sizeof(float)
 const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
@@ -79,8 +82,7 @@ extern MaceStatus Conv2dOpencl(OpKernelContext *context,
 std::vector<index_t> *prev_input_shape,
 Tensor *output,
 StatsFuture *future,
-uint32_t *kwg_size,
-std::unique_ptr<BufferBase> *kernel_error) {
+uint32_t *kwg_size) {
 const index_t batch = output->dim(0);
 const index_t height = output->dim(1);
 const index_t width = output->dim(2);
@@ -92,11 +94,12 @@ extern MaceStatus Conv2dOpencl(OpKernelContext *context,
 const index_t width_blocks = RoundUpDiv4(width);
 auto runtime = context->device()->opencl_runtime();
+MACE_OUT_OF_RANGE_DEFINITION;
 if (kernel->get() == nullptr) {
 std::set<std::string> built_options;
-OUT_OF_RANGE_CONFIG(*kernel_error, context);
-NON_UNIFORM_WG_CONFIG;
+MACE_OUT_OF_RANGE_CONFIG;
+MACE_NON_UNIFORM_WG_CONFIG;
 std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
 built_options.emplace("-Dconv_2d=" + kernel_name);
 built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
@@ -131,12 +134,13 @@ extern MaceStatus Conv2dOpencl(OpKernelContext *context,
 const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
 static_cast<uint32_t>(width_blocks),
 static_cast<uint32_t>(height * batch)};
+MACE_OUT_OF_RANGE_INIT(*kernel);
 // Support different input size
 if (!IsVecEqual(*prev_input_shape, input->shape())) {
 uint32_t idx = 0;
-OUT_OF_RANGE_SET_ARG_PTR;
-SET_3D_GWS_ARGS_PTR(kernel, gws);
+MACE_OUT_OF_RANGE_SET_ARGS(*kernel);
+MACE_SET_3D_GWS_ARGS(*kernel, gws);
 kernel->setArg(idx++, *(input->opencl_image()));
 kernel->setArg(idx++, *(filter->opencl_image()));
 if (bias != nullptr) {
@@ -168,9 +172,11 @@ extern MaceStatus Conv2dOpencl(OpKernelContext *context,
 MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
 gws, lws, future));
-OUT_OF_RANGE_VALIDATION(*kernel_error);
+MACE_OUT_OF_RANGE_VALIDATION;
 return MACE_SUCCESS;
 }
+} // namespace image
+} // namespace opencl
 } // namespace kernels
 } // namespace mace
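One plausible reading of the kernel_cache_size comment repeated in these files: it estimates the bytes touched per work-item as (input float4 loads + weight float4 loads + output float4 stores) * 4 floats per vector * 4 bytes per float, which the LocalWS heuristics then weigh against the device's global memory cache. For the general conv kernel above:
#include <cstdio>
int main() {
  const int inputs = 4, weights = 4, outputs = 4;  // float4 values per item
  printf("kernel_cache_size = %d bytes\n",
         (inputs + weights + outputs) * 4 * 4);    // 192, as in the source
  return 0;
}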
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_CROP_H_
#define MACE_KERNELS_OPENCL_IMAGE_CROP_H_
#include "mace/kernels/crop.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class CropKernel : public OpenCLCropKernel {
public:
explicit CropKernel(
const int axis,
const std::vector<int> &offset)
: axis_(axis), offset_(offset) {}
MaceStatus Compute(
OpKernelContext *context,
const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future) override;
private:
const int axis_;
std::vector<int> offset_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus CropKernel<T>::Compute(
OpKernelContext *context,
const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future) {
const int32_t inputs_count = static_cast<int32_t>(input_list.size());
MACE_CHECK(inputs_count >= 2)
<< "Crop OpenCL kernel requires at least two inputs";
const Tensor *input0 = input_list[0];
const Tensor *input1 = input_list[1];
const uint32_t in0_dims = static_cast<uint32_t>(input0->dim_size());
const uint32_t in1_dims = static_cast<uint32_t>(input1->dim_size());
MACE_CHECK(in0_dims == 4 && in1_dims == 4,
"Crop op only supports 4-dims inputs now.");
std::vector<int32_t> offsets(4, 0);
std::vector<index_t> output_shape(input0->shape());
switch (axis_) {
case 0:
if (offset_.size() == 1) {
offsets[0] = offset_[0];
offsets[1] = offset_[0];
offsets[2] = offset_[0];
offsets[3] = offset_[0];
} else if (offset_.size() == 4) {
offsets[0] = offset_[0];
offsets[1] = offset_[2];
offsets[2] = offset_[3];
offsets[3] = offset_[1];
}
for (int i = 0; i < 4; ++i) {
output_shape[i] = input1->dim(i);
}
break;
case 1:
if (offset_.size() == 1) {
offsets[1] = offset_[0];
offsets[2] = offset_[0];
offsets[3] = offset_[0];
} else if (offset_.size() == 3) {
offsets[1] = offset_[1];
offsets[2] = offset_[2];
offsets[3] = offset_[0];
}
for (int i = 1; i < 4; ++i) {
output_shape[i] = input1->dim(i);
}
break;
case 2:
if (offset_.size() == 1) {
offsets[1] = offset_[0];
offsets[2] = offset_[0];
} else if (offset_.size() == 2) {
offsets[1] = offset_[0];
offsets[2] = offset_[1];
}
output_shape[1] = input1->dim(1);
output_shape[2] = input1->dim(2);
break;
case 3:
if (offset_.size() == 1) {
offsets[2] = offset_[0];
}
output_shape[2] = input1->dim(2);
break;
default:
MACE_CHECK(axis_ >= 0 && axis_ < 4, "axis is out of boundary.");
break;
}
MACE_CHECK(offsets[3] % 4 == 0,
"MACE OpenCL only supports channel crop offsets"
" divisible by 4.");
for (index_t i = 0; i < 4; ++i) {
MACE_CHECK(input0->dim(i) - offsets[i] >= input1->dim(i))
<< "the crop for dimension " << i << " is out of bound with size "
<< input1->dim(i) << " and offset " << offsets[i];
}
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
const index_t offset_chan_blk = RoundUpDiv4(offsets[3]);
const index_t channel_blk = RoundUpDiv4(output->dim(3));
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1))
};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop");
built_options.emplace("-Dcrop=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input0->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(offsets[0]));
kernel_.setArg(idx++, static_cast<int>(offsets[1]));
kernel_.setArg(idx++, static_cast<int>(offsets[2]));
kernel_.setArg(idx++, static_cast<int>(offset_chan_blk));
kernel_.setArg(idx++, static_cast<int>(input0->dim(1)));
kernel_.setArg(idx++, static_cast<int>(input0->dim(2)));
kernel_.setArg(idx++, static_cast<int>(output->dim(1)));
kernel_.setArg(idx++, static_cast<int>(output->dim(2)));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_CROP_H_
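The offset remapping in CropKernel::Compute above appears to translate Caffe-style NCHW crop offsets into MACE's NHWC layout: a 4-element offset {n, c, h, w} is stored as {n, h, w, c}. A sketch with invented values:
#include <cstdio>
int main() {
  const int offset_nchw[4] = {0, 8, 2, 3};  // hypothetical axis == 0 case
  int offsets[4];
  offsets[0] = offset_nchw[0];              // batch
  offsets[1] = offset_nchw[2];              // height
  offsets[2] = offset_nchw[3];              // width
  offsets[3] = offset_nchw[1];              // channel, must be % 4 == 0
  printf("NHWC offsets = {%d, %d, %d, %d}\n",
         offsets[0], offsets[1], offsets[2], offsets[3]);
  return 0;
}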
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_DECONV_2D_H_
#define MACE_KERNELS_OPENCL_IMAGE_DECONV_2D_H_
#include "mace/kernels/deconv_2d.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class Deconv2dKernel : public OpenCLDeconv2dKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *padding_data,
const ActivationType activation,
const float relux_max_limit,
const std::vector<index_t> &output_shape,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus Deconv2dKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *padding_data,
const ActivationType activation,
const float relux_max_limit,
const std::vector<index_t> &output_shape,
Tensor *output,
StatsFuture *future) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
const DataType dt = DataTypeToEnum<T>::value;
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t input_channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const int stride_h = strides[0];
const int stride_w = strides[1];
MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0.");
#define MACE_WIDTH_BLK 5
const index_t n_strides = (width + stride_w - 1) / stride_w;
const index_t width_blocks =
((n_strides + MACE_WIDTH_BLK - 1) / MACE_WIDTH_BLK) * stride_w;
const float stride_h_r = 1.f / static_cast<float>(stride_h);
const float stride_w_r = 1.f / static_cast<float>(stride_w);
const int padding_h = (padding_data[0] + 1) >> 1;
const int padding_w = (padding_data[1] + 1) >> 1;
const int align_h = stride_h - 1 - padding_h;
const int align_w = stride_w - 1 - padding_w;
const int kernel_size = filter->dim(2) * filter->dim(3);
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d");
built_options.emplace("-Ddeconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel_.setArg(idx++, *(bias->opencl_image()));
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit);
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(height));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(channels));
kernel_.setArg(idx++, static_cast<int32_t>(stride_h));
kernel_.setArg(idx++, static_cast<int32_t>(stride_w));
kernel_.setArg(idx++, stride_h_r);
kernel_.setArg(idx++, stride_w_r);
kernel_.setArg(idx++, static_cast<int32_t>(align_h));
kernel_.setArg(idx++, static_cast<int32_t>(align_w));
kernel_.setArg(idx++, static_cast<int32_t>(padding_h));
kernel_.setArg(idx++, static_cast<int32_t>(padding_w));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(kernel_size));
kernel_.setArg(idx++, static_cast<int32_t>(input_channel_blocks));
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_DECONV_2D_H_
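Worked numbers (invented) for the deconvolution width blocking above: n_strides = ceil(width / stride_w), then width_blocks rounds n_strides up to whole groups of MACE_WIDTH_BLK (5) work-items and rescales by stride_w; the reciprocals stride_h_r / stride_w_r let the kernel divide by multiplication:
#include <cstdio>
int main() {
  const int width = 17, stride_w = 2;
  const int n_strides = (width + stride_w - 1) / stride_w;        // 9
  const int width_blocks = ((n_strides + 5 - 1) / 5) * stride_w;  // 4
  const float stride_w_r = 1.f / static_cast<float>(stride_w);    // 0.5
  printf("n_strides=%d width_blocks=%d stride_w_r=%.2f\n",
         n_strides, width_blocks, stride_w_r);
  return 0;
}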
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_DEPTH_TO_SPACE_H_
#define MACE_KERNELS_OPENCL_IMAGE_DEPTH_TO_SPACE_H_
#include "mace/kernels/depth_to_space.h"
#include <memory>
#include <vector>
#include <set>
#include <string>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel {
public:
explicit DepthToSpaceKernel(const int block_size)
: block_size_(block_size) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) override;
private:
const int block_size_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus DepthToSpaceKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t input_depth = input->dim(3);
MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
"input depth should be divisible by block_size * block_size",
input_depth);
MACE_CHECK((input_depth % 4) == 0,
"input channel should be divisible by 4");
const index_t output_height = input_height * block_size_;
const index_t output_width = input_width * block_size_;
const index_t output_depth = input_depth / (block_size_ * block_size_);
MACE_CHECK(output_depth % 4 == 0, "output channel not support:")
<< output_depth;
const index_t input_depth_blocks = RoundUpDiv4(input_depth);
const index_t output_depth_blocks = RoundUpDiv4(output_depth);
std::vector<index_t> output_shape = {batch,
output_height,
output_width,
output_depth};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output_depth)),
static_cast<uint32_t>(output_width),
static_cast<uint32_t>(output_height * batch)
};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
const char *kernel_name = kernel_name = "depth_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
kernel_.setArg(idx++, static_cast<int32_t>(input_height * batch));
kernel_.setArg(idx++, static_cast<int32_t>(input_width));
kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
kernel_.setArg(idx++, static_cast<int32_t>(output_width));
kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
std::string tuning_key = Concat("depth_to_space_opencl_kernel",
batch, output_height,
output_width, output_depth);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_DEPTH_TO_SPACE_H_
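A quick shape check (invented sizes) for the constraints enforced above: the input depth must be divisible by block_size^2, and the resulting output depth by 4, because the image layout packs channels in blocks of 4:
#include <cstdio>
int main() {
  const int block_size = 2;
  const int h = 16, w = 16, c = 16;                 // hypothetical NHWC input
  const int out_c = c / (block_size * block_size);  // 4, divisible by 4
  printf("(%d, %d, %d) -> (%d, %d, %d)\n",
         h, w, c, h * block_size, w * block_size, out_c);
  return 0;
}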
@@ -12,14 +12,17 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/opencl/image/depthwise_conv2d.h"
#include <algorithm>
#include <set>
#include <string>
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
namespace depthwise {
namespace {
// (inputs + weights + outputs) * array_size * sizeof(float)
@@ -60,7 +63,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace
MaceStatus DepthwiseConv2d(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input, // NHWC
const Tensor *filter, // HWIM
@@ -74,8 +77,7 @@ static MaceStatus DepthwiseConv2d(OpKernelContext *context,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
@@ -93,11 +95,12 @@ static MaceStatus DepthwiseConv2d(OpKernelContext *context,
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d_s1");
@@ -135,6 +138,7 @@ static MaceStatus DepthwiseConv2d(OpKernelContext *context,
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
MACE_OUT_OF_RANGE_INIT(*kernel);
if (!IsVecEqual(*prev_input_shape, input->shape())) {
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
@@ -147,8 +151,8 @@ static MaceStatus DepthwiseConv2d(OpKernelContext *context,
input_channels);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(*kernel);
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
@@ -179,60 +183,12 @@ static MaceStatus DepthwiseConv2d(OpKernelContext *context,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace depthwise
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_DEPTHWISE_CONV2D_H_
#define MACE_KERNELS_OPENCL_IMAGE_DEPTHWISE_CONV2D_H_
#include "mace/kernels/depthwise_conv2d.h"
#include <memory>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
namespace depthwise {
MaceStatus DepthwiseConv2d(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input, // NHWC
const Tensor *filter, // HWIM
const Tensor *bias,
const int stride,
const int *paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size);
} // namespace depthwise
template <typename T>
class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus DepthwiseConv2dKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) {
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (strides[0] != strides[1]) {
LOG(WARNING) << "OpenCL depthwise conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides[0] << "x" << strides[1]
<< " is not implemented yet, using slow version";
MACE_NOT_IMPLEMENTED;
}
// Create a fake conv_2d filter to calculate the paddings and output size
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
fake_filter_shape[1] = filter->dim(1);
fake_filter_shape[2] = filter->dim(2);
fake_filter_shape[3] = filter->dim(3);
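// Illustrative example (shapes assumed): a 3x3 depthwise filter with
// multiplier 1 over 32 input channels has MIHW shape {1, 32, 3, 3}; the fake
// conv2d filter becomes {1 * 32, 32, 3, 3} = {32, 32, 3, 3}, i.e.
// out_channels = multiplier * in_channels, which is what the shared
// padding/output-size helpers below expect.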
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
return depthwise::DepthwiseConv2d(
context, &kernel_, input, filter, bias, strides[0], paddings.data(),
dilations, activation, relux_max_limit, DataTypeToEnum<T>::value,
&input_shape_, output, future, &kwg_size_);
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_DEPTHWISE_CONV2D_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_ELTWISE_H_
#define MACE_KERNELS_OPENCL_IMAGE_ELTWISE_H_
#include "mace/kernels/eltwise.h"
#include <memory>
#include <utility>
#include <vector>
#include <set>
#include <string>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class EltwiseKernel : public OpenCLEltwiseKernel {
public:
explicit EltwiseKernel(
const EltwiseType type,
const std::vector<float> &coeff,
const float scalar_input,
const int32_t scalar_input_index)
: type_(type),
coeff_(coeff),
scalar_input_(scalar_input),
scalar_input_index_(scalar_input_index) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input0,
const Tensor *input1,
Tensor *output,
StatsFuture *future) override;
private:
EltwiseType type_;
std::vector<float> coeff_;
float scalar_input_;
int32_t scalar_input_index_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus EltwiseKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input0,
const Tensor *input1,
Tensor *output,
StatsFuture *future) {
bool swapped = false;
if (input1 != nullptr) {
MACE_CHECK(input0->dim_size() == input1->dim_size() ||
input0->dim_size() == 1 || input1->dim_size() == 1)
<< "Inputs of Eltwise op must be same shape";
if (input0->size() != input1->size()) {
if (input0->size() < input1->size()) {
std::swap(input0, input1);
swapped = true;
}
if (input1->dim_size() == 1) {
MACE_CHECK(input0->dim(3) == input1->dim(0))
<< "Element-Wise op only support channel dimension broadcast";
} else {
MACE_CHECK((input0->dim(0) == input1->dim(0) || input1->dim(0) == 1) &&
input0->dim(3) == input1->dim(3) && input1->dim(1) == 1 &&
input1->dim(2) == 1)
<< "Element-Wise op only support channel dimension broadcast";
}
}
}
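// Illustrative examples of the broadcast rules above (shapes assumed; NHWC
// input0 of {1, 32, 32, 64}): input1 {64} (a rank-1 channel vector) and
// input1 {1, 1, 1, 64} are accepted; input1 {1, 32, 1, 64} is rejected,
// because only the channel dimension (with H = W = 1) may broadcast.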
if (scalar_input_index_ == 0) {
swapped = !swapped;
}
std::vector<index_t> output_shape(4);
output_shape[0] = input0->dim(0);
output_shape[1] = input0->dim(1);
output_shape[2] = input0->dim(2);
output_shape[3] = input0->dim(3);
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t batch_height_pixels = batch * height;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(batch_height_pixels)};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
built_options.emplace("-Deltwise=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
if (input1 == nullptr) {
built_options.emplace("-DINPUT_TYPE=1");
} else if (input0->size() != input1->size()) {
if (input1->dim(0) == 1 || input1->dim_size() == 1)
built_options.emplace("-DINPUT_TYPE=3");
else
built_options.emplace("-DINPUT_TYPE=2");
if (swapped) built_options.emplace("-DSWAPPED");
}
if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input0->opencl_image()));
if (input1 == nullptr) {
kernel_.setArg(idx++, scalar_input_);
} else {
kernel_.setArg(idx++, *(input1->opencl_image()));
}
kernel_.setArg(idx++, static_cast<int32_t>(height));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(channels));
if (!coeff_.empty()) {
kernel_.setArg(idx++, coeff_[0]);
kernel_.setArg(idx++, coeff_[1]);
}
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_ELTWISE_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_FULLY_CONNECTED_H_
#define MACE_KERNELS_OPENCL_IMAGE_FULLY_CONNECTED_H_
#include "mace/kernels/fully_connected.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class FullyConnectedKernel : public OpenCLFullyConnectedKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
std::vector<uint32_t> gws_;
std::vector<uint32_t> lws_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus FullyConnectedKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) {
std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const index_t batch = output->dim(0);
const index_t output_size = output->dim(3);
const index_t output_blocks = RoundUpDiv4(output_size);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
built_options.emplace("-Dfully_connected_width=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (bias != nullptr) {
built_options.emplace("-DBIAS");
}
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_QUALCOMM_ADRENO");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name,
built_options, &kernel_));
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
const uint32_t wave_size =
static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
gws_ = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]);
lws_ = {gws_[0], gws_[1], inter_local_blks};
} else {
gws_ = {4, 8, static_cast<uint32_t>(batch * output_blocks)};
const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]);
lws_ = {gws_[0], gws_[1], inter_local_blks};
}
}
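// Illustrative sizing (assuming kwg_size = 256 on a non-Adreno GPU):
// gws_ = {4, 8, batch * output_blocks} and inter_local_blks = 256 / (4 * 8) = 8,
// so lws_ = {4, 8, 8}; the local reduction scratch passed via setArg below is
// then 4 * 8 * 8 * sizeof(float) = 1024 bytes per work group.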
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
const index_t batch = output->dim(0);
const index_t output_blocks = RoundUpDiv4(output->dim(3));
gws_[2] = static_cast<uint32_t>(batch * output_blocks);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(weight->opencl_image()));
if (bias != nullptr) {
kernel_.setArg(idx++, *(bias->opencl_image()));
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, (lws_[0] * lws_[1] * lws_[2] * sizeof(float)),
nullptr);
kernel_.setArg(idx++, static_cast<int>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
kernel_.setArg(idx++, static_cast<int>(output_blocks));
kernel_.setArg(idx++, relux_max_limit);
input_shape_ = input->shape();
}
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws_[0], gws_[1], gws_[2]),
cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws_.size());
for (size_t i = 0; i < lws_.size(); ++i) {
roundup_gws[i] = RoundUp(gws_[i], lws_[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event);
}
MACE_OUT_OF_RANGE_VALIDATION;
MACE_CL_RET_STATUS(error);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_FULLY_CONNECTED_H_
@@ -12,24 +12,47 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_IMAGE_TO_BUFFER_H_
#define MACE_KERNELS_OPENCL_IMAGE_IMAGE_TO_BUFFER_H_
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/buffer_inverse_transform.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class ImageToBuffer : public OpenCLBufferInverseTransformKernel {
public:
MaceStatus Compute(OpKernelContext *context,
const Tensor *input,
const BufferType type,
const int wino_blk_size,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ImageToBuffer<T>::Compute(OpKernelContext *context,
const Tensor *input,
const BufferType type,
const int wino_blk_size,
Tensor *output,
StatsFuture *future) {
auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
std::vector<size_t> image_shape;
CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size);
MACE_RETURN_IF_ERROR(output->Resize(input->shape()));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])};
@@ -49,9 +72,9 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
break;
case WINOGRAD_FILTER: {
std::stringstream ss_tmp;
gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
ss_tmp << "winograd_filter_image_to_buffer_"
<< wino_blk_size << "x" << wino_blk_size;
kernel_name = ss_tmp.str();
break;
}
@@ -67,17 +90,18 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
break;
}
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (output->dtype() == input->dtype()) {
built_options.emplace(
"-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
@@ -94,25 +118,26 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
&kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(output->opencl_buffer()));
if (type == CONV2D_FILTER) {
const index_t
inner_size = output->dim(1) * output->dim(2) * output->dim(3);
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
} else if (type == ARGUMENT) {
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
} else if (type == WEIGHT_HEIGHT) {
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(1)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(3)));
} else {
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[1]));
@@ -121,8 +146,8 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[3]));
}
kernel_.setArg(idx++, *(input->opencl_image()));
input_shape_ = input->shape();
}
const uint32_t kwg_size =
@@ -146,7 +171,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
@@ -159,8 +184,9 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_IMAGE_TO_BUFFER_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_LSTM_CELL_H_
#define MACE_KERNELS_OPENCL_IMAGE_LSTM_CELL_H_
#include <memory>
#include <vector>
#include <set>
#include <string>
#include "mace/kernels/lstmcell.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class LSTMCellKernel : public OpenCLLSTMCellKernel {
public:
explicit LSTMCellKernel(
const T forget_bias)
: forget_bias_(forget_bias) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *pre_output,
const Tensor *weight,
const Tensor *bias,
const Tensor *pre_cell,
Tensor *cell,
Tensor *output,
StatsFuture *future) override;
private:
T forget_bias_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus LSTMCellKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *pre_output,
const Tensor *weight,
const Tensor *bias,
const Tensor *pre_cell,
Tensor *cell,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(pre_output->dim_size() == 2 && pre_output->dim(1) % 4 == 0,
"LSTM hidden units should be a multiple of 4");
const index_t height = input->dim(0);
const index_t width = input->dim(1);
const index_t hidden_units = pre_output->dim(1);
const index_t w_blocks = hidden_units >> 2;
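// Illustrative example (shapes assumed): input {16, 128} with pre_output
// {16, 256} gives height = 16, width = 128 and hidden_units = 256 (a multiple
// of 4, as checked above), so w_blocks = 64 and the 2-D global work size
// computed below becomes {64, 16}: one work item per 4 hidden units per row.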
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell");
built_options.emplace("-Dlstmcell=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("lstmcell", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[2] = {static_cast<uint32_t>(w_blocks),
static_cast<uint32_t>(height)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
std::vector<index_t> output_shape_padded = {height, 1, 1, hidden_units};
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape_padded, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(pre_output->shape(),
output_image_shape));
MACE_RETURN_IF_ERROR(cell->ResizeImage(pre_cell->shape(),
output_image_shape));
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(pre_output->opencl_image()));
kernel_.setArg(idx++, *(weight->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(pre_cell->opencl_image()));
kernel_.setArg(idx++, static_cast<float>(forget_bias_));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(hidden_units));
kernel_.setArg(idx++, static_cast<int32_t>(RoundUpDiv4(width)));
kernel_.setArg(idx++, *(cell->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_LSTM_CELL_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_MATMUL_H_
#define MACE_KERNELS_OPENCL_IMAGE_MATMUL_H_
#include "mace/kernels/matmul.h"
#include <functional>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class MatMulKernel : public OpenCLMatMulKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *A,
const Tensor *B,
Tensor *C,
bool transpose_a,
bool transpose_b,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
};
template <typename T>
MaceStatus MatMulKernel<T>::Compute(
OpKernelContext *context,
const Tensor *A,
const Tensor *B,
Tensor *C,
bool transpose_a,
bool transpose_b,
StatsFuture *future) {
MACE_UNUSED(future);
MACE_CHECK(!transpose_a && !transpose_b,
"GPU does not support transpose matmul");
index_t rank = A->dim_size();
index_t height = A->dim(rank - 2);
index_t K = A->dim(rank - 1);
index_t width = B->dim(rank - 1);
index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
std::multiplies<index_t>());
std::vector<index_t> c_shape = A->shape();
c_shape[rank - 2] = height;
c_shape[rank - 1] = width;
std::vector<size_t> c_image_shape;
std::vector<index_t> padded_c_shape = {batch, height, width, 1};
CalImage2DShape(padded_c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape);
MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape));
const index_t height_blocks = RoundUpDiv4(height);
const index_t width_blocks = RoundUpDiv4(width);
const uint32_t gws[2] = {
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height_blocks * batch),
};
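// Illustrative example (shapes assumed): A {2, 64, 128} x B {2, 128, 96}
// gives batch = 2, height = 64, K = 128, width = 96, so height_blocks = 16,
// width_blocks = 24 and gws = {24, 16 * 2} = {24, 32}; each work item then
// covers a 4x4 tile of C.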
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
built_options.emplace("-Dmatmul=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(A->opencl_image()));
kernel_.setArg(idx++, *(B->opencl_image()));
kernel_.setArg(idx++, *(C->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(width));
kernel_.setArg(idx++, static_cast<int>(K));
kernel_.setArg(idx++, static_cast<int>(height_blocks));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(K)));
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_MATMUL_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_PAD_H_
#define MACE_KERNELS_OPENCL_IMAGE_PAD_H_
#include "mace/kernels/pad.h"
#include <memory>
#include <vector>
#include <set>
#include <string>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class PadKernel : public OpenCLPadKernel {
public:
PadKernel(const std::vector<int> &paddings,
const float constant_value)
: paddings_(paddings), constant_value_(constant_value) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) override;
private:
std::vector<int> paddings_;
float constant_value_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus PadKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(this->paddings_.size() ==
static_cast<size_t>((input->dim_size() * 2)));
MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) &&
(this->paddings_[6] == 0) && (this->paddings_[7] == 0))
<< "Mace only support height/width dimension now";
auto input_shape = input->shape();
std::vector<index_t> output_shape = {
input_shape[0] + this->paddings_[0] + this->paddings_[1],
input_shape[1] + this->paddings_[2] + this->paddings_[3],
input_shape[2] + this->paddings_[4] + this->paddings_[5],
input_shape[3] + this->paddings_[6] + this->paddings_[7]};
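// Illustrative example: paddings_ is laid out as
// {N_before, N_after, H_before, H_after, W_before, W_after, C_before, C_after},
// so input {1, 32, 32, 3} with paddings_ {0, 0, 1, 1, 2, 2, 0, 0} yields
// output_shape {1, 34, 36, 3}; the checks above force the batch and channel
// paddings to zero.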
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad");
built_options.emplace("-Dpad=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, this->constant_value_);
kernel_.setArg(idx++, static_cast<int32_t>(input_shape[1]));
kernel_.setArg(idx++, static_cast<int32_t>(input_shape[2]));
kernel_.setArg(idx++, static_cast<int32_t>(output_shape[1]));
kernel_.setArg(idx++, this->paddings_[2]);
kernel_.setArg(idx++, this->paddings_[4]);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_PAD_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_POOLING_H_
#define MACE_KERNELS_OPENCL_IMAGE_POOLING_H_
#include "mace/kernels/pooling.h"
#include <algorithm>
#include <memory>
#include <vector>
#include <set>
#include <string>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
namespace pooling {
inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t
cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] =
std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);
const uint32_t lws_size = lws[1] * lws[2];
lws[0] = gws[0] / 4;
if (lws[0] == 0) {
lws[0] = gws[0];
}
lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws_size),
1);
}
return lws;
}
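// Illustrative trace of LocalWS (assuming kwg_size = 256 and base = 8): for
// gws = {16, 56, 56}, lws[1] = min(56, 256) = 56,
// lws[2] = min(min(56, 8), 256 / 56) = 4, and
// lws[0] = max(min(16 / 4, 256 / (56 * 4)), 1) = 1, giving lws = {1, 56, 4}
// with 224 work items per group.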
} // namespace pooling
template <typename T>
class PoolingKernel : public OpenCLPoolingKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus PoolingKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
kernels[0], kernels[1]};
std::vector<int> paddings(2);
if (padding_data.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::CEIL,
output_shape.data());
}
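// Illustrative example (shapes assumed): input {1, 112, 112, 64} with 3x3
// kernels, stride 2 and SAME padding gives output_shape {1, 56, 56, 64}; the
// filter_shape above only mimics a conv2d filter (out_channels = in_channels)
// so the shared padding/output-size helpers can be reused.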
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
if (pooling_type == MAX && input->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
}
if (pooling_type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel_.setArg(idx++, paddings[0] / 2);
kernel_.setArg(idx++, paddings[1] / 2);
kernel_.setArg(idx++, strides[0]);
kernel_.setArg(idx++, strides[1]);
kernel_.setArg(idx++, kernels[0]);
kernel_.setArg(idx++, kernels[1]);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = pooling::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_POOLING_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_REDUCE_MEAN_H_
#define MACE_KERNELS_OPENCL_IMAGE_REDUCE_MEAN_H_
#include "mace/kernels/reduce_mean.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class ReduceMeanKernel : public OpenCLReduceMeanKernel {
public:
ReduceMeanKernel(const std::vector<int> axis,
const bool keep_dims)
: axis_(axis), keep_dims_(keep_dims) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) override;
private:
const std::vector<int> axis_;
bool keep_dims_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ReduceMeanKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input);
// MACE_CHECK(keep_dims_, "reduce mean gpu only support keep dims.");
MACE_CHECK(input->dim_size() == 4,
"reduce mean gpu only support 4-dim input");
MACE_CHECK(axis_.size() == 2 && axis_[0] == 1 && axis_[1] == 2,
"reduce mean gpu only support 1,2-axis reduce");
index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t image_size = static_cast<uint32_t>(in_height * in_width);
std::vector<uint32_t> gws(3);
std::vector<uint32_t> lws(3);
std::vector<index_t> output_shape{batch, 1, 1, channels};
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean");
built_options.emplace("-Dreduce_mean=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_QUALCOMM_ADRENO");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce_mean",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
const uint32_t wave_size =
static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
} else {
gws = {4, 16, static_cast<uint32_t>(batch * channel_blocks)};
}
lws = {gws[0], gws[1], 1};
const int group_size = lws[0] * lws[1] * lws[2];
const int partial_len = (image_size + group_size - 1) / group_size;
const int remain_index = image_size % group_size;
const float in_width_reciprocal = 1.f / in_width;
const float img_size_reciprocal = 1.f / (in_width * in_height);
const float channel_blk_reciprocal = 1.f / channel_blocks;
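// Illustrative sizing (non-Adreno path, values assumed): lws = {4, 16, 1}
// gives group_size = 64; for a 7x7 feature map, image_size = 49, so
// partial_len = (49 + 63) / 64 = 1 and remain_index = 49: each of the first
// 49 work items reduces one pixel and the rest contribute nothing before the
// results are combined within the group.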
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, (group_size * 4 * sizeof(T)),
nullptr);
kernel_.setArg(idx++, static_cast<int32_t>(group_size));
kernel_.setArg(idx++, static_cast<int32_t>(partial_len));
kernel_.setArg(idx++, static_cast<int32_t>(remain_index));
kernel_.setArg(idx++, static_cast<int32_t>(batch));
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, img_size_reciprocal);
kernel_.setArg(idx++, in_width_reciprocal);
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
kernel_.setArg(idx++, channel_blk_reciprocal);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_REDUCE_MEAN_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_RESIZE_BICUBIC_H_
#define MACE_KERNELS_OPENCL_IMAGE_RESIZE_BICUBIC_H_
#include "mace/kernels/resize_bicubic.h"
#include <algorithm>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
namespace resize_bicubic {
inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) {
lws[0] = std::min<uint32_t>(gws[0], base);
} else {
lws[0] = gws[0] / 8;
if (lws[0] == 0) {
lws[0] = gws[0];
}
}
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = gws[2] / 8;
if (lws[2] == 0) {
lws[2] = gws[2];
}
lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
1);
return lws;
}
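// Illustrative trace of LocalWS (assuming kwg_size = 256 and base = 8): for
// gws = {16, 224, 224}, lws[1] = min(224, 256) = 224 >= base, so
// lws[0] = min(min(16, 8), 256 / 224) = 1 and
// lws[2] = max(min(224 / 8, 256 / 224), 1) = 1, giving lws = {1, 224, 1}.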
} // namespace resize_bicubic
template <typename T>
class ResizeBicubicKernel : public OpenCLResizeBicubicKernel {
public:
ResizeBicubicKernel(bool align_corners,
const index_t out_height,
const index_t out_width)
: align_corners_(align_corners),
out_height_(out_height),
out_width_(out_width) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) override;
private:
bool align_corners_;
index_t out_height_;
index_t out_width_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ResizeBicubicKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t out_height = out_height_;
const index_t out_width = out_width_;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
auto dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache");
built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DTABLE_SIZE=", kTableSize));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("resize_bicubic",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> output_shape{batch, out_height, out_width, channels};
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_);
float width_scale =
CalculateResizeScale(in_width, out_width, align_corners_);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, height_scale);
kernel_.setArg(idx++, width_scale);
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, static_cast<int32_t>(out_height));
input_shape_ = input->shape();
}
const std::vector<uint32_t>
lws = resize_bicubic::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_RESIZE_BICUBIC_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_RESIZE_BILINEAR_H_
#define MACE_KERNELS_OPENCL_IMAGE_RESIZE_BILINEAR_H_
#include "mace/kernels/resize_bilinear.h"
#include <algorithm>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
namespace resize_bilinear {
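// Local work-group size heuristic: `base` scales with the device's global
// memory cache size, and each dimension is clamped so the product of
// lws[0..2] never exceeds the kernel's maximum work-group size.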
inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
    uint64_t cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) {
lws[0] = std::min<uint32_t>(gws[0], base);
} else {
lws[0] = gws[0] / 8;
if (lws[0] == 0) {
lws[0] = gws[0];
}
}
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = gws[2] / 8;
if (lws[2] == 0) {
lws[2] = gws[2];
}
lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
1);
}
return lws;
}
} // namespace resize_bilinear
template <typename T>
class ResizeBilinearKernel : public OpenCLResizeBilinearKernel {
public:
ResizeBilinearKernel(bool align_corners,
const index_t out_height,
const index_t out_width)
: align_corners_(align_corners),
out_height_(out_height),
out_width_(out_width) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) override;
private:
bool align_corners_;
index_t out_height_;
index_t out_width_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ResizeBilinearKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t out_height = out_height_;
const index_t out_width = out_width_;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("resize_bilinear",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> output_shape{batch, out_height, out_width, channels};
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_);
float width_scale =
CalculateResizeScale(in_width, out_width, align_corners_);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, height_scale);
kernel_.setArg(idx++, width_scale);
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, static_cast<int32_t>(out_height));
input_shape_ = input->shape();
}
const std::vector<uint32_t>
lws = resize_bilinear::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_RESIZE_BILINEAR_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_SOFTMAX_H_
#define MACE_KERNELS_OPENCL_IMAGE_SOFTMAX_H_
#include "mace/kernels/softmax.h"
#include <algorithm>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
namespace softmax {
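// Local work-group heuristic for softmax: as in the resize kernels, a
// cache-size-dependent base factor bounds lws[0], and the remaining
// dimensions are fitted to the kernel's maximum work-group size.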
inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
    uint64_t cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (gws[0] < base) {
lws[0] = gws[0];
} else {
lws[0] = gws[0] / base;
}
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
lws[2] = std::max<uint32_t>(std::min<uint32_t>(
gws[2], kwg_size / (lws[0] * lws[1])), 1);
}
return lws;
}
} // namespace softmax
template <typename T>
class SoftmaxKernel : public OpenCLSoftmaxKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *logits,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus SoftmaxKernel<T>::Compute(
OpKernelContext *context,
const Tensor *logits,
Tensor *output,
StatsFuture *future) {
index_t batch = 0;
index_t height = 0;
index_t width = 0;
index_t channels = 0;
if (logits->dim_size() == 2) {
batch = logits->dim(0);
height = 1;
width = 1;
channels = logits->dim(1);
} else if (logits->dim_size() == 4) {
batch = logits->dim(0);
height = logits->dim(1);
width = logits->dim(2);
channels = logits->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
}
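  // Channels are packed four per image texel, so the kernel iterates over
  // channel blocks; remain_channels counts the padded lanes in the last block.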
const index_t channel_blocks = RoundUpDiv4(channels);
const int remain_channels = channel_blocks * 4 - channels;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(logits->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = logits->shape();
}
std::vector<uint32_t> lws = softmax::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_SOFTMAX_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_BATCH_H_
#define MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_BATCH_H_
#include "mace/kernels/space_to_batch.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *space_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *batch_tensor,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus SpaceToBatchKernel<T>::Compute(
OpKernelContext *context,
const Tensor *space_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *batch_tensor,
StatsFuture *future) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
batch_tensor->ResizeImage(output_shape, output_image_shape));
const char *kernel_name = "space_to_batch";
const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
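  // Global work size over the output: [channel blocks, width, batch * height].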
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, space_tensor->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape[0]);
kernel_.setArg(idx++, block_shape[1]);
kernel_.setArg(idx++, paddings[0]);
kernel_.setArg(idx++, paddings[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
input_shape_ = space_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_BATCH_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_DEPTH_H_
#define MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_DEPTH_H_
#include "mace/kernels/space_to_depth.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel {
public:
explicit SpaceToDepthKernel(const int block_size)
: block_size_(block_size) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) override;
private:
const int block_size_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus SpaceToDepthKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t input_depth = input->dim(3);
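  // The OpenCL image layout packs four channels per texel and this kernel
  // does not handle partial blocks, hence the divisibility checks below.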
  MACE_CHECK((input_depth % 4) == 0,
             "input channel should be divisible by 4");
  MACE_CHECK(
      (input_width % block_size_ == 0) && (input_height % block_size_ == 0),
      "input width and height should be divisible by block_size");
const index_t output_height = input_height / block_size_;
const index_t output_width = input_width / block_size_;
const index_t output_depth = input_depth * block_size_ * block_size_;
const index_t input_depth_blocks = RoundUpDiv4(input_depth);
const index_t output_depth_blocks = RoundUpDiv4(output_depth);
std::vector<index_t> output_shape = {batch, output_height, output_width,
output_depth};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
const char *kernel_name = "space_to_depth";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_depth",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(input_depth_blocks),
static_cast<uint32_t>(input_width),
static_cast<uint32_t>(input_height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
kernel_.setArg(idx++, static_cast<int32_t>(input_width));
kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
kernel_.setArg(idx++, static_cast<int32_t>(output_height * batch));
kernel_.setArg(idx++, static_cast<int32_t>(output_width));
kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0),
input->dim(1), input->dim(2), input->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_DEPTH_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_SPLIT_H_
#define MACE_KERNELS_OPENCL_IMAGE_SPLIT_H_
#include "mace/kernels/split.h"
#include <algorithm>
#include <memory>
#include <vector>
#include <set>
#include <string>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class SplitKernel : public OpenCLSplitKernel {
public:
explicit SplitKernel(const int32_t axis) : axis_(axis) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const std::vector<Tensor *> &output_list,
StatsFuture *future) override;
private:
int32_t axis_;
cl::Kernel kernel_;
uint32_t kwg_size_;
};
template <typename T>
MaceStatus SplitKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const std::vector<Tensor *> &output_list,
StatsFuture *future) {
const index_t input_channels = input->dim(3);
const size_t outputs_count = output_list.size();
const index_t output_channels = input_channels / outputs_count;
MACE_CHECK(output_channels % 4 == 0)
<< "output channels of split op must be divisible by 4";
std::vector<index_t> output_shape(
{input->dim(0), input->dim(1), input->dim(2), output_channels});
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
for (size_t i = 0; i < outputs_count; ++i) {
MACE_RETURN_IF_ERROR(
output_list[i]->ResizeImage(output_shape, image_shape));
}
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split");
built_options.emplace("-Dsplit=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("split",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const index_t channel_blk = RoundUpDiv4(output_channels);
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(input->dim(2)),
static_cast<uint32_t>(input->dim(0) * input->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(kernel_);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
CallStats call_stats{INT64_MAX, 0};
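  // Enqueue the kernel once per output tensor; when profiling, keep the
  // earliest start time and accumulate the execution time of each launch.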
for (size_t i = 0; i < outputs_count; ++i) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
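      // Devices without non-uniform work-group support need the global size
      // rounded up to a multiple of the local size.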
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t j = 0; j < 3; ++j) {
roundup_gws[j] = RoundUp(gws[j], lws[j]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (future != nullptr && runtime->is_profiling_enabled()) {
event.wait();
CallStats tmp_stats;
runtime->GetCallStats(event, &tmp_stats);
call_stats.start_micros =
std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
}
}
if (future != nullptr) {
future->wait_fn = [runtime, call_stats](CallStats *stats) {
if (stats != nullptr) {
stats->start_micros = call_stats.start_micros;
stats->end_micros = stats->start_micros + call_stats.end_micros;
}
};
}
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_SPLIT_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_
#define MACE_KERNELS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_
#include "mace/kernels/winograd_transform.h"
#include <memory>
#include <vector>
#include <set>
#include <string>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class WinogradTransformKernel : public OpenCLWinogradTransformKernel {
public:
WinogradTransformKernel(
const Padding &padding_type,
const std::vector<int> &paddings,
const int block_size)
: strides_({1, 1}),
dilations_({1, 1}),
padding_type_(padding_type),
paddings_(paddings),
wino_blk_size_(block_size) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input_tensor,
Tensor *output_tensor,
StatsFuture *future) override;
private:
const std::vector<int> strides_; // [stride_h, stride_w]
const std::vector<int> dilations_; // [dilation_h, dilation_w]
Padding padding_type_;
std::vector<int> paddings_;
const int wino_blk_size_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus WinogradTransformKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input_tensor,
Tensor *output_tensor,
StatsFuture *future) {
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
if (wino_blk_size_ == 4) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4");
built_options.emplace("-Dwinograd_transform_4x4="
+ obfuscated_kernel_name);
} else if (wino_blk_size_ == 2) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
built_options.emplace("-Dwinograd_transform_2x2="
+ obfuscated_kernel_name);
} else {
MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd.");
return MACE_SUCCESS;
}
built_options.emplace("-DDATA_TYPE=" +
DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {1, input_tensor->dim(3), 3, 3};
std::vector<int> paddings(2);
if (paddings_.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), dilations_.data(),
strides_.data(), padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input_tensor->shape().data(), filter_shape.data(),
paddings_.data(), dilations_.data(), strides_.data(),
RoundType::FLOOR, output_shape.data());
}
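  // Number of wino_blk_size x wino_blk_size output tiles along each axis.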
const index_t round_h =
(output_shape[1] + wino_blk_size_ - 1) / wino_blk_size_;
const index_t round_w =
(output_shape[2] + wino_blk_size_ - 1) / wino_blk_size_;
const index_t out_width = input_tensor->dim(0) * round_h * round_w;
const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
const float round_w_r = 1.f / static_cast<float>(round_w);
const index_t blk_sqr = (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
const uint32_t gws[2] = {
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))
};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input_tensor->shape())) {
output_shape = {blk_sqr, input_tensor->dim(3), out_width};
std::vector<index_t> padded_output_shape = {
output_shape[0], output_shape[1], output_shape[2], 1
};
std::vector<size_t> image_shape;
CalImage2DShape(padded_output_shape,
BufferType::IN_OUT_HEIGHT,
&image_shape);
// remove unused last dimension
MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input_tensor->opencl_image()));
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel_.setArg(idx++, round_hw_r);
kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, round_w_r);
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
input_shape_ = input_tensor->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key = Concat("winograd_transform_kernel",
output_tensor->dim(0),
output_tensor->dim(1),
output_tensor->dim(2));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
template <typename T>
class WinogradInverseTransformKernel
: public OpenCLWinogradInverseTransformKernel {
public:
WinogradInverseTransformKernel(
const ActivationType activation,
const float relux_max_limit,
const int block_size)
: wino_blk_size_(block_size),
activation_(activation),
relux_max_limit_(relux_max_limit) {}
MaceStatus Compute(
OpKernelContext *context,
const std::vector<const Tensor*> &inputs,
Tensor *output_tensor,
StatsFuture *future) override;
private:
const int wino_blk_size_;
const ActivationType activation_;
const float relux_max_limit_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus WinogradInverseTransformKernel<T>::Compute(
OpKernelContext *context,
const std::vector<const Tensor*> &inputs,
Tensor *output_tensor,
StatsFuture *future) {
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
const Tensor *input_tensor = inputs[0];
const Tensor *bias = inputs.size() == 3 ? inputs[2] : nullptr;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
if (wino_blk_size_ == 4) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4");
built_options.emplace("-Dwinograd_inverse_transform_4x4="
+ obfuscated_kernel_name);
} else if (wino_blk_size_ == 2) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
built_options.emplace("-Dwinograd_inverse_transform_2x2="
+ obfuscated_kernel_name);
} else {
MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd.");
return MACE_SUCCESS;
}
built_options.emplace("-DDATA_TYPE=" +
DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation_) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case PRELU:
built_options.emplace("-DUSE_PRELU");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
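  // The output spatial shape is passed as a second input tensor; map it to
  // host memory to read batch, height, and width.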
Tensor::MappingGuard output_shape_guard(inputs[1]);
const int32_t *output_shape_data = inputs[1]->data<int32_t>();
const index_t batch = output_shape_data[0];
const index_t height = output_shape_data[1];
const index_t width = output_shape_data[2];
const uint32_t gws[2] = {
static_cast<uint32_t>(input_tensor->dim(2)),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input_tensor->shape())) {
std::vector<index_t> output_shape = {batch, height, width,
input_tensor->dim(1)};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
const index_t round_h = (height + wino_blk_size_ - 1) / wino_blk_size_;
const index_t round_w = (width + wino_blk_size_ - 1) / wino_blk_size_;
const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
const float round_w_r = 1.f / static_cast<float>(round_w);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(
idx++,
*(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
if (bias != nullptr) {
kernel_.setArg(idx++,
*(static_cast<const cl::Image2D *>(bias->opencl_image())));
}
kernel_.setArg(
idx++, *(static_cast<cl::Image2D *>(output_tensor->opencl_image())));
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel_.setArg(idx++, round_hw_r);
kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, round_w_r);
kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input_tensor->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key =
Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
output_tensor->dim(1), output_tensor->dim(2),
output_tensor->dim(3), input_tensor->dim(2));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_
...
@@ -13,14 +13,23 @@
 // limitations under the License.
 #include "mace/kernels/lstmcell.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/lstm_cell.h"
 namespace mace {
 namespace kernels {
+template <typename T>
+LSTMCellFunctor<DeviceType::GPU, T>::LSTMCellFunctor(
+    OpKernelContext *context,
+    T forget_bias)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::LSTMCellKernel<T>(forget_bias));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input,
@@ -31,76 +40,11 @@ MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()(
     Tensor *cell,
     Tensor *output,
     StatsFuture *future) {
-  MACE_CHECK(pre_output->dim_size() == 2 && pre_output->dim(1) % 4 == 0,
-             "LSTM hidden units should be a multiple of 4");
-  const index_t height = input->dim(0);
-  const index_t width = input->dim(1);
-  const index_t hidden_units = pre_output->dim(1);
-  const index_t w_blocks = hidden_units >> 2;
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    auto dt = DataTypeToEnum<T>::value;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell");
-    built_options.emplace("-Dlstmcell=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("lstmcell", kernel_name,
-                                              built_options, &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  const uint32_t gws[2] = {static_cast<uint32_t>(w_blocks),
-                           static_cast<uint32_t>(height)};
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    std::vector<index_t> output_shape_padded = {height, 1, 1, hidden_units};
-    std::vector<size_t> output_image_shape;
-    CalImage2DShape(output_shape_padded, BufferType::IN_OUT_CHANNEL,
-                    &output_image_shape);
-    MACE_RETURN_IF_ERROR(output->ResizeImage(pre_output->shape(),
-                                             output_image_shape));
-    MACE_RETURN_IF_ERROR(cell->ResizeImage(pre_cell->shape(),
-                                           output_image_shape));
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_2D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, *(pre_output->opencl_image()));
-    kernel_.setArg(idx++, *(weight->opencl_image()));
-    kernel_.setArg(idx++, *(bias->opencl_image()));
-    kernel_.setArg(idx++, *(pre_cell->opencl_image()));
-    kernel_.setArg(idx++, static_cast<float>(forget_bias_));
-    kernel_.setArg(idx++, static_cast<int32_t>(width));
-    kernel_.setArg(idx++, static_cast<int32_t>(hidden_units));
-    kernel_.setArg(idx++, static_cast<int32_t>(RoundUpDiv4(width)));
-    kernel_.setArg(idx++, *(cell->opencl_image()));
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    input_shape_ = input->shape();
-  }
-  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
-  std::string tuning_key =
-      Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input, pre_output, weight, bias,
+                          pre_cell, cell, output, future);
 }
 template struct LSTMCellFunctor<DeviceType::GPU, float>;
 template struct LSTMCellFunctor<DeviceType::GPU, half>;
 }  // namespace kernels
...
...
@@ -13,13 +13,21 @@
 // limitations under the License.
 #include "mace/kernels/matmul.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
+#include "mace/kernels/opencl/image/matmul.h"
 namespace mace {
 namespace kernels {
+template <typename T>
+MatMulFunctor<DeviceType::GPU, T>::MatMulFunctor(OpKernelContext *context)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::MatMulKernel<T>);
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
                                                          const Tensor *B,
@@ -27,68 +35,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
                                                          bool transpose_a,
                                                          bool transpose_b,
                                                          StatsFuture *future) {
-  MACE_UNUSED(future);
-  MACE_CHECK(!transpose_a && !transpose_b,
-             "GPU does not support transpose matmul");
-  index_t rank = A->dim_size();
-  index_t height = A->dim(rank - 2);
-  index_t K = A->dim(rank - 1);
-  index_t width = B->dim(rank - 1);
-  index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
-                                  std::multiplies<index_t>());
-  std::vector<index_t> c_shape = A->shape();
-  c_shape[rank - 2] = height;
-  c_shape[rank - 1] = width;
-  std::vector<size_t> c_image_shape;
-  std::vector<index_t> padded_c_shape = {batch, height, width, 1};
-  CalImage2DShape(padded_c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape);
-  MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape));
-  const index_t height_blocks = RoundUpDiv4(height);
-  const index_t width_blocks = RoundUpDiv4(width);
-  const uint32_t gws[2] = {
-      static_cast<uint32_t>(width_blocks),
-      static_cast<uint32_t>(height_blocks * batch),
-  };
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    auto dt = DataTypeToEnum<T>::value;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
-    built_options.emplace("-Dmatmul=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
-                                              built_options, &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  uint32_t idx = 0;
-  OUT_OF_RANGE_SET_ARG;
-  SET_2D_GWS_ARGS(kernel_);
-  kernel_.setArg(idx++, *(A->opencl_image()));
-  kernel_.setArg(idx++, *(B->opencl_image()));
-  kernel_.setArg(idx++, *(C->opencl_image()));
-  kernel_.setArg(idx++, static_cast<int>(height));
-  kernel_.setArg(idx++, static_cast<int>(width));
-  kernel_.setArg(idx++, static_cast<int>(K));
-  kernel_.setArg(idx++, static_cast<int>(height_blocks));
-  kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(K)));
-  const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
-  std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, A, B, C, transpose_a, transpose_b, future);
 }
 template struct MatMulFunctor<DeviceType::GPU, float>;
...
...
@@ -31,7 +31,7 @@ bool BufferToImageOpImpl(OpKernelContext *context,
                         Tensor *buffer,
                         Tensor *image,
                         const std::vector<size_t> &image_shape) {
-  std::unique_ptr<BufferBase> kernel_error;
+  std::unique_ptr<BufferBase> oorc_flag;
   uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
                      static_cast<uint32_t>(image_shape[1])};
@@ -43,8 +43,8 @@ bool BufferToImageOpImpl(OpKernelContext *context,
   std::stringstream kernel_name_ss;
   kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
   built_options.emplace(kernel_name_ss.str());
-  OUT_OF_RANGE_CONFIG(kernel_error, context);
-  NON_UNIFORM_WG_CONFIG;
+  MACE_OUT_OF_RANGE_CONFIG;
+  MACE_NON_UNIFORM_WG_CONFIG;
   if (buffer->dtype() == image->dtype()) {
     built_options.emplace("-DDATA_TYPE=" +
                           DtToCLDt(DataTypeToEnum<float>::value));
@@ -67,12 +67,13 @@ bool BufferToImageOpImpl(OpKernelContext *context,
     return false;
   }
+  MACE_OUT_OF_RANGE_INIT(kernel);
   uint32_t idx = 0;
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel.setArg(idx++,
-                  *(static_cast<cl::Buffer *>(kernel_error->buffer())));
+                  *(static_cast<cl::Buffer *>(oorc_flag->buffer())));
   }
-  SET_2D_GWS_ARGS(kernel);
+  MACE_SET_2D_GWS_ARGS(kernel, gws);
   kernel.setArg(idx++, *(buffer->opencl_buffer()));
   MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
              "buffer offset not aligned");
@@ -110,9 +111,9 @@ bool BufferToImageOpImpl(OpKernelContext *context,
   runtime->command_queue().finish();
   bool is_out_of_range = false;
   if (runtime->IsOutOfRangeCheckEnabled()) {
-    kernel_error->Map(nullptr);
-    is_out_of_range = *(kernel_error->mutable_data<char>()) == 1 ? true : false;
-    kernel_error->UnMap();
+    oorc_flag->Map(nullptr);
+    is_out_of_range = *(oorc_flag->mutable_data<char>()) == 1 ? true : false;
+    oorc_flag->UnMap();
   }
   return is_out_of_range;
 }
...
...
@@ -13,86 +13,29 @@
 // limitations under the License.
 #include "mace/kernels/pad.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
+#include "mace/kernels/opencl/image/pad.h"
 namespace mace {
 namespace kernels {
+template <typename T>
+PadFunctor<DeviceType::GPU, T>::PadFunctor(
+    OpKernelContext *context,
+    const std::vector<int> &paddings,
+    const float constant_value)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::PadKernel<T>(paddings, constant_value));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                       Tensor *output,
                                                       StatsFuture *future) {
-  MACE_CHECK(this->paddings_.size() ==
-             static_cast<size_t>((input->dim_size() * 2)));
-  MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) &&
-             (this->paddings_[6] == 0) && (this->paddings_[7] == 0))
-      << "Mace only support height/width dimension now";
-  auto input_shape = input->shape();
-  std::vector<index_t> output_shape = {
-      input_shape[0] + this->paddings_[0] + this->paddings_[1],
-      input_shape[1] + this->paddings_[2] + this->paddings_[3],
-      input_shape[2] + this->paddings_[4] + this->paddings_[5],
-      input_shape[3] + this->paddings_[6] + this->paddings_[7]};
-  std::vector<size_t> image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
-  const index_t batch = output->dim(0);
-  const index_t height = output->dim(1);
-  const index_t width = output->dim(2);
-  const index_t channels = output->dim(3);
-  const index_t channel_blocks = RoundUpDiv4(channels);
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad");
-    built_options.emplace("-Dpad=" + kernel_name);
-    auto dt = DataTypeToEnum<T>::value;
-    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name,
-                                              built_options, &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    int idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, this->constant_value_);
-    kernel_.setArg(idx++, static_cast<int32_t>(input_shape[1]));
-    kernel_.setArg(idx++, static_cast<int32_t>(input_shape[2]));
-    kernel_.setArg(idx++, static_cast<int32_t>(output_shape[1]));
-    kernel_.setArg(idx++, this->paddings_[2]);
-    kernel_.setArg(idx++, this->paddings_[4]);
-    input_shape_ = input->shape();
-  }
-  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
-                                  output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input, output, future);
 }
 template struct PadFunctor<DeviceType::GPU, float>;
...
...
@@ -13,153 +13,45 @@
 // limitations under the License.
 #include "mace/kernels/pooling.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
+#include "mace/kernels/opencl/buffer/pooling.h"
 #include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
+#include "mace/kernels/opencl/image/pooling.h"
 namespace mace {
 namespace kernels {
-namespace {
-std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
-                              const uint32_t *gws,
-                              const uint32_t kwg_size) {
-  std::vector<uint32_t> lws(4, 0);
-  if (kwg_size == 0) {
-    lws[0] = lws[1] = lws[2] = 1;
-  } else {
-    uint64_t
-        cache_size = runtime->device_global_mem_cache_size();
-    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-    lws[2] =
-        std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);
-    const uint32_t lws_size = lws[1] * lws[2];
-    lws[0] = gws[0] / 4;
-    if (lws[0] == 0) {
-      lws[0] = gws[0];
-    }
-    lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws_size),
-                                1);
-  }
-  return lws;
-}
-}  // namespace
+template <typename T>
+PoolingFunctor<DeviceType::GPU, T>::PoolingFunctor(
+    OpKernelContext *context,
+    const PoolingType pooling_type,
+    const int *kernels,
+    const int *strides,
+    const Padding padding_type,
+    const std::vector<int> &paddings,
+    const int *dilations)
+    : PoolingFunctorBase(context,
+                         pooling_type,
+                         kernels,
+                         strides,
+                         padding_type,
+                         paddings,
+                         dilations) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::PoolingKernel<T>);
+  } else {
+    kernel_.reset(new opencl::buffer::PoolingKernel<T>);
+  }
+}
 template <typename T>
-MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
-                                                          Tensor *output,
-                                                          StatsFuture *future) {
-  MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
-      << "Pooling opencl kernel not support dilation yet";
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    const DataType dt = DataTypeToEnum<T>::value;
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
-    built_options.emplace("-Dpooling=" + kernel_name);
-    if (pooling_type_ == MAX && input->dtype() == output->dtype()) {
-      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
-      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
-      built_options.emplace(dt == DT_HALF ? "-DFP16" : "");
-    } else {
-      built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-      built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    }
-    if (pooling_type_ == AVG) {
-      built_options.emplace("-DPOOL_AVG");
-    }
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling",
-                                              kernel_name,
-                                              built_options,
-                                              &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  std::vector<uint32_t> gws;
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    std::vector<index_t> output_shape(4);
-    std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
-                                         kernels_[0], kernels_[1]};
-    std::vector<int> paddings(2);
-    if (paddings_.empty()) {
-      kernels::CalcNHWCPaddingAndOutputSize(
-          input->shape().data(), filter_shape.data(), dilations_, strides_,
-          padding_type_, output_shape.data(), paddings.data());
-    } else {
-      paddings = paddings_;
-      CalcOutputSize(input->shape().data(), filter_shape.data(),
-                     paddings_.data(), dilations_, strides_, RoundType::CEIL,
-                     output_shape.data());
-    }
-    std::vector<size_t> output_image_shape;
-    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                    &output_image_shape);
-    MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
-    index_t batch = output->dim(0);
-    index_t out_height = output->dim(1);
-    index_t out_width = output->dim(2);
-    index_t channels = output->dim(3);
-    index_t channel_blocks = (channels + 3) / 4;
-    gws = {
-        static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
-        static_cast<uint32_t>(batch * out_height),
-    };
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
-    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
-    kernel_.setArg(idx++, static_cast<int32_t>(output->dim(1)));
-    kernel_.setArg(idx++, paddings[0] / 2);
-    kernel_.setArg(idx++, paddings[1] / 2);
-    kernel_.setArg(idx++, strides_[0]);
-    kernel_.setArg(idx++, strides_[1]);
-    kernel_.setArg(idx++, kernels_[0]);
-    kernel_.setArg(idx++, kernels_[1]);
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    input_shape_ = input->shape();
-  } else {
-    index_t batch = output->dim(0);
-    index_t out_height = output->dim(1);
-    index_t out_width = output->dim(2);
-    index_t channels = output->dim(3);
-    index_t channel_blocks = (channels + 3) / 4;
-    gws = {
-        static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
-        static_cast<uint32_t>(batch * out_height),
-    };
-  }
-  const std::vector<uint32_t> lws = LocalWS(runtime, gws.data(), kwg_size_);
-  std::string tuning_key =
-      Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
-             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws.data(), lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(
+    const Tensor *input,
+    Tensor *output,
+    StatsFuture *future) {
+  return kernel_->Compute(context_, input, pooling_type_, kernels_, strides_,
+                          padding_type_, paddings_, dilations_,
+                          output, future);
 }
 template struct PoolingFunctor<DeviceType::GPU, float>;
...
@@ -13,127 +13,29 @@
 // limitations under the License.
 #include "mace/kernels/reduce_mean.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
+#include "mace/kernels/opencl/image/reduce_mean.h"
 namespace mace {
 namespace kernels {
+template <typename T>
+ReduceMeanFunctor<DeviceType::GPU, T>::ReduceMeanFunctor(
+    OpKernelContext *context,
+    const std::vector<int> &axis,
+    const bool keep_dims) : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::ReduceMeanKernel<T>(axis, keep_dims));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input,
     Tensor *output,
     StatsFuture *future) {
-  MACE_CHECK_NOTNULL(input);
-  // MACE_CHECK(keep_dims_, "reduce mean gpu only support keep dims.");
-  MACE_CHECK(input->dim_size() == 4,
-             "reduce mean gpu only support 4-dim input");
-  MACE_CHECK(axis_.size() == 2 && axis_[0] == 1 && axis_[1] == 2,
-             "reduce mean gpu only support 1,2-axis reduce");
-  index_t batch = input->dim(0);
-  const index_t in_height = input->dim(1);
-  const index_t in_width = input->dim(2);
-  const index_t channels = input->dim(3);
-  const index_t channel_blocks = RoundUpDiv4(channels);
-  const uint32_t image_size = static_cast<uint32_t>(in_height * in_width);
-  auto runtime = context_->device()->opencl_runtime();
-  std::vector<uint32_t> gws(3);
-  std::vector<uint32_t> lws(3);
-  std::vector<index_t> output_shape{batch, 1, 1, channels};
-  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                  &output_image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
-  if (kernel_.get() == nullptr) {
-    const DataType dt = DataTypeToEnum<T>::value;
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean");
-    built_options.emplace("-Dreduce_mean=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
-      built_options.emplace("-DNON_QUALCOMM_ADRENO");
-    }
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce_mean",
-                                              kernel_name,
-                                              built_options,
-                                              &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
-    const uint32_t wave_size =
-        static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
-    gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
-  } else {
-    gws = {4, 16, static_cast<uint32_t>(batch * channel_blocks)};
-  }
-  lws = {gws[0], gws[1], 1};
-  const int group_size = lws[0] * lws[1] * lws[2];
-  const int partial_len = (image_size + group_size - 1) / group_size;
-  const int remain_index = image_size % group_size;
-  const float in_width_reciprocal = 1.f / in_width;
-  const float img_size_reciprocal = 1.f / (in_width * in_height);
-  const float channel_blk_reciprocal = 1.f / channel_blocks;
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, (group_size * 4 * sizeof(T)),
-                   nullptr);
-    kernel_.setArg(idx++, static_cast<int32_t>(group_size));
-    kernel_.setArg(idx++, static_cast<int32_t>(partial_len));
-    kernel_.setArg(idx++, static_cast<int32_t>(remain_index));
-    kernel_.setArg(idx++, static_cast<int32_t>(batch));
-    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
-    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
-    kernel_.setArg(idx++, img_size_reciprocal);
-    kernel_.setArg(idx++, in_width_reciprocal);
-    kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
-    kernel_.setArg(idx++, channel_blk_reciprocal);
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    input_shape_ = input->shape();
-  }
-  cl::Event event;
-  cl_int error;
-  if (runtime->IsNonUniformWorkgroupsSupported()) {
-    error = runtime->command_queue().enqueueNDRangeKernel(
-        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
-  } else {
-    std::vector<uint32_t> roundup_gws(lws.size());
-    for (size_t i = 0; i < lws.size(); ++i) {
-      roundup_gws[i] = RoundUp(gws[i], lws[i]);
-    }
-    error = runtime->command_queue().enqueueNDRangeKernel(
-        kernel_, cl::NullRange,
-        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
-        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
-  }
-  MACE_CL_RET_STATUS(error);
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input, output, future);
 }
 template struct ReduceMeanFunctor<DeviceType::GPU, float>;
......
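The reduce_mean change above is representative of every kernel file in this merge request: the functor no longer owns a cl::Kernel directly but a std::unique_ptr to an abstract OpenCL*Kernel interface, chosen once at construction from OpenCLRuntime::UseImageMemory(). A minimal standalone sketch of that dispatch pattern, using hypothetical names rather than MACE's actual classes:

#include <iostream>
#include <memory>

class Kernel {                       // abstract strategy interface
 public:
  virtual ~Kernel() = default;
  virtual void Compute() = 0;
};

class ImageKernel : public Kernel {  // image2d_t-backed implementation
 public:
  void Compute() override { std::cout << "image path\n"; }
};

class BufferKernel : public Kernel {  // plain cl_mem buffer implementation
 public:
  void Compute() override { std::cout << "buffer path\n"; }
};

class Functor {                      // the op-level wrapper
 public:
  explicit Functor(bool use_image_memory) {
    if (use_image_memory) {
      kernel_.reset(new ImageKernel());
    } else {
      kernel_.reset(new BufferKernel());
    }
  }
  void operator()() { kernel_->Compute(); }  // delegate unconditionally

 private:
  std::unique_ptr<Kernel> kernel_;
};

int main() {
  Functor(true)();   // prints "image path"
  Functor(false)();  // prints "buffer path"
}

Ops that only have an image implementation so far (reduce_mean above, for one) hit MACE_NOT_IMPLEMENTED on the buffer path; softmax later in this commit is the first op here wired to a real opencl::buffer kernel.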
@@ -13,119 +13,31 @@
 // limitations under the License.
 #include "mace/kernels/resize_bicubic.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/resize_bicubic.h"
 namespace mace {
 namespace kernels {
-namespace {
-std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
-                              const uint32_t *gws,
-                              const uint32_t kwg_size) {
-  std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = runtime->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  if (lws[1] >= base) {
-    lws[0] = std::min<uint32_t>(gws[0], base);
-  } else {
-    lws[0] = gws[0] / 8;
-    if (lws[0] == 0) {
-      lws[0] = gws[0];
-    }
-  }
-  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = gws[2] / 8;
-  if (lws[2] == 0) {
-    lws[2] = gws[2];
-  }
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
-                              1);
-  return lws;
-}
-}  // namespace
+template <typename T>
+ResizeBicubicFunctor<DeviceType::GPU, T>::ResizeBicubicFunctor(
+    OpKernelContext *context,
+    bool align_corners,
+    const std::vector<index_t> &size)
+    : OpKernel(context) {
+  MACE_CHECK(size.size() == 2);
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::ResizeBicubicKernel<T>(align_corners,
+                                                            size[0],
+                                                            size[1]));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus ResizeBicubicFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input, Tensor *output, StatsFuture *future) {
-  const index_t batch = input->dim(0);
-  const index_t in_height = input->dim(1);
-  const index_t in_width = input->dim(2);
-  const index_t channels = input->dim(3);
-  const index_t channel_blocks = RoundUpDiv4(channels);
-  const index_t out_height = out_height_;
-  const index_t out_width = out_width_;
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(out_width),
-                           static_cast<uint32_t>(out_height * batch)};
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    auto dt = DataTypeToEnum<T>::value;
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache");
-    built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    built_options.emplace(MakeString("-DTABLE_SIZE=", kTableSize));
-    MACE_RETURN_IF_ERROR(
-        runtime->BuildKernel("resize_bicubic",
-                             kernel_name,
-                             built_options,
-                             &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    MACE_CHECK(out_height > 0 && out_width > 0);
-    std::vector<index_t> output_shape{batch, out_height, out_width, channels};
-    std::vector<size_t> output_image_shape;
-    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                    &output_image_shape);
-    MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
-    float height_scale =
-        CalculateResizeScale(in_height, out_height, align_corners_);
-    float width_scale =
-        CalculateResizeScale(in_width, out_width, align_corners_);
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, height_scale);
-    kernel_.setArg(idx++, width_scale);
-    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
-    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
-    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
-    input_shape_ = input->shape();
-  }
-  const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key =
-      Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1),
-             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input, output, future);
 }
 template struct ResizeBicubicFunctor<DeviceType::GPU, float>;
......
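The removed host code computed height_scale and width_scale with CalculateResizeScale before binding kernel arguments. That helper is defined elsewhere in mace/kernels; the sketch below restates the usual TensorFlow-style convention it is assumed to follow, not the verbatim MACE source:

#include <cstdint>

// Assumed semantics of CalculateResizeScale: with align_corners the first
// and last samples of input and output coincide, so the step between
// samples is (in - 1) / (out - 1); otherwise pixels are treated as cells
// and the step is in / out.
float ResizeScale(int64_t in_size, int64_t out_size, bool align_corners) {
  return (align_corners && out_size > 1)
             ? (in_size - 1) / static_cast<float>(out_size - 1)
             : in_size / static_cast<float>(out_size);
}

For example, resizing 4 -> 7 gives a scale of 0.5 with align_corners and about 0.571 without.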
@@ -13,122 +13,29 @@
 // limitations under the License.
 #include "mace/kernels/resize_bilinear.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/resize_bilinear.h"
 namespace mace {
 namespace kernels {
-namespace {
-std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
-                              const uint32_t *gws,
-                              const uint32_t kwg_size) {
-  std::vector<uint32_t> lws(4, 0);
-  if (kwg_size == 0) {
-    lws[0] = lws[1] = lws[2] = 1;
-  } else {
-    uint64_t cache_size = runtime->device_global_mem_cache_size();
-    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-    if (lws[1] >= base) {
-      lws[0] = std::min<uint32_t>(gws[0], base);
-    } else {
-      lws[0] = gws[0] / 8;
-      if (lws[0] == 0) {
-        lws[0] = gws[0];
-      }
-    }
-    lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
-    const uint32_t lws_size = lws[0] * lws[1];
-    lws[2] = gws[2] / 8;
-    if (lws[2] == 0) {
-      lws[2] = gws[2];
-    }
-    lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
-                                1);
-  }
-  return lws;
-}
-}  // namespace
+template <typename T>
+ResizeBilinearFunctor<DeviceType::GPU, T>::ResizeBilinearFunctor(
+    OpKernelContext *context,
+    const std::vector<index_t> &size,
+    bool align_corners) : OpKernel(context) {
+  MACE_CHECK(size.size() == 2);
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::ResizeBilinearKernel<T>(align_corners,
+                                                             size[0],
+                                                             size[1]));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input, Tensor *output, StatsFuture *future) {
-  const index_t batch = input->dim(0);
-  const index_t in_height = input->dim(1);
-  const index_t in_width = input->dim(2);
-  const index_t channels = input->dim(3);
-  const index_t channel_blocks = RoundUpDiv4(channels);
-  const index_t out_height = out_height_;
-  const index_t out_width = out_width_;
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(out_width),
-                           static_cast<uint32_t>(out_height * batch)};
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
-    built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
-    auto dt = DataTypeToEnum<T>::value;
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(
-        runtime->BuildKernel("resize_bilinear",
-                             kernel_name,
-                             built_options,
-                             &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    MACE_CHECK(out_height > 0 && out_width > 0);
-    std::vector<index_t> output_shape{batch, out_height, out_width, channels};
-    std::vector<size_t> output_image_shape;
-    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                    &output_image_shape);
-    MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
-    float height_scale =
-        CalculateResizeScale(in_height, out_height, align_corners_);
-    float width_scale =
-        CalculateResizeScale(in_width, out_width, align_corners_);
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, height_scale);
-    kernel_.setArg(idx++, width_scale);
-    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
-    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
-    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
-    input_shape_ = input->shape();
-  }
-  const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key =
-      Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
-             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input, output, future);
}
 template struct ResizeBilinearFunctor<DeviceType::GPU, float>;
......
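resize_bicubic.cc and resize_bilinear.cc each dropped a near-identical anonymous-namespace LocalWS heuristic that sized the local workgroup from the device's global memory cache (the image kernels now carry it). Restated as a self-contained function, taking the cache size directly instead of an OpenCLRuntime*; the kBaseGPUMemCacheSize value below is only a placeholder, the real constant lives in MACE's OpenCL helper code:

#include <algorithm>
#include <cstdint>
#include <vector>

const uint32_t kBaseGPUMemCacheSize = 16384;  // illustrative placeholder

std::vector<uint32_t> LocalWS(uint64_t cache_size,
                              const uint32_t *gws,
                              uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    lws[0] = lws[1] = lws[2] = 1;
  } else {
    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
    // Favor a wide dimension 0 only when dimension 1 already covers the
    // cache-derived base; otherwise fall back to an eighth of gws[0].
    if (lws[1] >= base) {
      lws[0] = std::min<uint32_t>(gws[0], base);
    } else {
      lws[0] = gws[0] / 8;
      if (lws[0] == 0) lws[0] = gws[0];
    }
    lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
    const uint32_t lws_size = lws[0] * lws[1];
    lws[2] = gws[2] / 8;
    if (lws[2] == 0) lws[2] = gws[2];
    // Dimension 2 takes whatever workgroup budget remains, at least 1.
    lws[2] = std::max<uint32_t>(
        std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
  }
  return lws;
}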
@@ -13,110 +13,28 @@
 // limitations under the License.
 #include "mace/kernels/softmax.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
+#include "mace/kernels/opencl/buffer/softmax.h"
 #include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/softmax.h"
 namespace mace {
 namespace kernels {
-namespace {
-std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
-                              const uint32_t *gws,
-                              const uint32_t kwg_size) {
-  std::vector<uint32_t> lws(4, 0);
-  if (kwg_size == 0) {
-    lws[0] = lws[1] = lws[2] = 1;
-  } else {
-    uint64_t cache_size = runtime->device_global_mem_cache_size();
-    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-    if (gws[0] < base) {
-      lws[0] = gws[0];
-    } else {
-      lws[0] = gws[0] / base;
-    }
-    lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
-    lws[2] = std::max<uint32_t>(std::min<uint32_t>(
-        gws[2], kwg_size / (lws[0] * lws[1])), 1);
-  }
-  return lws;
-}
-}  // namespace
+template <typename T>
+SoftmaxFunctor<DeviceType::GPU, T>::SoftmaxFunctor(OpKernelContext *context)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::SoftmaxKernel<T>);
+  } else {
+    kernel_.reset(new opencl::buffer::SoftmaxKernel<T>);
+  }
+}
 template <typename T>
 MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
                                                           Tensor *output,
                                                           StatsFuture *future) {
-  index_t batch = 0;
-  index_t height = 0;
-  index_t width = 0;
-  index_t channels = 0;
-  if (logits->dim_size() == 2) {
-    batch = logits->dim(0);
-    height = 1;
-    width = 1;
-    channels = logits->dim(1);
-  } else if (logits->dim_size() == 4) {
-    batch = logits->dim(0);
-    height = logits->dim(1);
-    width = logits->dim(2);
-    channels = logits->dim(3);
-  } else {
-    MACE_NOT_IMPLEMENTED;
-  }
-  const index_t channel_blocks = RoundUpDiv4(channels);
-  const int remain_channels = channel_blocks * 4 - channels;
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
-    built_options.emplace("-Dsoftmax=" + kernel_name);
-    auto dt = DataTypeToEnum<T>::value;
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
-                                              built_options, &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  if (!IsVecEqual(input_shape_, logits->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(logits->opencl_image()));
-    kernel_.setArg(idx++, static_cast<int>(channels));
-    kernel_.setArg(idx++, remain_channels);
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    input_shape_ = logits->shape();
-  }
-  std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key =
-      Concat("softmax_opencl_kernel", batch, height, width, channels);
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, logits, output, future);
 }
 template struct SoftmaxFunctor<DeviceType::GPU, float>;
......
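Softmax is the first op in this section with a genuine buffer fallback: its constructor picks opencl::buffer::SoftmaxKernel<T> when image memory is off instead of aborting. The channel tiling the removed host code relied on is worth restating, since every image kernel here packs 4 channels per pixel; RoundUpDiv4 exists in MACE's helpers, this mirrors it:

#include <cstdint>
#include <iostream>

// OpenCL image pixels hold 4 values, so channels are processed in
// blocks of 4.
int64_t RoundUpDiv4(int64_t v) { return (v + 3) >> 2; }

int main() {
  const int64_t channels = 10;
  const int64_t channel_blocks = RoundUpDiv4(channels);           // 3
  const int64_t remain_channels = channel_blocks * 4 - channels;  // 2
  // The softmax kernel must exclude the padded lanes from the
  // normalizing sum, which is why remain_channels was a kernel argument.
  std::cout << channel_blocks << " " << remain_channels << "\n";
}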
@@ -16,81 +16,32 @@
 #define MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
 #include "mace/kernels/space_to_batch.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/space_to_batch.h"
 namespace mace {
 namespace kernels {
+template <typename T>
+SpaceToBatchFunctor<DeviceType::GPU, T>::SpaceToBatchFunctor(
+    OpKernelContext *context,
+    const std::vector<int> &paddings,
+    const std::vector<int> &block_shape)
+    : SpaceToBatchFunctorBase(context, paddings, block_shape) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::SpaceToBatchKernel<T>);
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
-    Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) {
+    const Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) {
   std::vector<index_t> output_shape(4, 0);
   CalculateSpaceToBatchOutputShape(space_tensor, DataFormat::NHWC,
                                    output_shape.data());
-  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                  &output_image_shape);
-  MACE_RETURN_IF_ERROR(
-      batch_tensor->ResizeImage(output_shape, output_image_shape));
-  const char *kernel_name = "space_to_batch";
-  const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
-  const uint32_t gws[3] = {
-      chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
-      static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::stringstream kernel_name_ss;
-    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
-    built_options.emplace(kernel_name_ss.str());
-    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
-    built_options.emplace("-DCMD_DATA_TYPE=" +
-                          DtToCLCMDDt(DataTypeToEnum<T>::value));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch",
-                                              obfuscated_kernel_name,
-                                              built_options,
-                                              &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  if (!IsVecEqual(space_shape_, space_tensor->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(space_tensor->opencl_image()));
-    kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
-    kernel_.setArg(idx++, block_shape_[0]);
-    kernel_.setArg(idx++, block_shape_[1]);
-    kernel_.setArg(idx++, paddings_[0]);
-    kernel_.setArg(idx++, paddings_[2]);
-    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
-    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
-    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
-    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
-    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
-    space_shape_ = space_tensor->shape();
-  }
-  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key =
-      Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
-             batch_tensor->dim(2), batch_tensor->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, space_tensor, paddings_, block_shape_,
+                          output_shape, batch_tensor, future);
 }
 template struct SpaceToBatchFunctor<DeviceType::GPU, float>;
......
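Note the interface change here: space_tensor is now const, and paddings_, block_shape_, and the precomputed output shape are passed into Compute explicitly rather than being read by the kernel from functor members. A sketch of the NHWC output-shape rule that CalculateSpaceToBatchOutputShape is assumed to implement (the real helper lives elsewhere in mace/kernels; the {top, bottom, left, right} padding layout is inferred from the paddings_[0]/paddings_[2] arguments in the removed setArg code):

#include <cstdint>
#include <vector>

std::vector<int64_t> SpaceToBatchShape(
    const std::vector<int64_t> &in,   // NHWC input shape
    const std::vector<int> &block,    // {block_h, block_w}
    const std::vector<int> &pad) {    // {top, bottom, left, right}, assumed
  const int64_t padded_h = in[1] + pad[0] + pad[1];
  const int64_t padded_w = in[2] + pad[2] + pad[3];
  return {in[0] * block[0] * block[1],  // batch grows by the block area
          padded_h / block[0],          // padded dims must divide evenly
          padded_w / block[1],
          in[3]};                       // channels unchanged
}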
@@ -13,91 +13,27 @@
 // limitations under the License.
 #include "mace/kernels/space_to_depth.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/space_to_depth.h"
 namespace mace {
 namespace kernels {
 template <typename T>
+SpaceToDepthOpFunctor<DeviceType::GPU, T>::SpaceToDepthOpFunctor(
+    OpKernelContext *context,
+    const int block_size)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::SpaceToDepthKernel<T>(block_size));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
+template <typename T>
 MaceStatus SpaceToDepthOpFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input, Tensor *output, StatsFuture *future) {
-  const index_t batch = input->dim(0);
-  const index_t input_height = input->dim(1);
-  const index_t input_width = input->dim(2);
-  const index_t input_depth = input->dim(3);
-  MACE_CHECK((input_depth % 4) == 0,
-             "input channel should be dividable by 4");
-  MACE_CHECK(
-      (input_width % block_size_ == 0) && (input_height % block_size_ == 0),
-      "input width and height should be dividable by block_size");
-  const index_t output_height = input_height / block_size_;
-  const index_t output_width = input_width / block_size_;
-  const index_t output_depth = input_depth * block_size_ * block_size_;
-  const index_t input_depth_blocks = RoundUpDiv4(input_depth);
-  const index_t output_depth_blocks = RoundUpDiv4(output_depth);
-  std::vector<index_t> output_shape = {batch, output_height, output_width,
-                                       output_depth};
-  std::vector<size_t> image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    const char *kernel_name = "space_to_depth";
-    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
-    std::stringstream kernel_name_ss;
-    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
-    built_options.emplace(kernel_name_ss.str());
-    auto dt = DataTypeToEnum<T>::value;
-    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_depth",
-                                              obfuscated_kernel_name,
-                                              built_options,
-                                              &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  const uint32_t gws[3] = {static_cast<uint32_t>(input_depth_blocks),
-                           static_cast<uint32_t>(input_width),
-                           static_cast<uint32_t>(input_height * batch)};
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
-    kernel_.setArg(idx++, static_cast<int32_t>(input_width));
-    kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
-    kernel_.setArg(idx++, static_cast<int32_t>(output_height * batch));
-    kernel_.setArg(idx++, static_cast<int32_t>(output_width));
-    kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    input_shape_ = input->shape();
-  }
-  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0),
-                                  input->dim(1), input->dim(2), input->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input, output, future);
 }
 template struct SpaceToDepthOpFunctor<DeviceType::GPU, float>;
......
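The input checks and output-shape math deleted from the functor body above (now the image kernel's job) reduce to a few lines, restated here for reference:

#include <cassert>
#include <cstdint>

// space_to_depth, NHWC: [batch, H, W, C] -> [batch, H/b, W/b, C*b*b],
// with the removed code requiring C % 4 == 0, H % b == 0, W % b == 0.
void SpaceToDepthShape(int64_t h, int64_t w, int64_t c, int b,
                       int64_t *out_h, int64_t *out_w, int64_t *out_c) {
  assert(c % 4 == 0 && h % b == 0 && w % b == 0);
  *out_h = h / b;
  *out_w = w / b;
  *out_c = c * b * b;  // each b x b spatial block becomes extra channels
}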
@@ -13,107 +13,28 @@
 // limitations under the License.
 #include "mace/kernels/split.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
+#include "mace/kernels/opencl/image/split.h"
 namespace mace {
 namespace kernels {
+template <typename T>
+SplitFunctor<DeviceType::GPU, T>::SplitFunctor(OpKernelContext *context,
+                                               const int32_t axis)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::SplitKernel<T>(axis));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus SplitFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input,
     const std::vector<Tensor *> &output_list,
     StatsFuture *future) {
-  const index_t input_channels = input->dim(3);
-  const size_t outputs_count = output_list.size();
-  const index_t output_channels = input_channels / outputs_count;
-  MACE_CHECK(output_channels % 4 == 0)
-      << "output channels of split op must be divisible by 4";
-  std::vector<index_t> output_shape(
-      {input->dim(0), input->dim(1), input->dim(2), output_channels});
-  std::vector<size_t> image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-  for (size_t i = 0; i < outputs_count; ++i) {
-    MACE_RETURN_IF_ERROR(
-        output_list[i]->ResizeImage(output_shape, image_shape));
-  }
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split");
-    built_options.emplace("-Dsplit=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
-    built_options.emplace("-DCMD_DATA_TYPE=" +
-                          DtToCLCMDDt(DataTypeToEnum<T>::value));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("split",
-                                              kernel_name,
-                                              built_options,
-                                              &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  const index_t channel_blk = RoundUpDiv4(output_channels);
-  const uint32_t gws[3] = {
-      static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(input->dim(2)),
-      static_cast<uint32_t>(input->dim(0) * input->dim(1)),
-  };
-  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
-  cl::Event event;
-  CallStats call_stats{INT64_MAX, 0};
-  for (size_t i = 0; i < outputs_count; ++i) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
-    kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
-    cl_int error;
-    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
-    } else {
-      std::vector<uint32_t> roundup_gws(lws.size());
-      for (size_t j = 0; j < 3; ++j) {
-        roundup_gws[j] = RoundUp(gws[j], lws[j]);
-      }
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          kernel_, cl::NullRange,
-          cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
-          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
-    }
-    MACE_CL_RET_STATUS(error);
-    OUT_OF_RANGE_VALIDATION(kernel_error_);
-    if (future != nullptr && runtime->is_profiling_enabled()) {
-      event.wait();
-      CallStats tmp_stats;
-      runtime->GetCallStats(event, &tmp_stats);
-      call_stats.start_micros =
-          std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
-      call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
-    }
-  }
-  if (future != nullptr) {
-    future->wait_fn = [runtime, call_stats](CallStats *stats) {
-      if (stats != nullptr) {
-        stats->start_micros = call_stats.start_micros;
-        stats->end_micros = stats->start_micros + call_stats.end_micros;
-      }
-    };
-  }
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input, output_list, future);
 }
 template struct SplitFunctor<DeviceType::GPU, float>;
......
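The removed split body is the one place in this section that launched one kernel per output tensor, so it could not hand a single cl::Event to the StatsFuture; instead it folded per-launch timings into one CallStats. That merging logic in isolation (CallStats is assumed to carry start/end microseconds, as in mace/public/mace.h; the struct below is a local stand-in):

#include <algorithm>
#include <cstdint>

struct CallStats {
  int64_t start_micros;
  int64_t end_micros;
};

// total should start as {INT64_MAX, 0}, matching the removed code.
void Accumulate(const CallStats &launch, CallStats *total) {
  // Keep the earliest start seen across launches...
  total->start_micros = std::min(launch.start_micros, total->start_micros);
  // ...and sum the durations, so end_micros holds total busy time.
  total->end_micros += launch.end_micros - launch.start_micros;
}

The reported window is then start_micros plus the accumulated duration, which deliberately ignores any gaps between the per-output launches.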
@@ -13,239 +13,49 @@
 // limitations under the License.
 #include "mace/kernels/winograd_transform.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
+#include "mace/kernels/opencl/image/winograd_transform.h"
 namespace mace {
 namespace kernels {
 template <typename T>
+WinogradTransformFunctor<DeviceType::GPU, T>::WinogradTransformFunctor(
+    OpKernelContext *context,
+    const Padding &padding_type,
+    const std::vector<int> &paddings,
+    const int block_size) : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::WinogradTransformKernel<T>(
+        padding_type, paddings, block_size));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
+template <typename T>
 MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::string obfuscated_kernel_name;
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    if (wino_blk_size_ == 4) {
-      obfuscated_kernel_name =
-          MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4");
-      built_options.emplace("-Dwinograd_transform_4x4="
-                            + obfuscated_kernel_name);
-    } else if (wino_blk_size_ == 2) {
-      obfuscated_kernel_name =
-          MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
-      built_options.emplace("-Dwinograd_transform_2x2="
-                            + obfuscated_kernel_name);
-    } else {
-      MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd.");
-      return MACE_SUCCESS;
-    }
-    built_options.emplace("-DDATA_TYPE=" +
-                          DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
-    built_options.emplace("-DCMD_DATA_TYPE=" +
-                          DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
-                                              obfuscated_kernel_name,
-                                              built_options,
-                                              &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  std::vector<index_t> output_shape(4);
-  std::vector<index_t> filter_shape = {1, input_tensor->dim(3), 3, 3};
-  std::vector<int> paddings(2);
-  if (paddings_.empty()) {
-    kernels::CalcNHWCPaddingAndOutputSize(
-        input_tensor->shape().data(), filter_shape.data(), dilations_.data(),
-        strides_.data(), padding_type_, output_shape.data(), paddings.data());
-  } else {
-    paddings = paddings_;
-    CalcOutputSize(input_tensor->shape().data(), filter_shape.data(),
-                   paddings_.data(), dilations_.data(), strides_.data(),
-                   RoundType::FLOOR, output_shape.data());
-  }
-  const index_t round_h =
-      (output_shape[1] + wino_blk_size_ - 1) / wino_blk_size_;
-  const index_t round_w =
-      (output_shape[2] + wino_blk_size_ - 1) / wino_blk_size_;
-  const index_t out_width = input_tensor->dim(0) * round_h * round_w;
-  const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
-  const float round_w_r = 1.f / static_cast<float>(round_w);
-  const index_t blk_sqr = (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
-  const uint32_t gws[2] = {
-      static_cast<uint32_t>(out_width),
-      static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))
-  };
-  if (!IsVecEqual(input_shape_, input_tensor->shape())) {
-    output_shape = {blk_sqr, input_tensor->dim(3), out_width};
-    std::vector<index_t> padded_output_shape = {
-        output_shape[0], output_shape[1], output_shape[2], 1
-    };
-    std::vector<size_t> image_shape;
-    CalImage2DShape(padded_output_shape,
-                    BufferType::IN_OUT_HEIGHT,
-                    &image_shape);
-    // remove unused last dimension
-    MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_2D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input_tensor->opencl_image()));
-    kernel_.setArg(idx++, *(output_tensor->opencl_image()));
-    kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(1)));
-    kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(2)));
-    kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(3)));
-    kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
-    kernel_.setArg(idx++, round_hw_r);
-    kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
-    kernel_.setArg(idx++, round_w_r);
-    kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
-    kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
-    input_shape_ = input_tensor->shape();
-  }
-  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
-  std::string tuning_key = Concat("winograd_transform_kernel",
-                                  output_tensor->dim(0),
-                                  output_tensor->dim(1),
-                                  output_tensor->dim(2));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input_tensor, output_tensor, future);
 }
+template <typename T>
+WinogradInverseTransformFunctor<DeviceType::GPU, T>::WinogradInverseTransformFunctor( // NOLINT(whitespace/line_length)
+    OpKernelContext *context,
+    const ActivationType activation,
+    const float relux_max_limit,
+    const int block_size) : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::WinogradInverseTransformKernel<T>(
+        activation, relux_max_limit, block_size));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
     const std::vector<const Tensor*> &inputs,
     Tensor *output_tensor,
     StatsFuture *future) {
-  auto runtime = context_->device()->opencl_runtime();
-  const Tensor *input_tensor = inputs[0];
-  const Tensor *bias = inputs.size() == 3 ? inputs[2] : nullptr;
-  if (kernel_.get() == nullptr) {
-    std::string obfuscated_kernel_name;
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    if (wino_blk_size_ == 4) {
-      obfuscated_kernel_name =
-          MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4");
-      built_options.emplace("-Dwinograd_inverse_transform_4x4="
-                            + obfuscated_kernel_name);
-    } else if (wino_blk_size_ == 2) {
-      obfuscated_kernel_name =
-          MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
-      built_options.emplace("-Dwinograd_inverse_transform_2x2="
-                            + obfuscated_kernel_name);
-    } else {
-      MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd.");
-      return MACE_SUCCESS;
-    }
-    built_options.emplace("-DDATA_TYPE=" +
-                          DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
-    built_options.emplace("-DCMD_DATA_TYPE=" +
-                          DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
-    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
-    switch (activation_) {
-      case NOOP:
-        break;
-      case RELU:
-        built_options.emplace("-DUSE_RELU");
-        break;
-      case RELUX:
-        built_options.emplace("-DUSE_RELUX");
-        break;
-      case PRELU:
-        built_options.emplace("-DUSE_PRELU");
-        break;
-      case TANH:
-        built_options.emplace("-DUSE_TANH");
-        break;
-      case SIGMOID:
-        built_options.emplace("-DUSE_SIGMOID");
-        break;
-      default:
-        LOG(FATAL) << "Unknown activation type: " << activation_;
-    }
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
-                                              obfuscated_kernel_name,
-                                              built_options,
-                                              &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  Tensor::MappingGuard output_shape_guard(inputs[1]);
-  const int32_t *output_shape_data = inputs[1]->data<int32_t>();
-  const index_t batch = output_shape_data[0];
-  const index_t height = output_shape_data[1];
-  const index_t width = output_shape_data[2];
-  const uint32_t gws[2] = {
-      static_cast<uint32_t>(input_tensor->dim(2)),
-      static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))};
-  if (!IsVecEqual(input_shape_, input_tensor->shape())) {
-    std::vector<index_t> output_shape = {batch, height, width,
-                                         input_tensor->dim(1)};
-    std::vector<size_t> image_shape;
-    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-    MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
-    const index_t round_h = (height + wino_blk_size_ - 1) / wino_blk_size_;
-    const index_t round_w = (width + wino_blk_size_ - 1) / wino_blk_size_;
-    const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
-    const float round_w_r = 1.f / static_cast<float>(round_w);
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_2D_GWS_ARGS(kernel_);
-    kernel_.setArg(
-        idx++,
-        *(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
-    if (bias != nullptr) {
-      kernel_.setArg(idx++,
-                     *(static_cast<const cl::Image2D *>(bias->opencl_image())));
-    }
-    kernel_.setArg(
-        idx++, *(static_cast<cl::Image2D *>(output_tensor->opencl_image())));
-    kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
-    kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
-    kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
-    kernel_.setArg(idx++, round_hw_r);
-    kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
-    kernel_.setArg(idx++, round_w_r);
-    kernel_.setArg(idx++, relux_max_limit_);
-    input_shape_ = input_tensor->shape();
-  }
-  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
-  std::string tuning_key =
-      Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
-             output_tensor->dim(1), output_tensor->dim(2),
-             output_tensor->dim(3), input_tensor->dim(2));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, inputs, output_tensor, future);
 }
 template struct WinogradTransformFunctor<DeviceType::GPU, float>;
......
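The tile bookkeeping removed from both winograd functors is easy to lose in the diff. For an m x m output block with 3x3 filters (m is wino_blk_size_, 2 or 4), each transformed tile holds (m + 2) * (m + 2) elements, and one image is covered by ceil(H / m) * ceil(W / m) tiles. Restated from the removed round_h/round_w/blk_sqr code:

#include <cstdint>

void WinogradTiles(int64_t out_h, int64_t out_w, int64_t batch, int m,
                   int64_t *num_tiles, int64_t *tile_elems) {
  const int64_t round_h = (out_h + m - 1) / m;  // ceil(out_h / m)
  const int64_t round_w = (out_w + m - 1) / m;  // ceil(out_w / m)
  *num_tiles = batch * round_h * round_w;       // out_width in the old code
  *tile_elems = (m + 2) * (m + 2);              // blk_sqr in the old code
}

The reciprocals round_hw_r and round_w_r that were bound as kernel arguments are just 1 / (round_h * round_w) and 1 / round_w, precomputed on the host so the kernel can recover tile coordinates without integer division.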
@@ -23,32 +23,18 @@
 #include "mace/core/tensor.h"
 #include "mace/kernels/kernel.h"
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
 namespace mace {
 namespace kernels {
-struct PadFunctorBase : OpKernel {
-  PadFunctorBase(OpKernelContext *context,
-                 const std::vector<int> &paddings,
-                 const float constant_value)
-      : OpKernel(context),
-        paddings_(paddings),
-        constant_value_(constant_value) {}
-  std::vector<int> paddings_;
-  float constant_value_;
-};
 template<DeviceType D, typename T>
-struct PadFunctor : public PadFunctorBase {
+struct PadFunctor : OpKernel {
   PadFunctor(OpKernelContext *context,
              const std::vector<int> &paddings,
              const float constant_value)
-      : PadFunctorBase(context, paddings, constant_value) {}
+      : OpKernel(context),
+        paddings_(paddings),
+        constant_value_(constant_value) {}
   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
                         StatsFuture *future) {
@@ -93,24 +79,32 @@ struct PadFunctor : OpKernel {
     return MACE_SUCCESS;
   }
+  std::vector<int> paddings_;
+  float constant_value_;
 };
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLPadKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLPadKernel);
+};
 template <typename T>
-struct PadFunctor<DeviceType::GPU, T> : PadFunctorBase {
+struct PadFunctor<DeviceType::GPU, T> : OpKernel {
   PadFunctor(OpKernelContext *context,
              const std::vector<int> &paddings,
-             const float constant_value)
-      : PadFunctorBase(context, paddings, constant_value) {}
+             const float constant_value);
   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
                         StatsFuture *future);
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLPadKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
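From here on the header diffs all take the same three-part shape: an abstract OpenCL*Kernel interface under MACE_ENABLE_OPENCL, a GPU functor holding std::unique_ptr<OpenCL*Kernel>, and an out-of-line constructor so public headers no longer pull in cl2_header.h. The MACE_VIRTUAL_EMPTY_DESTRUCTOR line matters because deletion happens through the base pointer; a plausible illustration of why (the macro's real definition lives in MACE's utility headers, the expansion below is only assumed):

#include <memory>

// Hypothetical stand-in for MACE_VIRTUAL_EMPTY_DESTRUCTOR: without a
// virtual destructor on the base, deleting Derived through Base* is
// undefined behavior.
#define VIRTUAL_EMPTY_DESTRUCTOR(ClassName) \
 public:                                    \
  virtual ~ClassName() {}

class Base {
 public:
  virtual void Compute() = 0;
  VIRTUAL_EMPTY_DESTRUCTOR(Base);
};

class Derived : public Base {
 public:
  void Compute() override {}
};

int main() {
  std::unique_ptr<Base> k(new Derived());  // destroyed via virtual ~Base
  k->Compute();
}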
@@ -29,10 +29,6 @@
 #include <arm_neon.h>
 #endif
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
 namespace mace {
 enum PoolingType {
@@ -84,8 +80,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
                          strides,
                          padding_type,
                          paddings,
-                         dilations) {
-  }
+                         dilations) {}
   void MaxPooling(const float *input,
                   const index_t *in_shape,
@@ -455,6 +450,21 @@ struct PoolingFunctor<DeviceType::CPU, uint8_t>: PoolingFunctorBase {
 };
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLPoolingKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const PoolingType pooling_type,
+      const int *kernels,
+      const int *strides,
+      const Padding &padding_type,
+      const std::vector<int> &padding_data,
+      const int *dilations,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLPoolingKernel);
+};
 template <typename T>
 struct PoolingFunctor<DeviceType::GPU, T> : PoolingFunctorBase {
   PoolingFunctor(OpKernelContext *context,
@@ -463,23 +473,13 @@ struct PoolingFunctor<DeviceType::GPU, T> : PoolingFunctorBase {
                  const int *strides,
                  const Padding padding_type,
                  const std::vector<int> &paddings,
-                 const int *dilations)
-      : PoolingFunctorBase(context,
-                           pooling_type,
-                           kernels,
-                           strides,
-                           padding_type,
-                           paddings,
-                           dilations) {
-  }
+                 const int *dilations);
   MaceStatus operator()(const Tensor *input_tensor,
                         Tensor *output_tensor,
                         StatsFuture *future);
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLPoolingKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -25,33 +25,15 @@
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/kernels/kernel.h"
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif
 namespace mace {
 namespace kernels {
-struct ReduceFunctorBase : OpKernel {
-  ReduceFunctorBase(OpKernelContext *context,
-                    const std::vector<int> &axis,
-                    const bool keep_dims)
-      : OpKernel(context),
-        keep_dims_(keep_dims),
-        axis_(axis) {}
-  bool keep_dims_;
-  bool reduce_first_axis_;
-  const std::vector<int> axis_;
-  std::vector<int> data_reshape_;
-  std::vector<index_t> out_shape_;
-};
 template <DeviceType D, typename T>
-struct ReduceMeanFunctor : ReduceFunctorBase {
+struct ReduceMeanFunctor : OpKernel {
   ReduceMeanFunctor(OpKernelContext *context,
                     const std::vector<int> &axis,
                     const bool keep_dims)
-      : ReduceFunctorBase(context, axis, keep_dims) {}
+      : OpKernel(context), axis_(axis), keep_dims_(keep_dims) {}
   void Simplify(const Tensor *input) {
     std::vector<bool> bitmap(static_cast<uint32_t>(input->dim_size()), false);
@@ -217,25 +199,35 @@ struct ReduceMeanFunctor : OpKernel {
     Compute(input, output);
     return MACE_SUCCESS;
   }
+  const std::vector<int> axis_;
+  bool keep_dims_;
+  bool reduce_first_axis_;
+  std::vector<int> data_reshape_;
+  std::vector<index_t> out_shape_;
 };
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLReduceMeanKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLReduceMeanKernel);
+};
 template <typename T>
-struct ReduceMeanFunctor<DeviceType::GPU, T>
-    : ReduceFunctorBase {
+struct ReduceMeanFunctor<DeviceType::GPU, T> : OpKernel {
   ReduceMeanFunctor(OpKernelContext *context,
-                    const std::vector<int> axis,
-                    const bool keep_dims)
-      : ReduceFunctorBase(context, axis, keep_dims) {}
+                    const std::vector<int> &axis,
+                    const bool keep_dims);
   MaceStatus operator()(const Tensor *input,
-                        Tensor *output_tensor,
+                        Tensor *output,
                         StatsFuture *future);
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLReduceMeanKernel> kernel_;
 };
 #endif
......
@@ -25,10 +25,6 @@
 #include "mace/kernels/kernel.h"
 #include "mace/utils/logging.h"
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
 namespace mace {
 namespace kernels {
@@ -141,32 +137,20 @@ inline void ResizeImage(const float *images,
   }
 }
-struct ResizeBicubicFunctorBase : OpKernel {
-  ResizeBicubicFunctorBase(OpKernelContext *context,
-                           const std::vector<index_t> &size,
-                           bool align_corners)
-      : OpKernel(context), align_corners_(align_corners) {
-    MACE_CHECK(size.size() == 2);
-    out_height_ = size[0];
-    out_width_ = size[1];
-  }
- protected:
-  bool align_corners_;
-  index_t out_height_;
-  index_t out_width_;
-};
 template<DeviceType D, typename T>
 struct ResizeBicubicFunctor;
 template<>
-struct ResizeBicubicFunctor<DeviceType::CPU, float>
-    : ResizeBicubicFunctorBase {
+struct ResizeBicubicFunctor<DeviceType::CPU, float> : OpKernel {
   ResizeBicubicFunctor(OpKernelContext *context,
-                       const std::vector<index_t> &size,
-                       bool align_corners)
-      : ResizeBicubicFunctorBase(context, size, align_corners) {}
+                       const bool align_corners,
+                       const std::vector<index_t> &size)
+      : OpKernel(context),
+        align_corners_(align_corners) {
+    MACE_CHECK(size.size() == 2);
+    out_height_ = size[0];
+    out_width_ = size[1];
+  }
   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
@@ -205,25 +189,34 @@ struct ResizeBicubicFunctor<DeviceType::CPU, float>
     return MACE_SUCCESS;
   }
+  bool align_corners_;
+  index_t out_height_;
+  index_t out_width_;
 };
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLResizeBicubicKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLResizeBicubicKernel);
+};
 template<typename T>
 struct ResizeBicubicFunctor<DeviceType::GPU, T>
-    : ResizeBicubicFunctorBase {
+    : OpKernel {
   ResizeBicubicFunctor(OpKernelContext *context,
-                       const std::vector<index_t> &size,
-                       bool align_corners)
-      : ResizeBicubicFunctorBase(context, size, align_corners) {}
+                       bool align_corners,
+                       const std::vector<index_t> &size);
   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
                         StatsFuture *future);
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLResizeBicubicKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -24,10 +24,6 @@
 #include "mace/kernels/kernel.h"
 #include "mace/utils/quantize.h"
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
 namespace mace {
 namespace kernels {
@@ -179,30 +175,17 @@ inline void ResizeImageNHWC(const T *images,
   }
 }
-struct ResizeBilinearFunctorBase : OpKernel {
-  ResizeBilinearFunctorBase(OpKernelContext *context,
-                            const std::vector<index_t> &size,
-                            bool align_corners)
-      : OpKernel(context),
-        align_corners_(align_corners) {
-    MACE_CHECK(size.size() == 2);
-    out_height_ = size[0];
-    out_width_ = size[1];
-  }
- protected:
-  bool align_corners_;
-  index_t out_height_;
-  index_t out_width_;
-};
 template<DeviceType D, typename T>
-struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
+struct ResizeBilinearFunctor : OpKernel {
   ResizeBilinearFunctor(OpKernelContext *context,
                         const std::vector<index_t> &size,
                         bool align_corners)
-      : ResizeBilinearFunctorBase(context, size, align_corners) {}
+      : OpKernel(context), align_corners_(align_corners) {
+    MACE_CHECK(size.size() == 2);
+    out_height_ = size[0];
+    out_width_ = size[1];
+  }
   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
                         StatsFuture *future) {
@@ -255,14 +238,22 @@ struct ResizeBilinearFunctor : OpKernel {
     return MACE_SUCCESS;
   }
+  bool align_corners_;
+  index_t out_height_;
+  index_t out_width_;
 };
 template<DeviceType D>
-struct ResizeBilinearFunctor<D, uint8_t> : ResizeBilinearFunctorBase {
+struct ResizeBilinearFunctor<D, uint8_t> : OpKernel {
   ResizeBilinearFunctor(OpKernelContext *context,
                         const std::vector<index_t> &size,
                         bool align_corners)
-      : ResizeBilinearFunctorBase(context, size, align_corners) {}
+      : OpKernel(context), align_corners_(align_corners) {
+    MACE_CHECK(size.size() == 2);
+    out_height_ = size[0];
+    out_width_ = size[1];
+  }
   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
@@ -316,25 +307,34 @@ struct ResizeBilinearFunctor<D, uint8_t> : OpKernel {
     return MACE_SUCCESS;
   }
+  bool align_corners_;
+  index_t out_height_;
+  index_t out_width_;
 };
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLResizeBilinearKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLResizeBilinearKernel);
+};
 template<typename T>
 struct ResizeBilinearFunctor<DeviceType::GPU, T>
-    : ResizeBilinearFunctorBase {
+    : OpKernel {
   ResizeBilinearFunctor(OpKernelContext *context,
                         const std::vector<index_t> &size,
-                        bool align_corners)
-      : ResizeBilinearFunctorBase(context, size, align_corners) {}
+                        bool align_corners);
  MaceStatus operator()(const Tensor *input,
                        Tensor *output,
                        StatsFuture *future);
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLResizeBilinearKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -30,10 +30,6 @@
 #include "mace/kernels/kernel.h"
 #include "mace/kernels/quantize.h"
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
 namespace mace {
 namespace kernels {
@@ -356,17 +352,23 @@ struct SoftmaxFunctor<DeviceType::CPU, uint8_t> : OpKernel {
 };
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLSoftmaxKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *logits,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSoftmaxKernel);
+};
 template<typename T>
 struct SoftmaxFunctor<DeviceType::GPU, T> : OpKernel {
-  explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {}
+  explicit SoftmaxFunctor(OpKernelContext *context);
   MaceStatus operator()(const Tensor *logits,
                         Tensor *output,
                         StatsFuture *future);
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLSoftmaxKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -23,10 +23,6 @@
 #include "mace/core/tensor.h"
 #include "mace/kernels/kernel.h"
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
 namespace mace {
 namespace kernels {
@@ -102,7 +98,7 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
                       const std::vector<int> &block_shape)
       : SpaceToBatchFunctorBase(context, paddings, block_shape) {}
-  MaceStatus operator()(Tensor *space_tensor,
+  MaceStatus operator()(const Tensor *space_tensor,
                         Tensor *batch_tensor,
                         StatsFuture *future) {
     MACE_UNUSED(future);
@@ -212,7 +208,7 @@ struct SpaceToBatchFunctor<DeviceType::CPU, uint8_t> : SpaceToBatchFunctorBase {
                       const std::vector<int> &block_shape)
       : SpaceToBatchFunctorBase(context, paddings, block_shape) {}
-  MaceStatus operator()(Tensor *space_tensor,
+  MaceStatus operator()(const Tensor *space_tensor,
                         Tensor *batch_tensor,
                         StatsFuture *future) {
     MACE_UNUSED(future);
@@ -311,21 +307,29 @@ struct SpaceToBatchFunctor<DeviceType::CPU, uint8_t> : SpaceToBatchFunctorBase {
 };
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLSpaceToBatchKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *space_tensor,
+      const std::vector<int> &paddings,
+      const std::vector<int> &block_shape,
+      const std::vector<index_t> &output_shape,
+      Tensor *batch_tensor,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSpaceToBatchKernel);
+};
 template <typename T>
 struct SpaceToBatchFunctor<DeviceType::GPU, T> : SpaceToBatchFunctorBase {
   SpaceToBatchFunctor(OpKernelContext *context,
                       const std::vector<int> &paddings,
-                      const std::vector<int> &block_shape)
-      : SpaceToBatchFunctorBase(context, paddings, block_shape) {}
+                      const std::vector<int> &block_shape);
-  MaceStatus operator()(Tensor *space_tensor,
+  MaceStatus operator()(const Tensor *space_tensor,
                         Tensor *batch_tensor,
                         StatsFuture *future);
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> space_shape_;
+  std::unique_ptr<OpenCLSpaceToBatchKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
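Each new kernel interface ends with MACE_VIRTUAL_EMPTY_DESTRUCTOR(...) rather than a hand-written destructor. The macro's definition is not part of this diff; a plausible expansion (an assumption, shown only to make the interfaces self-explanatory) is:

    // Hypothetical expansion -- the real macro lives in MACE's utility
    // headers and may differ in detail.
    #define MACE_VIRTUAL_EMPTY_DESTRUCTOR(ClassName) \
     public:                                         \
      virtual ~ClassName() {}

A virtual destructor matters here because the functors destroy the concrete image/buffer kernels through the base-class pointer held in kernel_.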
...@@ -22,10 +22,6 @@ ...@@ -22,10 +22,6 @@
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/kernels/kernel.h" #include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -91,20 +87,24 @@ struct SpaceToDepthOpFunctor : OpKernel { ...@@ -91,20 +87,24 @@ struct SpaceToDepthOpFunctor : OpKernel {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
class OpenCLSpaceToDepthKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSpaceToDepthKernel);
};
template<typename T> template<typename T>
struct SpaceToDepthOpFunctor<DeviceType::GPU, T> : OpKernel { struct SpaceToDepthOpFunctor<DeviceType::GPU, T> : OpKernel {
explicit SpaceToDepthOpFunctor(OpKernelContext *context, explicit SpaceToDepthOpFunctor(OpKernelContext *context,
const int block_size) const int block_size);
: OpKernel(context), block_size_(block_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
const int block_size_; std::unique_ptr<OpenCLSpaceToDepthKernel> kernel_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -25,24 +25,13 @@ ...@@ -25,24 +25,13 @@
#include "mace/kernels/kernel.h" #include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct SplitFunctorBase : OpKernel {
SplitFunctorBase(OpKernelContext *context, const int32_t axis)
: OpKernel(context), axis_(axis) {}
int32_t axis_;
};
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct SplitFunctor : SplitFunctorBase { struct SplitFunctor : OpKernel {
SplitFunctor(OpKernelContext *context, const int32_t axis) SplitFunctor(OpKernelContext *context, const int32_t axis)
: SplitFunctorBase(context, axis) {} : OpKernel(context), axis_(axis) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const std::vector<Tensor *> &output_list, const std::vector<Tensor *> &output_list,
...@@ -88,20 +77,28 @@ struct SplitFunctor : SplitFunctorBase { ...@@ -88,20 +77,28 @@ struct SplitFunctor : SplitFunctorBase {
return MACE_SUCCESS; return MACE_SUCCESS;
} }
int32_t axis_;
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
class OpenCLSplitKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const std::vector<Tensor *> &output_list,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSplitKernel);
};
template<typename T> template<typename T>
struct SplitFunctor<DeviceType::GPU, T> : SplitFunctorBase { struct SplitFunctor<DeviceType::GPU, T> : OpKernel {
SplitFunctor(OpKernelContext *context, const int32_t axis) SplitFunctor(OpKernelContext *context, const int32_t axis);
: SplitFunctorBase(context, axis) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const std::vector<Tensor *> &output_list, const std::vector<Tensor *> &output_list,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; std::unique_ptr<OpenCLSplitKernel> kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -23,132 +23,63 @@ ...@@ -23,132 +23,63 @@
#include "mace/kernels/activation.h" #include "mace/kernels/activation.h"
#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/conv_pool_2d_util.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace kernels { namespace kernels {
Removed:
struct WinogradTransformFunctorBase : OpKernel {
  WinogradTransformFunctorBase(OpKernelContext *context,
                               const Padding &padding_type,
                               const std::vector<int> &paddings,
                               const int block_size)
      : OpKernel(context),
        strides_({1, 1}),
        dilations_({1, 1}),
        padding_type_(padding_type),
        paddings_(paddings),
        wino_blk_size_(block_size) {}
  const std::vector<int> strides_; // [stride_h, stride_w]
  const std::vector<int> dilations_; // [dilation_h, dilation_w]
  Padding padding_type_;
  std::vector<int> paddings_;
  const int wino_blk_size_;
};
template<DeviceType D, typename T>
struct WinogradTransformFunctor : WinogradTransformFunctorBase {
  WinogradTransformFunctor(OpKernelContext *context,
                           const Padding &padding_type,
                           const std::vector<int> &paddings,
                           const int block_size)
      : WinogradTransformFunctorBase(context,
                                     padding_type,
                                     paddings,
                                     block_size) {}
  MaceStatus operator()(const Tensor *input,
                        Tensor *output,
                        StatsFuture *future) {
    MACE_UNUSED(input);
    MACE_UNUSED(output);
    MACE_UNUSED(future);
    MACE_NOT_IMPLEMENTED;
    return MACE_SUCCESS;
  }
};
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct WinogradTransformFunctor<DeviceType::GPU, T>
    : WinogradTransformFunctorBase {
  WinogradTransformFunctor(OpKernelContext *context,
                           const Padding &padding_type,
                           const std::vector<int> &paddings,
                           const int block_size)
      : WinogradTransformFunctorBase(context,
                                     padding_type,
                                     paddings,
                                     block_size) {}
  MaceStatus operator()(const Tensor *input,
                        Tensor *output,
                        StatsFuture *future);
  cl::Kernel kernel_;
  uint32_t kwg_size_;
  std::unique_ptr<BufferBase> kernel_error_;
  std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
struct WinogradInverseTransformFunctorBase : OpKernel {
  WinogradInverseTransformFunctorBase(OpKernelContext *context,
                                      const ActivationType activation,
                                      const float relux_max_limit,
                                      const int block_size)
      : OpKernel(context),
        wino_blk_size_(block_size),
        activation_(activation),
        relux_max_limit_(relux_max_limit) {}
  const int wino_blk_size_;
  const ActivationType activation_;
  const float relux_max_limit_;
};
template<DeviceType D, typename T>
struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
  WinogradInverseTransformFunctor(OpKernelContext *context,
                                  const ActivationType activation,
                                  const float relux_max_limit,
                                  const int block_size)
      : WinogradInverseTransformFunctorBase(
            context, activation, relux_max_limit, block_size) {}
  MaceStatus operator()(const std::vector<const Tensor*> &inputs,
                        Tensor *output,
                        StatsFuture *future) {
    MACE_UNUSED(inputs);
    MACE_UNUSED(output);
    MACE_UNUSED(future);
    MACE_NOT_IMPLEMENTED;
    return MACE_SUCCESS;
  }
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct WinogradInverseTransformFunctor<DeviceType::GPU, T>
    : WinogradInverseTransformFunctorBase {
  WinogradInverseTransformFunctor(OpKernelContext *context,
                                  const ActivationType activation,
                                  const float relux_max_limit,
                                  const int block_size)
      : WinogradInverseTransformFunctorBase(
            context, activation, relux_max_limit, block_size) {}
  MaceStatus operator()(const std::vector<const Tensor*> &inputs,
                        Tensor *output,
                        StatsFuture *future);
  cl::Kernel kernel_;
  uint32_t kwg_size_;
  std::unique_ptr<BufferBase> kernel_error_;
  std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL

Replaced with:
template <DeviceType D, typename T>
struct WinogradTransformFunctor;
#ifdef MACE_ENABLE_OPENCL
class OpenCLWinogradTransformKernel {
 public:
  virtual MaceStatus Compute(
      OpKernelContext *context,
      const Tensor *input,
      Tensor *output,
      StatsFuture *future) = 0;
  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLWinogradTransformKernel);
};
template<typename T>
struct WinogradTransformFunctor<DeviceType::GPU, T> : OpKernel {
  WinogradTransformFunctor(OpKernelContext *context,
                           const Padding &padding_type,
                           const std::vector<int> &paddings,
                           const int block_size);
  MaceStatus operator()(const Tensor *input,
                        Tensor *output,
                        StatsFuture *future);
  std::unique_ptr<OpenCLWinogradTransformKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
template<DeviceType D, typename T>
struct WinogradInverseTransformFunctor;
#ifdef MACE_ENABLE_OPENCL
class OpenCLWinogradInverseTransformKernel {
 public:
  virtual MaceStatus Compute(
      OpKernelContext *context,
      const std::vector<const Tensor*> &inputs,
      Tensor *output,
      StatsFuture *future) = 0;
  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLWinogradInverseTransformKernel);
};
template <typename T>
struct WinogradInverseTransformFunctor<DeviceType::GPU, T> : OpKernel {
  WinogradInverseTransformFunctor(OpKernelContext *context,
                                  const ActivationType activation,
                                  const float relux_max_limit,
                                  const int block_size);
  MaceStatus operator()(const std::vector<const Tensor *> &inputs,
                        Tensor *output,
                        StatsFuture *future);
  std::unique_ptr<OpenCLWinogradInverseTransformKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
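Besides moving the cl::Kernel state behind the abstract kernel interfaces, both Winograd functors switch from a generic CPU stub that failed at runtime with MACE_NOT_IMPLEMENTED to a primary template that is declared but never defined, so instantiating an unsupported device/type combination now fails at compile time. A self-contained illustration of the idiom (stand-in types, not MACE code):

    #include <iostream>

    enum DeviceType { CPU, GPU };

    template <DeviceType D, typename T>
    struct WinogradTransformFunctor;  // declared only: no generic fallback

    template <typename T>
    struct WinogradTransformFunctor<GPU, T> {  // the one supported device
      void operator()() const { std::cout << "GPU Winograd transform\n"; }
    };

    int main() {
      WinogradTransformFunctor<GPU, float> gpu;
      gpu();
      // WinogradTransformFunctor<CPU, float> cpu;  // error: incomplete type
      return 0;
    }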
...@@ -77,6 +77,14 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) { ...@@ -77,6 +77,14 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
return MaceStatus::MACE_INVALID_ARGS; return MaceStatus::MACE_INVALID_ARGS;
} }
const int mem_type_i =
ProtoArgHelper::GetOptionalArg<NetDef, int>(
*net_def, "opencl_mem_type",
static_cast<MemoryType>(MemoryType::GPU_IMAGE));
const MemoryType mem_type = static_cast<MemoryType>(mem_type_i);
runtime->set_mem_type(mem_type);
if (mem_type == MemoryType::GPU_IMAGE) {
if (!runtime->IsImageSupport()) { if (!runtime->IsImageSupport()) {
return MaceStatus::MACE_OUT_OF_RESOURCES; return MaceStatus::MACE_OUT_OF_RESOURCES;
} }
...@@ -97,6 +105,8 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) { ...@@ -97,6 +105,8 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
<< " vs " << MakeString(net_max_image_size); << " vs " << MakeString(net_max_image_size);
return MaceStatus::MACE_OUT_OF_RESOURCES; return MaceStatus::MACE_OUT_OF_RESOURCES;
} }
}
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
#endif #endif
......
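The availability check above now reads the memory type that the converter serialized into the model (see the add_opencl_informations pass later in this diff), defaults to GPU_IMAGE for models converted before this change, and applies the image-support and max-image-size checks only on the image path. A condensed, self-contained sketch of that control flow (stand-in types; ProtoArgHelper is reduced to a nullable pointer):

    #include <iostream>

    enum MemoryType { GPU_IMAGE = 0, GPU_BUFFER = 1 };

    MemoryType ReadMemType(const int *serialized_arg) {
      // Stand-in for ProtoArgHelper::GetOptionalArg<NetDef, int>: use the
      // value stored in the model when present, else default to GPU_IMAGE.
      return static_cast<MemoryType>(serialized_arg ? *serialized_arg
                                                    : GPU_IMAGE);
    }

    bool CheckGPUAvailability(const int *serialized_arg, bool image_support) {
      const MemoryType mem_type = ReadMemType(serialized_arg);
      if (mem_type == GPU_IMAGE) {
        // Image-only constraints (CL_DEVICE_IMAGE_SUPPORT, the 2D image
        // size limits) are skipped entirely when the model uses buffers.
        if (!image_support) return false;
      }
      return true;
    }

    int main() {
      int buffer_arg = GPU_BUFFER;
      std::cout << CheckGPUAvailability(&buffer_arg, /*image_support=*/false)
                << "\n";  // prints 1: buffer models run without image support
      return 0;
    }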
...@@ -54,14 +54,14 @@ cc_library( ...@@ -54,14 +54,14 @@ cc_library(
"*_test.cc", "*_test.cc",
"*_benchmark.cc", "*_benchmark.cc",
"ops_test_util.cc", "ops_test_util.cc",
"buffer_to_image.cc", "buffer_transform.cc",
"image_to_buffer.cc", "buffer_inverse_transform.cc",
"lstmcell.cc", "lstmcell.cc",
], ],
) + if_opencl_enabled( ) + if_opencl_enabled(
[ [
"buffer_to_image.cc", "buffer_transform.cc",
"image_to_buffer.cc", "buffer_inverse_transform.cc",
"lstmcell.cc", "lstmcell.cc",
], ],
), ),
......
...@@ -36,7 +36,7 @@ class BatchToSpaceNDOp : public Operator<D, T> { ...@@ -36,7 +36,7 @@ class BatchToSpaceNDOp : public Operator<D, T> {
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *batch_tensor = this->Input(INPUT); const Tensor *batch_tensor = this->Input(INPUT);
Tensor *space_tensor = this->Output(OUTPUT); Tensor *space_tensor = this->Output(OUTPUT);
return functor_(space_tensor, const_cast<Tensor *>(batch_tensor), future); return functor_(batch_tensor, space_tensor, future);
} }
private: private:
......
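The BatchToSpaceND call site above (and the matching SpaceToBatchND one later in this diff) drops the const_cast and passes the input first, output second, matching the functors' new const Tensor * signatures. Beyond tidiness, the compiler now enforces that inputs stay read-only, as this tiny self-contained example shows (stand-in Tensor):

    struct Tensor {
      float data[4];
    };

    // Input is const, output is mutable -- no cast needed at the call site.
    void BatchToSpace(const Tensor *batch, Tensor *space) {
      space->data[0] = batch->data[0];
      // batch->data[0] = 0.0f;  // would no longer compile
    }

    int main() {
      Tensor in = {{1.0f, 2.0f, 3.0f, 4.0f}};
      Tensor out = {};
      BatchToSpace(&in, &out);
      return 0;
    }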
...@@ -12,23 +12,23 @@ ...@@ -12,23 +12,23 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/buffer_to_image.h" #include "mace/ops/buffer_inverse_transform.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
void Register_BufferToImage(OperatorRegistryBase *op_registry) { void Register_BufferInverseTransform(OperatorRegistryBase *op_registry) {
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferInverseTransform")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
BufferToImageOp<DeviceType::GPU, float>); BufferInverseTransformOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferInverseTransform")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
BufferToImageOp<DeviceType::GPU, half>); BufferInverseTransformOp<DeviceType::GPU, half>);
} }
} // namespace ops } // namespace ops
......
...@@ -12,19 +12,19 @@ ...@@ -12,19 +12,19 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_IMAGE_TO_BUFFER_H_ #ifndef MACE_OPS_BUFFER_INVERSE_TRANSFORM_H_
#define MACE_OPS_IMAGE_TO_BUFFER_H_ #define MACE_OPS_BUFFER_INVERSE_TRANSFORM_H_
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/kernels/image_to_buffer.h" #include "mace/kernels/buffer_inverse_transform.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ImageToBufferOp : public Operator<D, T> { class BufferInverseTransformOp : public Operator<D, T> {
public: public:
ImageToBufferOp(const OperatorDef &op_def, OpKernelContext *context) BufferInverseTransformOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, context), : Operator<D, T>(op_def, context),
functor_(context, functor_(context,
OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {} OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
...@@ -40,7 +40,7 @@ class ImageToBufferOp : public Operator<D, T> { ...@@ -40,7 +40,7 @@ class ImageToBufferOp : public Operator<D, T> {
} }
private: private:
kernels::ImageToBufferFunctor<D, T> functor_; kernels::BufferInverseTransformFunctor<D, T> functor_;
protected: protected:
MACE_OP_INPUT_TAGS(INPUT); MACE_OP_INPUT_TAGS(INPUT);
...@@ -50,4 +50,4 @@ class ImageToBufferOp : public Operator<D, T> { ...@@ -50,4 +50,4 @@ class ImageToBufferOp : public Operator<D, T> {
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_IMAGE_TO_BUFFER_H_ #endif // MACE_OPS_BUFFER_INVERSE_TRANSFORM_H_
...@@ -24,7 +24,7 @@ template <DeviceType D, typename T> ...@@ -24,7 +24,7 @@ template <DeviceType D, typename T>
void TestBidirectionTransform(const int type, void TestBidirectionTransform(const int type,
const std::vector<index_t> &input_shape) { const std::vector<index_t> &input_shape) {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BufferToImage", "BufferToImageTest") OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input") .Input("Input")
.Output("B2IOutput") .Output("B2IOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
...@@ -37,7 +37,7 @@ void TestBidirectionTransform(const int type, ...@@ -37,7 +37,7 @@ void TestBidirectionTransform(const int type,
// Run // Run
net.RunOp(D); net.RunOp(D);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest") OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput") .Input("B2IOutput")
.Output("I2BOutput") .Output("I2BOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
...@@ -159,7 +159,7 @@ template <DeviceType D, typename T> ...@@ -159,7 +159,7 @@ template <DeviceType D, typename T>
void TestDiffTypeBidirectionTransform(const int type, void TestDiffTypeBidirectionTransform(const int type,
const std::vector<index_t> &input_shape) { const std::vector<index_t> &input_shape) {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BufferToImage", "BufferToImageTest") OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input") .Input("Input")
.Output("B2IOutput") .Output("B2IOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
...@@ -172,7 +172,7 @@ void TestDiffTypeBidirectionTransform(const int type, ...@@ -172,7 +172,7 @@ void TestDiffTypeBidirectionTransform(const int type,
// Run // Run
net.RunOp(D); net.RunOp(D);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest") OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput") .Input("B2IOutput")
.Output("I2BOutput") .Output("I2BOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
...@@ -198,7 +198,7 @@ void TestStringHalfBidirectionTransform(const int type, ...@@ -198,7 +198,7 @@ void TestStringHalfBidirectionTransform(const int type,
const std::vector<index_t> &input_shape, const std::vector<index_t> &input_shape,
const unsigned char *input_data) { const unsigned char *input_data) {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BufferToImage", "BufferToImageTest") OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input") .Input("Input")
.Output("B2IOutput") .Output("B2IOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
...@@ -213,7 +213,7 @@ void TestStringHalfBidirectionTransform(const int type, ...@@ -213,7 +213,7 @@ void TestStringHalfBidirectionTransform(const int type,
// Run // Run
net.RunOp(D); net.RunOp(D);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest") OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput") .Input("B2IOutput")
.Output("I2BOutput") .Output("I2BOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
......
...@@ -12,23 +12,23 @@ ...@@ -12,23 +12,23 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/image_to_buffer.h" #include "mace/ops/buffer_transform.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
void Register_ImageToBuffer(OperatorRegistryBase *op_registry) { void Register_BufferTransform(OperatorRegistryBase *op_registry) {
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferTransform")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
ImageToBufferOp<DeviceType::GPU, float>); BufferTransformOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferTransform")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
ImageToBufferOp<DeviceType::GPU, half>); BufferTransformOp<DeviceType::GPU, half>);
} }
} // namespace ops } // namespace ops
......
...@@ -12,19 +12,19 @@ ...@@ -12,19 +12,19 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_BUFFER_TO_IMAGE_H_ #ifndef MACE_OPS_BUFFER_TRANSFORM_H_
#define MACE_OPS_BUFFER_TO_IMAGE_H_ #define MACE_OPS_BUFFER_TRANSFORM_H_
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/kernels/buffer_to_image.h" #include "mace/kernels/buffer_transform.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class BufferToImageOp : public Operator<D, T> { class BufferTransformOp : public Operator<D, T> {
public: public:
BufferToImageOp(const OperatorDef &op_def, OpKernelContext *context) BufferTransformOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, context), : Operator<D, T>(op_def, context),
functor_(context, functor_(context,
OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {} OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
...@@ -41,7 +41,7 @@ class BufferToImageOp : public Operator<D, T> { ...@@ -41,7 +41,7 @@ class BufferToImageOp : public Operator<D, T> {
} }
private: private:
kernels::BufferToImageFunctor<D, T> functor_; kernels::BufferTransformFunctor<D, T> functor_;
protected: protected:
MACE_OP_INPUT_TAGS(INPUT); MACE_OP_INPUT_TAGS(INPUT);
...@@ -50,4 +50,4 @@ class BufferToImageOp : public Operator<D, T> { ...@@ -50,4 +50,4 @@ class BufferToImageOp : public Operator<D, T> {
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_BUFFER_TO_IMAGE_H_ #endif // MACE_OPS_BUFFER_TRANSFORM_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstring>
#include "gtest/gtest.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class BufferTransformTest : public OpsTestBase {
protected:
virtual void SetUp() {
OpTestContext::Get()->SetOCLBufferTestFlag();
}
};
namespace {
template <typename OrgType, typename DstType>
void TestBidirectionTransform(const int type,
const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("TransformedOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<DstType>::value)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::GPU, OrgType>("Input", input_shape);
// Run
net.RunOp(DeviceType::GPU);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("TransformedOutput")
.Output("Output")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<OrgType>::value)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::GPU);
if (DataTypeToEnum<OrgType>::value == DataTypeToEnum<DstType>::value) {
EXPECT_EQ(net.GetOutput("Input")->UnderlyingBuffer(),
net.GetOutput("Output")->UnderlyingBuffer());
} else {
// Check
ExpectTensorNear<OrgType>(*net.GetOutput("Input"),
*net.GetOutput("Output"),
1e-3, 1e-4);
}
}
} // namespace
TEST_F(BufferTransformTest, FloatToHalf) {
TestBidirectionTransform<float, half>(kernels::BufferType::IN_OUT_CHANNEL,
{1, 2, 3, 4});
}
TEST_F(BufferTransformTest, HalfToHalf) {
TestBidirectionTransform<half, half>(kernels::BufferType::IN_OUT_CHANNEL,
{1, 2, 3, 4});
}
namespace {
template <typename T>
void TestArgumentTransform(const index_t input_size) {
OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("Output")
.AddIntArg("buffer_type", kernels::BufferType::ARGUMENT)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::GPU, T>("Input", {input_size});
// Run
net.RunOp(DeviceType::GPU);
auto output_tensor = net.GetOutput("Output");
index_t expected_size = RoundUp<index_t>(input_size, 4);
EXPECT_EQ(expected_size, output_tensor->buffer_shape()[0]);
// Check
ExpectTensorNear<T>(*net.GetTensor("Input"), *output_tensor,
1e-3, 1e-4);
}
} // namespace
TEST_F(BufferTransformTest, Argument) {
TestArgumentTransform<half>(30);
TestArgumentTransform<half>(32);
}
} // namespace test
} // namespace ops
} // namespace mace
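Two details of the new test file are worth spelling out. First, when source and destination types match on the buffer path, the transform degenerates to a no-op and the test asserts that input and output share the same UnderlyingBuffer(), i.e. zero copy. Second, the argument test expects RoundUp<index_t>(input_size, 4): ARGUMENT buffers are padded to a multiple of four elements, which plausibly matches the 4-wide vector accesses of the OpenCL kernels (that motivation is an inference, not stated in the diff). A self-contained sketch of such a RoundUp helper:

    #include <cassert>
    #include <cstdint>

    typedef int64_t index_t;

    // Round value up to the next multiple of 'multiple' (both assumed > 0).
    template <typename T>
    T RoundUp(T value, T multiple) {
      return (value + multiple - 1) / multiple * multiple;
    }

    int main() {
      assert(RoundUp<index_t>(30, 4) == 32);  // padded, as the test expects
      assert(RoundUp<index_t>(32, 4) == 32);  // already aligned
      return 0;
    }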
...@@ -40,8 +40,7 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> { ...@@ -40,8 +40,7 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
"NOOP")), "NOOP")),
OperatorBase::GetOptionalArg<float>("max_limit", 0.0f), OperatorBase::GetOptionalArg<float>("max_limit", 0.0f),
static_cast<bool>(OperatorBase::GetOptionalArg<int>( static_cast<bool>(OperatorBase::GetOptionalArg<int>(
"is_filter_transformed", false)), "is_filter_transformed", false))) {}
context->workspace()->GetScratchBuffer(D)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -23,21 +23,26 @@ namespace mace { ...@@ -23,21 +23,26 @@ namespace mace {
namespace ops { namespace ops {
namespace test { namespace test {
class Conv2dOpTest : public OpsTestBase {}; class Conv2dOpTest : public OpsTestBase {
protected:
virtual void SetUp() {
OpTestContext::Get()->SetOCLImageTestFlag();
}
};
namespace { namespace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void TestNHWCSimple3x3VALID() { void TestNHWCSimple3x3VALID() {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, T>( net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2}, "Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, T>( net.AddInputFromArray<D, float>(
"Filter", {1, 2, 3, 3}, "Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, T>("Bias", {1}, {0.1f}); net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
...@@ -50,7 +55,6 @@ void TestNHWCSimple3x3VALID() { ...@@ -50,7 +55,6 @@ void TestNHWCSimple3x3VALID() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -77,7 +81,7 @@ void TestNHWCSimple3x3VALID() { ...@@ -77,7 +81,7 @@ void TestNHWCSimple3x3VALID() {
net.RunOp(D); net.RunOp(D);
// Transfer output // Transfer output
ImageToBuffer<D, T>(&net, "OutputImage", "Output", ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
...@@ -85,7 +89,11 @@ void TestNHWCSimple3x3VALID() { ...@@ -85,7 +89,11 @@ void TestNHWCSimple3x3VALID() {
} }
auto expected = net.CreateTensor<float>({1, 1, 1, 1}, {18.1f}); auto expected = net.CreateTensor<float>({1, 1, 1, 1}, {18.1f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5); if (DataTypeToEnum<T>::value == DataType::DT_FLOAT) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} else {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-3, 1e-3);
}
} }
template <DeviceType D, typename T> template <DeviceType D, typename T>
...@@ -93,14 +101,14 @@ void TestNHWCSimple3x3SAME() { ...@@ -93,14 +101,14 @@ void TestNHWCSimple3x3SAME() {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, T>( net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2}, "Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, T>( net.AddInputFromArray<D, float>(
"Filter", {1, 2, 3, 3}, "Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, T>("Bias", {1}, {0.1f}); net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
...@@ -113,7 +121,6 @@ void TestNHWCSimple3x3SAME() { ...@@ -113,7 +121,6 @@ void TestNHWCSimple3x3SAME() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -140,7 +147,7 @@ void TestNHWCSimple3x3SAME() { ...@@ -140,7 +147,7 @@ void TestNHWCSimple3x3SAME() {
net.RunOp(D); net.RunOp(D);
// Transfer output // Transfer output
ImageToBuffer<D, T>(&net, "OutputImage", "Output", ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
...@@ -151,7 +158,11 @@ void TestNHWCSimple3x3SAME() { ...@@ -151,7 +158,11 @@ void TestNHWCSimple3x3SAME() {
{1, 3, 3, 1}, {1, 3, 3, 1},
{8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f}); {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5); if (DataTypeToEnum<T>::value == DataType::DT_FLOAT) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} else {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-3, 1e-3);
}
} }
} // namespace } // namespace
...@@ -165,6 +176,11 @@ TEST_F(Conv2dOpTest, OPENCLSimple) { ...@@ -165,6 +176,11 @@ TEST_F(Conv2dOpTest, OPENCLSimple) {
TestNHWCSimple3x3SAME<DeviceType::GPU, float>(); TestNHWCSimple3x3SAME<DeviceType::GPU, float>();
} }
TEST_F(Conv2dOpTest, OPENCLHalfSimple) {
TestNHWCSimple3x3VALID<DeviceType::GPU, half>();
TestNHWCSimple3x3SAME<DeviceType::GPU, half>();
}
namespace { namespace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void TestNHWCSimple3x3WithoutBias() { void TestNHWCSimple3x3WithoutBias() {
...@@ -638,7 +654,7 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -638,7 +654,7 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
auto func = [&](int stride_h, int stride_w, Padding padding) { auto func = [&](int stride_h, int stride_w, Padding padding) {
// generate random input // generate random input
index_t batch = 3; index_t batch = 1;
index_t height = input_shape[0]; index_t height = input_shape[0];
index_t width = input_shape[1]; index_t width = input_shape[1];
index_t kernel_h = filter_shape[0]; index_t kernel_h = filter_shape[0];
...@@ -713,7 +729,7 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -713,7 +729,7 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-1); 1e-2);
}; };
func(1, 1, VALID); func(1, 1, VALID);
......
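The tolerance changes above track what fp16 can actually deliver: the new half-precision Simple tests accept 1e-3 absolute/relative error instead of the fp32 1e-5, while the existing half complex-conv test tightens its relative bound from 1e-1 to 1e-2 (and shrinks the batch to 1 to keep runtime down). With a 10-bit mantissa, fp16 has a relative precision of about 2^-11 ~= 4.9e-4, so accumulated convolution sums routinely deviate from the fp32 reference by around 1e-3. A one-liner to see the scale:

    #include <cstdio>

    int main() {
      // Machine-epsilon scale for IEEE fp16 (10 explicit mantissa bits).
      std::printf("%g\n", 1.0 / (1 << 11));  // prints 0.000488281
      return 0;
    }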
...@@ -26,7 +26,7 @@ TEST(CoreTest, INIT_MODE) { ...@@ -26,7 +26,7 @@ TEST(CoreTest, INIT_MODE) {
Workspace ws; Workspace ws;
op_defs.emplace_back(OperatorDef()); op_defs.emplace_back(OperatorDef());
OpDefBuilder("BufferToImage", "BufferToImageTest") OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input") .Input("Input")
.Output("B2IOutput") .Output("B2IOutput")
.AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER) .AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER)
...@@ -43,7 +43,7 @@ TEST(CoreTest, INIT_MODE) { ...@@ -43,7 +43,7 @@ TEST(CoreTest, INIT_MODE) {
} }
op_defs.emplace_back(OperatorDef()); op_defs.emplace_back(OperatorDef());
OpDefBuilder("ImageToBuffer", "ImageToBufferTest") OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput") .Input("B2IOutput")
.Output("Output") .Output("Output")
.AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER) .AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER)
......
...@@ -250,19 +250,19 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -250,19 +250,19 @@ void TestNxNS12(const index_t height, const index_t width) {
Padding type) { Padding type) {
// generate random input // generate random input
static unsigned int seed = time(NULL); static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 5; index_t batch = 1;
index_t input_channels = 3 + rand_r(&seed) % 16; index_t channel = 32;
index_t multiplier = 1; index_t multiplier = 1;
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, input_channels}); "Input", {batch, height, width, channel});
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Filter", {multiplier, input_channels, kernel_h, kernel_w}); "Filter", {multiplier, channel, kernel_h, kernel_w});
net.AddRandomInput<DeviceType::GPU, float>("Bias", net.AddRandomInput<DeviceType::GPU, float>("Bias",
{multiplier * input_channels}); {multiplier * channel});
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW); NCHW);
...@@ -275,6 +275,8 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -275,6 +275,8 @@ void TestNxNS12(const index_t height, const index_t width) {
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6.0)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on cpu // Run on cpu
...@@ -302,6 +304,8 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -302,6 +304,8 @@ void TestNxNS12(const index_t height, const index_t width) {
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6.0)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
......
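The depthwise test above now appends a fused RELUX activation with max_limit 6.0 (i.e. ReLU6) to both the CPU reference run and the GPU run, so validation covers the activation-fusion path as well. For reference, RELUX clamps as follows (a standalone definition consistent with MACE's activation kernels):

    #include <algorithm>
    #include <cassert>

    // RELUX: min(max(x, 0), max_limit); with max_limit == 6 this is ReLU6.
    float Relux(float x, float max_limit) {
      return std::min(std::max(x, 0.0f), max_limit);
    }

    int main() {
      assert(Relux(-1.0f, 6.0f) == 0.0f);
      assert(Relux(3.5f, 6.0f) == 3.5f);
      assert(Relux(9.0f, 6.0f) == 6.0f);
      return 0;
    }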
...@@ -67,8 +67,8 @@ extern void Register_WinogradInverseTransform(OperatorRegistryBase *op_registry) ...@@ -67,8 +67,8 @@ extern void Register_WinogradInverseTransform(OperatorRegistryBase *op_registry)
extern void Register_WinogradTransform(OperatorRegistryBase *op_registry); extern void Register_WinogradTransform(OperatorRegistryBase *op_registry);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
extern void Register_BufferToImage(OperatorRegistryBase *op_registry); extern void Register_BufferTransform(OperatorRegistryBase *op_registry);
extern void Register_ImageToBuffer(OperatorRegistryBase *op_registry); extern void Register_BufferInverseTransform(OperatorRegistryBase *op_registry);
extern void Register_LSTMCell(OperatorRegistryBase *op_registry); extern void Register_LSTMCell(OperatorRegistryBase *op_registry);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} // namespace ops } // namespace ops
...@@ -125,8 +125,8 @@ OperatorRegistry::OperatorRegistry() : OperatorRegistryBase() { ...@@ -125,8 +125,8 @@ OperatorRegistry::OperatorRegistry() : OperatorRegistryBase() {
ops::Register_WinogradTransform(this); ops::Register_WinogradTransform(this);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
ops::Register_BufferToImage(this); ops::Register_BufferTransform(this);
ops::Register_ImageToBuffer(this); ops::Register_BufferInverseTransform(this);
ops::Register_LSTMCell(this); ops::Register_LSTMCell(this);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -27,18 +27,11 @@ OpTestContext *OpTestContext::Get(int num_threads, ...@@ -27,18 +27,11 @@ OpTestContext *OpTestContext::Get(int num_threads,
return &instance; return &instance;
} }
std::shared_ptr<GPUContext> OpTestContext::gpu_context() const {
return gpu_context_;
}
Device *OpTestContext::GetDevice(DeviceType device_type) {
return device_map_[device_type].get();
}
OpTestContext::OpTestContext(int num_threads, OpTestContext::OpTestContext(int num_threads,
CPUAffinityPolicy cpu_affinity_policy, CPUAffinityPolicy cpu_affinity_policy,
bool use_gemmlowp) bool use_gemmlowp)
: gpu_context_(new GPUContext()) { : gpu_context_(new GPUContext()),
opencl_mem_types_({MemoryType::GPU_IMAGE}) {
device_map_[DeviceType::CPU] = std::unique_ptr<Device>( device_map_[DeviceType::CPU] = std::unique_ptr<Device>(
new CPUDevice(num_threads, new CPUDevice(num_threads,
cpu_affinity_policy, cpu_affinity_policy,
...@@ -50,6 +43,30 @@ OpTestContext::OpTestContext(int num_threads, ...@@ -50,6 +43,30 @@ OpTestContext::OpTestContext(int num_threads,
GPUPriorityHint::PRIORITY_NORMAL)); GPUPriorityHint::PRIORITY_NORMAL));
} }
std::shared_ptr<GPUContext> OpTestContext::gpu_context() const {
return gpu_context_;
}
Device *OpTestContext::GetDevice(DeviceType device_type) {
return device_map_[device_type].get();
}
std::vector<MemoryType> OpTestContext::opencl_mem_types() {
return opencl_mem_types_;
}
void OpTestContext::SetOCLBufferTestFlag() {
opencl_mem_types_ = {MemoryType::GPU_BUFFER};
}
void OpTestContext::SetOCLImageTestFlag() {
opencl_mem_types_ = {MemoryType::GPU_IMAGE};
}
void OpTestContext::SetOCLImageAndBufferTestFlag() {
opencl_mem_types_ = {MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER};
}
} // namespace test } // namespace test
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
...@@ -120,7 +120,10 @@ class OpTestContext { ...@@ -120,7 +120,10 @@ class OpTestContext {
bool use_gemmlowp = true); bool use_gemmlowp = true);
std::shared_ptr<GPUContext> gpu_context() const; std::shared_ptr<GPUContext> gpu_context() const;
Device *GetDevice(DeviceType device_type); Device *GetDevice(DeviceType device_type);
std::vector<MemoryType> opencl_mem_types();
void SetOCLBufferTestFlag();
void SetOCLImageTestFlag();
void SetOCLImageAndBufferTestFlag();
private: private:
OpTestContext(int num_threads, OpTestContext(int num_threads,
CPUAffinityPolicy cpu_affinity_policy, CPUAffinityPolicy cpu_affinity_policy,
...@@ -128,6 +131,7 @@ class OpTestContext { ...@@ -128,6 +131,7 @@ class OpTestContext {
MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext); MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext);
std::shared_ptr<GPUContext> gpu_context_; std::shared_ptr<GPUContext> gpu_context_;
std::vector<MemoryType> opencl_mem_types_;
std::map<DeviceType, std::unique_ptr<Device>> device_map_; std::map<DeviceType, std::unique_ptr<Device>> device_map_;
}; };
...@@ -459,9 +463,20 @@ class OpsTestNet { ...@@ -459,9 +463,20 @@ class OpsTestNet {
// Test and benchmark should setup model once and run multiple times. // Test and benchmark should setup model once and run multiple times.
// Setup time should not be counted during benchmark. // Setup time should not be counted during benchmark.
MaceStatus RunOp(DeviceType device) { MaceStatus RunOp(DeviceType device) {
if (device == DeviceType::GPU) {
auto opencl_mem_types = OpTestContext::Get()->opencl_mem_types();
for (auto type : opencl_mem_types) {
OpTestContext::Get()->GetDevice(device)
->opencl_runtime()->set_mem_type(type);
Setup(device);
MACE_RETURN_IF_ERROR(Run());
}
return MACE_SUCCESS;
} else {
Setup(device); Setup(device);
return Run(); return Run();
} }
}
// DEPRECATED(liyin): // DEPRECATED(liyin):
// Test and benchmark should setup model once and run multiple times. // Test and benchmark should setup model once and run multiple times.
...@@ -512,6 +527,7 @@ class OpsTestBase : public ::testing::Test { ...@@ -512,6 +527,7 @@ class OpsTestBase : public ::testing::Test {
} }
virtual void TearDown() { virtual void TearDown() {
OpTestContext::Get()->SetOCLImageTestFlag();
} }
}; };
...@@ -747,7 +763,7 @@ void BufferToImage(OpsTestNet *net, ...@@ -747,7 +763,7 @@ void BufferToImage(OpsTestNet *net,
const int wino_block_size = 2) { const int wino_block_size = 2) {
MACE_CHECK_NOTNULL(net); MACE_CHECK_NOTNULL(net);
OpDefBuilder("BufferToImage", "BufferToImageTest") OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
...@@ -755,7 +771,7 @@ void BufferToImage(OpsTestNet *net, ...@@ -755,7 +771,7 @@ void BufferToImage(OpsTestNet *net,
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef()); .Finalize(net->NewOperatorDef());
// Run // TODO(liuqi): Use AddNewOperatorDef, and run all ops with same NetDef.
net->RunOp(D); net->RunOp(D);
net->Sync(); net->Sync();
...@@ -769,7 +785,7 @@ void ImageToBuffer(OpsTestNet *net, ...@@ -769,7 +785,7 @@ void ImageToBuffer(OpsTestNet *net,
const int wino_block_size = 2) { const int wino_block_size = 2) {
MACE_CHECK_NOTNULL(net); MACE_CHECK_NOTNULL(net);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest") OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
......
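Taken together, the OpTestContext flags and the reworked RunOp mean a single GPU test body now executes once per configured OpenCL memory type: image-only by default, buffer-only after SetOCLBufferTestFlag(), or both after SetOCLImageAndBufferTestFlag(); the new TearDown resets the flag to image so one test's setting cannot leak into the next. A reduced sketch of that loop (stand-in types, not MACE code):

    #include <iostream>
    #include <vector>

    enum class MemoryType { GPU_IMAGE, GPU_BUFFER };

    void SetupAndRun(MemoryType type) {
      // The real RunOp calls set_mem_type(type), Setup(device), then Run().
      std::cout << (type == MemoryType::GPU_IMAGE ? "image pass\n"
                                                  : "buffer pass\n");
    }

    int main() {
      // SetOCLImageAndBufferTestFlag() would configure both entries.
      std::vector<MemoryType> opencl_mem_types = {MemoryType::GPU_IMAGE,
                                                  MemoryType::GPU_BUFFER};
      for (MemoryType type : opencl_mem_types) {
        SetupAndRun(type);
      }
      return 0;
    }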
...@@ -111,6 +111,7 @@ void Pooling(int iters, ...@@ -111,6 +111,7 @@ void Pooling(int iters,
#define MACE_BM_POOLING(N, C, H, W, K, S, PA, PO) \ #define MACE_BM_POOLING(N, C, H, W, K, S, PA, PO) \
MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, float, CPU); \ MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, float, CPU); \
MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, float, GPU); \ MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, float, GPU); \
MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, half, GPU); \
MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, uint8_t, CPU); MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, uint8_t, CPU);
......
...@@ -27,8 +27,8 @@ class ResizeBicubicOp : public Operator<D, T> { ...@@ -27,8 +27,8 @@ class ResizeBicubicOp : public Operator<D, T> {
ResizeBicubicOp(const OperatorDef &operator_def, OpKernelContext *context) ResizeBicubicOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, context), : Operator<D, T>(operator_def, context),
functor_(context, functor_(context,
OperatorBase::GetRepeatedArgs<index_t>("size", {-1, -1}), OperatorBase::GetOptionalArg<bool>("align_corners", false),
OperatorBase::GetOptionalArg<bool>("align_corners", false)) {} OperatorBase::GetRepeatedArgs<index_t>("size", {-1, -1})) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(0); const Tensor *input = this->Input(0);
......
...@@ -36,7 +36,7 @@ class SpaceToBatchNDOp : public Operator<D, T> { ...@@ -36,7 +36,7 @@ class SpaceToBatchNDOp : public Operator<D, T> {
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *space_tensor = this->Input(INPUT); const Tensor *space_tensor = this->Input(INPUT);
Tensor *batch_tensor = this->Output(OUTPUT); Tensor *batch_tensor = this->Output(OUTPUT);
return functor_(const_cast<Tensor *>(space_tensor), batch_tensor, future); return functor_(space_tensor, batch_tensor, future);
} }
private: private:
......
...@@ -106,6 +106,7 @@ def main(unused_args): ...@@ -106,6 +106,7 @@ def main(unused_args):
option.winograd = FLAGS.winograd option.winograd = FLAGS.winograd
option.quantize = FLAGS.quantize option.quantize = FLAGS.quantize
option.quantize_range_file = FLAGS.quantize_range_file option.quantize_range_file = FLAGS.quantize_range_file
option.cl_mem_type = FLAGS.cl_mem_type
input_node_names = FLAGS.input_node.split(',') input_node_names = FLAGS.input_node.split(',')
input_node_shapes = FLAGS.input_shape.split(':') input_node_shapes = FLAGS.input_shape.split(':')
...@@ -323,6 +324,11 @@ def parse_args(): ...@@ -323,6 +324,11 @@ def parse_args():
type=str, type=str,
default="", default="",
help="file path of quantize range for each tensor") help="file path of quantize range for each tensor")
parser.add_argument(
"--cl_mem_type",
type=str,
default="image",
help="which memory type to use.[image|buffer]")
return parser.parse_known_args() return parser.parse_known_args()
......
...@@ -131,8 +131,8 @@ class MaceKeyword(object): ...@@ -131,8 +131,8 @@ class MaceKeyword(object):
mace_output_node_name = 'mace_output_node' mace_output_node_name = 'mace_output_node'
mace_buffer_type = 'buffer_type' mace_buffer_type = 'buffer_type'
mace_mode = 'mode' mace_mode = 'mode'
mace_buffer_to_image = 'BufferToImage' mace_buffer_transform = 'BufferTransform'
mace_image_to_buffer = 'ImageToBuffer' mace_buffer_inverse_transform = 'BufferInverseTransform'
# arg related str # arg related str
mace_padding_str = 'padding' mace_padding_str = 'padding'
mace_padding_values_str = 'padding_values' mace_padding_values_str = 'padding_values'
...@@ -175,6 +175,7 @@ class MaceKeyword(object): ...@@ -175,6 +175,7 @@ class MaceKeyword(object):
mace_opencl_max_image_size = "opencl_max_image_size" mace_opencl_max_image_size = "opencl_max_image_size"
mace_seperate_buffer_str = 'seperate_buffer' mace_seperate_buffer_str = 'seperate_buffer'
mace_scalar_input_index_str = 'scalar_input_index' mace_scalar_input_index_str = 'scalar_input_index'
mace_opencl_mem_type = "opencl_mem_type"
class TransformerRule(Enum): class TransformerRule(Enum):
...@@ -194,7 +195,7 @@ class TransformerRule(Enum): ...@@ -194,7 +195,7 @@ class TransformerRule(Enum):
RESHAPE_FC_WEIGHT = 14 RESHAPE_FC_WEIGHT = 14
TRANSPOSE_DATA_FORMAT = 15 TRANSPOSE_DATA_FORMAT = 15
TRANSFORM_GLOBAL_CONV_TO_FC = 16 TRANSFORM_GLOBAL_CONV_TO_FC = 16
TRANSFORM_BUFFER_IMAGE = 17 ADD_BUFFER_TRANSFORM = 17
ADD_DEVICE = 18 ADD_DEVICE = 18
SORT_BY_EXECUTION = 19 SORT_BY_EXECUTION = 19
ADD_IN_OUT_TENSOR_INFO = 20 ADD_IN_OUT_TENSOR_INFO = 20
...@@ -208,6 +209,7 @@ class TransformerRule(Enum): ...@@ -208,6 +209,7 @@ class TransformerRule(Enum):
TRANSFORM_FAKE_QUANTIZE = 28 TRANSFORM_FAKE_QUANTIZE = 28
CHECK_QUANTIZE_INFO = 29 CHECK_QUANTIZE_INFO = 29
REARRANGE_BATCH_TO_SPACE = 30 REARRANGE_BATCH_TO_SPACE = 30
ADD_OPENCL_INFORMATIONS = 31
class ConverterInterface(object): class ConverterInterface(object):
...@@ -265,6 +267,7 @@ class ConverterOption(object): ...@@ -265,6 +267,7 @@ class ConverterOption(object):
self._quantize = False self._quantize = False
self._quantize_range_file = "" self._quantize_range_file = ""
self._transformer_option = None self._transformer_option = None
self._cl_mem_type = ""
@property @property
def input_nodes(self): def input_nodes(self):
...@@ -298,6 +301,10 @@ class ConverterOption(object): ...@@ -298,6 +301,10 @@ class ConverterOption(object):
def transformer_option(self): def transformer_option(self):
return self._transformer_option return self._transformer_option
@property
def cl_mem_type(self):
return self._cl_mem_type
@input_nodes.setter @input_nodes.setter
def input_nodes(self, input_nodes): def input_nodes(self, input_nodes):
for node in input_nodes: for node in input_nodes:
...@@ -338,6 +345,10 @@ class ConverterOption(object): ...@@ -338,6 +345,10 @@ class ConverterOption(object):
def transformer_option(self, transformer_option): def transformer_option(self, transformer_option):
self._transformer_option = transformer_option self._transformer_option = transformer_option
@cl_mem_type.setter
def cl_mem_type(self, cl_mem_type):
self._cl_mem_type = cl_mem_type
def disable_transpose_filters(self): def disable_transpose_filters(self):
if TransformerRule.TRANSPOSE_FILTERS in self._transformer_option: if TransformerRule.TRANSPOSE_FILTERS in self._transformer_option:
self._transformer_option.remove(TransformerRule.TRANSPOSE_FILTERS) self._transformer_option.remove(TransformerRule.TRANSPOSE_FILTERS)
...@@ -377,11 +388,12 @@ class ConverterOption(object): ...@@ -377,11 +388,12 @@ class ConverterOption(object):
# Mace model structure related transformation # Mace model structure related transformation
TransformerRule.ADD_IN_OUT_TENSOR_INFO, TransformerRule.ADD_IN_OUT_TENSOR_INFO,
# Device related transformation # Device related transformation
TransformerRule.TRANSFORM_BUFFER_IMAGE, TransformerRule.ADD_BUFFER_TRANSFORM,
TransformerRule.ADD_DEVICE, TransformerRule.ADD_DEVICE,
# Data type related transformation # Data type related transformation
TransformerRule.UPDATE_FLOAT_OP_DATA_TYPE, TransformerRule.UPDATE_FLOAT_OP_DATA_TYPE,
# Transform finalization # Transform finalization
TransformerRule.ADD_OPENCL_INFORMATIONS,
TransformerRule.ADD_MACE_INPUT_AND_OUTPUT_NODES, TransformerRule.ADD_MACE_INPUT_AND_OUTPUT_NODES,
# for quantization entropy calibration use # for quantization entropy calibration use
TransformerRule.SORT_BY_EXECUTION, TransformerRule.SORT_BY_EXECUTION,
......
...@@ -80,8 +80,8 @@ class Transformer(base_converter.ConverterInterface): ...@@ -80,8 +80,8 @@ class Transformer(base_converter.ConverterInterface):
TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC: TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC:
self.transform_global_conv_to_fc, self.transform_global_conv_to_fc,
TransformerRule.RESHAPE_FC_WEIGHT: self.reshape_fc_weight, TransformerRule.RESHAPE_FC_WEIGHT: self.reshape_fc_weight,
TransformerRule.TRANSFORM_BUFFER_IMAGE: TransformerRule.ADD_BUFFER_TRANSFORM:
self.transform_buffer_image, self.add_buffer_transform,
TransformerRule.QUANTIZE_NODES: TransformerRule.QUANTIZE_NODES:
self.quantize_nodes, self.quantize_nodes,
TransformerRule.ADD_QUANTIZE_TENSOR_RANGE: TransformerRule.ADD_QUANTIZE_TENSOR_RANGE:
...@@ -94,6 +94,8 @@ class Transformer(base_converter.ConverterInterface): ...@@ -94,6 +94,8 @@ class Transformer(base_converter.ConverterInterface):
self.update_float_op_data_type, self.update_float_op_data_type,
TransformerRule.ADD_MACE_INPUT_AND_OUTPUT_NODES: TransformerRule.ADD_MACE_INPUT_AND_OUTPUT_NODES:
self.add_mace_input_and_output_nodes, self.add_mace_input_and_output_nodes,
TransformerRule.ADD_OPENCL_INFORMATIONS:
self.add_opencl_informations,
TransformerRule.SORT_BY_EXECUTION: self.sort_by_execution, TransformerRule.SORT_BY_EXECUTION: self.sort_by_execution,
TransformerRule.CHECK_QUANTIZE_INFO: TransformerRule.CHECK_QUANTIZE_INFO:
self.check_quantize_info, self.check_quantize_info,
...@@ -1269,13 +1271,13 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1269,13 +1271,13 @@ class Transformer(base_converter.ConverterInterface):
return False return False
def buffer_to_image(self, op, input_idx, input_type): def buffer_transform(self, op, input_idx, input_type):
net = self._model net = self._model
input_name = op.input[input_idx] input_name = op.input[input_idx]
op_def = net.op.add() op_def = net.op.add()
op_def.name = input_name.replace(':', '_') + "_b2i" op_def.name = input_name.replace(':', '_') + "_b2i"
output_name = op_def.name output_name = op_def.name
op_def.type = MaceKeyword.mace_buffer_to_image op_def.type = MaceKeyword.mace_buffer_transform
op_def.input.extend([input_name]) op_def.input.extend([input_name])
op_def.output.extend([output_name]) op_def.output.extend([output_name])
...@@ -1307,65 +1309,66 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1307,65 +1309,66 @@ class Transformer(base_converter.ConverterInterface):
self._opencl_max_image_size[1] = max(self._opencl_max_image_size[1], self._opencl_max_image_size[1] = max(self._opencl_max_image_size[1],
img_shape[1]) img_shape[1])
def transform_buffer_image(self): def add_buffer_transform(self):
if self._option.device != DeviceType.GPU.value: if self._option.device != DeviceType.GPU.value:
return False return False
print("Transform buffer to image") print("Add buffer transform op")
net = self._model net = self._model
for op in net.op: for op in net.op:
if op.type == MaceOp.Conv2D.name \ if op.type == MaceOp.Conv2D.name \
or op.type == MaceOp.Deconv2D.name: or op.type == MaceOp.Deconv2D.name:
self.buffer_to_image(op, 1, OpenCLBufferType.CONV2D_FILTER) self.buffer_transform(op, 1, OpenCLBufferType.CONV2D_FILTER)
if len(op.input) >= 3 and op.type == MaceOp.Conv2D.name: if len(op.input) >= 3 and op.type == MaceOp.Conv2D.name:
self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT) self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT)
elif len(op.input) >= 4 and op.type == MaceOp.Deconv2D.name: elif len(op.input) >= 4 and op.type == MaceOp.Deconv2D.name:
self.buffer_to_image(op, 3, OpenCLBufferType.ARGUMENT) self.buffer_transform(op, 3, OpenCLBufferType.ARGUMENT)
elif op.type == MaceOp.DepthwiseConv2d.name: elif op.type == MaceOp.DepthwiseConv2d.name:
self.buffer_to_image(op, 1, OpenCLBufferType.DW_CONV2D_FILTER) self.buffer_transform(op, 1, OpenCLBufferType.DW_CONV2D_FILTER)
if len(op.input) >= 3: if len(op.input) >= 3:
-                self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT)
+                self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT)
             elif op.type == MaceOp.BiasAdd.name:
-                self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT)
+                self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT)
             elif op.type == MaceOp.Eltwise.name and len(op.input) == 2:
                 if op.input[0] in self._consts \
                         and len(self._consts[op.input[0]].dims) == 1:
-                    self.buffer_to_image(op, 0, OpenCLBufferType.ARGUMENT)
+                    self.buffer_transform(op, 0, OpenCLBufferType.ARGUMENT)
                 if op.input[1] in self._consts \
                         and len(self._consts[op.input[1]].dims) == 1:
-                    self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT)
+                    self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT)
             elif op.type == MaceOp.FoldedBatchNorm.name:
-                self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT)
-                self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT)
+                self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT)
+                self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT)
                 if len(op.input) >= 4:
-                    self.buffer_to_image(op, 3, OpenCLBufferType.ARGUMENT)
+                    self.buffer_transform(op, 3, OpenCLBufferType.ARGUMENT)
             elif op.type == MaceOp.MatMul.name and \
                     ConverterUtil.get_arg(op,
                                           MaceKeyword.mace_winograd_filter_transformed) is not None:  # noqa
-                self.buffer_to_image(op, 0, OpenCLBufferType.WINOGRAD_FILTER)
+                self.buffer_transform(op, 0, OpenCLBufferType.WINOGRAD_FILTER)
             elif op.type == MaceOp.WinogradInverseTransform.name \
                     and len(op.input) >= 3:
-                self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT)
+                self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT)
             elif op.type == MaceOp.FullyConnected.name:
-                self.buffer_to_image(op, 1, OpenCLBufferType.WEIGHT_WIDTH)
+                self.buffer_transform(op, 1, OpenCLBufferType.WEIGHT_WIDTH)
                 if len(op.input) >= 3:
-                    self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT)
+                    self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT)
             elif op.type == MaceOp.Activation.name:
                 if ConverterUtil.get_arg(op,
                                          MaceKeyword.mace_activation_type_str).s == ActivationType.PRELU.name:  # noqa
-                    self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT)
+                    self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT)
             elif op.type == MaceOp.LSTMCell.name:
                 if op.input[1] in self._consts:
-                    self.buffer_to_image(op, 1,
-                                         OpenCLBufferType.IN_OUT_CHANNEL)
-                self.buffer_to_image(op, 2, OpenCLBufferType.IN_OUT_CHANNEL)
-                self.buffer_to_image(op, 3, OpenCLBufferType.ARGUMENT)
+                    self.buffer_transform(op, 1,
+                                          OpenCLBufferType.IN_OUT_CHANNEL)
+                self.buffer_transform(op, 2, OpenCLBufferType.IN_OUT_CHANNEL)
+                self.buffer_transform(op, 3, OpenCLBufferType.ARGUMENT)
                 if op.input[4] in self._consts:
-                    self.buffer_to_image(op, 4,
-                                         OpenCLBufferType.IN_OUT_CHANNEL)
+                    self.buffer_transform(op, 4,
+                                          OpenCLBufferType.IN_OUT_CHANNEL)

         # Add OpenCL max image size
-        arg = net.arg.add()
-        arg.name = MaceKeyword.mace_opencl_max_image_size
-        arg.ints.extend(self._opencl_max_image_size)
+        if self._option.cl_mem_type == "image":
+            arg = net.arg.add()
+            arg.name = MaceKeyword.mace_opencl_max_image_size
+            arg.ints.extend(self._opencl_max_image_size)
...
@@ -1376,7 +1379,7 @@ class Transformer(base_converter.ConverterInterface):
             op_def = self._model.op.add()
             op_def.name = self.normalize_op_name(input_node.name)
-            op_def.type = MaceKeyword.mace_buffer_to_image
+            op_def.type = MaceKeyword.mace_buffer_transform
             op_def.input.extend([new_input_name])
             op_def.output.extend([input_node.name])
             output_shape = op_def.output_shape.add()
...
@@ -1394,7 +1397,7 @@ class Transformer(base_converter.ConverterInterface):
                 + '_' + output_node.name
             op_def = self._model.op.add()
             op_def.name = self.normalize_op_name(output_name)
-            op_def.type = MaceKeyword.mace_image_to_buffer
+            op_def.type = MaceKeyword.mace_buffer_inverse_transform
             op_def.input.extend([output_node.name])
             op_def.output.extend([output_name])
             if output_node.shape:
...
@@ -1920,3 +1923,16 @@ class Transformer(base_converter.ConverterInterface):
                         and op.type != MaceOp.Dequantize.name):  # noqa
                     mace_check(len(op.output) == len(op.quantize_info),
                                "missing quantize info: %s" % op)
+
+    def add_opencl_informations(self):
+        if self._option.device != DeviceType.GPU.value:
+            return False
+
+        print("Add OpenCL informations")
+        net = self._model
+        arg = net.arg.add()
+        arg.name = MaceKeyword.mace_opencl_mem_type
+        arg.i = mace_pb2.GPU_IMAGE if self._option.cl_mem_type == "image" \
+            else mace_pb2.GPU_BUFFER
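Note: the mace_opencl_mem_type value written here is an ordinary integer argument on the NetDef, so any later pass can recover it by name. Below is a minimal, self-contained sketch of that lookup; ArgStub, NetDefStub, and the enum values are hypothetical stand-ins for the real mace_pb2 messages, and the real code uses ConverterUtil.get_arg, as the memory-optimizer hunk further down shows.

GPU_IMAGE, GPU_BUFFER = 0, 1  # assumed values, for illustration only


class ArgStub(object):
    def __init__(self, name, i):
        self.name = name
        self.i = i


class NetDefStub(object):
    def __init__(self, args):
        self.arg = args


def get_arg(net_def, name):
    # Linear scan by argument name, mirroring ConverterUtil.get_arg.
    for arg in net_def.arg:
        if arg.name == name:
            return arg
    return None


net = NetDefStub([ArgStub('opencl_mem_type', GPU_BUFFER)])
mem_type_arg = get_arg(net, 'opencl_mem_type')
cl_mem_type = mem_type_arg.i if mem_type_arg is not None else None
print(cl_mem_type)  # 1, i.e. GPU_BUFFER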
...
@@ -18,6 +18,8 @@ from mace.proto import mace_pb2
 from mace.python.tools.converter_tool import base_converter as cvt
 from mace.python.tools.converter_tool.base_converter import DeviceType
+from mace.python.tools.converter_tool.base_converter import ConverterUtil
+from mace.python.tools.converter_tool.base_converter import MaceKeyword
 from mace.python.tools.convert_util import calculate_image_shape
 from mace.python.tools.convert_util import OpenCLBufferType
...
@@ -56,6 +58,10 @@ class MemoryOptimizer(object):
         self.total_mem_count = 0
         self.input_ref_counter = {}
         self.mem_ref_counter = {}
+        ocl_mem_type_arg = ConverterUtil.get_arg(
+            net_def, MaceKeyword.mace_opencl_mem_type)
+        self.cl_mem_type = ocl_mem_type_arg.i if ocl_mem_type_arg is not None \
+            else None

         consumers = {}
         for op in net_def.op:
...
@@ -223,13 +229,13 @@ class MemoryOptimizer(object):
 class GPUMemoryOptimizer(MemoryOptimizer):
     def op_need_optimize_memory(self, op):
-        if op.type == 'BufferToImage':
+        if op.type == MaceKeyword.mace_buffer_transform:
             for arg in op.arg:
                 if arg.name == 'mode' and arg.i == 0:
                     return False
-        return op.type != 'ImageToBuffer'
+        return op.type != MaceKeyword.mace_buffer_inverse_transform

-    def get_op_mem_block(self, op_type, output_shape, output_type):
+    def get_op_image_mem_block(self, op_type, output_shape):
         if op_type == 'WinogradTransform' or op_type == 'MatMul':
             buffer_shape = list(output_shape) + [1]
             mem_block = MemoryBlock(
...
@@ -264,6 +270,16 @@ class GPUMemoryOptimizer(MemoryOptimizer):
                     buffer_shape))
         return mem_block

+    def get_op_buffer_mem_block(self, output_shape):
+        return MemoryBlock(mace_pb2.GPU_BUFFER,
+                           [reduce(operator.mul, output_shape, 1), 1])
+
+    def get_op_mem_block(self, op_type, output_shape, output_type):
+        if self.cl_mem_type == mace_pb2.GPU_IMAGE:
+            return self.get_op_image_mem_block(op_type, output_shape)
+        else:
+            return self.get_op_buffer_mem_block(output_shape)
+
     def mem_size(self, memory_block):
         if memory_block.mem_type == mace_pb2.GPU_IMAGE:
             return memory_block.block[0] * memory_block.block[1] * 4
...
@@ -295,6 +311,7 @@ class GPUMemoryOptimizer(MemoryOptimizer):
             max_image_size_x = max(max_image_size_x, block.x)
             max_image_size_y = max(max_image_size_y, block.y)

-        # Update OpenCL max image size
-        net_ocl_max_img_size_arg = None
-        for arg in self.net_def.arg:
+        if self.cl_mem_type == mace_pb2.GPU_IMAGE:
+            # Update OpenCL max image size
+            net_ocl_max_img_size_arg = None
+            for arg in self.net_def.arg:
...
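The two sizing schemes above differ in shape and units: an image block is a 2-D extent in pixels where each pixel packs 4 channel values, while a buffer block is simply the flattened element count of the output tensor. A small, self-contained sketch of the arithmetic follows; the function names are illustrative, not MACE's API.

import operator
from functools import reduce  # `reduce` is a builtin in Python 2, which these tools target


def buffer_block(output_shape):
    # GPU_BUFFER: [total element count, 1], as in get_op_buffer_mem_block.
    return [reduce(operator.mul, output_shape, 1), 1]


def image_mem_size(block):
    # GPU_IMAGE: width * height pixels, 4 values per pixel (RGBA channels),
    # matching the factor of 4 in mem_size above.
    return block[0] * block[1] * 4


print(buffer_block([1, 224, 224, 3]))  # [150528, 1]
print(image_mem_size([224, 224]))      # 200704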
...
@@ -69,7 +69,7 @@ void BufferToImage(const std::string &input_name,
                    const int mode = NetMode::NORMAL) {
   OperatorDef operator_def;
-  ops::test::OpDefBuilder("BufferToImage", "BufferToImageOp")
+  ops::test::OpDefBuilder("BufferTransform", "BufferTransformOp")
       .Input(input_name)
       .Output(output_name)
       .AddIntArg("buffer_type", buffer_type)
...
@@ -93,7 +93,7 @@ void ImageToBuffer(const std::string &input_name,
                    NetDef *net_def) {
   OperatorDef operator_def;
-  ops::test::OpDefBuilder("ImageToBuffer", "ImageToBufferOp")
+  ops::test::OpDefBuilder("BufferInverseTransform", "BufferInverseTransformOp")
       .Input(input_name)
       .Output(output_name)
       .AddIntArg("buffer_type", buffer_type)
...
...
@@ -70,7 +70,7 @@ void BufferToImage(const std::string &input_name,
                    const int mode = NetMode::NORMAL) {
   OperatorDef operator_def;
-  ops::test::OpDefBuilder("BufferToImage", "BufferToImageOp")
+  ops::test::OpDefBuilder("BufferTransform", "BufferTransformOp")
       .Input(input_name)
       .Output(output_name)
       .AddIntArg("buffer_type", buffer_type)
...
@@ -95,7 +95,7 @@ void ImageToBuffer(const std::string &input_name,
                    NetDef *net_def) {
   OperatorDef operator_def;
-  ops::test::OpDefBuilder("ImageToBuffer", "ImageToBufferOp")
+  ops::test::OpDefBuilder("BufferInverseTransform", "BufferInverseTransformOp")
       .Input(input_name)
       .Output(output_name)
       .AddIntArg("buffer_type", buffer_type)
...
...
@@ -33,6 +33,12 @@ namespace mace {
   CLASSNAME &operator=(const CLASSNAME &) = delete
 #endif

+#ifndef MACE_VIRTUAL_EMPTY_DESTRUCTOR
+#define MACE_VIRTUAL_EMPTY_DESTRUCTOR(CLASSNAME) \
+ public:                                         \
+  virtual ~CLASSNAME() {}
+#endif
+
 template <typename Integer>
 Integer RoundUp(Integer i, Integer factor) {
   return (i + factor - 1) / factor * factor;
...
...
@@ -23,30 +23,38 @@ def _opencl_encrypt_kernel_impl(repository_ctx):
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/activation.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/addn.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/batch_norm.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/batch_to_space.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/bias_add.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/buffer_to_image.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/buffer_transform.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/channel_shuffle.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/common.h"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/concat.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d_1x1.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d_1x1_buffer.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d_3x3.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d_buffer.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/crop.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/deconv_2d.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/depth_to_space.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/depthwise_conv2d.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/depthwise_conv2d_buffer.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/eltwise.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/fully_connected.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/lstmcell.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/matmul.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/pad.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/pooling.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/pooling_buffer.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/reduce_mean.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/resize_bicubic.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/resize_bilinear.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/split.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/softmax.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/softmax_buffer.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/space_to_batch.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/space_to_depth.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/winograd_transform.cl"))

     python_bin_path = repository_ctx.which("python")
...
...
@@ -203,6 +203,7 @@ class YAMLKeyword(object):
     validation_inputs_data = 'validation_inputs_data'
     validation_threshold = 'validation_threshold'
     graph_optimize_options = 'graph_optimize_options'  # internal use for now
+    cl_mem_type = 'cl_mem_type'


 class ModuleName(object):
...
@@ -692,7 +693,7 @@ def get_model_files(model_file_path,
     return model_file, weight_file


-def convert_model(configs):
+def convert_model(configs, cl_mem_type):
     # Remove previous output dirs
     library_name = configs[YAMLKeyword.library_name]
     if not os.path.exists(BUILD_OUTPUT_DIR):
...
@@ -735,6 +736,10 @@ def convert_model(configs):
             StringFormatter.block("Convert %s model" % model_name))
         model_config = configs[YAMLKeyword.models][model_name]
         runtime = model_config[YAMLKeyword.runtime]
+        if cl_mem_type:
+            model_config[YAMLKeyword.cl_mem_type] = cl_mem_type
+        else:
+            model_config[YAMLKeyword.cl_mem_type] = "image"

         model_file_path, weight_file_path = get_model_files(
             model_config[YAMLKeyword.model_file_path],
...
@@ -769,6 +774,7 @@ def convert_model(configs):
             model_config[YAMLKeyword.obfuscate],
             configs[YAMLKeyword.model_graph_format],
             data_type,
+            model_config[YAMLKeyword.cl_mem_type],
             ",".join(model_config.get(YAMLKeyword.graph_optimize_options, [])))

         if configs[YAMLKeyword.model_graph_format] == ModelFormat.file:
...
@@ -844,7 +850,7 @@ def convert_func(flags):
     print_configuration(configs)

-    convert_model(configs)
+    convert_model(configs, flags.cl_mem_type)

     if configs[YAMLKeyword.model_graph_format] == ModelFormat.code:
         build_model_lib(configs, flags.address_sanitizer)
...
@@ -1683,6 +1689,11 @@ def parse_args():
         'convert',
         parents=[all_type_parent_parser, convert_run_parent_parser],
         help='convert to mace model (file or code)')
+    convert.add_argument(
+        "--cl_mem_type",
+        type=str,
+        default=None,
+        help="Which type of OpenCL memory type to use [image | buffer].")
     convert.set_defaults(func=convert_func)
     run = subparsers.add_parser(
         'run',
...
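Because the new --cl_mem_type flag defaults to None, convert_model can distinguish "not given" from an explicit choice and fall back to the image path. A self-contained sketch of that precedence follows; resolve_cl_mem_type is an illustrative helper, not part of the tool.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--cl_mem_type",
    type=str,
    default=None,
    help="Which type of OpenCL memory type to use [image | buffer].")


def resolve_cl_mem_type(flag_value):
    # Mirrors convert_model above: an explicit flag wins, otherwise "image".
    return flag_value if flag_value else "image"


args = parser.parse_args(["--cl_mem_type", "buffer"])
print(resolve_cl_mem_type(args.cl_mem_type))  # buffer
print(resolve_cl_mem_type(None))              # image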
...
@@ -340,7 +340,7 @@ def bazel_build(target,
                 enable_neon=True,
                 enable_opencl=True,
                 address_sanitizer=False,
-                symbol_hidden=False,
+                symbol_hidden=True,
                 extra_args=""):
     print("* Build %s with ABI %s" % (target, abi))
     if abi == "host":
...
@@ -560,6 +560,7 @@ def gen_model_code(model_codegen_dir,
                    obfuscate,
                    model_graph_format,
                    data_type,
+                   cl_mem_type,
                    graph_optimize_options):
     bazel_build_common("//mace/python/tools:converter")
...
@@ -591,6 +592,7 @@ def gen_model_code(model_codegen_dir,
         "--model_graph_format=%s" % model_graph_format,
         "--data_type=%s" % data_type,
         "--graph_optimize_options=%s" % graph_optimize_options,
+        "--cl_mem_type=%s" % cl_mem_type,
         _fg=True)
...
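End to end, the option travels from the command line through gen_model_code into the converter invocation as one more --key=value argument. A sketch of that assembly step; build_converter_args is a hypothetical stand-in for the sh.python call the real tool uses.

def build_converter_args(model_graph_format, data_type,
                         graph_optimize_options, cl_mem_type):
    # The converter binary receives the memory type like any other option.
    return [
        "--model_graph_format=%s" % model_graph_format,
        "--data_type=%s" % data_type,
        "--graph_optimize_options=%s" % graph_optimize_options,
        "--cl_mem_type=%s" % cl_mem_type,
    ]


print(build_converter_args("file", "fp16_fp32", "", "buffer"))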