提交 e2a40a03 编写于 作者: 刘琦

Merge branch 'gpu-buffer' into 'master'

Refactor OpenCL kernel for supporting buffer.

See merge request !803
......@@ -8,6 +8,7 @@ stages:
- ops_test
- api_test
- python_tools_tests
- model_tests
- build_android_demo
- ops_benchmark
- extra_tests
......@@ -113,6 +114,18 @@ python_tools_tests:
python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --example --target_abis=armeabi-v7a --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
model_tests:
stage: model_tests
script:
- pwd
- rm -rf mace-models
- GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git
- CONF_FILE=mace-models/mobilenet-v1/mobilenet-v1.yml
- >
python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1;
python tools/converter.py run --config=${CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --example --target_abis=armeabi-v7a --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
- CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml
- >
python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1;
......
......@@ -14,6 +14,8 @@
#include "mace/core/device.h"
#include "mace/core/buffer.h"
namespace mace {
CPUDevice::CPUDevice(const int num_threads,
......@@ -21,7 +23,8 @@ CPUDevice::CPUDevice(const int num_threads,
const bool use_gemmlowp)
: cpu_runtime_(new CPURuntime(num_threads,
policy,
use_gemmlowp)) {}
use_gemmlowp)),
scratch_buffer_(new ScratchBuffer(GetCPUAllocator())) {}
CPUDevice::~CPUDevice() = default;
......@@ -31,6 +34,7 @@ CPURuntime *CPUDevice::cpu_runtime() {
#ifdef MACE_ENABLE_OPENCL
OpenCLRuntime *CPUDevice::opencl_runtime() {
LOG(FATAL) << "CPU device should not call OpenCL Runtime";
return nullptr;
}
#endif
......@@ -43,4 +47,8 @@ DeviceType CPUDevice::device_type() const {
return DeviceType::CPU;
}
ScratchBuffer *CPUDevice::scratch_buffer() {
return scratch_buffer_.get();
}
} // namespace mace
......@@ -26,6 +26,8 @@
namespace mace {
class ScratchBuffer;
class Device {
public:
virtual ~Device() {}
......@@ -37,6 +39,7 @@ class Device {
virtual Allocator *allocator() = 0;
virtual DeviceType device_type() const = 0;
virtual ScratchBuffer *scratch_buffer() = 0;
};
class CPUDevice : public Device {
......@@ -53,9 +56,11 @@ class CPUDevice : public Device {
Allocator *allocator() override;
DeviceType device_type() const override;
ScratchBuffer *scratch_buffer() override;
private:
std::unique_ptr<CPURuntime> cpu_runtime_;
std::unique_ptr<ScratchBuffer> scratch_buffer_;
};
} // namespace mace
......
......@@ -15,7 +15,9 @@
#ifndef MACE_CORE_FUTURE_H_
#define MACE_CORE_FUTURE_H_
#include <algorithm>
#include <functional>
#include <vector>
#include "mace/utils/logging.h"
......@@ -25,9 +27,7 @@ struct CallStats;
// Wait the call to finish and get the stats if param is not nullptr
struct StatsFuture {
std::function<void(CallStats *)> wait_fn = [](CallStats *) {
LOG(FATAL) << "wait_fn must be properly set";
};
std::function<void(CallStats *)> wait_fn;
};
inline void SetFutureDefaultWaitFn(StatsFuture *future) {
......@@ -41,6 +41,29 @@ inline void SetFutureDefaultWaitFn(StatsFuture *future) {
}
}
inline void MergeMultipleFutureWaitFn(
const std::vector<StatsFuture> &org_futures,
StatsFuture *dst_future) {
if (dst_future != nullptr) {
dst_future->wait_fn = [org_futures](CallStats *stats) {
if (stats != nullptr) {
stats->start_micros = INT64_MAX;
stats->end_micros = 0;
for (auto &org_future : org_futures) {
CallStats tmp_stats;
if (org_future.wait_fn != nullptr) {
org_future.wait_fn(&tmp_stats);
stats->start_micros = std::min(stats->start_micros,
tmp_stats.start_micros);
stats->end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
}
}
stats->end_micros += stats->start_micros;
}
};
}
}
} // namespace mace
#endif // MACE_CORE_FUTURE_H_
......@@ -14,6 +14,8 @@
#include "mace/core/runtime/opencl/gpu_device.h"
#include "mace/core/buffer.h"
namespace mace {
GPUDevice::GPUDevice(Tuner<uint32_t> *tuner,
......@@ -27,7 +29,8 @@ GPUDevice::GPUDevice(Tuner<uint32_t> *tuner,
CPUDevice(num_threads, cpu_affinity_policy, use_gemmlowp),
runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf,
opencl_binary_storage, tuner)),
allocator_(new OpenCLAllocator(runtime_.get())) {}
allocator_(new OpenCLAllocator(runtime_.get())),
scratch_buffer_(new ScratchBuffer(allocator_.get())) {}
GPUDevice::~GPUDevice() = default;
......@@ -43,4 +46,8 @@ DeviceType GPUDevice::device_type() const {
return DeviceType::GPU;
}
ScratchBuffer *GPUDevice::scratch_buffer() {
return scratch_buffer_.get();
}
} // namespace mace
......@@ -37,9 +37,11 @@ class GPUDevice : public CPUDevice {
OpenCLRuntime *opencl_runtime() override;
Allocator *allocator() override;
DeviceType device_type() const override;
ScratchBuffer *scratch_buffer() override;
private:
std::unique_ptr<OpenCLRuntime> runtime_;
std::unique_ptr<OpenCLAllocator> allocator_;
std::unique_ptr<ScratchBuffer> scratch_buffer_;
};
} // namespace mace
......
......@@ -31,8 +31,6 @@
namespace mace {
std::string kOpenCLParameterPath; // NOLINT(runtime/string)
extern const std::map<std::string, std::vector<unsigned char>>
kEncryptedProgramMap;
......@@ -286,7 +284,8 @@ OpenCLRuntime::OpenCLRuntime(
is_opencl_avaliable_(false),
is_profiling_enabled_(false),
opencl_version_(CL_VER_UNKNOWN),
gpu_type_(UNKNOWN) {
gpu_type_(UNKNOWN),
mem_type_(MemoryType::GPU_IMAGE) {
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
......@@ -471,6 +470,14 @@ uint32_t OpenCLRuntime::device_compute_units() const {
return device_compute_units_;
}
bool OpenCLRuntime::UseImageMemory() {
return this->mem_type_ == MemoryType::GPU_IMAGE;
}
void OpenCLRuntime::set_mem_type(MemoryType type) {
this->mem_type_ = type;
}
bool OpenCLRuntime::BuildProgramFromCache(
const std::string &built_program_key,
const std::string &build_options_str,
......
......@@ -25,6 +25,7 @@
#include "mace/core/file_storage.h"
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/proto/mace.pb.h"
#include "mace/utils/string_util.h"
#include "mace/utils/timer.h"
#include "mace/utils/tuner.h"
......@@ -82,6 +83,9 @@ class OpenCLRuntime {
uint32_t device_compute_units() const;
Tuner<uint32_t> *tuner();
bool is_opencl_avaliable();
// TODO(liuqi): remove this function in the future, make decision at runtime.
bool UseImageMemory();
void set_mem_type(MemoryType type);
void GetCallStats(const cl::Event &event, CallStats *stats);
uint64_t GetDeviceMaxWorkGroupSize();
......@@ -129,6 +133,7 @@ class OpenCLRuntime {
bool is_profiling_enabled_;
OpenCLVersion opencl_version_;
GPUType gpu_type_;
MemoryType mem_type_;
// All OpenCL object must be a pointer and manually deleted before unloading
// OpenCL library.
std::shared_ptr<cl::Context> context_;
......
......@@ -101,13 +101,14 @@ enum DataFormat { NHWC = 0, NCHW = 1, HWOI = 2, OIHW = 3, HWIO = 4, OHWI = 5 };
class Tensor {
public:
Tensor(Allocator *alloc, DataType type,
bool is_weight = false)
bool is_weight = false,
const std::string name = "")
: allocator_(alloc),
dtype_(type),
buffer_(nullptr),
is_buffer_owner_(true),
unused_(false),
name_(""),
name_(name),
is_weight_(is_weight),
scale_(0.f),
zero_point_(0),
......@@ -115,12 +116,13 @@ class Tensor {
maxval_(0.f) {}
Tensor(BufferBase *buffer, DataType dtype,
bool is_weight = false)
bool is_weight = false,
const std::string name = "")
: dtype_(dtype),
buffer_(buffer),
is_buffer_owner_(false),
unused_(false),
name_(""),
name_(name),
is_weight_(is_weight),
scale_(0.f),
zero_point_(0),
......@@ -129,12 +131,13 @@ class Tensor {
Tensor(const BufferSlice &buffer_slice,
DataType dtype,
bool is_weight = false)
bool is_weight = false,
const std::string name = "")
: dtype_(dtype),
buffer_slice_(buffer_slice),
is_buffer_owner_(false),
unused_(false),
name_(""),
name_(name),
is_weight_(is_weight),
scale_(0.f),
zero_point_(0),
......@@ -152,6 +155,8 @@ class Tensor {
}
}
inline std::string name() const { return name_; }
inline DataType dtype() const { return dtype_; }
inline void SetDtype(DataType dtype) { dtype_ = dtype; }
......@@ -188,11 +193,15 @@ class Tensor {
shape_configured_ = shape_configured;
}
inline const std::vector<index_t> &buffer_shape() const {
return buffer_shape_;
}
inline index_t dim_size() const { return shape_.size(); }
inline index_t dim(unsigned int index) const {
MACE_CHECK(index < shape_.size(), "Dim out of range: ", index, " >= ",
shape_.size());
MACE_CHECK(index < shape_.size(),
name_, ": Dim out of range: ", index, " >= ", shape_.size());
return shape_[index];
}
......@@ -214,12 +223,12 @@ class Tensor {
#ifdef MACE_ENABLE_OPENCL
inline cl::Image *opencl_image() const {
MACE_CHECK(has_opencl_image(), "do not have image");
MACE_CHECK(has_opencl_image(), name_, " do not have image");
return static_cast<cl::Image *>(buffer_->buffer());
}
inline cl::Buffer *opencl_buffer() const {
MACE_CHECK(has_opencl_buffer(), "do not have opencl buffer");
MACE_CHECK(has_opencl_buffer(), name_, " do not have opencl buffer");
return static_cast<cl::Buffer *>(buffer_->buffer());
}
#endif
......@@ -268,12 +277,14 @@ class Tensor {
inline MaceStatus Resize(const std::vector<index_t> &shape) {
shape_ = shape;
buffer_shape_ = shape;
image_shape_.clear();
if (buffer_ != nullptr) {
MACE_CHECK(!has_opencl_image(), "Cannot resize image, use ResizeImage.");
MACE_CHECK(!has_opencl_image(),
name_, ": Cannot resize image, use ResizeImage.");
if (raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE > buffer_->size()) {
LOG(WARNING) << "Resize buffer from size " << buffer_->size() << " to "
<< raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE;
LOG(WARNING) << name_ << ": Resize buffer from size " << buffer_->size()
<< " to " << raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE;
return buffer_->Resize(raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE);
}
return MaceStatus::MACE_SUCCESS;
......@@ -296,19 +307,22 @@ class Tensor {
allocator_ = other.allocator_;
dtype_ = other.dtype_;
shape_ = other.shape_;
buffer_shape_ = other.buffer_shape_;
image_shape_ = other.image_shape_;
}
inline MaceStatus ResizeImage(const std::vector<index_t> &shape,
const std::vector<size_t> &image_shape) {
shape_ = shape;
buffer_shape_ = shape;
image_shape_ = image_shape;
if (buffer_ == nullptr) {
MACE_CHECK(is_buffer_owner_);
buffer_ = new Image(allocator_);
return buffer_->Allocate(image_shape, dtype_);
} else {
MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");
MACE_CHECK(has_opencl_image(),
name_, ": Cannot ResizeImage buffer, use Resize.");
Image *image = dynamic_cast<Image *>(buffer_);
MACE_CHECK(image_shape[0] <= image->image_shape()[0] &&
image_shape[1] <= image->image_shape()[1],
......@@ -366,8 +380,6 @@ class Tensor {
inline BufferBase *UnderlyingBuffer() const { return buffer_; }
inline void SetSourceOpName(const std::string name) { name_ = name; }
inline void DebugPrint() const {
using namespace numerical_chars; // NOLINT(build/namespaces)
std::stringstream os;
......@@ -459,9 +471,12 @@ class Tensor {
private:
Allocator *allocator_;
DataType dtype_;
// the shape of buffer(logical)
std::vector<index_t> shape_;
std::vector<index_t> shape_configured_;
std::vector<size_t> image_shape_;
// the shape of buffer(physical storage)
std::vector<index_t> buffer_shape_;
BufferBase *buffer_;
BufferSlice buffer_slice_;
bool is_buffer_owner_;
......
......@@ -44,8 +44,7 @@ bool HasQuantizeOp(const NetDef &net_def) {
}
} // namespace
Workspace::Workspace() :
host_scratch_buffer_(new ScratchBuffer(GetCPUAllocator())) {}
Workspace::Workspace() = default;
Tensor *Workspace::CreateTensor(const std::string &name,
Allocator *alloc,
......@@ -54,8 +53,8 @@ Tensor *Workspace::CreateTensor(const std::string &name,
VLOG(3) << "Tensor " << name << " already exists. Skipping.";
} else {
VLOG(3) << "Creating Tensor " << name;
tensor_map_[name] = std::unique_ptr<Tensor>(new Tensor(alloc, type));
tensor_map_[name]->SetSourceOpName(name);
tensor_map_[name] = std::unique_ptr<Tensor>(new Tensor(alloc, type,
false, name));
}
return GetTensor(name);
}
......@@ -171,7 +170,10 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
tensor_buffer_.get(), const_tensor.offset(),
const_tensor.data_size() *
GetEnumTypeSize(const_tensor.data_type())),
const_tensor.data_type(), true));
const_tensor.data_type(),
true,
const_tensor.name()));
tensor->Reshape(dims);
tensor->SetScale(const_tensor.scale());
tensor->SetZeroPoint(const_tensor.zero_point());
......@@ -275,7 +277,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(device->allocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype)));
mem_block.x() * GetEnumTypeSize(dtype)
+ MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
}
......@@ -301,10 +304,9 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
}
std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]),
output_type));
tensor->SetSourceOpName(op.name());
if (device_type == DeviceType::GPU) {
VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
output_type, false, op.output(i)));
if (device_type == DeviceType::GPU && tensor->has_opencl_image()) {
VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")"
<< " Mem: " << mem_ids[i]
<< " Image shape: "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
......@@ -312,8 +314,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
<< ", "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[1];
} else if (device_type == DeviceType::CPU) {
VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
} else {
VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")"
<< " Mem: " << mem_ids[i]
<< ", Buffer size: " << tensor->UnderlyingBuffer()->size();
}
......@@ -356,14 +358,6 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
return MaceStatus::MACE_SUCCESS;
}
ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) {
if (device_type == CPU) {
return host_scratch_buffer_.get();
} else {
return nullptr;
}
}
void Workspace::RemoveUnusedBuffer() {
auto iter = tensor_map_.begin();
auto end_iter = tensor_map_.end();
......
......@@ -52,8 +52,6 @@ class Workspace {
Device *device,
const unsigned char *model_data);
ScratchBuffer *GetScratchBuffer(DeviceType device_type);
void RemoveUnusedBuffer();
void RemoveAndReloadBuffer(const NetDef &net_def,
......@@ -64,15 +62,12 @@ class Workspace {
MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
Device *device);
Device *device_;
TensorMap tensor_map_;
std::unique_ptr<BufferBase> tensor_buffer_;
PreallocatedPooledAllocator preallocated_allocator_;
std::unique_ptr<ScratchBuffer> host_scratch_buffer_;
bool fused_buffer_;
MACE_DISABLE_COPY_AND_ASSIGN(Workspace);
......
......@@ -32,6 +32,8 @@ cc_library(
) + if_opencl_enabled(glob(
[
"opencl/*.cc",
"opencl/image/*.cc",
"opencl/buffer/*.cc",
],
exclude = [
"opencl/*_test.cc",
......@@ -43,14 +45,16 @@ cc_library(
"arm/*.h",
],
exclude = [
"buffer_to_image.h",
"image_to_buffer.h",
"buffer_transform.h",
"buffer_inverse_transform.h",
"lstmcell.h",
],
) + if_opencl_enabled(glob([
"opencl/*.h",
"buffer_to_image.h",
"image_to_buffer.h",
"opencl/image/*.h",
"opencl/buffer/*.h",
"buffer_transform.h",
"buffer_inverse_transform.h",
"lstmcell.h",
])),
copts = [
......
......@@ -26,10 +26,6 @@
#include "mace/core/types.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -164,15 +160,22 @@ class ActivationFunctor<DeviceType::CPU, float> : OpKernel {
};
#ifdef MACE_ENABLE_OPENCL
class OpenCLActivationKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLActivationKernel);
};
template <typename T>
class ActivationFunctor<DeviceType::GPU, T> : OpKernel {
public:
ActivationFunctor(OpKernelContext *context,
ActivationType type,
T relux_max_limit)
: OpKernel(context),
activation_(type),
relux_max_limit_(static_cast<T>(relux_max_limit)) {}
T relux_max_limit);
MaceStatus operator()(const Tensor *input,
const Tensor *alpha,
......@@ -180,13 +183,7 @@ class ActivationFunctor<DeviceType::GPU, T> : OpKernel {
StatsFuture *future);
private:
ActivationType activation_;
T relux_max_limit_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::string tuning_key_prefix_;
std::vector<index_t> input_shape_;
std::unique_ptr<OpenCLActivationKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
......@@ -26,10 +26,6 @@
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -96,17 +92,23 @@ struct AddNFunctor : OpKernel {
};
#ifdef MACE_ENABLE_OPENCL
class OpenCLAddNKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLAddNKernel);
};
template <typename T>
struct AddNFunctor<DeviceType::GPU, T> : OpKernel {
explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {}
explicit AddNFunctor(OpKernelContext *context);
MaceStatus operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
std::unique_ptr<OpenCLAddNKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
......@@ -26,41 +26,22 @@
#include "mace/kernels/activation.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
struct BatchNormFunctorBase : OpKernel {
BatchNormFunctorBase(OpKernelContext *context,
bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: OpKernel(context),
folded_constant_(folded_constant),
activation_(activation),
relux_max_limit_(relux_max_limit) {}
const bool folded_constant_;
const ActivationType activation_;
const float relux_max_limit_;
};
template<DeviceType D, typename T>
struct BatchNormFunctor;
template<>
struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
struct BatchNormFunctor<DeviceType::CPU, float> : OpKernel {
BatchNormFunctor(OpKernelContext *context,
const bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: BatchNormFunctorBase(context,
folded_constant,
activation,
relux_max_limit) {}
: OpKernel(context),
folded_constant_(folded_constant),
activation_(activation),
relux_max_limit_(relux_max_limit) {}
MaceStatus operator()(const Tensor *input,
const Tensor *scale,
......@@ -133,19 +114,33 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
return MACE_SUCCESS;
}
const bool folded_constant_;
const ActivationType activation_;
const float relux_max_limit_;
};
#ifdef MACE_ENABLE_OPENCL
class OpenCLBatchNormKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
const float epsilon,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBatchNormKernel);
};
template<typename T>
struct BatchNormFunctor<DeviceType::GPU, T> : BatchNormFunctorBase {
struct BatchNormFunctor<DeviceType::GPU, T> : OpKernel {
BatchNormFunctor(OpKernelContext *context,
const bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: BatchNormFunctorBase(context,
folded_constant,
activation,
relux_max_limit) {}
const float relux_max_limit);
MaceStatus operator()(const Tensor *input,
const Tensor *scale,
const Tensor *offset,
......@@ -154,10 +149,7 @@ struct BatchNormFunctor<DeviceType::GPU, T> : BatchNormFunctorBase {
const float epsilon,
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
std::unique_ptr<OpenCLBatchNormKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
......@@ -24,10 +24,6 @@
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -51,7 +47,8 @@ struct BatchToSpaceFunctorBase : OpKernel {
void CalculateBatchToSpaceOutputShape(const Tensor *input_tensor,
const DataFormat data_format,
index_t *output_shape) {
MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D");
MACE_CHECK(input_tensor->dim_size() == 4,
"Input(", input_tensor->name(), ") shape should be 4D");
index_t batch = input_tensor->dim(0);
index_t channels = 0;
index_t height = 0;
......@@ -96,8 +93,8 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
const std::vector<int> &block_shape)
: BatchToSpaceFunctorBase(context, paddings, block_shape) {}
MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor,
MaceStatus operator()(const Tensor *batch_tensor,
Tensor *space_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
......@@ -191,8 +188,8 @@ struct BatchToSpaceFunctor<CPU, uint8_t> : BatchToSpaceFunctorBase {
const std::vector<int> &block_shape)
: BatchToSpaceFunctorBase(context, paddings, block_shape) {}
MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor,
MaceStatus operator()(const Tensor *batch_tensor,
Tensor *space_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
......@@ -272,21 +269,29 @@ struct BatchToSpaceFunctor<CPU, uint8_t> : BatchToSpaceFunctorBase {
};
#ifdef MACE_ENABLE_OPENCL
class OpenCLBatchToSpaceKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *batch_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *space_tensor,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBatchToSpaceKernel);
};
template <typename T>
struct BatchToSpaceFunctor<DeviceType::GPU, T> : BatchToSpaceFunctorBase {
BatchToSpaceFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: BatchToSpaceFunctorBase(context, paddings, block_shape) {}
const std::vector<int> &block_shape);
MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor,
MaceStatus operator()(const Tensor *batch_tensor,
Tensor *space_tensor,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> space_shape_;
std::unique_ptr<OpenCLBatchToSpaceKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
......@@ -24,10 +24,6 @@
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -96,18 +92,26 @@ struct BiasAddFunctor<DeviceType::CPU, float> : BiasAddFunctorBase {
};
#ifdef MACE_ENABLE_OPENCL
class OpenCLBiasAddKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBiasAddKernel);
};
template <typename T>
struct BiasAddFunctor<DeviceType::GPU, T> : BiasAddFunctorBase {
BiasAddFunctor(OpKernelContext *context, const DataFormat data_format)
: BiasAddFunctorBase(context, data_format) {}
BiasAddFunctor(OpKernelContext *context, const DataFormat data_format);
MaceStatus operator()(const Tensor *input,
const Tensor *bias,
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
std::unique_ptr<OpenCLBiasAddKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_BUFFER_TO_IMAGE_H_
#define MACE_KERNELS_BUFFER_TO_IMAGE_H_
#ifndef MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_
#define MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_
#include <memory>
#include <vector>
......@@ -26,18 +26,19 @@
namespace mace {
namespace kernels {
struct BufferToImageFunctorBase : OpKernel {
explicit BufferToImageFunctorBase(OpKernelContext *context,
struct BufferInverseTransformFunctorBase : OpKernel {
BufferInverseTransformFunctorBase(OpKernelContext *context,
const int wino_blk_size)
: OpKernel(context), wino_blk_size_(wino_blk_size) {}
: OpKernel(context),
wino_blk_size_(wino_blk_size) {}
const int wino_blk_size_;
};
template <DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase {
explicit BufferToImageFunctor(OpKernelContext *context,
struct BufferInverseTransformFunctor : BufferInverseTransformFunctorBase {
explicit BufferInverseTransformFunctor(OpKernelContext *context,
const int wino_blk_size)
: BufferToImageFunctorBase(context, wino_blk_size) {}
: BufferInverseTransformFunctorBase(context, wino_blk_size) {}
MaceStatus operator()(const Tensor *input,
const BufferType type,
Tensor *output,
......@@ -51,22 +52,31 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
}
};
class OpenCLBufferInverseTransformKernel {
public:
virtual MaceStatus Compute(OpKernelContext *context,
const Tensor *input,
const BufferType type,
const int wino_blk_size,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBufferInverseTransformKernel)
};
template <typename T>
struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase {
explicit BufferToImageFunctor(OpKernelContext *context,
const int wino_blk_size)
: BufferToImageFunctorBase(context, wino_blk_size) {}
struct BufferInverseTransformFunctor<DeviceType::GPU, T>
: BufferInverseTransformFunctorBase {
explicit BufferInverseTransformFunctor(OpKernelContext *context,
const int wino_blk_size);
MaceStatus operator()(const Tensor *input,
const BufferType type,
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
std::unique_ptr<OpenCLBufferInverseTransformKernel> kernel_;
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_BUFFER_TO_IMAGE_H_
#endif // MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_IMAGE_TO_BUFFER_H_
#define MACE_KERNELS_IMAGE_TO_BUFFER_H_
#ifndef MACE_KERNELS_BUFFER_TRANSFORM_H_
#define MACE_KERNELS_BUFFER_TRANSFORM_H_
#include <memory>
#include <vector>
......@@ -26,18 +26,19 @@
namespace mace {
namespace kernels {
struct ImageToBufferFunctorBase : OpKernel {
ImageToBufferFunctorBase(OpKernelContext *context,
struct BufferTransformFunctorBase : OpKernel {
explicit BufferTransformFunctorBase(OpKernelContext *context,
const int wino_blk_size)
: OpKernel(context),
wino_blk_size_(wino_blk_size) {}
: OpKernel(context), wino_blk_size_(wino_blk_size) {}
const int wino_blk_size_;
};
template <DeviceType D, typename T>
struct ImageToBufferFunctor : ImageToBufferFunctorBase {
ImageToBufferFunctor(OpKernelContext *context, const int wino_blk_size)
: ImageToBufferFunctorBase(context, wino_blk_size) {}
struct BufferTransformFunctor : BufferTransformFunctorBase {
BufferTransformFunctor(OpKernelContext *context,
const int wino_blk_size)
: BufferTransformFunctorBase(context, wino_blk_size) {}
MaceStatus operator()(const Tensor *input,
const BufferType type,
Tensor *output,
......@@ -51,22 +52,30 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase {
}
};
class OpenCLBufferTransformKernel {
public:
virtual MaceStatus Compute(OpKernelContext *context,
const Tensor *input,
const BufferType type,
const int wino_blk_size,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBufferTransformKernel)
};
template <typename T>
struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase {
ImageToBufferFunctor(OpKernelContext *context,
const int wino_blk_size)
: ImageToBufferFunctorBase(context, wino_blk_size) {}
struct BufferTransformFunctor<DeviceType::GPU, T> : BufferTransformFunctorBase {
BufferTransformFunctor(OpKernelContext *context, const int wino_blk_size);
MaceStatus operator()(const Tensor *input,
const BufferType type,
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
std::unique_ptr<OpenCLBufferTransformKernel> kernel_;
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_IMAGE_TO_BUFFER_H_
#endif // MACE_KERNELS_BUFFER_TRANSFORM_H_
......@@ -71,20 +71,24 @@ struct ChannelShuffleFunctor : OpKernel {
};
#ifdef MACE_ENABLE_OPENCL
class OpenCLChannelShuffleKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLChannelShuffleKernel);
};
template<typename T>
struct ChannelShuffleFunctor<DeviceType::GPU, T> : OpKernel {
ChannelShuffleFunctor(OpKernelContext *context, const int groups)
: OpKernel(context), groups_(groups) {}
ChannelShuffleFunctor(OpKernelContext *context, const int groups);
MaceStatus operator()(const Tensor *input,
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
const int groups_;
std::vector<index_t> input_shape_;
std::unique_ptr<OpenCLChannelShuffleKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
......@@ -24,24 +24,13 @@
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
struct ConcatFunctorBase : OpKernel {
ConcatFunctorBase(OpKernelContext *context, const int32_t axis)
: OpKernel(context), axis_(axis) {}
int32_t axis_;
};
template <DeviceType D, typename T>
struct ConcatFunctor : ConcatFunctorBase {
struct ConcatFunctor : OpKernel {
ConcatFunctor(OpKernelContext *context, const int32_t axis)
: ConcatFunctorBase(context, axis) {}
: OpKernel(context), axis_(axis) {}
MaceStatus operator()(const std::vector<const Tensor *> &input_list,
Tensor *output,
......@@ -98,21 +87,29 @@ struct ConcatFunctor : ConcatFunctorBase {
return MACE_SUCCESS;
}
int32_t axis_;
};
#ifdef MACE_ENABLE_OPENCL
class OpenCLConcatKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLConcatKernel);
};
template <typename T>
struct ConcatFunctor<DeviceType::GPU, T> : ConcatFunctorBase {
ConcatFunctor(OpKernelContext *context, const int32_t axis)
: ConcatFunctorBase(context, axis) {}
struct ConcatFunctor<DeviceType::GPU, T> : OpKernel {
ConcatFunctor(OpKernelContext *context, const int32_t axis);
MaceStatus operator()(const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
std::unique_ptr<OpenCLConcatKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
......@@ -35,10 +35,6 @@
#include "mace/kernels/quantize.h"
#include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -78,8 +74,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const bool is_filter_transformed,
ScratchBuffer *scratch)
const bool is_filter_transformed)
: Conv2dFunctorBase(context,
strides,
padding_type,
......@@ -88,8 +83,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
activation,
relux_max_limit),
transformed_filter_(GetCPUAllocator(), DataType::DT_FLOAT),
is_filter_transformed_(is_filter_transformed),
scratch_(scratch) {}
is_filter_transformed_(is_filter_transformed) {}
void Conv2dGeneral(const float *input,
const float *filter,
......@@ -494,14 +488,15 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
}
// Init scratch buffer
scratch_->Rewind();
scratch_->GrowSize(total_scratch_size);
ScratchBuffer *scratch = context_->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
Tensor
transformed_input(scratch_->Scratch(transformed_input_size), DT_FLOAT);
transformed_input(scratch->Scratch(transformed_input_size), DT_FLOAT);
Tensor
transformed_output(scratch_->Scratch(transformed_output_size), DT_FLOAT);
Tensor padded_input(scratch_->Scratch(padded_input_size), DT_FLOAT);
Tensor padded_output(scratch_->Scratch(padded_output_size), DT_FLOAT);
transformed_output(scratch->Scratch(transformed_output_size), DT_FLOAT);
Tensor padded_input(scratch->Scratch(padded_input_size), DT_FLOAT);
Tensor padded_output(scratch->Scratch(padded_output_size), DT_FLOAT);
const index_t extra_input_shape[4] =
{batch, input_channels, extra_input_height, extra_input_width};
const index_t extra_output_shape[4] =
......@@ -559,7 +554,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
transformed_output_data,
pad_output,
&sgemm_,
scratch_);
scratch);
};
} else if (use_neon_3x3_s1) {
conv_func = [=](const float *pad_input, float *pad_output) {
......@@ -588,7 +583,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
channels,
pad_output,
&sgemm_,
scratch_);
scratch);
};
} else if (use_neon_5x5_s1) {
conv_func = [=](const float *pad_input, float *pad_output) {
......@@ -735,7 +730,6 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
Tensor transformed_filter_;
bool is_filter_transformed_;
ScratchBuffer *scratch_;
SGemm sgemm_;
};
......@@ -748,16 +742,14 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const bool is_filter_transformed,
ScratchBuffer *scratch)
const bool is_filter_transformed)
: Conv2dFunctorBase(context,
strides,
padding_type,
paddings,
dilations,
activation,
relux_max_limit),
scratch_(scratch) {
relux_max_limit) {
MACE_UNUSED(is_filter_transformed);
}
......@@ -926,13 +918,14 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
bool im2col_required =
filter_h != 1 || filter_w != 1 || stride_h != 1 || stride_w != 1;
total_scratch_size += (im2col_required ? im2col_size : 0);
scratch_->Rewind();
scratch_->GrowSize(total_scratch_size);
ScratchBuffer *scratch = context_->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
std::unique_ptr<Tensor> zero_bias;
const int32_t *bias_data = nullptr;
if (bias == nullptr) {
zero_bias.reset(new Tensor(scratch_->Scratch(zero_bias_size), DT_INT32));
zero_bias.reset(new Tensor(scratch->Scratch(zero_bias_size), DT_INT32));
zero_bias->Reshape({channels});
zero_bias->Clear();
bias_data = zero_bias->data<int32_t>();
......@@ -944,7 +937,7 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
auto gemm_input_data = input_data;
if (im2col_required) {
// prepare im2col
im2col.reset(new Tensor(scratch_->Scratch(im2col_size), DT_UINT8));
im2col.reset(new Tensor(scratch->Scratch(im2col_size), DT_UINT8));
uint8_t *im2col_data = im2col->mutable_data<uint8_t>();
Im2col(input_data, input->shape(), filter_h, filter_w, stride_h,
stride_w, static_cast<uint8_t>(input->zero_point()),
......@@ -976,12 +969,28 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
return MACE_SUCCESS;
}
ScratchBuffer *scratch_;
};
#ifdef MACE_ENABLE_OPENCL
template<typename T>
class OpenCLConv2dKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLConv2dKernel);
};
template <typename T>
struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
Conv2dFunctor(OpKernelContext *context,
const int *strides,
......@@ -990,18 +999,7 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const bool is_filter_transformed,
ScratchBuffer *scratch)
: Conv2dFunctorBase(context,
strides,
padding_type,
paddings,
dilations,
activation,
relux_max_limit) {
MACE_UNUSED(is_filter_transformed);
MACE_UNUSED(scratch);
}
const bool is_filter_transformed);
MaceStatus operator()(const Tensor *input,
const Tensor *filter,
......@@ -1009,10 +1007,7 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
std::unique_ptr<OpenCLConv2dKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
......@@ -210,6 +210,20 @@ void CalcOutputSize(const index_t *input_shape,
}
}
void CalcNCHWInputShape(const index_t *output_shape,
const index_t *filter_shape,
const int *strides,
const int *dilations,
index_t *input_shape) {
MACE_CHECK_NOTNULL(input_shape);
input_shape[0] = output_shape[0];
input_shape[1] = filter_shape[1];
input_shape[2] = (output_shape[2] - 1) * strides[0] +
(filter_shape[2] - 1) * dilations[0] + 1;
input_shape[3] = (output_shape[3] - 1) * strides[1] +
(filter_shape[3] - 1) * dilations[1] + 1;
}
void CalcOutputSize(const index_t *input_shape, // NHWC
const index_t *filter_shape, // OIHW
const int *padding_size,
......@@ -234,8 +248,8 @@ void CalcNCHWOutputSize(const index_t *input_shape, // NCHW
void CalPaddingSize(const index_t *input_shape, // NCHW
const index_t *filter_shape, // OIHW
const int *dilations,
const int *strides,
const int *dilations,
Padding padding,
int *padding_size) {
MACE_CHECK(dilations[0] > 0 && dilations[1] > 0,
......
......@@ -84,6 +84,12 @@ void CalcNCHWOutputSize(const index_t *input_shape,
const RoundType round_type,
index_t *output_shape);
void CalcNCHWInputShape(const index_t *output_shape,
const index_t *filter_shape,
const int *strides,
const int *dilations,
index_t *input_shape);
void CalPaddingSize(const index_t *input_shape, // NCHW
const index_t *filter_shape, // OIHW
const int *dilations,
......@@ -91,6 +97,7 @@ void CalPaddingSize(const index_t *input_shape, // NCHW
Padding padding,
int *padding_size);
MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input,
const int pad_top, const int pad_bottom,
const int pad_left, const int pad_right,
......
......@@ -24,32 +24,18 @@
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
struct CropFunctorBase : OpKernel {
CropFunctorBase(OpKernelContext *context,
template <DeviceType D, typename T>
struct CropFunctor : OpKernel {
CropFunctor(OpKernelContext *context,
const int axis,
const std::vector<int> &offset)
: OpKernel(context),
axis_(axis),
offset_(offset) {}
const int axis_;
std::vector<int> offset_;
};
template <DeviceType D, typename T>
struct CropFunctor : CropFunctorBase {
CropFunctor(OpKernelContext *context,
const int axis,
const std::vector<int> &offset)
: CropFunctorBase(context, axis, offset) {}
void crop_copy(const T* input_data, T* output_data,
const std::vector<index_t> &input_shape,
const std::vector<index_t> &output_shape,
......@@ -121,23 +107,31 @@ struct CropFunctor : CropFunctorBase {
return MACE_SUCCESS;
}
const int axis_;
std::vector<int> offset_;
};
#ifdef MACE_ENABLE_OPENCL
class OpenCLCropKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLCropKernel);
};
template <typename T>
struct CropFunctor<DeviceType::GPU, T> : CropFunctorBase {
struct CropFunctor<DeviceType::GPU, T> : OpKernel {
CropFunctor(OpKernelContext *context,
const int axis,
const std::vector<int> &offset)
: CropFunctorBase(context, axis, offset) {}
const std::vector<int> &offset);
MaceStatus operator()(const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
std::unique_ptr<OpenCLCropKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
......@@ -28,10 +28,6 @@
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -317,6 +313,22 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
};
#ifdef MACE_ENABLE_OPENCL
class OpenCLDeconv2dKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *padding_data,
const ActivationType activation,
const float relux_max_limit,
const std::vector<index_t> &output_shape,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDeconv2dKernel);
};
template <typename T>
struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase {
Deconv2dFunctor(OpKernelContext *context,
......@@ -325,14 +337,7 @@ struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase {
const std::vector<int> &paddings,
const std::vector<index_t> &output_shape,
const ActivationType activation,
const float relux_max_limit)
: Deconv2dFunctorBase(context,
strides,
padding_type,
paddings,
output_shape,
activation,
relux_max_limit) {}
const float relux_max_limit);
MaceStatus operator()(const Tensor *input,
const Tensor *filter,
......@@ -341,10 +346,7 @@ struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase {
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
std::unique_ptr<OpenCLDeconv2dKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
......@@ -93,20 +93,24 @@ struct DepthToSpaceOpFunctor : OpKernel {
};
#ifdef MACE_ENABLE_OPENCL
class OpenCLDepthToSpaceKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDepthToSpaceKernel);
};
template<typename T>
struct DepthToSpaceOpFunctor<DeviceType::GPU, T> : OpKernel {
DepthToSpaceOpFunctor(OpKernelContext *context,
const int block_size)
: OpKernel(context), block_size_(block_size) {}
const int block_size);
MaceStatus operator()(const Tensor *input,
Tensor *output,
StatsFuture *future);
const int block_size_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
std::unique_ptr<OpenCLDepthToSpaceKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
......@@ -501,6 +501,24 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t>
};
#ifdef MACE_ENABLE_OPENCL
class OpenCLDepthwiseConv2dKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDepthwiseConv2dKernel);
};
template<typename T>
struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
: DepthwiseConv2dFunctorBase {
......@@ -510,14 +528,7 @@ struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
const std::vector<int> &paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit)
: DepthwiseConv2dFunctorBase(context,
strides,
padding_type,
paddings,
dilations,
activation,
relux_max_limit) {}
const float relux_max_limit);
MaceStatus operator()(const Tensor *input,
const Tensor *filter,
......@@ -525,10 +536,7 @@ struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
std::unique_ptr<OpenCLDepthwiseConv2dKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
......@@ -27,10 +27,6 @@
#include "mace/kernels/kernel.h"
#include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -805,11 +801,12 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
}
struct EltwiseFunctorBase : OpKernel {
EltwiseFunctorBase(OpKernelContext *context,
template <DeviceType D, typename T>
struct EltwiseFunctor : OpKernel {
EltwiseFunctor(OpKernelContext *context,
const EltwiseType type,
const std::vector<float> &coeff,
const float scalar_input,
const float scalar_input, // float as it comes from arg
const int32_t scalar_input_index,
const DataFormat data_format)
: OpKernel(context),
......@@ -819,28 +816,6 @@ struct EltwiseFunctorBase : OpKernel {
scalar_input_index_(scalar_input_index),
data_format_(data_format) {}
EltwiseType type_;
std::vector<float> coeff_;
float scalar_input_;
int32_t scalar_input_index_;
DataFormat data_format_;
};
template <DeviceType D, typename T>
struct EltwiseFunctor : EltwiseFunctorBase {
EltwiseFunctor(OpKernelContext *context,
const EltwiseType type,
const std::vector<float> &coeff,
const float scalar_input, // float as it comes from arg
const int32_t scalar_input_index,
const DataFormat data_format)
: EltwiseFunctorBase(context,
type,
coeff,
scalar_input,
scalar_input_index,
data_format) {}
template <typename DstType>
MaceStatus DoEltwise(const Tensor *input0,
const Tensor *input1,
......@@ -957,23 +932,28 @@ struct EltwiseFunctor : EltwiseFunctorBase {
}
}
EltwiseType type_;
std::vector<float> coeff_;
float scalar_input_;
int32_t scalar_input_index_;
DataFormat data_format_;
Tensor scalar_tensor_;
};
template <>
struct EltwiseFunctor<DeviceType::CPU, uint8_t> : EltwiseFunctorBase {
struct EltwiseFunctor<DeviceType::CPU, uint8_t> : OpKernel {
EltwiseFunctor(OpKernelContext *context,
const EltwiseType type,
const std::vector<float> &coeff,
const float scalar_input, // float as it comes from arg
const int32_t scalar_input_index,
const DataFormat data_format)
: EltwiseFunctorBase(context,
type,
coeff,
scalar_input,
scalar_input_index,
data_format) {}
: OpKernel(context),
type_(type),
coeff_(coeff),
scalar_input_(scalar_input),
scalar_input_index_(scalar_input_index),
data_format_(data_format) {}
MaceStatus operator()(const Tensor *input0,
const Tensor *input1,
......@@ -1093,33 +1073,41 @@ struct EltwiseFunctor<DeviceType::CPU, uint8_t> : EltwiseFunctorBase {
return MACE_SUCCESS;
}
EltwiseType type_;
std::vector<float> coeff_;
float scalar_input_;
int32_t scalar_input_index_;
DataFormat data_format_;
Tensor scalar_tensor_;
};
#ifdef MACE_ENABLE_OPENCL
class OpenCLEltwiseKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input0,
const Tensor *input1,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLEltwiseKernel);
};
template <typename T>
struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase {
struct EltwiseFunctor<DeviceType::GPU, T> : OpKernel {
EltwiseFunctor(OpKernelContext *context,
const EltwiseType type,
const std::vector<float> &coeff,
const float scalar_input,
const int32_t scalar_input_index,
const DataFormat data_format)
: EltwiseFunctorBase(context,
type,
coeff,
scalar_input,
scalar_input_index,
data_format) {}
const DataFormat data_format);
MaceStatus operator()(const Tensor *input0,
const Tensor *input1,
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
std::unique_ptr<OpenCLEltwiseKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
......@@ -151,12 +151,24 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
};
#ifdef MACE_ENABLE_OPENCL
class OpenCLFullyConnectedKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLFullyConnectedKernel);
};
template <typename T>
struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
FullyConnectedFunctor(OpKernelContext *context,
const ActivationType activation,
const float relux_max_limit)
: FullyConnectedBase(context, activation, relux_max_limit) {}
const float relux_max_limit);
MaceStatus operator()(const Tensor *input,
const Tensor *weight,
......@@ -164,11 +176,7 @@ struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
std::vector<uint32_t> gws_;
std::vector<uint32_t> lws_;
std::vector<index_t> input_shape_;
std::unique_ptr<BufferBase> kernel_error_;
std::unique_ptr<OpenCLFullyConnectedKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
......@@ -35,11 +35,23 @@ namespace kernels {
template <DeviceType D, typename T>
struct LSTMCellFunctor;
class OpenCLLSTMCellKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *pre_output,
const Tensor *weight,
const Tensor *bias,
const Tensor *pre_cell,
Tensor *cell,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLLSTMCellKernel);
};
template <typename T>
struct LSTMCellFunctor<DeviceType::GPU, T> : OpKernel{
LSTMCellFunctor(OpKernelContext *context, T forget_bias)
: OpKernel(context),
forget_bias_(static_cast<T>(forget_bias)) {}
LSTMCellFunctor(OpKernelContext *context, T forget_bias);
MaceStatus operator()(const Tensor *input,
const Tensor *pre_output,
const Tensor *weight,
......@@ -49,11 +61,7 @@ struct LSTMCellFunctor<DeviceType::GPU, T> : OpKernel{
Tensor *output,
StatsFuture *future);
T forget_bias_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
std::unique_ptr<OpenCLLSTMCellKernel> kernel_;
};
} // namespace kernels
......
......@@ -34,10 +34,6 @@
#include "mace/kernels/gemmlowp_util.h"
#include "mace/kernels/sgemm.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -89,7 +85,7 @@ struct MatMulFunctor : OpKernel {
const index_t height_b = B->dim(rank - 2);
const index_t width_b = B->dim(rank - 1);
auto scratch_buffer = context_->workspace()->GetScratchBuffer(D);
auto scratch_buffer = context_->device()->scratch_buffer();
scratch_buffer->Rewind();
index_t scratch_size = C->raw_max_size();
if (!A->is_weight()) {
......@@ -112,7 +108,7 @@ struct MatMulFunctor : OpKernel {
A->is_weight(),
B->is_weight(),
c_ptr_base,
scratch_buffer);
context_->device()->scratch_buffer());
return MACE_SUCCESS;
}
......@@ -218,9 +214,21 @@ struct MatMulFunctor<CPU, uint8_t> : OpKernel {
};
#ifdef MACE_ENABLE_OPENCL
class OpenCLMatMulKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *A,
const Tensor *B,
Tensor *C,
bool transpose_a,
bool transpose_b,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLMatMulKernel);
};
template <typename T>
struct MatMulFunctor<DeviceType::GPU, T> : OpKernel {
explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {}
explicit MatMulFunctor(OpKernelContext *context);
MaceStatus operator()(const Tensor *A,
const Tensor *B,
......@@ -229,9 +237,7 @@ struct MatMulFunctor<DeviceType::GPU, T> : OpKernel {
bool transpose_b,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::unique_ptr<OpenCLMatMulKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
......@@ -13,96 +13,31 @@
// limitations under the License.
#include "mace/kernels/activation.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
#include "mace/kernels/opencl/image/activation.h"
namespace mace {
namespace kernels {
template <typename T>
ActivationFunctor<DeviceType::GPU, T>::ActivationFunctor(
OpKernelContext *context,
ActivationType type,
T relux_max_limit) : OpKernel(context) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(
new opencl::image::ActivationKernel<T>(type, relux_max_limit));
} else {
MACE_NOT_IMPLEMENTED;
}
}
template <typename T>
MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
switch (activation_) {
case RELU:
tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX");
break;
case PRELU:
tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU");
break;
case TANH:
tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
kernel_.setArg(idx++, *(alpha->opencl_image()));
}
kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
return kernel_->Compute(context_, input, alpha, output, future);
}
template struct ActivationFunctor<DeviceType::GPU, float>;
......
......@@ -13,97 +13,32 @@
// limitations under the License.
#include "mace/kernels/addn.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
#include "mace/kernels/opencl/image/addn.h"
namespace mace {
namespace kernels {
template <typename T>
AddNFunctor<DeviceType::GPU, T>::AddNFunctor(OpKernelContext *context)
: OpKernel(context) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(
new opencl::image::AddNKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
}
template <typename T>
MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor,
StatsFuture *future) {
size_t size = input_tensors.size();
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
const index_t batch = input_tensors[0]->dim(0);
const index_t height = input_tensors[0]->dim(1);
const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3);
auto runtime = context_->device()->opencl_runtime();
for (size_t i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]);
MACE_CHECK(batch == input_tensors[i]->dim(0));
MACE_CHECK(height == input_tensors[i]->dim(1));
MACE_CHECK(width == input_tensors[i]->dim(2));
MACE_CHECK(channels == input_tensors[i]->dim(3));
}
if (kernel_.get() == nullptr) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
built_options.emplace("-Daddn=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)};
if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
output_tensor->ResizeImage(output_shape, output_image_shape));
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_2D_GWS_ARGS(kernel_);
for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image()));
}
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
input_shape_ = input_tensors[0]->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
return kernel_->Compute(context_, input_tensors, output_tensor, future);
}
template struct AddNFunctor<DeviceType::GPU, float>;
template struct AddNFunctor<DeviceType::GPU, half>;
} // namespace kernels
......
......@@ -13,14 +13,26 @@
// limitations under the License.
#include "mace/kernels/batch_norm.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
#include "mace/kernels/opencl/image/batch_norm.h"
namespace mace {
namespace kernels {
template <typename T>
BatchNormFunctor<DeviceType::GPU, T>::BatchNormFunctor(
OpKernelContext *context,
const bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: OpKernel(context) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::BatchNormKernel<T>(
folded_constant, activation, relux_max_limit));
} else {
MACE_NOT_IMPLEMENTED;
}
}
template <typename T>
MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
......@@ -31,84 +43,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
const float epsilon,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(folded_constant_ || (mean != nullptr && var != nullptr));
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (folded_constant_) {
built_options.emplace("-DFOLDED_CONSTANT");
}
switch (activation_) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++, *(offset->opencl_image()));
if (!folded_constant_) {
kernel_.setArg(idx++, *(mean->opencl_image()));
kernel_.setArg(idx++, *(var->opencl_image()));
kernel_.setArg(idx++, epsilon);
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3), folded_constant_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
return kernel_->Compute(context_, input, scale, offset, mean,
var, epsilon, output, future);
}
template struct BatchNormFunctor<DeviceType::GPU, float>;
......
......@@ -16,84 +16,31 @@
#define MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_
#include "mace/kernels/batch_to_space.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
#include "mace/kernels/opencl/image/batch_to_space.h"
namespace mace {
namespace kernels {
template <typename T>
BatchToSpaceFunctor<DeviceType::GPU, T>::BatchToSpaceFunctor(
OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: BatchToSpaceFunctorBase(context, paddings, block_shape) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::BatchToSpaceKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
}
template <typename T>
MaceStatus BatchToSpaceFunctor<DeviceType::GPU, T>::operator()(
Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) {
const Tensor *batch_tensor, Tensor *space_tensor, StatsFuture *future) {
std::vector<index_t> output_shape(4, 0);
CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC,
output_shape.data());
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
space_tensor->ResizeImage(output_shape, output_image_shape));
const uint32_t chan_blk =
static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
const char *kernel_name = "batch_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(space_shape_, space_tensor->shape())) {
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape_[0]);
kernel_.setArg(idx++, block_shape_[1]);
kernel_.setArg(idx++, paddings_[0]);
kernel_.setArg(idx++, paddings_[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
space_shape_ = space_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
return kernel_->Compute(context_, batch_tensor, paddings_, block_shape_,
output_shape, space_tensor, future);
}
template struct BatchToSpaceFunctor<DeviceType::GPU, float>;
......
......@@ -13,13 +13,23 @@
// limitations under the License.
#include "mace/kernels/bias_add.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/kernels/opencl/image/bias_add.h"
namespace mace {
namespace kernels {
template <typename T>
BiasAddFunctor<DeviceType::GPU, T>::BiasAddFunctor(
OpKernelContext *context,
const DataFormat data_format)
: BiasAddFunctorBase(context, data_format) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::BiasAddKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
}
template <typename T>
MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *bias,
......@@ -27,75 +37,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
StatsFuture *future) {
MACE_CHECK(input->dim_size() == 4 && data_format_ == NHWC,
"gpu only support biasadd for 4-dimensional NHWC format tensor");
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
OUT_OF_RANGE_VALIDATION(kernel_error_);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MACE_SUCCESS;
return kernel_->Compute(context_, input, bias, output, future);
}
template struct BiasAddFunctor<DeviceType::GPU, float>;
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
#define MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
#include "mace/kernels/buffer_inverse_transform.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
MaceStatus BufferTypeTransform(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future);
template <typename T>
class BufferInverseTransform: public OpenCLBufferInverseTransformKernel {
public:
MaceStatus Compute(OpKernelContext *context,
const Tensor *input,
const BufferType type,
const int wino_blk_size,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
};
template <typename T>
MaceStatus BufferInverseTransform<T>::Compute(OpKernelContext *context,
const Tensor *input,
const BufferType type,
const int wino_blk_size,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(type);
MACE_UNUSED(wino_blk_size);
const DataType dt = DataTypeToEnum<T>::value;
if (input->dtype() != output->dtype()) {
return BufferTypeTransform(context, &kernel_, input, dt, output, future);
} else {
SetFutureDefaultWaitFn(future);
output->ReuseTensorBuffer(*input);
return MaceStatus::MACE_SUCCESS;
}
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/opencl/buffer/buffer_transform.h"
#include <vector>
#include <set>
#include <string>
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
MaceStatus TransformConv2DFilter(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future) {
const index_t out_chan = input->dim(0);
const index_t in_chan = input->dim(1);
const index_t filter_height = input->dim(2);
const index_t filter_width = input->dim(3);
std::vector<index_t> transformed_shape = {
filter_height, filter_width,
RoundUpDiv4(out_chan),
RoundUp<index_t>(in_chan, 4),
4,
};
uint32_t gws[3];
gws[0] = static_cast<uint32_t>(transformed_shape[3]);
gws[1] = static_cast<uint32_t>(transformed_shape[2]);
gws[2] = static_cast<uint32_t>(filter_height * filter_width);
MACE_RETURN_IF_ERROR(output->Resize(transformed_shape));
output->Reshape(input->shape());
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_NON_UNIFORM_WG_CONFIG;
MACE_OUT_OF_RANGE_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_conv_filter");
built_options.emplace("-Dtransform_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
kernel));
}
MACE_OUT_OF_RANGE_INIT(*kernel);
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->UnderlyingBuffer()->size());
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel->setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
kernel->setArg(idx++, *(output->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(out_chan));
kernel->setArg(idx++, static_cast<int32_t>(in_chan));
kernel->setArg(idx++, static_cast<int32_t>(filter_height));
kernel->setArg(idx++, static_cast<int32_t>(filter_width));
kernel->setArg(idx++, static_cast<int32_t>(
in_chan * filter_height * filter_width));
std::string tuning_key =
Concat("transform_conv_filter",
transformed_shape[0],
transformed_shape[1],
transformed_shape[2],
transformed_shape[3]);
std::vector<uint32_t> lws = {4, 4, 4, 0};
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION
// Mark the buffer unused.
const_cast<Tensor *>(input)->MarkUnused();
return MACE_SUCCESS;
}
MaceStatus TransformDWConv2DFilter(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future) {
const index_t multiplier = input->dim(0);
const index_t in_chan = input->dim(1);
const index_t filter_height = input->dim(2);
const index_t filter_width = input->dim(3);
std::vector<index_t> transformed_shape = {
multiplier, RoundUpDiv4(in_chan),
filter_height, filter_width, 4,
};
uint32_t gws[3];
gws[0] = static_cast<uint32_t>(filter_width);
gws[1] = static_cast<uint32_t>(filter_height);
gws[2] = static_cast<uint32_t>(transformed_shape[1]);
MACE_RETURN_IF_ERROR(output->Resize(transformed_shape));
output->Reshape(input->shape());
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_dw_conv_filter");
built_options.emplace("-Dtransform_dw_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
kernel));
}
MACE_OUT_OF_RANGE_INIT(*kernel);
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->UnderlyingBuffer()->size());
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel->setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
kernel->setArg(idx++, *(output->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(in_chan));
kernel->setArg(idx++, static_cast<int32_t>(filter_height * filter_width));
std::string tuning_key =
Concat("transform_conv_filter",
transformed_shape[0],
transformed_shape[1],
transformed_shape[2],
transformed_shape[3]);
std::vector<uint32_t> lws = {4, 4, 4, 0};
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION
// Mark the buffer unused.
const_cast<Tensor *>(input)->MarkUnused();
return MACE_SUCCESS;
}
MaceStatus TransformArgument(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future) {
const index_t size = input->dim(0);
std::vector<index_t> transformed_shape = {RoundUp<index_t>(size, 4)};
uint32_t gws = static_cast<uint32_t>(RoundUpDiv4(transformed_shape[0]));
MACE_RETURN_IF_ERROR(output->Resize(transformed_shape));
output->Reshape(input->shape());
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_arg");
built_options.emplace("-Dtransform_arg=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
kernel));
}
MACE_OUT_OF_RANGE_INIT(*kernel);
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->UnderlyingBuffer()->size());
kernel->setArg(idx++, gws);
kernel->setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel->setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
kernel->setArg(idx++, *(output->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(size));
const uint32_t lws =
static_cast<uint32_t>(RoundUpDiv4(runtime->GetDeviceMaxWorkGroupSize()));
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, cl::NDRange(gws),
cl::NDRange(lws), nullptr, &event);
} else {
uint32_t roundup_gws = RoundUp(gws, lws);
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, cl::NDRange(roundup_gws),
cl::NDRange(lws), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
// Mark the buffer unused.
const_cast<Tensor *>(input)->MarkUnused();
return MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
#define MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
#include <vector>
#include "mace/kernels/buffer_transform.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
MaceStatus BufferTypeTransform(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future);
MaceStatus TransformConv2DFilter(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future);
MaceStatus TransformDWConv2DFilter(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future);
MaceStatus TransformArgument(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future);
template <typename T>
class BufferTransform: public OpenCLBufferTransformKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const BufferType type,
const int wino_blk_size,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus BufferTransform<T>::Compute(OpKernelContext *context,
const Tensor *input,
const BufferType type,
const int wino_blk_size,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(type);
MACE_UNUSED(wino_blk_size);
const DataType dt = DataTypeToEnum<T>::value;
switch (type) {
case CONV2D_FILTER:
return TransformConv2DFilter(context, &kernel_, input,
dt, output, future);
case DW_CONV2D_FILTER:
return TransformDWConv2DFilter(context, &kernel_, input,
dt, output, future);
case ARGUMENT:
return TransformArgument(context, &kernel_, input, dt, output, future);
default:
if (input->dtype() != dt) {
return BufferTypeTransform(context, &kernel_, input,
dt, output, future);
} else {
SetFutureDefaultWaitFn(future);
output->ReuseTensorBuffer(*input);
return MaceStatus::MACE_SUCCESS;
}
}
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
MaceStatus BufferTypeTransform(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
const uint32_t gws =
static_cast<uint32_t>(RoundUpDiv4(output->size()));
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_data_type");
built_options.emplace("-Dtransform_data_type=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
kernel));
}
MACE_OUT_OF_RANGE_INIT(*kernel);
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
kernel->setArg(idx++, gws);
kernel->setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel->setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
kernel->setArg(idx++, *(output->opencl_buffer()));
const uint32_t lws =
static_cast<uint32_t>(RoundUpDiv4(runtime->GetDeviceMaxWorkGroupSize()));
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, cl::NDRange(gws),
cl::NDRange(lws), nullptr, &event);
} else {
uint32_t roundup_gws = RoundUp(gws, lws);
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, cl::NDRange(roundup_gws),
cl::NDRange(lws), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
// Mark the buffer unused.
const_cast<Tensor *>(input)->MarkUnused();
return MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_
#define MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_
#include "mace/kernels/conv_2d.h"
#include <functional>
#include <memory>
#include <vector>
#include "mace/kernels/opencl/buffer/utils.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
namespace conv2d {
extern MaceStatus Conv2d1x1(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *padded_input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const bool input_changed,
Tensor *output,
StatsFuture *future);
extern MaceStatus Conv2dGeneral(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const bool input_changed,
Tensor *output,
StatsFuture *future);
} // namespace conv2d
template <typename T>
class Conv2dKernel : public OpenCLConv2dKernel {
public:
Conv2dKernel() : old_scratch_size_(0) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) override;
private:
index_t old_scratch_size_;
cl::Kernel kernels_[2];
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus Conv2dKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) {
StatsFuture pad_future, conv_future;
index_t filter_h = filter->dim(2);
index_t filter_w = filter->dim(3);
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
std::function<MaceStatus(const Tensor *input, Tensor *output)> conv_func;
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
bool use_1x1 = filter_h == 1 && filter_w == 1;
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w, tile_c = 4;
if (use_1x1) {
tile_w = 2;
} else {
tile_w = 4;
}
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
// decide scratch size before allocate it
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input.reset(new Tensor(scratch->Scratch(padded_input_size),
input->dtype()));
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
if (use_1x1) {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2d1x1(
context, &kernels_[1], pad_input, filter, bias, strides,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
input_changed, output, &conv_future);
};
} else {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2dGeneral(
context, &kernels_[1], pad_input, filter, bias, strides, dilations,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
input_changed, output, &conv_future);
};
}
MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output));
MergeMultipleFutureWaitFn({pad_future, conv_future}, future);
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
namespace conv2d {
MaceStatus Conv2d1x1(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *padded_input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const bool input_changed,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channel = output->dim(3);
const index_t in_height = padded_input->dim(1);
const index_t in_width = padded_input->dim(2);
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1_buffer",
kernel_name,
built_options, kernel));
}
const uint32_t gws[2] = {static_cast<uint32_t>(
RoundUpDiv4(channel) *
RoundUpDiv<index_t>(width, 2)),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_2D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input->opencl_buffer()));
kernel->setArg(idx++, *(filter->opencl_buffer()));
if (bias != nullptr) {
kernel->setArg(idx++, *(bias->opencl_buffer()));
}
kernel->setArg(idx++, static_cast<int32_t>(in_height));
kernel->setArg(idx++, static_cast<int32_t>(in_width));
kernel->setArg(idx++, static_cast<int32_t>(padded_input->dim(3)));
kernel->setArg(idx++,
static_cast<int32_t>(filter->buffer_shape()[3]));
kernel->setArg(idx++, static_cast<int32_t>(height));
kernel->setArg(idx++, static_cast<int32_t>(width));
kernel->setArg(idx++, static_cast<int32_t>(channel));
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
std::string tuning_key =
Concat("conv2d_1x1_buffer", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
std::vector<uint32_t> lws = {16, 4, 0};
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, gws,
lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace conv2d
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
namespace conv2d {
MaceStatus Conv2dGeneral(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *padded_input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const bool input_changed,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channel = output->dim(3);
const index_t in_height = padded_input->dim(1);
const index_t in_width = padded_input->dim(2);
const index_t in_channel = padded_input->dim(3);
const index_t filter_height = filter->dim(2);
const index_t filter_width = filter->dim(3);
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_buffer",
kernel_name,
built_options, kernel));
}
const uint32_t gws[2] = {static_cast<uint32_t>(
RoundUpDiv4(channel) * RoundUpDiv4(width)),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
auto filter_buffer_shape = filter->buffer_shape();
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_2D_GWS_ARGS(*kernel, gws)
kernel->setArg(idx++, *(padded_input->opencl_buffer()));
kernel->setArg(idx++, *(filter->opencl_buffer()));
if (bias != nullptr) {
kernel->setArg(idx++, *(bias->opencl_buffer()));
}
kernel->setArg(idx++, static_cast<int32_t>(in_height));
kernel->setArg(idx++, static_cast<int32_t>(in_width));
kernel->setArg(idx++, static_cast<int32_t>(padded_input->dim(3)));
kernel->setArg(idx++, static_cast<int32_t>(filter_height));
kernel->setArg(idx++, static_cast<int32_t>(filter_width));
kernel->setArg(idx++,
static_cast<int32_t>(filter_buffer_shape[3]));
kernel->setArg(idx++, static_cast<int32_t>(
filter_buffer_shape[2] * filter_buffer_shape[3]
* filter_buffer_shape[4]));
kernel->setArg(idx++, static_cast<int32_t>(height));
kernel->setArg(idx++, static_cast<int32_t>(width));
kernel->setArg(idx++, static_cast<int32_t>(channel));
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, static_cast<int32_t>(
dilations[0] * in_width * in_channel));
kernel->setArg(idx++, static_cast<int32_t>(
dilations[1] * in_channel));
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
std::string tuning_key =
Concat("conv2d_general_buffer", output->dim(0), output->dim(1),
output->dim(2), output->dim(3), filter_height, filter_width);
std::vector<uint32_t> lws = {16, 4, 0};
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, gws,
lws, future));
MACE_OUT_OF_RANGE_VALIDATION
return MACE_SUCCESS;
}
} // namespace conv2d
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/opencl/buffer/depthwise_conv2d.h"
#include <set>
#include <string>
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
namespace depthwise {
MaceStatus DepthwiseConv2d(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *padded_input, // NHWC
const Tensor *filter, // HWIM
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const bool input_changed,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channel = output->dim(3);
const index_t in_height = padded_input->dim(1);
const index_t in_width = padded_input->dim(2);
const index_t in_channel = padded_input->dim(3);
const index_t filter_height = filter->dim(2);
const index_t filter_width = filter->dim(3);
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("depthwise_conv2d_buffer", kernel_name,
built_options, kernel));
}
const uint32_t gws[2] = {
static_cast<uint32_t>(RoundUpDiv4(channel) * RoundUpDiv4(width)),
static_cast<uint32_t>(height * batch)
};
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_2D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input->opencl_buffer()));
kernel->setArg(idx++, *(filter->opencl_buffer()));
if (bias != nullptr) {
kernel->setArg(idx++, *(bias->opencl_buffer()));
}
kernel->setArg(idx++, static_cast<uint32_t>(in_height));
kernel->setArg(idx++, static_cast<uint32_t>(in_width));
kernel->setArg(idx++, static_cast<uint32_t>(in_channel));
kernel->setArg(idx++, static_cast<uint32_t>(filter_height));
kernel->setArg(idx++, static_cast<uint32_t>(filter_width));
kernel->setArg(idx++, static_cast<uint32_t>(filter_height * filter_width));
kernel->setArg(idx++, static_cast<uint32_t>(height));
kernel->setArg(idx++, static_cast<uint32_t>(width));
kernel->setArg(idx++, static_cast<uint32_t>(channel));
kernel->setArg(idx++, static_cast<uint32_t>(strides[0]));
kernel->setArg(idx++, static_cast<uint32_t>(strides[1]));
kernel->setArg(idx++, static_cast<int32_t>(
dilations[0] * in_width * in_channel));
kernel->setArg(idx++, static_cast<int32_t>(
dilations[1] * in_channel));
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
std::vector<uint32_t> lws = {16, 4, 0};
std::string tuning_key =
Concat("depthwise_conv2d_buffer_kernel", in_height, in_width, in_channel,
filter_height, filter_width, channel);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION
return MACE_SUCCESS;
}
} // namespace depthwise
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_
#define MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_
#include "mace/kernels/depthwise_conv2d.h"
#include <functional>
#include <memory>
#include <vector>
#include "mace/kernels/opencl/buffer/utils.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
namespace depthwise {
MaceStatus DepthwiseConv2d(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *padded_input, // NHWC
const Tensor *filter, // HWIM
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const bool input_changed,
Tensor *output,
StatsFuture *future);
} // namespace depthwise
template <typename T>
class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
public:
DepthwiseConv2dKernel() : old_scratch_size_(0) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) override;
private:
index_t old_scratch_size_;
cl::Kernel kernels_[2];
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus DepthwiseConv2dKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) {
StatsFuture pad_future, dw_conv_future;
index_t filter_w = filter->dim(3);
// Create a fake conv_2d filter to calculate the paddings and output size
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
fake_filter_shape[1] = filter->dim(1);
fake_filter_shape[2] = filter->dim(2);
fake_filter_shape[3] = filter->dim(3);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported");
MACE_CHECK(filter->dim(0) * input_channels == channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w = 4, tile_c = 4;
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input.reset(new Tensor(scratch->Scratch(padded_input_size),
input->dtype()));
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
MACE_RETURN_IF_ERROR(
depthwise::DepthwiseConv2d(
context, &kernels_[1], padded_input_ptr, filter, bias, strides,
dilations, DataTypeToEnum<T>::v(), activation, relux_max_limit,
input_changed, output, &dw_conv_future));
MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, future);
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_POOLING_H_
#define MACE_KERNELS_OPENCL_BUFFER_POOLING_H_
#include "mace/kernels/pooling.h"
#include <functional>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/buffer/utils.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
template <typename T>
class PoolingKernel : public OpenCLPoolingKernel {
public:
PoolingKernel() : old_scratch_size_(0) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
Tensor *output,
StatsFuture *future) override;
private:
index_t old_scratch_size_;
cl::Kernel kernels_[2];
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus PoolingKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
StatsFuture pad_future, pooling_future;
index_t input_channels = input->dim(3);
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
kernels[0], kernels[1]};
std::vector<int> paddings(2);
if (padding_data.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::CEIL,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
auto runtime = context->device()->opencl_runtime();
// pad input
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[3] = RoundUp<index_t>(input_channels, 4);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input.reset(new Tensor(scratch->Scratch(padded_input_size),
input->dtype()));
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, 0, 0,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
cl::Kernel *kernel = &kernels_[1];
MACE_OUT_OF_RANGE_DEFINITION
if (kernel->get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
if (pooling_type == MAX && input->dtype() == output->dtype()) {
built_options.emplace("-DIN_DATA_TYPE=" +
DtToCLDt(input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
} else {
built_options.emplace("-DIN_DATA_TYPE=" +
DtToCLDt(input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
}
if (pooling_type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer",
kernel_name,
built_options,
kernel));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(2)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(3)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(3)));
kernel->setArg(idx++, paddings[0] / 2);
kernel->setArg(idx++, paddings[1] / 2);
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, kernels[0]);
kernel->setArg(idx++, kernels[1]);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
const std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, &pooling_future));
MACE_OUT_OF_RANGE_VALIDATION
MergeMultipleFutureWaitFn({pad_future, pooling_future}, future);
return MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_POOLING_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_
#define MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_
#include "mace/kernels/softmax.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
template <typename T>
class SoftmaxKernel : public OpenCLSoftmaxKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *logits,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus SoftmaxKernel<T>::Compute(
OpKernelContext *context,
const Tensor *logits,
Tensor *output,
StatsFuture *future) {
index_t batch = 0;
index_t height = 0;
index_t width = 0;
index_t channels = 0;
if (logits->dim_size() == 2) {
batch = logits->dim(0);
height = 1;
width = 1;
channels = logits->dim(1);
} else if (logits->dim_size() == 4) {
batch = logits->dim(0);
height = logits->dim(1);
width = logits->dim(2);
channels = logits->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
}
const index_t channel_blocks = RoundUpDiv4(channels);
const int remain_channels = channel_blocks * 4 - channels;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size());
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(logits->opencl_buffer()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_buffer()));
input_shape_ = logits->shape();
}
std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION
return MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/opencl/buffer/utils.h"
#include <set>
#include <string>
#include <vector>
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
MaceStatus PadInput(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const int pad_top,
const int pad_left,
const bool input_changed,
Tensor *padded_input,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t in_channel = input->dim(3);
const index_t padded_height = padded_input->dim(1);
const index_t padded_width = padded_input->dim(2);
const index_t padded_channel = padded_input->dim(3);
const uint32_t gws[2] = {
static_cast<uint32_t>(padded_width * RoundUpDiv4(padded_channel)),
static_cast<uint32_t>(padded_height * batch)
};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("Dpad_input");
built_options.emplace("-Dpad_input=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input->dtype()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel(
"buffer_transform",
kernel_name,
built_options,
kernel));
}
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, padded_input->size());
MACE_SET_2D_GWS_ARGS(*kernel, gws)
kernel->setArg(idx++, *(input->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(in_height));
kernel->setArg(idx++, static_cast<int32_t>(in_width));
kernel->setArg(idx++, static_cast<int32_t>(in_channel));
kernel->setArg(idx++, static_cast<int32_t>(padded_height));
kernel->setArg(idx++, static_cast<int32_t>(padded_width));
kernel->setArg(idx++, static_cast<int32_t>(padded_channel));
kernel->setArg(idx++, pad_top);
kernel->setArg(idx++, pad_left);
kernel->setArg(idx++, *(padded_input->opencl_buffer()));
}
std::string tuning_key =
Concat("pad_input", batch, in_height, in_width, in_channel,
padded_height, padded_width, padded_channel);
std::vector<uint32_t> lws = {8, 4, 0};
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION
return MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_UTILS_H_
#define MACE_KERNELS_OPENCL_BUFFER_UTILS_H_
#include "mace/core/future.h"
#include "mace/core/op_kernel_context.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
MaceStatus PadInput(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const int pad_top,
const int pad_left,
const bool input_changed,
Tensor *padded_input,
StatsFuture *future);
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_UTILS_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/buffer_inverse_transform.h"
#include "mace/kernels/opencl/image/image_to_buffer.h"
#include "mace/kernels/opencl/buffer/buffer_inverse_transform.h"
namespace mace {
namespace kernels {
template<typename T>
BufferInverseTransformFunctor<
DeviceType::GPU, T>::BufferInverseTransformFunctor(
OpKernelContext *context,
const int wino_blk_size)
: BufferInverseTransformFunctorBase(context, wino_blk_size) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::ImageToBuffer<T>);
} else {
kernel_.reset(new opencl::buffer::BufferInverseTransform<T>);
}
}
template <typename T>
MaceStatus BufferInverseTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
const BufferType type,
Tensor *output,
StatsFuture *future) {
return kernel_->Compute(context_, input, type,
wino_blk_size_, output, future);
}
template struct BufferInverseTransformFunctor<DeviceType::GPU, float>;
template struct BufferInverseTransformFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/buffer_transform.h"
#include "mace/kernels/opencl/image/buffer_to_image.h"
#include "mace/kernels/opencl/buffer/buffer_transform.h"
namespace mace {
namespace kernels {
template<typename T>
BufferTransformFunctor<DeviceType::GPU, T>::BufferTransformFunctor(
OpKernelContext *context,
const int wino_blk_size)
: BufferTransformFunctorBase(context, wino_blk_size) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::BufferToImage<T>);
} else {
kernel_.reset(new opencl::buffer::BufferTransform<T>);
}
}
template <typename T>
MaceStatus BufferTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
const BufferType type,
Tensor *output,
StatsFuture *future) {
return kernel_->Compute(context_, input, type,
wino_blk_size_, output, future);
}
template struct BufferTransformFunctor<DeviceType::GPU, float>;
template struct BufferTransformFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
......@@ -13,73 +13,26 @@
// limitations under the License.
#include "mace/kernels/channel_shuffle.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
#include "mace/kernels/opencl/image/channel_shuffle.h"
namespace mace {
namespace kernels {
template <typename T>
MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, Tensor *output, StatsFuture *future) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channels_per_group = channels / groups_;
MACE_CHECK(channels_per_group % 4 == 0,
"channels per group must be multiple of 4");
MACE_CHECK(groups_ % 4 == 0, "groups must be multiple of 4");
const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("channel_shuffle", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, groups_);
kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
ChannelShuffleFunctor<DeviceType::GPU, T>::ChannelShuffleFunctor(
OpKernelContext *context,
const int groups) : OpKernel(context) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::ChannelShuffleKernel<T>(groups));
} else {
MACE_NOT_IMPLEMENTED;
}
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
template <typename T>
MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, Tensor *output, StatsFuture *future) {
return kernel_->Compute(context_, input, output, future);
}
template struct ChannelShuffleFunctor<DeviceType::GPU, float>;
......
#include <common.h>
__kernel void activation(KERNEL_ERROR_PARAMS
__kernel void activation(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
#ifdef USE_PRELU
......
#include <common.h>
__kernel void addn(KERNEL_ERROR_PARAMS
__kernel void addn(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t input1,
......
#include <common.h>
// Supported data types: half/float
__kernel void batch_norm(KERNEL_ERROR_PARAMS
__kernel void batch_norm(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__read_only image2d_t scale,
......
#include <common.h>
__kernel void batch_to_space(KERNEL_ERROR_PARAMS
__kernel void batch_to_space(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t batch_data,
__write_only image2d_t space_data,
......
#include <common.h>
// Supported data types: half/float
__kernel void bias_add(KERNEL_ERROR_PARAMS
__kernel void bias_add(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__read_only image2d_t bias,
......
#include <common.h>
__kernel void filter_buffer_to_image(KERNEL_ERROR_PARAMS
__kernel void filter_buffer_to_image(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* OIHW */
__private const int input_offset,
......@@ -52,7 +52,7 @@ __kernel void filter_buffer_to_image(KERNEL_ERROR_PARAMS
WRITE_IMAGET(output, coord, values);
}
__kernel void filter_image_to_buffer(KERNEL_ERROR_PARAMS
__kernel void filter_image_to_buffer(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, /* OIHW */
__private const int out_channel,
......@@ -102,7 +102,7 @@ __kernel void filter_image_to_buffer(KERNEL_ERROR_PARAMS
}
// TODO(liuqi): Support multiplier > 1
__kernel void dw_filter_buffer_to_image(KERNEL_ERROR_PARAMS
__kernel void dw_filter_buffer_to_image(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* MIHW */
__private const int input_offset,
......@@ -154,7 +154,7 @@ __kernel void dw_filter_buffer_to_image(KERNEL_ERROR_PARAMS
WRITE_IMAGET(output, coord, values);
}
__kernel void in_out_buffer_to_image(KERNEL_ERROR_PARAMS
__kernel void in_out_buffer_to_image(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
......@@ -196,7 +196,7 @@ __kernel void in_out_buffer_to_image(KERNEL_ERROR_PARAMS
WRITE_IMAGET(output, coord, values);
}
__kernel void in_out_image_to_buffer(KERNEL_ERROR_PARAMS
__kernel void in_out_image_to_buffer(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, /* nhwc */
__private const int height,
......@@ -236,7 +236,7 @@ __kernel void in_out_image_to_buffer(KERNEL_ERROR_PARAMS
}
}
__kernel void arg_buffer_to_image(KERNEL_ERROR_PARAMS
__kernel void arg_buffer_to_image(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input,
__private const int input_offset,
......@@ -272,7 +272,7 @@ __kernel void arg_buffer_to_image(KERNEL_ERROR_PARAMS
WRITE_IMAGET(output, coord, values);
}
__kernel void arg_image_to_buffer(KERNEL_ERROR_PARAMS
__kernel void arg_image_to_buffer(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output,
__private const int count,
......@@ -306,7 +306,7 @@ __kernel void arg_image_to_buffer(KERNEL_ERROR_PARAMS
}
__kernel void in_out_height_buffer_to_image(KERNEL_ERROR_PARAMS
__kernel void in_out_height_buffer_to_image(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //nhwc
__private const int input_offset,
......@@ -349,7 +349,7 @@ __kernel void in_out_height_buffer_to_image(KERNEL_ERROR_PARAMS
WRITE_IMAGET(output, coord, values);
}
__kernel void in_out_height_image_to_buffer(KERNEL_ERROR_PARAMS
__kernel void in_out_height_image_to_buffer(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //nhwc
__private const int height,
......@@ -387,7 +387,7 @@ __kernel void in_out_height_image_to_buffer(KERNEL_ERROR_PARAMS
output[offset] = values.w;
}
__kernel void in_out_width_buffer_to_image(KERNEL_ERROR_PARAMS
__kernel void in_out_width_buffer_to_image(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
......@@ -430,7 +430,7 @@ __kernel void in_out_width_buffer_to_image(KERNEL_ERROR_PARAMS
WRITE_IMAGET(output, coord, values);
}
__kernel void weight_height_buffer_to_image(KERNEL_ERROR_PARAMS
__kernel void weight_height_buffer_to_image(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, // OIHW
__private const int input_offset,
......@@ -475,7 +475,7 @@ __kernel void weight_height_buffer_to_image(KERNEL_ERROR_PARAMS
WRITE_IMAGET(output, coord, values);
}
__kernel void weight_height_image_to_buffer(KERNEL_ERROR_PARAMS
__kernel void weight_height_image_to_buffer(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //OIHW
__private const int out_channels,
......@@ -517,7 +517,7 @@ __kernel void weight_height_image_to_buffer(KERNEL_ERROR_PARAMS
}
__kernel void weight_width_buffer_to_image(KERNEL_ERROR_PARAMS
__kernel void weight_width_buffer_to_image(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, // OIHW
__private const int input_offset,
......@@ -565,7 +565,7 @@ __kernel void weight_width_buffer_to_image(KERNEL_ERROR_PARAMS
WRITE_IMAGET(output, coord, values);
}
__kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS
__kernel void weight_width_image_to_buffer(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, // OIHW
__private const int in_channels,
......@@ -609,7 +609,7 @@ __kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS
}
// only support 3x3 now
__kernel void winograd_filter_buffer_to_image_2x2(KERNEL_ERROR_PARAMS
__kernel void winograd_filter_buffer_to_image_2x2(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
......@@ -714,7 +714,7 @@ __kernel void winograd_filter_buffer_to_image_2x2(KERNEL_ERROR_PARAMS
}
// only support 3x3 now
__kernel void winograd_filter_image_to_buffer_2x2(KERNEL_ERROR_PARAMS
__kernel void winograd_filter_image_to_buffer_2x2(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
......@@ -757,7 +757,7 @@ __kernel void winograd_filter_image_to_buffer_2x2(KERNEL_ERROR_PARAMS
}
// only support 3x3 now
__kernel void winograd_filter_buffer_to_image_6x6(KERNEL_ERROR_PARAMS
__kernel void winograd_filter_buffer_to_image_6x6(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
......@@ -891,7 +891,7 @@ PROCESS(7);
#undef PROCESS
}
__kernel void winograd_filter_image_to_buffer_6x6(KERNEL_ERROR_PARAMS
__kernel void winograd_filter_image_to_buffer_6x6(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
......@@ -933,7 +933,7 @@ __kernel void winograd_filter_image_to_buffer_6x6(KERNEL_ERROR_PARAMS
}
// only support 3x3 now
__kernel void winograd_filter_buffer_to_image_4x4(KERNEL_ERROR_PARAMS
__kernel void winograd_filter_buffer_to_image_4x4(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
......@@ -1040,7 +1040,7 @@ __kernel void winograd_filter_buffer_to_image_4x4(KERNEL_ERROR_PARAMS
#undef PROCESS
}
__kernel void winograd_filter_image_to_buffer_4x4(KERNEL_ERROR_PARAMS
__kernel void winograd_filter_image_to_buffer_4x4(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
......
#include <common.h>
__kernel void pad_input(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global IN_DATA_TYPE *input,
__private const int in_height,
__private const int in_width,
__private const int in_chan,
__private const int padded_height,
__private const int padded_width,
__private const int padded_chan,
__private const int pad_top,
__private const int pad_left,
__global DATA_TYPE *output) {
const int padded_wc_blk_idx = get_global_id(0);
const int padded_hb_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (padded_wc_blk_idx >= global_size_dim0 ||
padded_hb_idx >= global_size_dim1) {
return;
}
#endif
const int padded_chan_blk = (padded_chan + 3) >> 2;
const int padded_width_idx = padded_wc_blk_idx / padded_chan_blk;
const int padded_chan_blk_idx = padded_wc_blk_idx % padded_chan_blk;
const int batch_idx = padded_hb_idx / padded_height;
const int padded_height_idx = padded_hb_idx % padded_height;
const int padded_chan_idx = padded_chan_blk_idx << 2;
const int in_height_idx = padded_height_idx - pad_top;
const int in_width_idx = padded_width_idx - pad_left;
const int padded_offset = mad24(mad24(mad24(batch_idx, padded_height, padded_height_idx),
padded_width, padded_width_idx), padded_chan, padded_chan_idx);
const int in_offset = mad24(mad24(mad24(batch_idx, in_height, in_height_idx),
in_width, in_width_idx), in_chan, padded_chan_idx);
DATA_TYPE4 value = 0;
if (0 <= in_height_idx && in_height_idx < in_height &&
0 <= in_width_idx && in_width_idx < in_width) {
const int remain_chan = in_chan - padded_chan_idx;
if (remain_chan < 4) {
switch (remain_chan) {
case 3:
value.z = CONVERT(input[in_offset + 2]);
case 2:
value.y = CONVERT(input[in_offset + 1]);
case 1:
value.x = CONVERT(input[in_offset]);
}
} else {
value = CONVERT4(vload4(0, input + in_offset));
}
}
vstore4(value, 0, output + padded_offset);
CHECK_OUT_OF_RANGE_FOR_BUFFER(padded_offset + 3);
}
// OIHW -> [H, W, (O+3) / 4, I, 4]
__kernel void transform_conv_filter(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__global IN_DATA_TYPE *input, // OIHW
__private const int input_offset,
__global DATA_TYPE *output,
__private const int out_chan,
__private const int in_chan,
__private const int height,
__private const int width,
__private const int inner_size) {
const int in_chan_idx = get_global_id(0);
const int out_chan_blk_idx = get_global_id(1);
const int hw_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (in_chan_idx >= global_size_dim0 ||
out_chan_blk_idx >= global_size_dim1 ||
hw_idx >= global_size_dim2) {
return;
}
#endif
const int t_in_chan = global_size_dim0;
const int out_chan_blk = global_size_dim1;
const int h_idx = hw_idx / width;
const int w_idx = hw_idx % width;
const int out_chan_idx = out_chan_blk_idx << 2;
const int in_offset = mad24(mad24(mad24(out_chan_idx, in_chan, in_chan_idx),
height, h_idx), width, w_idx) + input_offset;
const int out_offset = (mad24(mad24(mad24(h_idx, width, w_idx),
out_chan_blk, out_chan_blk_idx), t_in_chan, in_chan_idx) << 2);
DATA_TYPE4 value = 0;
if (in_chan_idx < in_chan) {
if (out_chan_idx + 3 < out_chan) {
value.x = CONVERT(input[in_offset]);
value.y = CONVERT(input[in_offset + inner_size]);
value.z = CONVERT(input[in_offset + 2 * inner_size]);
value.w = CONVERT(input[in_offset + 3 * inner_size]);
} else {
const int diff = out_chan - out_chan_idx;
switch(diff) {
case 3:
value.z = CONVERT(input[in_offset + 2 * inner_size]);
case 2:
value.y = CONVERT(input[in_offset + inner_size]);
case 1:
value.x = CONVERT(input[in_offset]);
}
}
}
VSTORE4(value, output, out_offset);
}
// MIHW -> [M, (I+3) / 4, H, W, 4]
__kernel void transform_dw_conv_filter(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__global IN_DATA_TYPE *input, // MIHW
__private const int input_offset,
__global DATA_TYPE *output,
__private const int in_chan,
__private const int in_hw) {
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int in_chan_blk_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (width_idx >= global_size_dim0 ||
height_idx >= global_size_dim1 ||
in_chan_blk_idx >= global_size_dim2) {
return;
}
#endif
const int width = global_size_dim0;
const int height = global_size_dim1;
const int in_chan_idx = in_chan_blk_idx << 2;
const int in_offset = mad24(in_chan_idx, in_hw,
mad24(height_idx, width, width_idx)) + input_offset;
const int out_offset = mad24(in_chan_blk_idx, in_hw,
mad24(height_idx, width, width_idx)) << 2;
DATA_TYPE4 value = 0;
if (in_chan_idx + 3 < in_chan) {
value.x = CONVERT(input[in_offset]);
value.y = CONVERT(input[in_offset + in_hw]);
value.z = CONVERT(input[in_offset + (in_hw << 1)]);
value.w = CONVERT(input[in_offset + in_hw + (in_hw << 1)]);
} else {
const int diff = in_chan - in_chan_idx;
switch(diff) {
case 3:
value.z = CONVERT(input[in_offset + (in_hw << 1)]);
case 2:
value.y = CONVERT(input[in_offset + in_hw]);
case 1:
value.x = CONVERT(input[in_offset]);
}
}
VSTORE4(value, output, out_offset);
}
__kernel void transform_arg(BUFFER_OUT_OF_RANGE_PARAMS
__private const int global_size_dim0,
__global IN_DATA_TYPE *input,
__private const int input_offset,
__global DATA_TYPE *output,
__private int size) {
const int blk_idx = get_global_id(0);
#ifndef NON_UNIFORM_WORK_GROUP
if (blk_idx >= global_size_dim0) {
return;
}
#endif
const int idx = blk_idx << 2;
const int diff = size - idx;
const int in_idx = idx + input_offset;
DATA_TYPE4 value = 0;
if (diff < 4) {
switch (diff) {
case 3:
value.z = CONVERT(input[in_idx + 2]);
case 2:
value.y = CONVERT(input[in_idx + 1]);
case 1:
value.x = CONVERT(input[in_idx]);
}
} else {
value = CONVERT4(vload4(0, input + in_idx));
}
VSTORE4(value, output, idx);
}
__kernel void transform_data_type(BUFFER_OUT_OF_RANGE_PARAMS
__private const int global_size_dim0,
__global IN_DATA_TYPE *input,
__private const int input_offset,
__global DATA_TYPE *output) {
const int out_idx = get_global_id(0);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_idx >= global_size_dim0) {
return;
}
#endif
DATA_TYPE4 input_value = CONVERT4(vload4(out_idx, input + input_offset));
vstore4(input_value, out_idx, output);
}
#include <common.h>
// assume channes_per_group mod 4 = 0 && groups mod 4 == 0
__kernel void channel_shuffle(KERNEL_ERROR_PARAMS
__kernel void channel_shuffle(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__private const int groups,
......
......@@ -24,19 +24,13 @@
#define CMD_TYPE(cmd, type) CMD_TYPE_STR(cmd, type)
#define DATA_TYPE4 VEC_DATA_TYPE(DATA_TYPE, 4)
#define OUT_DATA_TYPE4 VEC_DATA_TYPE(OUT_DATA_TYPE, 4)
#ifdef OUT_OF_RANGE_CHECK
#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \
check_out_of_range_for_image2d(image, (coord).x, (coord).y, kernel_error);
#else
#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord)
#endif
#define CONVERT_STR(value, type) convert_##type((value))
#define READ_IMAGET(image, sampler, coord) \
CMD_TYPE(read_image, CMD_DATA_TYPE)(image, sampler, coord)
#define WRITE_IMAGET(image, coord, value) \
CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \
CMD_TYPE(write_image, CMD_DATA_TYPE)(image, coord, value);
#define CONVERT_TO(value, type) CONVERT_STR(value, type)
#define CONVERT(value) CONVERT_TO(value, DATA_TYPE)
#define CONVERT4(value) CONVERT_TO(value, DATA_TYPE4)
#define GLOBAL_WORK_GROUP_SIZE_DIM2 \
__private const int global_size_dim0, \
......@@ -47,16 +41,37 @@
__private const int global_size_dim1, \
__private const int global_size_dim2,
// oorc for 'Out Of Range Check'
#ifdef OUT_OF_RANGE_CHECK
#define OUT_OF_RANGE_PARAMS \
__global int *oorc_flag,
#define KERNEL_ERROR_PARAMS \
__global char *kernel_error,
#define BUFFER_OUT_OF_RANGE_PARAMS \
__global int *oorc_flag, \
__private const int oorc_output_length,
#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \
check_out_of_range_for_image2d(image, (coord).x, (coord).y, oorc_flag);
#define CHECK_OUT_OF_RANGE_FOR_BUFFER(idx) \
check_out_of_range_for_buffer(oorc_output_length, (idx), oorc_flag);
#else
#define OUT_OF_RANGE_PARAMS
#define BUFFER_OUT_OF_RANGE_PARAMS
#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord)
#define CHECK_OUT_OF_RANGE_FOR_BUFFER(idx)
#endif
#define KERNEL_ERROR_PARAMS
#define READ_IMAGET(image, sampler, coord) \
CMD_TYPE(read_image, CMD_DATA_TYPE)(image, sampler, coord)
#define WRITE_IMAGET(image, coord, value) \
CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \
CMD_TYPE(write_image, CMD_DATA_TYPE)(image, coord, value);
#define VSTORE4(data, output, offset) \
CHECK_OUT_OF_RANGE_FOR_BUFFER((offset) + 3) \
vstore4(data, 0, output + (offset));
#endif
__constant sampler_t SAMPLER =
CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
......@@ -66,6 +81,7 @@ inline float4 do_sigmoid(float4 in) {
return native_recip(1.0f + native_exp(-in));
}
#ifdef DATA_TYPE
inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
#ifdef USE_PRELU
DATA_TYPE4 prelu_alpha,
......@@ -89,17 +105,25 @@ inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
#endif
return out;
}
#endif
inline void check_out_of_range_for_image2d(__write_only image2d_t image,
__private const int x,
__private const int y,
global char *kernel_error) {
#ifdef OUT_OF_RANGE_CHECK
__global int *oorc_flag) {
int2 image_dim = get_image_dim(image);
if (x >= image_dim.x || y >= image_dim.y) {
*kernel_error = 1;
*oorc_flag = 1;
}
#endif
}
inline void check_out_of_range_for_buffer(__private const int length,
__private const int idx,
__global int *oorc_flag) {
if (idx >= length) {
*oorc_flag = idx - length + 1;
}
}
#endif // MACE_KERNELS_OPENCL_CL_COMMON_H_
......@@ -22,7 +22,7 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
}
// Supported data type: half/float
__kernel void concat_channel(KERNEL_ERROR_PARAMS
__kernel void concat_channel(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input0,
__read_only image2d_t input1,
......@@ -84,7 +84,7 @@ __kernel void concat_channel(KERNEL_ERROR_PARAMS
}
// Required: All input channels are divisible by 4
__kernel void concat_channel_multi(KERNEL_ERROR_PARAMS
__kernel void concat_channel_multi(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__private const int chan_blk_offset,
......
#include <common.h>
__kernel void conv_2d(KERNEL_ERROR_PARAMS
__kernel void conv_2d(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
......
#include <common.h>
__kernel void conv_2d_1x1(KERNEL_ERROR_PARAMS
__kernel void conv_2d_1x1(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin, cout/4 */
......
#include <common.h>
__kernel void conv2d(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global IN_DATA_TYPE *padded_input,
__global IN_DATA_TYPE *filter,
#ifdef BIAS
__global IN_DATA_TYPE *bias,
#endif
__private const int in_height,
__private const int in_width,
__private const int in_chan,
__private const int filter_in_chan,
__private const int out_height,
__private const int out_width,
__private const int out_chan,
__private const int stride_h,
__private const int stride_w,
__private const float relux_max_limit,
__global OUT_DATA_TYPE *output) {
const int out_wc_blk_idx = get_global_id(0);
const int out_hb_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_wc_blk_idx >= global_size_dim0 ||
out_hb_idx >= global_size_dim1) {
return;
}
#endif
const int out_chan_blk = (out_chan + 3) >> 2;
const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk;
const int out_chan_blk_idx = out_wc_blk_idx % out_chan_blk;
const int batch_idx = out_hb_idx / out_height;
const int out_height_idx = out_hb_idx % out_height;
const int out_width_idx = out_width_blk_idx << 1;
const int out_chan_idx = out_chan_blk_idx << 2;
const int in_height_idx = mul24(out_height_idx, stride_h);
const int in_width_idx = mul24(out_width_idx, stride_w);
const int strided_chan = mul24(in_chan, stride_w);
#ifdef BIAS
DATA_TYPE4 out0 = CONVERT4(vload4(0, bias + out_chan_idx));
DATA_TYPE4 out1 = out0;
#else
DATA_TYPE4 out0 = 0;
DATA_TYPE4 out1 = 0;
#endif
int in_offset = mul24(mad24(mad24(batch_idx, in_height, in_height_idx),
in_width, in_width_idx), in_chan);
int filter_offset = mul24(out_chan_blk_idx, filter_in_chan) << 2;
DATA_TYPE4 in0, in1;
DATA_TYPE4 w0, w1, w2, w3;
for (int in_chan_idx = 0; in_chan_idx < in_chan; in_chan_idx += 4) {
w0 = CONVERT4(vload4(0, filter + filter_offset));
w1 = CONVERT4(vload4(0, filter + filter_offset + 4));
w2 = CONVERT4(vload4(0, filter + filter_offset + 8));
w3 = CONVERT4(vload4(0, filter + filter_offset + 12));
in0 = CONVERT4(vload4(0, padded_input + in_offset));
in1 = CONVERT4(vload4(0, padded_input + in_offset + strided_chan));
out0 = mad((DATA_TYPE4)(in0.x), w0, out0);
out0 = mad((DATA_TYPE4)(in0.y), w1, out0);
out0 = mad((DATA_TYPE4)(in0.z), w2, out0);
out0 = mad((DATA_TYPE4)(in0.w), w3, out0);
out1 = mad((DATA_TYPE4)(in1.x), w0, out1);
out1 = mad((DATA_TYPE4)(in1.y), w1, out1);
out1 = mad((DATA_TYPE4)(in1.z), w2, out1);
out1 = mad((DATA_TYPE4)(in1.w), w3, out1);
filter_offset += 16;
in_offset += 4;
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit);
out1 = do_activation(out1, relux_max_limit);
#endif
int out_offset = mad24(mad24(mad24(batch_idx, out_height, out_height_idx),
out_width, out_width_idx), out_chan, out_chan_idx);
#define WRITE_OUTPUT(i) \
if (out_chan_idx + 4 > out_chan) { \
const int diff = out_chan - out_chan_idx; \
switch(diff) { \
case 3: \
output[out_offset + 2] = CONVERT_TO(out##i.z, OUT_DATA_TYPE); \
case 2: \
output[out_offset + 1] = CONVERT_TO(out##i.y, OUT_DATA_TYPE); \
case 1: \
output[out_offset] = CONVERT_TO(out##i.x, OUT_DATA_TYPE); \
} \
CHECK_OUT_OF_RANGE_FOR_BUFFER(out_offset + diff - 1); \
} else { \
VSTORE4(CONVERT_TO(out##i, OUT_DATA_TYPE4), output, out_offset); \
}
WRITE_OUTPUT(0);
if (out_width_idx + 1 >= out_width) return;
out_offset += out_chan;
WRITE_OUTPUT(1);
#undef WRITE_OUTPUT
}
#include <common.h>
__kernel void conv_2d_3x3(KERNEL_ERROR_PARAMS
__kernel void conv_2d_3x3(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */
......
此差异已折叠。
#include <common.h>
__kernel void crop(KERNEL_ERROR_PARAMS
__kernel void crop(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__private const int offset_b,
......
#include <common.h>
__kernel void deconv_2d(KERNEL_ERROR_PARAMS
__kernel void deconv_2d(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__read_only image2d_t weights,
......
#include <common.h>
__kernel void depth_to_space(KERNEL_ERROR_PARAMS
__kernel void depth_to_space(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__private const int block_size,
......
#include <common.h>
// Only multiplier = 1 is supported
__kernel void depthwise_conv2d(KERNEL_ERROR_PARAMS
__kernel void depthwise_conv2d(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
......@@ -136,7 +136,7 @@ __kernel void depthwise_conv2d(KERNEL_ERROR_PARAMS
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
}
__kernel void depthwise_conv2d_s1(KERNEL_ERROR_PARAMS
__kernel void depthwise_conv2d_s1(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
......
此差异已折叠。
#include <common.h>
__kernel void eltwise(KERNEL_ERROR_PARAMS
__kernel void eltwise(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input0,
#if INPUT_TYPE == 1
......
#include <common.h>
__kernel void lstmcell(KERNEL_ERROR_PARAMS
__kernel void lstmcell(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input,
__read_only image2d_t pre_output,
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册