Commit 6a764b53 authored by 叶剑武

Merge branch 'fix-opencl-tuning-bug' into 'master'

Fix wrong OpenCL get_global_size when the kernel is split.

See merge request !693
Parent a0fb2012
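
Background: when a device does not support non-uniform work-groups, the host rounds the enqueued global work size up to a multiple of the local work size (the "kernel split" case), so get_global_size() inside the kernel reports the padded size rather than the logical one. The diff below therefore always passes the logical sizes to the kernels as arguments (global_size_dim0/1/2) and uses those instead. The following is only a minimal sketch of that pattern, with a hypothetical kernel name and argument list, and plain OpenCL image reads in place of MACE's READ_IMAGET/WRITE_IMAGET macros:

// Sketch only: "example_op" and its arguments are hypothetical.
#define GLOBAL_WORK_GROUP_SIZE_DIM3       \
    __private const int global_size_dim0, \
    __private const int global_size_dim1, \
    __private const int global_size_dim2,

__constant sampler_t smp =
    CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

__kernel void example_op(GLOBAL_WORK_GROUP_SIZE_DIM3
                         __read_only image2d_t input,
                         __write_only image2d_t output) {
  const int ch_blk = get_global_id(0);
  const int w = get_global_id(1);
  const int hb = get_global_id(2);

#ifndef NON_UNIFORM_WORK_GROUP
  // Work-items that exist only because of the round-up exit early.
  if (ch_blk >= global_size_dim0 || w >= global_size_dim1 ||
      hb >= global_size_dim2) {
    return;
  }
#endif

  // Always index with the logical width passed from the host;
  // get_global_size(1) would return the rounded-up width after a split.
  const int width = global_size_dim1;
  const int pos = mad24(ch_blk, width, w);
  float4 value = read_imagef(input, smp, (int2)(pos, hb));
  write_imagef(output, (int2)(pos, hb), value);
}

On the host side, the same diff replaces the hand-rolled out-of-range and work-group argument setup with the OUT_OF_RANGE_SET_ARG and SET_2D_GWS_ARGS/SET_3D_GWS_ARGS macros (and their *_PTR variants), which set the error buffer and these logical sizes ahead of the tensor arguments.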
......@@ -44,10 +44,12 @@ cc_library(
],
exclude = [
"buffer_to_image.h",
"image_to_buffer.h",
],
) + if_opencl_enabled(glob([
"opencl/*.h",
"buffer_to_image.h",
"image_to_buffer.h",
])),
copts = [
"-Werror",
......
......@@ -16,19 +16,18 @@
#define MACE_KERNELS_BUFFER_TO_IMAGE_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/kernels/opencl/common.h"
namespace mace {
namespace kernels {
struct BufferToImageFunctorBase {
explicit BufferToImageFunctorBase(const int wino_blk_size)
: kernel_error_(nullptr),
wino_blk_size_(wino_blk_size) {}
std::unique_ptr<BufferBase> kernel_error_;
: wino_blk_size_(wino_blk_size) {}
const int wino_blk_size_;
};
......@@ -57,6 +56,10 @@ struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase {
const BufferType type,
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -23,11 +23,6 @@
#include "mace/kernels/activation.h"
#include "mace/kernels/gemm.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/kernels/opencl/helper.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......
......@@ -16,19 +16,18 @@
#define MACE_KERNELS_IMAGE_TO_BUFFER_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/kernels/opencl/common.h"
namespace mace {
namespace kernels {
struct ImageToBufferFunctorBase {
explicit ImageToBufferFunctorBase(const int wino_blk_size)
: kernel_error_(nullptr),
wino_blk_size_(wino_blk_size) {}
std::unique_ptr<BufferBase> kernel_error_;
: wino_blk_size_(wino_blk_size) {}
const int wino_blk_size_;
};
......@@ -37,9 +36,9 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase {
explicit ImageToBufferFunctor(const int wino_blk_size)
: ImageToBufferFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input,
const BufferType type,
Tensor *output,
StatsFuture *future) {
const BufferType type,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(input);
MACE_UNUSED(type);
MACE_UNUSED(output);
......@@ -54,9 +53,13 @@ struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase {
explicit ImageToBufferFunctor(const int wino_blk_size)
: ImageToBufferFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input,
const BufferType type,
Tensor *output,
StatsFuture *future);
const BufferType type,
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -13,7 +13,6 @@
// limitations under the License.
#include "mace/kernels/activation.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
......@@ -38,23 +37,13 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
switch (activation_) {
case RELU:
tuning_key_prefix_ = "relu_opencl_kernel";
......@@ -92,15 +81,8 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
......@@ -119,13 +101,7 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws,
lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
}
......
......@@ -49,24 +49,14 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
MACE_NOT_IMPLEMENTED;
}
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
built_options.emplace("-Daddn=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
built_options, &kernel_));
......@@ -92,14 +82,8 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
output_tensor->ResizeImage(output_shape, output_image_shape));
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
}
OUT_OF_RANGE_SET_ARG;
SET_2D_GWS_ARGS(kernel_);
for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image()));
}
......@@ -114,14 +98,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
output_tensor->dim(2), output_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
}
......
......@@ -13,7 +13,6 @@
// limitations under the License.
#include "mace/kernels/batch_norm.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
......@@ -49,23 +48,13 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (folded_constant_) {
built_options.emplace("-DFOLDED_CONSTANT");
}
......@@ -96,15 +85,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++, *(offset->opencl_image()));
......@@ -125,14 +107,7 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
output->dim(1), output->dim(2), output->dim(3), folded_constant_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
}
......
......@@ -13,7 +13,6 @@
// limitations under the License.
#include "mace/kernels/bias_add.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
......@@ -45,39 +44,21 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
......@@ -104,12 +85,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
......
......@@ -13,8 +13,8 @@
// limitations under the License.
#include "mace/kernels/buffer_to_image.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......@@ -30,7 +30,8 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size_);
if (type == WINOGRAD_FILTER) {
std::vector<index_t> new_shape =
CalWinogradShape(buffer->shape(), type, wino_blk_size_);
{(wino_blk_size_ + 2) * (wino_blk_size_ + 2),
buffer->dim(0), buffer->dim(1)};
MACE_RETURN_IF_ERROR(image->ResizeImage(new_shape, image_shape));
} else {
MACE_RETURN_IF_ERROR(image->ResizeImage(buffer->shape(), image_shape));
......@@ -76,85 +77,74 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
auto runtime = OpenCLRuntime::Global();
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
if (buffer->dtype() == image->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
} else {
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
}
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
if (!kernel_error_) {
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (buffer->dtype() == image->dtype()) {
built_options.emplace(
"-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
} else {
built_options.emplace("-DDATA_TYPE=" +
DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel(
"buffer_to_image", obfuscated_kernel_name, built_options, &kernel_));
}
cl::Kernel b2f_kernel;
MACE_RETURN_IF_ERROR(runtime->BuildKernel(
"buffer_to_image", obfuscated_kernel_name, built_options, &b2f_kernel));
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
b2f_kernel.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
b2f_kernel.setArg(idx++, gws[0]);
b2f_kernel.setArg(idx++, gws[1]);
}
b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
"buffer offset not aligned");
b2f_kernel.setArg(idx++,
static_cast<uint32_t>(buffer->buffer_offset() /
GetEnumTypeSize(buffer->dtype())));
if (type == CONV2D_FILTER) {
const index_t inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(inner_size));
} else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
} else if (type == ARGUMENT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
} else {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(formatted_buffer_shape[1]));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(formatted_buffer_shape[2]));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(formatted_buffer_shape[3]));
if (!IsVecEqual(input_shape_, buffer->shape())) {
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_2D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(buffer->opencl_buffer()));
MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
"buffer offset not aligned");
kernel_.setArg(idx++,
static_cast<uint32_t>(buffer->buffer_offset() /
GetEnumTypeSize(buffer->dtype())));
if (type == CONV2D_FILTER) {
const index_t
inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
} else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) {
kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
} else if (type == ARGUMENT) {
kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
} else {
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[1]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[2]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[3]));
}
kernel_.setArg(idx++, *(image->opencl_image()));
input_shape_ = buffer->shape();
}
b2f_kernel.setArg(idx++, *(image->opencl_image()));
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(b2f_kernel));
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {16, kwg_size / 16};
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
......@@ -163,16 +153,11 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
}
error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
......
......@@ -45,23 +45,13 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("channel_shuffle", kernel_name,
built_options, &kernel_));
......@@ -72,15 +62,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, groups_);
kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
......@@ -95,14 +78,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
}
......
......@@ -17,10 +17,8 @@ __kernel void activation(KERNEL_ERROR_PARAMS
|| hb >= global_size_dim2) {
return;
}
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
#endif
const int width = global_size_dim1;
const int pos = mad24(ch_blk, width, w);
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
......
......@@ -21,10 +21,8 @@ __kernel void batch_norm(KERNEL_ERROR_PARAMS
|| hb >= global_size_dim2) {
return;
}
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
#endif
const int width = global_size_dim1;
#ifdef FOLDED_CONSTANT
DATA_TYPE4 bn_scale = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0));
......
......@@ -14,10 +14,8 @@ __kernel void bias_add(KERNEL_ERROR_PARAMS
|| hb >= global_size_dim2) {
return;
}
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
#endif
const int width = global_size_dim1;
const int pos = mad24(ch_blk, width, w);
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
......
......@@ -446,10 +446,8 @@ __kernel void weight_height_buffer_to_image(KERNEL_ERROR_PARAMS
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int inner_size = global_size_dim0;
#else
const int inner_size = get_global_size(0);
#endif
const int inner_size = global_size_dim0;
const int out_chan_idx = h << 2;
const int in_chan_idx = w % in_channels;
......@@ -492,10 +490,8 @@ __kernel void weight_height_image_to_buffer(KERNEL_ERROR_PARAMS
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int inner_size = global_size_dim0;
#else
const int inner_size = get_global_size(0);
#endif
const int inner_size = global_size_dim0;
const int out_chan_idx = h << 2;
const int in_chan_idx = w % in_channels;
......@@ -536,10 +532,8 @@ __kernel void weight_width_buffer_to_image(KERNEL_ERROR_PARAMS
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int out_channels = global_size_dim1;
#else
const int out_channels = get_global_size(1);
#endif
const int out_channels = global_size_dim1;
const int in_chan_blks = (in_channels + 3) >> 2;
const int hw_size = height * width;
const int inner_size = in_channels * hw_size;
......@@ -585,10 +579,8 @@ __kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int out_channels = global_size_dim1;
#else
const int out_channels = get_global_size(1);
#endif
const int out_channels = global_size_dim1;
const int in_chan_blks = (in_channels + 3) >> 2;
const int hw_size = height * width;
const int inner_size = in_channels * hw_size;
......@@ -632,10 +624,8 @@ __kernel void winograd_filter_buffer_to_image_2x2(KERNEL_ERROR_PARAMS
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int out_channels = global_size_dim1;
#else
const int out_channels = get_global_size(1);
#endif
const int out_channels = global_size_dim1;
const int out_channel_idx = h;
const int in_channel_idx = w << 2;
......@@ -782,10 +772,8 @@ __kernel void winograd_filter_buffer_to_image_6x6(KERNEL_ERROR_PARAMS
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int out_channels = global_size_dim1;
#else
const int out_channels = get_global_size(1);
#endif
const int out_channels = global_size_dim1;
const int out_channel_idx = h;
const int in_channel_idx = w << 2;
......@@ -960,10 +948,8 @@ __kernel void winograd_filter_buffer_to_image_4x4(KERNEL_ERROR_PARAMS
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int out_channels = global_size_dim1;
#else
const int out_channels = get_global_size(1);
#endif
const int out_channels = global_size_dim1;
const int out_channel_idx = h;
const int in_channel_idx = w << 2;
......
......@@ -16,10 +16,8 @@ __kernel void channel_shuffle(KERNEL_ERROR_PARAMS
|| hb_idx >= global_size_dim2) {
return;
}
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
#endif
const int width = global_size_dim1;
const int group_blks = groups / 4;
const int groups_blks_width = group_blks * width;
......
......@@ -38,24 +38,15 @@
CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \
CMD_TYPE(write_image, CMD_DATA_TYPE)(image, coord, value);
#ifndef NON_UNIFORM_WORK_GROUP
#define GLOBAL_WORK_GROUP_SIZE_DIM2 \
__private const int global_size_dim0, \
__private const int global_size_dim1,
#define GLOBAL_WORK_GROUP_SIZE_DIM3 \
__private const int global_size_dim0, \
__private const int global_size_dim1, \
__private const int global_size_dim2,
#else
#define GLOBAL_WORK_GROUP_SIZE_DIM2
#define GLOBAL_WORK_GROUP_SIZE_DIM3
#endif
#ifdef OUT_OF_RANGE_CHECK
#define KERNEL_ERROR_PARAMS \
......
......@@ -37,10 +37,8 @@ __kernel void concat_channel(KERNEL_ERROR_PARAMS
|| hb_idx >= global_size_dim2) {
return;
}
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
#endif
const int width = global_size_dim1;
const int input0_chan_blk = (input0_chan + 3) >> 2;
......@@ -100,10 +98,8 @@ __kernel void concat_channel_multi(KERNEL_ERROR_PARAMS
|| hb_idx >= global_size_dim2) {
return;
}
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
#endif
const int width = global_size_dim1;
DATA_TYPE4 data = 0;
data = READ_IMAGET(input,
......
......@@ -30,10 +30,8 @@ __kernel void conv_2d(KERNEL_ERROR_PARAMS
|| out_hb >= global_size_dim2) {
return;
}
const int out_w_blks = global_size_dim1;
#else
const int out_w_blks = get_global_size(1);
#endif
const int out_w_blks = global_size_dim1;
#ifdef BIAS
DATA_TYPE4 out0 =
......
......@@ -24,10 +24,8 @@ __kernel void conv_2d_1x1(KERNEL_ERROR_PARAMS
|| out_hb >= global_size_dim2) {
return;
}
const int out_w_blks = global_size_dim1;
#else
const int out_w_blks = get_global_size(1);
#endif
const int out_w_blks = global_size_dim1;
#ifdef BIAS
DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
......
......@@ -28,10 +28,8 @@ __kernel void conv_2d_3x3(KERNEL_ERROR_PARAMS
|| out_hb >= global_size_dim2) {
return;
}
const int out_w_blks = global_size_dim1;
#else
const int out_w_blks = get_global_size(1);
#endif
const int out_w_blks = global_size_dim1;
#ifdef BIAS
DATA_TYPE4 out0 =
......
......@@ -30,10 +30,8 @@ __kernel void depthwise_conv2d(KERNEL_ERROR_PARAMS
|| out_hb >= global_size_dim2) {
return;
}
const short out_w_blks = global_size_dim1;
#else
const short out_w_blks = get_global_size(1);
#endif
const short out_w_blks = global_size_dim1;
const short rounded_in_ch = in_ch_blks << 2;
const short in_ch_blk = out_ch_blk; // multiplier = 1
......
......@@ -80,7 +80,7 @@ __kernel void fully_connected_width(KERNEL_ERROR_PARAMS
__private const float relux_max_limit) {
const int inter_out_idx = get_global_id(0);
const int width_blk_idx = get_global_id(1);
const int width_blk_count = get_global_size(1);
const int width_blk_count = global_size_dim1;
const int batch_out_blk_idx = get_global_id(2);
const int batch_idx = batch_out_blk_idx / out_blks;
......
......@@ -23,11 +23,8 @@ __kernel void pad(KERNEL_ERROR_PARAMS
|| hb_idx >= global_size_dim2) {
return;
}
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
#endif
const int width = global_size_dim1;
DATA_TYPE4 data = constant_value;
if ((height_padding <= height_idx && height_idx < input_padded_height) &&
......
......@@ -36,10 +36,8 @@ __kernel void pooling(KERNEL_ERROR_PARAMS
|| out_hb_idx >= global_size_dim2) {
return;
}
const int out_width = global_size_dim1;
#else
const int out_width = get_global_size(1);
#endif
const int out_width = global_size_dim1;
const int batch_idx = mul24((out_hb_idx / out_height), in_height);
const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top;
......
......@@ -19,12 +19,9 @@ __kernel void resize_bilinear_nocache(KERNEL_ERROR_PARAMS
|| hb >= global_size_dim2) {
return;
}
#endif
const int ch_blks = global_size_dim0;
const int out_width = global_size_dim1;
#else
const int ch_blks = get_global_size(0);
const int out_width = get_global_size(1);
#endif
const int b = hb / out_height;
const int h = hb % out_height;
......
......@@ -14,10 +14,8 @@ __kernel void slice(KERNEL_ERROR_PARAMS
|| hb_idx >= global_size_dim2) {
return;
}
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
#endif
const int width = global_size_dim1;
DATA_TYPE4 data = READ_IMAGET(input, SAMPLER,
(int2)(mad24(chan_blk_idx + chan_blk_offset,
......
......@@ -15,12 +15,9 @@ __kernel void softmax(KERNEL_ERROR_PARAMS
|| hb_idx >= global_size_dim2) {
return;
}
#endif
const int chan_blks = global_size_dim0 - 1;
const int width = global_size_dim1;
#else
const int chan_blks = get_global_size(0) - 1;
const int width = get_global_size(1);
#endif
int pos = width_idx;
DATA_TYPE max_value = -FLT_MAX;
......
......@@ -20,10 +20,8 @@ __kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS
if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) {
return;
}
const int chan_blk_size = global_size_dim1;
#else
const int chan_blk_size = get_global_size(1);
#endif
const int chan_blk_size = global_size_dim1;
const int batch_idx = out_width_idx * round_hw_r;
const int t_idx = mad24(batch_idx, -round_hw, out_width_idx);
......@@ -141,10 +139,8 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS
if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
return;
}
const int out_channel = global_size_dim1;
#else
const int out_channel = get_global_size(1);
#endif
const int out_channel = global_size_dim1;
int width = width_idx;
int height = height_idx;
......@@ -255,10 +251,8 @@ __kernel void winograd_transform_4x4(KERNEL_ERROR_PARAMS
if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) {
return;
}
const int chan_blk_size = global_size_dim1;
#else
const int chan_blk_size = get_global_size(1);
#endif
const int chan_blk_size = global_size_dim1;
const int batch_idx = out_width_idx * round_hw_r;
const int t_idx = mad24(batch_idx, -round_hw, out_width_idx);
......@@ -417,10 +411,8 @@ __kernel void winograd_inverse_transform_4x4(KERNEL_ERROR_PARAMS
if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
return;
}
const int out_channel = global_size_dim1;
#else
const int out_channel = get_global_size(1);
#endif
const int out_channel = global_size_dim1;
const int batch = width_idx * round_hw_r;
int h = mad24(batch, -round_hw, width_idx);
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_COMMON_H_
#define MACE_KERNELS_OPENCL_COMMON_H_
namespace mace {
namespace kernels {
enum BufferType {
CONV2D_FILTER = 0,
IN_OUT_CHANNEL = 1,
ARGUMENT = 2,
IN_OUT_HEIGHT = 3,
IN_OUT_WIDTH = 4,
WINOGRAD_FILTER = 5,
DW_CONV2D_FILTER = 6,
WEIGHT_HEIGHT = 7,
WEIGHT_WIDTH = 8,
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_COMMON_H_
......@@ -65,26 +65,16 @@ static MaceStatus Concat2(cl::Kernel *kernel,
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name);
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
if (input0->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
}
if (input0->dim(3) % 4 == 0) {
built_options.emplace("-DDIVISIBLE_FOUR");
......@@ -97,15 +87,8 @@ static MaceStatus Concat2(cl::Kernel *kernel,
}
if (!IsVecEqual(*prev_input_shape, input0->shape())) {
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG_PTR;
SET_3D_GWS_ARGS_PTR(kernel, gws);
kernel->setArg(idx++,
*(static_cast<const cl::Image2D *>(input0->opencl_image())));
kernel->setArg(idx++,
......@@ -123,14 +106,7 @@ static MaceStatus Concat2(cl::Kernel *kernel,
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
OUT_OF_RANGE_VALIDATION(*kernel_error);
return MACE_SUCCESS;
}
......@@ -149,22 +125,12 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
built_options, kernel));
*kwg_size =
......@@ -185,15 +151,8 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG_PTR;
SET_3D_GWS_ARGS_PTR(kernel, gws);
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, static_cast<int32_t>(chan_blk_offset));
kernel->setArg(idx++, *(output->opencl_image()));
......@@ -217,12 +176,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
OUT_OF_RANGE_VALIDATION(*kernel_error);
if (future != nullptr && runtime->is_profiling_enabled()) {
event.wait();
CallStats tmp_stats;
......
......@@ -98,22 +98,12 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
MACE_CHECK(input_batch == batch);
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1");
built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (bias != nullptr) {
built_options.emplace("-DBIAS");
}
......@@ -147,17 +137,11 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
// Support different input size
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG_PTR;
SET_3D_GWS_ARGS_PTR(kernel, gws);
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
......@@ -182,14 +166,7 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
OUT_OF_RANGE_VALIDATION(*kernel_error);
return MACE_SUCCESS;
}
......
......@@ -84,22 +84,12 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
......@@ -131,17 +121,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
// Support different input size
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG_PTR;
SET_3D_GWS_ARGS_PTR(kernel, gws);
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
......@@ -169,14 +153,7 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
OUT_OF_RANGE_VALIDATION(*kernel_error);
return MACE_SUCCESS;
}
......
......@@ -93,22 +93,12 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
built_options.emplace("-Dconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
......@@ -140,17 +130,11 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
// Support different input size
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG_PTR;
SET_3D_GWS_ARGS_PTR(kernel, gws);
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
......@@ -182,13 +166,7 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
OUT_OF_RANGE_VALIDATION(*kernel_error);
return MACE_SUCCESS;
}
......
......@@ -136,23 +136,13 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop");
built_options.emplace("-Dcrop=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name,
built_options, &kernel_));
......@@ -161,15 +151,8 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
}
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input0->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(offsets[0]));
kernel_.setArg(idx++, static_cast<int>(offsets[1]));
......@@ -190,14 +173,7 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
}
......
......@@ -59,22 +59,12 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d");
built_options.emplace("-Ddeconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
......@@ -108,15 +98,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG_PTR;
SET_3D_GWS_ARGS_PTR(kernel, gws);
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
......@@ -152,13 +135,7 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
OUT_OF_RANGE_VALIDATION(*kernel_error);
return MACE_SUCCESS;
}
......
......@@ -76,6 +76,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
......@@ -83,18 +85,6 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
obfuscated_kernel_name,
built_options,
......@@ -106,15 +96,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
if (d2s_) {
kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
......@@ -140,13 +123,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
}
......
......@@ -93,6 +93,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d_s1");
......@@ -100,20 +102,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
} else {
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
}
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
built_options.emplace(MakeString("-DSTRIDE=", stride));
switch (activation) {
......@@ -154,15 +144,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
input_channels);
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG_PTR;
SET_3D_GWS_ARGS_PTR(kernel, gws);
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
......@@ -193,13 +176,7 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
OUT_OF_RANGE_VALIDATION(*kernel_error);
return MACE_SUCCESS;
}
......
......@@ -74,11 +74,13 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
built_options.emplace("-Deltwise=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
if (input1 == nullptr) {
built_options.emplace("-DINPUT_TYPE=1");
......@@ -90,19 +92,6 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
if (swapped) built_options.emplace("-DSWAPPED");
}
if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
built_options, &kernel_));
......@@ -111,15 +100,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
}
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input0->opencl_image()));
if (input1 == nullptr) {
kernel_.setArg(idx++, value_);
......@@ -144,13 +126,7 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
}
......
......@@ -13,6 +13,8 @@
// limitations under the License.
#include "mace/kernels/fully_connected.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
namespace mace {
......@@ -42,11 +44,13 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
const index_t output_blocks = RoundUpDiv4(output_size);
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
built_options.emplace("-Dfully_connected_width=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (bias != nullptr) {
built_options.emplace("-DBIAS");
}
......@@ -71,19 +75,6 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_QUALCOMM_ADRENO");
}
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name,
built_options, kernel));
......@@ -113,15 +104,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
(*gws)[2] = static_cast<uint32_t>(batch * output_blocks);
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, (*gws)[0]);
kernel->setArg(idx++, (*gws)[1]);
kernel->setArg(idx++, (*gws)[2]);
}
OUT_OF_RANGE_SET_ARG_PTR;
SET_3D_GWS_ARGS_PTR(kernel, *gws);
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(weight->opencl_image()));
if (bias != nullptr) {
......@@ -154,12 +138,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event);
}
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
OUT_OF_RANGE_VALIDATION(*kernel_error);
MACE_CL_RET_STATUS(error);
if (future != nullptr) {
......@@ -192,26 +171,16 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected");
built_options.emplace("-Dfully_connected=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (bias != nullptr) {
built_options.emplace("-DBIAS");
}
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
switch (activation) {
case NOOP:
break;
......@@ -247,14 +216,8 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
};
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, (*gws)[0]);
kernel->setArg(idx++, (*gws)[1]);
}
OUT_OF_RANGE_SET_ARG_PTR;
SET_2D_GWS_ARGS_PTR(kernel, *gws);
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(weight->opencl_image()));
if (bias != nullptr) {
......@@ -276,13 +239,7 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(*kernel, tuning_key,
gws->data(), *lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
OUT_OF_RANGE_VALIDATION(*kernel_error);
return MACE_SUCCESS;
}
} // namespace
......
......@@ -180,22 +180,6 @@ std::vector<index_t> FormatBufferShape(
}
}
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type,
const int wino_blk_size) {
if (type == WINOGRAD_FILTER) {
return {(wino_blk_size + 2) * (wino_blk_size + 2), shape[0], shape[1]};
} else if (type == IN_OUT_HEIGHT) {
index_t out_width =
shape[0] * ((shape[1] + wino_blk_size - 1) / wino_blk_size) *
((shape[2] + wino_blk_size - 1) / wino_blk_size);
return {(wino_blk_size + 2) * (wino_blk_size + 2), shape[3], out_width};
} else {
LOG(FATAL) << "Mace not supported yet.";
return std::vector<index_t>();
}
}
std::string DtToCLDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
......@@ -220,7 +204,7 @@ std::string DtToCLCMDDt(const DataType dt) {
}
}
std::string DtToUpstreamCLDt(const DataType dt) {
std::string DtToUpCompatibleCLDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
......@@ -231,7 +215,7 @@ std::string DtToUpstreamCLDt(const DataType dt) {
}
}
std::string DtToUpstreamCLCMDDt(const DataType dt) {
std::string DtToUpCompatibleCLCMDDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
......@@ -357,7 +341,8 @@ MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
double elapse_time = timer->AccumulatedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(
static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
static_cast<uint32_t>(elapse_time / kMaxKernelExecTime) + 1,
gws[2]);
uint32_t block_size = gws[2] / num_blocks;
if (!runtime->IsNonUniformWorkgroupsSupported()) {
block_size = RoundUp(block_size, params[2]);
......@@ -465,7 +450,8 @@ MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
double elapse_time = timer->AccumulatedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(
static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
static_cast<uint32_t>(elapse_time / kMaxKernelExecTime) + 1,
gws[1]);
uint32_t block_size = gws[1] / num_blocks;
if (!runtime->IsNonUniformWorkgroupsSupported()) {
block_size = RoundUp(block_size, params[1]);
......
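The renamed kMaxKernelExecTime constant caps how long a single tuned launch may run: when a kernel exceeds it, TuningOrRun3DKernel/TuningOrRun2DKernel split the last global-work-size dimension into blocks, rounding each block up to the local work size when non-uniform work groups are unsupported. A standalone sketch of that block computation follows (toy values and the local RoundUp helper are assumptions for illustration, not the MACE code):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Toy stand-in mirroring the constant used in helper.cc above.
constexpr float kMaxKernelExecTime = 1000.0f;  // microseconds per launch

uint32_t RoundUp(uint32_t value, uint32_t factor) {
  return (value + factor - 1) / factor * factor;
}

int main() {
  const uint32_t gws2 = 96;         // last global-work-size dimension
  const uint32_t lws2 = 8;          // local work size along that dimension
  const double elapse_time = 3500;  // measured microseconds for one full run
  const bool non_uniform_wg_supported = false;

  // Split into enough blocks that each launch stays under kMaxKernelExecTime.
  uint32_t num_blocks = std::min(
      static_cast<uint32_t>(elapse_time / kMaxKernelExecTime) + 1, gws2);
  uint32_t block_size = gws2 / num_blocks;
  if (!non_uniform_wg_supported) {
    // With uniform work groups the enqueued global size must be a multiple of
    // the local size, so each block is rounded up to the local work size.
    block_size = RoundUp(block_size, lws2);
  }
  for (uint32_t start = 0; start < gws2; start += block_size) {
    const uint32_t count = std::min(block_size, gws2 - start);
    std::cout << "launch block [" << start << ", " << start + count << ")\n";
  }
  return 0;
}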
......@@ -15,7 +15,9 @@
#ifndef MACE_KERNELS_OPENCL_HELPER_H_
#define MACE_KERNELS_OPENCL_HELPER_H_
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "mace/core/future.h"
......@@ -23,26 +25,71 @@
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/types.h"
#include "mace/kernels/opencl/common.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
const float kMaxKernelExeTime = 1000.0; // microseconds
#define OUT_OF_RANGE_CONFIG(kernel_error) \
if (runtime->IsOutOfRangeCheckEnabled()) { \
built_options.emplace("-DOUT_OF_RANGE_CHECK"); \
(kernel_error) = std::move(std::unique_ptr<Buffer>( \
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); \
MACE_RETURN_IF_ERROR((kernel_error)->Allocate(1)); \
(kernel_error)->Map(nullptr); \
*((kernel_error)->mutable_data<char>()) = 0; \
(kernel_error)->UnMap(); \
}
const int32_t kBaseGPUMemCacheSize = 16384;
#define OUT_OF_RANGE_SET_ARG \
if (runtime->IsOutOfRangeCheckEnabled()) { \
kernel_.setArg(idx++, \
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); \
}
#define OUT_OF_RANGE_SET_ARG_PTR \
if (runtime->IsOutOfRangeCheckEnabled()) { \
kernel->setArg(idx++, \
*(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); \
}
#define OUT_OF_RANGE_VALIDATION(kernel_error) \
if (runtime->IsOutOfRangeCheckEnabled()) { \
(kernel_error)->Map(nullptr); \
char *kerror_code = (kernel_error)->mutable_data<char>(); \
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;\
(kernel_error)->UnMap(); \
}
#define NON_UNIFORM_WG_CONFIG \
if (runtime->IsNonUniformWorkgroupsSupported()) { \
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); \
}
#define SET_3D_GWS_ARGS(kernel) \
kernel.setArg(idx++, gws[0]); \
kernel.setArg(idx++, gws[1]); \
kernel.setArg(idx++, gws[2]);
enum BufferType {
CONV2D_FILTER = 0,
IN_OUT_CHANNEL = 1,
ARGUMENT = 2,
IN_OUT_HEIGHT = 3,
IN_OUT_WIDTH = 4,
WINOGRAD_FILTER = 5,
DW_CONV2D_FILTER = 6,
WEIGHT_HEIGHT = 7,
WEIGHT_WIDTH = 8,
};
#define SET_2D_GWS_ARGS(kernel) \
kernel.setArg(idx++, gws[0]); \
kernel.setArg(idx++, gws[1]);
#define SET_3D_GWS_ARGS_PTR(kernel, gws) \
kernel->setArg(idx++, (gws)[0]); \
kernel->setArg(idx++, (gws)[1]); \
kernel->setArg(idx++, (gws)[2]);
#define SET_2D_GWS_ARGS_PTR(kernel, gws) \
kernel->setArg(idx++, (gws)[0]); \
kernel->setArg(idx++, (gws)[1]);
// Max execution time of an OpenCL kernel during tuning, to keep the UI from getting stuck.
const float kMaxKernelExecTime = 1000.0; // microseconds
// Base GPU cache size used for computing local work group size.
const int32_t kBaseGPUMemCacheSize = 16384;
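Previously every GPU functor hand-wrote the same out-of-range-check and workgroup boilerplate; the macros above centralize it. The following compilable sketch uses stub types and simplified macro bodies (assumptions, not the real OpenCLRuntime/cl::Kernel classes) to show the intended call pattern: add the conditional build options once, then set the global-work-size arguments ahead of the other kernel arguments, so a split launch does not depend on get_global_size(), which the merge request title flags as wrong when the kernel is split:

#include <cstdint>
#include <iostream>
#include <set>
#include <string>

// Stubs so the sketch compiles without OpenCL; the real macros operate on
// OpenCLRuntime, cl::Kernel and a one-byte kernel_error_ Buffer.
struct StubRuntime {
  bool IsOutOfRangeCheckEnabled() const { return true; }
  bool IsNonUniformWorkgroupsSupported() const { return false; }
};
struct StubKernel {
  void setArg(uint32_t idx, uint32_t value) {
    std::cout << "arg " << idx << " = " << value << "\n";
  }
};

// Simplified stand-ins for the helper.h macros shown above.
#define SKETCH_OUT_OF_RANGE_CONFIG                      \
  if (runtime->IsOutOfRangeCheckEnabled()) {            \
    built_options.emplace("-DOUT_OF_RANGE_CHECK");      \
  }
#define SKETCH_NON_UNIFORM_WG_CONFIG                    \
  if (runtime->IsNonUniformWorkgroupsSupported()) {     \
    built_options.emplace("-DNON_UNIFORM_WORK_GROUP");  \
  }
#define SKETCH_SET_3D_GWS_ARGS(kernel) \
  kernel.setArg(idx++, gws[0]);        \
  kernel.setArg(idx++, gws[1]);        \
  kernel.setArg(idx++, gws[2]);

int main() {
  StubRuntime runtime_storage;
  StubRuntime *runtime = &runtime_storage;

  // 1. While building the program: add the conditional compile options.
  std::set<std::string> built_options;
  SKETCH_OUT_OF_RANGE_CONFIG
  SKETCH_NON_UNIFORM_WG_CONFIG

  // 2. While setting kernel args: the global work size comes first, before
  //    the image/buffer arguments, as in the hunks above.
  StubKernel kernel;
  uint32_t gws[3] = {64, 32, 8};
  uint32_t idx = 0;
  SKETCH_SET_3D_GWS_ARGS(kernel)

  for (const auto &opt : built_options) std::cout << opt << "\n";
  return 0;
}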
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type,
......@@ -53,41 +100,35 @@ std::vector<index_t> FormatBufferShape(
const std::vector<index_t> &buffer_shape,
const BufferType type);
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type,
const int wino_blk_size = 2);
// CPU data type to OpenCL command data type
std::string DtToCLCMDDt(const DataType dt);
std::string DtToUpstreamCLCMDDt(const DataType dt);
// CPU data type to upward compatible OpenCL command data type
// e.g. half -> float
std::string DtToUpCompatibleCLCMDDt(const DataType dt);
// CPU data type to OpenCL data type
std::string DtToCLDt(const DataType dt);
std::string DtToUpstreamCLDt(const DataType dt);
// CPU data type to upward compatible OpenCL data type
// e.g. half -> float
std::string DtToUpCompatibleCLDt(const DataType dt);
// Tune or run an OpenCL kernel with a 3D work group size
MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
const std::vector<uint32_t> &lws,
StatsFuture *future);
// Tune or run an OpenCL kernel with a 2D work group size
MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
const std::vector<uint32_t> &lws,
StatsFuture *future);
inline void SetFuture(StatsFuture *future, const cl::Event &event) {
if (future != nullptr) {
future->wait_fn = [event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
OpenCLRuntime::Global()->GetCallStats(event, stats);
}
};
}
}
// Check whether the flag that limits OpenCL kernel time is set.
inline bool LimitKernelTime() {
const char *flag = getenv("MACE_LIMIT_OPENCL_KERNEL_TIME");
return flag != nullptr && strlen(flag) == 1 && flag[0] == '1';
......
......@@ -13,9 +13,10 @@
// limitations under the License.
#include "mace/kernels/image_to_buffer.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......@@ -68,80 +69,71 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
auto runtime = OpenCLRuntime::Global();
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
if (buffer->dtype() == image->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
} else {
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
}
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
if (!kernel_error_) {
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (buffer->dtype() == image->dtype()) {
built_options.emplace(
"-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
} else {
built_options.emplace("-DDATA_TYPE=" +
DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
obfuscated_kernel_name,
built_options,
&kernel_));
}
cl::Kernel b2f_kernel;
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
obfuscated_kernel_name,
built_options,
&b2f_kernel));
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
b2f_kernel.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
b2f_kernel.setArg(idx++, gws[0]);
b2f_kernel.setArg(idx++, gws[1]);
}
b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
if (type == CONV2D_FILTER) {
const index_t inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(inner_size));
} else if (type == ARGUMENT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
} else if (type == WEIGHT_HEIGHT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
} else {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(formatted_buffer_shape[1]));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(formatted_buffer_shape[2]));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(formatted_buffer_shape[3]));
if (!IsVecEqual(input_shape_, image->shape())) {
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_2D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(buffer->opencl_buffer()));
if (type == CONV2D_FILTER) {
const index_t
inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
} else if (type == ARGUMENT) {
kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
} else if (type == WEIGHT_HEIGHT) {
kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
} else {
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[1]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[2]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[3]));
}
kernel_.setArg(idx++, *(image->opencl_image()));
input_shape_ = image->shape();
}
b2f_kernel.setArg(idx++, *(image->opencl_image()));
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(b2f_kernel));
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {16, kwg_size / 16};
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
......@@ -150,16 +142,11 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
}
error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
......
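The rewrite above replaces the per-call b2f_kernel with a cached kernel_ plus input_shape_, so the OpenCL program is compiled once and arguments are re-set only when the image shape changes. A minimal standalone sketch of that caching pattern (stub types and invented shapes, not the actual functor):

#include <cstdint>
#include <iostream>
#include <vector>

struct StubKernel {
  bool built = false;
};

class ImageToBufferSketch {
 public:
  void Run(const std::vector<int64_t> &input_shape) {
    if (!kernel_.built) {
      // Expensive: compile the OpenCL program once and keep it in the functor.
      kernel_.built = true;
      std::cout << "build kernel\n";
    }
    if (input_shape != input_shape_) {
      // Re-set kernel arguments only when the shape (and thus gws) changed.
      std::cout << "set args for new shape\n";
      input_shape_ = input_shape;
    }
    std::cout << "enqueue kernel\n";
  }

 private:
  StubKernel kernel_;
  std::vector<int64_t> input_shape_;
};

int main() {
  ImageToBufferSketch functor;
  functor.Run({1, 224, 224, 3});  // builds, sets args, runs
  functor.Run({1, 224, 224, 3});  // reuses kernel and args
  functor.Run({1, 112, 112, 3});  // reuses kernel, re-sets args
  return 0;
}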
......@@ -57,23 +57,13 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
built_options.emplace("-Dmatmul=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
built_options, &kernel_));
......@@ -81,14 +71,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
}
OUT_OF_RANGE_SET_ARG;
SET_2D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(A->opencl_image()));
kernel_.setArg(idx++, *(B->opencl_image()));
kernel_.setArg(idx++, *(C->opencl_image()));
......@@ -98,18 +82,12 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
kernel_.setArg(idx++, static_cast<int>(height_blocks));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(K)));
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 1};
std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
}
......
......@@ -40,9 +40,8 @@ bool BufferToImageOpImpl(Tensor *buffer,
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
OUT_OF_RANGE_CONFIG(kernel_error);
NON_UNIFORM_WG_CONFIG;
if (buffer->dtype() == image->dtype()) {
built_options.emplace("-DDATA_TYPE=" +
DtToCLDt(DataTypeToEnum<float>::value));
......@@ -50,57 +49,46 @@ bool BufferToImageOpImpl(Tensor *buffer,
DtToCLCMDDt(DataTypeToEnum<float>::value));
} else {
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<float>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<float>::value));
}
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error->Allocate(1));
kernel_error->Map(nullptr);
*(kernel_error->mutable_data<char>()) = 0;
kernel_error->UnMap();
DtToUpCompatibleCLDt(DataTypeToEnum<float>::value));
built_options.emplace(
"-DCMD_DATA_TYPE=" +
DtToUpCompatibleCLCMDDt(DataTypeToEnum<float>::value));
}
cl::Kernel b2f_kernel;
cl::Kernel kernel;
cl_int error = runtime->BuildKernel("buffer_to_image",
obfuscated_kernel_name,
built_options, &b2f_kernel);
built_options,
&kernel);
if (error != CL_SUCCESS) {
return false;
}
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
b2f_kernel.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
b2f_kernel.setArg(idx++, gws[0]);
b2f_kernel.setArg(idx++, gws[1]);
kernel.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error->buffer())));
}
b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
SET_2D_GWS_ARGS(kernel);
kernel.setArg(idx++, *(buffer->opencl_buffer()));
MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
"buffer offset not aligned");
b2f_kernel.setArg(idx++,
kernel.setArg(idx++,
static_cast<uint32_t>(buffer->buffer_offset() /
GetEnumTypeSize(buffer->dtype())));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
b2f_kernel.setArg(idx++, *(image->opencl_image()));
kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
kernel.setArg(idx++, *(image->opencl_image()));
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(b2f_kernel));
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
const std::vector<uint32_t> lws = {16, kwg_size / 16};
cl::Event event;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
......@@ -109,7 +97,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
}
error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
if (error != CL_SUCCESS) {
......
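For context on what OUT_OF_RANGE_CONFIG/OUT_OF_RANGE_VALIDATION and this test exercise: a one-byte error flag is zeroed before launch, the kernel writes a non-zero code on an out-of-bounds access, and the host checks the flag afterwards. A host-only illustration follows (a plain function stands in for the OpenCL kernel; all names are hypothetical):

#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for the OpenCL kernel: copies src into dst and records an error
// code in *error_flag if an index would fall outside dst.
void FakeKernel(const std::vector<float> &src, std::vector<float> *dst,
                char *error_flag) {
  for (size_t i = 0; i < src.size(); ++i) {
    if (i >= dst->size()) {
      *error_flag = 1;  // mimics the "-DOUT_OF_RANGE_CHECK" code path
      return;
    }
    (*dst)[i] = src[i];
  }
}

int main() {
  std::vector<float> src(8, 1.0f);
  std::vector<float> dst(4);  // deliberately too small
  char error_flag = 0;        // like OUT_OF_RANGE_CONFIG: flag starts at zero
  FakeKernel(src, &dst, &error_flag);
  // Like OUT_OF_RANGE_VALIDATION: a non-zero flag means an out-of-range access.
  std::cout << "kernel error code: " << static_cast<int>(error_flag) << "\n";
  return error_flag == 0 ? 0 : 1;
}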
......@@ -51,23 +51,13 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad");
built_options.emplace("-Dpad=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name,
built_options, &kernel_));
......@@ -81,15 +71,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, this->constant_value_);
......@@ -108,13 +91,7 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
}
......
......@@ -59,6 +59,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
......@@ -67,24 +69,12 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
built_options.emplace(dt == DT_HALF ? "-DFP16" : "");
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
}
if (pooling_type_ == AVG) {
built_options.emplace("-DPOOL_AVG");
}
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling",
kernel_name,
built_options,
......@@ -130,15 +120,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
};
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
......@@ -171,13 +154,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws.data(), lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
}
......
......@@ -50,26 +50,15 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean");
built_options.emplace("-Dreduce_mean=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_QUALCOMM_ADRENO");
}
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce_mean",
kernel_name,
built_options,
......@@ -96,15 +85,8 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, (group_size * 4 * sizeof(T)),
nullptr);
......@@ -140,12 +122,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
......
......@@ -74,23 +74,13 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("resize_bilinear",
kernel_name,
......@@ -115,15 +105,8 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
CalculateResizeScale(in_width, out_width, align_corners_);
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, height_scale);
......@@ -142,13 +125,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
}
......
......@@ -44,23 +44,13 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice");
built_options.emplace("-Dslice=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("slice",
kernel_name,
built_options,
......@@ -81,15 +71,8 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
CallStats call_stats{INT64_MAX, 0};
for (size_t i = 0; i < outputs_count; ++i) {
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
......@@ -111,12 +94,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
if (future != nullptr && runtime->is_profiling_enabled()) {
event.wait();
CallStats tmp_stats;
......
......@@ -82,23 +82,13 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
built_options, &kernel_));
......@@ -107,15 +97,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
}
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(logits->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
......@@ -130,13 +113,7 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
}
......
......@@ -59,24 +59,14 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch",
obfuscated_kernel_name,
built_options,
......@@ -87,15 +77,8 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
}
if (!IsVecEqual(space_shape_, space_tensor->shape())) {
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
if (b2s_) {
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
......@@ -123,13 +106,7 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
}
......
......@@ -29,6 +29,8 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name;
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
if (wino_blk_size_ == 4) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4");
......@@ -44,21 +46,9 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
return MACE_SUCCESS;
}
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
obfuscated_kernel_name,
built_options,
......@@ -107,14 +97,8 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
}
OUT_OF_RANGE_SET_ARG;
SET_2D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input_tensor->opencl_image()));
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(1)));
......@@ -139,13 +123,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
}
......@@ -160,6 +138,8 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name;
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
if (wino_blk_size_ == 4) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4");
......@@ -176,21 +156,9 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
}
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation_) {
case NOOP:
......@@ -240,14 +208,8 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
const float round_w_r = 1.f / static_cast<float>(round_w);
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
}
OUT_OF_RANGE_SET_ARG;
SET_2D_GWS_ARGS(kernel_);
kernel_.setArg(
idx++,
*(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
......@@ -275,12 +237,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
}
......
......@@ -28,7 +28,7 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h"
#include "mace/core/workspace.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/kernels/opencl/common.h"
#include "mace/ops/ops_register.h"
#include "mace/utils/utils.h"
......