提交 6a764b53 编写于 作者: 叶剑武

Merge branch 'fix-opencl-tuning-bug' into 'master'

Fix opencl get_gloabl_size wrong when the kernel split.

See merge request !693
上级 a0fb2012
...@@ -44,10 +44,12 @@ cc_library( ...@@ -44,10 +44,12 @@ cc_library(
], ],
exclude = [ exclude = [
"buffer_to_image.h", "buffer_to_image.h",
"image_to_buffer.h",
], ],
) + if_opencl_enabled(glob([ ) + if_opencl_enabled(glob([
"opencl/*.h", "opencl/*.h",
"buffer_to_image.h", "buffer_to_image.h",
"image_to_buffer.h",
])), ])),
copts = [ copts = [
"-Werror", "-Werror",
......
...@@ -16,19 +16,18 @@ ...@@ -16,19 +16,18 @@
#define MACE_KERNELS_BUFFER_TO_IMAGE_H_ #define MACE_KERNELS_BUFFER_TO_IMAGE_H_
#include <memory> #include <memory>
#include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/common.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct BufferToImageFunctorBase { struct BufferToImageFunctorBase {
explicit BufferToImageFunctorBase(const int wino_blk_size) explicit BufferToImageFunctorBase(const int wino_blk_size)
: kernel_error_(nullptr), : wino_blk_size_(wino_blk_size) {}
wino_blk_size_(wino_blk_size) {}
std::unique_ptr<BufferBase> kernel_error_;
const int wino_blk_size_; const int wino_blk_size_;
}; };
...@@ -57,6 +56,10 @@ struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase { ...@@ -57,6 +56,10 @@ struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase {
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -23,11 +23,6 @@ ...@@ -23,11 +23,6 @@
#include "mace/kernels/activation.h" #include "mace/kernels/activation.h"
#include "mace/kernels/gemm.h" #include "mace/kernels/gemm.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/kernels/opencl/helper.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace kernels { namespace kernels {
......
...@@ -16,19 +16,18 @@ ...@@ -16,19 +16,18 @@
#define MACE_KERNELS_IMAGE_TO_BUFFER_H_ #define MACE_KERNELS_IMAGE_TO_BUFFER_H_
#include <memory> #include <memory>
#include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/common.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct ImageToBufferFunctorBase { struct ImageToBufferFunctorBase {
explicit ImageToBufferFunctorBase(const int wino_blk_size) explicit ImageToBufferFunctorBase(const int wino_blk_size)
: kernel_error_(nullptr), : wino_blk_size_(wino_blk_size) {}
wino_blk_size_(wino_blk_size) {}
std::unique_ptr<BufferBase> kernel_error_;
const int wino_blk_size_; const int wino_blk_size_;
}; };
...@@ -37,9 +36,9 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase { ...@@ -37,9 +36,9 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase {
explicit ImageToBufferFunctor(const int wino_blk_size) explicit ImageToBufferFunctor(const int wino_blk_size)
: ImageToBufferFunctorBase(wino_blk_size) {} : ImageToBufferFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
MACE_UNUSED(input); MACE_UNUSED(input);
MACE_UNUSED(type); MACE_UNUSED(type);
MACE_UNUSED(output); MACE_UNUSED(output);
...@@ -54,9 +53,13 @@ struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase { ...@@ -54,9 +53,13 @@ struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase {
explicit ImageToBufferFunctor(const int wino_blk_size) explicit ImageToBufferFunctor(const int wino_blk_size)
: ImageToBufferFunctorBase(wino_blk_size) {} : ImageToBufferFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
// limitations under the License. // limitations under the License.
#include "mace/kernels/activation.h" #include "mace/kernels/activation.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
...@@ -38,23 +37,13 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()( ...@@ -38,23 +37,13 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name); built_options.emplace("-Dactivation=" + kernel_name);
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
switch (activation_) { switch (activation_) {
case RELU: case RELU:
tuning_key_prefix_ = "relu_opencl_kernel"; tuning_key_prefix_ = "relu_opencl_kernel";
...@@ -92,15 +81,8 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()( ...@@ -92,15 +81,8 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0; int idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG;
kernel_.setArg(idx++, SET_3D_GWS_ARGS(kernel_);
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) { if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha); MACE_CHECK_NOTNULL(alpha);
...@@ -119,13 +101,7 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()( ...@@ -119,13 +101,7 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws,
lws, future)); lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(kernel_error_);
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -49,24 +49,14 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()( ...@@ -49,24 +49,14 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
built_options.emplace("-Daddn=" + kernel_name); built_options.emplace("-Daddn=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size())); built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
built_options, &kernel_)); built_options, &kernel_));
...@@ -92,14 +82,8 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()( ...@@ -92,14 +82,8 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
output_tensor->ResizeImage(output_shape, output_image_shape)); output_tensor->ResizeImage(output_shape, output_image_shape));
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG;
kernel_.setArg(idx++, SET_2D_GWS_ARGS(kernel_);
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
}
for (auto input : input_tensors) { for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
} }
...@@ -114,14 +98,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()( ...@@ -114,14 +98,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
output_tensor->dim(2), output_tensor->dim(3)); output_tensor->dim(2), output_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
// limitations under the License. // limitations under the License.
#include "mace/kernels/batch_norm.h" #include "mace/kernels/batch_norm.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
...@@ -49,23 +48,13 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()( ...@@ -49,23 +48,13 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name); built_options.emplace("-Dbatch_norm=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
if (folded_constant_) { if (folded_constant_) {
built_options.emplace("-DFOLDED_CONSTANT"); built_options.emplace("-DFOLDED_CONSTANT");
} }
...@@ -96,15 +85,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()( ...@@ -96,15 +85,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
} }
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG;
kernel_.setArg(idx++, SET_3D_GWS_ARGS(kernel_);
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image())); kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++, *(offset->opencl_image())); kernel_.setArg(idx++, *(offset->opencl_image()));
...@@ -125,14 +107,7 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()( ...@@ -125,14 +107,7 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
output->dim(1), output->dim(2), output->dim(3), folded_constant_); output->dim(1), output->dim(2), output->dim(3), folded_constant_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
// limitations under the License. // limitations under the License.
#include "mace/kernels/bias_add.h" #include "mace/kernels/bias_add.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
...@@ -45,39 +44,21 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -45,39 +44,21 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name); built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
built_options, &kernel_)); built_options, &kernel_));
kwg_size_ = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG;
kernel_.setArg(idx++, SET_3D_GWS_ARGS(kernel_);
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image())); kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
...@@ -104,12 +85,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -104,12 +85,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} }
MACE_CL_RET_STATUS(error); MACE_CL_RET_STATUS(error);
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(kernel_error_);
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
if (future != nullptr) { if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) { future->wait_fn = [runtime, event](CallStats *stats) {
event.wait(); event.wait();
......
...@@ -13,8 +13,8 @@ ...@@ -13,8 +13,8 @@
// limitations under the License. // limitations under the License.
#include "mace/kernels/buffer_to_image.h" #include "mace/kernels/buffer_to_image.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/helper.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -30,7 +30,8 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()( ...@@ -30,7 +30,8 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size_); CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size_);
if (type == WINOGRAD_FILTER) { if (type == WINOGRAD_FILTER) {
std::vector<index_t> new_shape = std::vector<index_t> new_shape =
CalWinogradShape(buffer->shape(), type, wino_blk_size_); {(wino_blk_size_ + 2) * (wino_blk_size_ + 2),
buffer->dim(0), buffer->dim(1)};
MACE_RETURN_IF_ERROR(image->ResizeImage(new_shape, image_shape)); MACE_RETURN_IF_ERROR(image->ResizeImage(new_shape, image_shape));
} else { } else {
MACE_RETURN_IF_ERROR(image->ResizeImage(buffer->shape(), image_shape)); MACE_RETURN_IF_ERROR(image->ResizeImage(buffer->shape(), image_shape));
...@@ -76,85 +77,74 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()( ...@@ -76,85 +77,74 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss; std::set<std::string> built_options;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; OUT_OF_RANGE_CONFIG(kernel_error_);
built_options.emplace(kernel_name_ss.str()); NON_UNIFORM_WG_CONFIG;
if (runtime->IsNonUniformWorkgroupsSupported()) { std::stringstream kernel_name_ss;
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
} built_options.emplace(kernel_name_ss.str());
if (buffer->dtype() == image->dtype()) { if (buffer->dtype() == image->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value)); built_options.emplace(
built_options.emplace("-DCMD_DATA_TYPE=" + "-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
DtToCLCMDDt(DataTypeToEnum<T>::value)); built_options.emplace("-DCMD_DATA_TYPE=" +
} else { DtToCLCMDDt(DataTypeToEnum<T>::value));
built_options.emplace("-DDATA_TYPE=" + } else {
DtToUpstreamCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" +
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value)); built_options.emplace("-DCMD_DATA_TYPE=" +
} DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
if (!kernel_error_) {
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
} }
MACE_RETURN_IF_ERROR(runtime->BuildKernel(
"buffer_to_image", obfuscated_kernel_name, built_options, &kernel_));
} }
cl::Kernel b2f_kernel; if (!IsVecEqual(input_shape_, buffer->shape())) {
uint32_t idx = 0;
MACE_RETURN_IF_ERROR(runtime->BuildKernel( OUT_OF_RANGE_SET_ARG;
"buffer_to_image", obfuscated_kernel_name, built_options, &b2f_kernel)); SET_2D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(buffer->opencl_buffer()));
uint32_t idx = 0; MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
if (runtime->IsOutOfRangeCheckEnabled()) { "buffer offset not aligned");
b2f_kernel.setArg(idx++, kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); static_cast<uint32_t>(buffer->buffer_offset() /
} GetEnumTypeSize(buffer->dtype())));
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (type == CONV2D_FILTER) {
b2f_kernel.setArg(idx++, gws[0]); const index_t
b2f_kernel.setArg(idx++, gws[1]); inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
} kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, *(buffer->opencl_buffer())); kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
"buffer offset not aligned"); kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
b2f_kernel.setArg(idx++, } else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) {
static_cast<uint32_t>(buffer->buffer_offset() / kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
GetEnumTypeSize(buffer->dtype()))); kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
if (type == CONV2D_FILTER) { kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
const index_t inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3); kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0))); } else if (type == ARGUMENT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2))); kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3))); } else {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(inner_size)); kernel_.setArg(idx++,
} else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) { static_cast<uint32_t>(formatted_buffer_shape[1]));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0))); kernel_.setArg(idx++,
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1))); static_cast<uint32_t>(formatted_buffer_shape[2]));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2))); kernel_.setArg(idx++,
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3))); static_cast<uint32_t>(formatted_buffer_shape[3]));
} else if (type == ARGUMENT) { }
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0))); kernel_.setArg(idx++, *(image->opencl_image()));
} else { input_shape_ = buffer->shape();
b2f_kernel.setArg(idx++, static_cast<uint32_t>(formatted_buffer_shape[1]));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(formatted_buffer_shape[2]));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(formatted_buffer_shape[3]));
} }
b2f_kernel.setArg(idx++, *(image->opencl_image()));
const uint32_t kwg_size = const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(b2f_kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {16, kwg_size / 16}; const std::vector<uint32_t> lws = {16, kwg_size / 16};
cl::Event event; cl::Event event;
cl_int error; cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event); cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else { } else {
std::vector<uint32_t> roundup_gws(lws.size()); std::vector<uint32_t> roundup_gws(lws.size());
...@@ -163,16 +153,11 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()( ...@@ -163,16 +153,11 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
} }
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event); cl::NDRange(lws[0], lws[1]), nullptr, &event);
} }
MACE_CL_RET_STATUS(error); MACE_CL_RET_STATUS(error);
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(kernel_error_);
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
if (future != nullptr) { if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) { future->wait_fn = [runtime, event](CallStats *stats) {
event.wait(); event.wait();
......
...@@ -45,23 +45,13 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -45,23 +45,13 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name); built_options.emplace("-Dchannel_shuffle=" + kernel_name);
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR( MACE_RETURN_IF_ERROR(
runtime->BuildKernel("channel_shuffle", kernel_name, runtime->BuildKernel("channel_shuffle", kernel_name,
built_options, &kernel_)); built_options, &kernel_));
...@@ -72,15 +62,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -72,15 +62,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG;
kernel_.setArg(idx++, SET_3D_GWS_ARGS(kernel_);
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, groups_); kernel_.setArg(idx++, groups_);
kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group)); kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
...@@ -95,14 +78,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -95,14 +78,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -17,10 +17,8 @@ __kernel void activation(KERNEL_ERROR_PARAMS ...@@ -17,10 +17,8 @@ __kernel void activation(KERNEL_ERROR_PARAMS
|| hb >= global_size_dim2) { || hb >= global_size_dim2) {
return; return;
} }
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
#endif #endif
const int width = global_size_dim1;
const int pos = mad24(ch_blk, width, w); const int pos = mad24(ch_blk, width, w);
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
......
...@@ -21,10 +21,8 @@ __kernel void batch_norm(KERNEL_ERROR_PARAMS ...@@ -21,10 +21,8 @@ __kernel void batch_norm(KERNEL_ERROR_PARAMS
|| hb >= global_size_dim2) { || hb >= global_size_dim2) {
return; return;
} }
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
#endif #endif
const int width = global_size_dim1;
#ifdef FOLDED_CONSTANT #ifdef FOLDED_CONSTANT
DATA_TYPE4 bn_scale = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0)); DATA_TYPE4 bn_scale = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0));
......
...@@ -14,10 +14,8 @@ __kernel void bias_add(KERNEL_ERROR_PARAMS ...@@ -14,10 +14,8 @@ __kernel void bias_add(KERNEL_ERROR_PARAMS
|| hb >= global_size_dim2) { || hb >= global_size_dim2) {
return; return;
} }
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
#endif #endif
const int width = global_size_dim1;
const int pos = mad24(ch_blk, width, w); const int pos = mad24(ch_blk, width, w);
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
......
...@@ -446,10 +446,8 @@ __kernel void weight_height_buffer_to_image(KERNEL_ERROR_PARAMS ...@@ -446,10 +446,8 @@ __kernel void weight_height_buffer_to_image(KERNEL_ERROR_PARAMS
if (w >= global_size_dim0 || h >= global_size_dim1) { if (w >= global_size_dim0 || h >= global_size_dim1) {
return; return;
} }
const int inner_size = global_size_dim0;
#else
const int inner_size = get_global_size(0);
#endif #endif
const int inner_size = global_size_dim0;
const int out_chan_idx = h << 2; const int out_chan_idx = h << 2;
const int in_chan_idx = w % in_channels; const int in_chan_idx = w % in_channels;
...@@ -492,10 +490,8 @@ __kernel void weight_height_image_to_buffer(KERNEL_ERROR_PARAMS ...@@ -492,10 +490,8 @@ __kernel void weight_height_image_to_buffer(KERNEL_ERROR_PARAMS
if (w >= global_size_dim0 || h >= global_size_dim1) { if (w >= global_size_dim0 || h >= global_size_dim1) {
return; return;
} }
const int inner_size = global_size_dim0;
#else
const int inner_size = get_global_size(0);
#endif #endif
const int inner_size = global_size_dim0;
const int out_chan_idx = h << 2; const int out_chan_idx = h << 2;
const int in_chan_idx = w % in_channels; const int in_chan_idx = w % in_channels;
...@@ -536,10 +532,8 @@ __kernel void weight_width_buffer_to_image(KERNEL_ERROR_PARAMS ...@@ -536,10 +532,8 @@ __kernel void weight_width_buffer_to_image(KERNEL_ERROR_PARAMS
if (w >= global_size_dim0 || h >= global_size_dim1) { if (w >= global_size_dim0 || h >= global_size_dim1) {
return; return;
} }
const int out_channels = global_size_dim1;
#else
const int out_channels = get_global_size(1);
#endif #endif
const int out_channels = global_size_dim1;
const int in_chan_blks = (in_channels + 3) >> 2; const int in_chan_blks = (in_channels + 3) >> 2;
const int hw_size = height * width; const int hw_size = height * width;
const int inner_size = in_channels * hw_size; const int inner_size = in_channels * hw_size;
...@@ -585,10 +579,8 @@ __kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS ...@@ -585,10 +579,8 @@ __kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS
if (w >= global_size_dim0 || h >= global_size_dim1) { if (w >= global_size_dim0 || h >= global_size_dim1) {
return; return;
} }
const int out_channels = global_size_dim1;
#else
const int out_channels = get_global_size(1);
#endif #endif
const int out_channels = global_size_dim1;
const int in_chan_blks = (in_channels + 3) >> 2; const int in_chan_blks = (in_channels + 3) >> 2;
const int hw_size = height * width; const int hw_size = height * width;
const int inner_size = in_channels * hw_size; const int inner_size = in_channels * hw_size;
...@@ -632,10 +624,8 @@ __kernel void winograd_filter_buffer_to_image_2x2(KERNEL_ERROR_PARAMS ...@@ -632,10 +624,8 @@ __kernel void winograd_filter_buffer_to_image_2x2(KERNEL_ERROR_PARAMS
if (w >= global_size_dim0 || h >= global_size_dim1) { if (w >= global_size_dim0 || h >= global_size_dim1) {
return; return;
} }
const int out_channels = global_size_dim1;
#else
const int out_channels = get_global_size(1);
#endif #endif
const int out_channels = global_size_dim1;
const int out_channel_idx = h; const int out_channel_idx = h;
const int in_channel_idx = w << 2; const int in_channel_idx = w << 2;
...@@ -782,10 +772,8 @@ __kernel void winograd_filter_buffer_to_image_6x6(KERNEL_ERROR_PARAMS ...@@ -782,10 +772,8 @@ __kernel void winograd_filter_buffer_to_image_6x6(KERNEL_ERROR_PARAMS
if (w >= global_size_dim0 || h >= global_size_dim1) { if (w >= global_size_dim0 || h >= global_size_dim1) {
return; return;
} }
const int out_channels = global_size_dim1;
#else
const int out_channels = get_global_size(1);
#endif #endif
const int out_channels = global_size_dim1;
const int out_channel_idx = h; const int out_channel_idx = h;
const int in_channel_idx = w << 2; const int in_channel_idx = w << 2;
...@@ -960,10 +948,8 @@ __kernel void winograd_filter_buffer_to_image_4x4(KERNEL_ERROR_PARAMS ...@@ -960,10 +948,8 @@ __kernel void winograd_filter_buffer_to_image_4x4(KERNEL_ERROR_PARAMS
if (w >= global_size_dim0 || h >= global_size_dim1) { if (w >= global_size_dim0 || h >= global_size_dim1) {
return; return;
} }
const int out_channels = global_size_dim1;
#else
const int out_channels = get_global_size(1);
#endif #endif
const int out_channels = global_size_dim1;
const int out_channel_idx = h; const int out_channel_idx = h;
const int in_channel_idx = w << 2; const int in_channel_idx = w << 2;
......
...@@ -16,10 +16,8 @@ __kernel void channel_shuffle(KERNEL_ERROR_PARAMS ...@@ -16,10 +16,8 @@ __kernel void channel_shuffle(KERNEL_ERROR_PARAMS
|| hb_idx >= global_size_dim2) { || hb_idx >= global_size_dim2) {
return; return;
} }
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
#endif #endif
const int width = global_size_dim1;
const int group_blks = groups / 4; const int group_blks = groups / 4;
const int groups_blks_width = group_blks * width; const int groups_blks_width = group_blks * width;
......
...@@ -38,24 +38,15 @@ ...@@ -38,24 +38,15 @@
CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \ CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \
CMD_TYPE(write_image, CMD_DATA_TYPE)(image, coord, value); CMD_TYPE(write_image, CMD_DATA_TYPE)(image, coord, value);
#ifndef NON_UNIFORM_WORK_GROUP
#define GLOBAL_WORK_GROUP_SIZE_DIM2 \ #define GLOBAL_WORK_GROUP_SIZE_DIM2 \
__private const int global_size_dim0, \ __private const int global_size_dim0, \
__private const int global_size_dim1, __private const int global_size_dim1,
#define GLOBAL_WORK_GROUP_SIZE_DIM3 \ #define GLOBAL_WORK_GROUP_SIZE_DIM3 \
__private const int global_size_dim0, \ __private const int global_size_dim0, \
__private const int global_size_dim1, \ __private const int global_size_dim1, \
__private const int global_size_dim2, __private const int global_size_dim2,
#else
#define GLOBAL_WORK_GROUP_SIZE_DIM2
#define GLOBAL_WORK_GROUP_SIZE_DIM3
#endif
#ifdef OUT_OF_RANGE_CHECK #ifdef OUT_OF_RANGE_CHECK
#define KERNEL_ERROR_PARAMS \ #define KERNEL_ERROR_PARAMS \
......
...@@ -37,10 +37,8 @@ __kernel void concat_channel(KERNEL_ERROR_PARAMS ...@@ -37,10 +37,8 @@ __kernel void concat_channel(KERNEL_ERROR_PARAMS
|| hb_idx >= global_size_dim2) { || hb_idx >= global_size_dim2) {
return; return;
} }
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
#endif #endif
const int width = global_size_dim1;
const int input0_chan_blk = (input0_chan + 3) >> 2; const int input0_chan_blk = (input0_chan + 3) >> 2;
...@@ -100,10 +98,8 @@ __kernel void concat_channel_multi(KERNEL_ERROR_PARAMS ...@@ -100,10 +98,8 @@ __kernel void concat_channel_multi(KERNEL_ERROR_PARAMS
|| hb_idx >= global_size_dim2) { || hb_idx >= global_size_dim2) {
return; return;
} }
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
#endif #endif
const int width = global_size_dim1;
DATA_TYPE4 data = 0; DATA_TYPE4 data = 0;
data = READ_IMAGET(input, data = READ_IMAGET(input,
......
...@@ -30,10 +30,8 @@ __kernel void conv_2d(KERNEL_ERROR_PARAMS ...@@ -30,10 +30,8 @@ __kernel void conv_2d(KERNEL_ERROR_PARAMS
|| out_hb >= global_size_dim2) { || out_hb >= global_size_dim2) {
return; return;
} }
const int out_w_blks = global_size_dim1;
#else
const int out_w_blks = get_global_size(1);
#endif #endif
const int out_w_blks = global_size_dim1;
#ifdef BIAS #ifdef BIAS
DATA_TYPE4 out0 = DATA_TYPE4 out0 =
......
...@@ -24,10 +24,8 @@ __kernel void conv_2d_1x1(KERNEL_ERROR_PARAMS ...@@ -24,10 +24,8 @@ __kernel void conv_2d_1x1(KERNEL_ERROR_PARAMS
|| out_hb >= global_size_dim2) { || out_hb >= global_size_dim2) {
return; return;
} }
const int out_w_blks = global_size_dim1;
#else
const int out_w_blks = get_global_size(1);
#endif #endif
const int out_w_blks = global_size_dim1;
#ifdef BIAS #ifdef BIAS
DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0)); DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
......
...@@ -28,10 +28,8 @@ __kernel void conv_2d_3x3(KERNEL_ERROR_PARAMS ...@@ -28,10 +28,8 @@ __kernel void conv_2d_3x3(KERNEL_ERROR_PARAMS
|| out_hb >= global_size_dim2) { || out_hb >= global_size_dim2) {
return; return;
} }
const int out_w_blks = global_size_dim1;
#else
const int out_w_blks = get_global_size(1);
#endif #endif
const int out_w_blks = global_size_dim1;
#ifdef BIAS #ifdef BIAS
DATA_TYPE4 out0 = DATA_TYPE4 out0 =
......
...@@ -30,10 +30,8 @@ __kernel void depthwise_conv2d(KERNEL_ERROR_PARAMS ...@@ -30,10 +30,8 @@ __kernel void depthwise_conv2d(KERNEL_ERROR_PARAMS
|| out_hb >= global_size_dim2) { || out_hb >= global_size_dim2) {
return; return;
} }
const short out_w_blks = global_size_dim1;
#else
const short out_w_blks = get_global_size(1);
#endif #endif
const short out_w_blks = global_size_dim1;
const short rounded_in_ch = in_ch_blks << 2; const short rounded_in_ch = in_ch_blks << 2;
const short in_ch_blk = out_ch_blk; // multiplier = 1 const short in_ch_blk = out_ch_blk; // multiplier = 1
......
...@@ -80,7 +80,7 @@ __kernel void fully_connected_width(KERNEL_ERROR_PARAMS ...@@ -80,7 +80,7 @@ __kernel void fully_connected_width(KERNEL_ERROR_PARAMS
__private const float relux_max_limit) { __private const float relux_max_limit) {
const int inter_out_idx = get_global_id(0); const int inter_out_idx = get_global_id(0);
const int width_blk_idx = get_global_id(1); const int width_blk_idx = get_global_id(1);
const int width_blk_count = get_global_size(1); const int width_blk_count = global_size_dim1;
const int batch_out_blk_idx = get_global_id(2); const int batch_out_blk_idx = get_global_id(2);
const int batch_idx = batch_out_blk_idx / out_blks; const int batch_idx = batch_out_blk_idx / out_blks;
......
...@@ -23,11 +23,8 @@ __kernel void pad(KERNEL_ERROR_PARAMS ...@@ -23,11 +23,8 @@ __kernel void pad(KERNEL_ERROR_PARAMS
|| hb_idx >= global_size_dim2) { || hb_idx >= global_size_dim2) {
return; return;
} }
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
#endif #endif
const int width = global_size_dim1;
DATA_TYPE4 data = constant_value; DATA_TYPE4 data = constant_value;
if ((height_padding <= height_idx && height_idx < input_padded_height) && if ((height_padding <= height_idx && height_idx < input_padded_height) &&
......
...@@ -36,10 +36,8 @@ __kernel void pooling(KERNEL_ERROR_PARAMS ...@@ -36,10 +36,8 @@ __kernel void pooling(KERNEL_ERROR_PARAMS
|| out_hb_idx >= global_size_dim2) { || out_hb_idx >= global_size_dim2) {
return; return;
} }
const int out_width = global_size_dim1;
#else
const int out_width = get_global_size(1);
#endif #endif
const int out_width = global_size_dim1;
const int batch_idx = mul24((out_hb_idx / out_height), in_height); const int batch_idx = mul24((out_hb_idx / out_height), in_height);
const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top; const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top;
......
...@@ -19,12 +19,9 @@ __kernel void resize_bilinear_nocache(KERNEL_ERROR_PARAMS ...@@ -19,12 +19,9 @@ __kernel void resize_bilinear_nocache(KERNEL_ERROR_PARAMS
|| hb >= global_size_dim2) { || hb >= global_size_dim2) {
return; return;
} }
#endif
const int ch_blks = global_size_dim0; const int ch_blks = global_size_dim0;
const int out_width = global_size_dim1; const int out_width = global_size_dim1;
#else
const int ch_blks = get_global_size(0);
const int out_width = get_global_size(1);
#endif
const int b = hb / out_height; const int b = hb / out_height;
const int h = hb % out_height; const int h = hb % out_height;
......
...@@ -14,10 +14,8 @@ __kernel void slice(KERNEL_ERROR_PARAMS ...@@ -14,10 +14,8 @@ __kernel void slice(KERNEL_ERROR_PARAMS
|| hb_idx >= global_size_dim2) { || hb_idx >= global_size_dim2) {
return; return;
} }
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
#endif #endif
const int width = global_size_dim1;
DATA_TYPE4 data = READ_IMAGET(input, SAMPLER, DATA_TYPE4 data = READ_IMAGET(input, SAMPLER,
(int2)(mad24(chan_blk_idx + chan_blk_offset, (int2)(mad24(chan_blk_idx + chan_blk_offset,
......
...@@ -15,12 +15,9 @@ __kernel void softmax(KERNEL_ERROR_PARAMS ...@@ -15,12 +15,9 @@ __kernel void softmax(KERNEL_ERROR_PARAMS
|| hb_idx >= global_size_dim2) { || hb_idx >= global_size_dim2) {
return; return;
} }
#endif
const int chan_blks = global_size_dim0 - 1; const int chan_blks = global_size_dim0 - 1;
const int width = global_size_dim1; const int width = global_size_dim1;
#else
const int chan_blks = get_global_size(0) - 1;
const int width = get_global_size(1);
#endif
int pos = width_idx; int pos = width_idx;
DATA_TYPE max_value = -FLT_MAX; DATA_TYPE max_value = -FLT_MAX;
......
...@@ -20,10 +20,8 @@ __kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS ...@@ -20,10 +20,8 @@ __kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS
if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) { if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) {
return; return;
} }
const int chan_blk_size = global_size_dim1;
#else
const int chan_blk_size = get_global_size(1);
#endif #endif
const int chan_blk_size = global_size_dim1;
const int batch_idx = out_width_idx * round_hw_r; const int batch_idx = out_width_idx * round_hw_r;
const int t_idx = mad24(batch_idx, -round_hw, out_width_idx); const int t_idx = mad24(batch_idx, -round_hw, out_width_idx);
...@@ -141,10 +139,8 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS ...@@ -141,10 +139,8 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS
if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) { if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
return; return;
} }
const int out_channel = global_size_dim1;
#else
const int out_channel = get_global_size(1);
#endif #endif
const int out_channel = global_size_dim1;
int width = width_idx; int width = width_idx;
int height = height_idx; int height = height_idx;
...@@ -255,10 +251,8 @@ __kernel void winograd_transform_4x4(KERNEL_ERROR_PARAMS ...@@ -255,10 +251,8 @@ __kernel void winograd_transform_4x4(KERNEL_ERROR_PARAMS
if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) { if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) {
return; return;
} }
const int chan_blk_size = global_size_dim1;
#else
const int chan_blk_size = get_global_size(1);
#endif #endif
const int chan_blk_size = global_size_dim1;
const int batch_idx = out_width_idx * round_hw_r; const int batch_idx = out_width_idx * round_hw_r;
const int t_idx = mad24(batch_idx, -round_hw, out_width_idx); const int t_idx = mad24(batch_idx, -round_hw, out_width_idx);
...@@ -417,10 +411,8 @@ __kernel void winograd_inverse_transform_4x4(KERNEL_ERROR_PARAMS ...@@ -417,10 +411,8 @@ __kernel void winograd_inverse_transform_4x4(KERNEL_ERROR_PARAMS
if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) { if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
return; return;
} }
const int out_channel = global_size_dim1;
#else
const int out_channel = get_global_size(1);
#endif #endif
const int out_channel = global_size_dim1;
const int batch = width_idx * round_hw_r; const int batch = width_idx * round_hw_r;
int h = mad24(batch, -round_hw, width_idx); int h = mad24(batch, -round_hw, width_idx);
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Shared declarations used by both the buffer->image and image->buffer
// OpenCL conversion functors (included from buffer_to_image.h and
// image_to_buffer.h) without pulling in the heavier opencl/helper.h.
#ifndef MACE_KERNELS_OPENCL_COMMON_H_
#define MACE_KERNELS_OPENCL_COMMON_H_
namespace mace {
namespace kernels {
// Selects how a tensor is laid out when copied between an OpenCL buffer
// and an OpenCL image (passed as the `type` argument of the conversion
// functors). Each value maps to a dedicated *_buffer_to_image /
// *_image_to_buffer kernel; the exact image layout for each case is
// defined by that kernel, not here.
// NOTE(review): per-value descriptions below are inferred from the names
// and the kernel names visible in buffer_to_image.cl — confirm against
// the .cl sources before relying on them.
enum BufferType {
CONV2D_FILTER = 0,  // conv2d filter weights
IN_OUT_CHANNEL = 1,  // activations, image width spans channel blocks
ARGUMENT = 2,  // 1-D argument tensors (e.g. bias/scale)
IN_OUT_HEIGHT = 3,  // activations, height-major image layout
IN_OUT_WIDTH = 4,  // activations, width-major image layout
WINOGRAD_FILTER = 5,  // pre-transformed Winograd filter weights
DW_CONV2D_FILTER = 6,  // depthwise conv2d filter weights
WEIGHT_HEIGHT = 7,  // generic weights laid out along height
WEIGHT_WIDTH = 8,  // generic weights laid out along width
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_COMMON_H_
...@@ -65,26 +65,16 @@ static MaceStatus Concat2(cl::Kernel *kernel, ...@@ -65,26 +65,16 @@ static MaceStatus Concat2(cl::Kernel *kernel,
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name); built_options.emplace("-Dconcat_channel=" + kernel_name);
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
if (input0->dtype() == output->dtype()) { if (input0->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
} else { } else {
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
} }
if (input0->dim(3) % 4 == 0) { if (input0->dim(3) % 4 == 0) {
built_options.emplace("-DDIVISIBLE_FOUR"); built_options.emplace("-DDIVISIBLE_FOUR");
...@@ -97,15 +87,8 @@ static MaceStatus Concat2(cl::Kernel *kernel, ...@@ -97,15 +87,8 @@ static MaceStatus Concat2(cl::Kernel *kernel,
} }
if (!IsVecEqual(*prev_input_shape, input0->shape())) { if (!IsVecEqual(*prev_input_shape, input0->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG_PTR;
kernel->setArg(idx++, SET_3D_GWS_ARGS_PTR(kernel, gws);
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<const cl::Image2D *>(input0->opencl_image()))); *(static_cast<const cl::Image2D *>(input0->opencl_image())));
kernel->setArg(idx++, kernel->setArg(idx++,
...@@ -123,14 +106,7 @@ static MaceStatus Concat2(cl::Kernel *kernel, ...@@ -123,14 +106,7 @@ static MaceStatus Concat2(cl::Kernel *kernel,
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error);
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
...@@ -149,22 +125,12 @@ static MaceStatus ConcatN(cl::Kernel *kernel, ...@@ -149,22 +125,12 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name); built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
built_options, kernel)); built_options, kernel));
*kwg_size = *kwg_size =
...@@ -185,15 +151,8 @@ static MaceStatus ConcatN(cl::Kernel *kernel, ...@@ -185,15 +151,8 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size); const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG_PTR;
kernel->setArg(idx++, SET_3D_GWS_ARGS_PTR(kernel, gws);
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, static_cast<int32_t>(chan_blk_offset)); kernel->setArg(idx++, static_cast<int32_t>(chan_blk_offset));
kernel->setArg(idx++, *(output->opencl_image())); kernel->setArg(idx++, *(output->opencl_image()));
...@@ -217,12 +176,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel, ...@@ -217,12 +176,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} }
MACE_CL_RET_STATUS(error); MACE_CL_RET_STATUS(error);
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(*kernel_error);
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
if (future != nullptr && runtime->is_profiling_enabled()) { if (future != nullptr && runtime->is_profiling_enabled()) {
event.wait(); event.wait();
CallStats tmp_stats; CallStats tmp_stats;
......
...@@ -98,22 +98,12 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -98,22 +98,12 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
MACE_CHECK(input_batch == batch); MACE_CHECK(input_batch == batch);
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1");
built_options.emplace("-Dconv_2d_1x1=" + kernel_name); built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
if (bias != nullptr) { if (bias != nullptr) {
built_options.emplace("-DBIAS"); built_options.emplace("-DBIAS");
} }
...@@ -147,17 +137,11 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -147,17 +137,11 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
static_cast<uint32_t>(width_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
// Support different input size
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG_PTR;
kernel->setArg(idx++, SET_3D_GWS_ARGS_PTR(kernel, gws);
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
...@@ -182,14 +166,7 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -182,14 +166,7 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error);
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -84,22 +84,12 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -84,22 +84,12 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
built_options.emplace("-Dconv_2d_3x3=" + kernel_name); built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP:
...@@ -131,17 +121,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -131,17 +121,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
static_cast<uint32_t>(width_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
// Support different input size
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG_PTR;
kernel->setArg(idx++, SET_3D_GWS_ARGS_PTR(kernel, gws);
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
...@@ -169,14 +153,7 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -169,14 +153,7 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error);
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -93,22 +93,12 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, ...@@ -93,22 +93,12 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
built_options.emplace("-Dconv_2d=" + kernel_name); built_options.emplace("-Dconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP:
...@@ -140,17 +130,11 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, ...@@ -140,17 +130,11 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
static_cast<uint32_t>(width_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
// Support different input size
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG_PTR;
kernel->setArg(idx++, SET_3D_GWS_ARGS_PTR(kernel, gws);
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
...@@ -182,13 +166,7 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, ...@@ -182,13 +166,7 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
gws, lws, future)); gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(*kernel_error);
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -136,23 +136,13 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()( ...@@ -136,23 +136,13 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop");
built_options.emplace("-Dcrop=" + kernel_name); built_options.emplace("-Dcrop=" + kernel_name);
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name,
built_options, &kernel_)); built_options, &kernel_));
...@@ -161,15 +151,8 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()( ...@@ -161,15 +151,8 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
} }
if (!IsVecEqual(input_shape_, input0->shape())) { if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG;
kernel_.setArg(idx++, SET_3D_GWS_ARGS(kernel_);
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input0->opencl_image())); kernel_.setArg(idx++, *(input0->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(offsets[0])); kernel_.setArg(idx++, static_cast<int>(offsets[0]));
kernel_.setArg(idx++, static_cast<int>(offsets[1])); kernel_.setArg(idx++, static_cast<int>(offsets[1]));
...@@ -190,14 +173,7 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()( ...@@ -190,14 +173,7 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -59,22 +59,12 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, ...@@ -59,22 +59,12 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d");
built_options.emplace("-Ddeconv_2d=" + kernel_name); built_options.emplace("-Ddeconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP:
...@@ -108,15 +98,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, ...@@ -108,15 +98,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG_PTR;
kernel->setArg(idx++, SET_3D_GWS_ARGS_PTR(kernel, gws);
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
...@@ -152,13 +135,7 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, ...@@ -152,13 +135,7 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
gws, lws, future)); gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(*kernel_error);
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -76,6 +76,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -76,6 +76,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
...@@ -83,18 +85,6 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -83,18 +85,6 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space", MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
obfuscated_kernel_name, obfuscated_kernel_name,
built_options, built_options,
...@@ -106,15 +96,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -106,15 +96,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG;
kernel_.setArg(idx++, SET_3D_GWS_ARGS(kernel_);
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
if (d2s_) { if (d2s_) {
kernel_.setArg(idx++, static_cast<int32_t>(block_size_)); kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
...@@ -140,13 +123,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -140,13 +123,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(kernel_error_);
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -93,6 +93,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, ...@@ -93,6 +93,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) { if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d_s1"); kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d_s1");
...@@ -100,20 +102,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, ...@@ -100,20 +102,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
} else { } else {
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
} }
if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
built_options.emplace(MakeString("-DSTRIDE=", stride)); built_options.emplace(MakeString("-DSTRIDE=", stride));
switch (activation) { switch (activation) {
...@@ -154,15 +144,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, ...@@ -154,15 +144,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
input_channels); input_channels);
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG_PTR;
kernel->setArg(idx++, SET_3D_GWS_ARGS_PTR(kernel, gws);
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
...@@ -193,13 +176,7 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, ...@@ -193,13 +176,7 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
gws, lws, future)); gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(*kernel_error);
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -74,11 +74,13 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0, ...@@ -74,11 +74,13 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
built_options.emplace("-Deltwise=" + kernel_name); built_options.emplace("-Deltwise=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
if (input1 == nullptr) { if (input1 == nullptr) {
built_options.emplace("-DINPUT_TYPE=1"); built_options.emplace("-DINPUT_TYPE=1");
...@@ -90,19 +92,6 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0, ...@@ -90,19 +92,6 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
if (swapped) built_options.emplace("-DSWAPPED"); if (swapped) built_options.emplace("-DSWAPPED");
} }
if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
built_options, &kernel_)); built_options, &kernel_));
...@@ -111,15 +100,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0, ...@@ -111,15 +100,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
} }
if (!IsVecEqual(input_shape_, input0->shape())) { if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG;
kernel_.setArg(idx++, SET_3D_GWS_ARGS(kernel_);
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input0->opencl_image())); kernel_.setArg(idx++, *(input0->opencl_image()));
if (input1 == nullptr) { if (input1 == nullptr) {
kernel_.setArg(idx++, value_); kernel_.setArg(idx++, value_);
...@@ -144,13 +126,7 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0, ...@@ -144,13 +126,7 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(kernel_error_);
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -13,6 +13,8 @@ ...@@ -13,6 +13,8 @@
// limitations under the License. // limitations under the License.
#include "mace/kernels/fully_connected.h" #include "mace/kernels/fully_connected.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
namespace mace { namespace mace {
...@@ -42,11 +44,13 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, ...@@ -42,11 +44,13 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
const index_t output_blocks = RoundUpDiv4(output_size); const index_t output_blocks = RoundUpDiv4(output_size);
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
built_options.emplace("-Dfully_connected_width=" + kernel_name); built_options.emplace("-Dfully_connected_width=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (bias != nullptr) { if (bias != nullptr) {
built_options.emplace("-DBIAS"); built_options.emplace("-DBIAS");
} }
...@@ -71,19 +75,6 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, ...@@ -71,19 +75,6 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_QUALCOMM_ADRENO"); built_options.emplace("-DNON_QUALCOMM_ADRENO");
} }
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name,
built_options, kernel)); built_options, kernel));
...@@ -113,15 +104,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, ...@@ -113,15 +104,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
(*gws)[2] = static_cast<uint32_t>(batch * output_blocks); (*gws)[2] = static_cast<uint32_t>(batch * output_blocks);
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG_PTR;
kernel->setArg(idx++, SET_3D_GWS_ARGS_PTR(kernel, *gws);
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, (*gws)[0]);
kernel->setArg(idx++, (*gws)[1]);
kernel->setArg(idx++, (*gws)[2]);
}
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(weight->opencl_image())); kernel->setArg(idx++, *(weight->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
...@@ -154,12 +138,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, ...@@ -154,12 +138,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event); cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event);
} }
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(*kernel_error);
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
MACE_CL_RET_STATUS(error); MACE_CL_RET_STATUS(error);
if (future != nullptr) { if (future != nullptr) {
...@@ -192,26 +171,16 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, ...@@ -192,26 +171,16 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected");
built_options.emplace("-Dfully_connected=" + kernel_name); built_options.emplace("-Dfully_connected=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (bias != nullptr) { if (bias != nullptr) {
built_options.emplace("-DBIAS"); built_options.emplace("-DBIAS");
} }
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
switch (activation) { switch (activation) {
case NOOP: case NOOP:
break; break;
...@@ -247,14 +216,8 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, ...@@ -247,14 +216,8 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
}; };
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG_PTR;
kernel->setArg(idx++, SET_2D_GWS_ARGS_PTR(kernel, *gws);
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, (*gws)[0]);
kernel->setArg(idx++, (*gws)[1]);
}
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(weight->opencl_image())); kernel->setArg(idx++, *(weight->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
...@@ -276,13 +239,7 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, ...@@ -276,13 +239,7 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(*kernel, tuning_key,
gws->data(), *lws, future)); gws->data(), *lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(*kernel_error);
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
} // namespace } // namespace
......
...@@ -180,22 +180,6 @@ std::vector<index_t> FormatBufferShape( ...@@ -180,22 +180,6 @@ std::vector<index_t> FormatBufferShape(
} }
} }
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type,
const int wino_blk_size) {
if (type == WINOGRAD_FILTER) {
return {(wino_blk_size + 2) * (wino_blk_size + 2), shape[0], shape[1]};
} else if (type == IN_OUT_HEIGHT) {
index_t out_width =
shape[0] * ((shape[1] + wino_blk_size - 1) / wino_blk_size) *
((shape[2] + wino_blk_size - 1) / wino_blk_size);
return {(wino_blk_size + 2) * (wino_blk_size + 2), shape[3], out_width};
} else {
LOG(FATAL) << "Mace not supported yet.";
return std::vector<index_t>();
}
}
std::string DtToCLDt(const DataType dt) { std::string DtToCLDt(const DataType dt) {
switch (dt) { switch (dt) {
case DT_FLOAT: case DT_FLOAT:
...@@ -220,7 +204,7 @@ std::string DtToCLCMDDt(const DataType dt) { ...@@ -220,7 +204,7 @@ std::string DtToCLCMDDt(const DataType dt) {
} }
} }
std::string DtToUpstreamCLDt(const DataType dt) { std::string DtToUpCompatibleCLDt(const DataType dt) {
switch (dt) { switch (dt) {
case DT_FLOAT: case DT_FLOAT:
case DT_HALF: case DT_HALF:
...@@ -231,7 +215,7 @@ std::string DtToUpstreamCLDt(const DataType dt) { ...@@ -231,7 +215,7 @@ std::string DtToUpstreamCLDt(const DataType dt) {
} }
} }
std::string DtToUpstreamCLCMDDt(const DataType dt) { std::string DtToUpCompatibleCLCMDDt(const DataType dt) {
switch (dt) { switch (dt) {
case DT_FLOAT: case DT_FLOAT:
case DT_HALF: case DT_HALF:
...@@ -357,7 +341,8 @@ MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -357,7 +341,8 @@ MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
double elapse_time = timer->AccumulatedMicros(); double elapse_time = timer->AccumulatedMicros();
timer->ClearTiming(); timer->ClearTiming();
uint32_t num_blocks = std::min( uint32_t num_blocks = std::min(
static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]); static_cast<uint32_t>(elapse_time / kMaxKernelExecTime) + 1,
gws[2]);
uint32_t block_size = gws[2] / num_blocks; uint32_t block_size = gws[2] / num_blocks;
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
block_size = RoundUp(block_size, params[2]); block_size = RoundUp(block_size, params[2]);
...@@ -465,7 +450,8 @@ MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -465,7 +450,8 @@ MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
double elapse_time = timer->AccumulatedMicros(); double elapse_time = timer->AccumulatedMicros();
timer->ClearTiming(); timer->ClearTiming();
uint32_t num_blocks = std::min( uint32_t num_blocks = std::min(
static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]); static_cast<uint32_t>(elapse_time / kMaxKernelExecTime) + 1,
gws[1]);
uint32_t block_size = gws[1] / num_blocks; uint32_t block_size = gws[1] / num_blocks;
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
block_size = RoundUp(block_size, params[1]); block_size = RoundUp(block_size, params[1]);
......
...@@ -15,7 +15,9 @@ ...@@ -15,7 +15,9 @@
#ifndef MACE_KERNELS_OPENCL_HELPER_H_ #ifndef MACE_KERNELS_OPENCL_HELPER_H_
#define MACE_KERNELS_OPENCL_HELPER_H_ #define MACE_KERNELS_OPENCL_HELPER_H_
#include <memory>
#include <string> #include <string>
#include <utility>
#include <vector> #include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
...@@ -23,26 +25,71 @@ ...@@ -23,26 +25,71 @@
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/kernels/opencl/common.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
const float kMaxKernelExeTime = 1000.0; // microseconds #define OUT_OF_RANGE_CONFIG(kernel_error) \
if (runtime->IsOutOfRangeCheckEnabled()) { \
built_options.emplace("-DOUT_OF_RANGE_CHECK"); \
(kernel_error) = std::move(std::unique_ptr<Buffer>( \
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); \
MACE_RETURN_IF_ERROR((kernel_error)->Allocate(1)); \
(kernel_error)->Map(nullptr); \
*((kernel_error)->mutable_data<char>()) = 0; \
(kernel_error)->UnMap(); \
}
const int32_t kBaseGPUMemCacheSize = 16384; #define OUT_OF_RANGE_SET_ARG \
if (runtime->IsOutOfRangeCheckEnabled()) { \
kernel_.setArg(idx++, \
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); \
}
#define OUT_OF_RANGE_SET_ARG_PTR \
if (runtime->IsOutOfRangeCheckEnabled()) { \
kernel->setArg(idx++, \
*(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); \
}
#define OUT_OF_RANGE_VALIDATION(kernel_error) \
if (runtime->IsOutOfRangeCheckEnabled()) { \
(kernel_error)->Map(nullptr); \
char *kerror_code = (kernel_error)->mutable_data<char>(); \
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;\
(kernel_error)->UnMap(); \
}
#define NON_UNIFORM_WG_CONFIG \
if (runtime->IsNonUniformWorkgroupsSupported()) { \
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); \
}
#define SET_3D_GWS_ARGS(kernel) \
kernel.setArg(idx++, gws[0]); \
kernel.setArg(idx++, gws[1]); \
kernel.setArg(idx++, gws[2]);
enum BufferType { #define SET_2D_GWS_ARGS(kernel) \
CONV2D_FILTER = 0, kernel.setArg(idx++, gws[0]); \
IN_OUT_CHANNEL = 1, kernel.setArg(idx++, gws[1]);
ARGUMENT = 2,
IN_OUT_HEIGHT = 3, #define SET_3D_GWS_ARGS_PTR(kernel, gws) \
IN_OUT_WIDTH = 4, kernel->setArg(idx++, (gws)[0]); \
WINOGRAD_FILTER = 5, kernel->setArg(idx++, (gws)[1]); \
DW_CONV2D_FILTER = 6, kernel->setArg(idx++, (gws)[2]);
WEIGHT_HEIGHT = 7,
WEIGHT_WIDTH = 8, #define SET_2D_GWS_ARGS_PTR(kernel, gws) \
}; kernel->setArg(idx++, (gws)[0]); \
kernel->setArg(idx++, (gws)[1]);
// Max execution time of OpenCL kernel for tuning to prevent UI stuck.
const float kMaxKernelExecTime = 1000.0; // microseconds
// Base GPU cache size used for computing local work group size.
const int32_t kBaseGPUMemCacheSize = 16384;
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type, const BufferType type,
...@@ -53,41 +100,35 @@ std::vector<index_t> FormatBufferShape( ...@@ -53,41 +100,35 @@ std::vector<index_t> FormatBufferShape(
const std::vector<index_t> &buffer_shape, const std::vector<index_t> &buffer_shape,
const BufferType type); const BufferType type);
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape, // CPU data type to OpenCL command data type
const BufferType type,
const int wino_blk_size = 2);
std::string DtToCLCMDDt(const DataType dt); std::string DtToCLCMDDt(const DataType dt);
std::string DtToUpstreamCLCMDDt(const DataType dt); // CPU data type to upward compatible OpenCL command data type
// e.g. half -> float
std::string DtToUpCompatibleCLCMDDt(const DataType dt);
// CPU data type to OpenCL data type
std::string DtToCLDt(const DataType dt); std::string DtToCLDt(const DataType dt);
std::string DtToUpstreamCLDt(const DataType dt); // CPU data type to upward compatible OpenCL data type
// e.g. half -> float
std::string DtToUpCompatibleCLDt(const DataType dt);
// Tuning or Run OpenCL kernel with 3D work group size
MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
StatsFuture *future); StatsFuture *future);
// Tuning or Run OpenCL kernel with 2D work group size
MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
StatsFuture *future); StatsFuture *future);
inline void SetFuture(StatsFuture *future, const cl::Event &event) { // Check whether limit OpenCL kernel time flag open.
if (future != nullptr) {
future->wait_fn = [event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
OpenCLRuntime::Global()->GetCallStats(event, stats);
}
};
}
}
inline bool LimitKernelTime() { inline bool LimitKernelTime() {
const char *flag = getenv("MACE_LIMIT_OPENCL_KERNEL_TIME"); const char *flag = getenv("MACE_LIMIT_OPENCL_KERNEL_TIME");
return flag != nullptr && strlen(flag) == 1 && flag[0] == '1'; return flag != nullptr && strlen(flag) == 1 && flag[0] == '1';
......
...@@ -13,9 +13,10 @@ ...@@ -13,9 +13,10 @@
// limitations under the License. // limitations under the License.
#include "mace/kernels/image_to_buffer.h" #include "mace/kernels/image_to_buffer.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -68,80 +69,71 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()( ...@@ -68,80 +69,71 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss; std::set<std::string> built_options;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; OUT_OF_RANGE_CONFIG(kernel_error_);
built_options.emplace(kernel_name_ss.str()); NON_UNIFORM_WG_CONFIG;
if (runtime->IsNonUniformWorkgroupsSupported()) { std::stringstream kernel_name_ss;
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
} built_options.emplace(kernel_name_ss.str());
if (buffer->dtype() == image->dtype()) { if (buffer->dtype() == image->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value)); built_options.emplace(
built_options.emplace("-DCMD_DATA_TYPE=" + "-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
DtToCLCMDDt(DataTypeToEnum<T>::value)); built_options.emplace("-DCMD_DATA_TYPE=" +
} else { DtToCLCMDDt(DataTypeToEnum<T>::value));
built_options.emplace("-DDATA_TYPE=" + } else {
DtToUpstreamCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" +
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value)); built_options.emplace("-DCMD_DATA_TYPE=" +
} DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
if (!kernel_error_) {
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
} }
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
obfuscated_kernel_name,
built_options,
&kernel_));
} }
cl::Kernel b2f_kernel;
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
obfuscated_kernel_name,
built_options,
&b2f_kernel));
uint32_t idx = 0; if (!IsVecEqual(input_shape_, image->shape())) {
if (runtime->IsOutOfRangeCheckEnabled()) { uint32_t idx = 0;
b2f_kernel.setArg(idx++, OUT_OF_RANGE_SET_ARG;
*(static_cast<cl::Buffer *>(kernel_error_->buffer()))); SET_2D_GWS_ARGS(kernel_);
} kernel_.setArg(idx++, *(buffer->opencl_buffer()));
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (type == CONV2D_FILTER) {
b2f_kernel.setArg(idx++, gws[0]); const index_t
b2f_kernel.setArg(idx++, gws[1]); inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
} kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, *(buffer->opencl_buffer())); kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
if (type == CONV2D_FILTER) { kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
const index_t inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3); kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0))); } else if (type == ARGUMENT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2))); kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3))); } else if (type == WEIGHT_HEIGHT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(inner_size)); kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
} else if (type == ARGUMENT) { kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0))); kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
} else if (type == WEIGHT_HEIGHT) { kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0))); } else {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1))); kernel_.setArg(idx++,
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2))); static_cast<uint32_t>(formatted_buffer_shape[1]));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3))); kernel_.setArg(idx++,
} else { static_cast<uint32_t>(formatted_buffer_shape[2]));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(formatted_buffer_shape[1])); kernel_.setArg(idx++,
b2f_kernel.setArg(idx++, static_cast<uint32_t>(formatted_buffer_shape[2])); static_cast<uint32_t>(formatted_buffer_shape[3]));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(formatted_buffer_shape[3])); }
kernel_.setArg(idx++, *(image->opencl_image()));
input_shape_ = image->shape();
} }
b2f_kernel.setArg(idx++, *(image->opencl_image()));
const uint32_t kwg_size = const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(b2f_kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {16, kwg_size / 16}; const std::vector<uint32_t> lws = {16, kwg_size / 16};
cl::Event event; cl::Event event;
cl_int error; cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event); cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else { } else {
std::vector<uint32_t> roundup_gws(lws.size()); std::vector<uint32_t> roundup_gws(lws.size());
...@@ -150,16 +142,11 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()( ...@@ -150,16 +142,11 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
} }
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event); cl::NDRange(lws[0], lws[1]), nullptr, &event);
} }
MACE_CL_RET_STATUS(error); MACE_CL_RET_STATUS(error);
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(kernel_error_);
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
if (future != nullptr) { if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) { future->wait_fn = [runtime, event](CallStats *stats) {
event.wait(); event.wait();
......
...@@ -57,23 +57,13 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A, ...@@ -57,23 +57,13 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
built_options.emplace("-Dmatmul=" + kernel_name); built_options.emplace("-Dmatmul=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
built_options, &kernel_)); built_options, &kernel_));
...@@ -81,14 +71,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A, ...@@ -81,14 +71,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG;
kernel_.setArg(idx++, SET_2D_GWS_ARGS(kernel_);
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
}
kernel_.setArg(idx++, *(A->opencl_image())); kernel_.setArg(idx++, *(A->opencl_image()));
kernel_.setArg(idx++, *(B->opencl_image())); kernel_.setArg(idx++, *(B->opencl_image()));
kernel_.setArg(idx++, *(C->opencl_image())); kernel_.setArg(idx++, *(C->opencl_image()));
...@@ -98,18 +82,12 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A, ...@@ -98,18 +82,12 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
kernel_.setArg(idx++, static_cast<int>(height_blocks)); kernel_.setArg(idx++, static_cast<int>(height_blocks));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(K))); kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(K)));
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0}; const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 1};
std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width); std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(kernel_error_);
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -40,9 +40,8 @@ bool BufferToImageOpImpl(Tensor *buffer, ...@@ -40,9 +40,8 @@ bool BufferToImageOpImpl(Tensor *buffer,
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str()); built_options.emplace(kernel_name_ss.str());
if (runtime->IsNonUniformWorkgroupsSupported()) { OUT_OF_RANGE_CONFIG(kernel_error);
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); NON_UNIFORM_WG_CONFIG;
}
if (buffer->dtype() == image->dtype()) { if (buffer->dtype() == image->dtype()) {
built_options.emplace("-DDATA_TYPE=" + built_options.emplace("-DDATA_TYPE=" +
DtToCLDt(DataTypeToEnum<float>::value)); DtToCLDt(DataTypeToEnum<float>::value));
...@@ -50,57 +49,46 @@ bool BufferToImageOpImpl(Tensor *buffer, ...@@ -50,57 +49,46 @@ bool BufferToImageOpImpl(Tensor *buffer,
DtToCLCMDDt(DataTypeToEnum<float>::value)); DtToCLCMDDt(DataTypeToEnum<float>::value));
} else { } else {
built_options.emplace("-DDATA_TYPE=" + built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<float>::value)); DtToUpCompatibleCLDt(DataTypeToEnum<float>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + built_options.emplace(
DtToUpstreamCLCMDDt(DataTypeToEnum<float>::value)); "-DCMD_DATA_TYPE=" +
} DtToUpCompatibleCLCMDDt(DataTypeToEnum<float>::value));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error->Allocate(1));
kernel_error->Map(nullptr);
*(kernel_error->mutable_data<char>()) = 0;
kernel_error->UnMap();
} }
cl::Kernel b2f_kernel; cl::Kernel kernel;
cl_int error = runtime->BuildKernel("buffer_to_image", cl_int error = runtime->BuildKernel("buffer_to_image",
obfuscated_kernel_name, obfuscated_kernel_name,
built_options, &b2f_kernel); built_options,
&kernel);
if (error != CL_SUCCESS) { if (error != CL_SUCCESS) {
return false; return false;
} }
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
b2f_kernel.setArg(idx++, kernel.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error->buffer()))); *(static_cast<cl::Buffer *>(kernel_error->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
b2f_kernel.setArg(idx++, gws[0]);
b2f_kernel.setArg(idx++, gws[1]);
} }
b2f_kernel.setArg(idx++, *(buffer->opencl_buffer())); SET_2D_GWS_ARGS(kernel);
kernel.setArg(idx++, *(buffer->opencl_buffer()));
MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
"buffer offset not aligned"); "buffer offset not aligned");
b2f_kernel.setArg(idx++, kernel.setArg(idx++,
static_cast<uint32_t>(buffer->buffer_offset() / static_cast<uint32_t>(buffer->buffer_offset() /
GetEnumTypeSize(buffer->dtype()))); GetEnumTypeSize(buffer->dtype())));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1))); kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2))); kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3))); kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
b2f_kernel.setArg(idx++, *(image->opencl_image())); kernel.setArg(idx++, *(image->opencl_image()));
const uint32_t kwg_size = const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(b2f_kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
const std::vector<uint32_t> lws = {16, kwg_size / 16}; const std::vector<uint32_t> lws = {16, kwg_size / 16};
cl::Event event; cl::Event event;
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event); cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else { } else {
std::vector<uint32_t> roundup_gws(lws.size()); std::vector<uint32_t> roundup_gws(lws.size());
...@@ -109,7 +97,7 @@ bool BufferToImageOpImpl(Tensor *buffer, ...@@ -109,7 +97,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
} }
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event); cl::NDRange(lws[0], lws[1]), nullptr, &event);
} }
if (error != CL_SUCCESS) { if (error != CL_SUCCESS) {
......
...@@ -51,23 +51,13 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -51,23 +51,13 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad");
built_options.emplace("-Dpad=" + kernel_name); built_options.emplace("-Dpad=" + kernel_name);
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name,
built_options, &kernel_)); built_options, &kernel_));
...@@ -81,15 +71,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -81,15 +71,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0; int idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG;
kernel_.setArg(idx++, SET_3D_GWS_ARGS(kernel_);
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, this->constant_value_); kernel_.setArg(idx++, this->constant_value_);
...@@ -108,13 +91,7 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -108,13 +91,7 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(kernel_error_);
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -59,6 +59,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -59,6 +59,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value; const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name); built_options.emplace("-Dpooling=" + kernel_name);
...@@ -67,24 +69,12 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -67,24 +69,12 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
built_options.emplace(dt == DT_HALF ? "-DFP16" : ""); built_options.emplace(dt == DT_HALF ? "-DFP16" : "");
} else { } else {
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
} }
if (pooling_type_ == AVG) { if (pooling_type_ == AVG) {
built_options.emplace("-DPOOL_AVG"); built_options.emplace("-DPOOL_AVG");
} }
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling", MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling",
kernel_name, kernel_name,
built_options, built_options,
...@@ -130,15 +120,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -130,15 +120,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
}; };
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG;
kernel_.setArg(idx++, SET_3D_GWS_ARGS(kernel_);
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1))); kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2))); kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
...@@ -171,13 +154,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -171,13 +154,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws.data(), lws, future)); gws.data(), lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(kernel_error_);
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -50,26 +50,15 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()( ...@@ -50,26 +50,15 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value; const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean");
built_options.emplace("-Dreduce_mean=" + kernel_name); built_options.emplace("-Dreduce_mean=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_QUALCOMM_ADRENO"); built_options.emplace("-DNON_QUALCOMM_ADRENO");
} }
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce_mean", MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce_mean",
kernel_name, kernel_name,
built_options, built_options,
...@@ -96,15 +85,8 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()( ...@@ -96,15 +85,8 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG;
kernel_.setArg(idx++, SET_3D_GWS_ARGS(kernel_);
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, (group_size * 4 * sizeof(T)), kernel_.setArg(idx++, (group_size * 4 * sizeof(T)),
nullptr); nullptr);
...@@ -140,12 +122,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()( ...@@ -140,12 +122,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} }
MACE_CL_RET_STATUS(error); MACE_CL_RET_STATUS(error);
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(kernel_error_);
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
if (future != nullptr) { if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) { future->wait_fn = [runtime, event](CallStats *stats) {
......
...@@ -74,23 +74,13 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( ...@@ -74,23 +74,13 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR( MACE_RETURN_IF_ERROR(
runtime->BuildKernel("resize_bilinear", runtime->BuildKernel("resize_bilinear",
kernel_name, kernel_name,
...@@ -115,15 +105,8 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( ...@@ -115,15 +105,8 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
CalculateResizeScale(in_width, out_width, align_corners_); CalculateResizeScale(in_width, out_width, align_corners_);
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG;
kernel_.setArg(idx++, SET_3D_GWS_ARGS(kernel_);
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, height_scale); kernel_.setArg(idx++, height_scale);
...@@ -142,13 +125,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( ...@@ -142,13 +125,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(kernel_error_);
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -44,23 +44,13 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -44,23 +44,13 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice");
built_options.emplace("-Dslice=" + kernel_name); built_options.emplace("-Dslice=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value)); DtToCLCMDDt(DataTypeToEnum<T>::value));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("slice", MACE_RETURN_IF_ERROR(runtime->BuildKernel("slice",
kernel_name, kernel_name,
built_options, built_options,
...@@ -81,15 +71,8 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -81,15 +71,8 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
CallStats call_stats{INT64_MAX, 0}; CallStats call_stats{INT64_MAX, 0};
for (size_t i = 0; i < outputs_count; ++i) { for (size_t i = 0; i < outputs_count; ++i) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG;
kernel_.setArg(idx++, SET_3D_GWS_ARGS(kernel_);
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i)); kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
kernel_.setArg(idx++, *(output_list[i]->opencl_image())); kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
...@@ -111,12 +94,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -111,12 +94,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} }
MACE_CL_RET_STATUS(error); MACE_CL_RET_STATUS(error);
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(kernel_error_);
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
if (future != nullptr && runtime->is_profiling_enabled()) { if (future != nullptr && runtime->is_profiling_enabled()) {
event.wait(); event.wait();
CallStats tmp_stats; CallStats tmp_stats;
......
...@@ -82,23 +82,13 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits, ...@@ -82,23 +82,13 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name); built_options.emplace("-Dsoftmax=" + kernel_name);
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
built_options, &kernel_)); built_options, &kernel_));
...@@ -107,15 +97,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits, ...@@ -107,15 +97,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
} }
if (!IsVecEqual(input_shape_, logits->shape())) { if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG;
kernel_.setArg(idx++, SET_3D_GWS_ARGS(kernel_);
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(logits->opencl_image())); kernel_.setArg(idx++, *(logits->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(channels)); kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels); kernel_.setArg(idx++, remain_channels);
...@@ -130,13 +113,7 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits, ...@@ -130,13 +113,7 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(kernel_error_);
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -59,24 +59,14 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( ...@@ -59,24 +59,14 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str()); built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value)); DtToCLCMDDt(DataTypeToEnum<T>::value));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch", MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch",
obfuscated_kernel_name, obfuscated_kernel_name,
built_options, built_options,
...@@ -87,15 +77,8 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( ...@@ -87,15 +77,8 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
} }
if (!IsVecEqual(space_shape_, space_tensor->shape())) { if (!IsVecEqual(space_shape_, space_tensor->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG;
kernel_.setArg(idx++, SET_3D_GWS_ARGS(kernel_);
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
if (b2s_) { if (b2s_) {
kernel_.setArg(idx++, *(batch_tensor->opencl_image())); kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image())); kernel_.setArg(idx++, *(space_tensor->opencl_image()));
...@@ -123,13 +106,7 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( ...@@ -123,13 +106,7 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(kernel_error_);
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -29,6 +29,8 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -29,6 +29,8 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name; std::string obfuscated_kernel_name;
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
if (wino_blk_size_ == 4) { if (wino_blk_size_ == 4) {
obfuscated_kernel_name = obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4"); MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4");
...@@ -44,21 +46,9 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -44,21 +46,9 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
return MACE_SUCCESS; return MACE_SUCCESS;
} }
built_options.emplace("-DDATA_TYPE=" + built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value)); DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value)); DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
obfuscated_kernel_name, obfuscated_kernel_name,
built_options, built_options,
...@@ -107,14 +97,8 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -107,14 +97,8 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG;
kernel_.setArg(idx++, SET_2D_GWS_ARGS(kernel_);
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
}
kernel_.setArg(idx++, *(input_tensor->opencl_image())); kernel_.setArg(idx++, *(input_tensor->opencl_image()));
kernel_.setArg(idx++, *(output_tensor->opencl_image())); kernel_.setArg(idx++, *(output_tensor->opencl_image()));
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(1))); kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(1)));
...@@ -139,13 +123,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -139,13 +123,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(kernel_error_);
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
...@@ -160,6 +138,8 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -160,6 +138,8 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name; std::string obfuscated_kernel_name;
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
NON_UNIFORM_WG_CONFIG;
if (wino_blk_size_ == 4) { if (wino_blk_size_ == 4) {
obfuscated_kernel_name = obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4"); MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4");
...@@ -176,21 +156,9 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -176,21 +156,9 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
} }
built_options.emplace("-DDATA_TYPE=" + built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value)); DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value)); DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation_) { switch (activation_) {
case NOOP: case NOOP:
...@@ -240,14 +208,8 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -240,14 +208,8 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
const float round_w_r = 1.f / static_cast<float>(round_w); const float round_w_r = 1.f / static_cast<float>(round_w);
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_SET_ARG;
kernel_.setArg(idx++, SET_2D_GWS_ARGS(kernel_);
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
}
kernel_.setArg( kernel_.setArg(
idx++, idx++,
*(static_cast<const cl::Image2D *>(input_tensor->opencl_image()))); *(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
...@@ -275,12 +237,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -275,12 +237,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) { OUT_OF_RANGE_VALIDATION(kernel_error_);
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/workspace.h" #include "mace/core/workspace.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/common.h"
#include "mace/ops/ops_register.h" #include "mace/ops/ops_register.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册