提交 9d8505f4 编写于 作者: Y yejianwu

add out_of_range_check for opencl kernel

上级 9b10c71d
...@@ -323,6 +323,14 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint, ...@@ -323,6 +323,14 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
const char *kernel_path = getenv("MACE_KERNEL_PATH"); const char *kernel_path = getenv("MACE_KERNEL_PATH");
this->kernel_path_ = this->kernel_path_ =
std::string(kernel_path == nullptr ? "" : kernel_path) + "/"; std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
const char *out_of_range_check = getenv("MACE_OUT_OF_RANGE_CHECK");
if (out_of_range_check != nullptr && strlen(out_of_range_check) == 1
&& out_of_range_check[0] == '1') {
this->out_of_range_check_ = true;
} else {
this->out_of_range_check_ = false;
}
} }
OpenCLRuntime::~OpenCLRuntime() { OpenCLRuntime::~OpenCLRuntime() {
...@@ -505,4 +513,8 @@ const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName( ...@@ -505,4 +513,8 @@ const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName(
} }
} }
const bool OpenCLRuntime::IsOutOfRangeCheckEnabled() const {
return out_of_range_check_;
}
} // namespace mace } // namespace mace
...@@ -70,6 +70,7 @@ class OpenCLRuntime { ...@@ -70,6 +70,7 @@ class OpenCLRuntime {
cl::Kernel BuildKernel(const std::string &program_name, cl::Kernel BuildKernel(const std::string &program_name,
const std::string &kernel_name, const std::string &kernel_name,
const std::set<std::string> &build_options); const std::set<std::string> &build_options);
const bool IsOutOfRangeCheckEnabled() const;
private: private:
OpenCLRuntime(GPUPerfHint, GPUPriorityHint); OpenCLRuntime(GPUPerfHint, GPUPriorityHint);
...@@ -94,6 +95,7 @@ class OpenCLRuntime { ...@@ -94,6 +95,7 @@ class OpenCLRuntime {
std::string kernel_path_; std::string kernel_path_;
GPUType gpu_type_; GPUType gpu_type_;
std::string opencl_version_; std::string opencl_version_;
bool out_of_range_check_;
static GPUPerfHint gpu_perf_hint_; static GPUPerfHint gpu_perf_hint_;
static GPUPriorityHint gpu_priority_hint_; static GPUPriorityHint gpu_priority_hint_;
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#define MACE_KERNELS_ACTIVATION_H_ #define MACE_KERNELS_ACTIVATION_H_
#include <algorithm> #include <algorithm>
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -165,6 +166,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> { ...@@ -165,6 +166,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
T relux_max_limit_; T relux_max_limit_;
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::string tuning_key_prefix_; std::string tuning_key_prefix_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#include <algorithm> #include <algorithm>
#include <memory>
#include <vector> #include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
...@@ -85,6 +86,7 @@ struct AddNFunctor<DeviceType::OPENCL, T> { ...@@ -85,6 +86,7 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#include <memory>
#include <vector> #include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
...@@ -165,6 +166,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase { ...@@ -165,6 +166,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_BIAS_ADD_H_ #ifndef MACE_KERNELS_BIAS_ADD_H_
#define MACE_KERNELS_BIAS_ADD_H_ #define MACE_KERNELS_BIAS_ADD_H_
#include <memory>
#include <vector> #include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
...@@ -65,6 +66,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> { ...@@ -65,6 +66,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -5,6 +5,8 @@ ...@@ -5,6 +5,8 @@
#ifndef MACE_KERNELS_BUFFER_TO_IMAGE_H_ #ifndef MACE_KERNELS_BUFFER_TO_IMAGE_H_
#define MACE_KERNELS_BUFFER_TO_IMAGE_H_ #define MACE_KERNELS_BUFFER_TO_IMAGE_H_
#include <memory>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
...@@ -13,8 +15,10 @@ namespace mace { ...@@ -13,8 +15,10 @@ namespace mace {
namespace kernels { namespace kernels {
struct BufferToImageFunctorBase { struct BufferToImageFunctorBase {
explicit BufferToImageFunctorBase(bool i2b) : i2b_(i2b) {} explicit BufferToImageFunctorBase(bool i2b)
: i2b_(i2b), kernel_error_(nullptr) {}
bool i2b_; bool i2b_;
std::unique_ptr<BufferBase> kernel_error_;
}; };
template <DeviceType D, typename T> template <DeviceType D, typename T>
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_CHANNEL_SHUFFLE_H_ #ifndef MACE_KERNELS_CHANNEL_SHUFFLE_H_
#define MACE_KERNELS_CHANNEL_SHUFFLE_H_ #define MACE_KERNELS_CHANNEL_SHUFFLE_H_
#include <memory>
#include <vector> #include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
...@@ -57,6 +58,7 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> { ...@@ -57,6 +58,7 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
const int groups_; const int groups_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_CONCAT_H_ #ifndef MACE_KERNELS_CONCAT_H_
#define MACE_KERNELS_CONCAT_H_ #define MACE_KERNELS_CONCAT_H_
#include <memory>
#include <vector> #include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
...@@ -86,6 +87,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase { ...@@ -86,6 +87,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#include <algorithm> #include <algorithm>
#include <memory>
#include <vector> #include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
...@@ -468,6 +469,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase { ...@@ -468,6 +469,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -115,6 +115,7 @@ struct CWiseFunctor<DeviceType::OPENCL, T> : CWiseFunctorBase { ...@@ -115,6 +115,7 @@ struct CWiseFunctor<DeviceType::OPENCL, T> : CWiseFunctorBase {
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -109,6 +109,7 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> { ...@@ -109,6 +109,7 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
const int block_size_; const int block_size_;
bool d2s_; bool d2s_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
......
...@@ -454,6 +454,7 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T> ...@@ -454,6 +454,7 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -105,6 +105,7 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase { ...@@ -105,6 +105,7 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -107,6 +107,7 @@ struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase { ...@@ -107,6 +107,7 @@ struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
std::vector<uint32_t> gws_; std::vector<uint32_t> gws_;
std::vector<uint32_t> lws_; std::vector<uint32_t> lws_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
std::unique_ptr<BufferBase> kernel_error_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -68,6 +68,7 @@ struct MatMulFunctor<DeviceType::OPENCL, T> { ...@@ -68,6 +68,7 @@ struct MatMulFunctor<DeviceType::OPENCL, T> {
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -33,6 +33,14 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -33,6 +33,14 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = '0';
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -72,6 +80,10 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -72,6 +80,10 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0; int idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
...@@ -93,6 +105,13 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -93,6 +105,13 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3)); output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
} }
template struct ActivationFunctor<DeviceType::OPENCL, float>; template struct ActivationFunctor<DeviceType::OPENCL, float>;
......
...@@ -45,6 +45,14 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -45,6 +45,14 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size())); built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = '0';
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -71,6 +79,10 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -71,6 +79,10 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
output_tensor->ResizeImage(output_shape, output_image_shape); output_tensor->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
...@@ -88,6 +100,13 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -88,6 +100,13 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1] ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
<< "_" << output_shape[2] << "_" << output_shape[3]; << "_" << output_shape[2] << "_" << output_shape[3];
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
} }
template struct AddNFunctor<DeviceType::OPENCL, float>; template struct AddNFunctor<DeviceType::OPENCL, float>;
......
...@@ -36,7 +36,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -36,7 +36,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
...@@ -44,6 +43,14 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -44,6 +43,14 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options.emplace("-Dbatch_norm=" + kernel_name); built_options.emplace("-Dbatch_norm=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = '0';
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -76,6 +83,10 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -76,6 +83,10 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
...@@ -100,6 +111,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -100,6 +111,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Concat("batch_norm_opencl_kernel_", activation_, output->dim(0), Concat("batch_norm_opencl_kernel_", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3), folded_constant_); output->dim(1), output->dim(2), output->dim(3), folded_constant_);
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
} }
template struct BatchNormFunctor<DeviceType::OPENCL, float>; template struct BatchNormFunctor<DeviceType::OPENCL, float>;
......
...@@ -36,6 +36,14 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -36,6 +36,14 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options.emplace("-Dbias_add=" + kernel_name); built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = '0';
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -46,6 +54,10 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -46,6 +54,10 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
...@@ -77,6 +89,12 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -77,6 +89,12 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} }
MACE_CHECK_CL_SUCCESS(error); MACE_CHECK_CL_SUCCESS(error);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
if (future != nullptr) { if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) { future->wait_fn = [runtime, event](CallStats *stats) {
event.wait(); event.wait();
......
...@@ -13,6 +13,7 @@ template <typename T> ...@@ -13,6 +13,7 @@ template <typename T>
void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()( void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) { Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) {
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
if (!i2b_) { if (!i2b_) {
CalImage2DShape(buffer->shape(), type, &image_shape); CalImage2DShape(buffer->shape(), type, &image_shape);
if (type == WINOGRAD_FILTER) { if (type == WINOGRAD_FILTER) {
...@@ -80,10 +81,25 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -80,10 +81,25 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DCMD_DATA_TYPE=" + built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value)); DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
} }
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
if (!kernel_error_) {
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = '0';
kernel_error_->UnMap();
}
}
auto b2f_kernel = runtime->BuildKernel("buffer_to_image", auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
obfuscated_kernel_name, built_options); obfuscated_kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
b2f_kernel.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
b2f_kernel.setArg(idx++, gws[0]); b2f_kernel.setArg(idx++, gws[0]);
b2f_kernel.setArg(idx++, gws[1]); b2f_kernel.setArg(idx++, gws[1]);
...@@ -135,6 +151,12 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -135,6 +151,12 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
cl::NDRange(lws[0], lws[1]), nullptr, &event); cl::NDRange(lws[0], lws[1]), nullptr, &event);
} }
MACE_CHECK_CL_SUCCESS(error); MACE_CHECK_CL_SUCCESS(error);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
if (future != nullptr) { if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) { future->wait_fn = [runtime, event](CallStats *stats) {
event.wait(); event.wait();
......
...@@ -43,6 +43,14 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -43,6 +43,14 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = '0';
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -55,6 +63,10 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -55,6 +63,10 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
...@@ -76,6 +88,13 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -76,6 +88,13 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
<< output->dim(2) << "_" << output->dim(2) << "_"
<< output->dim(3); << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
} }
template template
......
#include <common.h> #include <common.h>
__kernel void activation(GLOBAL_WORK_GROUP_SIZE_DIM3 __kernel void activation(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, __read_only image2d_t input,
#ifdef USE_PRELU #ifdef USE_PRELU
__read_only image2d_t alpha, __read_only image2d_t alpha,
...@@ -29,6 +30,9 @@ __kernel void activation(GLOBAL_WORK_GROUP_SIZE_DIM3 ...@@ -29,6 +30,9 @@ __kernel void activation(GLOBAL_WORK_GROUP_SIZE_DIM3
#else #else
DATA_TYPE4 out = do_activation(in, relux_max_limit); DATA_TYPE4 out = do_activation(in, relux_max_limit);
#endif #endif
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, pos, hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(pos, hb), out); WRITE_IMAGET(output, (int2)(pos, hb), out);
} }
#include <common.h> #include <common.h>
__kernel void addn(GLOBAL_WORK_GROUP_SIZE_DIM2 __kernel void addn(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t input1, __read_only image2d_t input1,
#if INPUT_NUM > 2 #if INPUT_NUM > 2
...@@ -31,6 +32,9 @@ __kernel void addn(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -31,6 +32,9 @@ __kernel void addn(GLOBAL_WORK_GROUP_SIZE_DIM2
out = out + in3; out = out + in3;
#endif #endif
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, w, hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(w, hb), out); WRITE_IMAGET(output, (int2)(w, hb), out);
} }
#include <common.h> #include <common.h>
// Supported data types: half/float // Supported data types: half/float
__kernel void batch_norm(GLOBAL_WORK_GROUP_SIZE_DIM3 __kernel void batch_norm(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, __read_only image2d_t input,
__read_only image2d_t scale, __read_only image2d_t scale,
__read_only image2d_t offset, __read_only image2d_t offset,
...@@ -48,5 +49,8 @@ __kernel void batch_norm(GLOBAL_WORK_GROUP_SIZE_DIM3 ...@@ -48,5 +49,8 @@ __kernel void batch_norm(GLOBAL_WORK_GROUP_SIZE_DIM3
out = do_activation(out, relux_max_limit); out = do_activation(out, relux_max_limit);
#endif #endif
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, pos, hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(pos, hb), out); WRITE_IMAGET(output, (int2)(pos, hb), out);
} }
#include <common.h> #include <common.h>
// Supported data types: half/float // Supported data types: half/float
__kernel void bias_add(GLOBAL_WORK_GROUP_SIZE_DIM3 __kernel void bias_add(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, __read_only image2d_t input,
__read_only image2d_t bias, __read_only image2d_t bias,
__write_only image2d_t output) { __write_only image2d_t output) {
...@@ -22,5 +23,9 @@ __kernel void bias_add(GLOBAL_WORK_GROUP_SIZE_DIM3 ...@@ -22,5 +23,9 @@ __kernel void bias_add(GLOBAL_WORK_GROUP_SIZE_DIM3
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
DATA_TYPE4 bias_value = READ_IMAGET(bias, SAMPLER, (int2)(ch_blk, 0)); DATA_TYPE4 bias_value = READ_IMAGET(bias, SAMPLER, (int2)(ch_blk, 0));
DATA_TYPE4 out = in + bias_value; DATA_TYPE4 out = in + bias_value;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, pos, hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(pos, hb), out); WRITE_IMAGET(output, (int2)(pos, hb), out);
} }
#include <common.h> #include <common.h>
__kernel void filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __kernel void filter_buffer_to_image(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* h, w, oc, ic */ __global const DATA_TYPE *input, /* h, w, oc, ic */
__private const int input_offset, __private const int input_offset,
__private const int filter_h, __private const int filter_h,
...@@ -49,10 +50,14 @@ __kernel void filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -49,10 +50,14 @@ __kernel void filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
} }
int2 coord = (int2)(w, h); int2 coord = (int2)(w, h);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, w, h, kernel_error);
#endif
WRITE_IMAGET(output, coord, values); WRITE_IMAGET(output, coord, values);
} }
__kernel void filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 __kernel void filter_image_to_buffer(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, /* h, w, oc, ic */ __global DATA_TYPE *output, /* h, w, oc, ic */
__private const int filter_h, __private const int filter_h,
__private const int filter_w, __private const int filter_w,
...@@ -100,7 +105,8 @@ __kernel void filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -100,7 +105,8 @@ __kernel void filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
} }
} }
__kernel void dw_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __kernel void dw_filter_buffer_to_image(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* h, w, ic, m */ __global const DATA_TYPE *input, /* h, w, ic, m */
__private const int input_offset, __private const int input_offset,
__private const int filter_w, __private const int filter_w,
...@@ -154,10 +160,14 @@ __kernel void dw_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -154,10 +160,14 @@ __kernel void dw_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
} }
int2 coord = (int2)(w, h); int2 coord = (int2)(w, h);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, w, h, kernel_error);
#endif
WRITE_IMAGET(output, coord, values); WRITE_IMAGET(output, coord, values);
} }
__kernel void in_out_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __kernel void in_out_buffer_to_image(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* nhwc */ __global const DATA_TYPE *input, /* nhwc */
__private const int input_offset, __private const int input_offset,
__private const int height, __private const int height,
...@@ -195,10 +205,14 @@ __kernel void in_out_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -195,10 +205,14 @@ __kernel void in_out_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
values = vload4(0, input + offset); values = vload4(0, input + offset);
} }
int2 coord = (int2)(w, h); int2 coord = (int2)(w, h);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, w, h, kernel_error);
#endif
WRITE_IMAGET(output, coord, values); WRITE_IMAGET(output, coord, values);
} }
__kernel void in_out_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 __kernel void in_out_image_to_buffer(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, /* nhwc */ __global DATA_TYPE *output, /* nhwc */
__private const int height, __private const int height,
__private const int width, __private const int width,
...@@ -237,7 +251,8 @@ __kernel void in_out_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -237,7 +251,8 @@ __kernel void in_out_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
} }
} }
__kernel void arg_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __kernel void arg_buffer_to_image(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* nhwc */ __global const DATA_TYPE *input, /* nhwc */
__private const int input_offset, __private const int input_offset,
__private const int count, __private const int count,
...@@ -269,10 +284,14 @@ __kernel void arg_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -269,10 +284,14 @@ __kernel void arg_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
values = vload4(0, input + offset); values = vload4(0, input + offset);
} }
int2 coord = (int2)(w, h); int2 coord = (int2)(w, h);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, w, h, kernel_error);
#endif
WRITE_IMAGET(output, coord, values); WRITE_IMAGET(output, coord, values);
} }
__kernel void arg_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 __kernel void arg_image_to_buffer(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, /* nhwc */ __global DATA_TYPE *output, /* nhwc */
__private const int count, __private const int count,
__read_only image2d_t input) { __read_only image2d_t input) {
...@@ -305,7 +324,8 @@ __kernel void arg_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -305,7 +324,8 @@ __kernel void arg_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
} }
__kernel void in_out_height_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __kernel void in_out_height_buffer_to_image(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //nhwc __global const DATA_TYPE *input, //nhwc
__private const int input_offset, __private const int input_offset,
__private const int height, __private const int height,
...@@ -344,10 +364,14 @@ __kernel void in_out_height_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -344,10 +364,14 @@ __kernel void in_out_height_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
values.x = *(input + offset); values.x = *(input + offset);
} }
int2 coord = (int2)(w, h); int2 coord = (int2)(w, h);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, w, h, kernel_error);
#endif
WRITE_IMAGET(output, coord, values); WRITE_IMAGET(output, coord, values);
} }
__kernel void in_out_height_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 __kernel void in_out_height_image_to_buffer(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //nhwc __global DATA_TYPE *output, //nhwc
__private const int height, __private const int height,
__private const int width, __private const int width,
...@@ -385,7 +409,8 @@ __kernel void in_out_height_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -385,7 +409,8 @@ __kernel void in_out_height_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
} }
__kernel void in_out_width_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __kernel void in_out_width_buffer_to_image(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* nhwc */ __global const DATA_TYPE *input, /* nhwc */
__private const int input_offset, __private const int input_offset,
__private const int height, __private const int height,
...@@ -423,11 +448,15 @@ __kernel void in_out_width_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -423,11 +448,15 @@ __kernel void in_out_width_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
values.x = *(input + offset); values.x = *(input + offset);
} }
int2 coord = (int2)(w, h); int2 coord = (int2)(w, h);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, w, h, kernel_error);
#endif
WRITE_IMAGET(output, coord, values); WRITE_IMAGET(output, coord, values);
} }
// only support 3x3 now // only support 3x3 now
__kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W __global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset, __private const int input_offset,
__private const int in_channels, __private const int in_channels,
...@@ -495,6 +524,11 @@ __kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -495,6 +524,11 @@ __kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
tu3[1] = tt + tu3[1] / 2; tu3[1] = tt + tu3[1] / 2;
int2 coord = (int2)(w, h); int2 coord = (int2)(w, h);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, coord.x, coord.y + out_channels * 15, kernel_error);
#endif
#pragma unroll #pragma unroll
for (short i = 0; i < 4; ++i) { for (short i = 0; i < 4; ++i) {
WRITE_IMAGET(output, coord, tu0[i]); WRITE_IMAGET(output, coord, tu0[i]);
...@@ -518,7 +552,8 @@ __kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -518,7 +552,8 @@ __kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
} }
// only support 3x3 now // only support 3x3 now
__kernel void winograd_filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 __kernel void winograd_filter_image_to_buffer(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W __global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height, __private const int height,
__private const int width, __private const int width,
......
#include <common.h> #include <common.h>
// assume channes_per_group mod 4 = 0 && groups mod 4 == 0 // assume channes_per_group mod 4 = 0 && groups mod 4 == 0
__kernel void channel_shuffle(GLOBAL_WORK_GROUP_SIZE_DIM3 __kernel void channel_shuffle(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, __read_only image2d_t input,
__private const int groups, __private const int groups,
__private const int channels_per_group, __private const int channels_per_group,
...@@ -49,6 +50,11 @@ __kernel void channel_shuffle(GLOBAL_WORK_GROUP_SIZE_DIM3 ...@@ -49,6 +50,11 @@ __kernel void channel_shuffle(GLOBAL_WORK_GROUP_SIZE_DIM3
out_chan_data3 = (DATA_TYPE4)(in_chan_data0.w, in_chan_data1.w, in_chan_data2.w, in_chan_data3.w); out_chan_data3 = (DATA_TYPE4)(in_chan_data0.w, in_chan_data1.w, in_chan_data2.w, in_chan_data3.w);
int out_x = mad24(mad24(group_chan_blk_idx, groups, g_blk), width, width_idx); int out_x = mad24(mad24(group_chan_blk_idx, groups, g_blk), width, width_idx);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x + groups_blks_width * 3, hb_idx, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x, hb_idx), out_chan_data0); WRITE_IMAGET(output, (int2)(out_x, hb_idx), out_chan_data0);
out_x += groups_blks_width; out_x += groups_blks_width;
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE) #define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE) #define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
#ifndef NON_UNIFORM_WORK_GROUP #ifndef NON_UNIFORM_WORK_GROUP
#define GLOBAL_WORK_GROUP_SIZE_DIM2 \ #define GLOBAL_WORK_GROUP_SIZE_DIM2 \
...@@ -34,6 +35,18 @@ ...@@ -34,6 +35,18 @@
#endif #endif
#ifdef OUT_OF_RANGE_CHECK
#define KERNEL_ERROR_PARAMS \
__global char *kernel_error,
#else
#define KERNEL_ERROR_PARAMS
#endif
__constant sampler_t SAMPLER = __constant sampler_t SAMPLER =
CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
...@@ -61,4 +74,14 @@ inline DATA_TYPE4 do_activation(DATA_TYPE4 in, ...@@ -61,4 +74,14 @@ inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
return out; return out;
} }
inline void check_out_of_range_for_image2d(__write_only image2d_t image,
__private const int x,
__private const int y,
global char *kernel_error) {
int2 image_dim = get_image_dim(image);
if (x >= image_dim.x || y >= image_dim.y) {
*kernel_error = '1';
}
}
#endif // MACE_KERNELS_OPENCL_CL_COMMON_H_ #endif // MACE_KERNELS_OPENCL_CL_COMMON_H_
...@@ -22,7 +22,8 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left, ...@@ -22,7 +22,8 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
} }
// Supported data type: half/float // Supported data type: half/float
__kernel void concat_channel(GLOBAL_WORK_GROUP_SIZE_DIM3 __kernel void concat_channel(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input0, __read_only image2d_t input0,
__read_only image2d_t input1, __read_only image2d_t input1,
__private const int input0_chan, __private const int input0_chan,
...@@ -79,11 +80,17 @@ __kernel void concat_channel(GLOBAL_WORK_GROUP_SIZE_DIM3 ...@@ -79,11 +80,17 @@ __kernel void concat_channel(GLOBAL_WORK_GROUP_SIZE_DIM3
} }
#endif #endif
WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx), data); const int pos = mad24(chan_blk_idx, width, width_idx);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, pos, hb_idx, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(pos, hb_idx), data);
} }
// Required: All input channels are divisible by 4 // Required: All input channels are divisible by 4
__kernel void concat_channel_multi(GLOBAL_WORK_GROUP_SIZE_DIM3 __kernel void concat_channel_multi(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, __read_only image2d_t input,
__private const int chan_blk_offset, __private const int chan_blk_offset,
__write_only image2d_t output) { __write_only image2d_t output) {
...@@ -106,7 +113,12 @@ __kernel void concat_channel_multi(GLOBAL_WORK_GROUP_SIZE_DIM3 ...@@ -106,7 +113,12 @@ __kernel void concat_channel_multi(GLOBAL_WORK_GROUP_SIZE_DIM3
SAMPLER, SAMPLER,
(int2)(mad24(chan_blk_idx, width, width_idx), hb_idx)); (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx));
WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx + chan_blk_offset, width, width_idx), hb_idx), data); const int pos = mad24(chan_blk_idx + chan_blk_offset, width, width_idx);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, pos, hb_idx, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(pos, hb_idx), data);
} }
//__kernel void concat_width(__read_only image2d_t input0, //__kernel void concat_width(__read_only image2d_t input0,
......
#include <common.h> #include <common.h>
__kernel void conv_2d(GLOBAL_WORK_GROUP_SIZE_DIM3 __kernel void conv_2d(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */ __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
#ifdef BIAS #ifdef BIAS
...@@ -126,19 +127,32 @@ __kernel void conv_2d(GLOBAL_WORK_GROUP_SIZE_DIM3 ...@@ -126,19 +127,32 @@ __kernel void conv_2d(GLOBAL_WORK_GROUP_SIZE_DIM3
#endif #endif
const int out_x_base = mul24(out_ch_blk, out_width); const int out_x_base = mul24(out_ch_blk, out_width);
int w = out_w_blk; int w = out_w_blk;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0); WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0);
w += out_w_blks; w += out_w_blks;
if (w >= out_width) return; if (w >= out_width) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1); WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1);
w += out_w_blks; w += out_w_blks;
if (w >= out_width) return; if (w >= out_width) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2); WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2);
w += out_w_blks; w += out_w_blks;
if (w >= out_width) return; if (w >= out_width) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3); WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
} }
#include <common.h> #include <common.h>
__kernel void conv_2d_1x1(GLOBAL_WORK_GROUP_SIZE_DIM3 __kernel void conv_2d_1x1(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin, cout/4 */ __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
#ifdef BIAS #ifdef BIAS
...@@ -104,17 +105,29 @@ __kernel void conv_2d_1x1(GLOBAL_WORK_GROUP_SIZE_DIM3 ...@@ -104,17 +105,29 @@ __kernel void conv_2d_1x1(GLOBAL_WORK_GROUP_SIZE_DIM3
const int out_x_base = mul24(out_ch_blk, width); const int out_x_base = mul24(out_ch_blk, width);
int out_x_idx = out_w_blk; int out_x_idx = out_w_blk;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + out_x_idx, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0); WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0);
out_x_idx += out_w_blks; out_x_idx += out_w_blks;
if (out_x_idx >= width) return; if (out_x_idx >= width) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + out_x_idx, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out1); WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out1);
out_x_idx += out_w_blks; out_x_idx += out_w_blks;
if (out_x_idx >= width) return; if (out_x_idx >= width) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + out_x_idx, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out2); WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out2);
out_x_idx += out_w_blks; out_x_idx += out_w_blks;
if (out_x_idx >= width) return; if (out_x_idx >= width) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + out_x_idx, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3); WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3);
} }
#include <common.h> #include <common.h>
__kernel void conv_2d_3x3(GLOBAL_WORK_GROUP_SIZE_DIM3 __kernel void conv_2d_3x3(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */ __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */
#ifdef BIAS #ifdef BIAS
...@@ -135,30 +136,45 @@ __kernel void conv_2d_3x3(GLOBAL_WORK_GROUP_SIZE_DIM3 ...@@ -135,30 +136,45 @@ __kernel void conv_2d_3x3(GLOBAL_WORK_GROUP_SIZE_DIM3
const int out_x_base = mul24(out_ch_blk, out_width); const int out_x_base = mul24(out_ch_blk, out_width);
int w = out_w_blk; int w = out_w_blk;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb), (int2)(out_x_base + w, out_hb),
out0); out0);
w += out_w_blks; w += out_w_blks;
if (w >= out_width) return; if (w >= out_width) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb), (int2)(out_x_base + w, out_hb),
out1); out1);
w += out_w_blks; w += out_w_blks;
if (w >= out_width) return; if (w >= out_width) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb), (int2)(out_x_base + w, out_hb),
out2); out2);
w += out_w_blks; w += out_w_blks;
if (w >= out_width) return; if (w >= out_width) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb), (int2)(out_x_base + w, out_hb),
out3); out3);
w += out_w_blks; w += out_w_blks;
if (w >= out_width) return; if (w >= out_width) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb), (int2)(out_x_base + w, out_hb),
out4); out4);
......
#include <common.h> #include <common.h>
__kernel void cwise(GLOBAL_WORK_GROUP_SIZE_DIM2 __kernel void cwise(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__private const float value, __private const float value,
__write_only image2d_t output) { __write_only image2d_t output) {
...@@ -43,5 +44,8 @@ __kernel void cwise(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -43,5 +44,8 @@ __kernel void cwise(GLOBAL_WORK_GROUP_SIZE_DIM2
out.w = fabs(in0.w); out.w = fabs(in0.w);
#endif #endif
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, w, hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(w, hb), out); WRITE_IMAGET(output, (int2)(w, hb), out);
} }
#include <common.h> #include <common.h>
__kernel void depth_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3 __kernel void depth_to_space(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, __read_only image2d_t input,
__private const int block_size, __private const int block_size,
__private const int input_height, __private const int input_hb,
__private const int input_width, __private const int input_width,
__private const int input_depth_blocks, __private const int input_depth_blocks,
__private const int output_height,
__private const int output_width, __private const int output_width,
__private const int output_depth_blocks, __private const int output_depth_blocks,
__write_only image2d_t output) { __write_only image2d_t output) {
const int out_d = get_global_id(0); const int out_d = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_h = get_global_id(2); const int out_hb = get_global_id(2);
if (out_d >= output_depth_blocks || out_h >= output_height || out_w >= output_width) #ifndef NON_UNIFORM_WORK_GROUP
if (out_d >= global_size_dim0 || out_w >= global_size_dim1
|| out_hb >= global_size_dim2) {
return; return;
}
#endif
const int out_pos = mad24(out_d, output_width, out_w); const int out_pos = mad24(out_d, output_width, out_w);
const int in_h = out_h / block_size; const int in_hb = out_hb / block_size;
const int offset_h = out_h % block_size; const int offset_h = out_hb % block_size;
const int in_w = out_w / block_size; const int in_w = out_w / block_size;
const int offset_w = out_w % block_size; const int offset_w = out_w % block_size;
const int offset_d = (offset_h * block_size + offset_w) * output_depth_blocks; const int offset_d = (offset_h * block_size + offset_w) * output_depth_blocks;
const int in_d = out_d + offset_d; const int in_d = out_d + offset_d;
if (in_h >= input_height || in_w >= input_width || in_d >= input_depth_blocks) if (in_hb >= input_hb || in_w >= input_width || in_d >= input_depth_blocks) {
return; return;
}
const int in_pos = mad24(in_d, input_width, in_w); const int in_pos = mad24(in_d, input_width, in_w);
DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, in_h)); DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, in_hb));
WRITE_IMAGET(output, (int2)(out_pos, out_h), in_data);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_pos, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data);
} }
__kernel void space_to_depth( __kernel void space_to_depth(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3 GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, __read_only image2d_t input,
__private const int block_size, __private const int block_size,
__private const int input_height,
__private const int input_width, __private const int input_width,
__private const int input_depth_blocks, __private const int input_depth_blocks,
__private const int output_height, __private const int output_hb,
__private const int output_width, __private const int output_width,
__private const int output_depth_blocks, __private const int output_depth_blocks,
__write_only image2d_t output) { __write_only image2d_t output) {
const int d = get_global_id(0); const int d = get_global_id(0);
const int w = get_global_id(1); const int w = get_global_id(1);
const int h = get_global_id(2); const int hb = get_global_id(2);
if (h >= input_height || w >= input_width || d >= input_depth_blocks) #ifndef NON_UNIFORM_WORK_GROUP
if (d >= global_size_dim0 || w >= global_size_dim1
|| hb >= global_size_dim2) {
return; return;
}
#endif
const int in_pos = mad24(d, input_width, w); const int in_pos = mad24(d, input_width, w);
const int out_h = h / block_size; const int out_hb = hb / block_size;
const int offset_h = h % block_size; const int offset_h = hb % block_size;
const int out_w = w / block_size; const int out_w = w / block_size;
const int offset_w = w % block_size; const int offset_w = w % block_size;
const int offset_d = (offset_h * block_size + offset_w) * input_depth_blocks; const int offset_d = (offset_h * block_size + offset_w) * input_depth_blocks;
const int out_d = d + offset_d; const int out_d = d + offset_d;
if (out_d >= output_depth_blocks || out_h >= output_height || out_w >= output_width) if (out_d >= output_depth_blocks || out_hb >= output_hb || out_w >= output_width) {
return; return;
}
const int out_pos = mad24(out_d, output_width, out_w); const int out_pos = mad24(out_d, output_width, out_w);
DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, h)); DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, hb));
WRITE_IMAGET(output, (int2)(out_pos, out_h), in_data);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_pos, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data);
} }
#include <common.h> #include <common.h>
// Only multiplier = 1 is supported // Only multiplier = 1 is supported
__kernel void depthwise_conv2d(GLOBAL_WORK_GROUP_SIZE_DIM3 __kernel void depthwise_conv2d(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
#ifdef BIAS #ifdef BIAS
...@@ -122,22 +123,35 @@ __kernel void depthwise_conv2d(GLOBAL_WORK_GROUP_SIZE_DIM3 ...@@ -122,22 +123,35 @@ __kernel void depthwise_conv2d(GLOBAL_WORK_GROUP_SIZE_DIM3
const short out_x_base = mul24(out_ch_blk, out_width); const short out_x_base = mul24(out_ch_blk, out_width);
short w = out_w_blk; short w = out_w_blk;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0); WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0);
w += out_w_blks; w += out_w_blks;
if (w >= out_width) return; if (w >= out_width) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1); WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1);
w += out_w_blks; w += out_w_blks;
if (w >= out_width) return; if (w >= out_width) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2); WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2);
w += out_w_blks; w += out_w_blks;
if (w >= out_width) return; if (w >= out_width) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3); WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
} }
__kernel void depthwise_conv2d_s1(GLOBAL_WORK_GROUP_SIZE_DIM3 __kernel void depthwise_conv2d_s1(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
#ifdef BIAS #ifdef BIAS
...@@ -247,17 +261,29 @@ __kernel void depthwise_conv2d_s1(GLOBAL_WORK_GROUP_SIZE_DIM3 ...@@ -247,17 +261,29 @@ __kernel void depthwise_conv2d_s1(GLOBAL_WORK_GROUP_SIZE_DIM3
const short out_x_base = mul24(out_ch_blk, out_width); const short out_x_base = mul24(out_ch_blk, out_width);
short w = out_w_blk; short w = out_w_blk;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0); WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0);
w += 1; w += 1;
if (w >= out_width) return; if (w >= out_width) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1); WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1);
w += 1; w += 1;
if (w >= out_width) return; if (w >= out_width) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2); WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2);
w += 1; w += 1;
if (w >= out_width) return; if (w >= out_width) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3); WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
} }
#include <common.h> #include <common.h>
__kernel void eltwise(GLOBAL_WORK_GROUP_SIZE_DIM2 __kernel void eltwise(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t input1, __read_only image2d_t input1,
#ifdef COEFF_SUM #ifdef COEFF_SUM
...@@ -36,5 +37,8 @@ __kernel void eltwise(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -36,5 +37,8 @@ __kernel void eltwise(GLOBAL_WORK_GROUP_SIZE_DIM2
out = in0 - in1; out = in0 - in1;
#endif #endif
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, w, hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(w, hb), out); WRITE_IMAGET(output, (int2)(w, hb), out);
} }
#include <common.h> #include <common.h>
// output = weight * input + bias // output = weight * input + bias
__kernel void fully_connected(GLOBAL_WORK_GROUP_SIZE_DIM2 __kernel void fully_connected(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input, __read_only image2d_t input,
__read_only image2d_t weight, __read_only image2d_t weight,
#ifdef BIAS #ifdef BIAS
...@@ -58,11 +59,16 @@ __kernel void fully_connected(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -58,11 +59,16 @@ __kernel void fully_connected(GLOBAL_WORK_GROUP_SIZE_DIM2
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID) #if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
result = do_activation(result, relux_max_limit); result = do_activation(result, relux_max_limit);
#endif #endif
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_blk_idx, batch_idx, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_blk_idx, batch_idx), result); WRITE_IMAGET(output, (int2)(out_blk_idx, batch_idx), result);
} }
// output = weight * input + bias // output = weight * input + bias
__kernel void fully_connected_width(GLOBAL_WORK_GROUP_SIZE_DIM3 __kernel void fully_connected_width(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, __read_only image2d_t input,
__read_only image2d_t weight, __read_only image2d_t weight,
#ifdef BIAS #ifdef BIAS
...@@ -147,6 +153,10 @@ __kernel void fully_connected_width(GLOBAL_WORK_GROUP_SIZE_DIM3 ...@@ -147,6 +153,10 @@ __kernel void fully_connected_width(GLOBAL_WORK_GROUP_SIZE_DIM3
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID) #if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
result = do_activation(result, relux_max_limit); result = do_activation(result, relux_max_limit);
#endif #endif
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_blk_idx, batch_idx, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_blk_idx, batch_idx), result); WRITE_IMAGET(output, (int2)(out_blk_idx, batch_idx), result);
} }
} }
#include <common.h> #include <common.h>
// C = A * B // C = A * B
__kernel void matmul(GLOBAL_WORK_GROUP_SIZE_DIM2 __kernel void matmul(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t A, __read_only image2d_t A,
__read_only image2d_t B, __read_only image2d_t B,
__write_only image2d_t C, __write_only image2d_t C,
...@@ -46,11 +47,27 @@ __kernel void matmul(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -46,11 +47,27 @@ __kernel void matmul(GLOBAL_WORK_GROUP_SIZE_DIM2
c3 += (DATA_TYPE4)(dot(a0, b3), dot(a1, b3), dot(a2, b3), dot(a3, b3)); c3 += (DATA_TYPE4)(dot(a0, b3), dot(a1, b3), dot(a2, b3), dot(a3, b3));
} }
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(C, gx, gy, kernel_error);
#endif
WRITE_IMAGET(C, (int2)(gx, gy), c0); WRITE_IMAGET(C, (int2)(gx, gy), c0);
if ((gx + 1) >= N) return; if ((gx + 1) >= N) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(C, gx + 1, gy, kernel_error);
#endif
WRITE_IMAGET(C, (int2)(gx + 1, gy), c1); WRITE_IMAGET(C, (int2)(gx + 1, gy), c1);
if ((gx + 2) >= N) return; if ((gx + 2) >= N) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(C, gx + 2, gy, kernel_error);
#endif
WRITE_IMAGET(C, (int2)(gx + 2, gy), c2); WRITE_IMAGET(C, (int2)(gx + 2, gy), c2);
if ((gx + 3) >= N) return; if ((gx + 3) >= N) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(C, gx + 3, gy, kernel_error);
#endif
WRITE_IMAGET(C, (int2)(gx + 3, gy), c3); WRITE_IMAGET(C, (int2)(gx + 3, gy), c3);
} }
...@@ -19,7 +19,8 @@ inline int calculate_avg_block_size(const int pool_size, ...@@ -19,7 +19,8 @@ inline int calculate_avg_block_size(const int pool_size,
} }
// Supported data type: half/float // Supported data type: half/float
__kernel void pooling(GLOBAL_WORK_GROUP_SIZE_DIM3 __kernel void pooling(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, __read_only image2d_t input,
__private const int in_height, __private const int in_height,
__private const int in_width, __private const int in_width,
...@@ -94,5 +95,9 @@ __kernel void pooling(GLOBAL_WORK_GROUP_SIZE_DIM3 ...@@ -94,5 +95,9 @@ __kernel void pooling(GLOBAL_WORK_GROUP_SIZE_DIM3
} }
#endif #endif
WRITE_IMAGET(output, (int2)(mad24(out_chan_idx, out_width, out_width_idx), out_hb_idx), res); const int pos = mad24(out_chan_idx, out_width, out_width_idx);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, pos, out_hb_idx, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(pos, out_hb_idx), res);
} }
#include <common.h> #include <common.h>
__kernel void resize_bilinear_nocache(GLOBAL_WORK_GROUP_SIZE_DIM3 __kernel void resize_bilinear_nocache(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__write_only image2d_t output, __write_only image2d_t output,
__private const float height_scale, __private const float height_scale,
...@@ -56,6 +57,10 @@ __kernel void resize_bilinear_nocache(GLOBAL_WORK_GROUP_SIZE_DIM3 ...@@ -56,6 +57,10 @@ __kernel void resize_bilinear_nocache(GLOBAL_WORK_GROUP_SIZE_DIM3
const int out_w_offset = mul24(ch_blk, out_width); const int out_w_offset = mul24(ch_blk, out_width);
const int out_h_offset = mul24(b, out_height); const int out_h_offset = mul24(b, out_height);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_w_offset + w, out_h_offset + h, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_w_offset + w, out_h_offset + h), out); WRITE_IMAGET(output, (int2)(out_w_offset + w, out_h_offset + h), out);
} }
#include <common.h> #include <common.h>
__kernel void slice(GLOBAL_WORK_GROUP_SIZE_DIM3 __kernel void slice(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, __read_only image2d_t input,
__private const int chan_blk_offset, __private const int chan_blk_offset,
__write_only image2d_t output) { __write_only image2d_t output) {
...@@ -21,6 +22,11 @@ __kernel void slice(GLOBAL_WORK_GROUP_SIZE_DIM3 ...@@ -21,6 +22,11 @@ __kernel void slice(GLOBAL_WORK_GROUP_SIZE_DIM3
DATA_TYPE4 data = READ_IMAGET(input, SAMPLER, DATA_TYPE4 data = READ_IMAGET(input, SAMPLER,
(int2)(mad24(chan_blk_idx + chan_blk_offset, (int2)(mad24(chan_blk_idx + chan_blk_offset,
width, width_idx), hb_idx)); width, width_idx), hb_idx));
const int pos = mad24(chan_blk_idx, width, width_idx);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, pos, hb_idx, kernel_error);
#endif
WRITE_IMAGET(output, WRITE_IMAGET(output,
(int2)(mad24(chan_blk_idx, width, width_idx), hb_idx), data); (int2)(pos, hb_idx), data);
} }
#include <common.h> #include <common.h>
__kernel void softmax(GLOBAL_WORK_GROUP_SIZE_DIM3 __kernel void softmax(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, __read_only image2d_t input,
__private const int channels, __private const int channels,
__private const int remain_channels, __private const int remain_channels,
...@@ -84,5 +85,8 @@ __kernel void softmax(GLOBAL_WORK_GROUP_SIZE_DIM3 ...@@ -84,5 +85,8 @@ __kernel void softmax(GLOBAL_WORK_GROUP_SIZE_DIM3
data = native_exp(data) / sum; data = native_exp(data) / sum;
} }
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, pos, hb_idx, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(pos, hb_idx), data); WRITE_IMAGET(output, (int2)(pos, hb_idx), data);
} }
#include <common.h> #include <common.h>
__kernel void space_to_batch(GLOBAL_WORK_GROUP_SIZE_DIM3 __kernel void space_to_batch(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t space_data, __read_only image2d_t space_data,
__write_only image2d_t batch_data, __write_only image2d_t batch_data,
__private const int block_height, __private const int block_height,
...@@ -44,10 +45,15 @@ __kernel void space_to_batch(GLOBAL_WORK_GROUP_SIZE_DIM3 ...@@ -44,10 +45,15 @@ __kernel void space_to_batch(GLOBAL_WORK_GROUP_SIZE_DIM3
DATA_TYPE4 value = READ_IMAGET(space_data, SAMPLER, space_coord); DATA_TYPE4 value = READ_IMAGET(space_data, SAMPLER, space_coord);
int2 batch_coord = (int2)(mul24(chan_idx, batch_width) + batch_w_idx, batch_hb_idx); int2 batch_coord = (int2)(mul24(chan_idx, batch_width) + batch_w_idx, batch_hb_idx);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(batch_data, batch_coord.x, batch_coord.y, kernel_error);
#endif
WRITE_IMAGET(batch_data, batch_coord, value); WRITE_IMAGET(batch_data, batch_coord, value);
} }
__kernel void batch_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3 __kernel void batch_to_space(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t batch_data, __read_only image2d_t batch_data,
__write_only image2d_t space_data, __write_only image2d_t space_data,
__private const int block_height, __private const int block_height,
...@@ -87,6 +93,10 @@ __kernel void batch_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3 ...@@ -87,6 +93,10 @@ __kernel void batch_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3
int2 space_coord = (int2)(mul24(chan_idx, space_width) + space_w_idx, int2 space_coord = (int2)(mul24(chan_idx, space_width) + space_w_idx,
space_b_idx * space_height + space_h_idx); space_b_idx * space_height + space_h_idx);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(space_data, space_coord.x, space_coord.y, kernel_error);
#endif
WRITE_IMAGET(space_data, space_coord, value); WRITE_IMAGET(space_data, space_coord, value);
} }
} }
#include <common.h> #include <common.h>
__kernel void winograd_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2 __kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input, __read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const int in_height, __private const int in_height,
...@@ -93,6 +94,9 @@ __kernel void winograd_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -93,6 +94,9 @@ __kernel void winograd_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2
input3[2] = tv3[2] - tv3[1]; input3[2] = tv3[2] - tv3[1];
input3[3] = tv3[1] - tv3[3]; input3[3] = tv3[1] - tv3[3];
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_width_idx, chan_blk_idx + chan_blk_idx * 15, kernel_error);
#endif
#pragma unroll #pragma unroll
for (short i = 0; i < 4; ++i) { for (short i = 0; i < 4; ++i) {
WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), input0[i]); WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), input0[i]);
...@@ -115,7 +119,8 @@ __kernel void winograd_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -115,7 +119,8 @@ __kernel void winograd_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2
} }
} }
__kernel void winograd_inverse_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2 __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input, __read_only image2d_t input,
#ifdef BIAS #ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */ __read_only image2d_t bias, /* cout%4 * cout/4 */
...@@ -208,18 +213,30 @@ __kernel void winograd_inverse_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2 ...@@ -208,18 +213,30 @@ __kernel void winograd_inverse_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2
in1[1] = do_activation(in1[1], relux_max_limit); in1[1] = do_activation(in1[1], relux_max_limit);
#endif #endif
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, coord_x, coord_y, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(coord_x, coord_y), in0[0]); WRITE_IMAGET(output, (int2)(coord_x, coord_y), in0[0]);
t = 0; t = 0;
if (out_width_idx + 1 < out_width) { if (out_width_idx + 1 < out_width) {
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, coord_x + 1, coord_y, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(coord_x + 1, coord_y), in0[1]); WRITE_IMAGET(output, (int2)(coord_x + 1, coord_y), in0[1]);
t += 1; t += 1;
} }
if (out_height_idx + 1 < out_height) { if (out_height_idx + 1 < out_height) {
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, coord_x, coord_y + 1, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(coord_x, coord_y + 1), in1[0]); WRITE_IMAGET(output, (int2)(coord_x, coord_y + 1), in1[0]);
t += 1; t += 1;
} }
if (t == 2) { if (t == 2) {
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, coord_x + 1, coord_y + 1, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(coord_x + 1, coord_y + 1), in1[1]); WRITE_IMAGET(output, (int2)(coord_x + 1, coord_y + 1), in1[1]);
} }
......
...@@ -18,7 +18,8 @@ static void Concat2(cl::Kernel *kernel, ...@@ -18,7 +18,8 @@ static void Concat2(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size) { uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -36,6 +37,14 @@ static void Concat2(cl::Kernel *kernel, ...@@ -36,6 +37,14 @@ static void Concat2(cl::Kernel *kernel,
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name); built_options.emplace("-Dconcat_channel=" + kernel_name);
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = '0';
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -56,6 +65,10 @@ static void Concat2(cl::Kernel *kernel, ...@@ -56,6 +65,10 @@ static void Concat2(cl::Kernel *kernel,
} }
if (!IsVecEqual(*prev_input_shape, input0->shape())) { if (!IsVecEqual(*prev_input_shape, input0->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[1]);
...@@ -77,6 +90,13 @@ static void Concat2(cl::Kernel *kernel, ...@@ -77,6 +90,13 @@ static void Concat2(cl::Kernel *kernel,
ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3); << "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future); TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
} }
static void ConcatN(cl::Kernel *kernel, static void ConcatN(cl::Kernel *kernel,
...@@ -84,7 +104,8 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -84,7 +104,8 @@ static void ConcatN(cl::Kernel *kernel,
const DataType dt, const DataType dt,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size) { uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -98,6 +119,14 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -98,6 +119,14 @@ static void ConcatN(cl::Kernel *kernel,
built_options.emplace("-Dconcat_channel_multi=" + kernel_name); built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = '0';
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -117,6 +146,10 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -117,6 +146,10 @@ static void ConcatN(cl::Kernel *kernel,
}; };
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[1]);
...@@ -132,6 +165,13 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -132,6 +165,13 @@ static void ConcatN(cl::Kernel *kernel,
ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_" ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_"
<< batch * height; << batch * height;
TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future); TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
} }
} }
...@@ -172,12 +212,12 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -172,12 +212,12 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
switch (inputs_count) { switch (inputs_count) {
case 2: case 2:
Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value, Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value,
&input_shape_, output, future, &kwg_size_); &input_shape_, output, future, &kwg_size_, &kernel_error_);
break; break;
default: default:
if (divisible_four) { if (divisible_four) {
ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future, ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future,
&kwg_size_); &kwg_size_, &kernel_error_);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
......
...@@ -21,7 +21,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -21,7 +21,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size); uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error);
extern void Conv2dOpenclK3x3(cl::Kernel *kernel, extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
...@@ -36,7 +37,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -36,7 +37,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size); uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error);
extern void Conv2dOpencl(cl::Kernel *kernel, extern void Conv2dOpencl(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
...@@ -51,7 +53,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -51,7 +53,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size); uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error);
template <typename T> template <typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...@@ -65,7 +68,7 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -65,7 +68,7 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const int *dilations, const ActivationType activation, const int *dilations, const ActivationType activation,
const float relux_max_limit, const DataType dt, const float relux_max_limit, const DataType dt,
std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future, std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
uint32_t *kwg_size); uint32_t *kwg_size, std::unique_ptr<BufferBase> *kernel_error);
// Selection matrix: kernel_size x stride_size // Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5] = { static const Conv2dOpenclFunction selector[5] = {
Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr}; Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
...@@ -106,12 +109,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -106,12 +109,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_, dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future, DataTypeToEnum<T>::value, &input_shape_, output, future,
&kwg_size_); &kwg_size_, &kernel_error_);
} else { } else {
Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_, dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future, DataTypeToEnum<T>::value, &input_shape_, output, future,
&kwg_size_); &kwg_size_, &kernel_error_);
} }
} }
......
...@@ -23,7 +23,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -23,7 +23,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size) { uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -47,6 +48,14 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -47,6 +48,14 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
built_options.emplace("-Dconv_2d_1x1=" + kernel_name); built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = '0';
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -84,6 +93,10 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -84,6 +93,10 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[1]);
...@@ -112,6 +125,13 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -112,6 +125,13 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0), Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3)); output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
} }
} // namespace kernels } // namespace kernels
......
...@@ -25,7 +25,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -25,7 +25,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size) { uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -44,6 +45,14 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -44,6 +45,14 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
built_options.emplace("-Dconv_2d_3x3=" + kernel_name); built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = '0';
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -79,6 +88,10 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -79,6 +88,10 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[1]);
...@@ -110,6 +123,13 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -110,6 +123,13 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0), Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3)); output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
} }
} // namespace kernels } // namespace kernels
......
...@@ -25,7 +25,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -25,7 +25,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size) { uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -44,6 +45,14 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -44,6 +45,14 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
built_options.emplace("-Dconv_2d=" + kernel_name); built_options.emplace("-Dconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = '0';
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -79,6 +88,10 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -79,6 +88,10 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[1]);
...@@ -112,6 +125,13 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -112,6 +125,13 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
Concat("conv2d_general_opencl_kernel_", activation, output->dim(0), Concat("conv2d_general_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3)); output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
} }
} // namespace kernels } // namespace kernels
......
...@@ -34,6 +34,14 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -34,6 +34,14 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(MakeString("-DCWISE_TYPE=", type_)); built_options.emplace(MakeString("-DCWISE_TYPE=", type_));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = '0';
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -44,6 +52,10 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -44,6 +52,10 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
...@@ -59,6 +71,13 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -59,6 +71,13 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
ss << "cwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) ss << "cwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3); << "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
} }
template struct CWiseFunctor<DeviceType::OPENCL, float>; template struct CWiseFunctor<DeviceType::OPENCL, float>;
......
...@@ -23,8 +23,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -23,8 +23,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
const char *kernel_name = nullptr; const char *kernel_name = nullptr;
index_t output_height, output_width, output_depth; index_t output_height, output_width, output_depth;
if (d2s_) { if (d2s_) { output_height = input_height * block_size_;
output_height = input_height * block_size_;
output_width = input_width * block_size_; output_width = input_width * block_size_;
output_depth = input_depth / (block_size_ * block_size_); output_depth = input_depth / (block_size_ * block_size_);
kernel_name = "depth_to_space"; kernel_name = "depth_to_space";
...@@ -55,6 +54,14 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -55,6 +54,14 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = '0';
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -84,19 +91,31 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -84,19 +91,31 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
} }
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
} }
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(block_size_)); if (d2s_) {
kernel_.setArg(idx++, static_cast<int32_t>(input_height)); kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
kernel_.setArg(idx++, static_cast<int32_t>(input_width)); kernel_.setArg(idx++, static_cast<int32_t>(input_height * batch));
kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks)); kernel_.setArg(idx++, static_cast<int32_t>(input_width));
kernel_.setArg(idx++, static_cast<int32_t>(output_height)); kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
kernel_.setArg(idx++, static_cast<int32_t>(output_width)); kernel_.setArg(idx++, static_cast<int32_t>(output_width));
kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks)); kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
} else {
kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
kernel_.setArg(idx++, static_cast<int32_t>(input_width));
kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
kernel_.setArg(idx++, static_cast<int32_t>(output_height * batch));
kernel_.setArg(idx++, static_cast<int32_t>(output_width));
kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
}
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape(); input_shape_ = input->shape();
...@@ -104,6 +123,13 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -104,6 +123,13 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1}; const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
} }
template struct DepthToSpaceOpFunctor<DeviceType::OPENCL, float>; template struct DepthToSpaceOpFunctor<DeviceType::OPENCL, float>;
......
...@@ -24,7 +24,8 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -24,7 +24,8 @@ void DepthwiseConv2d(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
uint32_t *kwg_size) { uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -52,6 +53,14 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -52,6 +53,14 @@ void DepthwiseConv2d(cl::Kernel *kernel,
} else { } else {
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
} }
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = '0';
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -97,6 +106,10 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -97,6 +106,10 @@ void DepthwiseConv2d(cl::Kernel *kernel,
input_channels); input_channels);
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[1]);
...@@ -130,6 +143,13 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -130,6 +143,13 @@ void DepthwiseConv2d(cl::Kernel *kernel,
std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation, std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation,
batch, height, width, channels, multiplier); batch, height, width, channels, multiplier);
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
} }
template <typename T> template <typename T>
...@@ -182,7 +202,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -182,7 +202,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_, dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future, DataTypeToEnum<T>::value, &input_shape_, output, future,
&kwg_size_); &kwg_size_, &kernel_error_);
} }
template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>; template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;
......
...@@ -37,6 +37,14 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -37,6 +37,14 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = '0';
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -48,6 +56,10 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -48,6 +56,10 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
} }
if (!IsVecEqual(input_shape_, input0->shape())) { if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
...@@ -68,6 +80,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -68,6 +80,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3); << "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
} }
template struct EltwiseFunctor<DeviceType::OPENCL, float>; template struct EltwiseFunctor<DeviceType::OPENCL, float>;
......
...@@ -19,7 +19,8 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -19,7 +19,8 @@ void FCWXKernel(cl::Kernel *kernel,
std::vector<uint32_t> *gws, std::vector<uint32_t> *gws,
std::vector<uint32_t> *lws, std::vector<uint32_t> *lws,
const float relux_max_limit, const float relux_max_limit,
StatsFuture *future) { StatsFuture *future,
std::unique_ptr<BufferBase> *kernel_error) {
MACE_CHECK(input->dim(3) % 4 == 0) MACE_CHECK(input->dim(3) % 4 == 0)
<< "FC width kernel only support input with 4x channel."; << "FC width kernel only support input with 4x channel.";
MACE_CHECK_NOTNULL(gws); MACE_CHECK_NOTNULL(gws);
...@@ -33,8 +34,7 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -33,8 +34,7 @@ void FCWXKernel(cl::Kernel *kernel,
std::set<std::string> built_options; std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
built_options.emplace("-Dfully_connected_width=" + kernel_name); built_options.emplace("-Dfully_connected_width=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
...@@ -62,6 +62,14 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -62,6 +62,14 @@ void FCWXKernel(cl::Kernel *kernel,
if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_QUALCOMM_ADRENO"); built_options.emplace("-DNON_QUALCOMM_ADRENO");
} }
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = '0';
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -95,6 +103,10 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -95,6 +103,10 @@ void FCWXKernel(cl::Kernel *kernel,
(*gws)[2] = static_cast<uint32_t>(batch * output_blocks); (*gws)[2] = static_cast<uint32_t>(batch * output_blocks);
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, (*gws)[0]); kernel->setArg(idx++, (*gws)[0]);
kernel->setArg(idx++, (*gws)[1]); kernel->setArg(idx++, (*gws)[1]);
...@@ -132,6 +144,12 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -132,6 +144,12 @@ void FCWXKernel(cl::Kernel *kernel,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event); cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event);
} }
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
if (future != nullptr) { if (future != nullptr) {
...@@ -155,7 +173,8 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -155,7 +173,8 @@ void FCWTXKernel(cl::Kernel *kernel,
std::vector<uint32_t> *gws, std::vector<uint32_t> *gws,
std::vector<uint32_t> *lws, std::vector<uint32_t> *lws,
const float relux_max_limit, const float relux_max_limit,
StatsFuture *future) { StatsFuture *future,
std::unique_ptr<BufferBase> *kernel_error) {
MACE_CHECK_NOTNULL(gws); MACE_CHECK_NOTNULL(gws);
MACE_CHECK_NOTNULL(lws); MACE_CHECK_NOTNULL(lws);
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
...@@ -169,6 +188,14 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -169,6 +188,14 @@ void FCWTXKernel(cl::Kernel *kernel,
if (bias != nullptr) { if (bias != nullptr) {
built_options.emplace("-DBIAS"); built_options.emplace("-DBIAS");
} }
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = '0';
(*kernel_error)->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -206,6 +233,10 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -206,6 +233,10 @@ void FCWTXKernel(cl::Kernel *kernel,
}; };
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel->setArg(idx++,
*(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, (*gws)[0]); kernel->setArg(idx++, (*gws)[0]);
kernel->setArg(idx++, (*gws)[1]); kernel->setArg(idx++, (*gws)[1]);
...@@ -229,6 +260,13 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -229,6 +260,13 @@ void FCWTXKernel(cl::Kernel *kernel,
ss << "fc_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" ss << "fc_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_"
<< output->dim(2) << "_" << output->dim(3); << output->dim(2) << "_" << output->dim(3);
TuningOrRun2DKernel(*kernel, ss.str(), gws->data(), *lws, future); TuningOrRun2DKernel(*kernel, ss.str(), gws->data(), *lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
} }
template <typename T> template <typename T>
...@@ -246,10 +284,12 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -246,10 +284,12 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
if (weight_type_ == BufferType::WEIGHT_HEIGHT) { if (weight_type_ == BufferType::WEIGHT_HEIGHT) {
FCWTXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output, FCWTXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
activation_, &gws_, &lws_, relux_max_limit_, future); activation_, &gws_, &lws_, relux_max_limit_, future,
&kernel_error_);
} else { } else {
FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output, FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
activation_, &gws_, &lws_, relux_max_limit_, future); activation_, &gws_, &lws_, relux_max_limit_, future,
&kernel_error_);
} }
} }
......
...@@ -40,6 +40,14 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A, ...@@ -40,6 +40,14 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
built_options.emplace("-Dmatmul=" + kernel_name); built_options.emplace("-Dmatmul=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = '0';
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -49,6 +57,10 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A, ...@@ -49,6 +57,10 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
...@@ -67,6 +79,13 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A, ...@@ -67,6 +79,13 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_" ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
<< C->dim(2) << "_" << C->dim(3); << C->dim(2) << "_" << C->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
} }
template struct MatMulFunctor<DeviceType::OPENCL, float>; template struct MatMulFunctor<DeviceType::OPENCL, float>;
......
...@@ -37,6 +37,14 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -37,6 +37,14 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (pooling_type_ == AVG) { if (pooling_type_ == AVG) {
built_options.emplace("-DPOOL_AVG"); built_options.emplace("-DPOOL_AVG");
} }
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = '0';
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -82,6 +90,10 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -82,6 +90,10 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}; };
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
...@@ -117,6 +129,13 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -117,6 +129,13 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3); << "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws.data(), lws, future); TuningOrRun3DKernel(kernel_, ss.str(), gws.data(), lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
} }
template struct PoolingFunctor<DeviceType::OPENCL, float>; template struct PoolingFunctor<DeviceType::OPENCL, float>;
......
...@@ -37,6 +37,14 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -37,6 +37,14 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = '0';
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -61,6 +69,10 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -61,6 +69,10 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
CalculateResizeScale(in_width, out_width, align_corners_); CalculateResizeScale(in_width, out_width, align_corners_);
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
...@@ -82,6 +94,13 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -82,6 +94,13 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_" ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
<< output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
} }
template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>; template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
......
...@@ -38,6 +38,14 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -38,6 +38,14 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" built_options.emplace("-DCMD_DATA_TYPE="
+ DtToCLCMDDt(DataTypeToEnum<T>::value)); + DtToCLCMDDt(DataTypeToEnum<T>::value));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = '0';
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -64,6 +72,10 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -64,6 +72,10 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
<< outputs_count; << outputs_count;
for (int i = 0; i < outputs_count; ++i) { for (int i = 0; i < outputs_count; ++i) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
...@@ -74,6 +86,12 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -74,6 +86,12 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, *(output_list[i]->opencl_image())); kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
} }
} }
......
...@@ -36,6 +36,14 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, ...@@ -36,6 +36,14 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = '0';
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -46,6 +54,10 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, ...@@ -46,6 +54,10 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
} }
if (!IsVecEqual(input_shape_, logits->shape())) { if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
...@@ -64,6 +76,13 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, ...@@ -64,6 +76,13 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3); << "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
} }
template struct SoftmaxFunctor<DeviceType::OPENCL, float>; template struct SoftmaxFunctor<DeviceType::OPENCL, float>;
......
...@@ -47,6 +47,14 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -47,6 +47,14 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value)); DtToCLCMDDt(DataTypeToEnum<T>::value));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = '0';
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -59,6 +67,10 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -59,6 +67,10 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
} }
if (!IsVecEqual(space_shape_, space_tensor->shape())) { if (!IsVecEqual(space_shape_, space_tensor->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
...@@ -89,6 +101,13 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -89,6 +101,13 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
<< batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_" << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_"
<< batch_tensor->dim(3); << batch_tensor->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
} }
template struct SpaceToBatchFunctor<DeviceType::OPENCL, float>; template struct SpaceToBatchFunctor<DeviceType::OPENCL, float>;
......
...@@ -26,6 +26,14 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -26,6 +26,14 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
DtToUpstreamCLDt(DataTypeToEnum<T>::value)); DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value)); DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = '0';
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -62,6 +70,10 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -62,6 +70,10 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
output_tensor->ResizeImage(output_shape, image_shape); output_tensor->ResizeImage(output_shape, image_shape);
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
...@@ -85,6 +97,13 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -85,6 +97,13 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
<< input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_" << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
<< input_tensor->dim(3); << input_tensor->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
} }
template <typename T> template <typename T>
...@@ -106,6 +125,14 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -106,6 +125,14 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
DtToUpstreamCLDt(DataTypeToEnum<T>::value)); DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value)); DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = '0';
kernel_error_->UnMap();
}
if (runtime->IsNonUniformWorkgroupsSupported()) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
...@@ -152,6 +179,10 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -152,6 +179,10 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
const uint32_t round_h = (height_ + 1) / 2; const uint32_t round_h = (height_ + 1) / 2;
const uint32_t round_w = (width_ + 1) / 2; const uint32_t round_w = (width_ + 1) / 2;
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
*(static_cast<cl::Buffer *>(kernel_error_->buffer())));
}
if (!runtime->IsNonUniformWorkgroupsSupported()) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
...@@ -181,6 +212,13 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -181,6 +212,13 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
<< input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_" << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
<< input_tensor->dim(3); << input_tensor->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
} }
template struct WinogradTransformFunctor<DeviceType::OPENCL, float>; template struct WinogradTransformFunctor<DeviceType::OPENCL, float>;
......
...@@ -198,6 +198,7 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase { ...@@ -198,6 +198,7 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -174,6 +174,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T> ...@@ -174,6 +174,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -62,6 +62,7 @@ struct SliceFunctor<DeviceType::OPENCL, T> { ...@@ -62,6 +62,7 @@ struct SliceFunctor<DeviceType::OPENCL, T> {
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -67,6 +67,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> { ...@@ -67,6 +67,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -57,6 +57,7 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase { ...@@ -57,6 +57,7 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> space_shape_; std::vector<index_t> space_shape_;
}; };
......
...@@ -52,6 +52,7 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T> ...@@ -52,6 +52,7 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
...@@ -110,6 +111,7 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T> ...@@ -110,6 +111,7 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -99,7 +99,8 @@ def main(unused_args): ...@@ -99,7 +99,8 @@ def main(unused_args):
args=FLAGS.args, args=FLAGS.args,
opencl_profiling=1, opencl_profiling=1,
vlog_level=0, vlog_level=0,
device_bin_path="/data/local/tmp/mace") device_bin_path="/data/local/tmp/mace",
out_of_range_check=1)
globals()[FLAGS.stdout_processor](stdouts, device_properties, target_abi) globals()[FLAGS.stdout_processor](stdouts, device_properties, target_abi)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -60,7 +60,8 @@ def adb_run(serialno, host_bin_path, bin_name, ...@@ -60,7 +60,8 @@ def adb_run(serialno, host_bin_path, bin_name,
args="", args="",
opencl_profiling=1, opencl_profiling=1,
vlog_level=0, vlog_level=0,
device_bin_path="/data/local/tmp/mace"): device_bin_path="/data/local/tmp/mace",
out_of_range_check=1):
host_bin_full_path = "%s/%s" % (host_bin_path, bin_name) host_bin_full_path = "%s/%s" % (host_bin_path, bin_name)
device_bin_full_path = "%s/%s" % (device_bin_path, bin_name) device_bin_full_path = "%s/%s" % (device_bin_path, bin_name)
device_cl_path = "%s/cl" % device_bin_path device_cl_path = "%s/cl" % device_bin_path
...@@ -77,8 +78,8 @@ def adb_run(serialno, host_bin_path, bin_name, ...@@ -77,8 +78,8 @@ def adb_run(serialno, host_bin_path, bin_name,
stdout_buff=[] stdout_buff=[]
process_output = make_output_processor(stdout_buff) process_output = make_output_processor(stdout_buff)
p = sh.adb("-s", serialno, "shell", p = sh.adb("-s", serialno, "shell",
"MACE_OPENCL_PROFILING=%d MACE_KERNEL_PATH=%s MACE_CPP_MIN_VLOG_LEVEL=%d %s %s" % "MACE_OUT_OF_RANGE_CHECK=%d MACE_OPENCL_PROFILING=%d MACE_KERNEL_PATH=%s MACE_CPP_MIN_VLOG_LEVEL=%d %s %s" %
(opencl_profiling, device_cl_path, vlog_level, device_bin_full_path, args), (out_of_range_check, opencl_profiling, device_cl_path, vlog_level, device_bin_full_path, args),
_out=process_output, _bg=True, _err_to_out=True) _out=process_output, _bg=True, _err_to_out=True)
p.wait() p.wait()
return "".join(stdout_buff) return "".join(stdout_buff)
......
...@@ -70,6 +70,7 @@ else ...@@ -70,6 +70,7 @@ else
ADB_CMD_STR="LD_LIBRARY_PATH=${PHONE_DATA_DIR} \ ADB_CMD_STR="LD_LIBRARY_PATH=${PHONE_DATA_DIR} \
MACE_TUNING=${tuning_flag} \ MACE_TUNING=${tuning_flag} \
MACE_OUT_OF_RANGE_CHECK="1" \
MACE_CPP_MIN_VLOG_LEVEL=$VLOG_LEVEL \ MACE_CPP_MIN_VLOG_LEVEL=$VLOG_LEVEL \
MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \ MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
MACE_KERNEL_PATH=$KERNEL_DIR \ MACE_KERNEL_PATH=$KERNEL_DIR \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册