Commit 33b049e6 authored by 刘琦

Merge branch 'compatible_with_opencl_1.1_and_1.2' into 'master'

Compatible with OpenCL 1.1 and 1.2

See merge request !333
@@ -142,17 +142,17 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
   }
 
   bool gpu_detected = false;
-  bool is_adreno_gpu = false;
   device_ = std::make_shared<cl::Device>();
   for (auto device : all_devices) {
     if (device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU) {
       *device_ = device;
       gpu_detected = true;
+
       const std::string device_name = device.getInfo<CL_DEVICE_NAME>();
-      constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
-      if (device_name == kQualcommAdrenoGPUStr) {
-        is_adreno_gpu = true;
-      }
+      gpu_type_ = ParseGPUTypeFromDeviceName(device_name);
+
+      const std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
+      opencl_version_ = device_version.substr(7, 3);
+
       VLOG(1) << "Using device: " << device_name;
       break;
@@ -171,7 +171,7 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
   }
 
   cl_int err;
-  if (is_adreno_gpu) {
+  if (gpu_type_ == GPUType::QUALCOMM_ADRENO) {
     std::vector<cl_context_properties> context_properties;
     context_properties.reserve(5);
     GetAdrenoContextProperties(&context_properties, gpu_perf_hint,
@@ -350,4 +350,30 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
   return size;
 }
 
+const bool OpenCLRuntime::IsNonUniformWorkgroupsSupported() {
+  if (gpu_type_ == GPUType::QUALCOMM_ADRENO &&
+      opencl_version_ == "2.0") {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName(
+    const std::string &device_name) {
+  constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
+  constexpr const char *kMaliGPUStr = "Mali";
+  constexpr const char *kPowerVRGPUStr = "PowerVR";
+
+  if (device_name == kQualcommAdrenoGPUStr) {
+    return GPUType::QUALCOMM_ADRENO;
+  } else if (device_name.find(kMaliGPUStr) != std::string::npos) {
+    return GPUType::MALI;
+  } else if (device_name.find(kPowerVRGPUStr) != std::string::npos) {
+    return GPUType::PowerVR;
+  } else {
+    return GPUType::UNKNOWN;
+  }
+}
+
 }  // namespace mace
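
For reference, the version check added above leans on the format of CL_DEVICE_VERSION, which the OpenCL spec defines as "OpenCL <major>.<minor> <vendor-specific information>". The call device_version.substr(7, 3) therefore skips the 7-character "OpenCL " prefix and keeps the 3-character "<major>.<minor>" field; this fixed-width parse assumes single-digit major and minor numbers, which holds for OpenCL 1.x and 2.x. A standalone C++ sketch (the sample vendor suffixes are illustrative, not copied from real drivers):

#include <cassert>
#include <string>

// Mirrors the substr(7, 3) parse above: "OpenCL 2.0 ..." -> "2.0".
std::string ParseOpenCLVersion(const std::string &device_version) {
  return device_version.substr(7, 3);
}

int main() {
  assert(ParseOpenCLVersion("OpenCL 2.0 QUALCOMM Adreno(TM)") == "2.0");
  assert(ParseOpenCLVersion("OpenCL 1.1 Mali") == "1.1");
  return 0;
}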
@@ -19,6 +19,13 @@
 
 namespace mace {
 
+enum GPUType {
+  QUALCOMM_ADRENO,
+  MALI,
+  PowerVR,
+  UNKNOWN,
+};
+
 class OpenCLProfilingTimer : public Timer {
  public:
  explicit OpenCLProfilingTimer(const cl::Event *event)
@@ -50,6 +57,8 @@ class OpenCLRuntime {
   uint64_t GetDeviceMaxWorkGroupSize();
   uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
   uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
+  const bool IsNonUniformWorkgroupsSupported();
+  const GPUType ParseGPUTypeFromDeviceName(const std::string &device_name);
   cl::Kernel BuildKernel(const std::string &program_name,
                          const std::string &kernel_name,
                          const std::set<std::string> &build_options);
@@ -75,6 +84,8 @@ class OpenCLRuntime {
   std::map<std::string, cl::Program> built_program_map_;
   std::mutex program_build_mutex_;
   std::string kernel_path_;
+  GPUType gpu_type_;
+  std::string opencl_version_;
 
   static GPUPerfHint gpu_perf_hint_;
   static GPUPriorityHint gpu_priority_hint_;
......
@@ -155,6 +155,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
   ActivationType activation_;
   T relux_max_limit_;
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::string tuning_key_prefix_;
   std::vector<index_t> input_shape_;
 };
......
@@ -90,6 +90,7 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
......
@@ -157,6 +157,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
                   Tensor *output,
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
......
@@ -64,6 +64,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
                   Tensor *output,
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
......
@@ -56,6 +56,7 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
   void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
 
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   const int groups_;
   std::vector<index_t> input_shape_;
 };
......
@@ -85,6 +85,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
                   Tensor *output,
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
......
@@ -401,6 +401,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
......
@@ -108,6 +108,7 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
   void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
 
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   const int block_size_;
   bool d2s_;
   std::vector<index_t> input_shape_;
......
@@ -437,6 +437,7 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
......
@@ -97,6 +97,7 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
......
@@ -241,6 +241,7 @@ struct MatMulFunctor<DeviceType::OPENCL, T> {
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
 };
 
 }  // namespace kernels
......
@@ -24,15 +24,18 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   const index_t channel_blocks = RoundUpDiv4(channels);
 
-  if (kernel_.get() == nullptr) {
-    auto runtime = OpenCLRuntime::Global();
+  auto runtime = OpenCLRuntime::Global();
+
+  if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
     built_options.emplace("-Dactivation=" + kernel_name);
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
     switch (activation_) {
       case RELU:
         tuning_key_prefix_ = "relu_opencl_kernel_";
@@ -58,10 +61,22 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
     kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
+
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+
   if (!IsVecEqual(input_shape_, input->shape())) {
     int idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     kernel_.setArg(idx++, *(input->opencl_image()));
     if (activation_ == PRELU) {
       MACE_CHECK_NOTNULL(alpha);
@@ -73,10 +88,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     input_shape_ = input->shape();
   }
 
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
   std::string tuning_key =
       Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
              output->dim(3));
......
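
A side note on the local work size change above: the hard-coded lws = {8, 16, 8, 1} becomes {8, kwg_size_ / 64, 8, 1} so that the work-group volume tracks the per-kernel limit reported by GetKernelMaxWorkGroupSize, since 8 * (kwg_size_ / 64) * 8 == kwg_size_ whenever kwg_size_ is a multiple of 64. A minimal check of that arithmetic:

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  // Typical CL_KERNEL_WORK_GROUP_SIZE values are powers of two >= 64.
  for (uint32_t kwg_size : {64u, 128u, 256u, 512u}) {
    assert(8u * (kwg_size / 64u) * 8u == kwg_size);
  }
  return 0;
}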
@@ -24,6 +24,8 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
   const index_t width = input_tensors[0]->dim(2);
   const index_t channels = input_tensors[0]->dim(3);
 
+  auto runtime = OpenCLRuntime::Global();
+
   for (int i = 1; i < size; ++i) {
     MACE_CHECK_NOTNULL(input_tensors[i]);
     MACE_CHECK(batch == input_tensors[i]->dim(0));
@@ -36,7 +38,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     if (input_tensors.size() > 4) {
       MACE_NOT_IMPLEMENTED;
     }
-    auto runtime = OpenCLRuntime::Global();
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
@@ -44,7 +45,14 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
     kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   std::vector<index_t> output_shape = input_tensors[0]->shape();
@@ -53,6 +61,9 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
   const index_t width_pixels = channel_blocks * width;
   const index_t batch_height_pixels = batch * height;
 
+  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
+                           static_cast<uint32_t>(batch_height_pixels)};
+
   if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
     std::vector<size_t> output_image_shape;
     CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
@@ -60,6 +71,10 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     output_tensor->ResizeImage(output_shape, output_image_shape);
 
     uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+    }
     for (auto input : input_tensors) {
      kernel_.setArg(idx++, *(input->opencl_image()));
     }
@@ -68,9 +83,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input_tensors[0]->shape();
   }
 
-  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
-                           static_cast<uint32_t>(batch_height_pixels)};
-  const std::vector<uint32_t> lws = {64, 16, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
   std::stringstream ss;
   ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
      << "_" << output_shape[2] << "_" << output_shape[3];
......
@@ -30,14 +30,23 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   const index_t channel_blocks = RoundUpDiv4(channels);
 
-  if (kernel_.get() == nullptr) {
-    auto runtime = OpenCLRuntime::Global();
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+
+  auto runtime = OpenCLRuntime::Global();
+
+  if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
     built_options.emplace("-Dbatch_norm=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
     if (folded_constant_) {
       built_options.emplace("-DFOLDED_CONSTANT");
     }
@@ -61,9 +70,17 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     }
     kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
+
   if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, *(scale->opencl_image()));
     kernel_.setArg(idx++, *(offset->opencl_image()));
@@ -78,10 +95,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     input_shape_ = input->shape();
   }
 
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
   std::string tuning_key =
       Concat("batch_norm_opencl_kernel_", activation_, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3), folded_constant_);
......
@@ -23,7 +23,12 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   const index_t channel_blocks = RoundUpDiv4(channels);
 
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+
   auto runtime = OpenCLRuntime::Global();
+
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
@@ -31,25 +36,46 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     built_options.emplace("-Dbias_add=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
     kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, *(bias->opencl_image()));
     kernel_.setArg(idx++, *(output->opencl_image()));
 
     input_shape_ = input->shape();
   }
 
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8};
+
   cl::Event event;
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+  cl_int error;
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+  } else {
+    std::vector<uint32_t> roundup_gws(lws.size());
+    for (size_t i = 0; i < lws.size(); ++i) {
+      roundup_gws[i] = RoundUp(gws[i], lws[i]);
+    }
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        kernel_, cl::NullRange,
+        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
+        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+  }
   MACE_CHECK(error == CL_SUCCESS);
+
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
......
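
The else branch above is the core of the OpenCL 1.1/1.2 fallback: before OpenCL 2.0's non-uniform work-groups, the global work size had to be a multiple of the local work size in every dimension, so the host pads each dimension up with RoundUp and the kernels discard the padding items (see the "#ifndef NON_UNIFORM_WORK_GROUP" guards in the .cl changes below). A minimal sketch of the padding arithmetic, assuming only RoundUp's evident contract (smallest multiple of b that is >= a); the sizes here are made up:

#include <cstdint>
#include <cstdio>
#include <vector>

uint32_t RoundUp(uint32_t a, uint32_t b) { return (a + b - 1) / b * b; }

int main() {
  const uint32_t gws[3] = {13, 57, 224};         // true number of work items
  const std::vector<uint32_t> lws = {8, 16, 8};  // local work-group size

  std::vector<uint32_t> roundup_gws(lws.size());
  for (size_t i = 0; i < lws.size(); ++i) {
    roundup_gws[i] = RoundUp(gws[i], lws[i]);    // -> 16, 64, 224
  }
  for (size_t i = 0; i < lws.size(); ++i) {
    std::printf("dim%zu: %u -> %u\n", i, gws[i], roundup_gws[i]);
  }
  return 0;
}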
@@ -26,7 +26,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
     buffer->Resize(image->shape());
   }
 
-  size_t gws[2] = {image_shape[0], image_shape[1]};
+  uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
+                     static_cast<uint32_t>(image_shape[1])};
   std::string kernel_name;
   switch (type) {
     case CONV2D_FILTER:
@@ -58,11 +59,17 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
                              : "winograd_filter_buffer_to_image";
       break;
   }
+
+  auto runtime = OpenCLRuntime::Global();
+
   std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
   std::set<std::string> built_options;
   std::stringstream kernel_name_ss;
   kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
   built_options.emplace(kernel_name_ss.str());
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
+    built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+  }
   if (buffer->dtype() == image->dtype()) {
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
@@ -73,11 +80,14 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
   }
-  auto runtime = OpenCLRuntime::Global();
+
   auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
                                          obfuscated_kernel_name, built_options);
+
   uint32_t idx = 0;
+  if (!runtime->IsNonUniformWorkgroupsSupported()) {
+    b2f_kernel.setArg(idx++, gws[0]);
+    b2f_kernel.setArg(idx++, gws[1]);
+  }
   b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
   if (!i2b_) {
     MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
@@ -103,13 +113,28 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
   }
   b2f_kernel.setArg(idx++, *(image->opencl_image()));
-  const std::vector<uint32_t> lws = {16, 64};
+
+  const uint32_t kwg_size =
+      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(b2f_kernel));
+  const std::vector<uint32_t> lws = {16, kwg_size / 16};
+
   cl::Event event;
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
-      cl::NDRange(lws[0], lws[1]), nullptr, &event);
-  MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  cl_int error;
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
+        cl::NDRange(lws[0], lws[1]), nullptr, &event);
+  } else {
+    std::vector<uint32_t> roundup_gws(lws.size());
+    for (size_t i = 0; i < lws.size(); ++i) {
+      roundup_gws[i] = RoundUp(gws[i], lws[i]);
+    }
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
+        cl::NDRange(lws[0], lws[1]), nullptr, &event);
+  }
+  MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
       event.wait();
......
@@ -30,20 +30,36 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
              "groups must be multiple of 4");
   const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
 
-  if (kernel_.get() == nullptr) {
-    auto runtime = OpenCLRuntime::Global();
+  const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+
+  auto runtime = OpenCLRuntime::Global();
+
+  if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
     built_options.emplace("-Dchannel_shuffle=" + kernel_name);
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
     kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name,
                                    built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
+
   if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, groups_);
     kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
@@ -51,10 +67,8 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input->shape();
   }
 
-  const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
   std::stringstream ss;
   ss << "channel_shuffle_opencl_kernel_"
      << output->dim(0) << "_"
......
 #include <common.h>
 
-__kernel void activation(__read_only image2d_t input,
+__kernel void activation(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input,
 #ifdef USE_PRELU
                          __read_only image2d_t alpha,
 #endif
@@ -9,7 +11,16 @@ __kernel void activation(__read_only image2d_t input,
   const int ch_blk = get_global_id(0);
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (ch_blk >= global_size_dim0 || w >= global_size_dim1
+      || hb >= global_size_dim2) {
+    return;
+  }
+  const int width = global_size_dim1;
+#else
   const int width = get_global_size(1);
+#endif
 
   const int pos = mad24(ch_blk, width, w);
   DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
......
 #include <common.h>
 
-__kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
+__kernel void addn(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                    __read_only image2d_t input1,
 #if INPUT_NUM > 2
                    __read_only image2d_t input2,
@@ -12,6 +14,10 @@ __kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
   const int w = get_global_id(0);
   const int hb = get_global_id(1);
 
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || hb >= global_size_dim1) return;
+#endif
+
   DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(w, hb));
   DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(w, hb));
   DATA_TYPE4 out = in0 + in1;
......
 #include <common.h>
 
 // Supported data types: half/float
-__kernel void batch_norm(__read_only image2d_t input,
+__kernel void batch_norm(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input,
                          __read_only image2d_t scale,
                          __read_only image2d_t offset,
 #ifndef FOLDED_CONSTANT
@@ -13,7 +15,16 @@ __kernel void batch_norm(__read_only image2d_t input,
   const int ch_blk = get_global_id(0);
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (ch_blk >= global_size_dim0 || w >= global_size_dim1
+      || hb >= global_size_dim2) {
+    return;
+  }
+  const int width = global_size_dim1;
+#else
   const int width = get_global_size(1);
+#endif
 
 #ifdef FOLDED_CONSTANT
   DATA_TYPE4 bn_scale = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0));
......
 #include <common.h>
 
 // Supported data types: half/float
-__kernel void bias_add(__read_only image2d_t input,
+__kernel void bias_add(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input,
                        __read_only image2d_t bias,
                        __write_only image2d_t output) {
   const int ch_blk = get_global_id(0);
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (ch_blk >= global_size_dim0 || w >= global_size_dim1
+      || hb >= global_size_dim2) {
+    return;
+  }
+  const int width = global_size_dim1;
+#else
   const int width = get_global_size(1);
+#endif
 
   const int pos = mad24(ch_blk, width, w);
   DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
......
 #include <common.h>
 
-__kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, oc, ic */
+__kernel void filter_buffer_to_image(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global const DATA_TYPE *input, /* h, w, oc, ic */
                                      __private const int input_offset,
                                      __private const int filter_h,
                                      __private const int filter_w,
@@ -9,6 +11,13 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
                                      __write_only image2d_t output) {
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int in_channel_idx = w;
   const int hw_size = filter_w * filter_h;
   const int out_channel_idx = h / hw_size * 4;
@@ -44,7 +53,9 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic */
+__kernel void filter_image_to_buffer(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global DATA_TYPE *output, /* h, w, oc, ic */
                                      __private const int filter_h,
                                      __private const int filter_w,
                                      __private const int out_channel,
@@ -52,6 +63,13 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
                                      __read_only image2d_t input) {
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int in_channel_idx = w;
   const int hw_size = filter_w * filter_h;
   const int out_channel_idx = h / hw_size * 4;
@@ -84,7 +102,9 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
   }
 }
 
-__kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */
+__kernel void dw_filter_buffer_to_image(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global const DATA_TYPE *input, /* h, w, ic, m */
                                         __private const int input_offset,
                                         __private const int filter_w,
                                         __private const int in_channel,
@@ -93,6 +113,12 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
   const int w = get_global_id(0);
   const int h = get_global_id(1);
 
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   DATA_TYPE4 values = 0;
   if (multiplier == 1) {
     const int in_channel_idx = h << 2;
@@ -134,7 +160,9 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
+__kernel void in_out_buffer_to_image(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global const DATA_TYPE *input, /* nhwc */
                                      __private const int input_offset,
                                      __private const int height,
                                      __private const int width,
@@ -142,6 +170,13 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
                                      __write_only image2d_t output) {
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int batch_idx = h / height;
   const int height_idx = h % height;
   const int width_idx = w % width;
@@ -167,13 +202,22 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
+__kernel void in_out_image_to_buffer(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global DATA_TYPE *output, /* nhwc */
                                      __private const int height,
                                      __private const int width,
                                      __private const int channels,
                                      __read_only image2d_t input) {
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int batch_idx = h / height;
   const int height_idx = h % height;
   const int width_idx = w % width;
@@ -198,13 +242,21 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
   }
 }
 
-__kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
+__kernel void arg_buffer_to_image(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global const DATA_TYPE *input, /* nhwc */
                                   __private const int input_offset,
                                   __private const int count,
                                   __write_only image2d_t output) {
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int offset = input_offset + w * 4;
   const int size = count - w * 4;
@@ -226,11 +278,20 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
+__kernel void arg_image_to_buffer(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global DATA_TYPE *output, /* nhwc */
                                   __private const int count,
                                   __read_only image2d_t input) {
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int offset = w * 4;
   int2 coord = (int2)(w, h);
@@ -251,7 +312,9 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
 }
 
-__kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //nhwc
+__kernel void in_out_height_buffer_to_image(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global const DATA_TYPE *input, //nhwc
                                             __private const int input_offset,
                                             __private const int height,
                                             __private const int width,
@@ -259,6 +322,13 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n
                                             __write_only image2d_t output) {
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int wc = width * channels;
   const int height_blks = (height + 3) / 4;
   const int batch_idx = h / height_blks;
@@ -285,13 +355,22 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
+__kernel void in_out_height_image_to_buffer(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global DATA_TYPE *output, //nhwc
                                             __private const int height,
                                             __private const int width,
                                             __private const int channels,
                                             __read_only image2d_t input) {
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int height_blks = (height + 3) / 4;
   const int batch_idx = h / height_blks;
   const int height_idx = (h % height_blks) << 2;
@@ -315,7 +394,9 @@ __kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
 }
 
-__kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
+__kernel void in_out_width_buffer_to_image(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global const DATA_TYPE *input, /* nhwc */
                                            __private const int input_offset,
                                            __private const int height,
                                            __private const int width,
@@ -323,6 +404,13 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
                                            __write_only image2d_t output) {
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int width_blks = (width + 3) / 4;
   const int batch_idx = h / height;
   const int height_idx = h % height;
@@ -349,7 +437,9 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
 }
 
 // only support 3x3 now
-__kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, //Oc, Ic, H, W
+__kernel void winograd_filter_buffer_to_image(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global const DATA_TYPE *input, //Oc, Ic, H, W
                                               __private const int input_offset,
                                               __private const int in_channels,
                                               __private const int height,
@@ -357,7 +447,16 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
                                               __write_only image2d_t output) {
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+  const int out_channels = global_size_dim1;
+#else
   const int out_channels = get_global_size(1);
+#endif
+
   const int out_channel_idx = h;
   const int in_channel_idx = w << 2;
   const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
@@ -430,13 +529,22 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
 }
 
 // only support 3x3 now
-__kernel void winograd_filter_image_to_buffer(__global DATA_TYPE *output, //Oc, Ic, H, W
+__kernel void winograd_filter_image_to_buffer(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global DATA_TYPE *output, //Oc, Ic, H, W
                                               __private const int height,
                                               __private const int width,
                                               __private const int channel,
                                               __read_only image2d_t input) {
   const int w = get_global_id(0);
   const int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int width_idx = w << 2;
   const int size = width - width_idx;
   int offset = h * width + width_idx;
......
 #include <common.h>
 
 // assume channes_per_group mod 4 = 0 && groups mod 4 == 0
-__kernel void channel_shuffle(__read_only image2d_t input,
+__kernel void channel_shuffle(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input,
                               __private const int groups,
                               __private const int channels_per_group,
                               __write_only image2d_t output) {
   const int group_chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
-  const int width = get_global_size(1);
   const int hb_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (group_chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
+      || hb_idx >= global_size_dim2) {
+    return;
+  }
+  const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
+
   const int group_blks = groups / 4;
   const int groups_blks_width = group_blks * width;
   const int channels_per_group_blks = channels_per_group / 4;
......
@@ -18,6 +18,23 @@
 #define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
 #define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
 
+#ifndef NON_UNIFORM_WORK_GROUP
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 \
+    __private const int global_size_dim0,       \
+    __private const int global_size_dim1,
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 \
+    __private const int global_size_dim0,       \
+    __private const int global_size_dim1,       \
+    __private const int global_size_dim2,
+#else
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+#endif
+
 __constant sampler_t SAMPLER =
     CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
......
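
The macros added to common.h let each kernel declare the true global sizes as extra leading arguments only in the padded (uniform work-group) build; note the trailing comma baked into each expansion, which keeps the signatures valid in both modes. Below is a host-side C++ mock of the same preprocessor pattern (illustrative only; the names mirror the .cl code, and mock_kernel stands in for a real kernel):

#include <cstdio>

// #define NON_UNIFORM_WORK_GROUP  // uncomment to emulate the OpenCL 2.0 path

#ifndef NON_UNIFORM_WORK_GROUP
#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 \
  const int global_size_dim0,                   \
  const int global_size_dim1,
#else
#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
#endif

void mock_kernel(UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
                 int w, int h) {
#ifndef NON_UNIFORM_WORK_GROUP
  // Padded launch: work items beyond the true global size bail out early.
  if (w >= global_size_dim0 || h >= global_size_dim1) {
    return;
  }
#endif
  std::printf("processing (%d, %d)\n", w, h);
}

int main() {
#ifndef NON_UNIFORM_WORK_GROUP
  mock_kernel(2, 2, 3, 0);  // (3, 0) lies in the padding and is skipped
  mock_kernel(4, 1, 3, 0);  // (3, 0) is real work and gets processed
#else
  mock_kernel(3, 0);
#endif
  return 0;
}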
@@ -22,14 +22,26 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
 }
 
 // Supported data type: half/float
-__kernel void concat_channel(__read_only image2d_t input0,
+__kernel void concat_channel(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input0,
                              __read_only image2d_t input1,
                              __private const int input0_chan,
                              __write_only image2d_t output) {
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
-  const int width = get_global_size(1);
   const int hb_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
+      || hb_idx >= global_size_dim2) {
+    return;
+  }
+  const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
+
   const int input0_chan_blk = (input0_chan + 3) >> 2;
 
   DATA_TYPE4 data = 0;
@@ -72,13 +84,25 @@ __kernel void concat_channel(__read_only image2d_t input0,
 }
 
 // Required: All input channels are divisible by 4
-__kernel void concat_channel_multi(__read_only image2d_t input,
+__kernel void concat_channel_multi(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input,
                                    __private const int chan_blk_offset,
                                    __write_only image2d_t output) {
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
-  const int width = get_global_size(1);
   const int hb_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
+      || hb_idx >= global_size_dim2) {
+    return;
+  }
+  const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
+
   DATA_TYPE4 data = 0;
   data = READ_IMAGET(input,
                      SAMPLER,
......
 #include <common.h>
 
-__kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void conv_2d(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                       __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
 #ifdef BIAS
                       __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -21,9 +23,18 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                       __private const int dilation_w) {
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
-  const int out_w_blks = get_global_size(1);
   const int out_hb = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
+      || out_hb >= global_size_dim2) {
+    return;
+  }
+  const int out_w_blks = global_size_dim1;
+#else
+  const int out_w_blks = get_global_size(1);
+#endif
+
 #ifdef BIAS
   DATA_TYPE4 out0 =
       READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
......
 #include <common.h>
 
-__kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void conv_2d_1x1(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
 #ifdef BIAS
                           __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -15,9 +17,18 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
                           __private const int stride) {
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
-  const int out_w_blks = get_global_size(1);
   const int out_hb = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
+      || out_hb >= global_size_dim2) {
+    return;
+  }
+  const int out_w_blks = global_size_dim1;
+#else
+  const int out_w_blks = get_global_size(1);
+#endif
+
 #ifdef BIAS
   DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
   DATA_TYPE4 out1 = out0;
......
#include <common.h> #include <common.h>
__kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __kernel void conv_2d_3x3(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */ __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
#ifdef BIAS #ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */ __read_only image2d_t bias, /* cout%4 * cout/4 */
...@@ -19,9 +21,18 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -19,9 +21,18 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
__private const int dilation_w) { __private const int dilation_w) {
const int out_ch_blk = get_global_id(0); const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1); const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1);
const int out_hb = get_global_id(2); const int out_hb = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
}
const int out_w_blks = global_size_dim1;
#else
const int out_w_blks = get_global_size(1);
#endif
#ifdef BIAS #ifdef BIAS
DATA_TYPE4 out0 = DATA_TYPE4 out0 =
READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0)); READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
......
#include <common.h> #include <common.h>
__kernel void depth_to_space(__read_only image2d_t input, __kernel void depth_to_space(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input,
__private const int block_size, __private const int block_size,
__private const int output_depth, __private const int output_depth,
__write_only image2d_t output) { __write_only image2d_t output) {
const int out_d = get_global_id(0); const int out_d = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_h = get_global_id(2); const int out_h = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_d >= global_size_dim0 || out_w >= global_size_dim1
|| out_h >= global_size_dim2) {
return;
}
const int output_width = global_size_dim1;
#else
const int output_width = get_global_size(1); const int output_width = get_global_size(1);
#endif
const int out_pos = mad24(out_d, output_width, out_w); const int out_pos = mad24(out_d, output_width, out_w);
...@@ -27,14 +38,27 @@ __kernel void depth_to_space(__read_only image2d_t input, ...@@ -27,14 +38,27 @@ __kernel void depth_to_space(__read_only image2d_t input,
WRITE_IMAGET(output, (int2)(out_pos, out_h), in_data); WRITE_IMAGET(output, (int2)(out_pos, out_h), in_data);
} }
__kernel void space_to_depth(__read_only image2d_t input, __kernel void space_to_depth(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input,
__private const int block_size, __private const int block_size,
__private const int input_depth, __private const int input_depth,
__write_only image2d_t output) { __write_only image2d_t output) {
const int d = get_global_id(0); const int d = get_global_id(0);
const int w = get_global_id(1); const int w = get_global_id(1);
const int h = get_global_id(2); const int h = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (d >= global_size_dim0 || w >= global_size_dim1
|| h >= global_size_dim2) {
return;
}
const int input_width = global_size_dim1;
#else
const int input_width = get_global_size(1); const int input_width = get_global_size(1);
#endif
const int in_pos = mad24(d, input_width, w); const int in_pos = mad24(d, input_width, w);
const int output_width = input_width / block_size; const int output_width = input_width / block_size;
......
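The size recovery is not just about thread counts: depth_to_space linearizes its output coordinate as out_pos = mad24(out_d, output_width, out_w), so a padded width would shift every depth block after the first. A small worked case, with invented numbers:

// True output_width = 30, dim-1 local size = 8:
//   padded global size = RoundUp(30, 8) = 32
//   correct: mad24(1, 30, 0) = 30  (first column of depth block 1)
//   padded : mad24(1, 32, 0) = 32  (two columns late, and the error
//            grows with out_d)
// Hence output_width must come from global_size_dim1 under a uniform
// launch, never from get_global_size(1).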
#include <common.h> #include <common.h>
// Only multiplier = 1 is supported // Only multiplier = 1 is supported
__kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __kernel void depthwise_conv2d(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
#ifdef BIAS #ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */ __read_only image2d_t bias, /* cout%4 * cout/4 */
...@@ -21,8 +23,18 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h ...@@ -21,8 +23,18 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
__private const short dilation_w) { __private const short dilation_w) {
const short out_ch_blk = get_global_id(0); const short out_ch_blk = get_global_id(0);
const short out_w_blk = get_global_id(1); const short out_w_blk = get_global_id(1);
const short out_w_blks = get_global_size(1);
const short out_hb = get_global_id(2); const short out_hb = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
}
const short out_w_blks = global_size_dim1;
#else
const short out_w_blks = get_global_size(1);
#endif
const short rounded_in_ch = in_ch_blks << 2; const short rounded_in_ch = in_ch_blks << 2;
const short in_ch_blk = out_ch_blk; // multiplier = 1 const short in_ch_blk = out_ch_blk; // multiplier = 1
...@@ -126,7 +138,9 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h ...@@ -126,7 +138,9 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3); WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
} }
__kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __kernel void depthwise_conv2d_s1(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
#ifdef BIAS #ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */ __read_only image2d_t bias, /* cout%4 * cout/4 */
...@@ -145,6 +159,14 @@ __kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4 ...@@ -145,6 +159,14 @@ __kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4
const short out_ch_blk = get_global_id(0); const short out_ch_blk = get_global_id(0);
const short out_w_blk = get_global_id(1) << 2; const short out_w_blk = get_global_id(1) << 2;
const short out_hb = get_global_id(2); const short out_hb = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_ch_blk >= global_size_dim0 || get_global_id(1) >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
}
#endif
const short rounded_in_ch = in_ch_blks << 2; const short rounded_in_ch = in_ch_blks << 2;
const short in_ch_blk = out_ch_blk; // multiplier = 1 const short in_ch_blk = out_ch_blk; // multiplier = 1
......
#include <common.h> #include <common.h>
__kernel void eltwise(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ __kernel void eltwise(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t input1, __read_only image2d_t input1,
#ifdef COEFF_SUM #ifdef COEFF_SUM
__private const float coeff0, __private const float coeff0,
...@@ -10,6 +12,10 @@ __kernel void eltwise(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ ...@@ -10,6 +12,10 @@ __kernel void eltwise(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
const int w = get_global_id(0); const int w = get_global_id(0);
const int hb = get_global_id(1); const int hb = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || hb >= global_size_dim1) return;
#endif
DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(w, hb)); DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(w, hb));
DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(w, hb)); DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(w, hb));
DATA_TYPE4 out; DATA_TYPE4 out;
......
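eltwise is the first kernel in this diff with a 2-D grid and uses the two-dimensional sibling of the helper macro. As with the DIM_3 form, the definition lives in common.h and is not shown here; this is a sketch of the presumed shape:

#ifdef NON_UNIFORM_WORK_GROUP
#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
#else
#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 \
    __private const int global_size_dim0,       \
    __private const int global_size_dim1,
#endif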
#include <common.h> #include <common.h>
// C = A * B // C = A * B
__kernel void matmul(__read_only image2d_t A, __kernel void matmul(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__read_only image2d_t A,
__read_only image2d_t B, __read_only image2d_t B,
__write_only image2d_t C, __write_only image2d_t C,
__private const int M, __private const int M,
...@@ -11,6 +13,11 @@ __kernel void matmul(__read_only image2d_t A, ...@@ -11,6 +13,11 @@ __kernel void matmul(__read_only image2d_t A,
__private const int k_blocks) { __private const int k_blocks) {
const int gx = get_global_id(0) << 2; const int gx = get_global_id(0) << 2;
const int hb = get_global_id(1); const int hb = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (get_global_id(0) >= global_size_dim0 || hb >= global_size_dim1) return;
#endif
const int batch = hb / height_blocks; const int batch = hb / height_blocks;
const int ty = (hb % height_blocks); const int ty = (hb % height_blocks);
const int gy = mad24(batch, height_blocks, ty); const int gy = mad24(batch, height_blocks, ty);
......
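matmul, like depthwise_conv2d_s1 earlier, scales the raw work-item id (gx = get_global_id(0) << 2), so its guard compares the unscaled id against the grid size. A minimal restatement of the invariant:

// global_size_dim0 counts 4-wide column blocks, so the bound check must
// use the raw id. Testing the shifted gx against the same limit would
// wrongly discard valid work-items, since gx grows four times faster
// than the block count.
const int id0 = get_global_id(0);
#ifndef NON_UNIFORM_WORK_GROUP
if (id0 >= global_size_dim0 || get_global_id(1) >= global_size_dim1) return;
#endif
const int gx = id0 << 2;  // first of the four columns this item handles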
...@@ -19,7 +19,9 @@ inline int calculate_avg_block_size(const int pool_size, ...@@ -19,7 +19,9 @@ inline int calculate_avg_block_size(const int pool_size,
} }
// Supported data type: half/float // Supported data type: half/float
__kernel void pooling(__read_only image2d_t input, __kernel void pooling(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input,
__private const int in_height, __private const int in_height,
__private const int in_width, __private const int in_width,
__private const int out_height, __private const int out_height,
...@@ -28,11 +30,21 @@ __kernel void pooling(__read_only image2d_t input, ...@@ -28,11 +30,21 @@ __kernel void pooling(__read_only image2d_t input,
__private const int stride, __private const int stride,
__private const int pooling_size, __private const int pooling_size,
__write_only image2d_t output) { __write_only image2d_t output) {
const int out_chan_idx = get_global_id(0); const int out_chan_idx = get_global_id(0);
const int out_width_idx = get_global_id(1); const int out_width_idx = get_global_id(1);
const int out_width = get_global_size(1);
const int out_hb_idx = get_global_id(2); const int out_hb_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_chan_idx >= global_size_dim0 || out_width_idx >= global_size_dim1
|| out_hb_idx >= global_size_dim2) {
return;
}
const int out_width = global_size_dim1;
#else
const int out_width = get_global_size(1);
#endif
const int batch_idx = mul24((out_hb_idx / out_height), in_height); const int batch_idx = mul24((out_hb_idx / out_height), in_height);
const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top; const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top;
const int in_width_start = mul24(out_width_idx, stride) - pad_left; const int in_width_start = mul24(out_width_idx, stride) - pad_left;
......
#include <common.h> #include <common.h>
__kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __kernel void resize_bilinear_nocache(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__write_only image2d_t output, __write_only image2d_t output,
__private const float height_scale, __private const float height_scale,
__private const float width_scale, __private const float width_scale,
__private const int in_height, __private const int in_height,
__private const int in_width, __private const int in_width,
__private const int out_height) { __private const int out_height) {
const int ch_blk = get_global_id(0); const int ch_blk = get_global_id(0);
const int ch_blks = get_global_size(0);
const int w = get_global_id(1); const int w = get_global_id(1);
const int out_width = get_global_size(1);
const int hb = get_global_id(2); const int hb = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (ch_blk >= global_size_dim0 || w >= global_size_dim1
|| hb >= global_size_dim2) {
return;
}
const int ch_blks = global_size_dim0;
const int out_width = global_size_dim1;
#else
const int ch_blks = get_global_size(0);
const int out_width = get_global_size(1);
#endif
const int b = hb / out_height; const int b = hb / out_height;
const int h = hb % out_height; const int h = hb % out_height;
......
#include <common.h> #include <common.h>
__kernel void slice(__read_only image2d_t input, __kernel void slice(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input,
__private const int chan_blk_offset, __private const int chan_blk_offset,
__write_only image2d_t output) { __write_only image2d_t output) {
const int chan_blk_idx = get_global_id(0); const int chan_blk_idx = get_global_id(0);
const int width_idx = get_global_id(1); const int width_idx = get_global_id(1);
const int width = get_global_size(1);
const int hb_idx = get_global_id(2); const int hb_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
|| hb_idx >= global_size_dim2) {
return;
}
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
#endif
DATA_TYPE4 data = READ_IMAGET(input, SAMPLER, DATA_TYPE4 data = READ_IMAGET(input, SAMPLER,
(int2)(mad24(chan_blk_idx + chan_blk_offset, (int2)(mad24(chan_blk_idx + chan_blk_offset,
width, width_idx), hb_idx)); width, width_idx), hb_idx));
......
#include <common.h> #include <common.h>
__kernel void softmax(__read_only image2d_t input, __kernel void softmax(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input,
__private const int channels, __private const int channels,
__private const int remain_channels, __private const int remain_channels,
__write_only image2d_t output) { __write_only image2d_t output) {
const int chan_blk_idx = get_global_id(0); const int chan_blk_idx = get_global_id(0);
const int width_idx = get_global_id(1); const int width_idx = get_global_id(1);
const int hb_idx = get_global_id(2); const int hb_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
|| hb_idx >= global_size_dim2) {
return;
}
const int chan_blks = global_size_dim0 - 1;
const int width = global_size_dim1;
#else
const int chan_blks = get_global_size(0) - 1; const int chan_blks = get_global_size(0) - 1;
const int width = get_global_size(1); const int width = get_global_size(1);
#endif
int pos = width_idx; int pos = width_idx;
DATA_TYPE max_value = -FLT_MAX; DATA_TYPE max_value = -FLT_MAX;
......
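softmax shows that the substitution stays mechanical even when the kernel does arithmetic on the grid size. The minus one is apparently because the dim-0 launch spans one extra block for the tail handled via remain_channels, so the full-block count is the grid size less one; the rewrite preserves that relationship on both paths:

// old: const int chan_blks = get_global_size(0) - 1;
// new: const int chan_blks = global_size_dim0 - 1;     (uniform path)
//      const int chan_blks = get_global_size(0) - 1;   (non-uniform path)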
#include <common.h> #include <common.h>
__kernel void space_to_batch(__read_only image2d_t space_data, __kernel void space_to_batch(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t space_data,
__write_only image2d_t batch_data, __write_only image2d_t batch_data,
__private const int block_height, __private const int block_height,
__private const int block_width, __private const int block_width,
...@@ -14,6 +16,13 @@ __kernel void space_to_batch(__read_only image2d_t space_data, ...@@ -14,6 +16,13 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
const int batch_w_idx = get_global_id(1); const int batch_w_idx = get_global_id(1);
const int batch_hb_idx = get_global_id(2); const int batch_hb_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
|| batch_hb_idx >= global_size_dim2) {
return;
}
#endif
const int batch_b_idx = batch_hb_idx / batch_height; const int batch_b_idx = batch_hb_idx / batch_height;
const int batch_h_idx = batch_hb_idx % batch_height; const int batch_h_idx = batch_hb_idx % batch_height;
...@@ -39,7 +48,9 @@ __kernel void space_to_batch(__read_only image2d_t space_data, ...@@ -39,7 +48,9 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
WRITE_IMAGET(batch_data, batch_coord, value); WRITE_IMAGET(batch_data, batch_coord, value);
} }
__kernel void batch_to_space(__read_only image2d_t batch_data, __kernel void batch_to_space(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t batch_data,
__write_only image2d_t space_data, __write_only image2d_t space_data,
__private const int block_height, __private const int block_height,
__private const int block_width, __private const int block_width,
...@@ -53,6 +64,13 @@ __kernel void batch_to_space(__read_only image2d_t batch_data, ...@@ -53,6 +64,13 @@ __kernel void batch_to_space(__read_only image2d_t batch_data,
const int batch_w_idx = get_global_id(1); const int batch_w_idx = get_global_id(1);
const int batch_hb_idx = get_global_id(2); const int batch_hb_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
|| batch_hb_idx >= global_size_dim2) {
return;
}
#endif
const int batch_b_idx = batch_hb_idx / batch_height; const int batch_b_idx = batch_hb_idx / batch_height;
const int batch_h_idx = batch_hb_idx % batch_height; const int batch_h_idx = batch_hb_idx % batch_height;
......
#include <common.h> #include <common.h>
__kernel void winograd_transform_2x2(__read_only image2d_t input, __kernel void winograd_transform_2x2(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const int in_height, __private const int in_height,
__private const int in_width, __private const int in_width,
...@@ -11,7 +13,15 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input, ...@@ -11,7 +13,15 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
__private const int padding_left) { __private const int padding_left) {
int out_width_idx = get_global_id(0); int out_width_idx = get_global_id(0);
int chan_blk_idx = get_global_id(1); int chan_blk_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) {
return;
}
const int chan_blk_size = global_size_dim1;
#else
const int chan_blk_size = get_global_size(1); const int chan_blk_size = get_global_size(1);
#endif
const int batch_idx = out_width_idx / round_hw; const int batch_idx = out_width_idx / round_hw;
const int t_idx = out_width_idx % round_hw; const int t_idx = out_width_idx % round_hw;
...@@ -106,7 +116,9 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input, ...@@ -106,7 +116,9 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
} }
} }
__kernel void winograd_inverse_transform_2x2(__read_only image2d_t input, __kernel void winograd_inverse_transform_2x2(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__read_only image2d_t input,
#ifdef BIAS #ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */ __read_only image2d_t bias, /* cout%4 * cout/4 */
#endif #endif
...@@ -118,7 +130,16 @@ __kernel void winograd_inverse_transform_2x2(__read_only image2d_t input, ...@@ -118,7 +130,16 @@ __kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
__private const float relux_max_limit) { __private const float relux_max_limit) {
const int width_idx = get_global_id(0); const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1); const int height_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
return;
}
const int out_channel = global_size_dim1;
#else
const int out_channel = get_global_size(1); const int out_channel = get_global_size(1);
#endif
int width = width_idx; int width = width_idx;
int height = height_idx; int height = height_idx;
......
...@@ -17,19 +17,28 @@ static void Concat2(cl::Kernel *kernel, ...@@ -17,19 +17,28 @@ static void Concat2(cl::Kernel *kernel,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future,
uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
const index_t channel = output->dim(3); const index_t channel = output->dim(3);
const int channel_blk = RoundUpDiv4(channel); const int channel_blk = RoundUpDiv4(channel);
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height),
};
if (kernel->get() == nullptr) {
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name); built_options.emplace("-Dconcat_channel=" + kernel_name);
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
if (input0->dtype() == output->dtype()) { if (input0->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
...@@ -41,9 +50,17 @@ static void Concat2(cl::Kernel *kernel, ...@@ -41,9 +50,17 @@ static void Concat2(cl::Kernel *kernel,
built_options.emplace("-DDIVISIBLE_FOUR"); built_options.emplace("-DDIVISIBLE_FOUR");
} }
*kernel = runtime->BuildKernel("concat", kernel_name, built_options); *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
} }
if (!IsVecEqual(*prev_input_shape, input0->shape())) { if (!IsVecEqual(*prev_input_shape, input0->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<const cl::Image2D *>(input0->opencl_image()))); *(static_cast<const cl::Image2D *>(input0->opencl_image())));
kernel->setArg(idx++, kernel->setArg(idx++,
...@@ -51,14 +68,11 @@ static void Concat2(cl::Kernel *kernel, ...@@ -51,14 +68,11 @@ static void Concat2(cl::Kernel *kernel,
kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3))); kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<cl::Image2D *>(output->opencl_image()))); *(static_cast<cl::Image2D *>(output->opencl_image())));
*prev_input_shape = input0->shape(); *prev_input_shape = input0->shape();
} }
const uint32_t gws[3] = { const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height),
};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3); << "_" << output->dim(2) << "_" << output->dim(3);
...@@ -69,38 +83,51 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -69,38 +83,51 @@ static void ConcatN(cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list, const std::vector<const Tensor *> &input_list,
const DataType dt, const DataType dt,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future,
uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
const index_t channel = output->dim(3); const index_t channel = output->dim(3);
if (kernel->get() == nullptr) {
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name); built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
*kernel = runtime->BuildKernel("concat", kernel_name, built_options); *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
} }
const int inputs_count = input_list.size(); const int inputs_count = input_list.size();
index_t chan_blk_offset = 0; index_t chan_blk_offset = 0;
for (int i = 0; i < inputs_count; ++i) { for (int i = 0; i < inputs_count; ++i) {
const Tensor *input = input_list[i]; const Tensor *input = input_list[i];
index_t input_channel_blk = input->dim(3) / 4;
const uint32_t gws[3] = {
static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height),
};
uint32_t idx = 0; uint32_t idx = 0;
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, static_cast<int32_t>(chan_blk_offset)); kernel->setArg(idx++, static_cast<int32_t>(chan_blk_offset));
kernel->setArg(idx++, *(output->opencl_image())); kernel->setArg(idx++, *(output->opencl_image()));
index_t input_channel_blk = input->dim(3) / 4;
chan_blk_offset += input_channel_blk; chan_blk_offset += input_channel_blk;
const uint32_t gws[3] = { const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height),
};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_" ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_"
<< batch * height; << batch * height;
...@@ -145,11 +172,12 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -145,11 +172,12 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
switch (inputs_count) { switch (inputs_count) {
case 2: case 2:
Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value, Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value,
&input_shape_, output, future); &input_shape_, output, future, &kwg_size_);
break; break;
default: default:
if (divisible_four) { if (divisible_four) {
ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future); ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future,
&kwg_size_);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
......
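Two host-side details in the Concat hunks recur across the remaining files. First, the kernel's max workgroup size is queried once, right after BuildKernel, and cached in the functor's kwg_size_ member, so repeat invocations skip the query. Second, the fixed local sizes give way to sizes derived from that limit: the old {8, 16, 8, 1} pins 8 * 16 * 8 = 1024 work-items per group, more than CL_KERNEL_WORK_GROUP_SIZE allows on many OpenCL 1.x mobile GPUs, while the derived form scales itself. The fourth element is not a workgroup dimension at all; TuningOrRun3DKernel (later in this diff) reads params[3] as its block count. A worked case, assuming a device limit of 256:

// kwg_size = GetKernelMaxWorkGroupSize(kernel);   // say 256
// lws      = {8, kwg_size / 64, 8, 1} = {8, 4, 8, 1}
// volume   = 8 * 4 * 8 = 256 == kwg_size
// (the old {8, 16, 8} volume of 1024 would be rejected with
//  CL_INVALID_WORK_GROUP_SIZE at enqueue time on such a device)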
...@@ -20,7 +20,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -20,7 +20,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future,
uint32_t *kwg_size);
extern void Conv2dOpenclK3x3(cl::Kernel *kernel, extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
...@@ -34,7 +35,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -34,7 +35,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future,
uint32_t *kwg_size);
extern void Conv2dOpencl(cl::Kernel *kernel, extern void Conv2dOpencl(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
...@@ -48,7 +50,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -48,7 +50,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future,
uint32_t *kwg_size);
template <typename T> template <typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...@@ -61,7 +64,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -61,7 +64,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *bias, const int stride, const int *padding, const Tensor *bias, const int stride, const int *padding,
const int *dilations, const ActivationType activation, const int *dilations, const ActivationType activation,
const float relux_max_limit, const DataType dt, const float relux_max_limit, const DataType dt,
std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future); std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
uint32_t *kwg_size);
// Selection matrix: kernel_size x stride_size // Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5] = { static const Conv2dOpenclFunction selector[5] = {
Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr}; Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
...@@ -101,11 +105,13 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -101,11 +105,13 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto conv2d_func = selector[kernel_h - 1]; auto conv2d_func = selector[kernel_h - 1];
conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_, dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future); DataTypeToEnum<T>::value, &input_shape_, output, future,
&kwg_size_);
} else { } else {
Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_, dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future); DataTypeToEnum<T>::value, &input_shape_, output, future,
&kwg_size_);
} }
} }
......
...@@ -22,7 +22,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -22,7 +22,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future,
uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -36,6 +37,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -36,6 +37,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const index_t width_blocks = RoundUpDiv4(width); const index_t width_blocks = RoundUpDiv4(width);
const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels);
auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
MACE_CHECK(input_batch == batch); MACE_CHECK(input_batch == batch);
...@@ -44,6 +47,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -44,6 +47,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
built_options.emplace("-Dconv_2d_1x1=" + kernel_name); built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
if (bias != nullptr) { if (bias != nullptr) {
built_options.emplace("-DBIAS"); built_options.emplace("-DBIAS");
} }
...@@ -66,11 +72,23 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -66,11 +72,23 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
LOG(FATAL) << "Unknown activation type: " << activation; LOG(FATAL) << "Unknown activation type: " << activation;
} }
auto runtime = OpenCLRuntime::Global();
*kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options); *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
...@@ -89,10 +107,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -89,10 +107,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 15, 8, 1};
std::string tuning_key = std::string tuning_key =
Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0), Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3)); output->dim(1), output->dim(2), output->dim(3));
......
...@@ -24,7 +24,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -24,7 +24,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future,
uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -35,12 +36,17 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -35,12 +36,17 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv<index_t, 5>(width); const index_t width_blocks = RoundUpDiv<index_t, 5>(width);
auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
built_options.emplace("-Dconv_2d_3x3=" + kernel_name); built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP:
...@@ -61,11 +67,23 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -61,11 +67,23 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
LOG(FATAL) << "Unknown activation type: " << activation; LOG(FATAL) << "Unknown activation type: " << activation;
} }
auto runtime = OpenCLRuntime::Global();
*kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options); *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
...@@ -87,10 +105,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -87,10 +105,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const std::vector<uint32_t> lws = {4, *kwg_size / 32, 8, 1};
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {4, 15, 8, 1};
std::string tuning_key = std::string tuning_key =
Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0), Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3)); output->dim(1), output->dim(2), output->dim(3));
......
...@@ -24,7 +24,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -24,7 +24,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future,
uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -35,12 +36,17 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -35,12 +36,17 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width); const index_t width_blocks = RoundUpDiv4(width);
auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
built_options.emplace("-Dconv_2d=" + kernel_name); built_options.emplace("-Dconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP:
...@@ -61,11 +67,23 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -61,11 +67,23 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
LOG(FATAL) << "Unknown activation type: " << activation; LOG(FATAL) << "Unknown activation type: " << activation;
} }
auto runtime = OpenCLRuntime::Global();
*kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options); *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
...@@ -89,10 +107,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -89,10 +107,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::string tuning_key = std::string tuning_key =
Concat("conv2d_general_opencl_kernel_", activation, output->dim(0), Concat("conv2d_general_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3)); output->dim(1), output->dim(2), output->dim(3));
......
...@@ -45,8 +45,9 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -45,8 +45,9 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
output->ResizeImage(output_shape, image_shape); output->ResizeImage(output_shape, image_shape);
if (kernel_.get() == nullptr) {
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
...@@ -55,38 +56,49 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -55,38 +56,49 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = kernel_ =
runtime->BuildKernel("depth_to_space", kernel_name, built_options); runtime->BuildKernel("depth_to_space", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
uint32_t gws[3];
std::stringstream ss;
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
if (d2s_) {
gws[0] = static_cast<uint32_t>(depth_blocks);
gws[1] = static_cast<uint32_t>(output_width);
gws[2] = static_cast<uint32_t>(output_height * batch);
ss << "depth_to_space_opencl_kernel_" << output->dim(0) << "_"
<< output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
} else {
gws[0] = static_cast<uint32_t>(depth_blocks);
gws[1] = static_cast<uint32_t>(input_width);
gws[2] = static_cast<uint32_t>(input_height * batch);
ss << "space_to_depth_opencl_kernel_" << input->dim(0) << "_"
<< input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3);
}
uint32_t idx = 0; uint32_t idx = 0;
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, block_size_); kernel_.setArg(idx++, block_size_);
kernel_.setArg(idx++, depth_blocks); kernel_.setArg(idx++, depth_blocks);
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
if (d2s_) { const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
const uint32_t gws[3] = {static_cast<uint32_t>(depth_blocks),
static_cast<uint32_t>(output_width),
static_cast<uint32_t>(output_height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss;
ss << "depth_to_space_opencl_kernel_" << output->dim(0) << "_"
<< output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
} else {
const uint32_t gws[3] = {static_cast<uint32_t>(depth_blocks),
static_cast<uint32_t>(input_width),
static_cast<uint32_t>(input_height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss;
ss << "space_to_depth_opencl_kernel_" << input->dim(0) << "_"
<< input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
}
} }
template struct DepthToSpaceOpFunctor<DeviceType::OPENCL, float>; template struct DepthToSpaceOpFunctor<DeviceType::OPENCL, float>;
......
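One caveat worth flagging in the hunk above: gws and the tuning-key stream ss are declared ahead of the IsVecEqual check but filled in only inside it, so, as excerpted here, a call that reuses a cached input shape would reach TuningOrRun3DKernel with indeterminate values. A safer arrangement (a sketch, not part of this merge request; the tuning key is simplified) computes them unconditionally:

// Compute the launch grid and tuning key on every call; only the kernel
// arguments need the shape-change fast path.
uint32_t gws[3] = {
    static_cast<uint32_t>(depth_blocks),
    static_cast<uint32_t>(d2s_ ? output_width : input_width),
    static_cast<uint32_t>((d2s_ ? output_height : input_height) * batch)};
std::stringstream ss;
ss << (d2s_ ? "depth_to_space" : "space_to_depth") << "_opencl_kernel_"
   << gws[0] << "_" << gws[1] << "_" << gws[2];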
...@@ -23,7 +23,8 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -23,7 +23,8 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future,
uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -35,8 +36,14 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -35,8 +36,14 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const index_t channel_blocks = RoundUpDiv4(channels); const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width); const index_t width_blocks = RoundUpDiv4(width);
if (kernel->get() == nullptr) {
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) { if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
...@@ -45,6 +52,9 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -45,6 +52,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
} else { } else {
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
} }
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
...@@ -70,6 +80,9 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -70,6 +80,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
*kernel = *kernel =
runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options); runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
} }
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
const index_t input_batch = input->dim(0); const index_t input_batch = input->dim(0);
...@@ -84,6 +97,11 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -84,6 +97,11 @@ void DepthwiseConv2d(cl::Kernel *kernel,
input_channels); input_channels);
uint32_t idx = 0; uint32_t idx = 0;
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
...@@ -104,13 +122,11 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -104,13 +122,11 @@ void DepthwiseConv2d(cl::Kernel *kernel,
kernel->setArg(idx++, static_cast<int16_t>(dilations[0])); kernel->setArg(idx++, static_cast<int16_t>(dilations[0]));
kernel->setArg(idx++, static_cast<int16_t>(dilations[1])); kernel->setArg(idx++, static_cast<int16_t>(dilations[1]));
} }
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation, std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation,
batch, height, width, channels, multiplier); batch, height, width, channels, multiplier);
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
...@@ -165,7 +181,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -165,7 +181,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_, dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future); DataTypeToEnum<T>::value, &input_shape_, output, future,
&kwg_size_);
} }
template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>; template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;
......
...@@ -24,8 +24,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -24,8 +24,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
const index_t width_pixels = channel_blocks * width; const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height; const index_t batch_height_pixels = batch * height;
if (kernel_.get() == nullptr) { const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)};
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
...@@ -33,11 +37,21 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -33,11 +37,21 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options); kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
if (!IsVecEqual(input_shape_, input0->shape())) { if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
}
kernel_.setArg(idx++, *(input0->opencl_image())); kernel_.setArg(idx++, *(input0->opencl_image()));
kernel_.setArg(idx++, *(input1->opencl_image())); kernel_.setArg(idx++, *(input1->opencl_image()));
if (!coeff_.empty()) { if (!coeff_.empty()) {
...@@ -45,12 +59,11 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -45,12 +59,11 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_.setArg(idx++, coeff_[1]); kernel_.setArg(idx++, coeff_[1]);
} }
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape(); input_shape_ = input0->shape();
} }
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels), const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
static_cast<uint32_t>(batch_height_pixels)};
const std::vector<uint32_t> lws = {64, 16, 1};
std::stringstream ss; std::stringstream ss;
ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3); << "_" << output->dim(2) << "_" << output->dim(3);
......
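The 2-D eltwise path applies the same workgroup budget: the old {64, 16, 1} fixed 64 * 16 = 1024 work-items per group regardless of device, while {kwg_size_ / 16, 16, 1} tracks the queried limit. For example:

// kwg_size_ = 256 -> lws = {16, 16, 1} -> 16 * 16 = 256 <= limit
// kwg_size_ = 128 -> lws = { 8, 16, 1} ->  8 * 16 = 128 <= limit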
...@@ -200,6 +200,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
                          const std::vector<uint32_t> &lws,
                          StatsFuture *future) {
   auto runtime = OpenCLRuntime::Global();
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     const uint32_t kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
...@@ -226,12 +227,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         {4, kwg_size / 28, 7, 1},
         {4, kwg_size / 32, 8, 1},
         {4, kwg_size / 56, 14, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
         {1, kwg_size, 1, 1},
-        {4, 15, 8, 1},
     };
   };
   cl::Event event;
...@@ -240,6 +236,13 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         MACE_CHECK(params.size() == 4)
             << "Tuning parameters of 3D kernel must be 4D";
         cl_int error = CL_SUCCESS;
+        std::vector<uint32_t> roundup_gws(3);
+        if (!runtime->IsNonUniformWorkgroupsSupported()) {
+          for (size_t i = 0; i < 3; ++i) {
+            roundup_gws[i] = RoundUp(gws[i], params[i]);
+          }
+        }
         if (timer == nullptr) {
           uint32_t num_blocks = params[3];
           const uint32_t block_size = gws[2] / num_blocks;
...@@ -247,17 +250,32 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
           for (uint32_t i = 0; i < num_blocks; ++i) {
             uint32_t gws2 =
                 (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+            if (runtime->IsNonUniformWorkgroupsSupported()) {
              error = runtime->command_queue().enqueueNDRangeKernel(
                  kernel, cl::NDRange(0, 0, i * block_size),
                  cl::NDRange(gws[0], gws[1], gws2),
                  cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+            } else {
+              uint32_t roundup_gws2 = RoundUp(gws2, params[2]);
+              error = runtime->command_queue().enqueueNDRangeKernel(
+                  kernel, cl::NDRange(0, 0, i * block_size),
+                  cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2),
+                  cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+            }
             MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
           }
         } else {
           timer->ClearTiming();
+          if (runtime->IsNonUniformWorkgroupsSupported()) {
            error = runtime->command_queue().enqueueNDRangeKernel(
                kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
                cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+          } else {
+            error = runtime->command_queue().enqueueNDRangeKernel(
+                kernel, cl::NullRange,
+                cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
+                cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+          }
           MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
           timer->AccumulateTiming();
           tuning_result->assign(params.begin(), params.end());
...@@ -273,10 +291,18 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
           for (uint32_t i = 0; i < num_blocks; ++i) {
             uint32_t gws2 =
                 (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+            if (runtime->IsNonUniformWorkgroupsSupported()) {
              error = runtime->command_queue().enqueueNDRangeKernel(
                  kernel, cl::NDRange(0, 0, i * block_size),
                  cl::NDRange(gws[0], gws[1], gws2),
                  cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+            } else {
+              uint32_t roundup_gws2 = RoundUp(gws2, params[2]);
+              error = runtime->command_queue().enqueueNDRangeKernel(
+                  kernel, cl::NDRange(0, 0, i * block_size),
+                  cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2),
+                  cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+            }
             MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
             timer->AccumulateTiming();
           }
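
The roundup_gws values computed above keep every global dimension a multiple of the corresponding local size, which OpenCL 1.1/1.2 require of uniform workgroups. A self-contained sketch of a RoundUp with the semantics the call sites above rely on (the real helper lives elsewhere in the tree):

#include <cassert>
#include <cstdint>

// Pad x up to the next multiple of m (m > 0): RoundUp(63, 4) == 64.
template <typename T>
T RoundUp(T x, T m) {
  return (x + m - 1) / m * m;
}

int main() {
  // gws = {63, 17, 5} with params (lws) = {4, 16, 8} pads to {64, 32, 8}.
  assert(RoundUp<uint32_t>(63, 4) == 64);
  assert(RoundUp<uint32_t>(17, 16) == 32);
  assert(RoundUp<uint32_t>(5, 8) == 8);
  return 0;
}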
...@@ -304,6 +330,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
                          const std::vector<uint32_t> &lws,
                          StatsFuture *future) {
   auto runtime = OpenCLRuntime::Global();
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     const uint32_t kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
...@@ -318,7 +345,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
             {kwg_size / 64, 64, 1},
             {kwg_size / 128, 128, 1},
             {kwg_size / 256, 256, 1},
-            {kwg_size / 512, 512, 1},
             {kwg_size, 1, 1},
             {1, kwg_size, 1}};
   };
...@@ -328,6 +354,13 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
         MACE_CHECK(params.size() == 3)
             << "Tuning parameters of 2D kernel must be 3d";
         cl_int error = CL_SUCCESS;
+        std::vector<uint32_t> roundup_gws(2);
+        if (!runtime->IsNonUniformWorkgroupsSupported()) {
+          for (size_t i = 0; i < 2; ++i) {
+            roundup_gws[i] = RoundUp(gws[i], params[i]);
+          }
+        }
         if (timer == nullptr) {
           uint32_t num_blocks = params[2];
           const uint32_t block_size = gws[1] / num_blocks;
...@@ -335,16 +368,30 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
           for (uint32_t i = 0; i < num_blocks; ++i) {
             uint32_t gws1 =
                 (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
+            if (runtime->IsNonUniformWorkgroupsSupported()) {
              error = runtime->command_queue().enqueueNDRangeKernel(
                  kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
                  cl::NDRange(params[0], params[1]), nullptr, &event);
+            } else {
+              uint32_t roundup_gws1 = RoundUp(gws1, params[1]);
+              error = runtime->command_queue().enqueueNDRangeKernel(
+                  kernel, cl::NDRange(0, i * block_size),
+                  cl::NDRange(roundup_gws[0], roundup_gws1),
+                  cl::NDRange(params[0], params[1]), nullptr, &event);
+            }
             MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
           }
         } else {
           timer->ClearTiming();
+          if (runtime->IsNonUniformWorkgroupsSupported()) {
            error = runtime->command_queue().enqueueNDRangeKernel(
                kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
                cl::NDRange(params[0], params[1]), nullptr, &event);
+          } else {
+            error = runtime->command_queue().enqueueNDRangeKernel(
+                kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
+                cl::NDRange(params[0], params[1]), nullptr, &event);
+          }
           MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
           timer->AccumulateTiming();
           tuning_result->assign(params.begin(), params.end());
...@@ -360,9 +407,18 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
           for (uint32_t i = 0; i < num_blocks; ++i) {
             uint32_t gws1 =
                 (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
+            if (runtime->IsNonUniformWorkgroupsSupported()) {
              error = runtime->command_queue().enqueueNDRangeKernel(
                  kernel, cl::NDRange(0, i * block_size),
                  cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]),
                  nullptr, &event);
+            } else {
+              uint32_t roundup_gws1 = RoundUp(gws1, params[1]);
+              error = runtime->command_queue().enqueueNDRangeKernel(
+                  kernel, cl::NDRange(0, i * block_size),
+                  cl::NDRange(roundup_gws[0], roundup_gws1),
+                  cl::NDRange(params[0], params[1]), nullptr, &event);
+            }
             MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
             timer->AccumulateTiming();
           }
......
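
The 2D and 3D tuning paths repeat the same dispatch branch. A hypothetical helper (not part of this change) that captures the rule, assuming the usual OpenCL C++ wrapper header:

#include <CL/cl.hpp>  // assumed wrapper; the runtime's own header would also do
#include <cstdint>

inline uint32_t RoundUp(uint32_t x, uint32_t m) { return (x + m - 1) / m * m; }

// With non-uniform workgroups the true global size is enqueued directly;
// otherwise each dimension is padded to a multiple of the local size and the
// kernel (built without -DNON_UNIFORM_WORK_GROUP) bounds-checks the padding.
cl::NDRange EffectiveGlobalRange(bool non_uniform_supported,
                                 uint32_t gws0, uint32_t gws1,
                                 uint32_t lws0, uint32_t lws1) {
  if (non_uniform_supported) {
    return cl::NDRange(gws0, gws1);
  }
  return cl::NDRange(RoundUp(gws0, lws0), RoundUp(gws1, lws1));
}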
...@@ -26,18 +26,33 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
   const index_t height_blocks = RoundUpDiv4(height);
   const index_t width_blocks = RoundUpDiv4(width);
+  const uint32_t gws[2] = {
+      static_cast<uint32_t>(width_blocks),
+      static_cast<uint32_t>(height_blocks * batch),
+  };
-  if (kernel_.get() == nullptr) {
   auto runtime = OpenCLRuntime::Global();
+  if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
     built_options.emplace("-Dmatmul=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
     kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   uint32_t idx = 0;
+  if (!runtime->IsNonUniformWorkgroupsSupported()) {
+    kernel_.setArg(idx++, gws[0]);
+    kernel_.setArg(idx++, gws[1]);
+  }
   kernel_.setArg(idx++, *(A->opencl_image()));
   kernel_.setArg(idx++, *(B->opencl_image()));
   kernel_.setArg(idx++, *(C->opencl_image()));
...@@ -47,11 +62,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
   kernel_.setArg(idx++, static_cast<int>(height_blocks));
   kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
-  const uint32_t gws[2] = {
-      static_cast<uint32_t>(width_blocks),
-      static_cast<uint32_t>(height_blocks * batch),
-  };
-  const std::vector<uint32_t> lws = {16, 64, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 1};
   std::stringstream ss;
   ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
      << C->dim(2) << "_" << C->dim(3);
......
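
The hard-coded {16, 64, 1} local size is replaced by one derived from the queried per-kernel limit, so the workgroup always fits the device. A quick self-contained check of the arithmetic, assuming a limit of 512 purely for illustration:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint32_t kwg_size = 512;  // assumed GetKernelMaxWorkGroupSize result
  const std::vector<uint32_t> lws = {kwg_size / 64, 64, 1};  // {8, 64, 1}
  assert(lws[0] * lws[1] * lws[2] <= kwg_size);  // 8 * 64 * 1 = 512 <= 512
  return 0;
}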
...@@ -18,12 +18,14 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
       << "Pooling opencl kernel not support dilation yet";
+  auto runtime = OpenCLRuntime::Global();
   if (kernel_.get() == nullptr) {
     const DataType dt = DataTypeToEnum<T>::value;
-    auto runtime = OpenCLRuntime::Global();
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
     built_options.emplace("-Dpooling=" + kernel_name);
     if (pooling_type_ == MAX && input->dtype() == output->dtype()) {
       built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
       built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
...@@ -35,8 +37,16 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     if (pooling_type_ == AVG) {
       built_options.emplace("-DPOOL_AVG");
     }
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
     kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
+  std::vector<uint32_t> gws;
   if (!IsVecEqual(input_shape_, input->shape())) {
     std::vector<index_t> output_shape(4);
     std::vector<index_t> filter_shape = {kernels_[0], kernels_[1],
...@@ -59,7 +69,24 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                      &output_image_shape);
     output->ResizeImage(output_shape, output_image_shape);
+    index_t batch = output->dim(0);
+    index_t out_height = output->dim(1);
+    index_t out_width = output->dim(2);
+    index_t channels = output->dim(3);
+    index_t channel_blocks = (channels + 3) / 4;
+    gws = {
+        static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
+        static_cast<uint32_t>(batch * out_height),
+    };
     uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
     kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
...@@ -71,8 +98,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     kernel_.setArg(idx++, *(output->opencl_image()));
     input_shape_ = input->shape();
-  }
+  } else {
     index_t batch = output->dim(0);
     index_t out_height = output->dim(1);
     index_t out_width = output->dim(2);
...@@ -80,16 +106,17 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     index_t channel_blocks = (channels + 3) / 4;
-    const uint32_t gws[3] = {
+    gws = {
         static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
         static_cast<uint32_t>(batch * out_height),
     };
-  std::vector<uint32_t> lws = {8, 16, 8, 1};
+  }
+  std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
   std::stringstream ss;
   ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
-  TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
+  TuningOrRun3DKernel(kernel_, ss.str(), gws.data(), lws, future);
 }
 template struct PoolingFunctor<DeviceType::OPENCL, float>;
......
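
Worked example of the pooling global work size above, for a hypothetical 1x56x56x32 output: the image layout packs 4 channels per pixel, so channel_blocks = (32 + 3) / 4 = 8 and gws = {8, 56, 56}.

#include <cassert>
#include <cstdint>

int main() {
  const int64_t batch = 1, out_height = 56, out_width = 56, channels = 32;
  const int64_t channel_blocks = (channels + 3) / 4;  // round up to blocks of 4
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(out_width),
                           static_cast<uint32_t>(batch * out_height)};
  assert(gws[0] == 8 && gws[1] == 56 && gws[2] == 56);
  return 0;
}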
...@@ -24,16 +24,27 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
   const index_t out_height = out_height_;
   const index_t out_width = out_width_;
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(out_width),
+                           static_cast<uint32_t>(out_height * batch)};
-  if (kernel_.get() == nullptr) {
   auto runtime = OpenCLRuntime::Global();
+  if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
     built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
     kernel_ =
         runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   if (!IsVecEqual(input_shape_, input->shape())) {
     MACE_CHECK(out_height > 0 && out_width > 0);
...@@ -50,6 +61,11 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
         CalculateResizeScale(in_width, out_width, align_corners_);
     uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, *(output->opencl_image()));
     kernel_.setArg(idx++, height_scale);
...@@ -61,10 +77,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input->shape();
   }
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(out_width),
-                           static_cast<uint32_t>(out_height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
   std::stringstream ss;
   ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
      << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
......
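
CalculateResizeScale itself is not shown in this diff; a sketch of the conventional bilinear scale it presumably computes (the align_corners branch maps the corner pixels of input and output onto each other):

// Assumption: standard TensorFlow-style resize scale; verify against the
// actual helper in the tree before relying on this.
float CalculateResizeScaleSketch(int64_t in_size, int64_t out_size,
                                 bool align_corners) {
  if (align_corners && out_size > 1) {
    return static_cast<float>(in_size - 1) / (out_size - 1);
  }
  return static_cast<float>(in_size) / out_size;
}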
...@@ -29,15 +29,22 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
     output_list[i]->ResizeImage(output_shape, image_shape);
   }
-  if (kernel_.get() == nullptr) {
   auto runtime = OpenCLRuntime::Global();
+  if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice");
     built_options.emplace("-Dslice=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE="
                           + DtToCLCMDDt(DataTypeToEnum<T>::value));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
     kernel_ = runtime->BuildKernel("slice", kernel_name, built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   const index_t channel_blk = RoundUpDiv4(output_channels);
...@@ -46,7 +53,8 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
       static_cast<uint32_t>(input->dim(2)),
       static_cast<uint32_t>(input->dim(0) * input->dim(1)),
   };
-  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
   std::stringstream ss;
   ss << "slice_opencl_kernel_"
      << input->dim(0) << "_"
...@@ -56,6 +64,11 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
      << outputs_count;
   for (int i = 0; i < outputs_count; ++i) {
     uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
     kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
......
...@@ -23,29 +23,43 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
   const index_t channel_blocks = RoundUpDiv4(channels);
   const int remain_channels = channel_blocks * 4 - channels;
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
-  if (kernel_.get() == nullptr) {
   auto runtime = OpenCLRuntime::Global();
+  if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
     built_options.emplace("-Dsoftmax=" + kernel_name);
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
     kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   if (!IsVecEqual(input_shape_, logits->shape())) {
     uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     kernel_.setArg(idx++, *(logits->opencl_image()));
     kernel_.setArg(idx++, static_cast<int>(channels));
     kernel_.setArg(idx++, remain_channels);
     kernel_.setArg(idx++, *(output->opencl_image()));
     input_shape_ = logits->shape();
   }
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
   std::stringstream ss;
   ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
......
...@@ -31,9 +31,15 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
     batch_tensor->ResizeImage(output_shape, output_image_shape);
     kernel_name = "space_to_batch";
   }
+  const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
+  const uint32_t gws[3] = {
+      chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
+      static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
+  auto runtime = OpenCLRuntime::Global();
   if (kernel_.get() == nullptr) {
     std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
-    auto runtime = OpenCLRuntime::Global();
     std::set<std::string> built_options;
     std::stringstream kernel_name_ss;
     kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
...@@ -41,11 +47,22 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToCLCMDDt(DataTypeToEnum<T>::value));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
     kernel_ =
         runtime->BuildKernel("space_to_batch", kernel_name, built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   if (!IsVecEqual(space_shape_, space_tensor->shape())) {
     uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     if (b2s_) {
       kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
       kernel_.setArg(idx++, *(space_tensor->opencl_image()));
...@@ -65,11 +82,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
     space_shape_ = space_tensor->shape();
   }
-  const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
-  const uint32_t gws[3] = {
-      chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
-      static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
-  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
   std::stringstream ss;
   ss << kernel_name << "_" << batch_tensor->dim(0) << "_"
      << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_"
......
...@@ -15,6 +15,8 @@ template <typename T>
 void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
     const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
+  auto runtime = OpenCLRuntime::Global();
   if (kernel_.get() == nullptr) {
     std::string obfuscated_kernel_name =
         MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
...@@ -24,9 +26,14 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
                           DtToUpstreamCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
-    auto runtime = OpenCLRuntime::Global();
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
     kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
                                    built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   std::vector<index_t> output_shape(4);
   std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
...@@ -44,6 +51,9 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
   const index_t round_h = (output_shape[1] + 1) / 2;
   const index_t round_w = (output_shape[2] + 1) / 2;
   const index_t out_width = input_tensor->dim(0) * round_h * round_w;
+  const uint32_t gws[2] = {
+      static_cast<uint32_t>(out_width),
+      static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};
   if (!IsVecEqual(input_shape_, input_tensor->shape())) {
     output_shape = {16, input_tensor->dim(3), out_width, 1};
...@@ -52,6 +62,10 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
     output_tensor->ResizeImage(output_shape, image_shape);
     uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+    }
     kernel_.setArg(idx++, *(input_tensor->opencl_image()));
     kernel_.setArg(idx++, *(output_tensor->opencl_image()));
     kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(1)));
...@@ -65,10 +79,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input_tensor->shape();
   }
-  const uint32_t gws[2] = {
-      static_cast<uint32_t>(out_width),
-      static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};
-  const std::vector<uint32_t> lws = {128, 8, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
   std::stringstream ss;
   ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_"
      << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
...@@ -82,6 +93,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
     const Tensor *bias,
     Tensor *output_tensor,
     StatsFuture *future) {
+  auto runtime = OpenCLRuntime::Global();
   if (kernel_.get() == nullptr) {
     std::string obfuscated_kernel_name =
         MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
...@@ -92,6 +106,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
                           DtToUpstreamCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
     built_options.emplace(bias != nullptr ? "-DBIAS" : "");
     switch (activation_) {
       case NOOP:
...@@ -115,10 +132,16 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
-    auto runtime = OpenCLRuntime::Global();
     kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
                                    built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
+  const uint32_t gws[2] = {
+      static_cast<uint32_t>(input_tensor->dim(2)),
+      static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))};
   if (!IsVecEqual(input_shape_, input_tensor->shape())) {
     std::vector<index_t> output_shape = {batch_, height_, width_,
                                          input_tensor->dim(1)};
...@@ -129,6 +152,10 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
     const uint32_t round_h = (height_ + 1) / 2;
     const uint32_t round_w = (width_ + 1) / 2;
     uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+    }
     kernel_.setArg(
         idx++,
         *(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
...@@ -147,10 +174,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input_tensor->shape();
   }
-  const uint32_t gws[2] = {
-      static_cast<uint32_t>(input_tensor->dim(2)),
-      static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))};
-  const std::vector<uint32_t> lws = {128, 8, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
   std::stringstream ss;
   ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_"
......
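
Worked example of the Winograd F(2x2, 3x3) tiling used above: each tile yields a 2x2 output patch, so a hypothetical 1x7x7 spatial output needs (7 + 1) / 2 = 4 tiles per side and out_width = batch * round_h * round_w = 16 transformed columns.

#include <cassert>
#include <cstdint>

int main() {
  const int64_t batch = 1, out_h = 7, out_w = 7;
  const int64_t round_h = (out_h + 1) / 2;  // 4 tiles vertically
  const int64_t round_w = (out_w + 1) / 2;  // 4 tiles horizontally
  const int64_t out_width = batch * round_h * round_w;
  assert(round_h == 4 && round_w == 4 && out_width == 16);
  return 0;
}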
...@@ -185,6 +185,7 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
                   StatsFuture *future);
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
......
...@@ -173,6 +173,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
   void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
......
...@@ -61,6 +61,7 @@ struct SliceFunctor<DeviceType::OPENCL, T> {
                   const std::vector<Tensor *> &output_list,
                   StatsFuture *future);
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
 };
 }  // namespace kernels
......
...@@ -61,6 +61,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
   void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
......
...@@ -56,6 +56,7 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
                   StatsFuture *future);
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> space_shape_;
 };
......
...@@ -51,6 +51,7 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
   void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
...@@ -108,6 +109,7 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
                   StatsFuture *future);
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
......
...@@ -43,6 +43,10 @@ else
     HEXAGON_MODE_BUILD_FLAG="--define hexagon=true"
   fi
+  if [ x"$TARGET_ABI" = x"arm64-v8a" ]; then
+    NEON_ENABLE_FLAG="--define neon=true"
+  fi
   bazel build --verbose_failures -c opt --strip always //mace/examples:mace_run \
     --crosstool_top=//external:android/crosstool \
     --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
...@@ -54,6 +58,7 @@ else
     --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
     --define openmp=true \
     --copt="-O3" \
+    $NEON_ENABLE_FLAG \
     $PRODUCTION_MODE_BUILD_FLAGS \
     $HEXAGON_MODE_BUILD_FLAG || exit 1
 fi
......
...@@ -376,6 +376,7 @@ def main(unused_args):
         build_run_throughput_test(target_soc, FLAGS.run_seconds,
                                   merged_lib_file, FLAGS.output_dir)
+    if FLAGS.mode == "build" or FLAGS.mode == "all":
         packaging_lib_file(FLAGS.output_dir)
......
...@@ -14,8 +14,13 @@ source ${CURRENT_DIR}/env.sh
 LIBMACE_BUILD_DIR=$1
+TAR_PACKAGE_NAME=libmace_${PROJECT_NAME}.tar.gz
 pushd $LIBMACE_BUILD_DIR/$PROJECT_NAME
-ls | grep -v build | xargs tar cvzf libmace_${PROJECT_NAME}.tar.gz
+if [ -f $TAR_PACKAGE_NAME ]; then
+  rm -f $TAR_PACKAGE_NAME
+fi
+ls | grep -v build | xargs tar cvzf $TAR_PACKAGE_NAME
 popd
 echo "Packaging done!"