提交 0fd7ce42 编写于 作者: Y yejianwu

update uniform work group size macro name, remove unnecessary variable, use macro...

update uniform work group size macro name, remove unnecessary variable, use macro for dim params in *.cl
上级 af6d9162
......@@ -156,7 +156,6 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
T relux_max_limit_;
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::string tuning_key_prefix_;
std::vector<index_t> input_shape_;
};
......
......@@ -91,7 +91,6 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -158,7 +158,6 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -65,7 +65,6 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -57,7 +57,6 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
const int groups_;
std::vector<index_t> input_shape_;
};
......
......@@ -86,7 +86,6 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -402,7 +402,6 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -109,7 +109,6 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
const int block_size_;
bool d2s_;
std::vector<index_t> input_shape_;
......
......@@ -438,7 +438,6 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -98,7 +98,6 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -242,7 +242,6 @@ struct MatMulFunctor<DeviceType::OPENCL, T> {
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
};
} // namespace kernels
......
......@@ -27,16 +27,14 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
switch (activation_) {
case RELU:
......@@ -63,6 +61,9 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
LOG(FATAL) << "Unknown activation type: " << activation_;
}
kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......@@ -71,7 +72,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
if (!is_non_uniform_work_groups_supported_) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
......@@ -85,9 +86,6 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
......
......@@ -35,8 +35,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
}
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
......@@ -47,11 +45,14 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
......@@ -70,7 +71,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
output_tensor->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
}
......@@ -80,9 +81,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
input_shape_ = input_tensors[0]->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
......
......@@ -38,16 +38,14 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
if (folded_constant_) {
built_options.emplace("-DFOLDED_CONSTANT");
......@@ -72,10 +70,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
......@@ -92,9 +93,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
......
......@@ -30,22 +30,23 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
......@@ -54,16 +55,13 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8};
cl::Event event;
cl_int error;
if (is_non_uniform_work_groups_supported_) {
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
......
......@@ -62,16 +62,13 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
const bool is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
if (buffer->dtype() == image->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
......@@ -87,7 +84,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
obfuscated_kernel_name, built_options);
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
b2f_kernel.setArg(idx++, gws[0]);
b2f_kernel.setArg(idx++, gws[1]);
}
......@@ -123,7 +120,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
cl::Event event;
cl_int error;
if (is_non_uniform_work_groups_supported) {
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
......
......@@ -37,24 +37,25 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name,
built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
......@@ -65,9 +66,6 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
......
#include <common.h>
__kernel void activation(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input,
#ifdef USE_PRELU
__read_only image2d_t alpha,
......@@ -16,7 +12,7 @@ __kernel void activation(
const int w = get_global_id(1);
const int hb = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (ch_blk >= global_size_dim0 || w >= global_size_dim1
|| hb >= global_size_dim2) {
return;
......
#include <common.h>
__kernel void addn(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t input1,
#if INPUT_NUM > 2
......@@ -17,7 +14,7 @@ __kernel void addn(
const int w = get_global_id(0);
const int hb = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || hb >= global_size_dim1) return;
#endif
......
#include <common.h>
// Supported data types: half/float
__kernel void batch_norm(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input,
__read_only image2d_t scale,
__read_only image2d_t offset,
......@@ -20,7 +16,7 @@ __kernel void batch_norm(
const int w = get_global_id(1);
const int hb = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (ch_blk >= global_size_dim0 || w >= global_size_dim1
|| hb >= global_size_dim2) {
return;
......
#include <common.h>
// Supported data types: half/float
__kernel void bias_add(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input,
__read_only image2d_t bias,
__write_only image2d_t output) {
......@@ -13,7 +9,7 @@ __kernel void bias_add(
const int w = get_global_id(1);
const int hb = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (ch_blk >= global_size_dim0 || w >= global_size_dim1
|| hb >= global_size_dim2) {
return;
......
#include <common.h>
__kernel void filter_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global const DATA_TYPE *input, /* h, w, oc, ic */
__private const int input_offset,
__private const int filter_h,
......@@ -15,7 +12,7 @@ __kernel void filter_buffer_to_image(
int w = get_global_id(0);
int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
......@@ -57,10 +54,7 @@ __kernel void filter_buffer_to_image(
}
__kernel void filter_image_to_buffer(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global DATA_TYPE *output, /* h, w, oc, ic */
__private const int filter_h,
__private const int filter_w,
......@@ -70,7 +64,7 @@ __kernel void filter_image_to_buffer(
int w = get_global_id(0);
int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
......@@ -109,10 +103,7 @@ __kernel void filter_image_to_buffer(
}
__kernel void dw_filter_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global const DATA_TYPE *input, /* h, w, ic, m */
__private const int input_offset,
__private const int filter_w,
......@@ -122,7 +113,7 @@ __kernel void dw_filter_buffer_to_image(
const int w = get_global_id(0);
const int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
......@@ -170,10 +161,7 @@ __kernel void dw_filter_buffer_to_image(
}
__kernel void in_out_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int height,
......@@ -183,7 +171,7 @@ __kernel void in_out_buffer_to_image(
int w = get_global_id(0);
int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
......@@ -215,10 +203,7 @@ __kernel void in_out_buffer_to_image(
}
__kernel void in_out_image_to_buffer(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global DATA_TYPE *output, /* nhwc */
__private const int height,
__private const int width,
......@@ -227,7 +212,7 @@ __kernel void in_out_image_to_buffer(
int w = get_global_id(0);
int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
......@@ -258,10 +243,7 @@ __kernel void in_out_image_to_buffer(
}
__kernel void arg_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int count,
......@@ -269,7 +251,7 @@ __kernel void arg_buffer_to_image(
int w = get_global_id(0);
int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
......@@ -297,17 +279,14 @@ __kernel void arg_buffer_to_image(
}
__kernel void arg_image_to_buffer(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global DATA_TYPE *output, /* nhwc */
__private const int count,
__read_only image2d_t input) {
int w = get_global_id(0);
int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
......@@ -334,10 +313,7 @@ __kernel void arg_image_to_buffer(
__kernel void in_out_height_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global const DATA_TYPE *input, //nhwc
__private const int input_offset,
__private const int height,
......@@ -347,7 +323,7 @@ __kernel void in_out_height_buffer_to_image(
int w = get_global_id(0);
int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
......@@ -380,10 +356,7 @@ __kernel void in_out_height_buffer_to_image(
}
__kernel void in_out_height_image_to_buffer(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global DATA_TYPE *output, //nhwc
__private const int height,
__private const int width,
......@@ -392,7 +365,7 @@ __kernel void in_out_height_image_to_buffer(
int w = get_global_id(0);
int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
......@@ -422,10 +395,7 @@ __kernel void in_out_height_image_to_buffer(
__kernel void in_out_width_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int height,
......@@ -435,7 +405,7 @@ __kernel void in_out_width_buffer_to_image(
int w = get_global_id(0);
int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
......@@ -468,10 +438,7 @@ __kernel void in_out_width_buffer_to_image(
// only support 3x3 now
__kernel void winograd_filter_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
__private const int in_channels,
......@@ -481,7 +448,7 @@ __kernel void winograd_filter_buffer_to_image(
int w = get_global_id(0);
int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
......@@ -563,10 +530,7 @@ __kernel void winograd_filter_buffer_to_image(
// only support 3x3 now
__kernel void winograd_filter_image_to_buffer(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
__private const int width,
......@@ -575,7 +539,7 @@ __kernel void winograd_filter_image_to_buffer(
const int w = get_global_id(0);
const int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
......
......@@ -2,11 +2,7 @@
// assume channels_per_group mod 4 = 0 && groups mod 4 == 0
__kernel void channel_shuffle(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input,
__private const int groups,
__private const int channels_per_group,
......@@ -15,7 +11,7 @@ __kernel void channel_shuffle(
const int width_idx = get_global_id(1);
const int hb_idx = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (group_chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
|| hb_idx >= global_size_dim2) {
return;
......
......@@ -18,6 +18,23 @@
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
// When the runtime does NOT support non-uniform work-group sizes
// (NON_UNIFORM_WORK_GROUP undefined), each kernel receives the true global
// sizes as explicit __private int arguments and bounds-checks its
// get_global_id(...) values against them before doing any work.
// When non-uniform work groups ARE supported, these parameter lists expand
// to nothing and the host skips setting the corresponding kernel args.
#ifndef NON_UNIFORM_WORK_GROUP
// Leading parameters for 2-D kernels (note the trailing comma is part of
// the expansion, so the macro must be followed by at least one parameter).
#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 \
__private const int global_size_dim0, \
__private const int global_size_dim1,
// Leading parameters for 3-D kernels; same trailing-comma convention.
#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 \
__private const int global_size_dim0, \
__private const int global_size_dim1, \
__private const int global_size_dim2,
#else
// Non-uniform work groups supported: no extra parameters are needed.
#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
#endif
__constant sampler_t SAMPLER =
CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
......
......@@ -23,11 +23,7 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
// Supported data type: half/float
__kernel void concat_channel(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input0,
__read_only image2d_t input1,
__private const int input0_chan,
......@@ -36,7 +32,7 @@ __kernel void concat_channel(
const int width_idx = get_global_id(1);
const int hb_idx = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
|| hb_idx >= global_size_dim2) {
return;
......@@ -89,11 +85,7 @@ __kernel void concat_channel(
// Required: All input channels are divisible by 4
__kernel void concat_channel_multi(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input,
__private const int chan_blk_offset,
__write_only image2d_t output) {
......@@ -101,7 +93,7 @@ __kernel void concat_channel_multi(
const int width_idx = get_global_id(1);
const int hb_idx = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
|| hb_idx >= global_size_dim2) {
return;
......
#include <common.h>
__kernel void conv_2d(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
#ifdef BIAS
......@@ -29,7 +25,7 @@ __kernel void conv_2d(
const int out_w_blk = get_global_id(1);
const int out_hb = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
......
#include <common.h>
__kernel void conv_2d_1x1(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin, cout/4 */
#ifdef BIAS
......@@ -23,7 +19,7 @@ __kernel void conv_2d_1x1(
const int out_w_blk = get_global_id(1);
const int out_hb = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
......
#include <common.h>
__kernel void conv_2d_3x3(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */
#ifdef BIAS
......@@ -27,7 +23,7 @@ __kernel void conv_2d_3x3(
const int out_w_blk = get_global_id(1);
const int out_hb = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
......
#include <common.h>
__kernel void depth_to_space(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input,
__private const int block_size,
__private const int output_depth,
......@@ -14,7 +10,7 @@ __kernel void depth_to_space(
const int out_w = get_global_id(1);
const int out_h = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (out_d >= global_size_dim0 || out_w >= global_size_dim1
|| out_h >= global_size_dim2) {
return;
......@@ -43,11 +39,7 @@ __kernel void depth_to_space(
}
__kernel void space_to_depth(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input,
__private const int block_size,
__private const int input_depth,
......@@ -57,7 +49,7 @@ __kernel void space_to_depth(
const int w = get_global_id(1);
const int h = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (d >= global_size_dim0 || w >= global_size_dim1
|| h >= global_size_dim2) {
return;
......
......@@ -2,11 +2,7 @@
// Only multiplier = 1 is supported
__kernel void depthwise_conv2d(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
#ifdef BIAS
......@@ -29,7 +25,7 @@ __kernel void depthwise_conv2d(
const short out_w_blk = get_global_id(1);
const short out_hb = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
......@@ -143,11 +139,7 @@ __kernel void depthwise_conv2d(
}
__kernel void depthwise_conv2d_s1(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
#ifdef BIAS
......@@ -168,7 +160,7 @@ __kernel void depthwise_conv2d_s1(
const short out_w_blk = get_global_id(1) << 2;
const short out_hb = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (out_ch_blk >= global_size_dim0 || get_global_id(1) >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
......
#include <common.h>
__kernel void eltwise(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t input1,
#ifdef COEFF_SUM
......@@ -15,7 +12,7 @@ __kernel void eltwise(
const int w = get_global_id(0);
const int hb = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || hb >= global_size_dim1) return;
#endif
......
......@@ -2,10 +2,7 @@
// C = A * B
__kernel void matmul(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__read_only image2d_t A,
__read_only image2d_t B,
__write_only image2d_t C,
......@@ -17,7 +14,7 @@ __kernel void matmul(
const int gx = get_global_id(0) << 2;
const int hb = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (get_global_id(0) >= global_size_dim0 || hb >= global_size_dim1) return;
#endif
......
......@@ -20,11 +20,7 @@ inline int calculate_avg_block_size(const int pool_size,
// Supported data type: half/float
__kernel void pooling(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input,
__private const int in_height,
__private const int in_width,
......@@ -39,7 +35,7 @@ __kernel void pooling(
const int out_width_idx = get_global_id(1);
const int out_hb_idx = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (out_chan_idx >= global_size_dim0 || out_width_idx >= global_size_dim1
|| out_hb_idx >= global_size_dim2) {
return;
......
#include <common.h>
__kernel void resize_bilinear_nocache(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__write_only image2d_t output,
__private const float height_scale,
......@@ -18,7 +14,7 @@ __kernel void resize_bilinear_nocache(
const int w = get_global_id(1);
const int hb = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (ch_blk >= global_size_dim0 || w >= global_size_dim1
|| hb >= global_size_dim2) {
return;
......
#include <common.h>
__kernel void slice(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input,
__private const int chan_blk_offset,
__write_only image2d_t output) {
......@@ -13,7 +9,7 @@ __kernel void slice(
const int width_idx = get_global_id(1);
const int hb_idx = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
|| hb_idx >= global_size_dim2) {
return;
......
#include <common.h>
__kernel void softmax(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input,
__private const int channels,
__private const int remain_channels,
......@@ -14,7 +10,7 @@ __kernel void softmax(
const int width_idx = get_global_id(1);
const int hb_idx = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
|| hb_idx >= global_size_dim2) {
return;
......
#include <common.h>
__kernel void space_to_batch(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t space_data,
__write_only image2d_t batch_data,
__private const int block_height,
......@@ -20,7 +16,7 @@ __kernel void space_to_batch(
const int batch_w_idx = get_global_id(1);
const int batch_hb_idx = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
|| batch_hb_idx >= global_size_dim2) {
return;
......@@ -53,11 +49,7 @@ __kernel void space_to_batch(
}
__kernel void batch_to_space(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t batch_data,
__write_only image2d_t space_data,
__private const int block_height,
......@@ -72,7 +64,7 @@ __kernel void batch_to_space(
const int batch_w_idx = get_global_id(1);
const int batch_hb_idx = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
|| batch_hb_idx >= global_size_dim2) {
return;
......
#include <common.h>
__kernel void winograd_transform_2x2(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__read_only image2d_t input,
__write_only image2d_t output,
__private const int in_height,
......@@ -17,7 +14,7 @@ __kernel void winograd_transform_2x2(
int out_width_idx = get_global_id(0);
int chan_blk_idx = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) {
return;
}
......@@ -120,10 +117,7 @@ __kernel void winograd_transform_2x2(
}
__kernel void winograd_inverse_transform_2x2(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__read_only image2d_t input,
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
......@@ -137,7 +131,7 @@ __kernel void winograd_inverse_transform_2x2(
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0
#ifndef NON_UNIFORM_WORK_GROUP
if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
return;
}
......
......@@ -18,7 +18,6 @@ static void Concat2(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
......@@ -34,13 +33,11 @@ static void Concat2(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name);
if (*is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
if (input0->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
......@@ -53,10 +50,13 @@ static void Concat2(cl::Kernel *kernel,
built_options.emplace("-DDIVISIBLE_FOUR");
}
*kernel = runtime->BuildKernel("concat", kernel_name, built_options);
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
if (!IsVecEqual(*prev_input_shape, input0->shape())) {
uint32_t idx = 0;
if (!(*is_non_uniform_work_groups_supported)) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
......@@ -70,9 +70,6 @@ static void Concat2(cl::Kernel *kernel,
*(static_cast<cl::Image2D *>(output->opencl_image())));
*prev_input_shape = input0->shape();
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
......@@ -87,7 +84,6 @@ static void ConcatN(cl::Kernel *kernel,
const DataType dt,
Tensor *output,
StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
......@@ -97,17 +93,17 @@ static void ConcatN(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
if (*is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
*kernel = runtime->BuildKernel("concat", kernel_name, built_options);
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const int inputs_count = input_list.size();
......@@ -121,7 +117,7 @@ static void ConcatN(cl::Kernel *kernel,
};
uint32_t idx = 0;
if (!(*is_non_uniform_work_groups_supported)) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
......@@ -131,8 +127,6 @@ static void ConcatN(cl::Kernel *kernel,
kernel->setArg(idx++, *(output->opencl_image()));
chan_blk_offset += input_channel_blk;
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
std::stringstream ss;
ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_"
......@@ -178,13 +172,12 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
switch (inputs_count) {
case 2:
Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value,
&input_shape_, output, future,
&is_non_uniform_work_groups_supported_, &kwg_size_);
&input_shape_, output, future, &kwg_size_);
break;
default:
if (divisible_four) {
ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future,
&is_non_uniform_work_groups_supported_, &kwg_size_);
&kwg_size_);
} else {
MACE_NOT_IMPLEMENTED;
}
......
......@@ -21,7 +21,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size);
extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
......@@ -37,7 +36,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size);
extern void Conv2dOpencl(cl::Kernel *kernel,
......@@ -53,7 +51,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size);
template <typename T>
......@@ -68,7 +65,7 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const int *dilations, const ActivationType activation,
const float relux_max_limit, const DataType dt,
std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
bool *is_non_uniform_work_groups_supported, uint32_t *kwg_size);
uint32_t *kwg_size);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5] = {
Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
......@@ -109,12 +106,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future,
&is_non_uniform_work_groups_supported_, &kwg_size_);
&kwg_size_);
} else {
Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future,
&is_non_uniform_work_groups_supported_, &kwg_size_);
&kwg_size_);
}
}
......
......@@ -23,7 +23,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
......@@ -41,8 +40,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
MACE_CHECK(input_batch == batch);
std::set<std::string> built_options;
......@@ -50,8 +47,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (*is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
if (bias != nullptr) {
built_options.emplace("-DBIAS");
......@@ -76,6 +73,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
}
*kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......@@ -84,7 +84,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
if (!(*is_non_uniform_work_groups_supported)) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
......@@ -105,9 +105,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
kernel->setArg(idx++, stride);
*prev_input_shape = input->shape();
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
......
......@@ -25,7 +25,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
......@@ -40,15 +39,13 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (*is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
......@@ -71,6 +68,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
}
*kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......@@ -79,7 +79,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
if (!(*is_non_uniform_work_groups_supported)) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
......@@ -103,9 +103,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
kernel->setArg(idx++, dilations[1]);
*prev_input_shape = input->shape();
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const std::vector<uint32_t> lws = {4, *kwg_size / 32, 8, 1};
......
......@@ -25,7 +25,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
......@@ -40,15 +39,13 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
built_options.emplace("-Dconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (*is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
......@@ -71,6 +68,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
}
*kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......@@ -79,7 +79,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
if (!(*is_non_uniform_work_groups_supported)) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
......@@ -105,9 +105,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
kernel->setArg(idx++, dilations[1]);
*prev_input_shape = input->shape();
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
......
......@@ -48,8 +48,6 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss;
......@@ -58,11 +56,14 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ =
runtime->BuildKernel("depth_to_space", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
uint32_t gws[3];
......@@ -83,7 +84,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
}
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
......@@ -94,9 +95,6 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
......
......@@ -24,7 +24,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
......@@ -45,8 +44,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
......@@ -55,8 +52,8 @@ void DepthwiseConv2d(cl::Kernel *kernel,
} else {
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
}
if (*is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
......@@ -83,6 +80,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
*kernel =
runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
const index_t input_batch = input->dim(0);
......@@ -97,7 +97,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
input_channels);
uint32_t idx = 0;
if (!(*is_non_uniform_work_groups_supported)) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
......@@ -124,9 +124,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
}
*prev_input_shape = input->shape();
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
......@@ -185,7 +182,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future,
&is_non_uniform_work_groups_supported_, &kwg_size_);
&kwg_size_);
}
template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;
......
......@@ -30,8 +30,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
......@@ -39,15 +37,18 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
}
......@@ -60,9 +61,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
......
......@@ -200,8 +200,6 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
const std::vector<uint32_t> &lws,
StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
const bool is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size =
......@@ -239,7 +237,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
<< "Tuning parameters of 3D kernel must be 4D";
cl_int error = CL_SUCCESS;
std::vector<uint32_t> roundup_gws(3);
if (!is_non_uniform_work_groups_supported) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
for (size_t i = 0; i < 3; ++i) {
roundup_gws[i] = RoundUp(gws[i], params[i]);
}
......@@ -252,7 +250,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 =
(i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
if (is_non_uniform_work_groups_supported) {
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
......@@ -268,7 +266,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
}
} else {
timer->ClearTiming();
if (is_non_uniform_work_groups_supported) {
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
......@@ -293,7 +291,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 =
(i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
if (is_non_uniform_work_groups_supported) {
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
......@@ -332,8 +330,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
const std::vector<uint32_t> &lws,
StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
const bool is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size =
......@@ -359,7 +355,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
<< "Tuning parameters of 2D kernel must be 3d";
cl_int error = CL_SUCCESS;
std::vector<uint32_t> roundup_gws(2);
if (!is_non_uniform_work_groups_supported) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
for (size_t i = 0; i < 2; ++i) {
roundup_gws[i] = RoundUp(gws[i], params[i]);
}
......@@ -372,7 +368,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 =
(i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
if (is_non_uniform_work_groups_supported) {
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]), nullptr, &event);
......@@ -387,7 +383,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
}
} else {
timer->ClearTiming();
if (is_non_uniform_work_groups_supported) {
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(params[0], params[1]), nullptr, &event);
......@@ -411,7 +407,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 =
(i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
if (is_non_uniform_work_groups_supported) {
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NDRange(0, i * block_size),
cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]),
......
......@@ -34,21 +34,22 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
built_options.emplace("-Dmatmul=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
}
......@@ -61,8 +62,6 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
kernel_.setArg(idx++, static_cast<int>(height_blocks));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 1};
std::stringstream ss;
ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
......
......@@ -21,8 +21,6 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
......@@ -39,10 +37,13 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (pooling_type_ == AVG) {
built_options.emplace("-DPOOL_AVG");
}
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
std::vector<uint32_t> gws;
......@@ -81,7 +82,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
};
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
......@@ -97,9 +98,6 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} else {
index_t batch = output->dim(0);
index_t out_height = output->dim(1);
......
......@@ -31,19 +31,20 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ =
runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(input_shape_, input->shape())) {
MACE_CHECK(out_height > 0 && out_width > 0);
......@@ -60,7 +61,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
CalculateResizeScale(in_width, out_width, align_corners_);
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
......@@ -74,9 +75,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<int32_t>(out_height));
input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
......
......@@ -32,18 +32,19 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice");
built_options.emplace("-Dslice=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE="
+ DtToCLCMDDt(DataTypeToEnum<T>::value));
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("slice", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const index_t channel_blk = RoundUpDiv4(output_channels);
......@@ -53,8 +54,6 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
static_cast<uint32_t>(input->dim(0) * input->dim(1)),
};
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
std::stringstream ss;
ss << "slice_opencl_kernel_"
......@@ -65,7 +64,7 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
<< outputs_count;
for (int i = 0; i < outputs_count; ++i) {
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
......
......@@ -30,22 +30,23 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
......@@ -56,9 +57,6 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = logits->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
......
......@@ -39,8 +39,6 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
std::stringstream kernel_name_ss;
......@@ -49,15 +47,18 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ =
runtime->BuildKernel("space_to_batch", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(space_shape_, space_tensor->shape())) {
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
......@@ -79,9 +80,6 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
space_shape_ = space_tensor->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
......
......@@ -18,8 +18,6 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::set<std::string> built_options;
......@@ -28,11 +26,14 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
......@@ -61,7 +62,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
output_tensor->ResizeImage(output_shape, image_shape);
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
}
......@@ -76,9 +77,6 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
input_shape_ = input_tensor->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
......@@ -99,8 +97,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
std::set<std::string> built_options;
......@@ -110,8 +106,8 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation_) {
......@@ -138,6 +134,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[2] = {
......@@ -153,7 +152,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
const uint32_t round_h = (height_ + 1) / 2;
const uint32_t round_w = (width_ + 1) / 2;
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
}
......@@ -173,9 +172,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input_tensor->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
......
......@@ -186,7 +186,6 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -174,7 +174,6 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -62,7 +62,6 @@ struct SliceFunctor<DeviceType::OPENCL, T> {
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
};
} // namespace kernels
......
......@@ -62,7 +62,6 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -57,7 +57,6 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> space_shape_;
};
......
......@@ -52,7 +52,6 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......@@ -111,7 +110,6 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册