提交 0fd7ce42 编写于 作者: Y yejianwu

update uniform work group size macro name, remove unnessary variable, use mace...

update uniform work group size macro name, remove unnessary variable, use mace for dim params in *.cl
上级 af6d9162
...@@ -156,7 +156,6 @@ class ActivationFunctor<DeviceType::OPENCL, T> { ...@@ -156,7 +156,6 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
T relux_max_limit_; T relux_max_limit_;
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::string tuning_key_prefix_; std::string tuning_key_prefix_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -91,7 +91,6 @@ struct AddNFunctor<DeviceType::OPENCL, T> { ...@@ -91,7 +91,6 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -158,7 +158,6 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase { ...@@ -158,7 +158,6 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -65,7 +65,6 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> { ...@@ -65,7 +65,6 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -57,7 +57,6 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> { ...@@ -57,7 +57,6 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
const int groups_; const int groups_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -86,7 +86,6 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase { ...@@ -86,7 +86,6 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -402,7 +402,6 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase { ...@@ -402,7 +402,6 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -109,7 +109,6 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> { ...@@ -109,7 +109,6 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
const int block_size_; const int block_size_;
bool d2s_; bool d2s_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
......
...@@ -438,7 +438,6 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T> ...@@ -438,7 +438,6 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -98,7 +98,6 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase { ...@@ -98,7 +98,6 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -242,7 +242,6 @@ struct MatMulFunctor<DeviceType::OPENCL, T> { ...@@ -242,7 +242,6 @@ struct MatMulFunctor<DeviceType::OPENCL, T> {
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -27,16 +27,14 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -27,16 +27,14 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name); built_options.emplace("-Dactivation=" + kernel_name);
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_non_uniform_work_groups_supported_) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
switch (activation_) { switch (activation_) {
case RELU: case RELU:
...@@ -63,6 +61,9 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -63,6 +61,9 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
LOG(FATAL) << "Unknown activation type: " << activation_; LOG(FATAL) << "Unknown activation type: " << activation_;
} }
kernel_ = runtime->BuildKernel("activation", kernel_name, built_options); kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
...@@ -71,7 +72,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -71,7 +72,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0; int idx = 0;
if (!is_non_uniform_work_groups_supported_) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
...@@ -85,9 +86,6 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -85,9 +86,6 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape(); input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1}; const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
......
...@@ -35,8 +35,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -35,8 +35,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
} }
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
if (input_tensors.size() > 4) { if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -47,11 +45,14 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -47,11 +45,14 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size())); built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
if (is_non_uniform_work_groups_supported_) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
kernel_ = runtime->BuildKernel("addn", kernel_name, built_options); kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
std::vector<index_t> output_shape = input_tensors[0]->shape(); std::vector<index_t> output_shape = input_tensors[0]->shape();
...@@ -70,7 +71,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -70,7 +71,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
output_tensor->ResizeImage(output_shape, output_image_shape); output_tensor->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0; uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
} }
...@@ -80,9 +81,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -80,9 +81,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, *(output_tensor->opencl_image())); kernel_.setArg(idx++, *(output_tensor->opencl_image()));
input_shape_ = input_tensors[0]->shape(); input_shape_ = input_tensors[0]->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1}; const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
......
...@@ -38,16 +38,14 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -38,16 +38,14 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name); built_options.emplace("-Dbatch_norm=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_non_uniform_work_groups_supported_) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
if (folded_constant_) { if (folded_constant_) {
built_options.emplace("-DFOLDED_CONSTANT"); built_options.emplace("-DFOLDED_CONSTANT");
...@@ -72,10 +70,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -72,10 +70,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options); kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
...@@ -92,9 +93,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -92,9 +93,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_.setArg(idx++, relux_max_limit_); kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input->shape(); input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1}; const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
......
...@@ -30,22 +30,23 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -30,22 +30,23 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name); built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_non_uniform_work_groups_supported_) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options); kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
...@@ -54,16 +55,13 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -54,16 +55,13 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_.setArg(idx++, *(bias->opencl_image())); kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape(); input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8}; const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8};
cl::Event event; cl::Event event;
cl_int error; cl_int error;
if (is_non_uniform_work_groups_supported_) { if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
......
...@@ -62,16 +62,13 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -62,16 +62,13 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options; std::set<std::string> built_options;
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str()); built_options.emplace(kernel_name_ss.str());
if (is_non_uniform_work_groups_supported) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
if (buffer->dtype() == image->dtype()) { if (buffer->dtype() == image->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
...@@ -87,7 +84,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -87,7 +84,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
obfuscated_kernel_name, built_options); obfuscated_kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
b2f_kernel.setArg(idx++, gws[0]); b2f_kernel.setArg(idx++, gws[0]);
b2f_kernel.setArg(idx++, gws[1]); b2f_kernel.setArg(idx++, gws[1]);
} }
...@@ -123,7 +120,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -123,7 +120,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
cl::Event event; cl::Event event;
cl_int error; cl_int error;
if (is_non_uniform_work_groups_supported) { if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event); cl::NDRange(lws[0], lws[1]), nullptr, &event);
......
...@@ -37,24 +37,25 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -37,24 +37,25 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name); built_options.emplace("-Dchannel_shuffle=" + kernel_name);
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_non_uniform_work_groups_supported_) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name, kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name,
built_options); built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
...@@ -65,9 +66,6 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -65,9 +66,6 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape(); input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1}; const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
......
#include <common.h> #include <common.h>
__kernel void activation( __kernel void activation(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, __read_only image2d_t input,
#ifdef USE_PRELU #ifdef USE_PRELU
__read_only image2d_t alpha, __read_only image2d_t alpha,
...@@ -16,7 +12,7 @@ __kernel void activation( ...@@ -16,7 +12,7 @@ __kernel void activation(
const int w = get_global_id(1); const int w = get_global_id(1);
const int hb = get_global_id(2); const int hb = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (ch_blk >= global_size_dim0 || w >= global_size_dim1 if (ch_blk >= global_size_dim0 || w >= global_size_dim1
|| hb >= global_size_dim2) { || hb >= global_size_dim2) {
return; return;
......
#include <common.h> #include <common.h>
__kernel void addn( __kernel void addn(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t input1, __read_only image2d_t input1,
#if INPUT_NUM > 2 #if INPUT_NUM > 2
...@@ -17,7 +14,7 @@ __kernel void addn( ...@@ -17,7 +14,7 @@ __kernel void addn(
const int w = get_global_id(0); const int w = get_global_id(0);
const int hb = get_global_id(1); const int hb = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || hb >= global_size_dim1) return; if (w >= global_size_dim0 || hb >= global_size_dim1) return;
#endif #endif
......
#include <common.h> #include <common.h>
// Supported data types: half/float // Supported data types: half/float
__kernel void batch_norm( __kernel void batch_norm(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, __read_only image2d_t input,
__read_only image2d_t scale, __read_only image2d_t scale,
__read_only image2d_t offset, __read_only image2d_t offset,
...@@ -20,7 +16,7 @@ __kernel void batch_norm( ...@@ -20,7 +16,7 @@ __kernel void batch_norm(
const int w = get_global_id(1); const int w = get_global_id(1);
const int hb = get_global_id(2); const int hb = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (ch_blk >= global_size_dim0 || w >= global_size_dim1 if (ch_blk >= global_size_dim0 || w >= global_size_dim1
|| hb >= global_size_dim2) { || hb >= global_size_dim2) {
return; return;
......
#include <common.h> #include <common.h>
// Supported data types: half/float // Supported data types: half/float
__kernel void bias_add( __kernel void bias_add(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, __read_only image2d_t input,
__read_only image2d_t bias, __read_only image2d_t bias,
__write_only image2d_t output) { __write_only image2d_t output) {
...@@ -13,7 +9,7 @@ __kernel void bias_add( ...@@ -13,7 +9,7 @@ __kernel void bias_add(
const int w = get_global_id(1); const int w = get_global_id(1);
const int hb = get_global_id(2); const int hb = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (ch_blk >= global_size_dim0 || w >= global_size_dim1 if (ch_blk >= global_size_dim0 || w >= global_size_dim1
|| hb >= global_size_dim2) { || hb >= global_size_dim2) {
return; return;
......
#include <common.h> #include <common.h>
__kernel void filter_buffer_to_image( __kernel void filter_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global const DATA_TYPE *input, /* h, w, oc, ic */ __global const DATA_TYPE *input, /* h, w, oc, ic */
__private const int input_offset, __private const int input_offset,
__private const int filter_h, __private const int filter_h,
...@@ -15,7 +12,7 @@ __kernel void filter_buffer_to_image( ...@@ -15,7 +12,7 @@ __kernel void filter_buffer_to_image(
int w = get_global_id(0); int w = get_global_id(0);
int h = get_global_id(1); int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) { if (w >= global_size_dim0 || h >= global_size_dim1) {
return; return;
} }
...@@ -57,10 +54,7 @@ __kernel void filter_buffer_to_image( ...@@ -57,10 +54,7 @@ __kernel void filter_buffer_to_image(
} }
__kernel void filter_image_to_buffer( __kernel void filter_image_to_buffer(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global DATA_TYPE *output, /* h, w, oc, ic */ __global DATA_TYPE *output, /* h, w, oc, ic */
__private const int filter_h, __private const int filter_h,
__private const int filter_w, __private const int filter_w,
...@@ -70,7 +64,7 @@ __kernel void filter_image_to_buffer( ...@@ -70,7 +64,7 @@ __kernel void filter_image_to_buffer(
int w = get_global_id(0); int w = get_global_id(0);
int h = get_global_id(1); int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) { if (w >= global_size_dim0 || h >= global_size_dim1) {
return; return;
} }
...@@ -109,10 +103,7 @@ __kernel void filter_image_to_buffer( ...@@ -109,10 +103,7 @@ __kernel void filter_image_to_buffer(
} }
__kernel void dw_filter_buffer_to_image( __kernel void dw_filter_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global const DATA_TYPE *input, /* h, w, ic, m */ __global const DATA_TYPE *input, /* h, w, ic, m */
__private const int input_offset, __private const int input_offset,
__private const int filter_w, __private const int filter_w,
...@@ -122,7 +113,7 @@ __kernel void dw_filter_buffer_to_image( ...@@ -122,7 +113,7 @@ __kernel void dw_filter_buffer_to_image(
const int w = get_global_id(0); const int w = get_global_id(0);
const int h = get_global_id(1); const int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) { if (w >= global_size_dim0 || h >= global_size_dim1) {
return; return;
} }
...@@ -170,10 +161,7 @@ __kernel void dw_filter_buffer_to_image( ...@@ -170,10 +161,7 @@ __kernel void dw_filter_buffer_to_image(
} }
__kernel void in_out_buffer_to_image( __kernel void in_out_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global const DATA_TYPE *input, /* nhwc */ __global const DATA_TYPE *input, /* nhwc */
__private const int input_offset, __private const int input_offset,
__private const int height, __private const int height,
...@@ -183,7 +171,7 @@ __kernel void in_out_buffer_to_image( ...@@ -183,7 +171,7 @@ __kernel void in_out_buffer_to_image(
int w = get_global_id(0); int w = get_global_id(0);
int h = get_global_id(1); int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) { if (w >= global_size_dim0 || h >= global_size_dim1) {
return; return;
} }
...@@ -215,10 +203,7 @@ __kernel void in_out_buffer_to_image( ...@@ -215,10 +203,7 @@ __kernel void in_out_buffer_to_image(
} }
__kernel void in_out_image_to_buffer( __kernel void in_out_image_to_buffer(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global DATA_TYPE *output, /* nhwc */ __global DATA_TYPE *output, /* nhwc */
__private const int height, __private const int height,
__private const int width, __private const int width,
...@@ -227,7 +212,7 @@ __kernel void in_out_image_to_buffer( ...@@ -227,7 +212,7 @@ __kernel void in_out_image_to_buffer(
int w = get_global_id(0); int w = get_global_id(0);
int h = get_global_id(1); int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) { if (w >= global_size_dim0 || h >= global_size_dim1) {
return; return;
} }
...@@ -258,10 +243,7 @@ __kernel void in_out_image_to_buffer( ...@@ -258,10 +243,7 @@ __kernel void in_out_image_to_buffer(
} }
__kernel void arg_buffer_to_image( __kernel void arg_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global const DATA_TYPE *input, /* nhwc */ __global const DATA_TYPE *input, /* nhwc */
__private const int input_offset, __private const int input_offset,
__private const int count, __private const int count,
...@@ -269,7 +251,7 @@ __kernel void arg_buffer_to_image( ...@@ -269,7 +251,7 @@ __kernel void arg_buffer_to_image(
int w = get_global_id(0); int w = get_global_id(0);
int h = get_global_id(1); int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) { if (w >= global_size_dim0 || h >= global_size_dim1) {
return; return;
} }
...@@ -297,17 +279,14 @@ __kernel void arg_buffer_to_image( ...@@ -297,17 +279,14 @@ __kernel void arg_buffer_to_image(
} }
__kernel void arg_image_to_buffer( __kernel void arg_image_to_buffer(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global DATA_TYPE *output, /* nhwc */ __global DATA_TYPE *output, /* nhwc */
__private const int count, __private const int count,
__read_only image2d_t input) { __read_only image2d_t input) {
int w = get_global_id(0); int w = get_global_id(0);
int h = get_global_id(1); int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) { if (w >= global_size_dim0 || h >= global_size_dim1) {
return; return;
} }
...@@ -334,10 +313,7 @@ __kernel void arg_image_to_buffer( ...@@ -334,10 +313,7 @@ __kernel void arg_image_to_buffer(
__kernel void in_out_height_buffer_to_image( __kernel void in_out_height_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global const DATA_TYPE *input, //nhwc __global const DATA_TYPE *input, //nhwc
__private const int input_offset, __private const int input_offset,
__private const int height, __private const int height,
...@@ -347,7 +323,7 @@ __kernel void in_out_height_buffer_to_image( ...@@ -347,7 +323,7 @@ __kernel void in_out_height_buffer_to_image(
int w = get_global_id(0); int w = get_global_id(0);
int h = get_global_id(1); int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) { if (w >= global_size_dim0 || h >= global_size_dim1) {
return; return;
} }
...@@ -380,10 +356,7 @@ __kernel void in_out_height_buffer_to_image( ...@@ -380,10 +356,7 @@ __kernel void in_out_height_buffer_to_image(
} }
__kernel void in_out_height_image_to_buffer( __kernel void in_out_height_image_to_buffer(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global DATA_TYPE *output, //nhwc __global DATA_TYPE *output, //nhwc
__private const int height, __private const int height,
__private const int width, __private const int width,
...@@ -392,7 +365,7 @@ __kernel void in_out_height_image_to_buffer( ...@@ -392,7 +365,7 @@ __kernel void in_out_height_image_to_buffer(
int w = get_global_id(0); int w = get_global_id(0);
int h = get_global_id(1); int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) { if (w >= global_size_dim0 || h >= global_size_dim1) {
return; return;
} }
...@@ -422,10 +395,7 @@ __kernel void in_out_height_image_to_buffer( ...@@ -422,10 +395,7 @@ __kernel void in_out_height_image_to_buffer(
__kernel void in_out_width_buffer_to_image( __kernel void in_out_width_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global const DATA_TYPE *input, /* nhwc */ __global const DATA_TYPE *input, /* nhwc */
__private const int input_offset, __private const int input_offset,
__private const int height, __private const int height,
...@@ -435,7 +405,7 @@ __kernel void in_out_width_buffer_to_image( ...@@ -435,7 +405,7 @@ __kernel void in_out_width_buffer_to_image(
int w = get_global_id(0); int w = get_global_id(0);
int h = get_global_id(1); int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) { if (w >= global_size_dim0 || h >= global_size_dim1) {
return; return;
} }
...@@ -468,10 +438,7 @@ __kernel void in_out_width_buffer_to_image( ...@@ -468,10 +438,7 @@ __kernel void in_out_width_buffer_to_image(
// only support 3x3 now // only support 3x3 now
__kernel void winograd_filter_buffer_to_image( __kernel void winograd_filter_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global const DATA_TYPE *input, //Oc, Ic, H, W __global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset, __private const int input_offset,
__private const int in_channels, __private const int in_channels,
...@@ -481,7 +448,7 @@ __kernel void winograd_filter_buffer_to_image( ...@@ -481,7 +448,7 @@ __kernel void winograd_filter_buffer_to_image(
int w = get_global_id(0); int w = get_global_id(0);
int h = get_global_id(1); int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) { if (w >= global_size_dim0 || h >= global_size_dim1) {
return; return;
} }
...@@ -563,10 +530,7 @@ __kernel void winograd_filter_buffer_to_image( ...@@ -563,10 +530,7 @@ __kernel void winograd_filter_buffer_to_image(
// only support 3x3 now // only support 3x3 now
__kernel void winograd_filter_image_to_buffer( __kernel void winograd_filter_image_to_buffer(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global DATA_TYPE *output, //Oc, Ic, H, W __global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height, __private const int height,
__private const int width, __private const int width,
...@@ -575,7 +539,7 @@ __kernel void winograd_filter_image_to_buffer( ...@@ -575,7 +539,7 @@ __kernel void winograd_filter_image_to_buffer(
const int w = get_global_id(0); const int w = get_global_id(0);
const int h = get_global_id(1); const int h = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) { if (w >= global_size_dim0 || h >= global_size_dim1) {
return; return;
} }
......
...@@ -2,11 +2,7 @@ ...@@ -2,11 +2,7 @@
// assume channes_per_group mod 4 = 0 && groups mod 4 == 0 // assume channes_per_group mod 4 = 0 && groups mod 4 == 0
__kernel void channel_shuffle( __kernel void channel_shuffle(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, __read_only image2d_t input,
__private const int groups, __private const int groups,
__private const int channels_per_group, __private const int channels_per_group,
...@@ -15,7 +11,7 @@ __kernel void channel_shuffle( ...@@ -15,7 +11,7 @@ __kernel void channel_shuffle(
const int width_idx = get_global_id(1); const int width_idx = get_global_id(1);
const int hb_idx = get_global_id(2); const int hb_idx = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (group_chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 if (group_chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
|| hb_idx >= global_size_dim2) { || hb_idx >= global_size_dim2) {
return; return;
......
...@@ -18,6 +18,23 @@ ...@@ -18,6 +18,23 @@
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE) #define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE) #define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
#ifndef NON_UNIFORM_WORK_GROUP
#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 \
__private const int global_size_dim0, \
__private const int global_size_dim1,
#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 \
__private const int global_size_dim0, \
__private const int global_size_dim1, \
__private const int global_size_dim2,
#else
#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
#endif
__constant sampler_t SAMPLER = __constant sampler_t SAMPLER =
CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
......
...@@ -23,11 +23,7 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left, ...@@ -23,11 +23,7 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
// Supported data type: half/float // Supported data type: half/float
__kernel void concat_channel( __kernel void concat_channel(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input0, __read_only image2d_t input0,
__read_only image2d_t input1, __read_only image2d_t input1,
__private const int input0_chan, __private const int input0_chan,
...@@ -36,7 +32,7 @@ __kernel void concat_channel( ...@@ -36,7 +32,7 @@ __kernel void concat_channel(
const int width_idx = get_global_id(1); const int width_idx = get_global_id(1);
const int hb_idx = get_global_id(2); const int hb_idx = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
|| hb_idx >= global_size_dim2) { || hb_idx >= global_size_dim2) {
return; return;
...@@ -89,11 +85,7 @@ __kernel void concat_channel( ...@@ -89,11 +85,7 @@ __kernel void concat_channel(
// Required: All input channels are divisible by 4 // Required: All input channels are divisible by 4
__kernel void concat_channel_multi( __kernel void concat_channel_multi(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, __read_only image2d_t input,
__private const int chan_blk_offset, __private const int chan_blk_offset,
__write_only image2d_t output) { __write_only image2d_t output) {
...@@ -101,7 +93,7 @@ __kernel void concat_channel_multi( ...@@ -101,7 +93,7 @@ __kernel void concat_channel_multi(
const int width_idx = get_global_id(1); const int width_idx = get_global_id(1);
const int hb_idx = get_global_id(2); const int hb_idx = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
|| hb_idx >= global_size_dim2) { || hb_idx >= global_size_dim2) {
return; return;
......
#include <common.h> #include <common.h>
__kernel void conv_2d( __kernel void conv_2d(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */ __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
#ifdef BIAS #ifdef BIAS
...@@ -29,7 +25,7 @@ __kernel void conv_2d( ...@@ -29,7 +25,7 @@ __kernel void conv_2d(
const int out_w_blk = get_global_id(1); const int out_w_blk = get_global_id(1);
const int out_hb = get_global_id(2); const int out_hb = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1 if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
|| out_hb >= global_size_dim2) { || out_hb >= global_size_dim2) {
return; return;
......
#include <common.h> #include <common.h>
__kernel void conv_2d_1x1( __kernel void conv_2d_1x1(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin, cout/4 */ __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
#ifdef BIAS #ifdef BIAS
...@@ -23,7 +19,7 @@ __kernel void conv_2d_1x1( ...@@ -23,7 +19,7 @@ __kernel void conv_2d_1x1(
const int out_w_blk = get_global_id(1); const int out_w_blk = get_global_id(1);
const int out_hb = get_global_id(2); const int out_hb = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1 if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
|| out_hb >= global_size_dim2) { || out_hb >= global_size_dim2) {
return; return;
......
#include <common.h> #include <common.h>
__kernel void conv_2d_3x3( __kernel void conv_2d_3x3(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */ __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */
#ifdef BIAS #ifdef BIAS
...@@ -27,7 +23,7 @@ __kernel void conv_2d_3x3( ...@@ -27,7 +23,7 @@ __kernel void conv_2d_3x3(
const int out_w_blk = get_global_id(1); const int out_w_blk = get_global_id(1);
const int out_hb = get_global_id(2); const int out_hb = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1 if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
|| out_hb >= global_size_dim2) { || out_hb >= global_size_dim2) {
return; return;
......
#include <common.h> #include <common.h>
__kernel void depth_to_space( __kernel void depth_to_space(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, __read_only image2d_t input,
__private const int block_size, __private const int block_size,
__private const int output_depth, __private const int output_depth,
...@@ -14,7 +10,7 @@ __kernel void depth_to_space( ...@@ -14,7 +10,7 @@ __kernel void depth_to_space(
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_h = get_global_id(2); const int out_h = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (out_d >= global_size_dim0 || out_w >= global_size_dim1 if (out_d >= global_size_dim0 || out_w >= global_size_dim1
|| out_h >= global_size_dim2) { || out_h >= global_size_dim2) {
return; return;
...@@ -43,11 +39,7 @@ __kernel void depth_to_space( ...@@ -43,11 +39,7 @@ __kernel void depth_to_space(
} }
__kernel void space_to_depth( __kernel void space_to_depth(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, __read_only image2d_t input,
__private const int block_size, __private const int block_size,
__private const int input_depth, __private const int input_depth,
...@@ -57,7 +49,7 @@ __kernel void space_to_depth( ...@@ -57,7 +49,7 @@ __kernel void space_to_depth(
const int w = get_global_id(1); const int w = get_global_id(1);
const int h = get_global_id(2); const int h = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (d >= global_size_dim0 || w >= global_size_dim1 if (d >= global_size_dim0 || w >= global_size_dim1
|| h >= global_size_dim2) { || h >= global_size_dim2) {
return; return;
......
...@@ -2,11 +2,7 @@ ...@@ -2,11 +2,7 @@
// Only multiplier = 1 is supported // Only multiplier = 1 is supported
__kernel void depthwise_conv2d( __kernel void depthwise_conv2d(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
#ifdef BIAS #ifdef BIAS
...@@ -29,7 +25,7 @@ __kernel void depthwise_conv2d( ...@@ -29,7 +25,7 @@ __kernel void depthwise_conv2d(
const short out_w_blk = get_global_id(1); const short out_w_blk = get_global_id(1);
const short out_hb = get_global_id(2); const short out_hb = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1 if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
|| out_hb >= global_size_dim2) { || out_hb >= global_size_dim2) {
return; return;
...@@ -143,11 +139,7 @@ __kernel void depthwise_conv2d( ...@@ -143,11 +139,7 @@ __kernel void depthwise_conv2d(
} }
__kernel void depthwise_conv2d_s1( __kernel void depthwise_conv2d_s1(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
#ifdef BIAS #ifdef BIAS
...@@ -168,7 +160,7 @@ __kernel void depthwise_conv2d_s1( ...@@ -168,7 +160,7 @@ __kernel void depthwise_conv2d_s1(
const short out_w_blk = get_global_id(1) << 2; const short out_w_blk = get_global_id(1) << 2;
const short out_hb = get_global_id(2); const short out_hb = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (out_ch_blk >= global_size_dim0 || get_global_id(1) >= global_size_dim1 if (out_ch_blk >= global_size_dim0 || get_global_id(1) >= global_size_dim1
|| out_hb >= global_size_dim2) { || out_hb >= global_size_dim2) {
return; return;
......
#include <common.h> #include <common.h>
__kernel void eltwise( __kernel void eltwise(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t input1, __read_only image2d_t input1,
#ifdef COEFF_SUM #ifdef COEFF_SUM
...@@ -15,7 +12,7 @@ __kernel void eltwise( ...@@ -15,7 +12,7 @@ __kernel void eltwise(
const int w = get_global_id(0); const int w = get_global_id(0);
const int hb = get_global_id(1); const int hb = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || hb >= global_size_dim1) return; if (w >= global_size_dim0 || hb >= global_size_dim1) return;
#endif #endif
......
...@@ -2,10 +2,7 @@ ...@@ -2,10 +2,7 @@
// C = A * B // C = A * B
__kernel void matmul( __kernel void matmul(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__read_only image2d_t A, __read_only image2d_t A,
__read_only image2d_t B, __read_only image2d_t B,
__write_only image2d_t C, __write_only image2d_t C,
...@@ -17,7 +14,7 @@ __kernel void matmul( ...@@ -17,7 +14,7 @@ __kernel void matmul(
const int gx = get_global_id(0) << 2; const int gx = get_global_id(0) << 2;
const int hb = get_global_id(1); const int hb = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (get_global_id(0) >= global_size_dim0 || hb >= global_size_dim1) return; if (get_global_id(0) >= global_size_dim0 || hb >= global_size_dim1) return;
#endif #endif
......
...@@ -20,11 +20,7 @@ inline int calculate_avg_block_size(const int pool_size, ...@@ -20,11 +20,7 @@ inline int calculate_avg_block_size(const int pool_size,
// Supported data type: half/float // Supported data type: half/float
__kernel void pooling( __kernel void pooling(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, __read_only image2d_t input,
__private const int in_height, __private const int in_height,
__private const int in_width, __private const int in_width,
...@@ -39,7 +35,7 @@ __kernel void pooling( ...@@ -39,7 +35,7 @@ __kernel void pooling(
const int out_width_idx = get_global_id(1); const int out_width_idx = get_global_id(1);
const int out_hb_idx = get_global_id(2); const int out_hb_idx = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (out_chan_idx >= global_size_dim0 || out_width_idx >= global_size_dim1 if (out_chan_idx >= global_size_dim0 || out_width_idx >= global_size_dim1
|| out_hb_idx >= global_size_dim2) { || out_hb_idx >= global_size_dim2) {
return; return;
......
#include <common.h> #include <common.h>
__kernel void resize_bilinear_nocache( __kernel void resize_bilinear_nocache(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__write_only image2d_t output, __write_only image2d_t output,
__private const float height_scale, __private const float height_scale,
...@@ -18,7 +14,7 @@ __kernel void resize_bilinear_nocache( ...@@ -18,7 +14,7 @@ __kernel void resize_bilinear_nocache(
const int w = get_global_id(1); const int w = get_global_id(1);
const int hb = get_global_id(2); const int hb = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (ch_blk >= global_size_dim0 || w >= global_size_dim1 if (ch_blk >= global_size_dim0 || w >= global_size_dim1
|| hb >= global_size_dim2) { || hb >= global_size_dim2) {
return; return;
......
#include <common.h> #include <common.h>
__kernel void slice( __kernel void slice(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, __read_only image2d_t input,
__private const int chan_blk_offset, __private const int chan_blk_offset,
__write_only image2d_t output) { __write_only image2d_t output) {
...@@ -13,7 +9,7 @@ __kernel void slice( ...@@ -13,7 +9,7 @@ __kernel void slice(
const int width_idx = get_global_id(1); const int width_idx = get_global_id(1);
const int hb_idx = get_global_id(2); const int hb_idx = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
|| hb_idx >= global_size_dim2) { || hb_idx >= global_size_dim2) {
return; return;
......
#include <common.h> #include <common.h>
__kernel void softmax( __kernel void softmax(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, __read_only image2d_t input,
__private const int channels, __private const int channels,
__private const int remain_channels, __private const int remain_channels,
...@@ -14,7 +10,7 @@ __kernel void softmax( ...@@ -14,7 +10,7 @@ __kernel void softmax(
const int width_idx = get_global_id(1); const int width_idx = get_global_id(1);
const int hb_idx = get_global_id(2); const int hb_idx = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
|| hb_idx >= global_size_dim2) { || hb_idx >= global_size_dim2) {
return; return;
......
#include <common.h> #include <common.h>
__kernel void space_to_batch( __kernel void space_to_batch(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t space_data, __read_only image2d_t space_data,
__write_only image2d_t batch_data, __write_only image2d_t batch_data,
__private const int block_height, __private const int block_height,
...@@ -20,7 +16,7 @@ __kernel void space_to_batch( ...@@ -20,7 +16,7 @@ __kernel void space_to_batch(
const int batch_w_idx = get_global_id(1); const int batch_w_idx = get_global_id(1);
const int batch_hb_idx = get_global_id(2); const int batch_hb_idx = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1 if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
|| batch_hb_idx >= global_size_dim2) { || batch_hb_idx >= global_size_dim2) {
return; return;
...@@ -53,11 +49,7 @@ __kernel void space_to_batch( ...@@ -53,11 +49,7 @@ __kernel void space_to_batch(
} }
__kernel void batch_to_space( __kernel void batch_to_space(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t batch_data, __read_only image2d_t batch_data,
__write_only image2d_t space_data, __write_only image2d_t space_data,
__private const int block_height, __private const int block_height,
...@@ -72,7 +64,7 @@ __kernel void batch_to_space( ...@@ -72,7 +64,7 @@ __kernel void batch_to_space(
const int batch_w_idx = get_global_id(1); const int batch_w_idx = get_global_id(1);
const int batch_hb_idx = get_global_id(2); const int batch_hb_idx = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1 if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
|| batch_hb_idx >= global_size_dim2) { || batch_hb_idx >= global_size_dim2) {
return; return;
......
#include <common.h> #include <common.h>
__kernel void winograd_transform_2x2( __kernel void winograd_transform_2x2(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__read_only image2d_t input, __read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const int in_height, __private const int in_height,
...@@ -17,7 +14,7 @@ __kernel void winograd_transform_2x2( ...@@ -17,7 +14,7 @@ __kernel void winograd_transform_2x2(
int out_width_idx = get_global_id(0); int out_width_idx = get_global_id(0);
int chan_blk_idx = get_global_id(1); int chan_blk_idx = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) { if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) {
return; return;
} }
...@@ -120,10 +117,7 @@ __kernel void winograd_transform_2x2( ...@@ -120,10 +117,7 @@ __kernel void winograd_transform_2x2(
} }
__kernel void winograd_inverse_transform_2x2( __kernel void winograd_inverse_transform_2x2(
#ifndef USE_QUALCOMM_OPENCL_2_0 UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__read_only image2d_t input, __read_only image2d_t input,
#ifdef BIAS #ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */ __read_only image2d_t bias, /* cout%4 * cout/4 */
...@@ -137,7 +131,7 @@ __kernel void winograd_inverse_transform_2x2( ...@@ -137,7 +131,7 @@ __kernel void winograd_inverse_transform_2x2(
const int width_idx = get_global_id(0); const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1); const int height_idx = get_global_id(1);
#ifndef USE_QUALCOMM_OPENCL_2_0 #ifndef NON_UNIFORM_WORK_GROUP
if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) { if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
return; return;
} }
......
...@@ -18,7 +18,6 @@ static void Concat2(cl::Kernel *kernel, ...@@ -18,7 +18,6 @@ static void Concat2(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) { uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
...@@ -34,13 +33,11 @@ static void Concat2(cl::Kernel *kernel, ...@@ -34,13 +33,11 @@ static void Concat2(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name); built_options.emplace("-Dconcat_channel=" + kernel_name);
if (*is_non_uniform_work_groups_supported) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
if (input0->dtype() == output->dtype()) { if (input0->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
...@@ -53,10 +50,13 @@ static void Concat2(cl::Kernel *kernel, ...@@ -53,10 +50,13 @@ static void Concat2(cl::Kernel *kernel,
built_options.emplace("-DDIVISIBLE_FOUR"); built_options.emplace("-DDIVISIBLE_FOUR");
} }
*kernel = runtime->BuildKernel("concat", kernel_name, built_options); *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
} }
if (!IsVecEqual(*prev_input_shape, input0->shape())) { if (!IsVecEqual(*prev_input_shape, input0->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (!(*is_non_uniform_work_groups_supported)) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]); kernel->setArg(idx++, gws[2]);
...@@ -70,9 +70,6 @@ static void Concat2(cl::Kernel *kernel, ...@@ -70,9 +70,6 @@ static void Concat2(cl::Kernel *kernel,
*(static_cast<cl::Image2D *>(output->opencl_image()))); *(static_cast<cl::Image2D *>(output->opencl_image())));
*prev_input_shape = input0->shape(); *prev_input_shape = input0->shape();
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
} }
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1}; const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
...@@ -87,7 +84,6 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -87,7 +84,6 @@ static void ConcatN(cl::Kernel *kernel,
const DataType dt, const DataType dt,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) { uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
...@@ -97,17 +93,17 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -97,17 +93,17 @@ static void ConcatN(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name); built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
if (*is_non_uniform_work_groups_supported) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
*kernel = runtime->BuildKernel("concat", kernel_name, built_options); *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
} }
const int inputs_count = input_list.size(); const int inputs_count = input_list.size();
...@@ -121,7 +117,7 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -121,7 +117,7 @@ static void ConcatN(cl::Kernel *kernel,
}; };
uint32_t idx = 0; uint32_t idx = 0;
if (!(*is_non_uniform_work_groups_supported)) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]); kernel->setArg(idx++, gws[2]);
...@@ -131,8 +127,6 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -131,8 +127,6 @@ static void ConcatN(cl::Kernel *kernel,
kernel->setArg(idx++, *(output->opencl_image())); kernel->setArg(idx++, *(output->opencl_image()));
chan_blk_offset += input_channel_blk; chan_blk_offset += input_channel_blk;
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1}; const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_" ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_"
...@@ -178,13 +172,12 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -178,13 +172,12 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
switch (inputs_count) { switch (inputs_count) {
case 2: case 2:
Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value, Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value,
&input_shape_, output, future, &input_shape_, output, future, &kwg_size_);
&is_non_uniform_work_groups_supported_, &kwg_size_);
break; break;
default: default:
if (divisible_four) { if (divisible_four) {
ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future, ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future,
&is_non_uniform_work_groups_supported_, &kwg_size_); &kwg_size_);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
......
...@@ -21,7 +21,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -21,7 +21,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size); uint32_t *kwg_size);
extern void Conv2dOpenclK3x3(cl::Kernel *kernel, extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
...@@ -37,7 +36,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -37,7 +36,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size); uint32_t *kwg_size);
extern void Conv2dOpencl(cl::Kernel *kernel, extern void Conv2dOpencl(cl::Kernel *kernel,
...@@ -53,7 +51,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -53,7 +51,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size); uint32_t *kwg_size);
template <typename T> template <typename T>
...@@ -68,7 +65,7 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -68,7 +65,7 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const int *dilations, const ActivationType activation, const int *dilations, const ActivationType activation,
const float relux_max_limit, const DataType dt, const float relux_max_limit, const DataType dt,
std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future, std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
bool *is_non_uniform_work_groups_supported, uint32_t *kwg_size); uint32_t *kwg_size);
// Selection matrix: kernel_size x stride_size // Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5] = { static const Conv2dOpenclFunction selector[5] = {
Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr}; Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
...@@ -109,12 +106,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -109,12 +106,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_, dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future, DataTypeToEnum<T>::value, &input_shape_, output, future,
&is_non_uniform_work_groups_supported_, &kwg_size_); &kwg_size_);
} else { } else {
Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_, dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future, DataTypeToEnum<T>::value, &input_shape_, output, future,
&is_non_uniform_work_groups_supported_, &kwg_size_); &kwg_size_);
} }
} }
......
...@@ -23,7 +23,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -23,7 +23,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) { uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
...@@ -41,8 +40,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -41,8 +40,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
MACE_CHECK(input_batch == batch); MACE_CHECK(input_batch == batch);
std::set<std::string> built_options; std::set<std::string> built_options;
...@@ -50,8 +47,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -50,8 +47,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
built_options.emplace("-Dconv_2d_1x1=" + kernel_name); built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (*is_non_uniform_work_groups_supported) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
if (bias != nullptr) { if (bias != nullptr) {
built_options.emplace("-DBIAS"); built_options.emplace("-DBIAS");
...@@ -76,6 +73,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -76,6 +73,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
} }
*kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options); *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
...@@ -84,7 +84,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -84,7 +84,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (!(*is_non_uniform_work_groups_supported)) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]); kernel->setArg(idx++, gws[2]);
...@@ -105,9 +105,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -105,9 +105,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
kernel->setArg(idx++, stride); kernel->setArg(idx++, stride);
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
} }
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1}; const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
......
...@@ -25,7 +25,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -25,7 +25,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) { uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
...@@ -40,15 +39,13 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -40,15 +39,13 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
built_options.emplace("-Dconv_2d_3x3=" + kernel_name); built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (*is_non_uniform_work_groups_supported) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
...@@ -71,6 +68,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -71,6 +68,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
} }
*kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options); *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
...@@ -79,7 +79,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -79,7 +79,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (!(*is_non_uniform_work_groups_supported)) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]); kernel->setArg(idx++, gws[2]);
...@@ -103,9 +103,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -103,9 +103,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
kernel->setArg(idx++, dilations[1]); kernel->setArg(idx++, dilations[1]);
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
} }
const std::vector<uint32_t> lws = {4, *kwg_size / 32, 8, 1}; const std::vector<uint32_t> lws = {4, *kwg_size / 32, 8, 1};
......
...@@ -25,7 +25,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -25,7 +25,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) { uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
...@@ -40,15 +39,13 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -40,15 +39,13 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
built_options.emplace("-Dconv_2d=" + kernel_name); built_options.emplace("-Dconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (*is_non_uniform_work_groups_supported) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
...@@ -71,6 +68,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -71,6 +68,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
} }
*kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options); *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
...@@ -79,7 +79,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -79,7 +79,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (!(*is_non_uniform_work_groups_supported)) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]); kernel->setArg(idx++, gws[2]);
...@@ -105,9 +105,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -105,9 +105,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
kernel->setArg(idx++, dilations[1]); kernel->setArg(idx++, dilations[1]);
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
} }
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1}; const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
......
...@@ -48,8 +48,6 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -48,8 +48,6 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
...@@ -58,11 +56,14 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -58,11 +56,14 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_non_uniform_work_groups_supported_) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
kernel_ = kernel_ =
runtime->BuildKernel("depth_to_space", kernel_name, built_options); runtime->BuildKernel("depth_to_space", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
uint32_t gws[3]; uint32_t gws[3];
...@@ -83,7 +84,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -83,7 +84,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
} }
uint32_t idx = 0; uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
...@@ -94,9 +95,6 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -94,9 +95,6 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape(); input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1}; const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
......
...@@ -24,7 +24,6 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -24,7 +24,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future, StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) { uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
...@@ -45,8 +44,6 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -45,8 +44,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) { if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
...@@ -55,8 +52,8 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -55,8 +52,8 @@ void DepthwiseConv2d(cl::Kernel *kernel,
} else { } else {
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
} }
if (*is_non_uniform_work_groups_supported) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
...@@ -83,6 +80,9 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -83,6 +80,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
*kernel = *kernel =
runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options); runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
} }
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
const index_t input_batch = input->dim(0); const index_t input_batch = input->dim(0);
...@@ -97,7 +97,7 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -97,7 +97,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
input_channels); input_channels);
uint32_t idx = 0; uint32_t idx = 0;
if (!(*is_non_uniform_work_groups_supported)) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]); kernel->setArg(idx++, gws[2]);
...@@ -124,9 +124,6 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -124,9 +124,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
} }
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
} }
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1}; const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
...@@ -185,7 +182,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -185,7 +182,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_, dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future, DataTypeToEnum<T>::value, &input_shape_, output, future,
&is_non_uniform_work_groups_supported_, &kwg_size_); &kwg_size_);
} }
template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>; template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;
......
...@@ -30,8 +30,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -30,8 +30,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
...@@ -39,15 +37,18 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -39,15 +37,18 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
if (is_non_uniform_work_groups_supported_) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options); kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
if (!IsVecEqual(input_shape_, input0->shape())) { if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
} }
...@@ -60,9 +61,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -60,9 +61,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape(); input_shape_ = input0->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1}; const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
......
...@@ -200,8 +200,6 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -200,8 +200,6 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
StatsFuture *future) { StatsFuture *future) {
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size = const uint32_t kwg_size =
...@@ -239,7 +237,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -239,7 +237,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
<< "Tuning parameters of 3D kernel must be 4D"; << "Tuning parameters of 3D kernel must be 4D";
cl_int error = CL_SUCCESS; cl_int error = CL_SUCCESS;
std::vector<uint32_t> roundup_gws(3); std::vector<uint32_t> roundup_gws(3);
if (!is_non_uniform_work_groups_supported) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
for (size_t i = 0; i < 3; ++i) { for (size_t i = 0; i < 3; ++i) {
roundup_gws[i] = RoundUp(gws[i], params[i]); roundup_gws[i] = RoundUp(gws[i], params[i]);
} }
...@@ -252,7 +250,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -252,7 +250,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = uint32_t gws2 =
(i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
if (is_non_uniform_work_groups_supported) { if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NDRange(0, 0, i * block_size), kernel, cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2), cl::NDRange(gws[0], gws[1], gws2),
...@@ -268,7 +266,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -268,7 +266,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
} }
} else { } else {
timer->ClearTiming(); timer->ClearTiming();
if (is_non_uniform_work_groups_supported) { if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event); cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
...@@ -293,7 +291,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -293,7 +291,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = uint32_t gws2 =
(i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
if (is_non_uniform_work_groups_supported) { if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NDRange(0, 0, i * block_size), kernel, cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2), cl::NDRange(gws[0], gws[1], gws2),
...@@ -332,8 +330,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -332,8 +330,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
StatsFuture *future) { StatsFuture *future) {
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size = const uint32_t kwg_size =
...@@ -359,7 +355,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -359,7 +355,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
<< "Tuning parameters of 2D kernel must be 3d"; << "Tuning parameters of 2D kernel must be 3d";
cl_int error = CL_SUCCESS; cl_int error = CL_SUCCESS;
std::vector<uint32_t> roundup_gws(2); std::vector<uint32_t> roundup_gws(2);
if (!is_non_uniform_work_groups_supported) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
for (size_t i = 0; i < 2; ++i) { for (size_t i = 0; i < 2; ++i) {
roundup_gws[i] = RoundUp(gws[i], params[i]); roundup_gws[i] = RoundUp(gws[i], params[i]);
} }
...@@ -372,7 +368,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -372,7 +368,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = uint32_t gws1 =
(i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size; (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
if (is_non_uniform_work_groups_supported) { if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1), kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]), nullptr, &event); cl::NDRange(params[0], params[1]), nullptr, &event);
...@@ -387,7 +383,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -387,7 +383,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
} }
} else { } else {
timer->ClearTiming(); timer->ClearTiming();
if (is_non_uniform_work_groups_supported) { if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(params[0], params[1]), nullptr, &event); cl::NDRange(params[0], params[1]), nullptr, &event);
...@@ -411,7 +407,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -411,7 +407,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = uint32_t gws1 =
(i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size; (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
if (is_non_uniform_work_groups_supported) { if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NDRange(0, i * block_size), kernel, cl::NDRange(0, i * block_size),
cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]), cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]),
......
...@@ -34,21 +34,22 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A, ...@@ -34,21 +34,22 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
built_options.emplace("-Dmatmul=" + kernel_name); built_options.emplace("-Dmatmul=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_non_uniform_work_groups_supported_) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options); kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
uint32_t idx = 0; uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
} }
...@@ -61,8 +62,6 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A, ...@@ -61,8 +62,6 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
kernel_.setArg(idx++, static_cast<int>(height_blocks)); kernel_.setArg(idx++, static_cast<int>(height_blocks));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2)))); kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 1}; const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 1};
std::stringstream ss; std::stringstream ss;
ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_" ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
......
...@@ -21,8 +21,6 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -21,8 +21,6 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
const DataType dt = DataTypeToEnum<T>::value; const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
...@@ -39,10 +37,13 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -39,10 +37,13 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (pooling_type_ == AVG) { if (pooling_type_ == AVG) {
built_options.emplace("-DPOOL_AVG"); built_options.emplace("-DPOOL_AVG");
} }
if (is_non_uniform_work_groups_supported_) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options); kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
std::vector<uint32_t> gws; std::vector<uint32_t> gws;
...@@ -81,7 +82,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -81,7 +82,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}; };
uint32_t idx = 0; uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
...@@ -97,9 +98,6 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -97,9 +98,6 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape(); input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} else { } else {
index_t batch = output->dim(0); index_t batch = output->dim(0);
index_t out_height = output->dim(1); index_t out_height = output->dim(1);
......
...@@ -31,19 +31,20 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -31,19 +31,20 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_non_uniform_work_groups_supported_) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
kernel_ = kernel_ =
runtime->BuildKernel("resize_bilinear", kernel_name, built_options); runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
MACE_CHECK(out_height > 0 && out_width > 0); MACE_CHECK(out_height > 0 && out_width > 0);
...@@ -60,7 +61,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -60,7 +61,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
CalculateResizeScale(in_width, out_width, align_corners_); CalculateResizeScale(in_width, out_width, align_corners_);
uint32_t idx = 0; uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
...@@ -74,9 +75,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -74,9 +75,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<int32_t>(out_height)); kernel_.setArg(idx++, static_cast<int32_t>(out_height));
input_shape_ = input->shape(); input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1}; const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
......
...@@ -32,18 +32,19 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -32,18 +32,19 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice");
built_options.emplace("-Dslice=" + kernel_name); built_options.emplace("-Dslice=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" built_options.emplace("-DCMD_DATA_TYPE="
+ DtToCLCMDDt(DataTypeToEnum<T>::value)); + DtToCLCMDDt(DataTypeToEnum<T>::value));
if (is_non_uniform_work_groups_supported_) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
kernel_ = runtime->BuildKernel("slice", kernel_name, built_options); kernel_ = runtime->BuildKernel("slice", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
const index_t channel_blk = RoundUpDiv4(output_channels); const index_t channel_blk = RoundUpDiv4(output_channels);
...@@ -53,8 +54,6 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -53,8 +54,6 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
static_cast<uint32_t>(input->dim(0) * input->dim(1)), static_cast<uint32_t>(input->dim(0) * input->dim(1)),
}; };
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1}; const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "slice_opencl_kernel_" ss << "slice_opencl_kernel_"
...@@ -65,7 +64,7 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -65,7 +64,7 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
<< outputs_count; << outputs_count;
for (int i = 0; i < outputs_count; ++i) { for (int i = 0; i < outputs_count; ++i) {
uint32_t idx = 0; uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
......
...@@ -30,22 +30,23 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, ...@@ -30,22 +30,23 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name); built_options.emplace("-Dsoftmax=" + kernel_name);
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_non_uniform_work_groups_supported_) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options); kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
if (!IsVecEqual(input_shape_, logits->shape())) { if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
...@@ -56,9 +57,6 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, ...@@ -56,9 +57,6 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = logits->shape(); input_shape_ = logits->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1}; const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
......
...@@ -39,8 +39,6 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -39,8 +39,6 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options; std::set<std::string> built_options;
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
...@@ -49,15 +47,18 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -49,15 +47,18 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value)); DtToCLCMDDt(DataTypeToEnum<T>::value));
if (is_non_uniform_work_groups_supported_) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
kernel_ = kernel_ =
runtime->BuildKernel("space_to_batch", kernel_name, built_options); runtime->BuildKernel("space_to_batch", kernel_name, built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
if (!IsVecEqual(space_shape_, space_tensor->shape())) { if (!IsVecEqual(space_shape_, space_tensor->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
...@@ -79,9 +80,6 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -79,9 +80,6 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2))); kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
space_shape_ = space_tensor->shape(); space_shape_ = space_tensor->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1}; const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
......
...@@ -18,8 +18,6 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -18,8 +18,6 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::string obfuscated_kernel_name = std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2"); MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::set<std::string> built_options; std::set<std::string> built_options;
...@@ -28,11 +26,14 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -28,11 +26,14 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
DtToUpstreamCLDt(DataTypeToEnum<T>::value)); DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value)); DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
if (is_non_uniform_work_groups_supported_) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options); built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
std::vector<index_t> output_shape(4); std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1}; std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
...@@ -61,7 +62,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -61,7 +62,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
output_tensor->ResizeImage(output_shape, image_shape); output_tensor->ResizeImage(output_shape, image_shape);
uint32_t idx = 0; uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
} }
...@@ -76,9 +77,6 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -76,9 +77,6 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2)); kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
input_shape_ = input_tensor->shape(); input_shape_ = input_tensor->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1}; const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
...@@ -99,8 +97,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -99,8 +97,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::string obfuscated_kernel_name = std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
std::set<std::string> built_options; std::set<std::string> built_options;
...@@ -110,8 +106,8 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -110,8 +106,8 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
DtToUpstreamCLDt(DataTypeToEnum<T>::value)); DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value)); DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
if (is_non_uniform_work_groups_supported_) { if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
} }
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation_) { switch (activation_) {
...@@ -138,6 +134,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -138,6 +134,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options); built_options);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
const uint32_t gws[2] = { const uint32_t gws[2] = {
...@@ -153,7 +152,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -153,7 +152,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
const uint32_t round_h = (height_ + 1) / 2; const uint32_t round_h = (height_ + 1) / 2;
const uint32_t round_w = (width_ + 1) / 2; const uint32_t round_w = (width_ + 1) / 2;
uint32_t idx = 0; uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) { if (!runtime->IsNonUniformWorkgroupsSupported()) {
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
} }
...@@ -173,9 +172,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -173,9 +172,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, relux_max_limit_); kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input_tensor->shape(); input_shape_ = input_tensor->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} }
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1}; const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
......
...@@ -186,7 +186,6 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase { ...@@ -186,7 +186,6 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -174,7 +174,6 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T> ...@@ -174,7 +174,6 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -62,7 +62,6 @@ struct SliceFunctor<DeviceType::OPENCL, T> { ...@@ -62,7 +62,6 @@ struct SliceFunctor<DeviceType::OPENCL, T> {
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -62,7 +62,6 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> { ...@@ -62,7 +62,6 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -57,7 +57,6 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase { ...@@ -57,7 +57,6 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> space_shape_; std::vector<index_t> space_shape_;
}; };
......
...@@ -52,7 +52,6 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T> ...@@ -52,7 +52,6 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
...@@ -111,7 +110,6 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T> ...@@ -111,7 +110,6 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册