提交 ed267833 编写于 作者: Y yejianwu

format code and reduce get kwg size

上级 44d4903d
...@@ -147,16 +147,9 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint, ...@@ -147,16 +147,9 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
if (device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU) { if (device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU) {
*device_ = device; *device_ = device;
gpu_detected = true; gpu_detected = true;
const std::string device_name = device.getInfo<CL_DEVICE_NAME>(); const std::string device_name = device.getInfo<CL_DEVICE_NAME>();
constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)"; gpu_type_ = ParseGPUTypeFromDeviceName(device_name);
constexpr const char *kMaliGPUStr = "Mali";
if (device_name == kQualcommAdrenoGPUStr) {
gpu_type_ = GPU_TYPE::QUALCOMM_ADRENO;
} else if (device_name.find(kMaliGPUStr) != std::string::npos) {
gpu_type_ = GPU_TYPE::MALI;
} else {
gpu_type_ = GPU_TYPE::UNKNOWN;
}
const std::string device_version = device.getInfo<CL_DEVICE_VERSION>(); const std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
opencl_version_ = device_version.substr(7, 3); opencl_version_ = device_version.substr(7, 3);
...@@ -178,7 +171,7 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint, ...@@ -178,7 +171,7 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
} }
cl_int err; cl_int err;
if (gpu_type_ == GPU_TYPE::QUALCOMM_ADRENO) { if (gpu_type_ == GPUType::QUALCOMM_ADRENO) {
std::vector<cl_context_properties> context_properties; std::vector<cl_context_properties> context_properties;
context_properties.reserve(5); context_properties.reserve(5);
GetAdrenoContextProperties(&context_properties, gpu_perf_hint, GetAdrenoContextProperties(&context_properties, gpu_perf_hint,
...@@ -357,12 +350,30 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) { ...@@ -357,12 +350,30 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
return size; return size;
} }
const GPU_TYPE OpenCLRuntime::GetGPUType() const { const bool OpenCLRuntime::IsNonUniformWorkgroupsSupported() {
return gpu_type_; if (gpu_type_ == GPUType::QUALCOMM_ADRENO &&
opencl_version_ == "2.0") {
return true;
} else {
return false;
}
} }
// Maps an OpenCL device name onto a GPUType. Checks run in this order and
// the first match wins:
//   - name equals "QUALCOMM Adreno(TM)" exactly -> GPUType::QUALCOMM_ADRENO
//   - name contains "Mali"                      -> GPUType::MALI
//   - name contains "PowerVR"                   -> GPUType::PowerVR
//   - anything else                             -> GPUType::UNKNOWN
const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName(
    const std::string &device_name) {
  constexpr const char *kAdrenoName = "QUALCOMM Adreno(TM)";
  constexpr const char *kMaliTag = "Mali";
  constexpr const char *kPowerVRTag = "PowerVR";

  // Adreno is matched on the full name; the other families on a substring.
  if (device_name == kAdrenoName) {
    return GPUType::QUALCOMM_ADRENO;
  }
  if (device_name.find(kMaliTag) != std::string::npos) {
    return GPUType::MALI;
  }
  if (device_name.find(kPowerVRTag) != std::string::npos) {
    return GPUType::PowerVR;
  }
  return GPUType::UNKNOWN;
}
} // namespace mace } // namespace mace
...@@ -18,9 +18,10 @@ ...@@ -18,9 +18,10 @@
namespace mace { namespace mace {
// GPU families the OpenCL runtime distinguishes between. UNKNOWN is the
// fallback for a device that matches none of the named families.
// The numeric values are spelled out but equal the implicit ones.
enum GPUType {
  QUALCOMM_ADRENO = 0,
  MALI = 1,
  PowerVR = 2,
  UNKNOWN = 3,
};
...@@ -55,8 +56,8 @@ class OpenCLRuntime { ...@@ -55,8 +56,8 @@ class OpenCLRuntime {
uint64_t GetDeviceMaxWorkGroupSize(); uint64_t GetDeviceMaxWorkGroupSize();
uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel); uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
uint64_t GetKernelWaveSize(const cl::Kernel &kernel); uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
const GPU_TYPE GetGPUType() const; const bool IsNonUniformWorkgroupsSupported();
const std::string &GetOpenclVersion() const; const GPUType ParseGPUTypeFromDeviceName(const std::string &device_name);
cl::Kernel BuildKernel(const std::string &program_name, cl::Kernel BuildKernel(const std::string &program_name,
const std::string &kernel_name, const std::string &kernel_name,
const std::set<std::string> &build_options); const std::set<std::string> &build_options);
...@@ -82,7 +83,7 @@ class OpenCLRuntime { ...@@ -82,7 +83,7 @@ class OpenCLRuntime {
std::map<std::string, cl::Program> built_program_map_; std::map<std::string, cl::Program> built_program_map_;
std::mutex program_build_mutex_; std::mutex program_build_mutex_;
std::string kernel_path_; std::string kernel_path_;
GPU_TYPE gpu_type_; GPUType gpu_type_;
std::string opencl_version_; std::string opencl_version_;
static GPUPerfHint gpu_perf_hint_; static GPUPerfHint gpu_perf_hint_;
......
...@@ -155,6 +155,8 @@ class ActivationFunctor<DeviceType::OPENCL, T> { ...@@ -155,6 +155,8 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
ActivationType activation_; ActivationType activation_;
T relux_max_limit_; T relux_max_limit_;
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::string tuning_key_prefix_; std::string tuning_key_prefix_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -90,6 +90,8 @@ struct AddNFunctor<DeviceType::OPENCL, T> { ...@@ -90,6 +90,8 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -157,6 +157,8 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase { ...@@ -157,6 +157,8 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -64,6 +64,8 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> { ...@@ -64,6 +64,8 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -56,6 +56,8 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> { ...@@ -56,6 +56,8 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *input, Tensor *output, StatsFuture *future); void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
const int groups_; const int groups_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -85,6 +85,8 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase { ...@@ -85,6 +85,8 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -401,6 +401,8 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase { ...@@ -401,6 +401,8 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -108,6 +108,8 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> { ...@@ -108,6 +108,8 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *input, Tensor *output, StatsFuture *future); void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
const int block_size_; const int block_size_;
bool d2s_; bool d2s_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
......
...@@ -437,6 +437,8 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T> ...@@ -437,6 +437,8 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -97,6 +97,8 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase { ...@@ -97,6 +97,8 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -241,6 +241,8 @@ struct MatMulFunctor<DeviceType::OPENCL, T> { ...@@ -241,6 +241,8 @@ struct MatMulFunctor<DeviceType::OPENCL, T> {
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -26,16 +26,16 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -26,16 +26,16 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name); built_options.emplace("-Dactivation=" + kernel_name);
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
switch (activation_) { switch (activation_) {
...@@ -83,11 +83,12 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -83,11 +83,12 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape(); input_shape_ = input->shape();
}
const uint32_t kwg_size = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1}; }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
std::string tuning_key = std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3)); output->dim(3));
......
...@@ -26,8 +26,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -26,8 +26,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
for (int i = 1; i < size; ++i) { for (int i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]); MACE_CHECK_NOTNULL(input_tensors[i]);
MACE_CHECK(batch == input_tensors[i]->dim(0)); MACE_CHECK(batch == input_tensors[i]->dim(0));
...@@ -37,6 +35,8 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -37,6 +35,8 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
} }
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
if (input_tensors.size() > 4) { if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -47,7 +47,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -47,7 +47,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size())); built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
...@@ -78,11 +78,12 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -78,11 +78,12 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
input_shape_ = input_tensors[0]->shape(); input_shape_ = input_tensors[0]->shape();
}
const uint32_t kwg_size = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {kwg_size / 16, 16, 1}; }
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
std::stringstream ss; std::stringstream ss;
ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1] ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
<< "_" << output_shape[2] << "_" << output_shape[3]; << "_" << output_shape[2] << "_" << output_shape[3];
......
...@@ -36,16 +36,17 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -36,16 +36,17 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name); built_options.emplace("-Dbatch_norm=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
if (folded_constant_) { if (folded_constant_) {
...@@ -89,11 +90,12 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -89,11 +90,12 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape(); input_shape_ = input->shape();
}
const uint32_t kwg_size = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1}; }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
std::string tuning_key = std::string tuning_key =
Concat("batch_norm_opencl_kernel_", activation_, output->dim(0), Concat("batch_norm_opencl_kernel_", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3), folded_constant_); output->dim(1), output->dim(2), output->dim(3), folded_constant_);
......
...@@ -29,16 +29,16 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -29,16 +29,16 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name); built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options); kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
...@@ -52,15 +52,16 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -52,15 +52,16 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape(); input_shape_ = input->shape();
}
const uint32_t kwg_size = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8}; }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8};
cl::Event event; cl::Event event;
cl_int error; cl_int error;
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported_) {
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
......
...@@ -62,14 +62,15 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -62,14 +62,15 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); const bool is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options; std::set<std::string> built_options;
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str()); built_options.emplace(kernel_name_ss.str());
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
if (buffer->dtype() == image->dtype()) { if (buffer->dtype() == image->dtype()) {
...@@ -115,7 +116,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -115,7 +116,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
cl::Event event; cl::Event event;
cl_int error; cl_int error;
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported) {
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event); cl::NDRange(lws[0], lws[1]), nullptr, &event);
......
...@@ -36,16 +36,16 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -36,16 +36,16 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name); built_options.emplace("-Dchannel_shuffle=" + kernel_name);
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name, kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name,
...@@ -63,11 +63,12 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -63,11 +63,12 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape(); input_shape_ = input->shape();
}
const uint32_t kwg_size = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1}; }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "channel_shuffle_opencl_kernel_" ss << "channel_shuffle_opencl_kernel_"
<< output->dim(0) << "_" << output->dim(0) << "_"
......
...@@ -17,7 +17,9 @@ static void Concat2(cl::Kernel *kernel, ...@@ -17,7 +17,9 @@ static void Concat2(cl::Kernel *kernel,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -31,13 +33,13 @@ static void Concat2(cl::Kernel *kernel, ...@@ -31,13 +33,13 @@ static void Concat2(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name); built_options.emplace("-Dconcat_channel=" + kernel_name);
if (is_qualcomm_opencl200) { if (*is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
if (input0->dtype() == output->dtype()) { if (input0->dtype() == output->dtype()) {
...@@ -66,11 +68,12 @@ static void Concat2(cl::Kernel *kernel, ...@@ -66,11 +68,12 @@ static void Concat2(cl::Kernel *kernel,
kernel->setArg(idx++, gws[2]); kernel->setArg(idx++, gws[2]);
*prev_input_shape = input0->shape(); *prev_input_shape = input0->shape();
}
const uint32_t kwg_size = *kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1}; }
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3); << "_" << output->dim(2) << "_" << output->dim(3);
...@@ -81,7 +84,9 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -81,7 +84,9 @@ static void ConcatN(cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list, const std::vector<const Tensor *> &input_list,
const DataType dt, const DataType dt,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -89,15 +94,15 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -89,15 +94,15 @@ static void ConcatN(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name); built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
if (is_qualcomm_opencl200) { if (*is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
*kernel = runtime->BuildKernel("concat", kernel_name, built_options); *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
...@@ -122,9 +127,9 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -122,9 +127,9 @@ static void ConcatN(cl::Kernel *kernel,
kernel->setArg(idx++, gws[2]); kernel->setArg(idx++, gws[2]);
chan_blk_offset += input_channel_blk; chan_blk_offset += input_channel_blk;
const uint32_t kwg_size = *kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1}; const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_" ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_"
<< batch * height; << batch * height;
...@@ -169,11 +174,13 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -169,11 +174,13 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
switch (inputs_count) { switch (inputs_count) {
case 2: case 2:
Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value, Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value,
&input_shape_, output, future); &input_shape_, output, future,
&is_non_uniform_work_groups_supported_, &kwg_size_);
break; break;
default: default:
if (divisible_four) { if (divisible_four) {
ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future); ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future,
&is_non_uniform_work_groups_supported_, &kwg_size_);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
......
...@@ -20,7 +20,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -20,7 +20,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size);
extern void Conv2dOpenclK3x3(cl::Kernel *kernel, extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
...@@ -34,7 +36,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -34,7 +36,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size);
extern void Conv2dOpencl(cl::Kernel *kernel, extern void Conv2dOpencl(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
...@@ -48,7 +52,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -48,7 +52,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size);
template <typename T> template <typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...@@ -61,7 +67,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -61,7 +67,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *bias, const int stride, const int *padding, const Tensor *bias, const int stride, const int *padding,
const int *dilations, const ActivationType activation, const int *dilations, const ActivationType activation,
const float relux_max_limit, const DataType dt, const float relux_max_limit, const DataType dt,
std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future); std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
bool *is_non_uniform_work_groups_supported, uint32_t *kwg_size);
// Selection matrix: kernel_size x stride_size // Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5] = { static const Conv2dOpenclFunction selector[5] = {
Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr}; Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
...@@ -101,11 +108,13 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -101,11 +108,13 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto conv2d_func = selector[kernel_h - 1]; auto conv2d_func = selector[kernel_h - 1];
conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_, dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future); DataTypeToEnum<T>::value, &input_shape_, output, future,
&is_non_uniform_work_groups_supported_, &kwg_size_);
} else { } else {
Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_, dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future); DataTypeToEnum<T>::value, &input_shape_, output, future,
&is_non_uniform_work_groups_supported_, &kwg_size_);
} }
} }
......
...@@ -22,7 +22,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -22,7 +22,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -38,9 +40,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -38,9 +40,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
MACE_CHECK(input_batch == batch); MACE_CHECK(input_batch == batch);
std::set<std::string> built_options; std::set<std::string> built_options;
...@@ -48,7 +50,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -48,7 +50,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
built_options.emplace("-Dconv_2d_1x1=" + kernel_name); built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) { if (*is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
if (bias != nullptr) { if (bias != nullptr) {
...@@ -101,11 +103,12 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -101,11 +103,12 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
kernel->setArg(idx++, gws[2]); kernel->setArg(idx++, gws[2]);
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
}
const uint32_t kwg_size = *kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1}; }
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
std::string tuning_key = std::string tuning_key =
Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0), Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3)); output->dim(1), output->dim(2), output->dim(3));
......
...@@ -24,7 +24,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -24,7 +24,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -37,15 +39,15 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -37,15 +39,15 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
built_options.emplace("-Dconv_2d_3x3=" + kernel_name); built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) { if (*is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
...@@ -99,11 +101,12 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -99,11 +101,12 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
kernel->setArg(idx++, gws[2]); kernel->setArg(idx++, gws[2]);
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
}
const uint32_t kwg_size = *kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
const std::vector<uint32_t> lws = {4, kwg_size / 32, 8, 1}; }
const std::vector<uint32_t> lws = {4, *kwg_size / 32, 8, 1};
std::string tuning_key = std::string tuning_key =
Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0), Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3)); output->dim(1), output->dim(2), output->dim(3));
......
...@@ -24,7 +24,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -24,7 +24,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -37,15 +39,15 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -37,15 +39,15 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
built_options.emplace("-Dconv_2d=" + kernel_name); built_options.emplace("-Dconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) { if (*is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
...@@ -101,11 +103,12 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -101,11 +103,12 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
kernel->setArg(idx++, gws[2]); kernel->setArg(idx++, gws[2]);
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
}
const uint32_t kwg_size = *kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1}; }
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
std::string tuning_key = std::string tuning_key =
Concat("conv2d_general_opencl_kernel_", activation, output->dim(0), Concat("conv2d_general_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3)); output->dim(1), output->dim(2), output->dim(3));
......
...@@ -47,9 +47,9 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -47,9 +47,9 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
...@@ -58,7 +58,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -58,7 +58,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
kernel_ = kernel_ =
...@@ -93,11 +93,12 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -93,11 +93,12 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape(); input_shape_ = input->shape();
}
const uint32_t kwg_size = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1}; }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
} }
......
...@@ -23,7 +23,9 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -23,7 +23,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -42,9 +44,9 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -42,9 +44,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) { if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
...@@ -53,7 +55,7 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -53,7 +55,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
} else { } else {
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
} }
if (is_qualcomm_opencl200) { if (*is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
...@@ -118,12 +120,14 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -118,12 +120,14 @@ void DepthwiseConv2d(cl::Kernel *kernel,
kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]); kernel->setArg(idx++, gws[2]);
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
}
const uint32_t kwg_size = *kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1}; }
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation, std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation,
batch, height, width, channels, multiplier); batch, height, width, channels, multiplier);
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
...@@ -178,7 +182,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -178,7 +182,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_, dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future); DataTypeToEnum<T>::value, &input_shape_, output, future,
&is_non_uniform_work_groups_supported_, &kwg_size_);
} }
template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>; template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;
......
...@@ -29,9 +29,9 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -29,9 +29,9 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
...@@ -39,7 +39,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -39,7 +39,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
...@@ -56,12 +56,14 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -56,12 +56,14 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
input_shape_ = input0->shape(); input_shape_ = input0->shape();
}
const uint32_t kwg_size = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {kwg_size / 16, 16, 1}; }
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
std::stringstream ss; std::stringstream ss;
ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3); << "_" << output->dim(2) << "_" << output->dim(3);
......
...@@ -194,24 +194,14 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) { ...@@ -194,24 +194,14 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
} }
} }
const bool IsQualcommOpenCL200() {
auto runtime = OpenCLRuntime::Global();
if (runtime->GetGPUType() == GPU_TYPE::QUALCOMM_ADRENO &&
runtime->GetOpenclVersion() == "2.0") {
return true;
} else {
return false;
}
}
void TuningOrRun3DKernel(const cl::Kernel &kernel, void TuningOrRun3DKernel(const cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
StatsFuture *future) { StatsFuture *future) {
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); const bool is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size = const uint32_t kwg_size =
...@@ -249,7 +239,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -249,7 +239,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
<< "Tuning parameters of 3D kernel must be 4D"; << "Tuning parameters of 3D kernel must be 4D";
cl_int error = CL_SUCCESS; cl_int error = CL_SUCCESS;
std::vector<uint32_t> roundup_gws(3); std::vector<uint32_t> roundup_gws(3);
if (!is_qualcomm_opencl200) { if (!is_non_uniform_work_groups_supported) {
for (size_t i = 0; i < 3; ++i) { for (size_t i = 0; i < 3; ++i) {
roundup_gws[i] = RoundUp(gws[i], params[i]); roundup_gws[i] = RoundUp(gws[i], params[i]);
} }
...@@ -262,7 +252,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -262,7 +252,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = uint32_t gws2 =
(i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported) {
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NDRange(0, 0, i * block_size), kernel, cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2), cl::NDRange(gws[0], gws[1], gws2),
...@@ -278,7 +268,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -278,7 +268,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
} }
} else { } else {
timer->ClearTiming(); timer->ClearTiming();
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported) {
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event); cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
...@@ -303,7 +293,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -303,7 +293,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = uint32_t gws2 =
(i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported) {
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NDRange(0, 0, i * block_size), kernel, cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2), cl::NDRange(gws[0], gws[1], gws2),
...@@ -342,7 +332,8 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -342,7 +332,8 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
StatsFuture *future) { StatsFuture *future) {
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); const bool is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size = const uint32_t kwg_size =
...@@ -368,7 +359,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -368,7 +359,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
<< "Tuning parameters of 2D kernel must be 3d"; << "Tuning parameters of 2D kernel must be 3d";
cl_int error = CL_SUCCESS; cl_int error = CL_SUCCESS;
std::vector<uint32_t> roundup_gws(2); std::vector<uint32_t> roundup_gws(2);
if (!is_qualcomm_opencl200) { if (!is_non_uniform_work_groups_supported) {
for (size_t i = 0; i < 2; ++i) { for (size_t i = 0; i < 2; ++i) {
roundup_gws[i] = RoundUp(gws[i], params[i]); roundup_gws[i] = RoundUp(gws[i], params[i]);
} }
...@@ -381,7 +372,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -381,7 +372,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = uint32_t gws1 =
(i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size; (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported) {
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1), kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]), nullptr, &event); cl::NDRange(params[0], params[1]), nullptr, &event);
...@@ -396,7 +387,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -396,7 +387,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
} }
} else { } else {
timer->ClearTiming(); timer->ClearTiming();
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported) {
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(params[0], params[1]), nullptr, &event); cl::NDRange(params[0], params[1]), nullptr, &event);
...@@ -420,7 +411,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -420,7 +411,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = uint32_t gws1 =
(i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size; (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported) {
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NDRange(0, i * block_size), kernel, cl::NDRange(0, i * block_size),
cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]), cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]),
......
...@@ -102,8 +102,6 @@ std::string Concat(Args... args) { ...@@ -102,8 +102,6 @@ std::string Concat(Args... args) {
return ss.str(); return ss.str();
} }
const bool IsQualcommOpenCL200();
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
#endif // MACE_KERNELS_OPENCL_HELPER_H_ #endif // MACE_KERNELS_OPENCL_HELPER_H_
...@@ -33,16 +33,16 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A, ...@@ -33,16 +33,16 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
built_options.emplace("-Dmatmul=" + kernel_name); built_options.emplace("-Dmatmul=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options); kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
...@@ -59,9 +59,9 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A, ...@@ -59,9 +59,9 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
const uint32_t kwg_size = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {kwg_size / 64, 64, 1}; const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 1};
std::stringstream ss; std::stringstream ss;
ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_" ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
<< C->dim(2) << "_" << C->dim(3); << C->dim(2) << "_" << C->dim(3);
......
...@@ -20,9 +20,9 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -20,9 +20,9 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
const DataType dt = DataTypeToEnum<T>::value; const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
...@@ -39,13 +39,13 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -39,13 +39,13 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (pooling_type_ == AVG) { if (pooling_type_ == AVG) {
built_options.emplace("-DPOOL_AVG"); built_options.emplace("-DPOOL_AVG");
} }
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options); kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
} }
uint32_t gws[3]; std::vector<uint32_t> gws;
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
std::vector<index_t> output_shape(4); std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {kernels_[0], kernels_[1], std::vector<index_t> filter_shape = {kernels_[0], kernels_[1],
...@@ -75,9 +75,10 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -75,9 +75,10 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
index_t channel_blocks = (channels + 3) / 4; index_t channel_blocks = (channels + 3) / 4;
gws[0] = static_cast<uint32_t>(channel_blocks); gws = {
gws[1] = static_cast<uint32_t>(out_width); static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
gws[2] = static_cast<uint32_t>(batch * out_height); static_cast<uint32_t>(batch * out_height),
};
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
...@@ -94,26 +95,16 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -94,26 +95,16 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape(); input_shape_ = input->shape();
} else {
index_t batch = output->dim(0);
index_t out_height = output->dim(1);
index_t out_width = output->dim(2);
index_t channels = output->dim(3);
index_t channel_blocks = (channels + 3) / 4;
gws[0] = static_cast<uint32_t>(channel_blocks); kwg_size_ =
gws[1] = static_cast<uint32_t>(out_width); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
gws[2] = static_cast<uint32_t>(batch * out_height);
} }
const uint32_t kwg_size = std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3); << "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, ss.str(), gws.data(), lws, future);
} }
template struct PoolingFunctor<DeviceType::OPENCL, float>; template struct PoolingFunctor<DeviceType::OPENCL, float>;
......
...@@ -30,16 +30,16 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -30,16 +30,16 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
kernel_ = kernel_ =
...@@ -72,11 +72,12 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -72,11 +72,12 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape(); input_shape_ = input->shape();
}
const uint32_t kwg_size = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1}; }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_" ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
<< output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
......
...@@ -31,16 +31,16 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -31,16 +31,16 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice");
built_options.emplace("-Dslice=" + kernel_name); built_options.emplace("-Dslice=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" built_options.emplace("-DCMD_DATA_TYPE="
+ DtToCLCMDDt(DataTypeToEnum<T>::value)); + DtToCLCMDDt(DataTypeToEnum<T>::value));
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
kernel_ = runtime->BuildKernel("slice", kernel_name, built_options); kernel_ = runtime->BuildKernel("slice", kernel_name, built_options);
...@@ -53,9 +53,9 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -53,9 +53,9 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
static_cast<uint32_t>(input->dim(0) * input->dim(1)), static_cast<uint32_t>(input->dim(0) * input->dim(1)),
}; };
const uint32_t kwg_size = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1}; const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "slice_opencl_kernel_" ss << "slice_opencl_kernel_"
<< input->dim(0) << "_" << input->dim(0) << "_"
......
...@@ -29,16 +29,16 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, ...@@ -29,16 +29,16 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name); built_options.emplace("-Dsoftmax=" + kernel_name);
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options); kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
...@@ -52,12 +52,14 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, ...@@ -52,12 +52,14 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
input_shape_ = logits->shape(); input_shape_ = logits->shape();
}
const uint32_t kwg_size = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1}; }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3); << "_" << output->dim(2) << "_" << output->dim(3);
......
...@@ -38,9 +38,9 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -38,9 +38,9 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options; std::set<std::string> built_options;
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
...@@ -49,7 +49,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -49,7 +49,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value)); DtToCLCMDDt(DataTypeToEnum<T>::value));
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
kernel_ = kernel_ =
...@@ -77,11 +77,12 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -77,11 +77,12 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, gws[2]); kernel_.setArg(idx++, gws[2]);
space_shape_ = space_tensor->shape(); space_shape_ = space_tensor->shape();
}
const uint32_t kwg_size = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1}; }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << kernel_name << "_" << batch_tensor->dim(0) << "_" ss << kernel_name << "_" << batch_tensor->dim(0) << "_"
<< batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_" << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_"
......
...@@ -17,9 +17,9 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -17,9 +17,9 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::string obfuscated_kernel_name = std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2"); MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::set<std::string> built_options; std::set<std::string> built_options;
...@@ -28,7 +28,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -28,7 +28,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
DtToUpstreamCLDt(DataTypeToEnum<T>::value)); DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value)); DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
...@@ -74,11 +74,12 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -74,11 +74,12 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
input_shape_ = input_tensor->shape(); input_shape_ = input_tensor->shape();
}
const uint32_t kwg_size = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {kwg_size / 8, 8, 1}; }
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_" ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_"
<< input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_" << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
...@@ -95,9 +96,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -95,9 +96,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::string obfuscated_kernel_name = std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
std::set<std::string> built_options; std::set<std::string> built_options;
...@@ -107,7 +108,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -107,7 +108,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
DtToUpstreamCLDt(DataTypeToEnum<T>::value)); DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value)); DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
if (is_qualcomm_opencl200) { if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
} }
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
...@@ -168,11 +169,12 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -168,11 +169,12 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[1]);
input_shape_ = input_tensor->shape(); input_shape_ = input_tensor->shape();
}
const uint32_t kwg_size = kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {kwg_size / 8, 8, 1}; }
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_" ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_"
......
...@@ -185,6 +185,8 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase { ...@@ -185,6 +185,8 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -173,6 +173,8 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T> ...@@ -173,6 +173,8 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
void operator()(const Tensor *input, Tensor *output, StatsFuture *future); void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -61,6 +61,8 @@ struct SliceFunctor<DeviceType::OPENCL, T> { ...@@ -61,6 +61,8 @@ struct SliceFunctor<DeviceType::OPENCL, T> {
const std::vector<Tensor *> &output_list, const std::vector<Tensor *> &output_list,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -61,6 +61,8 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> { ...@@ -61,6 +61,8 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *logits, Tensor *output, StatsFuture *future); void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -56,6 +56,8 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase { ...@@ -56,6 +56,8 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> space_shape_; std::vector<index_t> space_shape_;
}; };
......
...@@ -51,6 +51,8 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T> ...@@ -51,6 +51,8 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
void operator()(const Tensor *input, Tensor *output, StatsFuture *future); void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
...@@ -108,6 +110,8 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T> ...@@ -108,6 +110,8 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
......
...@@ -43,6 +43,10 @@ else ...@@ -43,6 +43,10 @@ else
HEXAGON_MODE_BUILD_FLAG="--define hexagon=true" HEXAGON_MODE_BUILD_FLAG="--define hexagon=true"
fi fi
if [ x"$TARGET_ABI" = x"arm64-v8a" ]; then
NEON_ENABLE_FLAG="--define neon=true"
fi
bazel build --verbose_failures -c opt --strip always //mace/examples:mace_run \ bazel build --verbose_failures -c opt --strip always //mace/examples:mace_run \
--crosstool_top=//external:android/crosstool \ --crosstool_top=//external:android/crosstool \
--host_crosstool_top=@bazel_tools//tools/cpp:toolchain \ --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
...@@ -54,6 +58,7 @@ else ...@@ -54,6 +58,7 @@ else
--copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \ --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
--define openmp=true \ --define openmp=true \
--copt="-O3" \ --copt="-O3" \
$NEON_ENABLE_FLAG \
$PRODUCTION_MODE_BUILD_FLAGS \ $PRODUCTION_MODE_BUILD_FLAGS \
$HEXAGON_MODE_BUILD_FLAG || exit 1 $HEXAGON_MODE_BUILD_FLAG || exit 1
fi fi
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册