提交 ed267833 编写于 作者: Y yejianwu

format code and reduce get kwg size

上级 44d4903d
......@@ -147,16 +147,9 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
if (device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU) {
*device_ = device;
gpu_detected = true;
const std::string device_name = device.getInfo<CL_DEVICE_NAME>();
constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
constexpr const char *kMaliGPUStr = "Mali";
if (device_name == kQualcommAdrenoGPUStr) {
gpu_type_ = GPU_TYPE::QUALCOMM_ADRENO;
} else if (device_name.find(kMaliGPUStr) != std::string::npos) {
gpu_type_ = GPU_TYPE::MALI;
} else {
gpu_type_ = GPU_TYPE::UNKNOWN;
}
gpu_type_ = ParseGPUTypeFromDeviceName(device_name);
const std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
opencl_version_ = device_version.substr(7, 3);
......@@ -178,7 +171,7 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
}
cl_int err;
if (gpu_type_ == GPU_TYPE::QUALCOMM_ADRENO) {
if (gpu_type_ == GPUType::QUALCOMM_ADRENO) {
std::vector<cl_context_properties> context_properties;
context_properties.reserve(5);
GetAdrenoContextProperties(&context_properties, gpu_perf_hint,
......@@ -357,12 +350,30 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
return size;
}
const GPU_TYPE OpenCLRuntime::GetGPUType() const {
return gpu_type_;
// Reports whether kernels may be enqueued with non-uniform work-groups.
// The answer is derived purely from state captured when the runtime was
// constructed: it is true only when the detected GPU is a Qualcomm Adreno
// and the stored OpenCL version string is exactly "2.0".
const bool OpenCLRuntime::IsNonUniformWorkgroupsSupported() {
  const bool is_adreno = (gpu_type_ == GPUType::QUALCOMM_ADRENO);
  return is_adreno && opencl_version_ == "2.0";
}
const std::string &OpenCLRuntime::GetOpenclVersion() const {
return opencl_version_;
// Classifies a GPU from its device name string.
//
// @param device_name  Device name to classify.
// @return QUALCOMM_ADRENO when the name equals "QUALCOMM Adreno(TM)" exactly;
//         MALI when the name contains "Mali"; PowerVR when it contains
//         "PowerVR"; UNKNOWN otherwise. Checks run in that order, so a name
//         containing both "Mali" and "PowerVR" is reported as MALI.
const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName(
    const std::string &device_name) {
  constexpr const char *kAdrenoDeviceName = "QUALCOMM Adreno(TM)";
  constexpr const char *kMaliTag = "Mali";
  constexpr const char *kPowerVRTag = "PowerVR";

  // Substring test shared by the vendors that are matched by tag.
  const auto name_contains = [&device_name](const char *tag) {
    return device_name.find(tag) != std::string::npos;
  };

  // Adreno is matched on the whole name, not as a substring.
  if (device_name == kAdrenoDeviceName) {
    return GPUType::QUALCOMM_ADRENO;
  }
  if (name_contains(kMaliTag)) {
    return GPUType::MALI;
  }
  if (name_contains(kPowerVRTag)) {
    return GPUType::PowerVR;
  }
  return GPUType::UNKNOWN;
}
} // namespace mace
......@@ -18,9 +18,10 @@
namespace mace {
enum GPU_TYPE {
// GPU families the OpenCL runtime distinguishes between.
// The numeric values are spelled out explicitly; they match the values the
// enumerators would receive implicitly from their declaration order.
enum GPUType {
  QUALCOMM_ADRENO = 0,  // Qualcomm Adreno GPUs.
  MALI = 1,             // Mali GPUs.
  PowerVR = 2,          // PowerVR GPUs.
  UNKNOWN = 3,          // Fallback for any device name not recognized.
};
......@@ -55,8 +56,8 @@ class OpenCLRuntime {
uint64_t GetDeviceMaxWorkGroupSize();
uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
const GPU_TYPE GetGPUType() const;
const std::string &GetOpenclVersion() const;
const bool IsNonUniformWorkgroupsSupported();
const GPUType ParseGPUTypeFromDeviceName(const std::string &device_name);
cl::Kernel BuildKernel(const std::string &program_name,
const std::string &kernel_name,
const std::set<std::string> &build_options);
......@@ -82,7 +83,7 @@ class OpenCLRuntime {
std::map<std::string, cl::Program> built_program_map_;
std::mutex program_build_mutex_;
std::string kernel_path_;
GPU_TYPE gpu_type_;
GPUType gpu_type_;
std::string opencl_version_;
static GPUPerfHint gpu_perf_hint_;
......
......@@ -155,6 +155,8 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
ActivationType activation_;
T relux_max_limit_;
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::string tuning_key_prefix_;
std::vector<index_t> input_shape_;
};
......
......@@ -90,6 +90,8 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -157,6 +157,8 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -64,6 +64,8 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -56,6 +56,8 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
const int groups_;
std::vector<index_t> input_shape_;
};
......
......@@ -85,6 +85,8 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -401,6 +401,8 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -108,6 +108,8 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
const int block_size_;
bool d2s_;
std::vector<index_t> input_shape_;
......
......@@ -437,6 +437,8 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -97,6 +97,8 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -241,6 +241,8 @@ struct MatMulFunctor<DeviceType::OPENCL, T> {
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
};
} // namespace kernels
......
......@@ -26,16 +26,16 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
switch (activation_) {
......@@ -83,11 +83,12 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
......
......@@ -26,8 +26,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
for (int i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]);
MACE_CHECK(batch == input_tensors[i]->dim(0));
......@@ -37,6 +35,8 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
}
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
......@@ -47,7 +47,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
......@@ -78,11 +78,12 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, gws[1]);
input_shape_ = input_tensors[0]->shape();
}
const uint32_t kwg_size =
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {kwg_size / 16, 16, 1};
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
std::stringstream ss;
ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
<< "_" << output_shape[2] << "_" << output_shape[3];
......
......@@ -36,16 +36,17 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
if (folded_constant_) {
......@@ -89,11 +90,12 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
std::string tuning_key =
Concat("batch_norm_opencl_kernel_", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3), folded_constant_);
......
......@@ -29,16 +29,16 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
......@@ -52,15 +52,16 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8};
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8};
cl::Event event;
cl_int error;
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported_) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
......
......@@ -62,14 +62,15 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
const bool is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
if (buffer->dtype() == image->dtype()) {
......@@ -115,7 +116,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
cl::Event event;
cl_int error;
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported) {
error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
......
......@@ -36,16 +36,16 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name,
......@@ -63,11 +63,12 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
std::stringstream ss;
ss << "channel_shuffle_opencl_kernel_"
<< output->dim(0) << "_"
......
......@@ -17,7 +17,9 @@ static void Concat2(cl::Kernel *kernel,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future) {
StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -31,13 +33,13 @@ static void Concat2(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name);
if (is_qualcomm_opencl200) {
if (*is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
if (input0->dtype() == output->dtype()) {
......@@ -66,11 +68,12 @@ static void Concat2(cl::Kernel *kernel,
kernel->setArg(idx++, gws[2]);
*prev_input_shape = input0->shape();
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
std::stringstream ss;
ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
......@@ -81,7 +84,9 @@ static void ConcatN(cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output,
StatsFuture *future) {
StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -89,15 +94,15 @@ static void ConcatN(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
if (is_qualcomm_opencl200) {
if (*is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
*kernel = runtime->BuildKernel("concat", kernel_name, built_options);
......@@ -122,9 +127,9 @@ static void ConcatN(cl::Kernel *kernel,
kernel->setArg(idx++, gws[2]);
chan_blk_offset += input_channel_blk;
const uint32_t kwg_size =
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
std::stringstream ss;
ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_"
<< batch * height;
......@@ -169,11 +174,13 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
switch (inputs_count) {
case 2:
Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value,
&input_shape_, output, future);
&input_shape_, output, future,
&is_non_uniform_work_groups_supported_, &kwg_size_);
break;
default:
if (divisible_four) {
ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future);
ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future,
&is_non_uniform_work_groups_supported_, &kwg_size_);
} else {
MACE_NOT_IMPLEMENTED;
}
......
......@@ -20,7 +20,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future);
StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size);
extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const Tensor *input,
......@@ -34,7 +36,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future);
StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size);
extern void Conv2dOpencl(cl::Kernel *kernel,
const Tensor *input,
......@@ -48,7 +52,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future);
StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size);
template <typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
......@@ -61,7 +67,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *bias, const int stride, const int *padding,
const int *dilations, const ActivationType activation,
const float relux_max_limit, const DataType dt,
std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future);
std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
bool *is_non_uniform_work_groups_supported, uint32_t *kwg_size);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5] = {
Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
......@@ -101,11 +108,13 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto conv2d_func = selector[kernel_h - 1];
conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future);
DataTypeToEnum<T>::value, &input_shape_, output, future,
&is_non_uniform_work_groups_supported_, &kwg_size_);
} else {
Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future);
DataTypeToEnum<T>::value, &input_shape_, output, future,
&is_non_uniform_work_groups_supported_, &kwg_size_);
}
}
......
......@@ -22,7 +22,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future) {
StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -38,9 +40,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
MACE_CHECK(input_batch == batch);
std::set<std::string> built_options;
......@@ -48,7 +50,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) {
if (*is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
if (bias != nullptr) {
......@@ -101,11 +103,12 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
kernel->setArg(idx++, gws[2]);
*prev_input_shape = input->shape();
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
std::string tuning_key =
Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
......
......@@ -24,7 +24,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future) {
StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -37,15 +39,15 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) {
if (*is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
......@@ -99,11 +101,12 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
kernel->setArg(idx++, gws[2]);
*prev_input_shape = input->shape();
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
const std::vector<uint32_t> lws = {4, kwg_size / 32, 8, 1};
const std::vector<uint32_t> lws = {4, *kwg_size / 32, 8, 1};
std::string tuning_key =
Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
......
......@@ -24,7 +24,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future) {
StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -37,15 +39,15 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
built_options.emplace("-Dconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) {
if (*is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
......@@ -101,11 +103,12 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
kernel->setArg(idx++, gws[2]);
*prev_input_shape = input->shape();
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
std::string tuning_key =
Concat("conv2d_general_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
......
......@@ -47,9 +47,9 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss;
......@@ -58,7 +58,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
kernel_ =
......@@ -93,11 +93,12 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
}
......
......@@ -23,7 +23,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future) {
StatsFuture *future,
bool *is_non_uniform_work_groups_supported,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -42,9 +44,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel->get() == nullptr) {
*is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
......@@ -53,7 +55,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
} else {
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
}
if (is_qualcomm_opencl200) {
if (*is_non_uniform_work_groups_supported) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
......@@ -118,12 +120,14 @@ void DepthwiseConv2d(cl::Kernel *kernel,
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
*prev_input_shape = input->shape();
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation,
batch, height, width, channels, multiplier);
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
......@@ -178,7 +182,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future);
DataTypeToEnum<T>::value, &input_shape_, output, future,
&is_non_uniform_work_groups_supported_, &kwg_size_);
}
template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;
......
......@@ -29,9 +29,9 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
......@@ -39,7 +39,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
......@@ -56,12 +56,14 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
input_shape_ = input0->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {kwg_size / 16, 16, 1};
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
std::stringstream ss;
ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
......
......@@ -194,24 +194,14 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
}
}
// Returns true only when the global OpenCL runtime reports a Qualcomm Adreno
// GPU and an OpenCL version string that is exactly "2.0".
// The version is queried only if the GPU type check passes.
const bool IsQualcommOpenCL200() {
  auto ocl_runtime = OpenCLRuntime::Global();
  const bool is_adreno =
      ocl_runtime->GetGPUType() == GPU_TYPE::QUALCOMM_ADRENO;
  return is_adreno && ocl_runtime->GetOpenclVersion() == "2.0";
}
void TuningOrRun3DKernel(const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
const std::vector<uint32_t> &lws,
StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
const bool is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size =
......@@ -249,7 +239,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
<< "Tuning parameters of 3D kernel must be 4D";
cl_int error = CL_SUCCESS;
std::vector<uint32_t> roundup_gws(3);
if (!is_qualcomm_opencl200) {
if (!is_non_uniform_work_groups_supported) {
for (size_t i = 0; i < 3; ++i) {
roundup_gws[i] = RoundUp(gws[i], params[i]);
}
......@@ -262,7 +252,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 =
(i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
......@@ -278,7 +268,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
}
} else {
timer->ClearTiming();
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
......@@ -303,7 +293,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 =
(i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
......@@ -342,7 +332,8 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
const std::vector<uint32_t> &lws,
StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
const bool is_non_uniform_work_groups_supported =
runtime->IsNonUniformWorkgroupsSupported();
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size =
......@@ -368,7 +359,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
<< "Tuning parameters of 2D kernel must be 3d";
cl_int error = CL_SUCCESS;
std::vector<uint32_t> roundup_gws(2);
if (!is_qualcomm_opencl200) {
if (!is_non_uniform_work_groups_supported) {
for (size_t i = 0; i < 2; ++i) {
roundup_gws[i] = RoundUp(gws[i], params[i]);
}
......@@ -381,7 +372,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 =
(i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]), nullptr, &event);
......@@ -396,7 +387,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
}
} else {
timer->ClearTiming();
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(params[0], params[1]), nullptr, &event);
......@@ -420,7 +411,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 =
(i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NDRange(0, i * block_size),
cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]),
......
......@@ -102,8 +102,6 @@ std::string Concat(Args... args) {
return ss.str();
}
const bool IsQualcommOpenCL200();
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_HELPER_H_
......@@ -33,16 +33,16 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
built_options.emplace("-Dmatmul=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
......@@ -59,9 +59,9 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
const uint32_t kwg_size =
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {kwg_size / 64, 64, 1};
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 1};
std::stringstream ss;
ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
<< C->dim(2) << "_" << C->dim(3);
......
......@@ -20,9 +20,9 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
......@@ -39,13 +39,13 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (pooling_type_ == AVG) {
built_options.emplace("-DPOOL_AVG");
}
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
}
uint32_t gws[3];
std::vector<uint32_t> gws;
if (!IsVecEqual(input_shape_, input->shape())) {
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {kernels_[0], kernels_[1],
......@@ -75,9 +75,10 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
index_t channel_blocks = (channels + 3) / 4;
gws[0] = static_cast<uint32_t>(channel_blocks);
gws[1] = static_cast<uint32_t>(out_width);
gws[2] = static_cast<uint32_t>(batch * out_height);
gws = {
static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(batch * out_height),
};
uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
......@@ -94,26 +95,16 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape();
} else {
index_t batch = output->dim(0);
index_t out_height = output->dim(1);
index_t out_width = output->dim(2);
index_t channels = output->dim(3);
index_t channel_blocks = (channels + 3) / 4;
gws[0] = static_cast<uint32_t>(channel_blocks);
gws[1] = static_cast<uint32_t>(out_width);
gws[2] = static_cast<uint32_t>(batch * out_height);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
std::stringstream ss;
ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
TuningOrRun3DKernel(kernel_, ss.str(), gws.data(), lws, future);
}
template struct PoolingFunctor<DeviceType::OPENCL, float>;
......
......@@ -30,16 +30,16 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
kernel_ =
......@@ -72,11 +72,12 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
std::stringstream ss;
ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
<< output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
......
......@@ -31,16 +31,16 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice");
built_options.emplace("-Dslice=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE="
+ DtToCLCMDDt(DataTypeToEnum<T>::value));
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
kernel_ = runtime->BuildKernel("slice", kernel_name, built_options);
......@@ -53,9 +53,9 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
static_cast<uint32_t>(input->dim(0) * input->dim(1)),
};
const uint32_t kwg_size =
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
std::stringstream ss;
ss << "slice_opencl_kernel_"
<< input->dim(0) << "_"
......
......@@ -29,16 +29,16 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
......@@ -52,12 +52,14 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
input_shape_ = logits->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
std::stringstream ss;
ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
......
......@@ -38,9 +38,9 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
std::stringstream kernel_name_ss;
......@@ -49,7 +49,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
kernel_ =
......@@ -77,11 +77,12 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, gws[2]);
space_shape_ = space_tensor->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
std::stringstream ss;
ss << kernel_name << "_" << batch_tensor->dim(0) << "_"
<< batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_"
......
......@@ -17,9 +17,9 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::set<std::string> built_options;
......@@ -28,7 +28,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
......@@ -74,11 +74,12 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, gws[1]);
input_shape_ = input_tensor->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {kwg_size / 8, 8, 1};
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
std::stringstream ss;
ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_"
<< input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
......@@ -95,9 +96,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) {
is_non_uniform_work_groups_supported_ =
runtime->IsNonUniformWorkgroupsSupported();
std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
std::set<std::string> built_options;
......@@ -107,7 +108,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
if (is_qualcomm_opencl200) {
if (is_non_uniform_work_groups_supported_) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
......@@ -168,11 +169,12 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, gws[1]);
input_shape_ = input_tensor->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {kwg_size / 8, 8, 1};
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
std::stringstream ss;
ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_"
......
......@@ -185,6 +185,8 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -173,6 +173,8 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -61,6 +61,8 @@ struct SliceFunctor<DeviceType::OPENCL, T> {
const std::vector<Tensor *> &output_list,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
};
} // namespace kernels
......
......@@ -61,6 +61,8 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -56,6 +56,8 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> space_shape_;
};
......
......@@ -51,6 +51,8 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......@@ -108,6 +110,8 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
bool is_non_uniform_work_groups_supported_;
std::vector<index_t> input_shape_;
};
......
......@@ -43,6 +43,10 @@ else
HEXAGON_MODE_BUILD_FLAG="--define hexagon=true"
fi
# Set the bazel NEON define only when the target ABI is arm64-v8a.
# NOTE(review): for other ABIs NEON_ENABLE_FLAG is presumably left unset, so
# its unquoted expansion in the bazel command adds no argument — confirm
# nothing earlier in the script assigns it.
if [ x"$TARGET_ABI" = x"arm64-v8a" ]; then
NEON_ENABLE_FLAG="--define neon=true"
fi
bazel build --verbose_failures -c opt --strip always //mace/examples:mace_run \
--crosstool_top=//external:android/crosstool \
--host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
......@@ -54,6 +58,7 @@ else
--copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
--define openmp=true \
--copt="-O3" \
$NEON_ENABLE_FLAG \
$PRODUCTION_MODE_BUILD_FLAGS \
$HEXAGON_MODE_BUILD_FLAG || exit 1
fi
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册