Commit 7645d8d6 authored by Liangliang He

Merge branch 'gpu_queue_window' into 'master'

Add option `opencl_queue_window_size` to keep UI responsiveness.

See merge request !1185
@@ -55,6 +55,13 @@ Try to set `limit_opencl_kernel_time` to `1`. If still not resolved, try to
modify the source code to use even smaller time intervals or change to the CPU
or DSP runtime.
For GPUs such as Arm Mali, sometimes even setting `limit_opencl_kernel_time` to
a small time interval cannot solve the problem. In that case, you can try
setting `opencl_queue_window_size`, for example to 16. This parameter means the
GPU command queue will contain at most `opencl_queue_window_size` commands. You
can adjust this parameter to balance performance against UI responsiveness.
Do not use this parameter unless you have to.
Why is MACE not working on DSP?
------------------------------------------------------------------------------
Running models on the Hexagon DSP requires a few prerequisites for DSP developers:
......
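For illustration, a minimal sketch of how the two options might appear in a model deployment file. Only `limit_opencl_kernel_time` and `opencl_queue_window_size` are documented by this merge request; the model name and the surrounding keys are assumptions.

```yaml
# Hypothetical deployment-file excerpt; keys other than the two OpenCL
# options below are illustrative only.
models:
  mobilenet_v1:                    # hypothetical model name
    runtime: gpu
    limit_opencl_kernel_time: 1    # split kernels to roughly 1 ms each
    opencl_queue_window_size: 16   # keep at most 16 commands queued on the GPU
```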
@@ -94,6 +94,8 @@ in one deployment file.
- [optional] The format of the output tensors, one of [NONE, NHWC, NCHW]. If the output has no data format, use NONE. If only a single format is specified, all outputs will use that format; default is NHWC.
* - limit_opencl_kernel_time
- [optional] Whether to split the OpenCL kernel so that each piece runs within 1 ms to keep UI responsiveness, default is 0.
* - opencl_queue_window_size
- [optional] Limit the maximum number of commands in the OpenCL command queue to keep UI responsiveness, default is 0 (no limit).
* - obfuscate
- [optional] Whether to obfuscate the model operator names, default is 0.
* - winograd
......
@@ -323,7 +323,7 @@ OpenCLRuntime::OpenCLRuntime(
cl_command_queue_properties properties = 0;
const char *profiling = getenv("MACE_OPENCL_PROFILING");
if (IsTuning() ||
if (tuner_->IsTuning() ||
(profiling != nullptr && strlen(profiling) == 1 && profiling[0] == '1')) {
properties |= CL_QUEUE_PROFILING_ENABLE;
is_profiling_enabled_ = true;
......
@@ -96,6 +96,29 @@ std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
return lws;
}
/**
 * For GPUs like Arm Mali, when too many commands are added to the command
 * queue, UI responsiveness may become poor. This function limits the number of
 * commands in the command queue to no more than kQueueWndSize. When
 * opencl_commands >= kQueueWndSize, it waits for the GPU command queue to
 * finish executing.
 *
 * If kQueueWndSize is 0, this function does nothing.
 */
inline void WaitForQueueExecution(OpenCLRuntime *runtime,
const cl::Event &event) {
static const unsigned int kQueueWndSize =
runtime->tuner()->GetOpenclQueueWindowSize();
static thread_local unsigned int opencl_commands = 0;
if (kQueueWndSize > 0) {
opencl_commands++;
if (opencl_commands >= kQueueWndSize) {
event.wait();
opencl_commands = 0;
}
}
}
MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime,
const cl::Kernel &kernel,
const std::string tuning_key,
@@ -176,6 +199,7 @@ MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime,
cl::NDRange(internal_gws[0], internal_gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CL_RET_ERROR(error);
WaitForQueueExecution(runtime, event);
}
} else {
timer->ClearTiming();
@@ -285,6 +309,7 @@ MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime,
cl::NDRange(internal_gws[0], gws1),
cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CL_RET_ERROR(error);
WaitForQueueExecution(runtime, event);
}
} else {
timer->ClearTiming();
......
@@ -548,6 +548,10 @@ int Main(int argc, char **argv) {
LOG(INFO) << "gpu_priority_hint: " << FLAGS_gpu_priority_hint;
LOG(INFO) << "omp_num_threads: " << FLAGS_omp_num_threads;
LOG(INFO) << "cpu_affinity_policy: " << FLAGS_cpu_affinity_policy;
LOG(INFO) << "limit_opencl_kernel_time: "
<< getenv("MACE_LIMIT_OPENCL_KERNEL_TIME");
LOG(INFO) << "opencl_queue_window_size: "
<< getenv("MACE_OPENCL_QUEUE_WINDOW_SIZE");
std::vector<std::string> input_shapes = Split(FLAGS_input_shape, ':');
std::vector<std::string> output_shapes = Split(FLAGS_output_shape, ':');
......
@@ -35,20 +35,38 @@
namespace mace {
inline bool IsTuning() {
constexpr const char *kOpenClWindowSize = "MACE_OPENCL_QUEUE_WINDOW_SIZE";
inline bool GetTuningFromEnv() {
std::string tuning;
GetEnv("MACE_TUNING", &tuning);
return tuning.size() == 1 && tuning[0] == '1';
}
inline unsigned int GetOpenclQueueWindowSizeFromEnv() {
std::string str_size;
GetEnv(kOpenClWindowSize, &str_size);
unsigned int window_size = 0;
if (str_size.size() > 0) {
window_size = std::stoi(str_size);
}
return window_size;
}
template <typename param_type>
class Tuner {
public:
explicit Tuner(const std::string tuned_param_file_path = "",
const unsigned char *param_byte_stream = nullptr,
const size_t param_byte_stream_size = 0):
const size_t param_byte_stream_size = 0) :
tuned_param_file_path_(tuned_param_file_path) {
GetEnv("MACE_RUN_PARAMETER_PATH", &path_);
is_tuning_ = GetTuningFromEnv();
if (is_tuning_) {
unsigned int wnd_size = GetOpenclQueueWindowSizeFromEnv();
param_table_[kOpenClWindowSize] = {wnd_size};
}
if (param_byte_stream != nullptr && param_byte_stream_size != 0) {
ParseData(param_byte_stream, param_byte_stream_size);
} else {
@@ -92,6 +110,19 @@ class Tuner {
}
}
unsigned int GetOpenclQueueWindowSize() {
unsigned int window_size = 0;
if (!IsTuning()
&& param_table_.find(kOpenClWindowSize) != param_table_.end()) {
window_size = param_table_[kOpenClWindowSize][0];
}
return window_size;
}
bool IsTuning() {
return is_tuning_;
}
private:
void WriteRunParameters() {
if (!path_.empty()) {
@@ -239,6 +270,8 @@ class Tuner {
std::string tuned_param_file_path_;
std::string path_;
std::unordered_map<std::string, std::vector<param_type>> param_table_;
unsigned int opencl_queue_window_size_;
bool is_tuning_;
};
} // namespace mace
......
@@ -407,6 +407,7 @@ class YAMLKeyword(object):
input_data_formats = 'input_data_formats'
output_data_formats = 'output_data_formats'
limit_opencl_kernel_time = 'limit_opencl_kernel_time'
opencl_queue_window_size = 'opencl_queue_window_size'
nnlib_graph_mode = 'nnlib_graph_mode'
obfuscate = 'obfuscate'
winograd = 'winograd'
......
@@ -656,6 +656,7 @@ def format_model_config(flags):
accuracy_validation_script
for key in [YAMLKeyword.limit_opencl_kernel_time,
YAMLKeyword.opencl_queue_window_size,
YAMLKeyword.nnlib_graph_mode,
YAMLKeyword.obfuscate,
YAMLKeyword.winograd,
......
@@ -171,6 +171,7 @@ class DeviceWrapper:
running_round,
restart_round,
limit_opencl_kernel_time,
opencl_queue_window_size,
tuning,
out_of_range_check,
model_graph_format,
@@ -312,6 +313,7 @@ class DeviceWrapper:
"MACE_RUN_PARAMETER_PATH=%s/mace_run.config" % self.data_dir,
"MACE_INTERNAL_STORAGE_PATH=%s" % internal_storage_dir,
"MACE_LIMIT_OPENCL_KERNEL_TIME=%s" % limit_opencl_kernel_time,
"MACE_OPENCL_QUEUE_WINDOW_SIZE=%s" % opencl_queue_window_size,
"MACE_RUNTIME_FAILURE_RATIO=%f" % runtime_failure_ratio,
"MACE_LOG_TENSOR_RANGE=%d" % (1 if quantize_stat else 0),
]
@@ -429,6 +431,8 @@ class DeviceWrapper:
restart_round=1,
limit_opencl_kernel_time=model_config[
YAMLKeyword.limit_opencl_kernel_time],
opencl_queue_window_size=model_config[
YAMLKeyword.opencl_queue_window_size],
tuning=True,
out_of_range_check=False,
model_graph_format=model_graph_format,
@@ -541,6 +545,8 @@ class DeviceWrapper:
restart_round=flags.restart_round,
limit_opencl_kernel_time=model_config[
YAMLKeyword.limit_opencl_kernel_time],
opencl_queue_window_size=model_config[
YAMLKeyword.opencl_queue_window_size],
tuning=False,
out_of_range_check=flags.gpu_out_of_range_check,
model_graph_format=configs[
......