Commit 01b400c8 authored by xiebaiyuan, committed by GitHub

[LITE][OPENCL] optimise tune logic, disabled by default, test=develop (#3576)

Parent cd7b24e4
@@ -157,6 +157,48 @@ cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
                     static_cast<size_t>(gws0)};
#endif
}
cl::NDRange CLContext::LocalWorkSizeTurnReverse(cl::NDRange global_work_size,
                                                size_t max_work_size,
                                                int divisor) {
  size_t preferred_lws = 0;
#if 0
  auto gws0 = global_work_size[0];
  auto gws1 = global_work_size[1];
  auto gws2 = global_work_size[2];
#else
  // Reverse variant: bind the dimensions in the opposite order so the
  // halving below shrinks the other end of the range first.
  auto gws2 = global_work_size[0];
  auto gws1 = global_work_size[1];
  auto gws0 = global_work_size[2];
#endif
  if (divisor > 1) {
    max_work_size /= divisor;
  }
  if (preferred_lws > 0 && preferred_lws <= max_work_size) {
    max_work_size = preferred_lws;
  }
  // Halve one dimension at a time (odd sizes drop to 1) until the product
  // of all three fits within max_work_size.
  while (gws1 > max_work_size && max_work_size > 0) {
    gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1;
  }
  while (gws2 * gws1 > max_work_size && max_work_size > 0) {
    gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1;
  }
  while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) {
    gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1;
  }
#if 0
  return cl::NDRange{static_cast<size_t>(gws0),
                     static_cast<size_t>(gws1),
                     static_cast<size_t>(gws2)};
#else
  return cl::NDRange{static_cast<size_t>(gws2),
                     static_cast<size_t>(gws1),
                     static_cast<size_t>(gws0)};
#endif
}
bool CLContext::IsArmMali() {
  return CLRuntime::Global()->GetGpuType() == GpuType::ARM_MALI;
}
cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,
                                     size_t max_work_size) {
......
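For intuition, here is a minimal standalone sketch of the same reverse halving heuristic on plain integers (`ShrinkReverse` is a hypothetical name used only for illustration; it is not part of `CLContext`):

```cpp
#include <cstddef>
#include <cstdio>

// Illustrative sketch of LocalWorkSizeTurnReverse's core loop: halve one
// dimension at a time (odd sizes drop to 1) until the product of all three
// fits within max_work_size, binding the dimensions in reverse order.
void ShrinkReverse(size_t gws[3], size_t max_work_size, int divisor) {
  if (divisor > 1) max_work_size /= divisor;
  size_t& g2 = gws[0];  // reversed binding, as in the #else branch above
  size_t& g1 = gws[1];
  size_t& g0 = gws[2];
  while (g1 > max_work_size && max_work_size > 0)
    g1 = g1 % 2 == 0 ? g1 / 2 : 1;
  while (g2 * g1 > max_work_size && max_work_size > 0)
    g2 = g2 % 2 == 0 ? g2 / 2 : 1;
  while (g0 * g1 * g2 > max_work_size && max_work_size > 0)
    g0 = g0 % 2 == 0 ? g0 / 2 : 1;
}

int main() {
  size_t gws[3] = {64, 56, 56};
  ShrinkReverse(gws, 256, 2);  // effective cap: 256 / 2 = 128 work-items
  std::printf("%zu %zu %zu\n", gws[0], gws[1], gws[2]);  // prints: 2 56 1
}
```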
@@ -66,6 +66,10 @@ class CLContext {
  cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size,
                                size_t max_work_size,
                                int divisor = 2);
  cl::NDRange LocalWorkSizeTurnReverse(cl::NDRange global_work_size,
                                       size_t max_work_size,
                                       int divisor = 2);
  bool IsArmMali();
  // cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size,
  //                                  size_t max_work_size);
......
@@ -191,6 +191,9 @@ bool CLRuntime::InitializeDevice() {
    }
    return t_str;
  };
  const std::string device_version = device_->getInfo<CL_DEVICE_VERSION>();
  LOG(INFO) << "device_version:" << device_version;
  LOG(INFO) << "device_type:" << device_type_to_str(device_type);
  device_info_["CL_DEVICE_TYPE"] = device_type;
@@ -317,6 +320,8 @@ std::map<std::string, size_t>& CLRuntime::GetDeviceInfo() {
  return device_info_;
}
GpuType& CLRuntime::GetGpuType() { return gpu_type_; }
void CLRuntime::GetAdrenoContextProperties(
    std::vector<cl_context_properties>* properties,
    GPUPerfMode gpu_perf_mode,
......
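For reference, the `CL_DEVICE_VERSION` string logged above comes straight from the OpenCL C++ wrapper's `getInfo`. A minimal standalone query (a sketch assuming the cl2.hpp wrapper and at least one available platform and device):

```cpp
#define CL_HPP_TARGET_OPENCL_VERSION 200
#include <CL/cl2.hpp>

#include <iostream>
#include <vector>

int main() {
  // Grab the first platform and device; production code should check counts.
  std::vector<cl::Platform> platforms;
  cl::Platform::get(&platforms);
  std::vector<cl::Device> devices;
  platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
  // CL_DEVICE_VERSION is "OpenCL <major>.<minor> <vendor-specific info>",
  // e.g. something like "OpenCL 2.0 Adreno(TM) 640" on a Qualcomm GPU.
  std::cout << devices[0].getInfo<CL_DEVICE_VERSION>() << std::endl;
  return 0;
}
```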
@@ -93,6 +93,8 @@ class CLRuntime {
  std::map<std::string, size_t>& GetDeviceInfo();
  GpuType& GetGpuType();

 private:
  CLRuntime() { Init(); }
......
@@ -36,7 +36,7 @@ void ConvImageCompute::PrepareForRun() {
  float* filter_cpu = param.filter->mutable_data<float>();
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  const bool is_mali = context.cl_context()->IsArmMali();
  filter_gpu_image_ = std::unique_ptr<Tensor>(new Tensor);
  tensor_hold_filter_image_ = std::unique_ptr<Tensor>(new Tensor);
  tensor_hold_bias_image_ = std::unique_ptr<Tensor>(new Tensor);
@@ -63,6 +63,7 @@ void ConvImageCompute::PrepareForRun() {
  bool stride_equal = stride_h == stride_w;
  bool dilation_equal = dilations[0] == dilations[1];
  VLOG(3) << "Is arm mali? / " << (is_mali ? "Yes" : "No");
  VLOG(3) << "Is relu fused? / " << (relu_fused ? "Yes" : "No");
  VLOG(3) << "groups:" << groups << " stride_h:" << stride_h
          << " stride_w:" << stride_w << " pad_h:" << pad_h
@@ -278,7 +279,6 @@ void ConvImageCompute::PrepareForRun() {
#endif
#undef CONV3x3OPT_FALL_BACK
  } else if (kernel_h == 5 && kernel_w == 5) {
#define CONV_5x5_OPT
#ifndef CONV_5x5_OPT
@@ -393,7 +393,6 @@ void ConvImageCompute::PrepareForRun() {
  }
#endif
#undef CONV_7x7_OPT
  } else {
    LOG(FATAL) << "conv image compute does not support this condition yet!";
  }
@@ -477,6 +476,8 @@ void ConvImageCompute::PrepareForRun() {
  double min_turn_time = DBL_MAX;
  cl::NDRange best_local_work_size = context.cl_context()->LocalWorkSize(
      global_work_size_, max_work_group_size);
  VLOG(3) << "origin local_work_size_: " << best_local_work_size[0] << " "
          << best_local_work_size[1] << " " << best_local_work_size[2];
  cl::NDRange last_local_work_size = cl::NDRange{
      static_cast<size_t>(0), static_cast<size_t>(0), static_cast<size_t>(0)};
  if (use_turn_) {
@@ -495,7 +496,30 @@ void ConvImageCompute::PrepareForRun() {
        // skip an lws that was already tuned
        continue;
      }
      auto turn_time = this->Turn(10);  // was this->Turn(5)
      if (min_turn_time > turn_time) {
        min_turn_time = turn_time;
        best_local_work_size = local_work_size_;
      }
      last_local_work_size = local_work_size_;
    }
    // reverse pass over the same divisor candidates
    for (size_t i = 1; i < 15; i++) {
      // TODO(xiebaiyuan): use different logic for the 1x1 kernel case; for
      // now every kernel shape takes the same reverse tuning path.
      local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse(
          global_work_size_, max_work_group_size, i);
      if (last_local_work_size[0] == local_work_size_[0] &&
          last_local_work_size[1] == local_work_size_[1] &&
          last_local_work_size[2] == local_work_size_[2]) {
        // skip an lws that was already tuned
        continue;
      }
      auto turn_time = this->Turn(10);
      if (min_turn_time > turn_time) {
        min_turn_time = turn_time;
        best_local_work_size = local_work_size_;
@@ -504,6 +528,8 @@ void ConvImageCompute::PrepareForRun() {
    }
  }
  local_work_size_ = best_local_work_size;
  VLOG(3) << "chosen local_work_size_: " << local_work_size_[0] << " "
          << local_work_size_[1] << " " << local_work_size_[2];
  VLOG(4) << "local_work_size_[3D]: {" << local_work_size_[0] << ","
          << local_work_size_[1] << "," << local_work_size_[2] << "}";
}
......
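Stepping back, both tuning passes above are a simple argmin over candidate local work sizes: time each candidate, skip repeats, keep the fastest. A self-contained sketch of that selection pattern (`Candidate`, `TimeKernel`, and the candidate list are hypothetical stand-ins for `cl::NDRange` and the `this->Turn(10)` timing call):

```cpp
#include <array>
#include <cfloat>
#include <cstdio>
#include <vector>

using Candidate = std::array<size_t, 3>;  // stand-in for cl::NDRange

// Hypothetical timing hook; in the kernel above this role is played by
// this->Turn(10), which launches the kernel and reports elapsed time.
double TimeKernel(const Candidate& lws) {
  return static_cast<double>(lws[0] + lws[1] + lws[2]);  // fake cost model
}

int main() {
  std::vector<Candidate> candidates = {{8, 8, 4}, {4, 4, 4}, {16, 4, 2}};
  double min_time = DBL_MAX;
  Candidate best = candidates.front();
  Candidate last = {0, 0, 0};
  for (const auto& c : candidates) {
    if (c == last) continue;  // skip a candidate identical to the last one
    double t = TimeKernel(c);
    if (t < min_time) {
      min_time = t;
      best = c;
    }
    last = c;
  }
  std::printf("best lws: %zu %zu %zu\n", best[0], best[1], best[2]);
}
```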