未验证 提交 942bc409 编写于 作者: X xiebaiyuan 提交者: GitHub

[LITE][OPENCL][Image] add lws turn & close cl check when shutdownlog… (#3309)

* [LITE][OPENCL][Image] add lws turn &  close cl check when shutdownlog , test=develop

* [LITE][OPENCL][Image] add lws turn &  close cl check when shutdownlog , test=develop

* [LITE][OPENCL][Image] add lws turn &  close cl check when shutdownlog , test=develop

* [LITE][OPENCL][Image] add lws turn &  close cl check when shutdownlog , test=develop

* [LITE][OPENCL][Image] add lws turn &  close cl check when shutdownlog , test=develop
上级 185c7096
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "lite/backends/opencl/cl_context.h" #include "lite/backends/opencl/cl_context.h"
#include <algorithm>
#include <memory> #include <memory>
#include <string> #include <string>
#include <utility> #include <utility>
...@@ -121,14 +122,53 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) { ...@@ -121,14 +122,53 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) {
} }
} }
// Derives a candidate 3D local work size from `global_work_size` for use while
// tuning.
//
// The work-group budget is `max_work_size / divisor` (the division is applied
// only when divisor > 1). Starting from the global extents, dimension 1 is
// shrunk first, then dimension 2, then dimension 0, until the product of the
// three extents fits the budget. A dimension is only ever halved (when even)
// or collapsed to 1 (when odd).
//
// @param global_work_size  3D global range; extents [0], [1], [2] are read.
// @param max_work_size     upper bound on work-items per group. When 0 no
//                          shrinking is performed and the global extents are
//                          returned unchanged.
// @param divisor           values > 1 shrink the budget; other values leave it
//                          untouched.
// @return the candidate local work size as a 3D NDRange.
cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
                                         size_t max_work_size,
                                         int divisor) {
  auto lws0 = global_work_size[0];
  auto lws1 = global_work_size[1];
  auto lws2 = global_work_size[2];

  if (divisor > 1 && max_work_size > 0) {
    // Integer division truncates to 0 when divisor > max_work_size. A zero
    // budget would disable every loop below and hand back the full global size
    // as the local size, so keep a budget of at least one work-item.
    const size_t reduced = max_work_size / static_cast<size_t>(divisor);
    max_work_size = reduced > 0 ? reduced : 1;
  }

  if (max_work_size > 0) {
    while (lws1 > max_work_size) {
      lws1 = (lws1 % 2 == 0) ? lws1 / 2 : 1;
    }
    while (lws2 * lws1 > max_work_size) {
      lws2 = (lws2 % 2 == 0) ? lws2 / 2 : 1;
    }
    while (lws0 * lws1 * lws2 > max_work_size) {
      lws0 = (lws0 % 2 == 0) ? lws0 / 2 : 1;
    }
  }

  return cl::NDRange{static_cast<size_t>(lws0),
                     static_cast<size_t>(lws1),
                     static_cast<size_t>(lws2)};
}
cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size, cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,
size_t max_work_size) { size_t max_work_size) {
int preferred_lws = 0; int preferred_lws = 0;
int divisor = 2; int divisor = 2;
auto tmp0 = global_work_size[0]; auto gws0 = global_work_size[0];
auto tmp1 = global_work_size[1]; auto gws1 = global_work_size[1];
auto tmp2 = global_work_size[2]; auto gws2 = global_work_size[2];
if (divisor > 1) { if (divisor > 1) {
max_work_size /= divisor; max_work_size /= divisor;
...@@ -136,18 +176,18 @@ cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size, ...@@ -136,18 +176,18 @@ cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,
if (preferred_lws > 0 && preferred_lws <= max_work_size) { if (preferred_lws > 0 && preferred_lws <= max_work_size) {
max_work_size = preferred_lws; max_work_size = preferred_lws;
} }
while (tmp1 > max_work_size && max_work_size > 0) { while (gws1 > max_work_size && max_work_size > 0) {
tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1; gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1;
} }
while (tmp2 * tmp1 > max_work_size && max_work_size > 0) { while (gws2 * gws1 > max_work_size && max_work_size > 0) {
tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1; gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1;
} }
while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) { while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) {
tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1; gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1;
} }
return cl::NDRange{static_cast<size_t>(tmp0), return cl::NDRange{static_cast<size_t>(gws0),
static_cast<size_t>(tmp1), static_cast<size_t>(gws1),
static_cast<size_t>(tmp2)}; static_cast<size_t>(gws2)};
} }
} // namespace lite } // namespace lite
......
...@@ -45,6 +45,11 @@ class CLContext { ...@@ -45,6 +45,11 @@ class CLContext {
cl::NDRange DefaultWorkSize(const CLImage &image); cl::NDRange DefaultWorkSize(const CLImage &image);
cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size); cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size);
cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size,
size_t max_work_size,
int divitor = 2);
// cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size,
// size_t max_work_size);
private: private:
std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_; std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_;
......
...@@ -32,7 +32,7 @@ const char* opencl_error_to_str(cl_int error); ...@@ -32,7 +32,7 @@ const char* opencl_error_to_str(cl_int error);
__FILE__, \ __FILE__, \
__LINE__); \ __LINE__); \
} }
#ifndef LITE_SHUTDOWN_LOG
#define CL_CHECK_FATAL(err_code__) \ #define CL_CHECK_FATAL(err_code__) \
if (err_code__ != CL_SUCCESS) { \ if (err_code__ != CL_SUCCESS) { \
LOG(FATAL) << string_format( \ LOG(FATAL) << string_format( \
...@@ -42,5 +42,8 @@ const char* opencl_error_to_str(cl_int error); ...@@ -42,5 +42,8 @@ const char* opencl_error_to_str(cl_int error);
__FILE__, \ __FILE__, \
__LINE__); \ __LINE__); \
} }
#else
#define CL_CHECK_FATAL(err_code__)
#endif
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
...@@ -388,18 +388,43 @@ void ConvImageCompute::PrepareForRun() { ...@@ -388,18 +388,43 @@ void ConvImageCompute::PrepareForRun() {
VLOG(4) << "max_work_group_size: " << max_work_group_size; VLOG(4) << "max_work_group_size: " << max_work_group_size;
if (max_work_group_size > 0 && use_lws) { if (max_work_group_size > 0 && use_lws_) {
// local_work_size_ = context.cl_context()->LocalWorkSizeConv1x1( double min_turn_time = DBL_MAX;
// global_work_size_, max_work_group_size); cl::NDRange best_local_work_size = context.cl_context()->LocalWorkSize(
local_work_size_ = context.cl_context()->LocalWorkSize(global_work_size_, global_work_size_, max_work_group_size);
max_work_group_size); cl::NDRange last_local_work_size = cl::NDRange{
static_cast<size_t>(0), static_cast<size_t>(0), static_cast<size_t>(0)};
if (use_turn_) {
for (size_t i = 1; i < 15; i++) {
if (kernel_h == 1 && kernel_w == 1) {
// TODO: use a different tuning strategy for 1x1 kernels
local_work_size_ = context.cl_context()->LocalWorkSizeTurn(
global_work_size_, max_work_group_size, i);
} else {
local_work_size_ = context.cl_context()->LocalWorkSizeTurn(
global_work_size_, max_work_group_size, i);
}
if (last_local_work_size[0] == local_work_size_[0] &&
last_local_work_size[1] == local_work_size_[1] &&
last_local_work_size[2] == local_work_size_[2]) {
// skip: same lws as the previous candidate, which has already been timed
continue;
}
auto turn_time = this->Turn(5);
if (min_turn_time > turn_time) {
min_turn_time = turn_time;
best_local_work_size = local_work_size_;
}
last_local_work_size = local_work_size_;
}
}
local_work_size_ = best_local_work_size;
VLOG(4) << "local_work_size_[3D]: {" << local_work_size_[0] << "," VLOG(4) << "local_work_size_[3D]: {" << local_work_size_[0] << ","
<< local_work_size_[1] << "," << local_work_size_[2] << "}"; << local_work_size_[1] << "," << local_work_size_[2] << "}";
} }
} }
void ConvImageCompute::Conv2d1x1opt() { void ConvImageCompute::Conv2d1x1opt(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -431,23 +456,6 @@ void ConvImageCompute::Conv2d1x1opt() { ...@@ -431,23 +456,6 @@ void ConvImageCompute::Conv2d1x1opt() {
int input_c = input_dims[1]; int input_c = input_dims[1];
auto dilations = *param.dilations; auto dilations = *param.dilations;
// const std::vector<size_t>& default_work_size =
// DefaultWorkSize(output_dims,
// DDim(std::vector<DDim::value_type>{
// static_cast<int64_t>(out_image_shape["width"]),
// static_cast<int64_t>(out_image_shape["height"])}));
// int c_block = default_work_size[0];
// int w = default_work_size[1];
// int nh = default_work_size[2];
// int maped_w = maptofactor(w, 4);
// auto global_work_size_ =
// cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
// static_cast<size_t>(maped_w),
// static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image; // VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
...@@ -541,73 +549,12 @@ void ConvImageCompute::Conv2d1x1opt() { ...@@ -541,73 +549,12 @@ void ConvImageCompute::Conv2d1x1opt() {
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_); context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
#ifdef PROFILE_CONV_KERNEL event_->wait();
bool use_profile = false;
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
double start = GetCurrentUS();
if (use_profile) {
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
} else {
int count = 50;
double sumtime = 0;
if (!use_profile) {
count = 1;
}
for (size_t i = 0; i < count; i++) {
start = GetCurrentUS();
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
if (use_profile) {
event_->wait();
double duration = GetCurrentUS() - start;
sumtime += duration;
}
}
auto dims_string = [](DDimLite dims) -> std::string {
std::ostringstream stream;
stream << "[" << dims[0] << "," << dims[1] << "," << dims[2] << ","
<< dims[3] << "]";
return stream.str();
};
if (use_profile) {
// LOG(INFO) << "input: " << input_dims;
// LOG(INFO) << "filter: " << filter_dims;
// LOG(INFO) << "output: " << output_dims;
std::cout << std::setw(25) << std::left << dims_string(input_dims)
<< std::setw(25) << std::left << dims_string(filter_dims)
<< std::setw(25) << std::left << dims_string(output_dims)
<< std::setw(25) << std::left << sumtime / count << std::endl;
} else {
dims_string(input_dims);
}
} }
#endif
} }
void ConvImageCompute::Conv2d3x3() { void ConvImageCompute::Conv2d3x3(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -767,9 +714,13 @@ void ConvImageCompute::Conv2d3x3() { ...@@ -767,9 +714,13 @@ void ConvImageCompute::Conv2d3x3() {
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_); context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
event_->wait();
}
} }
void ConvImageCompute::Conv2d3x3opt() { void ConvImageCompute::Conv2d3x3opt(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -890,9 +841,12 @@ void ConvImageCompute::Conv2d3x3opt() { ...@@ -890,9 +841,12 @@ void ConvImageCompute::Conv2d3x3opt() {
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_); context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
event_->wait();
}
} }
void ConvImageCompute::Conv2d5x5() { void ConvImageCompute::Conv2d5x5(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -1018,9 +972,12 @@ void ConvImageCompute::Conv2d5x5() { ...@@ -1018,9 +972,12 @@ void ConvImageCompute::Conv2d5x5() {
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_); context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
event_->wait();
}
} }
void ConvImageCompute::Conv2d5x5opt() { void ConvImageCompute::Conv2d5x5opt(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -1134,9 +1091,12 @@ void ConvImageCompute::Conv2d5x5opt() { ...@@ -1134,9 +1091,12 @@ void ConvImageCompute::Conv2d5x5opt() {
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_); context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
event_->wait();
}
} }
void ConvImageCompute::Conv2d7x7() { void ConvImageCompute::Conv2d7x7(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -1262,8 +1222,12 @@ void ConvImageCompute::Conv2d7x7() { ...@@ -1262,8 +1222,12 @@ void ConvImageCompute::Conv2d7x7() {
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_); context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
event_->wait();
}
} }
void ConvImageCompute::Conv2d7x7opt() { void ConvImageCompute::Conv2d7x7opt(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -1374,8 +1338,12 @@ void ConvImageCompute::Conv2d7x7opt() { ...@@ -1374,8 +1338,12 @@ void ConvImageCompute::Conv2d7x7opt() {
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_); context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
event_->wait();
}
} }
void ConvImageCompute::DepthwiseConv2d3x3s1() { void ConvImageCompute::DepthwiseConv2d3x3s1(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -1454,9 +1422,13 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() { ...@@ -1454,9 +1422,13 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(output_img, event_); context.cl_wait_list()->emplace(output_img, event_);
if (is_turn) {
event_->wait();
}
} }
void ConvImageCompute::DepthwiseConv2d3x3() { void ConvImageCompute::DepthwiseConv2d3x3(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -1548,9 +1520,13 @@ void ConvImageCompute::DepthwiseConv2d3x3() { ...@@ -1548,9 +1520,13 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(output_img, event_); context.cl_wait_list()->emplace(output_img, event_);
if (is_turn) {
event_->wait();
}
} }
void ConvImageCompute::DepthwiseConv2d() { void ConvImageCompute::DepthwiseConv2d(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -1683,8 +1659,22 @@ void ConvImageCompute::DepthwiseConv2d() { ...@@ -1683,8 +1659,22 @@ void ConvImageCompute::DepthwiseConv2d() {
context.cl_wait_list()->emplace(out_image, event_); context.cl_wait_list()->emplace(out_image, event_);
} }
void ConvImageCompute::Run() { (this->*impl_)(); } void ConvImageCompute::Run() { (this->*impl_)(false); }
#undef PROFILE_CONV_KERNEL
// Times the currently selected kernel implementation (impl_) and returns the
// mean wall-clock duration of one invocation, in microseconds.
//
// impl_ is invoked `times` times in a row with is_turn = true. The
// implementations visible in this file wait on their event when is_turn is
// set, so for those the measurement covers kernel completion, not just the
// enqueue.
//
// @param times  number of invocations to average over.
// @return mean microseconds per invocation, or DBL_MAX when times <= 0 so that
//         a caller looking for the minimum time never picks this result.
double ConvImageCompute::Turn(int times) {
  // A non-positive count would divide by zero below; a negative one would also
  // be converted to a huge unsigned loop bound in the original comparison.
  if (times <= 0) {
    return DBL_MAX;
  }

  auto GetCurrentUS = []() -> double {
    struct timeval time;
    gettimeofday(&time, nullptr);
    return 1e+6 * time.tv_sec + time.tv_usec;
  };

  const double start = GetCurrentUS();
  for (int i = 0; i < times; ++i) {
    (this->*impl_)(true);
  }
  return (GetCurrentUS() - start) / times;
}
} // namespace opencl } // namespace opencl
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
......
...@@ -28,29 +28,29 @@ namespace paddle { ...@@ -28,29 +28,29 @@ namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
class ConvImageCompute : public KernelLite<TARGET(kOpenCL), class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kImageDefault)> { DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::ConvParam; using param_t = operators::ConvParam;
using kernel_t = void (ConvImageCompute::*)(); using kernel_t = void (ConvImageCompute::*)(bool);
void PrepareForRun() override; void PrepareForRun() override;
void Run() override; void Run() override;
double Turn(int times = 5);
private: private:
void Conv2d1x1opt(); void Conv2d1x1opt(bool is_turn = false);
void Conv2d3x3(); void Conv2d3x3(bool is_turn = false);
void Conv2d3x3opt(); void Conv2d3x3opt(bool is_turn = false);
void Conv2d5x5(); void Conv2d5x5(bool is_turn = false);
void Conv2d5x5opt(); void Conv2d5x5opt(bool is_turn = false);
void Conv2d7x7(); void Conv2d7x7(bool is_turn = false);
void Conv2d7x7opt(); void Conv2d7x7opt(bool is_turn = false);
void DepthwiseConv2d3x3s1(); void DepthwiseConv2d3x3s1(bool is_turn = false);
void DepthwiseConv2d3x3(); void DepthwiseConv2d3x3(bool is_turn = false);
void DepthwiseConv2d(); void DepthwiseConv2d(bool is_turn = false);
kernel_t impl_; kernel_t impl_;
std::vector<std::string> kernel_func_names_{}; std::vector<std::string> kernel_func_names_{};
...@@ -72,7 +72,8 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -72,7 +72,8 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
cl::Kernel kernel_; cl::Kernel kernel_;
cl::NDRange local_work_size_ = cl::NDRange{ cl::NDRange local_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)}; static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
bool use_lws{true}; bool use_lws_{true};
bool use_turn_{false};
}; };
} // namespace opencl } // namespace opencl
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册