diff --git a/src/framework/cl/cl_deleter.h b/src/framework/cl/cl_deleter.h
index f475997a75eb9df3e3f533156bfbc5dce8f315fb..55af631174ae9f2a7815c2da35ebadda3ebfd9e9 100644
--- a/src/framework/cl/cl_deleter.h
+++ b/src/framework/cl/cl_deleter.h
@@ -30,6 +30,13 @@ struct CLMemDeleter {
   }
 };
 
+struct CLEventDeleter {
+  template <class T>
+  void operator()(T *clEventObj) {
+    clReleaseEvent(clEventObj);
+  }
+};
+
 struct CLCommQueueDeleter {
   template <class T>
   void operator()(T *clQueueObj) {
diff --git a/src/framework/cl/cl_engine.h b/src/framework/cl/cl_engine.h
index 6411b3cd05c925eb9823224bfd198d1033f64cdb..654db6d38e39c2851e4bd5c17ef1ccbb08ae82b9 100644
--- a/src/framework/cl/cl_engine.h
+++ b/src/framework/cl/cl_engine.h
@@ -81,6 +81,12 @@ class CLEngine {
     return std::move(program_ptr);
   }
 
+  std::unique_ptr<_cl_event, CLEventDeleter> CreateEvent(cl_context context) {
+    cl_event event = clCreateUserEvent(context, status_);
+    std::unique_ptr<_cl_event, CLEventDeleter> event_ptr(event);
+    return std::move(event_ptr);
+  }
+
   bool BuildProgram(cl_program program) {
     cl_int status;
     status = clBuildProgram(program, 0, 0, "-cl-fast-relaxed-math", 0, 0);
diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h
index 2162d303acc860105c0d7cf146f5ec045ef48cfa..bed423441164420b5c1e50e0eaebc0a933f3d811 100644
--- a/src/framework/cl/cl_image.h
+++ b/src/framework/cl/cl_image.h
@@ -21,6 +21,7 @@ limitations under the License.
 */
 #include "framework/cl/cl_half.h"
 #include "framework/cl/cl_tool.h"
 #include "framework/cl/cl_deleter.h"
+#include "framework/cl/cl_engine.h"
 #include "framework/ddim.h"
 #include "framework/tensor.h"
@@ -97,6 +98,8 @@ class CLImage {
       InitCLImage(context, command_queue, tensor_data_, tensor_dims_);
     }
 
+    cl_event_ = CLEngine::Instance()->CreateEvent(context);
+
     // InitCLImage(context, command_queue, nullptr, dim);
 
     initialized_ = true;
@@ -157,6 +160,8 @@ class CLImage {
 
   const ImageType GetImageType() const { return image_type_; }
 
+  cl_event GetClEvent() const { return cl_event_.get(); }
+
  private:
   ImageType image_type_ = Invalid;
   void InitCLImage2C(cl_context context, cl_command_queue command_queue,
@@ -295,6 +300,7 @@ class CLImage {
 
   bool initialized_ = false;
   std::unique_ptr<_cl_mem, CLMemDeleter> cl_image_;
+  std::unique_ptr<_cl_event, CLEventDeleter> cl_event_;
   size_t image_width_;
   size_t width_of_one_block_;
   size_t height_of_one_block_;
diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp
index 2eb7124495ad8b82d78c875aa201d6e176e60589..45f9726d1eb9f031e910ebebf6726807970fcb61 100644
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -37,8 +37,6 @@ limitations under the License.
*/ #include "framework/cl/cl_image.h" #endif -int debug_to = 32; - namespace paddle_mobile { namespace framework { @@ -87,7 +85,7 @@ Executor::Executor(const framework::Program p, int batch_size, for (int i = 0; i < blocks.size(); ++i) { std::shared_ptr block_desc = blocks[i]; std::vector> ops = block_desc->Ops(); - for (int j = 0; j < debug_to; ++j) { + for (int j = 0; j < ops.size(); ++j) { std::shared_ptr op = ops[j]; DLOG << "create op: " << j << " " << op->Type(); auto op_base = framework::OpRegistry::CreateOp( @@ -416,7 +414,7 @@ std::shared_ptr Executor::Predict( } } #else - for (int i = 0; i < debug_to; i++) { + for (int i = 0; i < ops.size(); i++) { #ifdef PADDLE_MOBILE_PROFILE struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); @@ -433,8 +431,6 @@ std::shared_ptr Executor::Predict( DLOG << " predict return nullptr"; - return nullptr; - auto last_op = ops.rbegin(); auto output_map = (*last_op)->Outputs(); std::vector out_keys = (*last_op)->GetOutKeys(); diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp index 79ccffb3edb069d77399522d40b1c117aaa44525..8a0417eaaa79763c38bd4d77d646f1dbb0d92d06 100644 --- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -168,20 +168,20 @@ void ConvAddBNReluKernel::Compute( int output_width = param.Output()->WidthOfOneBlock(); int output_height = param.Output()->HeightOfOneBlock(); - DLOG << " c block " << c_block; - DLOG << " w " << w; - DLOG << " nh " << nh; - DLOG << " stride " << stride; - DLOG << " offset " << offset; - DLOG << " input_c " << input_c; - DLOG << " dilation " << dilation; - DLOG << " input width " << input_width; - DLOG << " input height " << input_height; - DLOG << " output width " << output_width; - DLOG << " output height " << output_height; - DLOG << " input dim " << param.Input()->dims(); - DLOG << " output dim " << param.Output()->dims(); - DLOG << " filter dim " << 
param.Filter()->dims(); +// DLOG << " c block " << c_block; +// DLOG << " w " << w; +// DLOG << " nh " << nh; +// DLOG << " stride " << stride; +// DLOG << " offset " << offset; +// DLOG << " input_c " << input_c; +// DLOG << " dilation " << dilation; +// DLOG << " input width " << input_width; +// DLOG << " input height " << input_height; +// DLOG << " output width " << output_width; +// DLOG << " output height " << output_height; +// DLOG << " input dim " << param.Input()->dims(); +// DLOG << " output dim " << param.Output()->dims(); +// DLOG << " filter dim " << param.Filter()->dims(); cl_int status; @@ -236,9 +236,12 @@ void ConvAddBNReluKernel::Compute( status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); CL_CHECK_ERRORS(status); + cl_event out_event = param.Output()->GetClEvent(); + cl_event wait_event = param.Input()->GetClEvent(); + status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); + default_work_size.data(), NULL, 1, &wait_event, &out_event); CL_CHECK_ERRORS(status); } diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp index e83593187393dcc7de6a6541d0a6c521d380aeaa..390aebef8c9ee4cf440cb70226a3195ffc7b639c 100644 --- a/src/operators/kernel/cl/conv_add_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_kernel.cpp @@ -117,9 +117,12 @@ void ConvAddKernel::Compute( status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); CL_CHECK_ERRORS(status); + cl_event out_event = param.Output()->GetClEvent(); + cl_event wait_event = param.Input()->GetClEvent(); + status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); + default_work_size.data(), NULL, 1, &wait_event, &out_event); CL_CHECK_ERRORS(status); } diff --git a/src/operators/kernel/cl/conv_kernel.cpp 
b/src/operators/kernel/cl/conv_kernel.cpp index 0c5ab87d6de62d78acb5d1042a84aef8c38e4e02..8cec6b9893ba934cdccba053f5ce335618c3e251 100644 --- a/src/operators/kernel/cl/conv_kernel.cpp +++ b/src/operators/kernel/cl/conv_kernel.cpp @@ -62,27 +62,15 @@ bool ConvKernel::Init(ConvParam *param) { template <> void ConvKernel::Compute(const ConvParam ¶m) { - DLOG << " Compute helper: " << &cl_helper_; - DLOG << " begin compute "; auto kernel = this->cl_helper_.KernelAt(0); - DLOG << " get work size "; auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); - DLOG << " end work size "; int c_block = default_work_size[0]; int w = default_work_size[1]; int nh = default_work_size[2]; auto input = param.Input()->GetCLImage(); - - DLOG << " get Input "; - auto filter = param.Filter()->GetCLImage(); - - DLOG << " get Filter "; - auto output = param.Output()->GetCLImage(); - DLOG << " get Output "; - int stride = param.Strides()[0]; int offset = param.Offset(); int input_c = param.Input()->CBlock(); @@ -109,56 +97,27 @@ void ConvKernel::Compute(const ConvParam ¶m) { DLOG << " output height " << output_height; status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(int), &w); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(int), &nh); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(int), &stride); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(int), &offset); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); - 
CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 12, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 13, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - - DLOG << " end set kernel arg "; - DLOG << " begin enqueue "; + cl_event out_event = param.Output()->GetClEvent(); + cl_event wait_event = param.Input()->GetClEvent(); status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); + default_work_size.data(), NULL, 1, &wait_event, &out_event); CL_CHECK_ERRORS(status); - DLOG << " end enqueue "; } template class ConvKernel; diff --git a/src/operators/kernel/cl/depthwise_conv_kernel.cpp b/src/operators/kernel/cl/depthwise_conv_kernel.cpp index bbf4c07fc2f2a55c9ef5a3cb0ae241c9f11af9dc..f24e6f0be8322e296bcf599f497425014d2ea8a5 100644 --- a/src/operators/kernel/cl/depthwise_conv_kernel.cpp +++ b/src/operators/kernel/cl/depthwise_conv_kernel.cpp @@ -76,9 +76,12 @@ void DepthwiseConvKernel::Compute( CL_CHECK_ERRORS(status); + cl_event out_event = param.Output()->GetClEvent(); + cl_event wait_event = param.Input()->GetClEvent(); + status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); + default_work_size.data(), NULL, 1, &wait_event, &out_event); CL_CHECK_ERRORS(status); } diff --git a/src/operators/kernel/cl/softmax_kernel.cpp b/src/operators/kernel/cl/softmax_kernel.cpp index d178b52318de4d265c9da645398e901f0a29de71..c3384d956765663f10099dd0a95e32059a0ac753 100644 --- a/src/operators/kernel/cl/softmax_kernel.cpp +++ b/src/operators/kernel/cl/softmax_kernel.cpp @@ -34,21 +34,13 @@ void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { 
auto inputImage = input->GetCLImage(); auto outputImage = output->GetCLImage(); - DLOG << " softmax - output image dim " << output->ImageDims(); - DLOG << " softmax - output image tensor dim " << output->dims(); - int group = output->ImageWidth(); cl_int status; status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(int), &group); - CL_CHECK_ERRORS(status); // const auto &inputDim = input->dims(); // @@ -62,7 +54,6 @@ void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { // clSetKernelArg(kernel, 3, sizeof(int), &dims[1]); // clSetKernelArg(kernel, 4, sizeof(int), &dims[2]); // clSetKernelArg(kernel, 5, sizeof(int), &dims[3]); - DLOG << "default_work_size: " << default_work_size; status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, default_work_size.data(), NULL, 0, NULL, NULL);