Commit 2b460663 authored by liuruilong

commit event

Parent ccb9de67
@@ -30,6 +30,13 @@ struct CLMemDeleter {
   }
 };
 
+struct CLEventDeleter {
+  template <class T>
+  void operator()(T *clEventObj) {
+    clReleaseEvent(clEventObj);
+  }
+};
+
 struct CLCommQueueDeleter {
   template <class T>
   void operator()(T *clQueueObj) {
...
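Editorial note (not part of the diff): `CLEventDeleter` mirrors the existing `CLMemDeleter`/`CLCommQueueDeleter` pattern, so a `std::unique_ptr<_cl_event, CLEventDeleter>` calls `clReleaseEvent` automatically when its owner is destroyed. A minimal sketch of the idiom, assuming only the standard OpenCL headers (the helper name is illustrative):

```cpp
#include <CL/cl.h>
#include <memory>

// Same shape as the deleter added above: unique_ptr invokes operator()
// with the raw _cl_event* when the owning object goes out of scope.
struct CLEventDeleter {
  template <class T>
  void operator()(T *clEventObj) {
    clReleaseEvent(clEventObj);
  }
};

using UniqueEvent = std::unique_ptr<_cl_event, CLEventDeleter>;

// Hypothetical helper: wrap a freshly created user event so its initial
// reference count (1 at creation) is released automatically.
UniqueEvent MakeUserEvent(cl_context context) {
  cl_int err = CL_SUCCESS;
  cl_event e = clCreateUserEvent(context, &err);
  return UniqueEvent(err == CL_SUCCESS ? e : nullptr);
}
```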
@@ -81,6 +81,12 @@ class CLEngine {
     return std::move(program_ptr);
   }
 
+  std::unique_ptr<_cl_event, CLEventDeleter> CreateEvent(cl_context context) {
+    cl_event event = clCreateUserEvent(context, status_);
+    std::unique_ptr<_cl_event, CLEventDeleter> event_ptr(event);
+    return std::move(event_ptr);
+  }
+
   bool BuildProgram(cl_program program) {
     cl_int status;
     status = clBuildProgram(program, 0, 0, "-cl-fast-relaxed-math", 0, 0);
...
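Two things worth noting about `clCreateUserEvent` (editorial, not from the commit): its second parameter is an error-out pointer (`cl_int *errcode_ret`), so the call above compiles only if `CLEngine::status_` is already a `cl_int *`; a plain `cl_int` member would have to be passed as `&status_`. Also, a user event is created in the `CL_SUBMITTED` state and completes only when the host calls `clSetUserEventStatus`, so any command waiting on it blocks until then. A sketch of that host-side gating, under the assumption of a valid context, queue, and kernel:

```cpp
#include <CL/cl.h>

// Sketch: gate an enqueued kernel on a user event, then release it from the
// host. All names here are illustrative, not from the commit.
void GatedEnqueue(cl_context ctx, cl_command_queue queue, cl_kernel kernel,
                  const size_t *gsize) {
  cl_int err = CL_SUCCESS;
  cl_event gate = clCreateUserEvent(ctx, &err);  // starts in CL_SUBMITTED
  clEnqueueNDRangeKernel(queue, kernel, 1, NULL, gsize, NULL, 1, &gate, NULL);
  // ... host-side work that must finish before the kernel may run ...
  clSetUserEventStatus(gate, CL_COMPLETE);  // kernel now becomes runnable
  clReleaseEvent(gate);
}
```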
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "framework/cl/cl_half.h"
 #include "framework/cl/cl_tool.h"
 #include "framework/cl/cl_deleter.h"
+#include "framework/cl/cl_engine.h"
 #include "framework/ddim.h"
 #include "framework/tensor.h"
@@ -97,6 +98,8 @@ class CLImage {
       InitCLImage(context, command_queue, tensor_data_, tensor_dims_);
     }
 
+    cl_event_ = CLEngine::Instance()->CreateEvent(context);
+
     // InitCLImage(context, command_queue, nullptr, dim);
     initialized_ = true;
@@ -157,6 +160,8 @@ class CLImage {
   const ImageType GetImageType() const { return image_type_; }
 
+  cl_event GetClEvent() const { return cl_event_.get(); }
+
  private:
   ImageType image_type_ = Invalid;
   void InitCLImage2C(cl_context context, cl_command_queue command_queue,
@@ -295,6 +300,7 @@ class CLImage {
   bool initialized_ = false;
   std::unique_ptr<_cl_mem, CLMemDeleter> cl_image_;
+  std::unique_ptr<_cl_event, CLEventDeleter> cl_event_;
   size_t image_width_;
   size_t width_of_one_block_;
   size_t height_of_one_block_;
...
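The intent of these new members, as an editorial summary: every `CLImage` now owns one user event created at init time, and `GetClEvent()` exposes the raw handle so a kernel can wait on its input image's event and hand its output image's handle to the enqueue call, as the kernel hunks below do. A sketch of that producer/consumer chain (the wrapper function is hypothetical; only the `GetClEvent` accessor comes from this commit):

```cpp
#include <CL/cl.h>
#include "framework/cl/cl_image.h"

// Sketch: chain a kernel between two CLImages using the new accessor.
// queue/kernel/global_size are assumed to be set up by the caller.
void EnqueueBetween(cl_command_queue queue, cl_kernel kernel, size_t work_dim,
                    const size_t *global_size,
                    const paddle_mobile::framework::CLImage &in,
                    const paddle_mobile::framework::CLImage &out) {
  cl_event wait_event = in.GetClEvent();  // completes when the producer does
  cl_event out_event = out.GetClEvent();  // handle tied to this output image
  clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_size, NULL,
                         1, &wait_event, &out_event);
}
```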
@@ -37,8 +37,6 @@ limitations under the License. */
 #include "framework/cl/cl_image.h"
 #endif
 
-int debug_to = 32;
-
 namespace paddle_mobile {
 namespace framework {
@@ -87,7 +85,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
   for (int i = 0; i < blocks.size(); ++i) {
     std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
     std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
-    for (int j = 0; j < debug_to; ++j) {
+    for (int j = 0; j < ops.size(); ++j) {
       std::shared_ptr<framework::OpDesc> op = ops[j];
       DLOG << "create op: " << j << " " << op->Type();
       auto op_base = framework::OpRegistry<Dtype>::CreateOp(
@@ -416,7 +414,7 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
     }
   }
 #else
-  for (int i = 0; i < debug_to; i++) {
+  for (int i = 0; i < ops.size(); i++) {
 #ifdef PADDLE_MOBILE_PROFILE
     struct timespec ts;
     clock_gettime(CLOCK_MONOTONIC, &ts);
@@ -433,8 +431,6 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
   DLOG << " predict return nullptr";
 
-  return nullptr;
-
   auto last_op = ops.rbegin();
   auto output_map = (*last_op)->Outputs();
   std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
...
@@ -168,20 +168,20 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
   int output_width = param.Output()->WidthOfOneBlock();
   int output_height = param.Output()->HeightOfOneBlock();
 
-  DLOG << " c block " << c_block;
-  DLOG << " w " << w;
-  DLOG << " nh " << nh;
-  DLOG << " stride " << stride;
-  DLOG << " offset " << offset;
-  DLOG << " input_c " << input_c;
-  DLOG << " dilation " << dilation;
-  DLOG << " input width " << input_width;
-  DLOG << " input height " << input_height;
-  DLOG << " output width " << output_width;
-  DLOG << " output height " << output_height;
-  DLOG << " input dim " << param.Input()->dims();
-  DLOG << " output dim " << param.Output()->dims();
-  DLOG << " filter dim " << param.Filter()->dims();
+  //  DLOG << " c block " << c_block;
+  //  DLOG << " w " << w;
+  //  DLOG << " nh " << nh;
+  //  DLOG << " stride " << stride;
+  //  DLOG << " offset " << offset;
+  //  DLOG << " input_c " << input_c;
+  //  DLOG << " dilation " << dilation;
+  //  DLOG << " input width " << input_width;
+  //  DLOG << " input height " << input_height;
+  //  DLOG << " output width " << output_width;
+  //  DLOG << " output height " << output_height;
+  //  DLOG << " input dim " << param.Input()->dims();
+  //  DLOG << " output dim " << param.Output()->dims();
+  //  DLOG << " filter dim " << param.Filter()->dims();
 
   cl_int status;
@@ -236,9 +236,12 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
   status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
   CL_CHECK_ERRORS(status);
 
+  cl_event out_event = param.Output()->GetClEvent();
+  cl_event wait_event = param.Input()->GetClEvent();
+
   status =
       clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-                             default_work_size.data(), NULL, 0, NULL, NULL);
+                             default_work_size.data(), NULL, 1, &wait_event, &out_event);
   CL_CHECK_ERRORS(status);
 }
...
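One API detail worth flagging here (editorial, not from the commit): the final argument of `clEnqueueNDRangeKernel` is an out-parameter; the runtime creates a fresh event and writes its handle through it. Passing `&out_event`, where `out_event` is a local copy of the output image's stored user event, therefore does not signal the image's event: the local variable is overwritten and then discarded. A small illustration of that behavior, assuming a valid queue and kernel (names are hypothetical):

```cpp
#include <CL/cl.h>
#include <assert.h>

// Illustration of the out-parameter semantics of clEnqueueNDRangeKernel.
void ShowEventOverwrite(cl_command_queue queue, cl_kernel kernel,
                        const size_t *gsize, cl_event stored_user_event) {
  cl_event out_event = stored_user_event;  // local copy of the image's handle
  clEnqueueNDRangeKernel(queue, kernel, 1, NULL, gsize, NULL, 0, NULL,
                         &out_event);
  // out_event now names a fresh runtime-created event; the image's stored
  // user event is untouched and still incomplete.
  assert(out_event != stored_user_event);
  clReleaseEvent(out_event);  // the returned event must eventually be released
}
```

The same wiring is repeated in the ConvAdd, Conv, and DepthwiseConv kernels below, while the softmax kernel still enqueues with an empty wait list (`0, NULL, NULL`). Nothing in this diff calls `clSetUserEventStatus`, so the input-side user events are presumably completed elsewhere or in a follow-up commit.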
@@ -117,9 +117,12 @@ void ConvAddKernel<GPU_CL, float>::Compute(
   status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
   CL_CHECK_ERRORS(status);
 
+  cl_event out_event = param.Output()->GetClEvent();
+  cl_event wait_event = param.Input()->GetClEvent();
+
   status =
       clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-                             default_work_size.data(), NULL, 0, NULL, NULL);
+                             default_work_size.data(), NULL, 1, &wait_event, &out_event);
   CL_CHECK_ERRORS(status);
 }
...
@@ -62,27 +62,15 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
 
 template <>
 void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
-  DLOG << " Compute helper: " << &cl_helper_;
-  DLOG << " begin compute ";
   auto kernel = this->cl_helper_.KernelAt(0);
-  DLOG << " get work size ";
   auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
-  DLOG << " end work size ";
   int c_block = default_work_size[0];
   int w = default_work_size[1];
   int nh = default_work_size[2];
   auto input = param.Input()->GetCLImage();
-  DLOG << " get Input ";
   auto filter = param.Filter()->GetCLImage();
-  DLOG << " get Filter ";
   auto output = param.Output()->GetCLImage();
-  DLOG << " get Output ";
   int stride = param.Strides()[0];
   int offset = param.Offset();
   int input_c = param.Input()->CBlock();
@@ -109,56 +97,27 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
   DLOG << " output height " << output_height;
 
   status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 1, sizeof(int), &w);
-  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
-  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
-  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
-  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
-  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
-  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
-  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
-  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
-  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
-  CL_CHECK_ERRORS(status);
-
-  DLOG << " end set kernel arg ";
-
-  DLOG << " begin enqueue ";
+
+  cl_event out_event = param.Output()->GetClEvent();
+  cl_event wait_event = param.Input()->GetClEvent();
 
   status =
       clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-                             default_work_size.data(), NULL, 0, NULL, NULL);
+                             default_work_size.data(), NULL, 1, &wait_event, &out_event);
   CL_CHECK_ERRORS(status);
-
-  DLOG << " end enqueue ";
 }
 
 template class ConvKernel<GPU_CL, float>;
...
@@ -76,9 +76,12 @@ void DepthwiseConvKernel<GPU_CL, float>::Compute(
   CL_CHECK_ERRORS(status);
 
+  cl_event out_event = param.Output()->GetClEvent();
+  cl_event wait_event = param.Input()->GetClEvent();
+
   status =
       clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-                             default_work_size.data(), NULL, 0, NULL, NULL);
+                             default_work_size.data(), NULL, 1, &wait_event, &out_event);
   CL_CHECK_ERRORS(status);
 }
...
@@ -34,21 +34,13 @@ void SoftmaxKernel<GPU_CL, float>::Compute(const SoftmaxParam<GPU_CL> &param) {
   auto inputImage = input->GetCLImage();
   auto outputImage = output->GetCLImage();
 
-  DLOG << " softmax - output image dim " << output->ImageDims();
-  DLOG << " softmax - output image tensor dim " << output->dims();
-
   int group = output->ImageWidth();
 
   cl_int status;
   status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
-  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
-  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 2, sizeof(int), &group);
-  CL_CHECK_ERRORS(status);
 
   //  const auto &inputDim = input->dims();
   //
@@ -62,7 +54,6 @@ void SoftmaxKernel<GPU_CL, float>::Compute(const SoftmaxParam<GPU_CL> &param) {
   //  clSetKernelArg(kernel, 3, sizeof(int), &dims[1]);
   //  clSetKernelArg(kernel, 4, sizeof(int), &dims[2]);
   //  clSetKernelArg(kernel, 5, sizeof(int), &dims[3]);
-  DLOG << "default_work_size: " << default_work_size;
 
   status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
                                   default_work_size.data(), NULL, 0, NULL, NULL);
...