diff --git a/src/framework/cl/cl_image.cpp b/src/framework/cl/cl_image.cpp index 447e08a7012018eae1d09b056a77a233e74f2bee..a0cbdb430913632b945a2042de802c0f9d328846 100644 --- a/src/framework/cl/cl_image.cpp +++ b/src/framework/cl/cl_image.cpp @@ -37,7 +37,7 @@ void CLImageToTensor(CLImage *cl_image, Tensor *tensor, size_t width = W * ((C + 3) / 4); size_t height = H * N; - float *p = tensor->data(); + float *p = tensor->mutable_data(); half imageData[width * height * 4]; cl_int err; cl_mem image = cl_image->GetCLImage(); @@ -116,5 +116,65 @@ void TensorToCLImage(const Tensor *tensor, CLImage *cl_image, i0 += width * H; } } +#ifdef PADDLE_MOBILE_DEBUG +Print &operator<<(Print &printer, const CLImage &cl_image){ + printer << " dims: " << cl_image.dims() << "\n"; + int stride = cl_image.numel() / 20; + stride = stride > 0 ? stride : 1; + float *data = new float[cl_image.numel()]; + DDim ddim = cl_image.dims(); + size_t N, C, H, W; + if (ddim.size() == 4) { + N = ddim[0]; + if (N < 0) { + N = 1; + } + C = ddim[1]; + H = ddim[2]; + W = ddim[3]; + } else if (ddim.size() == 1) { + N = 1; + C = ddim[0]; + H = 1; + W = 1; + } + + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + + float *p = data; + half imageData[width * height * 4]; + cl_int err; + cl_mem image = cl_image.GetCLImage(); + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {width, height, 1}; + err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin, region, 0, 0, + imageData, 0, NULL, NULL); + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + size_t i1 = i0; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + *p = Half2Float(imageData[i2]); + i2 += 4; + p++; + } + i1 += width; + } + } + i0 += width * H; + } + + if (err != CL_SUCCESS) { + // TODO: error handling + } + for (int i = 0; i < cl_image.numel(); i += stride) { + printer << data[i] << " "; + } + return printer; + } +#endif } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h index f7d86ec8532b8557bb51d088bdae10c92dfe77fe..e4784083414c02608d505e853c2413b44d05537a 100644 --- a/src/framework/cl/cl_image.h +++ b/src/framework/cl/cl_image.h @@ -46,27 +46,27 @@ class CLImage { /* * need call SetTensorData first * */ - void InitCLImage(cl_context context) { + void InitCLImage(cl_context context,cl_command_queue command_queue) { if (tensor_data_ == nullptr) { PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first"); } if (tensor_dims_.size() <= 2) { - InitCLImage2C(context, tensor_data_, tensor_dims_); + InitCLImage2C(context, command_queue,tensor_data_, tensor_dims_); } else { - InitCLImage(context, tensor_data_, tensor_dims_); + InitCLImage(context, command_queue,tensor_data_, tensor_dims_); } delete[](tensor_data_); tensor_data_ = nullptr; initialized_ = true; } - void InitEmptyImage(cl_context context, const DDim &dim) { + void InitEmptyImage(cl_context context, cl_command_queue command_queue,const DDim &dim) { if (tensor_data_ != nullptr) { PADDLE_MOBILE_THROW_EXCEPTION( " empty image tensor data shouldn't have value"); } DLOG << " init empty image "; - InitCLImage(context, nullptr, dim); + InitCLImage(context, command_queue,nullptr, dim); initialized_ = true; } @@ -93,6 +93,8 @@ class CLImage { * */ inline size_t HeightOfOneBlock() const { return height_of_one_block_; } + inline cl_command_queue CommandQueue() const{ return command_queue_;} + /* * resize original tensor dim * */ @@ -122,7 +124,8 @@ class CLImage { const DDim &dims() const { return tensor_dims_; } private: - void InitCLImage2C(cl_context context, float *tensor_data, const DDim &dim) { + void InitCLImage2C(cl_context context, cl_command_queue command_queue,float *tensor_data, const DDim &dim) { + command_queue_ = command_queue; assert(dim.size() <= 2); int tdim[2] = {1, 1}; if (dim.size() == 1) { @@ -145,7 +148,7 @@ class CLImage { InitCLImage(context, width, height, imageData.get()); } - void InitCLImage(cl_context context, int width, int height, void *data) { + void InitCLImage(cl_context context,int width, int height, void *data) { cl_image_format cf = {.image_channel_order = CL_RGBA, .image_channel_data_type = CL_HALF_FLOAT}; cl_image_desc cid = { @@ -174,10 +177,11 @@ class CLImage { PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error "); } } - void InitCLImage(cl_context context, float *tensor_data, const DDim &dim) { + void InitCLImage(cl_context context, cl_command_queue command_queue,float *tensor_data, const DDim &dim) { DLOG << " tensor dim: " << dim; // NCHW -> [W * (C+3)/4, H * N] tensor_dims_ = dim; + command_queue_ = command_queue; if (tensor_data) { tensor_data_ = tensor_data; } @@ -240,6 +244,7 @@ class CLImage { DDim image_dims_; float *tensor_data_; cl_context context_; + cl_command_queue command_queue_; }; void TensorToCLImage(Tensor *tensor, CLImage *image, diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp index d61abac29e7946c75373f3168ede2df8ecc3a33d..8b1a2cd8ec05c98321dfdc0366e01d1abca9e206 100644 --- a/src/framework/executor.cpp +++ b/src/framework/executor.cpp @@ -37,7 +37,7 @@ limitations under the License. */ #include "framework/cl/cl_image.h" #endif -int debug_to = 115; +int debug_to = 4; namespace paddle_mobile { namespace framework { @@ -953,12 +953,13 @@ void Executor::InitMemory() { if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { auto cl_image = var->template GetMutable(); cl_context context = program_.scope->GetCLScpoe()->Context(); + cl_command_queue command_queue = program_.scope->GetCLScpoe()->CommandQueue(); const framework::TensorDesc &desc = var_desc->Tensor_desc(); // framework::DDim ddim = framework::make_ddim(desc.Dims()); framework::DDim ddim = cl_image->dims(); DLOG << var_desc->Name(); - cl_image->InitEmptyImage(context, ddim); + cl_image->InitEmptyImage(context,command_queue, ddim); } } } @@ -1010,11 +1011,11 @@ void Executor::InitCombineMemory() { } else { auto cl_image = var->template GetMutable(); cl_context context = program_.scope->GetCLScpoe()->Context(); - + cl_command_queue command_queue = program_.scope->GetCLScpoe()->CommandQueue(); const framework::TensorDesc &desc = var_desc->Tensor_desc(); framework::DDim ddim = cl_image->dims(); // framework::DDim ddim = framework::make_ddim(desc.Dims()); - cl_image->InitEmptyImage(context, ddim); + cl_image->InitEmptyImage(context, command_queue,ddim); } } } diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index 1bfac97c8bed8fea68b7d24b495ef6f4a2009340..75674f3cf012fe4619a6bd1cf30feb3447ed5520 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -57,10 +57,7 @@ void OperatorBase::CheckAllInputOutputSet() const {} template void OperatorBase::Run() { - DLOG << " begin run " << type_; RunImpl(); - DLOG << " end run " << type_; - return; #ifdef PADDLE_MOBILE_DEBUG DLOG << "-------------" << type_ << "----------------------------"; vector input_keys = GetInputKeys(); @@ -75,16 +72,8 @@ void OperatorBase::Run() { if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; } else { CLImage *cl_image = vari->template GetMutable(); - // cl_command_queue commandQueue = - // scope_->GetCLScpoe()->CommandQueue(); Tensor - // *tmp ; - // CLImageToTensor(cl_image,tmp,commandQueue); - // tmp->Resize(cl_image->dims()); - const float *input = cl_image->data(); if (cl_image) { - DLOG << type_ << " input- " << key << "=" << cl_image->dims(); - // if(input) - // DLOG<Filter()->InitCLImage(cl_helper_.CLContext()); - param->Bias()->InitCLImage(cl_helper_.CLContext()); + param->Filter()->InitCLImage(cl_helper_.CLContext(),cl_helper_.CLCommandQueue()); + param->Bias()->InitCLImage(cl_helper_.CLContext(),cl_helper_.CLCommandQueue()); // const CL *mean = param->InputMean(); const framework::CLImage *mean = param->InputMean(); @@ -62,12 +62,12 @@ bool ConvAddBNReluKernel::Init( framework::CLImage *new_scale = new framework::CLImage(); new_scale->SetTensorData(new_scale_ptr, variance->dims()); - new_scale->InitCLImage(this->cl_helper_.CLContext()); + new_scale->InitCLImage(this->cl_helper_.CLContext(),cl_helper_.CLCommandQueue()); framework::CLImage *new_bias = new framework::CLImage(); new_bias->SetTensorData(new_bias_ptr, variance->dims()); - new_bias->InitCLImage(this->cl_helper_.CLContext()); + new_bias->InitCLImage(this->cl_helper_.CLContext(),cl_helper_.CLCommandQueue()); param->SetNewScale(new_scale); param->SetNewBias(new_bias); diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp index ac36e70ed793ee000602114eb690780235b7d858..16e1a889dfc5cfc4d1e109bffa78891a333c988d 100644 --- a/src/operators/kernel/cl/conv_add_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_kernel.cpp @@ -25,8 +25,8 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { param->Filter()->dims()[2] == param->Filter()->dims()[3] && param->Paddings()[0] == param->Paddings()[1], "need equal"); - param->Filter()->InitCLImage(cl_helper_.CLContext()); - param->Bias()->InitCLImage(cl_helper_.CLContext()); + param->Filter()->InitCLImage(cl_helper_.CLContext(),this->cl_helper_.CLCommandQueue()); + param->Bias()->InitCLImage(cl_helper_.CLContext(),this->cl_helper_.CLCommandQueue()); int offset = static_cast(param->Filter()->dims()[2]) / 2 - static_cast(param->Paddings()[1]); diff --git a/src/operators/kernel/cl/conv_kernel.cpp b/src/operators/kernel/cl/conv_kernel.cpp index bc50440273c48df6ea765716b2b7711d0765b007..0251db1f838ad9205852e92ea3fc9e5cd1affd25 100644 --- a/src/operators/kernel/cl/conv_kernel.cpp +++ b/src/operators/kernel/cl/conv_kernel.cpp @@ -26,7 +26,7 @@ bool ConvKernel::Init(ConvParam *param) { param->Paddings()[0] == param->Paddings()[1], "need equal"); - param->Filter()->InitCLImage(cl_helper_.CLContext()); + param->Filter()->InitCLImage(cl_helper_.CLContext(),this->cl_helper_.CLCommandQueue()); int offset = static_cast(param->Filter()->dims()[2]) / 2 - static_cast(param->Paddings()[1]); diff --git a/src/operators/kernel/cl/depthwise_conv_kernel.cpp b/src/operators/kernel/cl/depthwise_conv_kernel.cpp index dcb95b4c6a9c42ff8c08f83611fc89aebc6efdfa..65c31dca3da6052ae184d139d091bd55a0539aa0 100644 --- a/src/operators/kernel/cl/depthwise_conv_kernel.cpp +++ b/src/operators/kernel/cl/depthwise_conv_kernel.cpp @@ -27,7 +27,7 @@ bool DepthwiseConvKernel::Init(ConvParam *param) { param->Filter()->dims()[2] == param->Filter()->dims()[3] && param->Paddings()[0] == param->Paddings()[1], "need equal"); - param->Filter()->InitCLImage(cl_helper_.CLContext()); + param->Filter()->InitCLImage(cl_helper_.CLContext(),this->cl_helper_.CLCommandQueue()); int offset = static_cast(param->Filter()->dims()[2]) / 2 - static_cast(param->Paddings()[1]); param->SetOffset(offset); diff --git a/src/operators/kernel/cl/elementwise_add_kernel.cpp b/src/operators/kernel/cl/elementwise_add_kernel.cpp index a64f3c688b4519daa896d6f855fc464ab8b77f57..856afc3b9126185cd13ec6cde958d86a659e9273 100644 --- a/src/operators/kernel/cl/elementwise_add_kernel.cpp +++ b/src/operators/kernel/cl/elementwise_add_kernel.cpp @@ -23,7 +23,7 @@ template <> bool ElementwiseAddKernel::Init( ElementwiseAddParam *param) { CLImage *bias = (CLImage*)param->InputY(); - bias->InitCLImage(cl_helper_.CLContext()); + bias->InitCLImage(cl_helper_.CLContext(),this->cl_helper_.CLCommandQueue()); if(bias->dims().size()==4){ this->cl_helper_.AddKernel("elementwise_add", "elementwise_add_kernel.cl"); }else if(param->InputY()->dims().size()==1){ diff --git a/src/operators/kernel/cl/feed_kernel.cpp b/src/operators/kernel/cl/feed_kernel.cpp index 6b4d883733634401f293ff304d63bfb3d913e134..ba90e75ca89d99c48e5bcb740dbdc837a2ec489b 100644 --- a/src/operators/kernel/cl/feed_kernel.cpp +++ b/src/operators/kernel/cl/feed_kernel.cpp @@ -30,6 +30,7 @@ void FeedKernel::Compute(const FeedParam ¶m) { cl_int status; auto output = param.Out(); const Tensor *input = param.InputX(); + DLOG<<*input; const float *input_data = input->data(); int numel = input->numel(); cl_mem cl_image = output->GetCLImage(); @@ -53,14 +54,6 @@ void FeedKernel::Compute(const FeedParam ¶m) { status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); - - int len = 4 * 224 * 224; - half *out = new half[len]; - cl_command_queue commandQueue = this->cl_helper_.CLCommandQueue(); - size_t origin[3] = {0, 0, 0}; - size_t region[3] = {height, width, 1}; - clEnqueueReadImage(commandQueue, cl_image, CL_TRUE, origin, region, 0, 0, out, - 0, NULL, NULL); } template class FeedKernel; diff --git a/src/operators/kernel/cl/fetch_kernel.cpp b/src/operators/kernel/cl/fetch_kernel.cpp index 995713ce5afaf0a93bc6b8ddd9928d7cee1c55ff..49f94ad895fdeb5cc7c395e1c8e4e9488e443385 100644 --- a/src/operators/kernel/cl/fetch_kernel.cpp +++ b/src/operators/kernel/cl/fetch_kernel.cpp @@ -19,44 +19,44 @@ namespace operators { template <> bool FetchKernel::Init(FetchParam *param) { - this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); +// this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); return true; } template <> void FetchKernel::Compute(const FetchParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.InputX()); - - auto input = param.InputX()->GetCLImage(); - auto *out = param.Out(); - - const auto &dims = param.InputX()->dims(); - const int N = dims[0]; - const int C = dims[1]; - const int in_height = dims[2]; - const int in_width = dims[3]; - - int size_ch = in_height * in_width; - int size_block = size_ch * 4; - int size_batch = size_ch * C; - - // need create outputBuffer - cl_image_format imageFormat; - imageFormat.image_channel_order = CL_RGBA; - imageFormat.image_channel_data_type = CL_FLOAT; - cl_mem outputBuffer; - - clSetKernelArg(kernel, 0, sizeof(int), &in_height); - clSetKernelArg(kernel, 1, sizeof(int), &in_width); - clSetKernelArg(kernel, 2, sizeof(int), &size_ch); - clSetKernelArg(kernel, 3, sizeof(int), &size_block); - clSetKernelArg(kernel, 4, sizeof(int), &size_batch); - clSetKernelArg(kernel, 5, sizeof(cl_mem), &input); - clSetKernelArg(kernel, 6, sizeof(cl_mem), &outputBuffer); - - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, - default_work_size.data(), NULL, 0, NULL, NULL); +// auto kernel = this->cl_helper_.KernelAt(0); +// auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.InputX()); +// +// auto input = param.InputX()->GetCLImage(); +// auto *out = param.Out(); +// +// const auto &dims = param.InputX()->dims(); +// const int N = dims[0]; +// const int C = dims[1]; +// const int in_height = dims[2]; +// const int in_width = dims[3]; +// +// int size_ch = in_height * in_width; +// int size_block = size_ch * 4; +// int size_batch = size_ch * C; +// +// // need create outputBuffer +// cl_image_format imageFormat; +// imageFormat.image_channel_order = CL_RGBA; +// imageFormat.image_channel_data_type = CL_FLOAT; +// cl_mem outputBuffer; +// +// clSetKernelArg(kernel, 0, sizeof(int), &in_height); +// clSetKernelArg(kernel, 1, sizeof(int), &in_width); +// clSetKernelArg(kernel, 2, sizeof(int), &size_ch); +// clSetKernelArg(kernel, 3, sizeof(int), &size_block); +// clSetKernelArg(kernel, 4, sizeof(int), &size_batch); +// clSetKernelArg(kernel, 5, sizeof(cl_mem), &input); +// clSetKernelArg(kernel, 6, sizeof(cl_mem), &outputBuffer); +// +// clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, +// default_work_size.data(), NULL, 0, NULL, NULL); } template class FetchKernel;