Commit 404bfb04 authored by yangfei

Implement CLImage printer function
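
The new operator<< reads a CLImage back from the device and prints its dims plus roughly twenty evenly strided samples, so images can be logged like tensors. A minimal usage sketch (the variable `image` is hypothetical; a debug build and a command queue attached via the new InitCLImage overloads are assumed):

    #ifdef PADDLE_MOBILE_DEBUG
    DLOG << "conv output: " << image;
    #endif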

Parent 8013c608
......@@ -37,7 +37,7 @@ void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
float *p = tensor->data<float>();
float *p = tensor->mutable_data<float>();
half imageData[width * height * 4];
cl_int err;
cl_mem image = cl_image->GetCLImage();
......@@ -116,5 +116,65 @@ void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
i0 += width * H;
}
}
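// Layout note: CLImageToTensor above and the printer below share the same
// packing: NCHW maps to an RGBA half-float image of [W * ((C + 3) / 4), H * N],
// with channel c stored in RGBA component c % 4. Worked example for an
// assumed 1x3x224x224 input:
//   width  = W * ((C + 3) / 4) = 224 * 1 = 224  (3 channels fit one RGBA block)
//   height = H * N             = 224 * 1 = 224
//   staging buffer             = width * height * 4 = 200704 halves (~392 KB)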
#ifdef PADDLE_MOBILE_DEBUG
Print &operator<<(Print &printer, const CLImage &cl_image) {
printer << " dims: " << cl_image.dims() << "\n";
int stride = cl_image.numel() / 20;
stride = stride > 0 ? stride : 1;
float *data = new float[cl_image.numel()];
DDim ddim = cl_image.dims();
size_t N = 1, C = 1, H = 1, W = 1;  // defaults cover ranks other than 4 and 1
if (ddim.size() == 4) {
  // ddim[0] can be -1 for a dynamic batch; test it before the unsigned
  // assignment (a `size_t N < 0` check is dead code).
  N = ddim[0] > 0 ? ddim[0] : 1;
  C = ddim[1];
  H = ddim[2];
  W = ddim[3];
} else if (ddim.size() == 1) {
  C = ddim[0];
}
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
float *p = data;
half imageData[width * height * 4];
cl_int err;
cl_mem image = cl_image.GetCLImage();
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
                         region, 0, 0, imageData, 0, NULL, NULL);
if (err != CL_SUCCESS) {
  printer << " read image error: " << err << "\n";
  delete[] data;
  return printer;
}
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
size_t i1 = i0;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
*p = Half2Float(imageData[i2]);
i2 += 4;
p++;
}
i1 += width;
}
}
i0 += width * H;
}
for (int i = 0; i < cl_image.numel(); i += stride) {
  printer << data[i] << " ";
}
delete[] data;  // free the host-side staging buffer
return printer;
}
#endif
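// Reference note: Half2Float above is the project's own converter; as a
// standalone sketch, the IEEE 754 binary16 -> binary32 decoding it performs
// looks like the following (the name HalfToFloatSketch is hypothetical,
// not the project's API):
//
//     #include <cstdint>
//     #include <cstring>
//
//     // Decode one binary16 value to float; covers normals, subnormals,
//     // signed zero, infinity and NaN.
//     static float HalfToFloatSketch(uint16_t h) {
//       uint32_t sign = static_cast<uint32_t>(h & 0x8000u) << 16;
//       uint32_t exp = (h >> 10) & 0x1Fu;
//       uint32_t mant = h & 0x3FFu;
//       uint32_t bits;
//       if (exp == 0x1Fu) {
//         bits = sign | 0x7F800000u | (mant << 13);           // inf / NaN
//       } else if (exp != 0) {
//         bits = sign | ((exp + 112u) << 23) | (mant << 13);  // rebias 15 -> 127
//       } else if (mant == 0) {
//         bits = sign;                                        // signed zero
//       } else {
//         uint32_t e = 113;  // binary32 exponent of 2^-14
//         while ((mant & 0x400u) == 0) {                      // renormalize subnormal
//           mant <<= 1;
//           --e;
//         }
//         bits = sign | (e << 23) | ((mant & 0x3FFu) << 13);
//       }
//       float out;
//       std::memcpy(&out, &bits, sizeof(out));
//       return out;
//     }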
} // namespace framework
} // namespace paddle_mobile
......@@ -46,27 +46,27 @@ class CLImage {
/*
* need call SetTensorData first
* */
void InitCLImage(cl_context context) {
void InitCLImage(cl_context context, cl_command_queue command_queue) {
if (tensor_data_ == nullptr) {
PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
}
if (tensor_dims_.size() <= 2) {
InitCLImage2C(context, tensor_data_, tensor_dims_);
InitCLImage2C(context, command_queue, tensor_data_, tensor_dims_);
} else {
InitCLImage(context, tensor_data_, tensor_dims_);
InitCLImage(context, command_queue, tensor_data_, tensor_dims_);
}
delete[](tensor_data_);
tensor_data_ = nullptr;
initialized_ = true;
}
void InitEmptyImage(cl_context context, const DDim &dim) {
void InitEmptyImage(cl_context context, cl_command_queue command_queue,
                    const DDim &dim) {
if (tensor_data_ != nullptr) {
PADDLE_MOBILE_THROW_EXCEPTION(
" empty image tensor data shouldn't have value");
}
DLOG << " init empty image ";
InitCLImage(context, nullptr, dim);
InitCLImage(context, command_queue, nullptr, dim);
initialized_ = true;
}
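// Usage sketch for the reworked initializers (hedged example; `context`,
// `queue`, and `host_ptr` are assumed names, obtained from the CL scope and
// a prepared host buffer as in the executor changes further down):
//
//     framework::CLImage image;
//     // Upload path: SetTensorData must precede InitCLImage.
//     image.SetTensorData(host_ptr, framework::make_ddim({1, 3, 224, 224}));
//     image.InitCLImage(context, queue);
//
//     // Output path: allocate an empty device image; host data must be unset.
//     framework::CLImage out;
//     out.InitEmptyImage(context, queue, framework::make_ddim({1, 32, 112, 112}));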
......@@ -93,6 +93,8 @@ class CLImage {
* */
inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
inline cl_command_queue CommandQueue() const { return command_queue_; }
/*
* resize original tensor dim
* */
......@@ -122,7 +124,8 @@ class CLImage {
const DDim &dims() const { return tensor_dims_; }
private:
void InitCLImage2C(cl_context context, float *tensor_data, const DDim &dim) {
void InitCLImage2C(cl_context context, cl_command_queue command_queue,
                   float *tensor_data, const DDim &dim) {
command_queue_ = command_queue;
assert(dim.size() <= 2);
int tdim[2] = {1, 1};
if (dim.size() == 1) {
......@@ -145,7 +148,7 @@ class CLImage {
InitCLImage(context, width, height, imageData.get());
}
void InitCLImage(cl_context context, int width, int height, void *data) {
void InitCLImage(cl_context context, int width, int height, void *data) {
cl_image_format cf = {.image_channel_order = CL_RGBA,
.image_channel_data_type = CL_HALF_FLOAT};
cl_image_desc cid = {
......@@ -174,10 +177,11 @@ class CLImage {
PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error ");
}
}
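// For context, the format and descriptor above feed a standard OpenCL 1.2
// clCreateImage call; a hedged sketch of that creation step (error handling
// elided, variable names illustrative):
//
//     cl_int err;
//     cl_image_format cf = {CL_RGBA, CL_HALF_FLOAT};
//     cl_image_desc cid = {};
//     cid.image_type = CL_MEM_OBJECT_TYPE_IMAGE2D;
//     cid.image_width = width;
//     cid.image_height = height;
//     // CL_MEM_COPY_HOST_PTR uploads `data` during creation when non-null.
//     cl_mem img = clCreateImage(
//         context,
//         data ? CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE,
//         &cf, &cid, data, &err);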
void InitCLImage(cl_context context, float *tensor_data, const DDim &dim) {
void InitCLImage(cl_context context, cl_command_queue command_queue,
                 float *tensor_data, const DDim &dim) {
DLOG << " tensor dim: " << dim;
// NCHW -> [W * (C+3)/4, H * N]
tensor_dims_ = dim;
command_queue_ = command_queue;
if (tensor_data) {
tensor_data_ = tensor_data;
}
......@@ -240,6 +244,7 @@ class CLImage {
DDim image_dims_;
float *tensor_data_;
cl_context context_;
cl_command_queue command_queue_;
};
void TensorToCLImage(Tensor *tensor, CLImage *image,
......
......@@ -37,7 +37,7 @@ limitations under the License. */
#include "framework/cl/cl_image.h"
#endif
int debug_to = 115;
int debug_to = 4;
namespace paddle_mobile {
namespace framework {
......@@ -953,12 +953,13 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
auto cl_image = var->template GetMutable<framework::CLImage>();
cl_context context = program_.scope->GetCLScpoe()->Context();
cl_command_queue command_queue = program_.scope->GetCLScpoe()->CommandQueue();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
// framework::DDim ddim = framework::make_ddim(desc.Dims());
framework::DDim ddim = cl_image->dims();
DLOG << var_desc->Name();
cl_image->InitEmptyImage(context, ddim);
cl_image->InitEmptyImage(context, command_queue, ddim);
}
}
}
......@@ -1010,11 +1011,11 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
} else {
auto cl_image = var->template GetMutable<framework::CLImage>();
cl_context context = program_.scope->GetCLScpoe()->Context();
cl_command_queue command_queue = program_.scope->GetCLScpoe()->CommandQueue();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
framework::DDim ddim = cl_image->dims();
// framework::DDim ddim = framework::make_ddim(desc.Dims());
cl_image->InitEmptyImage(context, ddim);
cl_image->InitEmptyImage(context, command_queue, ddim);
}
}
}
......
......@@ -57,10 +57,7 @@ void OperatorBase<Dtype>::CheckAllInputOutputSet() const {}
template <typename Dtype>
void OperatorBase<Dtype>::Run() {
DLOG << " begin run " << type_;
RunImpl();
DLOG << " end run " << type_;
return;
#ifdef PADDLE_MOBILE_DEBUG
DLOG << "-------------" << type_ << "----------------------------";
vector<string> input_keys = GetInputKeys();
......@@ -75,16 +72,8 @@ void OperatorBase<Dtype>::Run() {
if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor;
} else {
CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
// cl_command_queue commandQueue =
// scope_->GetCLScpoe()->CommandQueue(); Tensor
// *tmp ;
// CLImageToTensor(cl_image,tmp,commandQueue);
// tmp->Resize(cl_image->dims());
const float *input = cl_image->data<float>();
if (cl_image) {
DLOG << type_ << " input- " << key << "=" << cl_image->dims();
// if(input)
// DLOG<<type_<<" input- "<<key<<"="<<*input;
DLOG << type_ << " input- " << key << "="<<*cl_image;
}
}
......@@ -108,15 +97,8 @@ void OperatorBase<Dtype>::Run() {
}
} else {
CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
// cl_command_queue commandQueue =
// scope_->GetCLScpoe()->CommandQueue(); Tensor *tmp ;
// CLImageToTensor(cl_image,tmp,commandQueue);
// tmp->Resize(cl_image->dims());
if (cl_image) {
const float *output = cl_image->data<float>();
DLOG << type_ << " output- " << key << "=" << cl_image->dims();
// if(output)
// DLOG<<type_<<" output- "<<key<<"="<<*output;
DLOG << type_ << " output- " << key << "="<<*cl_image ;
}
}
......
......@@ -49,11 +49,11 @@ bool BatchNormKernel<GPU_CL, float>::Init(BatchNormParam<GPU_CL> *param) {
framework::CLImage *new_scale = new framework::CLImage();
new_scale->SetTensorData(new_scale_ptr, variance->dims());
new_scale->InitCLImage(this->cl_helper_.CLContext());
new_scale->InitCLImage(this->cl_helper_.CLContext(),
                       this->cl_helper_.CLCommandQueue());
framework::CLImage *new_bias = new framework::CLImage();
new_bias->SetTensorData(new_bias_ptr, variance->dims());
new_bias->InitCLImage(this->cl_helper_.CLContext());
new_bias->InitCLImage(this->cl_helper_.CLContext(),
                      this->cl_helper_.CLCommandQueue());
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
......
......@@ -29,8 +29,8 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
param->Paddings()[0] == param->Paddings()[1],
"need equal");
param->Filter()->InitCLImage(cl_helper_.CLContext());
param->Bias()->InitCLImage(cl_helper_.CLContext());
param->Filter()->InitCLImage(cl_helper_.CLContext(),
                             cl_helper_.CLCommandQueue());
param->Bias()->InitCLImage(cl_helper_.CLContext(),
                           cl_helper_.CLCommandQueue());
// const CL *mean = param->InputMean();
const framework::CLImage *mean = param->InputMean();
......@@ -62,12 +62,12 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
framework::CLImage *new_scale = new framework::CLImage();
new_scale->SetTensorData(new_scale_ptr, variance->dims());
new_scale->InitCLImage(this->cl_helper_.CLContext());
new_scale->InitCLImage(this->cl_helper_.CLContext(),
                       cl_helper_.CLCommandQueue());
framework::CLImage *new_bias = new framework::CLImage();
new_bias->SetTensorData(new_bias_ptr, variance->dims());
new_bias->InitCLImage(this->cl_helper_.CLContext());
new_bias->InitCLImage(this->cl_helper_.CLContext(),
                      cl_helper_.CLCommandQueue());
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
......
......@@ -25,8 +25,8 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Paddings()[0] == param->Paddings()[1],
"need equal");
param->Filter()->InitCLImage(cl_helper_.CLContext());
param->Bias()->InitCLImage(cl_helper_.CLContext());
param->Filter()->InitCLImage(cl_helper_.CLContext(),
                             this->cl_helper_.CLCommandQueue());
param->Bias()->InitCLImage(cl_helper_.CLContext(),
                           this->cl_helper_.CLCommandQueue());
int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
static_cast<int>(param->Paddings()[1]);
......
......@@ -26,7 +26,7 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
param->Paddings()[0] == param->Paddings()[1],
"need equal");
param->Filter()->InitCLImage(cl_helper_.CLContext());
param->Filter()->InitCLImage(cl_helper_.CLContext(),
                             this->cl_helper_.CLCommandQueue());
int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
static_cast<int>(param->Paddings()[1]);
......
......@@ -27,7 +27,7 @@ bool DepthwiseConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Paddings()[0] == param->Paddings()[1],
"need equal");
param->Filter()->InitCLImage(cl_helper_.CLContext());
param->Filter()->InitCLImage(cl_helper_.CLContext(),
                             this->cl_helper_.CLCommandQueue());
int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
static_cast<int>(param->Paddings()[1]);
param->SetOffset(offset);
......
......@@ -23,7 +23,7 @@ template <>
bool ElementwiseAddKernel<GPU_CL, float>::Init(
ElementwiseAddParam<GPU_CL> *param) {
CLImage *bias = (CLImage*)param->InputY();
bias->InitCLImage(cl_helper_.CLContext());
bias->InitCLImage(cl_helper_.CLContext(),
                  this->cl_helper_.CLCommandQueue());
if (bias->dims().size() == 4) {
this->cl_helper_.AddKernel("elementwise_add", "elementwise_add_kernel.cl");
} else if (param->InputY()->dims().size() == 1) {
......
......@@ -30,6 +30,7 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
cl_int status;
auto output = param.Out();
const Tensor *input = param.InputX();
DLOG << *input;
const float *input_data = input->data<float>();
int numel = input->numel();
cl_mem cl_image = output->GetCLImage();
......@@ -53,14 +54,6 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
int len = 4 * 224 * 224;
half *out = new half[len];
cl_command_queue commandQueue = this->cl_helper_.CLCommandQueue();
size_t origin[3] = {0, 0, 0};
size_t region[3] = {height, width, 1};
clEnqueueReadImage(commandQueue, cl_image, CL_TRUE, origin, region, 0, 0, out,
0, NULL, NULL);
}
template class FeedKernel<GPU_CL, float>;
......
......@@ -19,44 +19,44 @@ namespace operators {
template <>
bool FetchKernel<GPU_CL, float>::Init(FetchParam<GPU_CL> *param) {
this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
// this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
return true;
}
template <>
void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.InputX());
auto input = param.InputX()->GetCLImage();
auto *out = param.Out();
const auto &dims = param.InputX()->dims();
const int N = dims[0];
const int C = dims[1];
const int in_height = dims[2];
const int in_width = dims[3];
int size_ch = in_height * in_width;
int size_block = size_ch * 4;
int size_batch = size_ch * C;
// need create outputBuffer
cl_image_format imageFormat;
imageFormat.image_channel_order = CL_RGBA;
imageFormat.image_channel_data_type = CL_FLOAT;
cl_mem outputBuffer;
clSetKernelArg(kernel, 0, sizeof(int), &in_height);
clSetKernelArg(kernel, 1, sizeof(int), &in_width);
clSetKernelArg(kernel, 2, sizeof(int), &size_ch);
clSetKernelArg(kernel, 3, sizeof(int), &size_block);
clSetKernelArg(kernel, 4, sizeof(int), &size_batch);
clSetKernelArg(kernel, 5, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 6, sizeof(cl_mem), &outputBuffer);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
// auto kernel = this->cl_helper_.KernelAt(0);
// auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.InputX());
//
// auto input = param.InputX()->GetCLImage();
// auto *out = param.Out();
//
// const auto &dims = param.InputX()->dims();
// const int N = dims[0];
// const int C = dims[1];
// const int in_height = dims[2];
// const int in_width = dims[3];
//
// int size_ch = in_height * in_width;
// int size_block = size_ch * 4;
// int size_batch = size_ch * C;
//
// // need create outputBuffer
// cl_image_format imageFormat;
// imageFormat.image_channel_order = CL_RGBA;
// imageFormat.image_channel_data_type = CL_FLOAT;
// cl_mem outputBuffer;
//
// clSetKernelArg(kernel, 0, sizeof(int), &in_height);
// clSetKernelArg(kernel, 1, sizeof(int), &in_width);
// clSetKernelArg(kernel, 2, sizeof(int), &size_ch);
// clSetKernelArg(kernel, 3, sizeof(int), &size_block);
// clSetKernelArg(kernel, 4, sizeof(int), &size_batch);
// clSetKernelArg(kernel, 5, sizeof(cl_mem), &input);
// clSetKernelArg(kernel, 6, sizeof(cl_mem), &outputBuffer);
//
// clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
// default_work_size.data(), NULL, 0, NULL, NULL);
}
template class FetchKernel<GPU_CL, float>;
......