提交 88c0ed3b 编写于 作者: L liuruilong

fix cl image error

上级 ac1c2581
......@@ -101,7 +101,7 @@ class CLImage {
T *data() const {
  // Returns the host-side tensor buffer reinterpreted as T*.
  // Once initialized_ is set, the exception macro below is invoked instead;
  // per its message the host tensor data is no longer available then.
  if (initialized_) {
    PADDLE_MOBILE_THROW_EXCEPTION(
        " cl image has initialized, tensor data has been deleted, can't use tensor data");
  }
  return reinterpret_cast<T *>(tensor_data_);
}
......@@ -118,6 +118,7 @@ class CLImage {
private:
void InitCLImage(cl_context context, float *tensor_data, const DDim &dim) {
DLOG << " tensor dim: " << dim;
cl_image_format cf = {.image_channel_order = CL_RGBA,
.image_channel_data_type = CL_HALF_FLOAT};
// NCHW -> [W * (C+3)/4, H * N]
......@@ -135,29 +136,23 @@ class CLImage {
tensor_data_[i] = 0;
}
}
size_t N, C, H, W;
if (tensor_dims_.size() == 4) {
N = tensor_dims_[0];
if (N < 0) {
N = 1;
}
C = tensor_dims_[1];
H = tensor_dims_[2];
W = tensor_dims_[3];
width_of_one_block_ = W;
height_of_one_block_ = H;
size_t new_dims[] = {1, 1, 1, 1};
} else if (tensor_dims_.size() == 1) {
N = 1;
C = tensor_dims_[0];
H = 1;
W = 1;
width_of_one_block_ = W;
height_of_one_block_ = H;
for (int j = 0; j < dim.size(); ++j) {
new_dims[4 - dim.size() + j] = dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
width_of_one_block_ = W;
height_of_one_block_ = H;
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
......@@ -196,6 +191,8 @@ class CLImage {
}
}
cl_int err;
DLOG << " image width: " << width;
DLOG << " image height: " << height;
cl_image_ = clCreateImage2D(
context, // cl_context context
CL_MEM_READ_WRITE | (imageData ? CL_MEM_COPY_HOST_PTR : 0), // cl_mem_flags flags
......
......@@ -60,6 +60,7 @@ void OperatorBase<Dtype>::Run() {
DLOG << " begin run " << type_;
RunImpl();
DLOG << " end run " << type_;
#ifdef PADDLE_MOBILE_DEBUG
DLOG << "-------------" << type_ << "----------------------------";
vector<string> input_keys = GetInputKeys();
......
......@@ -12,4 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "conv_kernel.inc.cl"
//#include "conv_kernel.inc.cl"
// Placeholder kernel: takes no arguments and has an empty body.
__kernel void conv_3x3() {}
\ No newline at end of file
......@@ -42,18 +42,18 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
param->Filter()->HeightOfOneBlock() == 1) {
DLOG << " here1 ";
this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
this->cl_helper_.AddKernel("conv_1x1", "conv_kernel.cl");
} else if (param->Filter()->dims()[1] == 1) {
DLOG << " here2 ";
this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
this->cl_helper_.AddKernel("depth_conv_3x3", "conv_kernel.cl");
} else if (param->Filter()->WidthOfOneBlock() == 3 &&
param->Filter()->HeightOfOneBlock() == 3) {
DLOG << " here3 ";
this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
this->cl_helper_.AddKernel("conv_3x3", "conv_kernel.cl");
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
......@@ -64,64 +64,64 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
/**
 * Binds the convolution arguments to the OpenCL kernel at index 0 of
 * cl_helper_ and enqueues it over the default 3-D work size computed for the
 * output image.
 *
 * Kernel argument layout (indices 0..11):
 *   0..2  work-size dimensions (c_block, w, nh)
 *   3..5  input, filter and output images
 *   6..11 stride, offset, input channel blocks, dilation,
 *         input block width, input block height
 *
 * Every OpenCL call is checked with CL_CHECK_ERRORS right after it is made,
 * so a failure is reported for the call that actually caused it.
 *
 * @param param convolution parameters (input/filter/output images, strides,
 *              dilations and offset).
 */
template <>
void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
  DLOG << " Compute helper: " << &cl_helper_;
  DLOG << " begin compute ";
  auto kernel = this->cl_helper_.KernelAt(0);

  DLOG << " get work size ";
  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
  DLOG << " end work size ";

  int c_block = default_work_size[0];
  int w = default_work_size[1];
  int nh = default_work_size[2];

  auto input = param.Input()->GetCLImage();
  DLOG << " get Input ";

  auto filter = param.Filter()->GetCLImage();
  DLOG << " get Filter ";

  // The kernel argument must be the image handle itself. Passing the address
  // of the pointer returned by Output() would hand the kernel host pointer
  // bits instead of a cl_mem.
  // NOTE(review): assumes Output() exposes GetCLImage() exactly like Input()
  // and Filter() do - confirm against the image class.
  auto output = param.Output()->GetCLImage();
  DLOG << " get Output ";

  int stride = param.Strides()[0];
  int offset = param.Offset();
  int input_c = param.Input()->CBlock();
  int dilation = param.Dilations()[0];
  int input_width = param.Input()->WidthOfOneBlock();
  int input_height = param.Input()->HeightOfOneBlock();

  cl_int status;

  DLOG << " begin set kernel arg ";

  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
  CL_CHECK_ERRORS(status);

  DLOG << " end set kernel arg ";

  DLOG << " begin enqueue ";

  status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3,
                                  nullptr, default_work_size.data(), nullptr,
                                  0, nullptr, nullptr);

  DLOG << " end enqueue ";

  CL_CHECK_ERRORS(status);
}
template class ConvKernel<GPU_CL, float>;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册