未验证 提交 8d332397 编写于 作者: X xiebaiyuan 提交者: GitHub

[mobile]Develop common deepwise & fix bug in element mul (#2687)

* [mobile][opencl]common deepwise conv,test=mobile

* [mobile][opencl]revert deepwise 3x3 for stable ,test = mobile

* [mobile][opencl]format convkernel.inc.cl with clang-format ,test = mobile

* [mobile][opencl] suite 1*X Y element_y ,test=mobile

* [mobile][opencl] add whole print method for cl_image ,test=mobile
上级 53a5906c
......@@ -18,6 +18,37 @@ limitations under the License. */
namespace paddle_mobile {
namespace framework {
void CLImage::PrintTensor(const CLImage &cl_image) const {
size_t width = cl_image.ImageDims()[0];
size_t height = cl_image.ImageDims()[1];
half_t *image_data = new half_t[height * width * 4];
cl_int err;
cl_mem image = cl_image.GetCLImage();
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
region, 0, 0, image_data, 0, NULL, NULL);
CL_CHECK_ERRORS(err);
PADDLE_MOBILE_ENFORCE(cl_image.numel() != 0,
"cl_image numel should not be 0 ");
float *tensor_data = new float[cl_image.numel()];
auto converter = cl_image.Converter();
converter->ImageToNCHW(image_data, tensor_data, cl_image.ImageDims(),
cl_image.dims());
int stride = cl_image.numel() / 20;
stride = stride > 0 ? stride : 1;
for (int i = 0; i < cl_image.numel(); i++) {
printf("%f \n", tensor_data[i]);
}
delete[](tensor_data);
delete[](image_data);
}
void CLImageToTensor(CLImage *cl_image, Tensor *tensor, cl_context context,
cl_command_queue commandQueue, cl_kernel kernel) {
tensor->mutable_data<float>();
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <iostream>
#include <memory>
#include <vector>
......@@ -285,6 +286,7 @@ class CLImage {
cl_event GetClEvent() const { return cl_event_.get(); }
CLImageConverterBase *Converter() const { return image_converter_; }
void PrintTensor(const CLImage &cl_image) const;
private:
void InitCLImage(cl_context context, size_t width, size_t height,
......
......@@ -21,13 +21,14 @@ namespace framework {
const char* opencl_error_to_str(cl_int error);
#define CL_CHECK_ERRORS(ERR) \
if (ERR != CL_SUCCESS) { \
printf( \
"OpenCL error with code %s happened in file %s at line %d. " \
"Exiting.\n", \
paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \
__LINE__); \
#define CL_CHECK_ERRORS(ERR) \
if (ERR != CL_SUCCESS) { \
printf( \
"\033[1;31;40mOpenCL error with code %s happened in file %s at line " \
"%d. " \
"Exiting.\033[0m\n", \
paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \
__LINE__); \
}
} // namespace framework
......
......@@ -241,7 +241,9 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
cl_int status;
int index = 0;
if (param.Filter()->dims()[2] == 1 && param.Filter()->dims()[3] == 1) {
const int filter_height = param.Filter()->dims()[2];
const int filter_width = param.Filter()->dims()[3];
if (filter_height == 1 && filter_width == 1) {
status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
......@@ -404,7 +406,7 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
if (param.Filter()->dims()[2] == 3 && param.Filter()->dims()[3] == 3) {
if (filter_height == 3 && filter_width == 3) {
// normal conv
if (param.Filter()->dims()[0] == param.Output()->dims()[1] &&
param.Filter()->dims()[1] == param.Input()->dims()[1]) {
......@@ -425,6 +427,17 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
status = clSetKernelArg(kernel, index++, sizeof(int), &group);
CL_CHECK_ERRORS(status);
}
} else if (filter_height != 3 && filter_width != 3) {
// not 3x3
if (param.Filter()->dims()[1] == 1 &&
param.Input()->dims()[1] == param.Output()->dims()[1]) {
// deepwise basic use in not 3x3
status = clSetKernelArg(kernel, index++, sizeof(int), &filter_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &filter_height);
CL_CHECK_ERRORS(status);
}
}
status = clEnqueueNDRangeKernel(
......
此差异已折叠。
......@@ -13,33 +13,101 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
half4 in = read_imageh(input, sampler, coords);
half4 biase = read_imageh(bias, sampler, coords);
half4 output = in * biase;
write_imageh(outputImage,coords,output);
}
__kernel void channel_mul(__global image2d_t input, __global image2d_t bias,__write_only
image2d_t outputImage, int w) {
__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias,
__write_only image2d_t outputImage) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
half4 in = read_imageh(input, sampler, coords);
half4 biase = read_imageh(bias, sampler, coords);
half4 output = in * biase;
write_imageh(outputImage, coords, output);
}
__kernel void channel_mul(__global image2d_t input, __global image2d_t bias,
__write_only image2d_t outputImage, int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias;
coords_bias.x = x/w;
coords_bias.x = x / w;
coords_bias.y = 0;
half4 in = read_imageh(input, sampler, coords);
half4 biase = read_imageh(bias, sampler, coords_bias);
half4 output = in * biase;
write_imageh(outputImage,coords,output);
write_imageh(outputImage, coords, output);
}
// etc : 1 1 1 72
// run time Y [value,0,0,0] * 72
__kernel void channel_mul_d2(__global image2d_t input, __global image2d_t bias,
__write_only image2d_t outputImage, int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias0;
int2 coords_bias1;
int2 coords_bias2;
int2 coords_bias3;
/* if (x == 0 && y == 0) {
half4 b = (half4){0, 0, 0, 0};
#define PPI(j, k) \
b = read_imageh(bias, sampler, (int2){j, k}); \
printf("bias(%d,%d)={ %f , %f , %f , %f }\n ", j, k, convert_float(b.x), \
convert_float(b.y), convert_float(b.z), convert_float(b.w));
for (int i = 0; i < 73; ++i) {
PPI(i, 0);
}
#undef PPI
}*/
coords_bias0.x = x / w * 4;
coords_bias0.y = 0;
coords_bias1.x = x / w * 4 + 1;
coords_bias1.y = 0;
coords_bias2.x = x / w * 4 + 2;
coords_bias2.y = 0;
coords_bias3.x = x / w * 4 + 3;
coords_bias3.y = 0;
half4 biase0 = read_imageh(bias, sampler, coords_bias0);
half4 biase1 = read_imageh(bias, sampler, coords_bias1);
half4 biase2 = read_imageh(bias, sampler, coords_bias2);
half4 biase3 = read_imageh(bias, sampler, coords_bias3);
/* if (x == 0 && y == 0) {
printf("bias0={ %f , %f , %f , %f }\n ",
convert_float(biase0.x), convert_float(biase0.y),
convert_float(biase0.z), convert_float(biase0.w));
printf("bias1={ %f , %f , %f , %f }\n ",
convert_float(biase1.x), convert_float(biase1.y),
convert_float(biase1.z), convert_float(biase1.w));
printf("bias2={ %f , %f , %f , %f }\n ",
convert_float(biase2.x), convert_float(biase2.y),
convert_float(biase2.z), convert_float(biase2.w));
printf("bias3={ %f , %f , %f , %f }\n ",
convert_float(biase3.x), convert_float(biase3.y),
convert_float(biase3.z), convert_float(biase3.w));
}*/
half4 biase = {biase0.x, biase1.x, biase2.x, biase3.x};
half4 in = read_imageh(input, sampler, coords);
half4 output = mad(in, biase, 0);
write_imageh(outputImage, coords, output);
}
\ No newline at end of file
......@@ -174,6 +174,16 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
build_options);
}
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] != 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
// other depthwise not with filter 3x3
DLOG << "depth_conv basic ";
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] &&
......@@ -214,6 +224,7 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(),
param.NewScale(), param.NewBias());
break;
......
......@@ -71,6 +71,14 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
build_options);
}
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] != 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] &&
......@@ -124,6 +132,7 @@ void ConvAddKernel<GPU_CL, float>::Compute(
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
break;
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
......
......@@ -72,6 +72,14 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
build_options);
}
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] != 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
DLOG << "init depwise conv basic";
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] &&
......@@ -130,6 +138,7 @@ void ConvAddReluKernel<GPU_CL, float>::Compute(
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias());
break;
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
......
......@@ -129,6 +129,14 @@ bool ConvBNReluKernel<GPU_CL, float>::Init(
build_options);
}
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] != 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] &&
......@@ -168,6 +176,7 @@ void ConvBNReluKernel<GPU_CL, float>::Compute(
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, true, nullptr, param.NewScale(),
param.NewBias());
break;
......
......@@ -66,6 +66,14 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
}
DLOG << "depth_conv 3x3";
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] != 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
this->cl_helper_.AddKernel("depth_conv", conv_kernel_file);
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] &&
......@@ -115,6 +123,7 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param);
break;
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
......
......@@ -72,6 +72,14 @@ bool ConvReluKernel<GPU_CL, float>::Init(FusionConvReluParam<GPU_CL> *param) {
DLOG << "depth_conv 3x3";
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] != 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] &&
......@@ -120,6 +128,7 @@ void ConvReluKernel<GPU_CL, float>::Compute(
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, true);
break;
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#ifdef ELEMENTWISEMUL_OP
#include "operators/kernel/elementwise_mul_kernel.h"
#include <framework/cl/cl_half.h>
#include <iostream>
#include "framework/cl/cl_image.h"
namespace paddle_mobile {
......@@ -23,19 +25,24 @@ namespace operators {
template <>
bool ElementwiseMulKernel<GPU_CL, float>::Init(
ElementwiseMulParam<GPU_CL> *param) {
DLOG << "-----init add-----";
framework::CLImage *bias = reinterpret_cast<framework::CLImage *>(
const_cast<framework::CLImage *>(param->InputY()));
if (bias->dims() == param->InputX()->dims()) {
DLOG << "init element wise mul";
this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl");
} else if (bias->dims().size() == 4) {
} else if (bias->dims().size() == 1) {
DLOG << "init channel_mul";
this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl");
} else if (bias->dims().size() == 2) {
// etc. input 1 72 28 28
// filter 1 72
DLOG << "init channel_mul_d2";
this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl");
} else {
DLOG << "error:bias dims is error";
PADDLE_MOBILE_ENFORCE(false, "element mul not supported yet");
}
return true;
}
template <>
void ElementwiseMulKernel<GPU_CL, float>::Compute(
const ElementwiseMulParam<GPU_CL> &param) {
......@@ -64,8 +71,8 @@ void ElementwiseMulKernel<GPU_CL, float>::Compute(
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} else if (bias->dims().size() == 4) {
DLOG << "zp7 444";
} else if (bias->dims().size() == 1) {
DLOG << "channel mul";
cl_mem input_image = input->GetCLImage();
cl_mem bias_image = bias->GetCLImage();
cl_mem output_image = output->GetCLImage();
......@@ -84,14 +91,48 @@ void ElementwiseMulKernel<GPU_CL, float>::Compute(
CL_CHECK_ERRORS(status);
auto width = input->ImageWidth();
auto height = input->ImageHeight();
DLOG << "dede:" << width << "," << height;
size_t global_work_size[2] = {width, height};
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} else if (bias->dims().size() == 2) {
DLOG << "channel mul d2";
// etc. input 1 72 28 28
// filter 1 72 --> 1 1 1 72
DLOG << "input->ImageDims(): " << input->ImageDims();
DLOG << "bias->ImageDims(): " << bias->ImageDims();
DLOG << "out->ImageDims(): " << output->ImageDims();
DLOG << "channel mul d2";
cl_mem input_image = input->GetCLImage();
cl_mem bias_image = bias->GetCLImage();
cl_mem output_image = output->GetCLImage();
int tensor_w = input->dims()[input->dims().size() - 1];
status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
reinterpret_cast<void *>(&input_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
reinterpret_cast<void *>(&bias_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
reinterpret_cast<void *>(&output_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int),
reinterpret_cast<void *>(&tensor_w));
CL_CHECK_ERRORS(status);
auto width = input->ImageWidth();
auto height = input->ImageHeight();
size_t global_work_size[2] = {width, height};
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
// bias->PrintTensor(*bias);
} else {
DLOG << "error:bias dims is error";
PADDLE_MOBILE_ENFORCE(false, "element mul not support this situation yet")
}
}
......
......@@ -489,6 +489,7 @@ class ConvParam : public OpParam {
EXEC_SLIDINGWINDOW5x5_FLOAT,
EXEC_SLIDINGWINDOW7x7_FLOAT,
EXEC_GEMM1x1s1_FLOAT,
EXEC_DEPTHWISEBASIC_FLOAT,
};
ExecMode &ExecMode() const { return exec_mode_; }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册