diff --git a/src/framework/cl/cl_engine.h b/src/framework/cl/cl_engine.h index 509be5f92f3ee64953aeba54e178ba2a6cfe9678..15af651136b79a8034609596b6454410cb4e5d33 100644 --- a/src/framework/cl/cl_engine.h +++ b/src/framework/cl/cl_engine.h @@ -40,8 +40,8 @@ class CLEngine { return std::move(context_ptr); } - std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> - CreateClCommandQueue(cl_context context) { + std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> CreateClCommandQueue( + cl_context context) { cl_int status; cl_command_queue queue = clCreateCommandQueue(context, devices_[0], 0, &status); diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h index cc887741ce6449ca7a479ba79491aeda280a8ffd..9bad4bb6124a2102646c273036ee0d8b86a06191 100644 --- a/src/framework/cl/cl_image.h +++ b/src/framework/cl/cl_image.h @@ -182,28 +182,25 @@ class CLImage { DLOG << " image width: " << width; DLOG << " image height: " << height; - cl_image_format cf = { - .image_channel_order = CL_RGBA, - .image_channel_data_type = CL_HALF_FLOAT - }; + cl_image_format cf = {.image_channel_order = CL_RGBA, + .image_channel_data_type = CL_HALF_FLOAT}; cl_image_desc cid = { - .image_type = CL_MEM_OBJECT_IMAGE2D, - .image_width = width, - .image_height = height, - .image_depth = 1, - .image_array_size = 1, - .image_row_pitch = 0, - .image_slice_pitch = 0, - .num_mip_levels = 0, - .num_samples = 0, - // .buffer = nullptr + .image_type = CL_MEM_OBJECT_IMAGE2D, + .image_width = width, + .image_height = height, + .image_depth = 1, + .image_array_size = 1, + .image_row_pitch = 0, + .image_slice_pitch = 0, + .num_mip_levels = 0, + .num_samples = 0, + // .buffer = nullptr }; cid.buffer = nullptr; cl_image_ = clCreateImage( - context, - CL_MEM_READ_WRITE | (imageData ? CL_MEM_COPY_HOST_PTR : 0), - &cf, // const cl_image_format *image_format - &cid, // const cl_image_desc *image_desc + context, CL_MEM_READ_WRITE | (imageData ? CL_MEM_COPY_HOST_PTR : 0), + &cf, // const cl_image_format *image_format + &cid, // const cl_image_desc *image_desc reinterpret_cast(imageData.get()), // void *host_ptr &err); diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp index f542cfeae315bd0dc1d89d5eb7c44cfaf52b4704..609e069d48bac3347eb74d3f9573bdb2d45ab10e 100644 --- a/src/framework/executor.cpp +++ b/src/framework/executor.cpp @@ -37,7 +37,7 @@ limitations under the License. */ #include "framework/cl/cl_image.h" #endif -int debug_to = 2; +int debug_to = 4; namespace paddle_mobile { namespace framework { diff --git a/src/operators/kernel/cl/batchnorm_kernel.cpp b/src/operators/kernel/cl/batchnorm_kernel.cpp index 8770ce70191197790c4e0b1dfbd4523ef83e5d4c..c3fb564474ea1ff80f5e31c533b7b6bfa805cde0 100644 --- a/src/operators/kernel/cl/batchnorm_kernel.cpp +++ b/src/operators/kernel/cl/batchnorm_kernel.cpp @@ -47,15 +47,20 @@ bool BatchNormKernel::Init(BatchNormParam *param) { new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; } - delete[](new_scale_ptr); - delete[](new_bias_ptr); - framework::CLImage *new_scale = new framework::CLImage(); + new_scale->SetTensorData(new_scale_ptr, variance->dims()); + new_scale->InitCLImage(this->cl_helper_.CLContext()); + framework::CLImage *new_bias = new framework::CLImage(); + new_bias->SetTensorData(new_bias_ptr, variance->dims()); + new_bias->InitCLImage(this->cl_helper_.CLContext()); param->SetNewScale(new_scale); param->SetNewBias(new_bias); + delete[](new_scale_ptr); + delete[](new_bias_ptr); + return true; } diff --git a/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl b/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl index d2cc2151422255f48f81550f7424ec2dccb3be41..bb89ceb1397c4644f57cd649ccb7a532b643af04 100644 --- a/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl @@ -3,8 +3,8 @@ __kernel void batchnorm(__private const int out_height, __private const int out_width, __read_only image2d_t input, - __read_only image2d_t new_scale, - __read_only image2d_t new_bias, + __read_only image2d_t new_scale_image, + __read_only image2d_t new_bias_image, __write_only image2d_t output) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -13,12 +13,12 @@ __kernel void batchnorm(__private const int out_height, const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - half4 new_scale = read_imageh(bn_scale, sampler, (int2)(out_c, 0)); - half4 new_bias = read_imageh(bn_bias, sampler, (int2)(out_c, 0)); + half4 new_scale = read_imageh(new_scale_image, sampler, (int2)(out_c, 0)); + half4 new_bias = read_imageh(new_bias_image, sampler, (int2)(out_c, 0)); int pos_x = mad24(out_c, out_width, out_w); half4 in = read_imageh(input, sampler, (int2)(pos_x, out_nh)); half4 out = mad(in, new_scale, new_bias); - write_imageh(output, (int2)(pos_x, nh), out); + write_imageh(output, (int2)(pos_x, out_nh), out); } diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl index 113f1be9a1a8cf52c1bd57c73208295efffd1b84..5af833a29b82e2a1d6ed8c03906caec599e29839 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl @@ -12,10 +12,139 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void conv_3x3() { +__kernel void conv_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; + + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + + #ifdef BIASE + half4 output = read_imageh(bias, sampler, int2(out_c, 0)); +#else + half4 output = 0.0; +#endif + + half4 input[9]; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + input[0] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + + input[1] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + + input[2] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + + input[3] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height)); + + input[4] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height)); + + input[5] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height)); + + input[6] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + + input[7] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y + dilation)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + + input[8] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (half4)(0.0), + (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + + for (int j = 0; j < 9; ++j) { + int2 fuck; + fuck.x = i * 3 + j % 3; + fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; + half4 weight_x = read_imageh(filter, sampler, fuck); + output.x += dot(input[j], weight_x); + + fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; + half4 weight_y = read_imageh(filter, sampler, fuck); + output.y += dot(input[j], weight_y); + + fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; + half4 weight_z = read_imageh(filter, sampler, fuck); + output.z += dot(input[j], weight_z); + + fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; + half4 weight_w = read_imageh(filter, sampler, fuck); + output.w += dot(input[j], weight_w); + } + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0)) +#endif + +#ifdef RELU + output = activation(output); +#endif + + write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output); } + + diff --git a/src/operators/kernel/cl/conv_kernel.cpp b/src/operators/kernel/cl/conv_kernel.cpp index bb26843795eda54669f473e36a9e66721790630b..bc50440273c48df6ea765716b2b7711d0765b007 100644 --- a/src/operators/kernel/cl/conv_kernel.cpp +++ b/src/operators/kernel/cl/conv_kernel.cpp @@ -78,7 +78,7 @@ void ConvKernel::Compute(const ConvParam ¶m) { DLOG << " get Filter "; - auto output = param.Output(); + auto output = param.Output()->GetCLImage(); DLOG << " get Output "; @@ -89,45 +89,54 @@ void ConvKernel::Compute(const ConvParam ¶m) { int input_width = param.Input()->WidthOfOneBlock(); int input_height = param.Input()->HeightOfOneBlock(); + int output_width = param.Output()->WidthOfOneBlock(); + int output_height = param.Output()->HeightOfOneBlock(); + cl_int status; DLOG << " begin set kernel arg "; -// status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 1, sizeof(int), &w); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 2, sizeof(int), &nh); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 6, sizeof(int), &stride); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 7, sizeof(int), &offset); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); -// CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 6, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 7, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 12, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 13, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); DLOG << " end set kernel arg "; @@ -138,7 +147,6 @@ void ConvKernel::Compute(const ConvParam ¶m) { default_work_size.data(), NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); DLOG << " end enqueue "; - } template class ConvKernel; diff --git a/src/operators/kernel/cl/feed_kernel.cpp b/src/operators/kernel/cl/feed_kernel.cpp index 467886a34b676824fcd455e3870fad3f384abe42..3282556bb1afeff1085f35cfccd49301445a58a8 100644 --- a/src/operators/kernel/cl/feed_kernel.cpp +++ b/src/operators/kernel/cl/feed_kernel.cpp @@ -61,7 +61,11 @@ void FeedKernel::Compute(const FeedParam ¶m) { size_t region[3] = {height, width, 1}; clEnqueueReadImage(commandQueue, cl_image, CL_TRUE, origin, region, 0, 0, out, 0, NULL, NULL); +<<<<<<< HEAD for (int i = 0; i < numel; i++) DLOG << Half2Float(out[i])<<","<>>>>>> 289b739de8517c21872107c16790b9cb2e7042d7 } template class FeedKernel; diff --git a/tools/android-debug-script/push2android.sh b/tools/android-debug-script/push2android.sh index fae1a856123bd16cf3f7a115f61b3e4473ff58a3..14664b4051f69af5f803734bb04fb5a8654b341b 100644 --- a/tools/android-debug-script/push2android.sh +++ b/tools/android-debug-script/push2android.sh @@ -1,6 +1,11 @@ #!/usr/bin/env sh push_fn () { + + +cp ../../src/operators/kernel/cl/cl_kernel/* ../../build/release/arm-v7a/build/cl_kernel/ + + MODELS_PATH="../../test/models/*" MODELS_SRC="../../test/models" IMAGE_PATH="../../test/images/*"