diff --git a/src/framework/cl/cl_engine.h b/src/framework/cl/cl_engine.h index 509be5f92f3ee64953aeba54e178ba2a6cfe9678..15af651136b79a8034609596b6454410cb4e5d33 100644 --- a/src/framework/cl/cl_engine.h +++ b/src/framework/cl/cl_engine.h @@ -40,8 +40,8 @@ class CLEngine { return std::move(context_ptr); } - std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> - CreateClCommandQueue(cl_context context) { + std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> CreateClCommandQueue( + cl_context context) { cl_int status; cl_command_queue queue = clCreateCommandQueue(context, devices_[0], 0, &status); diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h index e9bef701986f7c0db533d80422faf7e286c3306b..b07f438c45de19df6b5bdf41c7ca0a67558d5a0d 100644 --- a/src/framework/cl/cl_image.h +++ b/src/framework/cl/cl_image.h @@ -193,28 +193,25 @@ class CLImage { DLOG << " image width: " << width; DLOG << " image height: " << height; - cl_image_format cf = { - .image_channel_order = CL_RGBA, - .image_channel_data_type = CL_HALF_FLOAT - }; + cl_image_format cf = {.image_channel_order = CL_RGBA, + .image_channel_data_type = CL_HALF_FLOAT}; cl_image_desc cid = { - .image_type = CL_MEM_OBJECT_IMAGE2D, - .image_width = width, - .image_height = height, - .image_depth = 1, - .image_array_size = 1, - .image_row_pitch = 0, - .image_slice_pitch = 0, - .num_mip_levels = 0, - .num_samples = 0, - // .buffer = nullptr + .image_type = CL_MEM_OBJECT_IMAGE2D, + .image_width = width, + .image_height = height, + .image_depth = 1, + .image_array_size = 1, + .image_row_pitch = 0, + .image_slice_pitch = 0, + .num_mip_levels = 0, + .num_samples = 0, + // .buffer = nullptr }; cid.buffer = nullptr; cl_image_ = clCreateImage( - context, - CL_MEM_READ_WRITE | (imageData ? CL_MEM_COPY_HOST_PTR : 0), - &cf, // const cl_image_format *image_format - &cid, // const cl_image_desc *image_desc + context, CL_MEM_READ_WRITE | (imageData ? CL_MEM_COPY_HOST_PTR : 0), + &cf, // const cl_image_format *image_format + &cid, // const cl_image_desc *image_desc reinterpret_cast(imageData.get()), // void *host_ptr &err); diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl index 113f1be9a1a8cf52c1bd57c73208295efffd1b84..5af833a29b82e2a1d6ed8c03906caec599e29839 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl @@ -12,10 +12,139 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void conv_3x3() { +__kernel void conv_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; + + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + + #ifdef BIASE + half4 output = read_imageh(bias, sampler, int2(out_c, 0)); +#else + half4 output = 0.0; +#endif + + half4 input[9]; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + input[0] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + + input[1] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + + input[2] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + + input[3] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height)); + + input[4] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height)); + + input[5] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height)); + + input[6] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + + input[7] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y + dilation)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + + input[8] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (half4)(0.0), + (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + + for (int j = 0; j < 9; ++j) { + int2 fuck; + fuck.x = i * 3 + j % 3; + fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; + half4 weight_x = read_imageh(filter, sampler, fuck); + output.x += dot(input[j], weight_x); + + fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; + half4 weight_y = read_imageh(filter, sampler, fuck); + output.y += dot(input[j], weight_y); + + fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; + half4 weight_z = read_imageh(filter, sampler, fuck); + output.z += dot(input[j], weight_z); + + fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; + half4 weight_w = read_imageh(filter, sampler, fuck); + output.w += dot(input[j], weight_w); + } + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0)) +#endif + +#ifdef RELU + output = activation(output); +#endif + + write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output); } + + diff --git a/src/operators/kernel/cl/conv_kernel.cpp b/src/operators/kernel/cl/conv_kernel.cpp index bb26843795eda54669f473e36a9e66721790630b..bc50440273c48df6ea765716b2b7711d0765b007 100644 --- a/src/operators/kernel/cl/conv_kernel.cpp +++ b/src/operators/kernel/cl/conv_kernel.cpp @@ -78,7 +78,7 @@ void ConvKernel::Compute(const ConvParam ¶m) { DLOG << " get Filter "; - auto output = param.Output(); + auto output = param.Output()->GetCLImage(); DLOG << " get Output "; @@ -89,45 +89,54 @@ void ConvKernel::Compute(const ConvParam ¶m) { int input_width = param.Input()->WidthOfOneBlock(); int input_height = param.Input()->HeightOfOneBlock(); + int output_width = param.Output()->WidthOfOneBlock(); + int output_height = param.Output()->HeightOfOneBlock(); + cl_int status; DLOG << " begin set kernel arg "; -// status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 1, sizeof(int), &w); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 2, sizeof(int), &nh); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 6, sizeof(int), &stride); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 7, sizeof(int), &offset); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); -// CL_CHECK_ERRORS(status); -// -// status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); -// CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 6, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 7, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 12, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 13, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); DLOG << " end set kernel arg "; @@ -138,7 +147,6 @@ void ConvKernel::Compute(const ConvParam ¶m) { default_work_size.data(), NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); DLOG << " end enqueue "; - } template class ConvKernel; diff --git a/tools/android-debug-script/push2android.sh b/tools/android-debug-script/push2android.sh index fae1a856123bd16cf3f7a115f61b3e4473ff58a3..14664b4051f69af5f803734bb04fb5a8654b341b 100644 --- a/tools/android-debug-script/push2android.sh +++ b/tools/android-debug-script/push2android.sh @@ -1,6 +1,11 @@ #!/usr/bin/env sh push_fn () { + + +cp ../../src/operators/kernel/cl/cl_kernel/* ../../build/release/arm-v7a/build/cl_kernel/ + + MODELS_PATH="../../test/models/*" MODELS_SRC="../../test/models" IMAGE_PATH="../../test/images/*"