From 0d5bf6ce03f78b721b5b5685cd177d921a538437 Mon Sep 17 00:00:00 2001 From: liuqi Date: Fri, 3 Nov 2017 15:50:38 +0800 Subject: [PATCH] Change the data type of conv kernel params from uint32_t to int32_t. --- mace/kernels/opencl/cl/batch_norm.cl | 4 +- mace/kernels/opencl/cl/conv_2d_3x3.cl | 52 +++++++++---------- mace/kernels/opencl/cl/depthwise_conv_3x3.cl | 52 +++++++++---------- mace/kernels/opencl/conv_2d_opencl_3x3.cc | 16 +++--- .../opencl/depthwise_conv_opencl_3x3.cc | 12 ++--- mace/ops/conv_2d_benchmark.cc | 1 - mace/ops/conv_2d_test.cc | 2 +- 7 files changed, 69 insertions(+), 70 deletions(-) diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl index 3fc449ce..6cc2a2e0 100644 --- a/mace/kernels/opencl/cl/batch_norm.cl +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -4,7 +4,7 @@ void kernel batch_norm(global const float *input, global const float *mean, global const float *var, global const float *epsilon, - private const uint pixels, + private const int pixels, global float *output, __local float4 *new_scale, __local float4 *new_offset) { @@ -12,7 +12,7 @@ void kernel batch_norm(global const float *input, const int channel = get_global_id(1); const int channels = get_global_size(1); const int pixel_offset = get_global_id(2); - const unsigned int local_channel = get_local_id(1); + const int local_channel = get_local_id(1); const int local_pixel_idx = get_local_id(2); if(local_pixel_idx == 0) { diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index cd4e5eed..0dbafb2d 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -3,44 +3,44 @@ void kernel conv_2d_3x3(global const float *input, global const float *filter, global const float *bias, global float *output, - private const uint in_chan_num, - private const uint out_chan_num, - private const uint in_height, - private const uint in_width, - private const uint out_height, - private const uint out_width, - private const uint stride_h, - private const uint stride_w) { - const int batch = get_global_id(0); - const int out_chan_blk = get_global_id(1); - const int out_pixel_blk = get_global_id(2); + private const int in_chan_num, + private const int out_chan_num, + private const int in_height, + private const int in_width, + private const int out_height, + private const int out_width, + private const int stride_h, + private const int stride_w) { + int batch = get_global_id(0); + int out_chan_blk = get_global_id(1); + int out_pixel_blk = get_global_id(2); - const uint in_pixel = in_height * in_width; - const uint out_pixel = out_height * out_width; + const int in_pixel = in_height * in_width; + const int out_pixel = out_height * out_width; - const uint round_out_width = (out_width + 3) / 4; - const uint out_pixel_height = out_pixel_blk / round_out_width; - const uint out_pixel_width = out_pixel_blk % round_out_width; + const int round_out_width = (out_width + 3) / 4; + const int out_pixel_height = out_pixel_blk / round_out_width; + const int out_pixel_width = out_pixel_blk % round_out_width; - const uint out_chan_begin = out_chan_blk * 4; - const uint out_chan_end = min(out_chan_begin + 4, out_chan_num); - const uint out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; - const uint out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); - const uint in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4; + const int out_chan_begin = out_chan_blk * 4; + const int out_chan_end = min(out_chan_begin + 4, out_chan_num); + const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; + const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); + const int in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4; - const uint in_offset = batch * in_chan_num * in_pixel; - const uint out_offset = batch * out_chan_num * out_pixel; + const int in_offset = batch * in_chan_num * in_pixel; + const int out_offset = batch * out_chan_num * out_pixel; const float *input_base = input + in_offset + in_pixel_begin; float *output_base = output + out_offset + out_pixel_begin; uint pixels = out_pixel_end - out_pixel_begin; - for (uint i = out_chan_begin; i < out_chan_end; ++i) { + for (int i = out_chan_begin; i < out_chan_end; ++i) { float *output_ptr = output_base + i * out_pixel; const float *filter_base = filter + i * in_chan_num * 9; if (pixels == 4) { float4 res = (float4)bias[i]; - for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) { + for (int in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) { const float* input_ptr = input_base + in_chan_idx * in_pixel; const float* filter_ptr = filter_base + in_chan_idx * 9; if (stride_w == 1) { @@ -55,7 +55,7 @@ void kernel conv_2d_3x3(global const float *input, } vstore4(res, 0, output_ptr); } else { - for (uint p = 0; p < pixels; ++p) { + for (int p = 0; p < pixels; ++p) { float res = bias[i]; for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) { const float* input_ptr = input_base + in_chan_idx * in_pixel + p * stride_w; diff --git a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl b/mace/kernels/opencl/cl/depthwise_conv_3x3.cl index b88e339a..52905289 100644 --- a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl +++ b/mace/kernels/opencl/cl/depthwise_conv_3x3.cl @@ -4,40 +4,40 @@ void kernel depthwise_conv_3x3(global const float *input, /* n, c, h, w */ global const float *filter, /* m, i, kh, kw */ global const float *bias, /* o */ global float *output, /* n, c, h, w */ - private const uint in_chan_num, - private const uint out_chan_num, - private const uint in_height, - private const uint in_width, - private const uint out_height, - private const uint out_width, - private const uint stride_h, - private const uint stride_w) { - const int batch = get_global_id(0); - const int out_chan_blk = get_global_id(1); - const int out_pixel_blk = get_global_id(2); + private const int in_chan_num, + private const int out_chan_num, + private const int in_height, + private const int in_width, + private const int out_height, + private const int out_width, + private const int stride_h, + private const int stride_w) { + int batch = get_global_id(0); + int out_chan_blk = get_global_id(1); + int out_pixel_blk = get_global_id(2); - const uint in_pixel = in_height * in_width; - const uint out_pixel = out_height * out_width; - const uint multiplier = out_chan_num / in_chan_num; + const int in_pixel = in_height * in_width; + const int out_pixel = out_height * out_width; + const int multiplier = out_chan_num / in_chan_num; - const uint round_out_width = (out_width + 3) / 4; - const uint out_pixel_height = out_pixel_blk / round_out_width; - const uint out_pixel_width = out_pixel_blk % round_out_width; + const int round_out_width = (out_width + 3) / 4; + const int out_pixel_height = out_pixel_blk / round_out_width; + const int out_pixel_width = out_pixel_blk % round_out_width; - const uint out_chan_begin = out_chan_blk * 4; - const uint out_chan_end = min(out_chan_begin + 4, out_chan_num); - const uint out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; - const uint out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); - const uint in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4; + const int out_chan_begin = out_chan_blk * 4; + const int out_chan_end = min(out_chan_begin + 4, out_chan_num); + const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; + const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); + const int in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4; - const uint in_offset = batch * in_chan_num * in_pixel; - const uint out_offset = batch * out_chan_num * out_pixel; + const int in_offset = batch * in_chan_num * in_pixel; + const int out_offset = batch * out_chan_num * out_pixel; const float *input_base = input + in_offset + in_pixel_begin; float *output_base = output + out_offset + out_pixel_begin; uint pixels = out_pixel_end - out_pixel_begin; - for (uint i = out_chan_begin; i < out_chan_end; ++i) { + for (int i = out_chan_begin; i < out_chan_end; ++i) { float bias_value = bias[i]; const float *input_ptr = input_base + (i / multiplier) * in_pixel; const float *filter_ptr = filter + i * 9; @@ -55,7 +55,7 @@ void kernel depthwise_conv_3x3(global const float *input, /* n, c, h, w */ } vstore4(res, 0, output_ptr); } else { - for (uint p = 0; p < pixels; ++p) { + for (int p = 0; p < pixels; ++p) { float res = bias[i]; res += conv3x3(input_ptr, filter_ptr, in_width); output_ptr[p] = res; diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index 90011b8f..41dccf4c 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -29,20 +29,20 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter, conv_kernel.setArg(idx++, *(static_cast(filter->buffer()))); conv_kernel.setArg(idx++, *(static_cast(bias->buffer()))); conv_kernel.setArg(idx++, *(static_cast(output->buffer()))); - conv_kernel.setArg(idx++, static_cast(input->dim(1))); - conv_kernel.setArg(idx++, static_cast(channels)); - conv_kernel.setArg(idx++, static_cast(input->dim(2))); - conv_kernel.setArg(idx++, static_cast(input->dim(3))); - conv_kernel.setArg(idx++, static_cast(height)); - conv_kernel.setArg(idx++, static_cast(width)); + conv_kernel.setArg(idx++, static_cast(input->dim(1))); + conv_kernel.setArg(idx++, static_cast(channels)); + conv_kernel.setArg(idx++, static_cast(input->dim(2))); + conv_kernel.setArg(idx++, static_cast(input->dim(3))); + conv_kernel.setArg(idx++, static_cast(height)); + conv_kernel.setArg(idx++, static_cast(width)); conv_kernel.setArg(idx++, stride); conv_kernel.setArg(idx++, stride); const uint32_t gws[3] = {static_cast(output->dim(0)), static_cast(channel_blocks), static_cast(pixel_blocks)}; const uint32_t lws[3] = {static_cast(1), - static_cast(1), - static_cast(256)}; + static_cast(8), + static_cast(128)}; cl_int error = runtime->command_queue().enqueueNDRangeKernel( conv_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), diff --git a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc index 0c393e76..c37fe77f 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc +++ b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc @@ -38,12 +38,12 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input, conv_kernel.setArg(idx++, *(static_cast(filter->buffer()))); conv_kernel.setArg(idx++, *(static_cast(bias->buffer()))); conv_kernel.setArg(idx++, *(static_cast(output->buffer()))); - conv_kernel.setArg(idx++, static_cast(input->dim(1))); - conv_kernel.setArg(idx++, static_cast(channels)); - conv_kernel.setArg(idx++, static_cast(input->dim(2))); - conv_kernel.setArg(idx++, static_cast(input->dim(3))); - conv_kernel.setArg(idx++, static_cast(height)); - conv_kernel.setArg(idx++, static_cast(width)); + conv_kernel.setArg(idx++, static_cast(input->dim(1))); + conv_kernel.setArg(idx++, static_cast(channels)); + conv_kernel.setArg(idx++, static_cast(input->dim(2))); + conv_kernel.setArg(idx++, static_cast(input->dim(3))); + conv_kernel.setArg(idx++, static_cast(height)); + conv_kernel.setArg(idx++, static_cast(width)); conv_kernel.setArg(idx++, stride); conv_kernel.setArg(idx++, stride); diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index dd4f4f7d..fb859da8 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -3,7 +3,6 @@ // #include -#include #include "mace/core/operator.h" #include "mace/core/testing/test_benchmark.h" diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index 31d3130a..aff87a1a 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -296,7 +296,7 @@ static void TestUnalignedConvNxNS12() { ExpectTensorNear(expected, *net.GetOutput("Output"), 0.001); }; - for (int kernel_size : {1, 3, 5}) { + for (int kernel_size : {3}) { for (int stride : {1, 2}) { func(kernel_size, kernel_size, stride, stride, VALID); func(kernel_size, kernel_size, stride, stride, SAME); -- GitLab