From 9efc54d648c58732ba32a1b6e260c84f830cf36a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B5=B5=E5=A5=87=E5=8F=AF?= Date: Wed, 29 Aug 2018 10:13:44 +0800 Subject: [PATCH] fix inefficient code, delete unused code --- mace/kernels/opencl/cl/resize_bicubic.cl | 114 +++++++++-------------- mace/kernels/opencl/resize_bicubic.cc | 1 + mace/kernels/resize_bicubic.h | 1 - 3 files changed, 47 insertions(+), 69 deletions(-) diff --git a/mace/kernels/opencl/cl/resize_bicubic.cl b/mace/kernels/opencl/cl/resize_bicubic.cl index d3bd1ecd..767ee391 100644 --- a/mace/kernels/opencl/cl/resize_bicubic.cl +++ b/mace/kernels/opencl/cl/resize_bicubic.cl @@ -1,22 +1,14 @@ #include -const int kTableSize = (1 << 10); - -inline float ComputeCoeffs(int i) { - const float A = -0.75; - float x = (i / 2) * 1.0 / kTableSize; - if (i % 2 == 0){ - float coeff = ((A + 2) * x - (A + 3)) * x * x + 1; - return coeff; - } - else { - x += 1.0; - float coeff = ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; - return coeff; - } +inline float coeff_even(float i) { + float x = i / TABLE_SIZE; + return (1.25f * x - 2.25f) * x * x + 1.0f; } -#define BOUND(val, limit) min(limit - 1, max(0, val)) +inline float coeff_odd(float i) { + float x = i / TABLE_SIZE + 1.0f; + return ((-0.75f * x + 3.75f) * x - 6.0f) * x + 3.0f; +} __kernel void resize_bicubic_nocache(KERNEL_ERROR_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 @@ -27,7 +19,6 @@ __kernel void resize_bicubic_nocache(KERNEL_ERROR_PARAMS __private const int in_height, __private const int in_width, __private const int out_height) { - const int ch_blk = get_global_id(0); const int w = get_global_id(1); const int hb = get_global_id(2); @@ -53,72 +44,59 @@ __kernel void resize_bicubic_nocache(KERNEL_ERROR_PARAMS const int in_w_offset = mul24(ch_blk, in_width); const int in_h_offset = mul24(b, in_height); - const int h_in_loc = height_scale * h; - const float h_delta = height_scale * h - h_in_loc; - const int h_offset = h_delta * kTableSize + 0.5; - - const int w_in_loc = 
width_scale * w; - const float w_delta = width_scale * w - w_in_loc; - const int w_offset = w_delta * kTableSize + 0.5; - - float4 y_weights = {ComputeCoeffs(h_offset * 2 + 1), - ComputeCoeffs(h_offset * 2), - ComputeCoeffs((kTableSize - h_offset) * 2), - ComputeCoeffs((kTableSize - h_offset) * 2 + 1)}; - int4 y_indices = {BOUND(h_in_loc - 1, in_height), - BOUND(h_in_loc, in_height), - BOUND(h_in_loc + 1, in_height), - BOUND(h_in_loc + 2, in_height)}; - float4 x_weights = {ComputeCoeffs(w_offset * 2 + 1), - ComputeCoeffs(w_offset * 2), - ComputeCoeffs((kTableSize - w_offset) * 2), - ComputeCoeffs((kTableSize - w_offset) * 2 + 1)}; - int4 x_indices = {BOUND(w_in_loc - 1, in_width), - BOUND(w_in_loc, in_width), - BOUND(w_in_loc + 1, in_width), - BOUND(w_in_loc + 2, in_width)}; - - float4 coeffs0 = {0, 0, 0, 0}; - float4 coeffs1 = {0, 0, 0, 0}; - float4 coeffs2 = {0, 0, 0, 0}; - float4 coeffs3 = {0, 0, 0, 0}; + const int h_in_loc = (int)h_in; + const float h_delta = h_in - h_in_loc; + const int h_offset = h_delta * TABLE_SIZE + 0.5f; + + const int w_in_loc = (int)w_in; + const float w_delta = w_in - w_in_loc; + const int w_offset = w_delta * TABLE_SIZE + 0.5f; + + const float h_offset_l = h_offset; + const float h_offset_r = TABLE_SIZE - h_offset_l; + float4 y_weights = {coeff_odd(h_offset_l), coeff_even(h_offset_l), + coeff_even(h_offset_r), coeff_odd(h_offset_r)}; + int4 y_indices = {h_in_loc - 1, h_in_loc, h_in_loc + 1, h_in_loc + 2}; + y_indices = min(max(y_indices, 0), in_height - 1); + + const float w_offset_l = w_offset; + const float w_offset_r = TABLE_SIZE - w_offset_l; + float4 x_weights = {coeff_odd(w_offset_l), coeff_even(w_offset_l), + coeff_even(w_offset_r), coeff_odd(w_offset_r)}; + int4 x_indices = {w_in_loc - 1, w_in_loc, w_in_loc + 1, w_in_loc + 2}; + x_indices = min(max(x_indices, 0), in_width - 1); + + float4 coeffs0 = 0, coeffs1 = 0, coeffs2 = 0, coeffs3 = 0; for (int i = 0; i < 4; ++i) { int y_index = y_indices.s0; if ( i == 1 ) { y_index = 
y_indices.s1; } if ( i == 2 ) { y_index = y_indices.s2; } if ( i == 3 ) { y_index = y_indices.s3; } + const int in_h_index = in_h_offset + y_index; DATA_TYPE4 data0 = READ_IMAGET(input, SAMPLER, - (int2)(in_w_offset + x_indices.s0, in_h_offset + y_index)); + (int2)(in_w_offset + x_indices.s0, in_h_index)); DATA_TYPE4 data1 = READ_IMAGET(input, SAMPLER, - (int2)(in_w_offset + x_indices.s1, in_h_offset + y_index)); + (int2)(in_w_offset + x_indices.s1, in_h_index)); DATA_TYPE4 data2 = READ_IMAGET(input, SAMPLER, - (int2)(in_w_offset + x_indices.s2, in_h_offset + y_index)); + (int2)(in_w_offset + x_indices.s2, in_h_index)); DATA_TYPE4 data3 = READ_IMAGET(input, SAMPLER, - (int2)(in_w_offset + x_indices.s3, in_h_offset + y_index)); - - float4 xw0 = { x_weights.s0, x_weights.s0, x_weights.s0, x_weights.s0 }; - float4 xw1 = { x_weights.s1, x_weights.s1, x_weights.s1, x_weights.s1 }; - float4 xw2 = { x_weights.s2, x_weights.s2, x_weights.s2, x_weights.s2 }; - float4 xw3 = { x_weights.s3, x_weights.s3, x_weights.s3, x_weights.s3 }; - float4 res = { 0, 0, 0, 0 }; - res = mad(xw0, data0, res); - res = mad(xw1, data1, res); - res = mad(xw2, data2, res); - res = mad(xw3, data3, res); + (int2)(in_w_offset + x_indices.s3, in_h_index)); + + float4 res = 0; + res = mad(data0, x_weights.s0, res); + res = mad(data1, x_weights.s1, res); + res = mad(data2, x_weights.s2, res); + res = mad(data3, x_weights.s3, res); if ( i == 0 ) { coeffs0 = res; } if ( i == 1 ) { coeffs1 = res; } if ( i == 2 ) { coeffs2 = res; } if ( i == 3 ) { coeffs3 = res; } } - float4 yw0 = { y_weights.s0, y_weights.s0, y_weights.s0, y_weights.s0 }; - float4 yw1 = { y_weights.s1, y_weights.s1, y_weights.s1, y_weights.s1 }; - float4 yw2 = { y_weights.s2, y_weights.s2, y_weights.s2, y_weights.s2 }; - float4 yw3 = { y_weights.s3, y_weights.s3, y_weights.s3, y_weights.s3 }; - DATA_TYPE4 outdata = { 0, 0, 0, 0 }; - outdata = mad(yw0, coeffs0, outdata); - outdata = mad(yw1, coeffs1, outdata); - outdata = mad(yw2, coeffs2, 
outdata); - outdata = mad(yw3, coeffs3, outdata); + DATA_TYPE4 outdata = 0; + outdata = mad(coeffs0, y_weights.s0, outdata); + outdata = mad(coeffs1, y_weights.s1, outdata); + outdata = mad(coeffs2, y_weights.s2, outdata); + outdata = mad(coeffs3, y_weights.s3, outdata); const int out_w_offset = mul24(ch_blk, out_width); const int out_h_offset = mul24(b, out_height); diff --git a/mace/kernels/opencl/resize_bicubic.cc b/mace/kernels/opencl/resize_bicubic.cc index 2b043794..f8a33383 100644 --- a/mace/kernels/opencl/resize_bicubic.cc +++ b/mace/kernels/opencl/resize_bicubic.cc @@ -76,6 +76,7 @@ MaceStatus ResizeBicubicFunctor::operator()( auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace(MakeString("-DTABLE_SIZE=", kTableSize)); MACE_RETURN_IF_ERROR( runtime->BuildKernel("resize_bicubic", kernel_name, diff --git a/mace/kernels/resize_bicubic.h b/mace/kernels/resize_bicubic.h index 4bef1c90..046c6bb1 100644 --- a/mace/kernels/resize_bicubic.h +++ b/mace/kernels/resize_bicubic.h @@ -103,7 +103,6 @@ inline void ResizeImage(const float *images, std::array y_indices; GetWeightsAndIndices(height_scale, y, in_height, &y_weights, &y_indices); - std::stringstream ss; for (index_t x = 0; x < out_width; ++x) { std::array x_weights; std::array x_indices; -- GitLab