diff --git a/mace/kernels/opencl/cl/resize_bicubic.cl b/mace/kernels/opencl/cl/resize_bicubic.cl index ce0b3af0e268071691bebf05ae5ccad7ad8e637c..42213094f97054baaf5586bbdcacac10cd87ef25 100644 --- a/mace/kernels/opencl/cl/resize_bicubic.cl +++ b/mace/kernels/opencl/cl/resize_bicubic.cl @@ -4,7 +4,6 @@ const int kTableSize = (1 << 10); inline float ComputeCoeffs(int i) { -// const int kTableSize = (1 << 10); const float A = -0.75; float x = (i / 2) * 1.0 / kTableSize; if (i % 2 == 0){ @@ -18,39 +17,11 @@ inline float ComputeCoeffs(int i) { } } -//#define GET_COEFFS(coeffs_tab, i) coeffs_tab[i] -//float getCoeffs(const float *coeffs_tab, int i) { -// return coeffs_tab[i]; -//} - #define BOUND(val, limit) min(limit - 1, max(0, val)) -//int Bound(int val, int limit) { -// return min(limit - 1, max(0, val)); -//} - -//float4 GetWeights(const float* coeffs_tab, float scale, int out_loc, int limit) { -// const int in_loc = scale * out_loc; -// const float delta = scale * out_loc - in_loc; -// const int offset = delta * kTableSize + 0.5; //lrintf not found in opencl; -// float4 weights = {getCoeffs(coeffs_tab, offset * 2 + 1), -// getCoeffs(coeffs_tab, offset * 2), -// getCoeffs(coeffs_tab, (kTableSize - offset) * 2), -// getCoeffs(coeffs_tab, (kTableSize - offset) * 2 + 1)}; -// return weights; -//} -// -//int4 GetIndices(float scale, int out_loc, int limit) { -// const int in_loc = scale * out_loc; -// const float delta = scale * out_loc - in_loc; -// const int offset = delta * kTableSize + 0.5; //lrintf not found in opencl -// int4 indices = {Bound(in_loc - 1, limit), Bound(in_loc, limit), -// Bound(in_loc + 1, limit), Bound(in_loc + 2, limit)}; -// return indices; -//} __kernel void resize_bicubic_nocache(KERNEL_ERROR_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 - __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ + __read_only image2d_t input, __write_only image2d_t output, __private const float height_scale, __private const float width_scale, @@ -83,27 +54,30 @@ __kernel void resize_bicubic_nocache(KERNEL_ERROR_PARAMS const int in_w_offset = mul24(ch_blk, in_width); const int in_h_offset = mul24(b, in_height); - //begin resize bicubic const int h_in_loc = height_scale * h; const float h_delta = height_scale * h - h_in_loc; - const int h_offset = h_delta * kTableSize + 0.5; //lrintf not found in opencl; + const int h_offset = h_delta * kTableSize + 0.5; const int w_in_loc = width_scale * w; const float w_delta = width_scale * w - w_in_loc; - const int w_offset = w_delta * kTableSize + 0.5; //lrintf not found in opencl; + const int w_offset = w_delta * kTableSize + 0.5; float4 y_weights = {ComputeCoeffs(h_offset * 2 + 1), ComputeCoeffs(h_offset * 2), ComputeCoeffs((kTableSize - h_offset) * 2), ComputeCoeffs((kTableSize - h_offset) * 2 + 1)}; - int4 y_indices = {BOUND(h_in_loc - 1, in_height), BOUND(h_in_loc, in_height), - BOUND(h_in_loc + 1, in_height), BOUND(h_in_loc + 2, in_height)}; + int4 y_indices = {BOUND(h_in_loc - 1, in_height), + BOUND(h_in_loc, in_height), + BOUND(h_in_loc + 1, in_height), + BOUND(h_in_loc + 2, in_height)}; float4 x_weights = {ComputeCoeffs(w_offset * 2 + 1), ComputeCoeffs(w_offset * 2), ComputeCoeffs((kTableSize - w_offset) * 2), ComputeCoeffs((kTableSize - w_offset) * 2 + 1)}; - int4 x_indices = {BOUND(w_in_loc - 1, in_width), BOUND(w_in_loc, in_width), - BOUND(w_in_loc + 1, in_width), BOUND(w_in_loc + 2, in_width)}; + int4 x_indices = {BOUND(w_in_loc - 1, in_width), + BOUND(w_in_loc, in_width), + BOUND(w_in_loc + 1, in_width), + BOUND(w_in_loc + 2, in_width)}; float4 coeffs0 = {0, 0, 0, 0}; float4 coeffs1 = {0, 0, 0, 0}; @@ -115,13 +89,13 @@ __kernel void resize_bicubic_nocache(KERNEL_ERROR_PARAMS if ( i == 2 ) { y_index = y_indices.s2; } if ( i == 3 ) { y_index = y_indices.s3; } DATA_TYPE4 data0 = READ_IMAGET(input, SAMPLER, - (int2)(in_w_offset + x_indices.s0, in_h_offset + y_index)); + (int2)(in_w_offset + x_indices.s0, in_h_offset + y_index)); DATA_TYPE4 data1 = READ_IMAGET(input, SAMPLER, - (int2)(in_w_offset + x_indices.s1, in_h_offset + y_index)); + (int2)(in_w_offset + x_indices.s1, in_h_offset + y_index)); DATA_TYPE4 data2 = READ_IMAGET(input, SAMPLER, - (int2)(in_w_offset + x_indices.s2, in_h_offset + y_index)); + (int2)(in_w_offset + x_indices.s2, in_h_offset + y_index)); DATA_TYPE4 data3 = READ_IMAGET(input, SAMPLER, - (int2)(in_w_offset + x_indices.s3, in_h_offset + y_index)); + (int2)(in_w_offset + x_indices.s3, in_h_offset + y_index)); float4 xw0 = { x_weights.s0, x_weights.s0, x_weights.s0, x_weights.s0 }; float4 xw1 = { x_weights.s1, x_weights.s1, x_weights.s1, x_weights.s1 }; @@ -150,7 +124,6 @@ __kernel void resize_bicubic_nocache(KERNEL_ERROR_PARAMS const int out_h_offset = mul24(b, out_height); WRITE_IMAGET(output, (int2)(out_w_offset + w, out_h_offset + h), outdata); - //end bicubic } diff --git a/mace/kernels/opencl/resize_bicubic.cc b/mace/kernels/opencl/resize_bicubic.cc index d3ecfadc566902dd212053600273a929b446e62d..2b043794f26ade30c7e22b07d67b56570514d389 100644 --- a/mace/kernels/opencl/resize_bicubic.cc +++ b/mace/kernels/opencl/resize_bicubic.cc @@ -85,13 +85,7 @@ MaceStatus ResizeBicubicFunctor::operator()( kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } -// //create buffer -// cl::Buffer coeffs_buf(runtime->context(), CL_MEM_READ_WRITE, (kTableSize + 1 ) * 2 * sizeof(float)); -// if (runtime->command_queue().enqueueWriteBuffer(coeffs_buf,CL_TRUE,0,(kTableSize + 1 ) * 2 * sizeof(float),GetCoeffsTable()) != CL_SUCCESS) { -// std::cout << "Failed to write memory;n"; -// exit(1); -// } -// //end create buffer + if (!IsVecEqual(input_shape_, input->shape())) { MACE_CHECK(out_height > 0 && out_width > 0); std::vector output_shape{batch, out_height, out_width, channels}; @@ -116,7 +110,6 @@ MaceStatus ResizeBicubicFunctor::operator()( kernel_.setArg(idx++, static_cast(in_height)); kernel_.setArg(idx++, static_cast(in_width)); kernel_.setArg(idx++, static_cast(out_height)); -// kernel_.setArg(idx++, coeffs_buf); input_shape_ = input->shape(); } diff --git a/mace/kernels/resize_bicubic.h b/mace/kernels/resize_bicubic.h index 92f82cb26ccbf0ce38fb4cbc8579f48863344703..1da7c2b1f0873148d5e4cc563cb94fa2828177e8 100644 --- a/mace/kernels/resize_bicubic.h +++ b/mace/kernels/resize_bicubic.h @@ -95,12 +95,6 @@ inline void ResizeImage(const float *images, const float height_scale, const float width_scale, float *output) { -// std::stringstream tabss; -// for(int tabi = 0; tabi < 10; tabi++){ -// tabss << GetCoeffsTable()[tabi] << " "; -// } -// LOG(WARNING) << tabss.str().c_str(); - std::array coeff = {{0.0, 0.0, 0.0, 0.0}}; #pragma omp parallel for collapse(2) for (index_t b = 0; b < batch_size; ++b) { @@ -119,22 +113,24 @@ inline void ResizeImage(const float *images, for (index_t c = 0; c < channels; ++c) { // Use a 4x4 patch to compute the interpolated output value at // (b, y, x, c). - - const float *channel_input_ptr = images + (b * channels + c) * in_height * in_width; - float *channel_output_ptr = output + (b * channels + c) * out_height * out_width; + const float *channel_input_ptr = + images + (b * channels + c) * in_height * in_width; + float *channel_output_ptr = + output + (b * channels + c) * out_height * out_width; for (index_t i = 0; i < 4; ++i) { const std::array values = { - {static_cast( - channel_input_ptr[y_indices[i] * in_width + x_indices[0]]), - static_cast( - channel_input_ptr[y_indices[i] * in_width + x_indices[1]]), - static_cast( - channel_input_ptr[y_indices[i] * in_width + x_indices[2]]), - static_cast( - channel_input_ptr[y_indices[i] * in_width + x_indices[3]])}}; + {static_cast(channel_input_ptr + [y_indices[i] * in_width + x_indices[0]]), + static_cast(channel_input_ptr + [y_indices[i] * in_width + x_indices[1]]), + static_cast(channel_input_ptr + [y_indices[i] * in_width + x_indices[2]]), + static_cast(channel_input_ptr + [y_indices[i] * in_width + x_indices[3]])}}; coeff[i] = Interpolate1D(x_weights, values); } - channel_output_ptr[y * out_width + x] = Interpolate1D(y_weights, coeff); + channel_output_ptr[y * out_width + x] = + Interpolate1D(y_weights, coeff); } } } diff --git a/mace/ops/resize_bicubic_benchmark.cc b/mace/ops/resize_bicubic_benchmark.cc index 6685392df0fd2384a9554ce138aadfe29a5e0410..ba22f4fecdf49267f9f845a0879fe1f38e7faa0f 100644 --- a/mace/ops/resize_bicubic_benchmark.cc +++ b/mace/ops/resize_bicubic_benchmark.cc @@ -83,17 +83,17 @@ void ResizeBicubicBenchmark(int iters, } // namespace #define MACE_BM_RESIZE_BICUBIC_MACRO(N, C, H0, W0, H1, W1, TYPE, DEVICE) \ - static void \ + static void \ MACE_BM_RESIZE_BICUBIC_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\ - ##DEVICE( \ - int iters) { \ - const int64_t macc = static_cast(iters) * N * C * H1 * W1 * 3; \ - const int64_t tot = static_cast(iters) * N * C * H0 * W0; \ - mace::testing::MaccProcessed(macc); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + ##DEVICE( \ + int iters) { \ + const int64_t macc = static_cast(iters) * N * C * H1 * W1 * 3; \ + const int64_t tot = static_cast(iters) * N * C * H0 * W0; \ + mace::testing::MaccProcessed(macc); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ ResizeBicubicBenchmark(iters, N, C, H0, W0, H1, W1); \ - } \ - MACE_BENCHMARK( \ + } \ + MACE_BENCHMARK( \ MACE_BM_RESIZE_BICUBIC_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\ ##DEVICE) diff --git a/mace/ops/resize_bicubic_test.cc b/mace/ops/resize_bicubic_test.cc index 68965321b6f30a4a80d9ce629601d15d847ecb53..a834d98f387fa983fb056c3e0c61f550ab0595b4 100644 --- a/mace/ops/resize_bicubic_test.cc +++ b/mace/ops/resize_bicubic_test.cc @@ -48,7 +48,6 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) { NHWC); // Check - //TODO change expected data auto expected = CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -63,7 +62,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners1) { std::vector input(48); std::iota(begin(input), end(input), 0); net.AddInputFromArray("Input", {1, 4, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat("Input", NHWC, "InputNCHW", + NCHW); OpDefBuilder("ResizeBicubic", "ResizeBicubicTest") .Input("InputNCHW") @@ -73,11 +73,14 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners1) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat("OutputNCHW", NCHW, "Output", + NHWC); // Check - //TODO change expected data - auto expected = CreateTensor({1, 2, 3, 3}, {0., 1., 2.,4.110297, 5.110297, 6.110297, 8.223037, 9.223036, 10.223037, 24., 25., 26., 28.110298, 29.1103, 30.110298, 32.223038, 33.223038, 34.223038}); + auto expected = CreateTensor({1, 2, 3, 3}, + {0., 1., 2., 4.110297, 5.110297, 6.110297, + 8.223037, 9.223036, 10.223037, 24., 25., 26., + 28.110298, 29.1103, 30.110298, 32.223038, 33.223038, 34.223038}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -167,7 +170,6 @@ void TestRandomResizeBicubic() { // Check ExpectTensorNear(expected, *net.GetOutput("DeviceOutput"), 1e-5, 1e-4); - } } } // namespace