// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include #include #include #include "lite/backends/opencl/target_wrapper.h" #include "lite/core/op_registry.h" #include "lite/core/profile/timer.h" #include "lite/core/tensor.h" #include "lite/kernels/opencl/test_helper.h" #define FP16_MAX_DIFF (5e-3) DEFINE_int32(warmup, 0, "warmup times"); DEFINE_int32(repeats, 1, "repeats times"); using paddle::lite::profile::Timer; namespace paddle { namespace lite { void instance_norm_ref(Tensor* x, Tensor* y, Tensor* scale, Tensor* bias, Tensor* saved_mean, Tensor* saved_variance, float epsilon) { auto x_data = x->data(); auto scale_data = scale->data(); auto bias_data = bias->data(); auto y_data = y->mutable_data(); auto saved_mean_data = saved_mean->mutable_data(); auto saved_variance_data = saved_variance->mutable_data(); int n = x->dims()[0]; int c = x->dims()[1]; int spatial_size = x->dims()[2] * x->dims()[3]; // compute mean for (int i = 0; i < n * c; ++i) { const float* x_ptr = x_data + i * spatial_size; float sum = 0.f; for (int j = 0; j < spatial_size; ++j) { sum += x_ptr[j]; } saved_mean_data[i] = sum / spatial_size; } // compute variance for (int i = 0; i < n * c; ++i) { const float* x_ptr = x_data + i * spatial_size; float sum = 0.f; for (int j = 0; j < spatial_size; ++j) { sum += (x_ptr[j] - saved_mean_data[i]) * (x_ptr[j] - saved_mean_data[i]); } saved_variance_data[i] = 1.f / sqrtf(sum / spatial_size + epsilon); } // compute out for (int i = 0; i < n * c; ++i) { const float* x_ptr = x_data + i * spatial_size; float* y_ptr = y_data + i * spatial_size; float scale_val = scale_data[i % c]; float bias_val = bias_data[i % c]; for (int j = 0; j < spatial_size; ++j) { y_ptr[j] = scale_val * (x_ptr[j] - saved_mean_data[i]) * saved_variance_data[i] + bias_val; } } } // #define INSTANCE_NORM_FP16_LOOP_TEST // #define INSTANCE_NORM_FP16_PRINT_RESULT TEST(instance_norm_image2d, compute) { #ifdef INSTANCE_NORM_FP16_LOOP_TEST for (auto n : {1, 3}) { for (auto c : {1, 3, 8, 32, 65}) { for (auto h : {4, 20, 64, 112, 224}) { for (auto w : {2, 20, 64, 112, 224}) { #else const int n = 1; const int c = 32; const int h = 224; const int w = 224; #endif // INSTANCE_NORM_FP16_LOOP_TEST LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " " << h << " " << w << " ========"; auto kernels = KernelRegistry::Global().Create("instance_norm", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)); ASSERT_FALSE(kernels.empty()); auto kernel = std::move(kernels.front()); LOG(INFO) << "get kernel:" << kernel->doc(); lite::Tensor x, out, out_ref, scale, bias, saved_mean, saved_variance; operators::InstanceNormParam param; param.x = &x; param.out = &out; param.scale = &scale; param.bias = &bias; param.saved_mean = &saved_mean; param.saved_variance = &saved_variance; param.epsilon = 1e-5; std::unique_ptr context(new KernelContext); context->As().InitOnce(); kernel->SetParam(param); std::unique_ptr instance_context(new KernelContext); context->As().CopySharedTo( &(instance_context->As())); kernel->SetContext(std::move(instance_context)); const DDim in_dim = DDim(std::vector{n, c, h, w}); x.Resize(in_dim); out.Resize(in_dim); out_ref.Resize(in_dim); scale.Resize({c}); bias.Resize({c}); saved_mean.Resize({n * c}); saved_variance.Resize({n * c}); auto* x_data = x.mutable_data(); auto* scale_data = scale.mutable_data(); auto* bias_data = bias.mutable_data(); auto* saved_mean_data = saved_mean.mutable_data(); auto* saved_variance_data = saved_variance.mutable_data(); std::default_random_engine engine; std::uniform_real_distribution dist(-1, 1); int sum = n * c * h * w; for (int i = 0; i < sum; ++i) { x_data[i] = dist(engine); } for (int i = 0; i < c; ++i) { scale_data[i] = dist(engine); bias_data[i] = dist(engine); } //! run reference instance norm instance_norm_ref( &x, &out_ref, &scale, &bias, &saved_mean, &saved_variance, 1e-5); LOG(INFO) << "prepare input"; CLImageConverterDefault* default_converter = new CLImageConverterDefault(); DDim x_image_shape = default_converter->InitImageDimInfoWith(in_dim); LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " " << x_image_shape[1]; std::vector x_image_data(x_image_shape.production() * 4); // 4 : RGBA default_converter->NCHWToImage(x_data, x_image_data.data(), in_dim); auto* x_image = x.mutable_data( x_image_shape[0], x_image_shape[1], x_image_data.data()); auto* out_image = out.mutable_data( x_image_shape[0], x_image_shape[1]); //! warm up for (int i = 0; i < FLAGS_warmup; ++i) { kernel->Launch(); } context->As().cl_context()->GetCommandQueue().finish(); //! compute Timer t0; t0.Start(); for (int i = 0; i < FLAGS_repeats; ++i) { kernel->Launch(); } context->As().cl_context()->GetCommandQueue().finish(); t0.Stop(); double gops = 6 * sum; LOG(INFO) << "avg time: " << t0.LapTimes().Avg() / FLAGS_repeats << " ms, " << "avg GOPs: " << 1e-6 * gops * FLAGS_repeats / t0.LapTimes().Avg() << " GOPs"; const size_t cl_image2d_row_pitch{0}; const size_t cl_image2d_slice_pitch{0}; half_t* out_image_data = new half_t[x_image_shape.production() * 4]; TargetWrapperCL::ImgcpySync(out_image_data, out_image, x_image_shape[0], x_image_shape[1], cl_image2d_row_pitch, cl_image2d_slice_pitch, IoDirection::DtoH); float* out_data = new float[x_image_shape.production() * 4]; default_converter->ImageToNCHW( out_image_data, out_data, x_image_shape, in_dim); // result #ifdef INSTANCE_NORM_FP16_PRINT_RESULT LOG(INFO) << "---- print kernel result (input -> output) ----"; for (int eidx = 0; eidx < in_dim.production(); ++eidx) { std::cout << x_data[eidx] << " -> " << out_data[eidx] << std::endl; } #endif // INSTANCE_NORM_FP16_PRINT_RESULT auto* out_ref_data = out_ref.data(); for (int i = 0; i < in_dim.production(); i++) { auto abs_diff = abs(out_data[i] - out_ref_data[i]); auto relative_diff = COMPUTE_RELATIVE_DIFF(out_data[i], out_ref_data[i]); EXPECT_EQ( (relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF), true); if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) { LOG(ERROR) << "error idx:" << i << ", in_data[" << i << "]: " << x_data[i] << ", out_data[" << i << "]: " << out_data[i] << ", out_ref[" << i << "]: " << out_ref_data[i] << ", abs_diff: " << abs_diff << ", relative_diff: " << relative_diff << ", FP16_MAX_DIFF: " << FP16_MAX_DIFF; } } delete[] out_data; delete[] out_image_data; #ifdef INSTANCE_NORM_FP16_LOOP_TEST } // w } // h } // c } // n #else // nothing to do. #endif } } // namespace lite } // namespace paddle USE_LITE_KERNEL(instance_norm, kOpenCL, kFP16, kImageDefault, ImageDefault);