From f33ae2060320fe68a1aa0465de503bc882febc8c Mon Sep 17 00:00:00 2001 From: Leo Guo <58431564+ZibinGuo@users.noreply.github.com> Date: Tue, 22 Feb 2022 17:09:42 +0800 Subject: [PATCH] Adapt to batch_norm_grad op and add align function in roi_align op for kunlun (#39685) * Adapt to batch_norm_grad op and add align function in roi_align op for kunlun, *test=kunlun * Adapt to batch_norm, batch_norm_grad op api for kunlun, and add unit-tests of batch_norm, roi_align. *test=kunlun --- paddle/fluid/operators/batch_norm_op_xpu.cc | 140 ++++++++++-------- paddle/fluid/operators/roi_align_op_xpu.cc | 6 +- .../unittests/xpu/test_batch_norm_op_xpu.py | 11 +- .../unittests/xpu/test_roi_align_op_xpu.py | 29 ++-- 4 files changed, 109 insertions(+), 77 deletions(-) diff --git a/paddle/fluid/operators/batch_norm_op_xpu.cc b/paddle/fluid/operators/batch_norm_op_xpu.cc index 505acbbdbde..6699df0c8dc 100644 --- a/paddle/fluid/operators/batch_norm_op_xpu.cc +++ b/paddle/fluid/operators/batch_norm_op_xpu.cc @@ -38,23 +38,25 @@ class BatchNormXPUKernel : public framework::OpKernel { bool global_stats = test_mode || use_global_stats; const auto &data_layout_str = ctx.Attr("data_layout"); const auto data_layout = framework::StringToDataLayout(data_layout_str); - PADDLE_ENFORCE_EQ(data_layout, DataLayout::kNCHW, + PADDLE_ENFORCE_EQ(data_layout_str == "NCHW" || data_layout_str == "NHWC", + true, platform::errors::InvalidArgument( - "The 'data_layout' attribute must be NCHW. But " - "recevived 'data_layout' is [%s].", + "The 'data_layout' attribute must be NCHW or NHWC. " + "But recevived 'data_layout' is [%s].", data_layout_str)); const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 4. But " - "received X's shape = [%s], X's dimension = [%d].", - x_dims, x_dims.size())); - const int N = x_dims[0]; - const int C = x_dims[1]; - const int H = x_dims[2]; - const int W = x_dims[3]; + PADDLE_ENFORCE_EQ( + x_dims.size() >= 2 && x_dims.size() <= 5, true, + platform::errors::InvalidArgument( + "The size of input's dimensions should be between 2 and 5" + "But received: the size of input's dimensions is [%d]", + x_dims.size())); + + int N, C, H, W, D; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + const auto *scale = ctx.Input("Scale"); const auto *bias = ctx.Input("Bias"); const auto *x_data = x->data(); @@ -75,6 +77,7 @@ class BatchNormXPUKernel : public framework::OpKernel { saved_variance->mutable_data(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); + bool is_nchw = data_layout_str == "NCHW"; if (!global_stats) { auto *mean_out_data = mean_out->data(); @@ -95,7 +98,7 @@ class BatchNormXPUKernel : public framework::OpKernel { int r = xpu::batch_norm(dev_ctx.x_context(), x_data, y_data, N, C, H, W, epsilon, momentum, scale_data, bias_data, saved_mean_data, saved_variance_data, - mean_out_data, variance_out_data, true); + mean_out_data, variance_out_data, is_nchw); PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, platform::errors::External( "The batch_norm XPU API return wrong value[%d %s]", @@ -107,7 +110,7 @@ class BatchNormXPUKernel : public framework::OpKernel { const auto *variance_data = variance->data(); int r = xpu::batch_norm_infer(dev_ctx.x_context(), x_data, y_data, N, C, H, W, epsilon, scale_data, bias_data, - mean_data, variance_data, true); + mean_data, variance_data, is_nchw); PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, platform::errors::External( @@ -168,11 +171,11 @@ class BatchNormGradXPUKernel : public framework::OpKernel { const float epsilon = ctx.Attr("epsilon"); const auto data_layout = framework::StringToDataLayout(data_layout_str); - // TODO(guozbin): Transform input tensor from NHWC to NCHW - PADDLE_ENFORCE_EQ(data_layout, DataLayout::kNCHW, + PADDLE_ENFORCE_EQ(data_layout_str == "NCHW" || data_layout_str == "NHWC", + true, platform::errors::InvalidArgument( - "The 'data_layout' attribute must be NCHW. But " - "recevived 'data_layout' is [%s].", + "The 'data_layout' attribute must be NCHW or NHWC. " + "But recevived 'data_layout' is [%s].", data_layout_str)); auto *d_x = ctx.Output(framework::GradVarName("X")); @@ -207,15 +210,15 @@ class BatchNormGradXPUKernel : public framework::OpKernel { } const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 4. But " - "received X's shape = [%s], X's dimension = [%d].", - x_dims, x_dims.size())); - const int N = x_dims[0]; - const int C = x_dims[1]; - const int H = x_dims[2]; - const int W = x_dims[3]; + PADDLE_ENFORCE_EQ( + x_dims.size() >= 2 && x_dims.size() <= 5, true, + platform::errors::InvalidArgument( + "The size of input's dimensions should be between 2 and 5" + "But received: the size of input's dimensions is [%d]", + x_dims.size())); + + int N, C, H, W, D; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); const auto *x_data = x->data(); const auto *d_y_data = d_y->data(); @@ -250,38 +253,35 @@ class BatchNormGradXPUKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - const T *mean_data = nullptr; - const T *inv_var_data = nullptr; + const auto *batch_mean = ctx.Input("SavedMean"); + const auto *batch_inv_std = ctx.Input("SavedVariance"); + const auto *global_mean = ctx.Input("Mean"); + const auto *global_var = ctx.Input("Variance"); // TODO(guozibin): hadle the situation case of N * H * W = 1 - if (!use_global_stats) { - const auto *saved_mean = ctx.Input("SavedMean"); - // SavedVariance have been reverted in forward operator - const auto *saved_inv_variance = ctx.Input("SavedVariance"); - mean_data = saved_mean->data(); - inv_var_data = saved_inv_variance->data(); - } else { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - mean_data = running_mean->data(); - inv_var_data = running_variance->data(); - float *running_inv_var_data = - RAII_GUARD.alloc_l3_or_gm(running_variance->numel()); - float *epsilon_data = RAII_GUARD.alloc_l3_or_gm(1); - int r1 = calculate_inv_var(dev_ctx.x_context(), inv_var_data, epsilon, C, - epsilon_data, running_inv_var_data); - PADDLE_ENFORCE_EQ(r1, XPU_SUCCESS, platform::errors::External( - "XPU API(batch_norm_grad " - "calculate_inv_var function) " - "return wrong value[%d %s]", - r1, XPUAPIErrorMsg[r1])); - inv_var_data = running_inv_var_data; - } if (is_inplace) { + float *global_inv_std_data; + if (use_global_stats) { + global_inv_std_data = + RAII_GUARD.alloc_l3_or_gm(global_var->numel()); + float *epsilon_data = RAII_GUARD.alloc_l3_or_gm(1); + int r1 = + calculate_inv_var(dev_ctx.x_context(), global_var->data(), + epsilon, C, epsilon_data, global_inv_std_data); + PADDLE_ENFORCE_EQ(r1, XPU_SUCCESS, platform::errors::External( + "XPU API(batch_norm_grad " + "calculate_inv_var function) " + "return wrong value[%d %s]", + r1, XPUAPIErrorMsg[r1])); + } auto px = *x; + auto *inv_std_data = + use_global_stats ? global_inv_std_data : batch_inv_std->data(); + auto mean_data = use_global_stats ? global_mean->data() + : batch_mean->data(); int r2 = calculate_inv_BN_Y( dev_ctx.x_context(), px.mutable_data(ctx.GetPlace()), - scale->data(), bias->data(), mean_data, inv_var_data, N, + scale->data(), bias->data(), mean_data, inv_std_data, N, C, H * W, x->data()); PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, platform::errors::External( "XPU API(batch_norm_grad " @@ -289,19 +289,29 @@ class BatchNormGradXPUKernel : public framework::OpKernel { "return wrong value[%d %s]", r2, XPUAPIErrorMsg[r2])); } - if (!d_x) { - d_x_data = RAII_GUARD.alloc_l3_or_gm(x->numel()); - } - if (!d_scale) { - d_scale_data = RAII_GUARD.alloc_l3_or_gm(C); - } - if (!d_bias_data) { - d_bias_data = RAII_GUARD.alloc_l3_or_gm(C); - } - int r3 = xpu::batch_norm_grad( - dev_ctx.x_context(), x_data, d_y_data, d_x_data, N, C, H, W, scale_data, - mean_data, inv_var_data, d_scale_data, d_bias_data, true); + int r3; + bool is_nchw = data_layout_str == "NCHW"; + if (use_global_stats) { + r3 = xpu::batch_norm_grad( + dev_ctx.x_context(), x_data, d_y_data, d_x_data, N, C, H, W, + scale_data, nullptr, nullptr, d_scale_data, d_bias_data, is_nchw, + global_mean->data(), global_var->data(), epsilon); + } else { + if (!d_x) { + d_x_data = RAII_GUARD.alloc_l3_or_gm(x->numel()); + } + if (!d_scale) { + d_scale_data = RAII_GUARD.alloc_l3_or_gm(C); + } + if (!d_bias_data) { + d_bias_data = RAII_GUARD.alloc_l3_or_gm(C); + } + r3 = xpu::batch_norm_grad( + dev_ctx.x_context(), x_data, d_y_data, d_x_data, N, C, H, W, + scale_data, batch_mean->data(), batch_inv_std->data(), + d_scale_data, d_bias_data, is_nchw); + } PADDLE_ENFORCE_EQ(r3, XPU_SUCCESS, platform::errors::External( "XPU API(batch_norm_grad) return " "wrong value[%d %s]", diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc index 7764e52c2f6..09d2d906653 100644 --- a/paddle/fluid/operators/roi_align_op_xpu.cc +++ b/paddle/fluid/operators/roi_align_op_xpu.cc @@ -32,6 +32,7 @@ class XPUROIAlignOpKernel : public framework::OpKernel { auto pooled_width = ctx.Attr("pooled_width"); auto spatial_scale = ctx.Attr("spatial_scale"); auto sampling_ratio = ctx.Attr("sampling_ratio"); + auto aligned = ctx.Attr("aligned"); auto in_dims = in->dims(); int batch_size = in_dims[0]; @@ -117,7 +118,7 @@ class XPUROIAlignOpKernel : public framework::OpKernel { dev_ctx.x_context(), in->data(), out->mutable_data(ctx.GetPlace()), rois->data(), roi_id_data, batch_size, channels, height, width, out->dims()[0], pooled_height, - pooled_width, spatial_scale, sampling_ratio, true); + pooled_width, spatial_scale, sampling_ratio, true, aligned); PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, platform::errors::External( "The roi_align XPU OP return wrong value[%d %s]", r, @@ -143,6 +144,7 @@ class XPUROIAlignGradOpKernel : public framework::OpKernel { auto pooled_width = ctx.Attr("pooled_width"); auto spatial_scale = ctx.Attr("spatial_scale"); auto sampling_ratio = ctx.Attr("sampling_ratio"); + auto aligned = ctx.Attr("aligned"); int rois_num = rois->dims()[0]; int channels = in->dims()[1]; @@ -197,7 +199,7 @@ class XPUROIAlignGradOpKernel : public framework::OpKernel { dev_ctx.x_context(), out_grad->data(), in_grad->data(), rois->data(), roi_id_data, in->dims()[0], channels, height, width, out_grad->dims()[0], pooled_height, pooled_width, spatial_scale, - sampling_ratio, true); + sampling_ratio, true, aligned); PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, platform::errors::External( diff --git a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py index 9cd34c82650..f401a9a5374 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py @@ -296,7 +296,9 @@ class TestXPUBatchNormOpUseGlobalStats(unittest.TestCase): net2.training = False y1 = net1(x) y2 = net2(x) - self.assertEqual(np.allclose(y1.numpy(), y2.numpy()), True) + self.assertEqual( + np.allclose( + y1.numpy(), y2.numpy(), atol=1e-4), True) class TestXPUBatchNormUseGlobalStatsCase1(TestXPUBatchNormOpUseGlobalStats): @@ -320,5 +322,12 @@ class TestXPUBatchNormUseGlobalStatsCase3(TestXPUBatchNormOpUseGlobalStats): self.trainable_statistics = True +class TestXPUBatchNormUseGlobalStatsCase4(TestXPUBatchNormOpUseGlobalStats): + ### train mode + def init_test(self): + self.use_global_stats = True + self.trainable_statistics = False + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py index 2122223dbec..e80b1e4c50e 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py @@ -40,7 +40,8 @@ class TestROIAlignOp(XPUOpTest): 'spatial_scale': self.spatial_scale, 'pooled_height': self.pooled_height, 'pooled_width': self.pooled_width, - 'sampling_ratio': self.sampling_ratio + 'sampling_ratio': self.sampling_ratio, + 'aligned': self.continuous_coordinate } self.outputs = {'Out': self.out_data} @@ -51,6 +52,8 @@ class TestROIAlignOp(XPUOpTest): self.height = 8 self.width = 6 + self.xpu_version = core.get_xpu_device_version(0) + # n, c, h, w self.x_dim = (self.batch_size, self.channels, self.height, self.width) @@ -58,7 +61,10 @@ class TestROIAlignOp(XPUOpTest): self.pooled_height = 2 self.pooled_width = 2 self.sampling_ratio = -1 - + if self.xpu_version == core.XPUVersion.XPU1: + self.continuous_coordinate = False + else: + self.continuous_coordinate = bool(np.random.randint(2)) self.x = np.random.random(self.x_dim).astype('float32') def pre_calc(self, x_i, roi_xmin, roi_ymin, roi_bin_grid_h, roi_bin_grid_w, @@ -124,12 +130,16 @@ class TestROIAlignOp(XPUOpTest): roi = self.rois[i] roi_batch_id = int(roi[0]) x_i = self.x[roi_batch_id] - roi_xmin = roi[1] * self.spatial_scale - roi_ymin = roi[2] * self.spatial_scale - roi_xmax = roi[3] * self.spatial_scale - roi_ymax = roi[4] * self.spatial_scale - roi_width = max(roi_xmax - roi_xmin, 1) - roi_height = max(roi_ymax - roi_ymin, 1) + roi_offset = 0.5 if self.continuous_coordinate else 0 + roi_xmin = roi[1] * self.spatial_scale - roi_offset + roi_ymin = roi[2] * self.spatial_scale - roi_offset + roi_xmax = roi[3] * self.spatial_scale - roi_offset + roi_ymax = roi[4] * self.spatial_scale - roi_offset + roi_width = roi_xmax - roi_xmin + roi_height = roi_ymax - roi_ymin + if not self.continuous_coordinate: + roi_width = max(roi_width, 1) + roi_height = max(roi_height, 1) bin_size_h = float(roi_height) / float(self.pooled_height) bin_size_w = float(roi_width) / float(self.pooled_width) roi_bin_grid_h = self.sampling_ratio if self.sampling_ratio > 0 else \ @@ -203,7 +213,8 @@ class TestROIAlignInLodOp(TestROIAlignOp): 'spatial_scale': self.spatial_scale, 'pooled_height': self.pooled_height, 'pooled_width': self.pooled_width, - 'sampling_ratio': self.sampling_ratio + 'sampling_ratio': self.sampling_ratio, + 'aligned': self.continuous_coordinate } self.outputs = {'Out': self.out_data} -- GitLab