Added nearest interp v2 BF16 FWD kernel (#39490)

* added nearest interp v2 bf16 * disabled bilinear interp nhwc test * added skipping UT for gpu * added NHWC support * removed unnecessary statements * minor change * CI fix * added appropriate changes to interpolate_v1 * fix after review * minor change * minor change * revert unwanted deletions * CI fix

Added nearest interp v2 BF16 FWD kernel (#39490)
* added nearest interp v2 bf16 * disabled bilinear interp nhwc test * added skipping UT for gpu * added NHWC support * removed unnecessary statements * minor change * CI fix * added appropriate changes to interpolate_v1 * fix after review * minor change * minor change * revert unwanted deletions * CI fix
2ec943a7 · jakpiase · GitHub · 1abfc8dd · 2ec943a7 · 2ec943a7
4 changed file
--- a/paddle/fluid/operators/interpolate_op.cc
+++ b/paddle/fluid/operators/interpolate_op.cc
@@ -328,7 +328,7 @@ class InterpolateOp : public framework::OperatorWithKernel {
    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");

 #ifdef PADDLE_WITH_MKLDNN
-    auto interp_method = ctx.Attr<std::string>("interp_method");
+    const auto& interp_method = ctx.Attr<std::string>("interp_method");
    // TODO(danqing): support other interp_method
    if (this->CanMKLDNNBeUsed(ctx, data_type) &&
        (interp_method == "nearest" || interp_method == "bilinear")) {

--- a/paddle/fluid/operators/interpolate_v2_op.cc
+++ b/paddle/fluid/operators/interpolate_v2_op.cc
@@ -414,7 +414,7 @@ class InterpolateV2Op : public framework::OperatorWithKernel {
    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");

 #ifdef PADDLE_WITH_MKLDNN
-    auto interp_method = ctx.Attr<std::string>("interp_method");
+    const auto& interp_method = ctx.Attr<std::string>("interp_method");
    // TODO(danqing): support other interp_method
    if (this->CanMKLDNNBeUsed(ctx, data_type) &&
        (interp_method == "nearest" || interp_method == "bilinear")) {

--- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc
@@ -53,17 +53,13 @@ class InterpolateMKLDNNKernel : public framework::OpKernel<T> {
  std::vector<int> ComputeOutputShape(
      const framework::ExecutionContext& ctx) const {
    const auto* x = ctx.Input<Tensor>("X");
-    auto in_dims = x->dims();
-    const bool is_channel_last = false;  // In mkldnn kernel, always use NCHW
-
-    framework::DDim in_dhw_dims;
-    if (is_channel_last) {  // NDHWC, NHWC, NWC
-      in_dhw_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
-    } else {  // NCDHW, NCHW, NCW
-      in_dhw_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-    }
+    const auto& in_dims = x->dims();
+
+    const framework::DDim in_dhw_dims =
+        phi::slice_ddim(in_dims, 2, in_dims.size());

    std::vector<int> out_dims;
+    out_dims.reserve(5);
    if (in_dhw_dims.size() == 1) {
      out_dims.push_back(ctx.Attr<int>("out_w"));
    } else if (in_dhw_dims.size() == 2) {
@@ -125,12 +121,8 @@ class InterpolateMKLDNNKernel : public framework::OpKernel<T> {
                             "out_d, out_h, out_w of Op(interpolate) "
                             "should be greater than 0."));

-    out_dims.insert(out_dims.begin(), in_dims[0]);
-    if (is_channel_last) {
-      out_dims.push_back(in_dims[in_dims.size() - 1]);
-    } else {
-      out_dims.insert(out_dims.begin() + 1, in_dims[1]);
-    }
+    const std::vector<int64_t> nc_dims = {in_dims[0], in_dims[1]};
+    out_dims.insert(out_dims.begin(), nc_dims.begin(), nc_dims.end());
    return out_dims;
  }

@@ -143,12 +135,12 @@ class InterpolateMKLDNNKernel : public framework::OpKernel<T> {
    const auto* x = ctx.Input<Tensor>("X");
    auto* z = ctx.Output<Tensor>("Out");

-    auto interp_method = ctx.Attr<std::string>("interp_method");
-    dnnl::algorithm algo = (interp_method == "nearest")
-                               ? dnnl::algorithm::resampling_nearest
-                               : dnnl::algorithm::resampling_linear;
+    const auto interp_method = ctx.Attr<std::string>("interp_method");
+    const dnnl::algorithm algo = (interp_method == "nearest")
+                                     ? dnnl::algorithm::resampling_nearest
+                                     : dnnl::algorithm::resampling_linear;

-    auto out_dims_vec = ComputeOutputShape(ctx);
+    const auto out_dims_vec = ComputeOutputShape(ctx);
    framework::DDim dim_out = phi::make_ddim(out_dims_vec);
    z->Resize(dim_out);

@@ -162,6 +154,7 @@ class InterpolateMKLDNNKernel : public framework::OpKernel<T> {
    const std::unordered_map<int, dnnl::memory> args = {
        {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}};
    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+
    resampling_prim->execute(astream, args);
    astream.wait();

@@ -184,6 +177,7 @@ REGISTER_OP_KERNEL(bilinear_interp, MKLDNN, ::paddle::platform::CPUPlace,

 REGISTER_OP_KERNEL(nearest_interp_v2, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::InterpolateMKLDNNKernel<float>,
+                   ops::InterpolateMKLDNNKernel<paddle::platform::bfloat16>,
                   ops::InterpolateMKLDNNKernel<int8_t>,
                   ops::InterpolateMKLDNNKernel<uint8_t>);
 REGISTER_OP_KERNEL(bilinear_interp_v2, MKLDNN, ::paddle::platform::CPUPlace,

--- a/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_v2_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_v2_mkldnn_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function

 import unittest
 import numpy as np
-from paddle.fluid.tests.unittests.op_test import OpTest
+from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16
 from paddle.fluid.tests.unittests.op_test import skip_check_grad_ci


@@ -59,6 +59,7 @@ def nearest_neighbor_interp_mkldnn_np(X,


 @skip_check_grad_ci(reason="Haven not implement interpolate grad kernel.")
+@OpTestTool.skip_if_not_cpu_bf16()
 class TestNearestInterpV2MKLDNNOp(OpTest):
    def init_test_case(self):
        pass
@@ -84,7 +85,7 @@ class TestNearestInterpV2MKLDNNOp(OpTest):
        self.init_test_case()
        self.init_data_type()

-        if self.dtype == np.float32:
+        if self.dtype == np.float32 or self.dtype == np.uint16:
            input_np = np.random.random(self.input_shape).astype(self.dtype)
        else:
            init_low, init_high = (-5, 5) if self.dtype == np.int8 else (0, 10)
@@ -126,6 +127,9 @@ class TestNearestInterpV2MKLDNNOp(OpTest):
        if isinstance(self.scale, float):
            self.scale = [self.scale]

+        if self.dtype == np.uint16:
+            input_np = convert_float_to_uint16(input_np)
+
        self.inputs = {'X': input_np}
        if self.out_size is not None:
            self.inputs['OutSize'] = self.out_size
@@ -191,6 +195,10 @@ def create_test_class(parent):
        def init_data_type(self):
            self.dtype = np.float32

+    class TestBf16Case(parent):
+        def init_data_type(self):
+            self.dtype = np.uint16
+
    class TestInt8Case(parent):
        def init_data_type(self):
            self.dtype = np.int8
@@ -199,12 +207,14 @@ def create_test_class(parent):
        def init_data_type(self):
            self.dtype = np.uint8

-    TestFp32Case.__name__ = parent.__name__
-    TestInt8Case.__name__ = parent.__name__
-    TestUint8Case.__name__ = parent.__name__
-    globals()[parent.__name__] = TestFp32Case
-    globals()[parent.__name__] = TestInt8Case
-    globals()[parent.__name__] = TestUint8Case
+    TestFp32Case.__name__ = "{0}_{1}".format(parent.__name__, "FP32")
+    TestBf16Case.__name__ = "{0}_{1}".format(parent.__name__, "BF16")
+    TestInt8Case.__name__ = "{0}_{1}".format(parent.__name__, "INT8")
+    TestUint8Case.__name__ = "{0}_{1}".format(parent.__name__, "UINT8")
+    globals()[TestFp32Case.__name__] = TestFp32Case
+    globals()[TestBf16Case.__name__] = TestBf16Case
+    globals()[TestInt8Case.__name__] = TestInt8Case
+    globals()[TestUint8Case.__name__] = TestUint8Case


 create_test_class(TestNearestInterpV2MKLDNNOp)