diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index adc0842fb38825b5c86db28932da13316eea6820..95753bb336354e7e4cd41307bbe4f9d703d6edd7 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -103,11 +103,12 @@ class ElementwiseOp : public framework::OperatorWithKernel {
       std::vector<int> out_dims_array(max_dim);
 #ifdef PADDLE_WITH_MKLDNN
       // (jczaja): Broadcasting of dims has to be done on Paddle shapes (NHWC)
-      // if model is using NHWC.
+      // if model is using NHWC and any of shapes in at least 3D
       bool should_rotate =
           ctx->IsRunMKLDNNKernel() &&
           (platform::MKLDNNDeviceContext::tls().get_cur_paddle_data_layout() ==
-           framework::DataLayout::kNHWC);
+           framework::DataLayout::kNHWC) &&
+          (x_dims.size() >= 3 || y_dims.size() >= 3);
       if (should_rotate) {
         // Pick bigger shape and rotate this one
         bool x_over_y = (x_dims.size() > y_dims.size());
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
index 0811407466ddcea428c5cee4fab22ec83a128814..2540af5d472c4916514cbfa8257af487b6de6b5f 100644
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -585,6 +585,19 @@ class MatMulOp : public framework::OperatorWithKernel {
 
     auto dim_x = GetDimForInput(*context, "X");
     auto dim_y = GetDimForInput(*context, "Y");
+
+#ifdef PADDLE_WITH_MKLDNN
+    // (jczaja): For NHWC execution output shape needs
+    // to be computed like instead x*y we are to do y*x
+    bool channelwise_onednn =
+        context->IsRunMKLDNNKernel() &&
+        (platform::MKLDNNDeviceContext::tls().get_cur_paddle_data_layout() ==
+         framework::DataLayout::kNHWC);
+    if (channelwise_onednn) {
+      std::swap(dim_x, dim_y);
+    }
+#endif
+
     auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(
         RowMatrixFromVector(dim_x), 0,
         context->Attrs().Get<bool>("transpose_X"));
@@ -770,6 +783,21 @@ class MatMulOp : public framework::OperatorWithKernel {
           framework::TransToProtoVarType(tensor.dtype()), tensor.place(),
           tensor.layout());
     } else {
+#ifdef PADDLE_WITH_MKLDNN
+      // When matmul is first oneDNN op in a chain (there was some non oneDNN op
+      // previously)
+      // then we also need to rotate shape NHWC -> NCWH
+      if ((expected_kernel_type.data_layout_ ==
+           framework::DataLayout::kMKLDNN) &&
+          (tensor.layout() != framework::DataLayout::kMKLDNN) &&
+          paddle::platform::MKLDNNDeviceContext::tls()
+                  .get_cur_paddle_data_layout() ==
+              framework::DataLayout::kNHWC) {
+        return framework::OpKernelType(expected_kernel_type.data_type_,
+                                       tensor.place(),
+                                       framework::DataLayout::kNHWC);
+      }
+#endif
       return framework::OpKernelType(expected_kernel_type.data_type_,
                                      tensor.place(), tensor.layout());
     }
diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc
index 01fa01e3c6ed04c151f709dd5fbebe387c32bde3..55294331a9c85e980dfb4bb5d5fbfe650ef9b06c 100644
--- a/paddle/fluid/operators/matmul_v2_op.cc
+++ b/paddle/fluid/operators/matmul_v2_op.cc
@@ -274,6 +274,22 @@ class MatMulV2Op : public framework::OperatorWithKernel {
           framework::TransToProtoVarType(tensor.dtype()), tensor.place(),
           tensor.layout());
     } else {
+#ifdef PADDLE_WITH_MKLDNN
+      // When matmul_v2 is first oneDNN op in a chain (there was some non oneDNN
+      // op
+      // previously)
+      // then we also need to rotate shape NHWC -> NCWH
+      if ((expected_kernel_type.data_layout_ ==
+           framework::DataLayout::kMKLDNN) &&
+          (tensor.layout() != framework::DataLayout::kMKLDNN) &&
+          paddle::platform::MKLDNNDeviceContext::tls()
+                  .get_cur_paddle_data_layout() ==
+              framework::DataLayout::kNHWC) {
+        return framework::OpKernelType(expected_kernel_type.data_type_,
+                                       tensor.place(),
+                                       framework::DataLayout::kNHWC);
+      }
+#endif
       return framework::OpKernelType(expected_kernel_type.data_type_,
                                      tensor.place(), tensor.layout());
     }
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 17736a87409af94712ad79c62f012c7c88c06174..94c0124440ea9e671b0a9b2e17bdccd6923b20f5 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -78,13 +78,6 @@ tf_pd<Type> MKLDNNBwdPrimitiveDesc(const Engine& e, const Primitive& p,
 inline void MatchShapeToLayout(framework::Tensor* tensor_in,
                                framework::DataLayout from,
                                framework::DataLayout to) {
-  // In these data layouts, channel dimension is either on 2nd position: nChw or
-  // at last nhwC, so for dim==2 these layouts are the same and nothing should
-  // be done. Similarly for dim==1 when you have just one possible combination.
-  if (tensor_in->dims().size() < 3) {
-    return;
-  }
-
   auto print_dims = [](const std::vector<int>& dims) {
     std::ostringstream oss;
 
@@ -101,6 +94,15 @@ inline void MatchShapeToLayout(framework::Tensor* tensor_in,
     return oss.str();
   };
 
+  // In these data layouts, channel dimension is either on 2nd position: nChw or
+  // at last nhwC, so for dim==2 these layouts are the same and nothing should
+  // be done. Similarly for dim==1 when you have just one possible combination.
+  if (tensor_in->dims().size() < 3) {
+    VLOG(3) << "Keeping kMKLDNN/kNHWC/kNDHWC output_shape"
+            << print_dims(phi::vectorize<int>(tensor_in->dims()));
+    return;
+  }
+
   switch (from) {
     case framework::DataLayout::kMKLDNN:
       if ((to == framework::DataLayout::kNHWC) ||
@@ -571,6 +573,12 @@ inline void RegisterModelLayout(
     std::vector<std::unique_ptr<framework::OperatorBase>>& ops,
     const platform::Place& place) {
   if (platform::is_cpu_place(place)) {
+    // If there is already registered NHWC then quit this call
+    // not to overwrite setting with analysis of internal "while" op block
+    if (platform::MKLDNNDeviceContext::tls().get_cur_paddle_data_layout() ==
+        framework::DataLayout::kNHWC)
+      return;
+
     VLOG(4) << "RegisterModelLayout for mkldnn";
     auto check_attrib = [](std::unique_ptr<framework::OperatorBase>& op,
                            const std::string& attrib_name) -> bool {
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py
index 25701b797ec4a1cbdc540dad703c83101f2dd1ec..4e59e41b60851d5ed32adf72d8095dedb203b0d8 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py
@@ -67,6 +67,8 @@ class TestMatMulV2VectorXVectorOneDNNOp(OpTest):
         self.y_shape = (100, )
         self.trans_x = False
         self.trans_y = False
+        self._cpu_only = True
+        self.use_mkldnn = True
 
     def set_inputs(self, x, y):
         self.inputs = {'X': x, 'Y': y}