diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 18287f0c7a4eec2c93b049251ddc0b0863604828..d8eab0e9a729744d8c3608891cb2f636414821a6 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1908,7 +1908,8 @@ Scope* OperatorWithKernel::PrepareData(
             (var->IsType<LoDTensor>() == true) &&
             (expected_kernel_key.data_layout_ != DataLayout::kMKLDNN) &&
             (paddle::platform::MKLDNNDeviceContext::tls()
-                 .get_cur_paddle_data_layout() == DataLayout::kNHWC)) {
+                 .get_cur_paddle_data_layout() == DataLayout::kNHWC) &&
+            (tensor_in->dims().size() >= 3)) {
           // Mixed execution : MKL-DNN and GPU is not supported!
           if (!new_scope) {
             new_scope = &scope.NewScope();
diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc
index 6bf419c47a5669b87c0b47d48259362a66a23239..fd06e33a6bb6e03b8e90c47ae3edbb9ce18e0e85 100644
--- a/paddle/fluid/operators/controlflow/conditional_block_op.cc
+++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc
@@ -17,6 +17,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/assign_op.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -65,6 +69,12 @@ class ConditionalBlockOp : public ConditionalOp {
       scopes->resize(1);
       scopes->front() = &scope.NewScope();
       auto &cur_scope = *scopes->front();
+#ifdef PADDLE_WITH_MKLDNN
+      // (jczaja) Executor on being destroyed clears oneDNN cache and
+      // resets registered model data layout. This is unwanted for nested
+      // Executors (executors declared inside control ops)
+      platform::DontClearMKLDNNCache(dev_place);
+#endif
       framework::Executor exec(dev_place);
       auto *block = Attr<framework::BlockDesc *>("sub_block");
       VLOG(3) << "Conditional block.idx = " << block->ID()
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index eb44655c88f1848e26219fd87e03e9a4e8e587ae..d8daa25f31be8d0eac90667daed40367cc037d35 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -17,6 +17,9 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/controlflow/while_op_helper.h"
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 namespace paddle {
 namespace framework {
 class InferShapeContext;
@@ -66,6 +69,12 @@ class WhileOp : public framework::OperatorBase {
             "the Condition's shape is ",
             cond.dims().to_str(), ".\n"));
 
+#ifdef PADDLE_WITH_MKLDNN
+    // (jczaja) Executor on being destroyed clears oneDNN cache and
+    // resets registered model data layout. This is unwanted for nested
+    // Executors (executors declared inside control ops)
+    platform::DontClearMKLDNNCache(dev_place);
+#endif
     framework::Executor executor(dev_place);
     auto *block = Attr<framework::BlockDesc *>(kStepBlock);
 
diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc
index f2beb4cec212e245a7e8a825b7b660c8633b2acb..9de5bc6ea36368e51f52916f50443a6a732c5b68 100644
--- a/paddle/fluid/operators/crop_op.cc
+++ b/paddle/fluid/operators/crop_op.cc
@@ -97,7 +97,7 @@ Crop Operator.
 Crop input into output, as specified by offsets and shape.
 
 There are two ways to set the offsets:
-1. In runtime: Using the input 'Offsets', which is a Vairbale and can be 
+1. In runtime: Using the input 'Offsets', which is a Variable and can be 
                output of other operators. This way is suitable for 
                dynamic offsets.
 2. In network configuration: Using the attribute 'offsets', which will be 
diff --git a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake
index 3ebfbdc50caab4cbe4f026f1ecdba2da6e5ce44d..8bad3e86b2934e51d763246ce1d4e1330da6e54a 100644
--- a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake
+++ b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake
@@ -1 +1 @@
-cc_test(test_mkldnn_op_nhwc SRCS mkldnn/test_mkldnn_op_nhwc.cc DEPS op_registry pool_op shape_op activation_op pooling transpose_op scope device_context enforce executor)
+cc_test(test_mkldnn_op_nhwc SRCS mkldnn/test_mkldnn_op_nhwc.cc DEPS op_registry pool_op shape_op crop_op activation_op pooling transpose_op scope device_context enforce executor)
diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc
index 4ff93ee3cd6241a5fe4b2190575472974845a4ac..b9866ba8c364794f67935c2855ae2b51c4b443b5 100644
--- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc
+++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc
@@ -34,6 +34,8 @@ USE_OP_ITSELF(transpose);
 USE_OP_DEVICE_KERNEL(transpose, MKLDNN);
 USE_OP_ITSELF(shape);
 USE_OP_DEVICE_KERNEL(shape, MKLDNN);
+USE_OP_ITSELF(crop);
+USE_OP_DEVICE_KERNEL(crop, CPU);
 
 PD_DECLARE_KERNEL(pool2d, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT);
@@ -211,5 +213,68 @@ TEST(test_pool2d_shape_nhwc, cpu_place) {
                         "Computed shape does not match expected shape"));
 }
 
+TEST(test_pool2d_crop_nhwc, cpu_place) {
+  framework::DDim dims({1, 4, 8, 512});           // NHWC shape
+  framework::DDim expected_dims({1, 3, 7, 512});  // NHWC expected shape
+  platform::CPUPlace p;
+  framework::Scope scope;
+
+  InputVars input_name = {"x",
+                          scope.Var("x")->GetMutable<framework::LoDTensor>()};
+  InputVars second_crop_input_name = {
+      "v", scope.Var("v")->GetMutable<framework::LoDTensor>()};
+  // Fill the input tensor with random values drawn from [10, 20)
+  std::uniform_real_distribution<float> dist(10.0f, 20.0f);
+  std::mt19937 engine;
+  size_t numel = static_cast<size_t>(phi::product(dims));
+  input_name.tensor->Resize(dims);
+  auto data_ptr = input_name.tensor->mutable_data<float>(p);
+  for (size_t i = 0; i < numel; ++i) {
+    data_ptr[i] = dist(engine);
+  }
+  // The second crop input (Y) gets no data buffer, only a shape and a oneDNN
+  // memory descriptor, so its dims are given in NCHW order
+  auto expected_dims_nchw = phi::vectorize<int64_t>(expected_dims);
+  std::rotate(expected_dims_nchw.begin() + 1, expected_dims_nchw.end() - 1,
+              expected_dims_nchw.end());
+  second_crop_input_name.tensor->Resize(phi::make_ddim(expected_dims_nchw));
+  const auto second_crop_input_md =
+      dnnl::memory::desc(expected_dims_nchw, dnnl::memory::data_type::f32,
+                         dnnl::memory::format_tag::nhwc);
+  second_crop_input_name.tensor->set_mem_desc(second_crop_input_md);
+
+  scope.Var("y")->GetMutable<framework::LoDTensor>();
+  auto *z = scope.Var("z")->GetMutable<framework::LoDTensor>();
+
+  auto &pool = platform::DeviceContextPool::Instance();
+
+  // Run pool2d followed by crop. The Y input of crop has no data buffer, so
+  // the path exercised is a oneDNN kernel (pool2d) followed by a CPU kernel
+  // (crop) that takes a non-buffered input
+
+  auto ksize = std::vector<int>(2, 2);
+  auto op_pool = framework::OpRegistry::CreateOp(
+      "pool2d", {{"X", {"x"}}}, {{"Out", {"y"}}},
+      {{"pooling_type", {std::string("max")}},
+       {"ksize", {ksize}},
+       {"data_format", {std::string("NHWC")}},
+       {"use_mkldnn", {true}}});
+
+  std::vector<int> offsets{0, 0, 0, 0};
+  auto op_crop = framework::OpRegistry::CreateOp(
+      "crop", {{"X", {"y"}}, {"Y", {"v"}}}, {{"Out", {"z"}}},
+      {{"offsets", {offsets}}});
+
+  op_pool->Run(scope, p);
+  op_crop->Run(scope, p);
+
+  pool.Get(p)->Wait();
+
+  // Verify that the crop output has the expected NHWC shape
+  PADDLE_ENFORCE_EQ(z->dims(), expected_dims,
+                    platform::errors::InvalidArgument(
+                        "Output shape does not match expected output shape"));
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 0bf5ca7f8f52572943c711eb6b960aedb7b13acd..09a29c3429cba5ce71da7138097bded7572913f5 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -750,7 +750,7 @@ dnnl::stream& MKLDNNDeviceContextThreadLocals::Body::get_stream(void) {
 void MKLDNNDeviceContext::ResetBlobMap(void* ptr) {
   VLOG(4) << tls().get_curr_exec() << " " << ptr;
   std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
-  if (!block_next_cache_clearing_) {
+  if (block_next_cache_clearing_ == 0) {
     VLOG(3) << "Clearing DNNL cache.";
     // If no specific executor pointer then clear
     // everything. For executor pointer then clear only
@@ -768,9 +768,20 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) {
         s.second->erase(ptr);
       }
     }
+    // Reset paddle layout to NCHW
+    VLOG(3) << "Resetting Paddle data layout to NCHW.";
+    platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout(
+        paddle::framework::DataLayout::kNCHW);
   } else {
-    VLOG(3) << "Prevented Clearing DNNL cache.";
-    block_next_cache_clearing_ = false;
+    --block_next_cache_clearing_;
+    VLOG(3) << "Prevented Clearing DNNL cache. Updated "
+               "block_next_cache_clearing_ : "
+            << block_next_cache_clearing_;
+    PADDLE_ENFORCE_GE(block_next_cache_clearing_, 0,
+                      platform::errors::InvalidArgument(
+                          "Cache clearing mark should be non-negative "
+                          ". But received %d.",
+                          block_next_cache_clearing_));
   }
 }
 
@@ -796,8 +807,10 @@ void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t<KeyBlob> pblob,
 
 void MKLDNNDeviceContext::BlockNextCacheClearing() {
   std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
-  VLOG(3) << "Next DNNL cache clearing has been blocked.";
-  block_next_cache_clearing_ = true;
+  ++block_next_cache_clearing_;
+  VLOG(3) << "Next DNNL cache clearing has been blocked. Updated "
+             "block_next_cache_clearing_ : "
+          << block_next_cache_clearing_;
 }
 
 size_t MKLDNNDeviceContext::GetShapeBlobSize() const {
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 2b53ecf86a641f3579f6ff6f6c78f6c2c55d83da..a63d41405f1b2dac733d419e954d74ba35f63252 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -850,7 +850,8 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
   // to erase
   std::shared_ptr<ExecShape> p_exec_items_;
   std::shared_ptr<std::mutex> p_mutex_;
-  bool block_next_cache_clearing_ = false;
+  // Number of cache clearings to skip: 0 - clear, x > 0 - do not clear.
+  unsigned int block_next_cache_clearing_ = 0;
 };
 #endif
 
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 94c0124440ea9e671b0a9b2e17bdccd6923b20f5..5e77046962931302f06080a5dd1275d7fd2b0d7a 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -148,8 +148,6 @@ inline void ClearMKLDNNCache(const platform::Place& place,
     platform::MKLDNNDeviceContext* dev_ctx =
         (platform::MKLDNNDeviceContext*)pool.Get(place);
     dev_ctx->ResetBlobMap(ptr);
-    platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout(
-        paddle::framework::DataLayout::kNCHW);
   }
 }