diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake
index 7d493226821b2eff608d6cfaa0418cd319fe104e..610a692ef12c6ae6f992fff8e4e65f48f3aeb01f 100644
--- a/cmake/external/xbyak.cmake
+++ b/cmake/external/xbyak.cmake
@@ -44,7 +44,7 @@ ExternalProject_Add(
     DEPENDS ""
     PREFIX ${XBYAK_PREFIX_DIR}
     SOURCE_DIR ${XBYAK_SOURCE_DIR}
-    UPDATE_COMMAND ""
+    # UPDATE_COMMAND ""
     CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
     CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT}
 )
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 167d083f3d47f75b003107d3f873c5f111a3b337..7eb1bb1a24e2441fe72fcbffd231e267688899eb 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -261,6 +261,10 @@ void AnalysisConfig::EnableMkldnnBfloat16() {
 #ifdef PADDLE_WITH_MKLDNN
   if (platform::MayIUse(platform::cpu_isa_t::avx512_core)) {
     use_mkldnn_bfloat16_ = true;
+    LOG(INFO) << "Hardware support for BFLOAT16"
+              << (platform::MayIUse(platform::cpu_isa_t::avx512_bf16)
+                      ? " is enabled"
+                      : " is disabled. Simulation will be used");
   } else {
     LOG(INFO) << "CPU does not support BFLOAT16 calculations";
     use_mkldnn_bfloat16_ = false;
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index b6d42f1c790649bc9105ff7004dbcc2a109ec8d0..923c97350e89ea9a3de01120bb7df57766247a38 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -130,6 +130,8 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
     case avx512_mic_4ops:
       return true && MayIUse(avx512_mic) && cpu.has(Cpu::tAVX512_4FMAPS) &&
              cpu.has(Cpu::tAVX512_4VNNIW);
+    case avx512_bf16:
+      return true && cpu.has(Cpu::tAVX512_BF16);
     case isa_any:
       return true;
   }
@@ -173,6 +175,13 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
         return ((reg[1] & avx512f_mask) && (reg[1] & avx512dq_mask) &&
                 (reg[1] & avx512bw_mask) && (reg[1] & avx512vl_mask));
       }
+      // EAX = 7, ECX = 1
+      cpuid(reg, 0x00010007);
+      if (cpu_isa == avx512_bf16) {
+        // AVX512BF16: EAX Bit 5
+        int avx512bf16_mask = (1 << 5);
+        return (reg[0] & avx512bf16_mask) != 0;
+      }
     }
 #endif
     return false;
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index 3c74e6fb2acb0a15cb2e034f1bcb2871578eae01..94527149d4e0b459dee03375d56fb0a9526aa055 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -83,6 +83,7 @@ typedef enum {
   avx512_core_vnni,
   avx512_mic,
   avx512_mic_4ops,
+  avx512_bf16,
 } cpu_isa_t;  // Instruction set architecture
 
 // May I use some instruction
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 03a21b29921de17799da580e31130c5ca9134729..745bda49ecfa09d9130abbd9f92732b0b45bf01f 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -161,6 +161,17 @@ bool SupportsBfloat16() {
 #endif
 }
 
+bool SupportsBfloat16FastPerformance() {
+#ifndef PADDLE_WITH_MKLDNN
+  return false;
+#else
+  if (platform::MayIUse(platform::cpu_isa_t::avx512_bf16))
+    return true;
+  else
+    return false;
+#endif
+}
+
 bool IsCompiledWithBrpc() {
 #ifndef PADDLE_WITH_DISTRIBUTE
   return false;
@@ -1730,6 +1741,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); m.def("supports_bfloat16", SupportsBfloat16); + m.def("supports_bfloat16_fast_performance", SupportsBfloat16FastPerformance); m.def("is_compiled_with_brpc", IsCompiledWithBrpc); m.def("is_compiled_with_dist", IsCompiledWithDIST); m.def("_cuda_synchronize", [](const platform::CUDAPlace &place) {