diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index d79bba7fd2f81e484169972a4bb43cc7dbe393de..fdff8310e710ba58982ce25550dc862d32662679 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -226,6 +226,23 @@ bool SupportsBfloat16FastPerformance() {
 #endif
 }
 
+bool SupportsInt8() {
+#ifndef PADDLE_WITH_MKLDNN
+  return false;
+#else
+  return (platform::MayIUse(platform::cpu_isa_t::avx2) ||
+          platform::MayIUse(platform::cpu_isa_t::avx512f));
+#endif
+}
+
+bool SupportsVNNI() {
+#ifndef PADDLE_WITH_MKLDNN
+  return false;
+#else
+  return platform::MayIUse(platform::cpu_isa_t::avx512_core_vnni);
+#endif
+}
+
 // According to the input `place` and `dtype`, this function returns a tuple
 // consists of three sets:
 // 1) All operators registered in the Paddle framework.
@@ -2121,6 +2138,8 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("_is_compiled_with_heterps", IsCompiledWithHETERPS);
   m.def("supports_bfloat16", SupportsBfloat16);
   m.def("supports_bfloat16_fast_performance", SupportsBfloat16FastPerformance);
+  m.def("supports_int8", SupportsInt8);
+  m.def("supports_vnni", SupportsVNNI);
   m.def("op_supported_infos", OpSupportedInfos);
   m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
   m.def("is_compiled_with_dist", IsCompiledWithDIST);
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
index 2cfb6146f3f55d1b939d3a5d3e6b141a517524e1..7508ecbb2946d2efb393bd1c984df5269a96a34e 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
@@ -23,13 +23,12 @@ from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, Te
 
 
 def conv2d_forward_refer(input, filter, group, conv_param):
-    out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group,
-                                                          conv_param)
+    out, _, _, _, _ = conv2d_forward_naive(input, filter, group, conv_param)
     return out
 
 
-@unittest.skipIf(not core.supports_bfloat16(),
-                 "place does not support BF16 evaluation")
+@unittest.skipIf(not core.supports_int8(),
+                 "place does not support int8 computation")
 class TestConv2DInt8Op(TestConv2DOp):
     def setUp(self):
         self.op_type = "conv2d"
@@ -53,73 +52,61 @@ class TestConv2DInt8Op(TestConv2DOp):
             'pad': self.pad,
             'dilation': self.dilations
         }
-
+        # This implementation of convolution quantization is based on OneDNN documentation
+        # https://oneapi-src.github.io/oneDNN/dev_guide_int8_computations.html#doxid-dev-guide-int8-computations-1dg-i8-comp-s11
+        scale_output_shift = (self.scale_out /
+                              (self.scale_in * self.scale_weights[0]))
         filter = np.random.random(self.filter_size).astype(self.weighttype)
-        if self.srctype == np.uint8:
-            input = np.random.randint(0, 10,
+
+        # When the Intel AVX2 or Intel AVX512 Instruction Set is used
+        # the reorder additionally scales the weights by 0.5
+        # to overcome the potential overflow issue. If the processor supports VNNI instructions,
+        # modification of the weights is not necessary.
+        avx_scale = 0.5 if not core.supports_vnni(
+        ) and self.srctype == np.int8 else 1.
+        filter_int = np.round(filter * self.scale_weights[0] *
+                              avx_scale).astype(np.int32)
+        scale_output_shift = scale_output_shift / avx_scale
+
+        def conv2d_forward_refer_helper(input_):
+            return conv2d_forward_refer(
+                input_.astype(np.int32), filter_int, self.groups,
+                conv2d_param).astype(np.float32) * scale_output_shift
+
+        def residual_helper(init_low, init_high, output_):
+            input_residual_ = np.random.randint(
+                init_low, init_high,
+                self.input_residual_size).astype(self.srctype)
+            return (output_ + input_residual_ *
+                    (self.scale_out / self.scale_in_eltwise)), input_residual_
+
+        if self.srctype == np.int8:
+            init_low, init_high = (-5, 5)
+            input = np.random.randint(init_low, init_high,
                                       self.input_size).astype(self.srctype)
+            input_shift = (np.ones(self.input_size) * 128).astype(np.uint8)
+
+            output1 = conv2d_forward_refer_helper(
+                np.round(input + input_shift).astype(np.int32))
+            output2 = conv2d_forward_refer_helper(
+                np.round(input_shift).astype(np.int32))
+            output = output1 - output2
         else:
-            input = np.random.randint(-5, 5,
+            init_low, init_high = (0, 10)
+            input = np.random.randint(init_low, init_high,
                                       self.input_size).astype(self.srctype)
-            input_shift = (np.ones(self.input_size) * 128).astype(np.uint8)
+            output = conv2d_forward_refer_helper(input)
 
-        if self.srctype == np.int8:
-            filter_int = np.round(filter * self.scale_weights[0] *
-                                  0.5).astype(np.int32)
-            scale_output_shift = self.scale_out / (self.scale_in *
-                                                   self.scale_weights[0] * 0.5)
-            output1 = conv2d_forward_refer(
-                np.round((input.astype(np.int32) + input_shift) *
-                         self.scale_in).astype(np.int32), filter_int,
-                self.groups,
-                conv2d_param).astype(np.float32) * scale_output_shift
-            output2 = conv2d_forward_refer(
-                np.round((input_shift) * self.scale_in).astype(np.int32),
-                filter_int, self.groups,
-                conv2d_param).astype(np.float32) * scale_output_shift
-            if self.fuse_residual:
-                input_residual = np.random.randint(
-                    -5, 5, self.input_residual_size).astype(self.srctype)
-                output_tmp = np.round(output1 - output2 + input_residual.astype(
-                    self.srctype) * (self.scale_out / self.scale_in_eltwise))
-                if self.fuse_activation == "relu":
-                    output = np.maximum(output_tmp, 0).astype(self.dsttype)
-                else:
-                    output = output_tmp.astype(self.dsttype)
-            else:
-                if self.fuse_activation == "relu":
-                    output = np.maximum(np.round(output1 - output2),
-                                        0).astype(self.dsttype)
-                else:
-                    output = np.round(output1 - output2).astype(self.dsttype)
+        if self.fuse_residual:
+            output, input_residual = residual_helper(init_low, init_high,
+                                                     output)
 
-        else:
-            filter_int = np.round(filter *
-                                  self.scale_weights[0]).astype(np.int32)
-            scale_output_shift = self.scale_out / (self.scale_in *
-                                                   self.scale_weights[0])
-            output1 = conv2d_forward_refer(
-                input.astype(np.int32), filter_int, self.groups,
-                conv2d_param).astype(np.float32)
-            output1_tmp = np.round(output1 * (
-                self.scale_out / (self.scale_in * self.scale_weights[0])))
-
-            if self.fuse_residual:
-                input_residual = np.random.randint(
-                    0, 10, self.input_residual_size).astype(self.srctype)
-                output_tmp_res = np.round(output1 * (self.scale_out / (
-                    self.scale_in * self.scale_weights[
-                        0])) + input_residual.astype(np.int32) * (
-                            self.scale_out / self.scale_in_eltwise))
-                if self.fuse_activation == "relu":
-                    output = np.maximum(output_tmp_res, 0).astype(self.dsttype)
-                else:
-                    output = output_tmp_res.astype(self.dsttype)
-            else:
-                if self.fuse_activation == "relu":
-                    output = np.maximum(output1_tmp, 0).astype(self.dsttype)
-                else:
-                    output = output1_tmp.astype(self.dsttype)
+        output = np.round(output)
+
+        if self.fuse_activation == "relu":
+            output = np.maximum(output, 0)
+
+        output = output.astype(self.dsttype)
 
         self.inputs = {
             'Input':
@@ -169,7 +156,7 @@ class TestConv2DInt8Op(TestConv2DOp):
         f_c = self.input_size[1] // self.groups
         self.input_residual_size = [1, 2, 3, 3]
         self.filter_size = [2, f_c, 3, 3]
-        self.scale_in = 1.0
+        self.scale_in = 0.95
         self.scale_out = 0.5
         self.scale_weights = [10.0]
         self.scale_in_eltwise = 0.6
@@ -185,7 +172,7 @@ class TestConv2DInt8Op(TestConv2DOp):
         self.fuse_residual = True
 
 
-#--------------------test conv2d u8 in and u8 out with residual fuse--------------------
+# --------------------test conv2d u8 in and u8 out with residual fuse--------------------
 
 
 class TestConv2D(TestConv2DInt8Op):
@@ -197,7 +184,7 @@ class TestConv2D(TestConv2DInt8Op):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
-        self.scale_in = 1.0
+        self.scale_in = 0.95
         self.scale_out = 0.5
         self.scale_weights = [10.0]
         self.scale_in_eltwise = 0.6
@@ -224,7 +211,7 @@ class TestWithStride(TestConv2DInt8Op):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
-        self.scale_in = 1.0
+        self.scale_in = 0.95
         self.scale_out = 0.8
         self.scale_weights = [10.0]
         self.scale_in_eltwise = 0.5
@@ -240,7 +227,7 @@ class TestWithDilations(TestConv2DInt8Op):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
-        self.scale_in = 1.0
+        self.scale_in = 0.95
         self.scale_out = 0.8
         self.scale_weights = [10.0]
         self.scale_in_eltwise = 0.5
@@ -255,7 +242,7 @@ class TestWith1x1(TestConv2DInt8Op):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 1, 1]
-        self.scale_in = 1.0
+        self.scale_in = 0.95
         self.scale_out = 0.5
         self.scale_weights = [12.0]
         self.scale_in_eltwise = 0.5
@@ -270,7 +257,7 @@ class TestWithInput1x1Filter1x1(TestConv2DInt8Op):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 1, 1]
-        self.scale_in = 1.0
+        self.scale_in = 0.95
         self.scale_out = 0.5
         self.scale_weights = [10.0]
         self.scale_in_eltwise = 0.8
@@ -290,32 +277,32 @@ def init_data_type_with_fusion(self, input_dt, fuse_activation, fuse_residual):
 
 
 def create_test_int8_class(parent):
-    #--------------------test conv2d s8 in and u8 out--------------------
+    # --------------------test conv2d s8 in and u8 out--------------------
     class TestS8U8Case(parent):
         def init_data_type(self):
             init_data_type_with_fusion(self, np.int8, "relu", False)
 
-    #--------------------test conv2d s8 in and s8 out--------------------
+    # --------------------test conv2d s8 in and s8 out--------------------
     class TestS8S8Case(parent):
         def init_data_type(self):
             init_data_type_with_fusion(self, np.int8, "", False)
 
-    #--------------------test conv2d u8 in and s8 out--------------------
+    # --------------------test conv2d u8 in and s8 out--------------------
     class TestU8S8Case(parent):
         def init_data_type(self):
             init_data_type_with_fusion(self, np.uint8, "", False)
 
-    #--------------------test conv2d u8 in and u8 out without residual fuse--------------------
+    # --------------------test conv2d u8 in and u8 out without residual fuse--------------------
     class TestU8U8Case(parent):
         def init_data_type(self):
             init_data_type_with_fusion(self, np.uint8, "relu", False)
 
-    #--------------------test conv2d s8 in and s8 out with residual fuse--------------------
+    # --------------------test conv2d s8 in and s8 out with residual fuse--------------------
     class TestS8S8ResCase(parent):
         def init_data_type(self):
             init_data_type_with_fusion(self, np.int8, "", True)
 
-    #--------------------test conv2d u8 in and s8 out with residual fuse--------------------
+    # --------------------test conv2d u8 in and s8 out with residual fuse--------------------
     class TestU8S8ResCase(parent):
         def init_data_type(self):
             init_data_type_with_fusion(self, np.uint8, "", True)
@@ -333,9 +320,9 @@ def create_test_int8_class(parent):
     TestS8S8Case.__name__ = cls_name_s8s8
     TestU8S8Case.__name__ = cls_name_u8s8
     TestU8U8Case.__name__ = cls_name_u8u8
-
     TestS8S8ResCase.__name__ = cls_name_s8s8_re_1
     TestU8S8ResCase.__name__ = cls_name_u8s8_re_1
+
     globals()[cls_name_s8u8] = TestS8U8Case
     globals()[cls_name_s8s8] = TestS8S8Case
     globals()[cls_name_u8s8] = TestU8S8Case
@@ -344,7 +331,7 @@ def create_test_int8_class(parent):
     globals()[cls_name_u8s8_re_1] = TestU8S8ResCase
 
     if os.name != 'nt':
-        #--------------------test conv2d s8 in and u8 out with residual fuse--------------------
+        # --------------------test conv2d s8 in and u8 out with residual fuse--------------------
         class TestS8U8ResCase(parent):
             def init_data_type(self):
                 init_data_type_with_fusion(self, np.int8, "relu", True)
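
Note on the reference math used by the rewritten setUp above: the test folds the extra 0.5 weight scale applied by non-VNNI AVX2/AVX512 kernels back into the output scale, and it emulates s8 input through u8 arithmetic by shifting the input by 128 and subtracting the convolution of the constant shift. The standalone numpy sketch below (illustrative names only, not part of the patch; a plain dot product stands in for conv2d_forward_naive) shows the same scaling rules:

    import numpy as np

    def reference_int8_output(input_int, weights_fp32, scale_in, scale_weights,
                              scale_out, src_is_s8, has_vnni):
        # Non-VNNI AVX2/AVX512 kernels halve s8 weights during the reorder to
        # avoid intermediate overflow, so the reference halves them as well and
        # compensates in the output scale (see the OneDNN int8 guide linked above).
        avx_scale = 0.5 if src_is_s8 and not has_vnni else 1.0
        weights_int = np.round(weights_fp32 * scale_weights *
                               avx_scale).astype(np.int32)
        output_scale = scale_out / (scale_in * scale_weights * avx_scale)

        if src_is_s8:
            # conv(x) = conv(x + 128) - conv(128): the u8 shift trick behind
            # output1 - output2 in the test.
            shift = np.full_like(input_int, 128, dtype=np.int32)
            acc = (input_int + shift) @ weights_int - shift @ weights_int
        else:
            acc = input_int @ weights_int
        return np.round(acc.astype(np.float32) * output_scale)

    x = np.random.randint(-5, 5, size=16).astype(np.int32)  # quantized s8 activations
    w = np.random.random(16).astype(np.float32)
    y = reference_int8_output(x, w, scale_in=0.95, scale_weights=10.0,
                              scale_out=0.5, src_is_s8=True, has_vnni=False)

With has_vnni=True the avx_scale factor drops out, which matches the patch leaving the weights untouched on VNNI-capable CPUs.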