diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index d79bba7fd2f81e484169972a4bb43cc7dbe393de..fdff8310e710ba58982ce25550dc862d32662679 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -226,6 +226,23 @@ bool SupportsBfloat16FastPerformance() {
 #endif
 }
 
+bool SupportsInt8() {
+#ifndef PADDLE_WITH_MKLDNN
+  return false;
+#else
+  return (platform::MayIUse(platform::cpu_isa_t::avx2) ||
+          platform::MayIUse(platform::cpu_isa_t::avx512f));
+#endif
+}
+
+bool SupportsVNNI() {
+#ifndef PADDLE_WITH_MKLDNN
+  return false;
+#else
+  return platform::MayIUse(platform::cpu_isa_t::avx512_core_vnni);
+#endif
+}
+
 // According to the input `place` and `dtype`, this function returns a tuple
 // consists of three sets:
 // 1) All operators registered in the Paddle framework.
@@ -2121,6 +2138,8 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("_is_compiled_with_heterps", IsCompiledWithHETERPS);
   m.def("supports_bfloat16", SupportsBfloat16);
   m.def("supports_bfloat16_fast_performance", SupportsBfloat16FastPerformance);
+  m.def("supports_int8", SupportsInt8);
+  m.def("supports_vnni", SupportsVNNI);
   m.def("op_supported_infos", OpSupportedInfos);
   m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
   m.def("is_compiled_with_dist", IsCompiledWithDIST);
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
index 2cfb6146f3f55d1b939d3a5d3e6b141a517524e1..7508ecbb2946d2efb393bd1c984df5269a96a34e 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
@@ -23,13 +23,12 @@ from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, Te
 
 
 def conv2d_forward_refer(input, filter, group, conv_param):
-    out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group,
-                                                          conv_param)
+    out, _, _, _, _ = conv2d_forward_naive(input, filter, group, conv_param)
     return out
 
 
-@unittest.skipIf(not core.supports_bfloat16(),
-                 "place does not support BF16 evaluation")
+@unittest.skipIf(not core.supports_int8(),
+                 "place does not support int8 computation")
 class TestConv2DInt8Op(TestConv2DOp):
     def setUp(self):
         self.op_type = "conv2d"
@@ -53,73 +52,61 @@ class TestConv2DInt8Op(TestConv2DOp):
             'pad': self.pad,
             'dilation': self.dilations
         }
-
+        # This implementation of convolution quantization is based on OneDNN documentation
+        # https://oneapi-src.github.io/oneDNN/dev_guide_int8_computations.html#doxid-dev-guide-int8-computations-1dg-i8-comp-s11
+        scale_output_shift = (self.scale_out /
+                              (self.scale_in * self.scale_weights[0]))
         filter = np.random.random(self.filter_size).astype(self.weighttype)
-        if self.srctype == np.uint8:
-            input = np.random.randint(0, 10,
+
+        # When the Intel AVX2 or Intel AVX512 Instruction Set is used
+        # the reorder additionally scales the weights by 0.5
+        # to overcome the potential overflow issue. If the processor supports VNNI instructions,
+        # modification of the weights is not necessary.
+        avx_scale = 0.5 if not core.supports_vnni(
+        ) and self.srctype == np.int8 else 1.
+        filter_int = np.round(filter * self.scale_weights[0] *
+                              avx_scale).astype(np.int32)
+        scale_output_shift = scale_output_shift / avx_scale
+
+        def conv2d_forward_refer_helper(input_):
+            return conv2d_forward_refer(
+                input_.astype(np.int32), filter_int, self.groups,
+                conv2d_param).astype(np.float32) * scale_output_shift
+
+        def residual_helper(init_low, init_high, output_):
+            input_residual_ = np.random.randint(
+                init_low, init_high,
+                self.input_residual_size).astype(self.srctype)
+            return (output_ + input_residual_ *
+                    (self.scale_out / self.scale_in_eltwise)), input_residual_
+
+        if self.srctype == np.int8:
+            init_low, init_high = (-5, 5)
+            input = np.random.randint(init_low, init_high,
                                       self.input_size).astype(self.srctype)
+            input_shift = (np.ones(self.input_size) * 128).astype(np.uint8)
+
+            output1 = conv2d_forward_refer_helper(
+                np.round(input + input_shift).astype(np.int32))
+            output2 = conv2d_forward_refer_helper(
+                np.round(input_shift).astype(np.int32))
+            output = output1 - output2
         else:
-            input = np.random.randint(-5, 5,
+            init_low, init_high = (0, 10)
+            input = np.random.randint(init_low, init_high,
                                       self.input_size).astype(self.srctype)
-            input_shift = (np.ones(self.input_size) * 128).astype(np.uint8)
+            output = conv2d_forward_refer_helper(input)
 
-        if self.srctype == np.int8:
-            filter_int = np.round(filter * self.scale_weights[0] *
-                                  0.5).astype(np.int32)
-            scale_output_shift = self.scale_out / (self.scale_in *
-                                                   self.scale_weights[0] * 0.5)
-            output1 = conv2d_forward_refer(
-                np.round((input.astype(np.int32) + input_shift) *
-                         self.scale_in).astype(np.int32), filter_int,
-                self.groups,
-                conv2d_param).astype(np.float32) * scale_output_shift
-            output2 = conv2d_forward_refer(
-                np.round((input_shift) * self.scale_in).astype(np.int32),
-                filter_int, self.groups,
-                conv2d_param).astype(np.float32) * scale_output_shift
-            if self.fuse_residual:
-                input_residual = np.random.randint(
-                    -5, 5, self.input_residual_size).astype(self.srctype)
-                output_tmp = np.round(output1 - output2 + input_residual.astype(
-                    self.srctype) * (self.scale_out / self.scale_in_eltwise))
-                if self.fuse_activation == "relu":
-                    output = np.maximum(output_tmp, 0).astype(self.dsttype)
-                else:
-                    output = output_tmp.astype(self.dsttype)
-            else:
-                if self.fuse_activation == "relu":
-                    output = np.maximum(np.round(output1 - output2),
-                                        0).astype(self.dsttype)
-                else:
-                    output = np.round(output1 - output2).astype(self.dsttype)
+        if self.fuse_residual:
+            output, input_residual = residual_helper(init_low, init_high,
+                                                     output)
 
-        else:
-            filter_int = np.round(filter *
-                                  self.scale_weights[0]).astype(np.int32)
-            scale_output_shift = self.scale_out / (self.scale_in *
-                                                   self.scale_weights[0])
-            output1 = conv2d_forward_refer(
-                input.astype(np.int32), filter_int, self.groups,
-                conv2d_param).astype(np.float32)
-            output1_tmp = np.round(output1 * (
-                self.scale_out / (self.scale_in * self.scale_weights[0])))
-
-            if self.fuse_residual:
-                input_residual = np.random.randint(
-                    0, 10, self.input_residual_size).astype(self.srctype)
-                output_tmp_res = np.round(output1 * (self.scale_out / (
-                    self.scale_in * self.scale_weights[
-                        0])) + input_residual.astype(np.int32) * (
-                            self.scale_out / self.scale_in_eltwise))
-                if self.fuse_activation == "relu":
-                    output = np.maximum(output_tmp_res, 0).astype(self.dsttype)
-                else:
-                    output = output_tmp_res.astype(self.dsttype)
-            else:
-                if self.fuse_activation == "relu":
-                    output = np.maximum(output1_tmp, 0).astype(self.dsttype)
-                else:
-                    output = output1_tmp.astype(self.dsttype)
+        output = np.round(output)
+
+        if self.fuse_activation == "relu":
+            output = np.maximum(output, 0)
+
+        output = output.astype(self.dsttype)
 
         self.inputs = {
             'Input':
@@ -169,7 +156,7 @@ class TestConv2DInt8Op(TestConv2DOp):
         f_c = self.input_size[1] // self.groups
         self.input_residual_size = [1, 2, 3, 3]
         self.filter_size = [2, f_c, 3, 3]
-        self.scale_in = 1.0
+        self.scale_in = 0.95
         self.scale_out = 0.5
         self.scale_weights = [10.0]
         self.scale_in_eltwise = 0.6
@@ -185,7 +172,7 @@ class TestConv2DInt8Op(TestConv2DOp):
         self.fuse_residual = True
 
 
-#--------------------test conv2d u8 in and u8 out with residual fuse--------------------
+# --------------------test conv2d u8 in and u8 out with residual fuse--------------------
 
 
 class TestConv2D(TestConv2DInt8Op):
@@ -197,7 +184,7 @@ class TestConv2D(TestConv2DInt8Op):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
-        self.scale_in = 1.0
+        self.scale_in = 0.95
         self.scale_out = 0.5
         self.scale_weights = [10.0]
         self.scale_in_eltwise = 0.6
@@ -224,7 +211,7 @@ class TestWithStride(TestConv2DInt8Op):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
-        self.scale_in = 1.0
+        self.scale_in = 0.95
         self.scale_out = 0.8
         self.scale_weights = [10.0]
         self.scale_in_eltwise = 0.5
@@ -240,7 +227,7 @@ class TestWithDilations(TestConv2DInt8Op):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
-        self.scale_in = 1.0
+        self.scale_in = 0.95
         self.scale_out = 0.8
         self.scale_weights = [10.0]
         self.scale_in_eltwise = 0.5
@@ -255,7 +242,7 @@ class TestWith1x1(TestConv2DInt8Op):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 1, 1]
-        self.scale_in = 1.0
+        self.scale_in = 0.95
         self.scale_out = 0.5
         self.scale_weights = [12.0]
         self.scale_in_eltwise = 0.5
@@ -270,7 +257,7 @@ class TestWithInput1x1Filter1x1(TestConv2DInt8Op):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 1, 1]
-        self.scale_in = 1.0
+        self.scale_in = 0.95
         self.scale_out = 0.5
         self.scale_weights = [10.0]
         self.scale_in_eltwise = 0.8
@@ -290,32 +277,32 @@ def init_data_type_with_fusion(self, input_dt, fuse_activation, fuse_residual):
 
 
 def create_test_int8_class(parent):
-    #--------------------test conv2d s8 in and u8 out--------------------
+    # --------------------test conv2d s8 in and u8 out--------------------
     class TestS8U8Case(parent):
         def init_data_type(self):
             init_data_type_with_fusion(self, np.int8, "relu", False)
 
-    #--------------------test conv2d s8 in and s8 out--------------------
+    # --------------------test conv2d s8 in and s8 out--------------------
     class TestS8S8Case(parent):
         def init_data_type(self):
             init_data_type_with_fusion(self, np.int8, "", False)
 
-    #--------------------test conv2d u8 in and s8 out--------------------
+    # --------------------test conv2d u8 in and s8 out--------------------
     class TestU8S8Case(parent):
         def init_data_type(self):
             init_data_type_with_fusion(self, np.uint8, "", False)
 
-    #--------------------test conv2d u8 in and u8 out without residual fuse--------------------
+    # --------------------test conv2d u8 in and u8 out without residual fuse--------------------
     class TestU8U8Case(parent):
         def init_data_type(self):
             init_data_type_with_fusion(self, np.uint8, "relu", False)
 
-    #--------------------test conv2d s8 in and s8 out with residual fuse--------------------
+    # --------------------test conv2d s8 in and s8 out with residual fuse--------------------
     class TestS8S8ResCase(parent):
         def init_data_type(self):
             init_data_type_with_fusion(self, np.int8, "", True)
 
-    #--------------------test conv2d u8 in and s8 out with residual fuse--------------------
+    # --------------------test conv2d u8 in and s8 out with residual fuse--------------------
     class TestU8S8ResCase(parent):
         def init_data_type(self):
             init_data_type_with_fusion(self, np.uint8, "", True)
@@ -333,9 +320,9 @@ def create_test_int8_class(parent):
     TestS8S8Case.__name__ = cls_name_s8s8
     TestU8S8Case.__name__ = cls_name_u8s8
     TestU8U8Case.__name__ = cls_name_u8u8
-
     TestS8S8ResCase.__name__ = cls_name_s8s8_re_1
     TestU8S8ResCase.__name__ = cls_name_u8s8_re_1
+
     globals()[cls_name_s8u8] = TestS8U8Case
     globals()[cls_name_s8s8] = TestS8S8Case
     globals()[cls_name_u8s8] = TestU8S8Case
@@ -344,7 +331,7 @@ def create_test_int8_class(parent):
     globals()[cls_name_u8s8_re_1] = TestU8S8ResCase
 
     if os.name != 'nt':
-        #--------------------test conv2d s8 in and u8 out with residual fuse--------------------
+        # --------------------test conv2d s8 in and u8 out with residual fuse--------------------
         class TestS8U8ResCase(parent):
             def init_data_type(self):
                 init_data_type_with_fusion(self, np.int8, "relu", True)
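
Note on the reference math used by the rewritten setUp above: the test folds the extra 0.5 weight scale applied by non-VNNI AVX2/AVX512 kernels back into the output scale, and it emulates s8 input through u8 arithmetic by shifting the input by 128 and subtracting the convolution of the constant shift. The standalone numpy sketch below (illustrative names only, not part of the patch; a plain dot product stands in for conv2d_forward_naive) shows the same scaling rules:

    import numpy as np

    def reference_int8_output(input_int, weights_fp32, scale_in, scale_weights,
                              scale_out, src_is_s8, has_vnni):
        # Non-VNNI AVX2/AVX512 kernels halve s8 weights during the reorder to
        # avoid intermediate overflow, so the reference halves them as well and
        # compensates in the output scale (see the OneDNN int8 guide linked above).
        avx_scale = 0.5 if src_is_s8 and not has_vnni else 1.0
        weights_int = np.round(weights_fp32 * scale_weights *
                               avx_scale).astype(np.int32)
        output_scale = scale_out / (scale_in * scale_weights * avx_scale)

        if src_is_s8:
            # conv(x) = conv(x + 128) - conv(128): the u8 shift trick behind
            # output1 - output2 in the test.
            shift = np.full_like(input_int, 128, dtype=np.int32)
            acc = (input_int + shift) @ weights_int - shift @ weights_int
        else:
            acc = input_int @ weights_int
        return np.round(acc.astype(np.float32) * output_scale)

    x = np.random.randint(-5, 5, size=16).astype(np.int32)  # quantized s8 activations
    w = np.random.random(16).astype(np.float32)
    y = reference_int8_output(x, w, scale_in=0.95, scale_weights=10.0,
                              scale_out=0.5, src_is_s8=True, has_vnni=False)

With has_vnni=True the avx_scale factor drops out, which matches the patch leaving the weights untouched on VNNI-capable CPUs.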