diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc
index 5d6488c67e0db440c8d4609736523643dd666dcc..73ffaae6a57806d115df65e67d7012f9598e08d1
--- a/paddle/fluid/operators/fake_dequantize_op.cc
+++ b/paddle/fluid/operators/fake_dequantize_op.cc
@@ -76,6 +76,70 @@ $$Out = \frac{scale*X}{ max_range }$$
   }
 };
 
+class FakeChannelWiseDequantizeMaxAbsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("X"),
+        "Input(X) of FakeChannelWiseDequantizeMaxAbsOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("WeightScales"),
+                   "Input(WeightScales) of FakeChannelWiseDequantizeMaxAbsOp "
+                   "should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Output(Out) of FakeChannelWiseDequantizeMaxAbsOp should not be null.");
+
+    ctx->ShareDim("X", /*->*/ "Out");
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class FakeChannelWiseDequantizeMaxAbsOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor) The input with float-32/64 type is the "
+             "low precision tensor.");
+    AddInput("ActivationScale",
+             "(float) The activation scale in quantization stage.")
+        .AsDispensable();
+    AddInput("WeightScales",
+             "(float array) The weight scales in quantization stage.");
+    AddOutput("Out",
+              "(Tensor) The output is the dequantized high "
+              "precision tensor.");
+    AddAttr<int>("activation_bits", "Quantization bit number for activation.")
+        .SetDefault(8)
+        .AddCustomChecker([](const int& bit_length) {
+          PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
+                         "'activation_bits' should be between 1 and 16.");
+        });
+    AddAttr<int>("weight_bits", "Quantization bit number for weights.")
+        .SetDefault(8)
+        .AddCustomChecker([](const int& bit_length) {
+          PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
+                         "'weight_bits' should be between 1 and 16.");
+        });
+
+    AddComment(R"DOC(
+FakeChannelWiseDequantizeMaxAbsOp operator.
+
+This calculation is an opposite operation of FakeChannelWiseQuantizeMaxAbsOp:
+
+$$Out_c = \frac{ActivationScale*WeightScale_c*X_c}{(2^{weight\_bits-1}-1)*(2^{activation\_bits-1}-1)}$$
+
+In the above formula, the range value of c is as follows:
+$$0 \leq c \lt \ the\ channel\ number\ of\ X$$
+
+Notes: The per-channel quantization is only applied to weights (one scale per
+channel), while the activations use per-layer quantization (only one scale).
+)DOC");
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -88,3 +152,11 @@ REGISTER_OPERATOR(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsOp,
 REGISTER_OP_CPU_KERNEL(fake_dequantize_max_abs,
                        ops::FakeDequantizeMaxAbsKernel<CPU, float>,
                        ops::FakeDequantizeMaxAbsKernel<CPU, double>);
+
+REGISTER_OPERATOR(fake_channel_wise_dequantize_max_abs,
+                  ops::FakeChannelWiseDequantizeMaxAbsOp,
+                  ops::FakeChannelWiseDequantizeMaxAbsOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(fake_channel_wise_dequantize_max_abs,
+                       ops::FakeChannelWiseDequantizeMaxAbsKernel<CPU, float>,
+                       ops::FakeChannelWiseDequantizeMaxAbsKernel<CPU, double>);
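For readers tracing the DOC formula above, the per-channel dequantization can be written out as a minimal NumPy sketch. This is not part of the patch; the function name, defaults, and the assumption that the channel axis is axis 0 are illustrative:

    import numpy as np

    def channel_wise_dequant(x, activation_scale, weight_scales,
                             weight_bits=8, activation_bits=8):
        # Out_c = ActivationScale * WeightScale_c * X_c / max_range, where
        # max_range = (2^(weight_bits-1) - 1) * (2^(activation_bits-1) - 1).
        max_range = ((2 ** (weight_bits - 1) - 1) *
                     (2 ** (activation_bits - 1) - 1))
        out = np.empty_like(x, dtype=np.float32)
        for c in range(x.shape[0]):
            out[c] = activation_scale * weight_scales[c] * x[c] / max_range
        return out
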
+)DOC"); + } +}; + } // namespace operators } // namespace paddle @@ -88,3 +152,11 @@ REGISTER_OPERATOR(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsOp, REGISTER_OP_CPU_KERNEL(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsKernel, ops::FakeDequantizeMaxAbsKernel); + +REGISTER_OPERATOR(fake_channel_wise_dequantize_max_abs, + ops::FakeChannelWiseDequantizeMaxAbsOp, + ops::FakeChannelWiseDequantizeMaxAbsOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(fake_channel_wise_dequantize_max_abs, + ops::FakeChannelWiseDequantizeMaxAbsKernel, + ops::FakeChannelWiseDequantizeMaxAbsKernel); diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu index 225bcc45bc65bc9268d1e866a4358731eaf0c3ef..35dcc69279d0119e75c4c5072e7817c839b9e819 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu +++ b/paddle/fluid/operators/fake_dequantize_op.cu @@ -55,3 +55,7 @@ using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsKernel, ops::FakeDequantizeMaxAbsKernel); +REGISTER_OP_CUDA_KERNEL( + fake_channel_wise_dequantize_max_abs, + ops::FakeChannelWiseDequantizeMaxAbsKernel, + ops::FakeChannelWiseDequantizeMaxAbsKernel); diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h index d9923a10daa01ca06ebabb27cf9285b0628634bc..c26dfa8332f7f945ccf42387ecea88d860168e15 100644 --- a/paddle/fluid/operators/fake_dequantize_op.h +++ b/paddle/fluid/operators/fake_dequantize_op.h @@ -45,5 +45,56 @@ class FakeDequantizeMaxAbsKernel : public framework::OpKernel { } }; +template +class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& ctx) const { + auto* in = ctx.Input("X"); + auto* weight_scales = ctx.Input("WeightScales"); + auto* out = ctx.Output("Out"); + + PADDLE_ENFORCE_EQ(weight_scales->numel(), in->dims()[0], + "The weight uses the per-channel quantization type, so " + "the number of weight scale values must be the same with " + "first dimension value of Input(X)."); + + int ativation_bits = ctx.Attr("activation_bits"); + int weight_bits = ctx.Attr("weight_bits"); + int range = std::pow(2, weight_bits - 1) - 1; + + auto& dev_ctx = ctx.template device_context(); + out->mutable_data(dev_ctx.GetPlace()); + + auto dequant = DequantizeFunctor(); + if (ctx.HasInput("ActivationScale")) { + auto* activation_scale = ctx.Input("ActivationScale"); + PADDLE_ENFORCE_EQ(activation_scale->numel(), 1, + "The activation uses per-layer quantization type, so " + "it must have only one value."); + framework::Tensor cpu_weigth_scales; + framework::TensorCopy(*weight_scales, platform::CPUPlace(), + &cpu_weigth_scales); + dev_ctx.Wait(); + const T* weight_scales_data = cpu_weigth_scales.data(); + range *= (std::pow(2, ativation_bits - 1) - 1); + for (int64_t i = 0; i < in->dims()[0]; i++) { + framework::Tensor one_channel_in = in->Slice(i, i + 1); + framework::Tensor one_channel_out = out->Slice(i, i + 1); + auto max_range = range / weight_scales_data[i]; + dequant(dev_ctx, &one_channel_in, activation_scale, + static_cast(max_range), &one_channel_out); + } + } else { + for (int64_t i = 0; i < in->dims()[0]; i++) { + framework::Tensor one_channel_in = in->Slice(i, i + 1); + framework::Tensor one_channel_out = out->Slice(i, i + 1); + framework::Tensor one_channel_scale = weight_scales->Slice(i, i + 1); + dequant(dev_ctx, &one_channel_in, &one_channel_scale, + 
diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
index c873ee6718018604ae9bba74aa616b2165ba9bc4..70186e5efa29b1324ff7f3954720276156fddaf1
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -180,11 +180,10 @@ The scale of FakeChannelWiseQuantize operator is a vector.
 In detail, each channel of the input X has a scale value.
 
 $$scale_c = max(abs(X_c))$$
-$$range = 2^{bit_length - 1} - 1$$
-$$Out_c = round(X_c / scale_c * range)$$
-
+$$range = 2^{bit\_length - 1} - 1$$
+$$Out_c = round(\frac{X_c * range} {scale_c})$$
 In above three formulas, the range value of c is as follow:
-$$0 \leq c \leq \ the\ channel\ number\ of\ X$$
+$$0 \leq c \lt \ the\ channel\ number\ of\ X$$
 )DOC");
   }
 };
diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
index 1bb4662e8d83ac0c34b209e4e7a605869fdb59d5..bd8dad4d5927a678d0500abed7213f203074a3ca
--- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
@@ -31,6 +31,77 @@ def dequantize_max_abs(x, scale, max_range):
     return y
 
 
+def channel_wise_quantize_max_abs(x, max_range):
+    scales = []
+    for i in range(x.shape[0]):
+        scales.append(np.max(np.abs(x[i])).astype("float32"))
+
+    y = x.copy()
+    for i, scale in enumerate(scales):
+        y[i] = np.round(y[i] / scale * max_range)
+    return y, scales
+
+
+def channel_wise_dequantize_max_abs(x, scales, max_range):
+    y = x.copy()
+    for i in range(x.shape[0]):
+        y[i] = (scales[i] / max_range) * y[i]
+    return y
+
+
+class TestFakeChannelWiseDequantizeMaxAbsOp(OpTest):
+    def set_args(self):
+        self.weight_bits = 8
+        self.activation_bits = 2
+        self.data_type = "float32"
+
+    def setUp(self):
+        self.set_args()
+        self.op_type = "fake_channel_wise_dequantize_max_abs"
+        x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
+        max_range = math.pow(2, self.weight_bits - 1) - 1
+        yq, scales = channel_wise_quantize_max_abs(x, max_range)
+        ydq = channel_wise_dequantize_max_abs(yq, scales, max_range)
+
+        self.inputs = {
+            'X': yq,
+            'ActivationScale': np.array(1.0).astype(self.data_type),
+            'WeightScales': np.array(scales).astype(self.data_type)
+        }
+        self.attrs = {
+            'weight_bits': self.weight_bits,
+            'activation_bits': self.activation_bits
+        }
+        self.outputs = {'Out': ydq}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestFakeChannelWiseDequantizeMaxAbsOpNoActivationScale(OpTest):
+    def set_args(self):
+        self.weight_bits = 8
+        self.data_type = "float32"
+
+    def setUp(self):
+        self.set_args()
+        self.op_type = "fake_channel_wise_dequantize_max_abs"
+        x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
+        max_range = math.pow(2, self.weight_bits - 1) - 1
+        yq, scales = channel_wise_quantize_max_abs(x, max_range)
+        ydq = channel_wise_dequantize_max_abs(yq, scales, max_range)
+
+        self.inputs = {
+            'X': yq,
+            'WeightScales': np.array(scales).astype(self.data_type)
+        }
+        self.attrs = {'weight_bits': self.weight_bits}
+        self.outputs = {'Out': ydq}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestFakeDequantizeMaxAbsOp(OpTest):
     def set_args(self):
         self.num_bits = 8
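The reference helpers in the test exercise a simple round trip: quantizing and then dequantizing should reproduce the input to within half a quantization step per channel. A quick usage sketch, assuming the helpers defined in the test above are in scope:

    import numpy as np

    x = np.random.randn(4, 3, 8, 8).astype("float32")
    max_range = 2 ** (8 - 1) - 1  # weight_bits = 8
    yq, scales = channel_wise_quantize_max_abs(x, max_range)
    ydq = channel_wise_dequantize_max_abs(yq, scales, max_range)
    # Round-to-nearest error is at most 0.5 * scale / max_range per element.
    assert np.allclose(x, ydq, atol=0.5 * max(scales) / max_range + 1e-6)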