Commit 81b4fad8 authored by 视言, committed by qingqing01

add moving average absmax op and fix bug (#15155)

* Add moving average absmax op in quantization-aware training.
Parent 92b9ce34
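For orientation, the new op keeps two one-element buffers, accum and state, and derives the quantization scale from their ratio. A minimal NumPy sketch of the update (illustrative only, not the Paddle code itself):

import numpy as np

def update_scale(x, accum, state, moving_rate=0.9):
    # Running average of the batch abs-max, as implemented by the new functor:
    # accum and state decay by moving_rate; the quantization scale is accum / state.
    accum = moving_rate * accum + np.max(np.abs(x))
    state = moving_rate * state + 1.0
    return accum / state, accum, state

With accum = state = 1 and moving_rate = 0.9 this reproduces the expected scale in TestFakeQuantizeMovingOp at the end of the diff.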
...@@ -367,7 +367,7 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array (ArgSpec(args=['self', 'init',
paddle.fluid.contrib.BeamSearchDecoder.update_array (ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None), ('document', '5754e9b3212b7c09497151516a0de5a7'))
paddle.fluid.contrib.memory_usage (ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '8fcb2f93bb743693baa8d4860a5ccc47'))
paddle.fluid.contrib.op_freq_statistic (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4d43687113c4bf5b29d15aee2f4e4afa'))
paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size', 'moving_rate'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000, 0.9)), ('document', '14b39f1fcd5667ff556b1aad94357d1d'))
paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)), ('document', '909675a1ab055c69b436a7893fcae4fd'))
paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6dd9909f10b283ba2892a99058a72884'))
......
...@@ -81,6 +81,30 @@ struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
template struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, float>;
template <typename T>
struct FindMovingAverageAbsMaxFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
const framework::Tensor& in_accum,
const framework::Tensor& in_state, const T* cur_scale,
const float rate, framework::Tensor* out_state,
framework::Tensor* out_accum, framework::Tensor* out_scale) {
T accum = in_accum.data<T>()[0];
T state = in_state.data<T>()[0];
T scale = cur_scale[0];
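// Running average of the abs-max: accum and state decay by `rate`; the new scale is accum / state.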
state = rate * state + 1;
accum = rate * accum + scale;
scale = accum / state;
out_state->mutable_data<T>(ctx.GetPlace())[0] = state;
out_accum->mutable_data<T>(ctx.GetPlace())[0] = accum;
out_scale->mutable_data<T>(ctx.GetPlace())[0] = scale;
}
};
template struct FindMovingAverageAbsMaxFunctor<platform::CPUDeviceContext,
float>;
class FakeQuantizeAbsMaxOp : public framework::OperatorWithKernel {
public:
FakeQuantizeAbsMaxOp(const std::string& type,
...@@ -255,6 +279,78 @@ $$Out = round(X/scale * range)$$
}
};
class FakeQuantizeMovingAverageAbsMaxOp : public framework::OperatorWithKernel {
public:
FakeQuantizeMovingAverageAbsMaxOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(
ctx->HasInput("X"),
"Input(X) of FakeQuantizeMovingAverageAbsMaxOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Output(Out) of FakeQuantizeMovingAverageAbsMaxOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("OutScale"),
"Output(OutScale) of FakeQuantizeMovingAverageAbsMaxOp "
"should not be null");
if (ctx->HasOutput("OutState")) {
ctx->SetOutputDim("OutState", {1});
}
if (ctx->HasOutput("OutAccum")) {
ctx->SetOutputDim("OutAccum", {1});
}
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->SetOutputDim("OutScale", {1});
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
ctx.device_context());
}
};
class FakeQuantizeMovingAverageAbsMaxOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) Input is float data type.");
AddInput("InScale", "Last scale.");
AddInput("InAccum", "Last accum.").AsDispensable();
AddInput("InState", "Last state.").AsDispensable();
AddOutput("Out", "(Tensor) Output of quantized low level tensor.");
AddOutput("OutScale", " Current scale");
AddOutput("OutState", "(Tensor) state buffer.").AsDispensable();
AddOutput("OutAccum", "(Tensor) accum buffer.").AsDispensable();
AddAttr<float>("moving_rate", "(float, default 0.9) moving rate.")
.SetDefault(0.9);
AddAttr<int>("bit_length", "(int, default 8), quantization bit number.")
.SetDefault(8)
.AddCustomChecker([](const int& bit_length) {
PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
"'bit_length' should be between 1 and 16.");
});
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddComment(R"DOC(
FakeQuantize operator is used in static quantization.
$$scale = (0.9*max(abs(x))+accum)/(0.9*state+1)$$
$$range = 2^{bit_length - 1} - 1$$
$$Out = round(X/scale * range)$$
)DOC");
}
};
} // namespace operators
} // namespace paddle
...@@ -273,6 +369,12 @@ REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp,
REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max,
ops::FakeQuantizeRangeAbsMaxKernel<CPU, float>);
REGISTER_OPERATOR(fake_quantize_moving_average_abs_max,
ops::FakeQuantizeMovingAverageAbsMaxOp,
ops::FakeQuantizeMovingAverageAbsMaxOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(fake_quantize_moving_average_abs_max,
ops::FakeQuantizeMovingAverageAbsMaxKernel<CPU, float>);
REGISTER_OPERATOR(fake_channel_wise_quantize_abs_max,
ops::FakeChannelWiseQuantizeAbsMaxOp,
ops::FakeChannelWiseQuantizeAbsMaxOpMaker,
......
...@@ -147,6 +147,41 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
template struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, float>;
template <typename T>
struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in_accum,
const framework::Tensor& in_state, const T* cur_scale,
const float rate, framework::Tensor* out_state,
framework::Tensor* out_accum, framework::Tensor* out_scale) {
const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
T accum;
memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data<T>(),
sizeof(T), 0);
T state;
memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data<T>(),
sizeof(T), 0);
T scale;
memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T),
0);
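// The three scalars were copied to the host above; update the running average
// on the CPU, then copy the results back to the device below.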
state = rate * state + 1;
accum = rate * accum + scale;
scale = accum / state;
memory::Copy(gpu_place, out_accum->mutable_data<T>(gpu_place),
platform::CPUPlace(), &accum, sizeof(T), 0);
memory::Copy(gpu_place, out_state->mutable_data<T>(gpu_place),
platform::CPUPlace(), &state, sizeof(T), 0);
memory::Copy(gpu_place, out_scale->mutable_data<T>(gpu_place),
platform::CPUPlace(), &scale, sizeof(T), 0);
}
};
template struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext,
float>;
template <typename T>
struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
...@@ -178,3 +213,6 @@ REGISTER_OP_CUDA_KERNEL(fake_channel_wise_quantize_abs_max,
ops::FakeChannelWiseQuantizeAbsMaxKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max,
ops::FakeQuantizeRangeAbsMaxKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(
fake_quantize_moving_average_abs_max,
ops::FakeQuantizeMovingAverageAbsMaxKernel<CUDA, float>);
...@@ -42,12 +42,20 @@ struct FindRangeAbsMaxFunctor {
framework::Tensor* scales_arr, framework::Tensor* out_scale);
};
template <typename DeviceContext, typename T>
struct FindMovingAverageAbsMaxFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum,
const framework::Tensor& in_state, const T* cur_scale,
const float rate, framework::Tensor* out_state,
framework::Tensor* out_accum, framework::Tensor* out_scale);
};
template <typename DeviceContext, typename T>
class FakeQuantizeAbsMaxKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
auto* out_scale = context.Output<framework::Tensor>("OutScale");
T* out_s = out_scale->mutable_data<T>(context.GetPlace());
...@@ -138,5 +146,54 @@ class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
}
};
template <typename DeviceContext, typename T>
class FakeQuantizeMovingAverageAbsMaxKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
auto* in_scale = context.Input<framework::Tensor>("InScale");
auto* out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
bool is_test = context.Attr<bool>("is_test");
int bit_length = context.Attr<int>("bit_length");
int bin_cnt = std::pow(2, bit_length - 1) - 1;
auto& dev_ctx = context.template device_context<DeviceContext>();
// testing
if (is_test) {
ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *in_scale,
bin_cnt, out);
return;
}
// training
auto* in_accum = context.Input<framework::Tensor>("InAccum");
auto* in_state = context.Input<framework::Tensor>("InState");
auto& allocator =
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
auto cur_scale = allocator.Allocate(1 * sizeof(T));
T* cur_scale_data = static_cast<T*>(cur_scale->ptr());
FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in->data<T>(), in->numel(),
cur_scale_data);
auto* out_state = context.Output<framework::Tensor>("OutState");
auto* out_accum = context.Output<framework::Tensor>("OutAccum");
auto* out_scale = context.Output<framework::Tensor>("OutScale");
out_state->mutable_data<T>(context.GetPlace());
out_accum->mutable_data<T>(context.GetPlace());
out_scale->mutable_data<T>(context.GetPlace());
float moving_rate = context.Attr<float>("moving_rate");
FindMovingAverageAbsMaxFunctor<DeviceContext, T>()(
dev_ctx, *in_accum, *in_state, cur_scale_data, moving_rate, out_state,
out_accum, out_scale);
ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *out_scale,
bin_cnt, out);
}
};
} // namespace operators
} // namespace paddle
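To summarize the kernel above: at inference time (`is_test`) the stored InScale is used unchanged, while during training the current batch's abs-max is folded into the accum/state buffers and the updated OutScale is used for quantization. A rough NumPy sketch of that control flow (a simplification, not the C++ kernel):

import numpy as np

def fake_quant_moving_average_abs_max(x, in_scale, in_accum, in_state,
                                      moving_rate=0.9, bit_length=8,
                                      is_test=False):
    bin_cnt = (1 << (bit_length - 1)) - 1

    def clip_and_quant(x, scale):
        # Mirrors ClipAndFakeQuantFunctor: clip to [-scale, scale], then round.
        return np.round(np.clip(x, -scale, scale) / scale * bin_cnt)

    if is_test:
        # Inference: quantize with the scale learned during training.
        return clip_and_quant(x, in_scale), in_scale, in_accum, in_state

    # Training: update the moving average, then quantize with the new scale.
    cur_scale = np.max(np.abs(x))
    out_state = moving_rate * in_state + 1.0
    out_accum = moving_rate * in_accum + cur_scale
    out_scale = out_accum / out_state
    return clip_and_quant(x, out_scale), out_scale, out_accum, out_state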
...@@ -84,7 +84,8 @@ class QuantizeTranspiler(object):
activation_bits=8,
activation_quantize_type='abs_max',
weight_quantize_type='abs_max',
window_size=10000,
moving_rate=0.9):
""" """
Convert and rewrite the fluid Program according to weight and Convert and rewrite the fluid Program according to weight and
activation quantization type. activation quantization type.
...@@ -117,23 +118,27 @@ class QuantizeTranspiler(object):
"""
self.weight_bits = weight_bits
self.activation_bits = activation_bits
quant_type = ['abs_max', 'range_abs_max', 'moving_average_abs_max']
if weight_quantize_type not in quant_type:
raise ValueError(
"Unknown weight_quantize_type: '%s'. It can only be ",
"'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
str(weight_quantize_type))
if activation_quantize_type not in quant_type:
raise ValueError(
"Unknown activation_quantize_type : '%s'. It can only be ",
"'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
str(activation_quantize_type))
self.weight_quantize_type = weight_quantize_type
self.activation_quantize_type = activation_quantize_type
self.window_size = window_size
self.moving_rate = moving_rate
self.helper = LayerHelper(self.__class__.__name__)
self.fake_quant_op_types = [
'fake_quantize_abs_max', 'fake_quantize_range_abs_max',
'fake_quantize_moving_average_abs_max'
]
self.fake_dequant_op_types = ['fake_dequantize_max_abs']
self.is_test = None
...@@ -168,6 +173,7 @@ class QuantizeTranspiler(object):
block_id = block.idx
# insert quant op and dequant op
for name in op.input_arg_names:
# an input may be shared between ops; reuse its dequantized var if it exists
if name in dequanted_vars[block_id]:
dequant_var = dequanted_vars[block_id][name]
else:
...@@ -261,6 +267,7 @@ class QuantizeTranspiler(object):
max_range = None
scale_var = None
for name in op.input_arg_names:
# rename the op's input to the input name of the preceding op, which has been removed
if name in op_in_rename_map[block_id]:
op._rename_input(name, op_in_rename_map[block_id][name])
...@@ -272,8 +279,7 @@ class QuantizeTranspiler(object):
max_range = param_range * act_range / scale_v
else:
assert isinstance(scale_v, Variable)
scale_var = scale_v
if len(op.output_arg_names) != 1:
raise ValueError("Only support one output, but op %s has"
...@@ -309,7 +315,7 @@ class QuantizeTranspiler(object):
op_type = op.type
# insert dequant_op after fc/conv, need to rename
# the inputs of the ops following the fc/conv to the dequant_op's output
for name in op.input_arg_names:
if name in op_out_rename_map[block_id]:
op._rename_input(name,
...@@ -389,8 +395,8 @@ class QuantizeTranspiler(object):
for op in block.ops:
args += op.input_arg_names
args += op.output_arg_names
args = list(set(args))  # vars used by the remaining ops
var_names = block.vars.keys()  # all vars in the block
sub_block_remove_vars = []
for var in var_names:
if var not in args:
...@@ -471,6 +477,61 @@ class QuantizeTranspiler(object):
return quant_var, scale
def _insert_quant_moving_average_abs_max_op(self, block, idx, var,
quant_bits):
"""Insert fake_quantize_moving_average_abs_max
"""
quant_var = block.create_var(
name=_quantized_var_name(var.name),
type=var.type,
shape=var.shape,
dtype=var.dtype)
state = self.helper.create_global_variable(
name=unique_name.generate('state'),
persistable=True,
dtype=var.dtype,
shape=[1])
self.helper.set_variable_initializer(
state, initializer=Constant(value=1))
accum = self.helper.create_global_variable(
name=unique_name.generate('accum'),
persistable=True,
dtype=var.dtype,
shape=[1])
self.helper.set_variable_initializer(
accum, initializer=Constant(value=1))
scale = self.helper.create_parameter(
attr=ParamAttr(
name=_quantized_scale_name(var.name),
initializer=Constant(0.001),
trainable=False),
shape=[1],
dtype=var.dtype)
scale.stop_gradient = True
ins = {'X': var, 'InScale': scale}
outs = {'Out': quant_var, 'OutScale': scale}
if not self.is_test:
ins['InState'] = state
ins['InAccum'] = accum
outs['OutState'] = state
outs['OutAccum'] = accum
attrs = {
'bit_length': quant_bits,
'moving_rate': self.moving_rate,
'is_test': self.is_test
}
quant_op = block._insert_op(
idx,
type='fake_quantize_moving_average_abs_max',
attrs=attrs,
inputs=ins,
outputs=outs)
return quant_var, scale
def _insert_quant_op(self, block, idx, var, quant_bits, quant_type):
"""
Insert fake_quantize_op
...@@ -480,6 +541,9 @@ class QuantizeTranspiler(object):
elif quant_type == 'range_abs_max':
return self._insert_quant_range_abs_max_op(block, idx, var,
quant_bits)
elif quant_type == 'moving_average_abs_max':
return self._insert_quant_moving_average_abs_max_op(block, idx, var,
quant_bits)
def _insert_dequant_op(self, block, idx, var, scale, quant_bits):
"""
......
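Selecting the new scheme from the transpiler is just a constructor argument. A hedged usage sketch (the model-building and executor setup are assumed to exist elsewhere; the constructor and training_transpile signatures come from this diff and the API spec above):

import paddle.fluid as fluid
from paddle.fluid.contrib import QuantizeTranspiler

main_program = fluid.default_main_program()        # assumed: model already built
startup_program = fluid.default_startup_program()

transpiler = QuantizeTranspiler(
    weight_bits=8,
    activation_bits=8,
    activation_quantize_type='moving_average_abs_max',
    weight_quantize_type='abs_max',
    moving_rate=0.9)

# Rewrites the training program in place, inserting the new
# fake_quantize_moving_average_abs_max op for activations.
transpiler.training_transpile(main_program, startup_program)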
...@@ -38,7 +38,8 @@ class QuantizationTransformPass(object):
activation_bits=8,
activation_quantize_type='abs_max',
weight_quantize_type='abs_max',
window_size=10000,
moving_rate=0.9):
""" """
Convert and rewrite the IrGraph according to weight and Convert and rewrite the IrGraph according to weight and
activation quantization type. activation quantization type.
...@@ -83,19 +84,22 @@ class QuantizationTransformPass(object): ...@@ -83,19 +84,22 @@ class QuantizationTransformPass(object):
self._weight_bits = weight_bits self._weight_bits = weight_bits
self._activation_bits = activation_bits self._activation_bits = activation_bits
quant_type = ['abs_max', 'range_abs_max'] quant_type = ['abs_max', 'range_abs_max', 'moving_average_abs_max']
if activation_quantize_type not in quant_type: if activation_quantize_type not in quant_type:
raise ValueError( raise ValueError(
"Unknown activation_quantize_type : '%s'. It can only be ", "Unknown activation_quantize_type : '%s'. It can only be ",
"'abs_max' or 'range_abs_max'.", str(activation_quantize_type)) "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
str(activation_quantize_type))
if weight_quantize_type not in quant_type:
raise ValueError(
"Unknown weight_quantize_type: '%s'. It can only be ",
"'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
str(weight_quantize_type))
self._activation_quantize_type = activation_quantize_type
self._weight_quantize_type = weight_quantize_type
self._window_size = window_size
self._moving_rate = moving_rate
self._need_initialized = collections.OrderedDict()
self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
...@@ -222,6 +226,9 @@ class QuantizationTransformPass(object):
elif quant_type == 'range_abs_max':
return self._insert_quant_range_abs_max_op(graph, var_node,
quant_bits)
elif quant_type == 'moving_average_abs_max':
return self._insert_quant_moving_average_abs_max_op(graph, var_node,
quant_bits)
def _insert_quant_abs_max_op(self, graph, var_node, quant_bits):
"""
...@@ -309,6 +316,74 @@ class QuantizationTransformPass(object):
return quant_var_node, scale_out_node
def _insert_quant_moving_average_abs_max_op(self, graph, var_node,
quant_bits):
"""Insert fake_quantize_moving_average_abs_max
"""
quant_var_node = graph.create_var_node(
name=self._quantized_var_name(var_node.name()),
var_type=var_node.type(),
shape=var_node.shape(),
var_dtype=var_node.dtype())
scale_in_node = graph.create_persistable_node(
name=self._quantized_scale_name(var_node.name()),
var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=[1],
var_dtype=var_node.dtype())
self._need_initialized[scale_in_node.var()] = Constant(value=0.001)
scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
ins = {'X': var_node, 'InScale': scale_in_node}
outs = {'Out': quant_var_node, 'OutScale': scale_out_node}
if not self._is_test:
state_in_node = graph.create_persistable_node(
name=unique_name.generate('state'),
var_type=core.VarDesc.VarType.LOD_TENSOR,
var_dtype=var_node.dtype(),
shape=[1])
self._need_initialized[state_in_node.var()] = Constant(value=1)
accum_in_node = graph.create_persistable_node(
name=unique_name.generate('accum'),
var_type=core.VarDesc.VarType.LOD_TENSOR,
var_dtype=var_node.dtype(),
shape=[1])
self._need_initialized[accum_in_node.var()] = Constant(value=1)
state_out_node = graph.create_var_node_from_desc(state_in_node.var(
))
accum_out_node = graph.create_var_node_from_desc(accum_in_node.var(
))
ins['InState'] = state_in_node
ins['InAccum'] = accum_in_node
outs['OutState'] = state_out_node
outs['OutAccum'] = accum_out_node
attrs = {
'bit_length': quant_bits,
'moving_rate': self._moving_rate,
'is_test': self._is_test,
'op_role': core.op_proto_and_checker_maker.OpRole.Forward
}
quant_op_node = graph.create_op_node(
op_type='fake_quantize_moving_average_abs_max',
attrs=attrs,
inputs=ins,
outputs=outs)
graph.link_to(var_node, quant_op_node)
graph.link_to(scale_in_node, quant_op_node)
graph.link_to(quant_op_node, quant_var_node)
graph.link_to(quant_op_node, scale_out_node)
if not self._is_test:
graph.link_to(state_in_node, quant_op_node)
graph.link_to(accum_in_node, quant_op_node)
graph.link_to(quant_op_node, state_out_node)
graph.link_to(quant_op_node, accum_out_node)
return quant_var_node, scale_out_node
def _insert_dequant_op(self, graph, var_node, scale_var_node, quant_bits):
"""
Insert fake_dequantize_op in the graph.
...@@ -389,7 +464,8 @@ class QuantizationFreezePass(object):
self._weight_quantize_type = weight_quantize_type
self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
self._fake_quant_op_names = [
'fake_quantize_abs_max', 'fake_quantize_range_abs_max',
'fake_quantize_moving_average_abs_max'
]
self._fake_dequant_op_names = ['fake_dequantize_max_abs']
self._op_input_rename_map = collections.OrderedDict()
......
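The IrGraph-based pass accepts the same new arguments. A hedged sketch of how it might be driven; the import path, the scope/place constructor arguments, and the IrGraph construction follow the surrounding quantization_pass.py and its test, which are not fully shown in this diff:

import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.framework import IrGraph
from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass

main_program = fluid.default_main_program()          # assumed: model already built
graph = IrGraph(core.Graph(main_program.desc), for_test=False)

transform_pass = QuantizationTransformPass(
    scope=fluid.global_scope(),
    place=fluid.CPUPlace(),
    activation_quantize_type='moving_average_abs_max',
    moving_rate=0.9)
transform_pass.apply(graph)   # inserts fake_quantize_moving_average_abs_max nodes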
...@@ -164,6 +164,9 @@ class TestQuantizationTransformPass(unittest.TestCase):
def test_linear_fc_quant_range_abs_max(self):
self.linear_fc_quant('range_abs_max', for_ci=True)
def test_linear_fc_quant_moving_average_abs_max(self):
self.linear_fc_quant('moving_average_abs_max', for_ci=True)
def residual_block_quant(self, quant_type, for_ci=False):
main = fluid.Program()
startup = fluid.Program()
...@@ -201,6 +204,9 @@ class TestQuantizationTransformPass(unittest.TestCase):
def test_residual_block_range_abs_max(self):
self.residual_block_quant('range_abs_max', for_ci=True)
def test_residual_block_moving_average_abs_max(self):
self.residual_block_quant('moving_average_abs_max', for_ci=True)
class TestQuantizationFreezePass(unittest.TestCase):
def freeze_graph(self, use_cuda, seed, quant_type, for_ci=False):
...@@ -380,11 +386,18 @@ class TestQuantizationFreezePass(unittest.TestCase):
with fluid.unique_name.guard():
self.freeze_graph(
True, seed=1, quant_type='range_abs_max', for_ci=True)
self.freeze_graph(
True,
seed=1,
quant_type='moving_average_abs_max',
for_ci=True)
def test_freeze_graph_cpu_static(self):
with fluid.unique_name.guard():
self.freeze_graph(
False, seed=2, quant_type='range_abs_max', for_ci=True)
self.freeze_graph(
False, seed=2, quant_type='moving_average_abs_max', for_ci=True)
if __name__ == '__main__':
......
...@@ -17,6 +17,7 @@ from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
import paddle.fluid.core as core
class TestFakeQuantizeOp(OpTest):
...@@ -75,6 +76,7 @@ class TestFakeQuantizeRangeAbsMaxOp(OpTest):
'InScale': np.zeros(1).astype("float32")
}
scale = np.max(np.abs(self.inputs['X'])).astype("float32")
out_scales = np.zeros(self.attrs['window_size']).astype("float32")
out_scales[0] = scale
self.outputs = {
...@@ -88,6 +90,46 @@ class TestFakeQuantizeRangeAbsMaxOp(OpTest):
self.check_output()
class TestFakeQuantizeMovingOp(OpTest):
def setUp(self):
self.op_type = "fake_quantize_moving_average_abs_max"
self.attrs = {
'bit_length': int(5),
'moving_rate': float(0.9),
'is_test': False
}
accum = np.zeros(1).astype("float32")
accum[0] = 1
state = np.zeros(1).astype("float32")
state[0] = 1
scale = np.zeros(1).astype("float32")
scale[0] = 0.001
self.inputs = {
'X': np.random.random((8, 16, 7, 7)).astype("float32"),
'InScale': scale,
'InAccum': accum,
'InState': state,
}
out_accum = np.zeros(1).astype("float32")
out_state = np.zeros(1).astype("float32")
out_scale = np.zeros(1).astype("float32")
out_accum[0] = self.attrs['moving_rate'] * accum[0] + np.max(
np.abs(self.inputs['X'])).astype("float32")
out_state[0] = self.attrs['moving_rate'] * state[0] + 1
out_scale = out_accum / out_state
self.outputs = {
'Out': np.round(self.inputs['X'] / out_scale * (
(1 << (self.attrs['bit_length'] - 1)) - 1)),
'OutAccum': out_accum,
'OutState': out_state,
'OutScale': out_scale,
}
def test_check_output(self):
self.check_output()
class TestFakeQuantizeRangeAbsMaxOp2(OpTest):
def setUp(self):
self.op_type = "fake_quantize_range_abs_max"
......