diff --git a/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py b/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py
index cde3d991a7f2fda5b2ab5ed57ac5bc0f5d06b143..753d68f79703271b3719cfe6f920d3008c003c64 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py
@@ -17,6 +17,7 @@ import logging
 import numpy as np
 from .... import core
 from ....framework import Program, Operator, Variable, program_guard
+from ....executor import global_scope
 from .... import unique_name
 from ....layer_helper import LayerHelper
 from ....param_attr import ParamAttr
@@ -27,26 +28,49 @@ _logger = get_logger(
     __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
+def find_next_ops(block, var_name):
+    """
+    Find all followed ops for the input variable.
+    """
+    res_ops = []
+    for op in block.ops:
+        if var_name in op.input_arg_names:
+            res_ops.append(op)
+    return res_ops
+
+
+def load_variable_data(scope, var_name):
+    '''
+    Load variable value from scope
+    '''
+    var_node = scope.find_var(var_name)
+    assert var_node is not None, \
+        "Cannot find " + var_name + " in scope."
+    return np.array(var_node.get_tensor())
+
+
 class QuantizeTranspilerV2(object):
     def __init__(self,
                  weight_bits=8,
                  activation_bits=8,
                  weight_quantize_type='abs_max',
-                 activation_quantize_type='abs_max',
-                 quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul'],
+                 activation_quantize_type='moving_average_abs_max',
+                 quantizable_op_type=[
+                     'conv2d',
+                     'depthwise_conv2d',
+                     'mul',
+                 ],
                  skip_pattern=['skip_quant']):
         """
-        Add quant_dequant op before the quantized op to quantize the fluid Program.
-        It is a patch for distributed quantization, we will support others module for
-        distributed quantization.
+        Apply fake quant for the quantized ops.
 
         Args:
             weight_bits(int): the bit of quantized weight.
             activation_bits(int): the bit of quantized activation.
             weight_quantize_type(str): the quantization type for weight.
-                Only support to be 'abs_max' for now.
+                Only support to be 'abs_max' and 'channel_wise_abs_max'.
             activation_quantize_type(str): the quantization type for activation.
-                Only support to be 'abs_max' for now.
+                Only support to be 'abs_max' and 'moving_average_abs_max'.
             quantizable_op_type(str): set the op type for quantization.
             skip_pattern(str|list): The user-defined quantization skip pattern, which
                 will be presented in the name scope of an op. When the skip pattern is
@@ -55,28 +79,37 @@ class QuantizeTranspilerV2(object):
         self._weight_bits = weight_bits
         self._activation_bits = activation_bits
 
-        assert activation_quantize_type == "abs_max", \
-            "activation_quantize_type should be abs_max for now."
-        assert weight_quantize_type == "abs_max", \
-            "weight_quantize_type should be abs_max for now."
+        assert activation_quantize_type in \
+            ["abs_max", "moving_average_abs_max"], \
+            "activation_quantize_type should be abs_max " \
+            "or moving_average_abs_max for now."
+        assert weight_quantize_type in ["abs_max", "channel_wise_abs_max"], \
+            "weight_quantize_type should be abs_max or channel_wise_abs_max."
         self._activation_quantize_type = activation_quantize_type
         self._weight_quantize_type = weight_quantize_type
 
+        for op_type in quantizable_op_type:
+            assert op_type in ['conv2d', 'depthwise_conv2d', 'mul'], \
+                "Quantize op should be ['conv2d', 'depthwise_conv2d', 'mul']"
         self._quantizable_ops = quantizable_op_type
         self._quantizable_grad_ops = [
             '%s_grad' % (op) for op in self._quantizable_ops
         ]
         self._skip_pattern = skip_pattern
-        self.helper = LayerHelper(self.__class__.__name__)
+        self._helper = LayerHelper(self.__class__.__name__)
 
-    def apply(self, program, startup_program):
+        self._moving_rate = 0.9
+        self._out_ch_axis1_ops = ['conv2d_transpose', 'mul', 'matmul']
+
+    def apply(self, program, startup_program, is_test=False):
         """
         Apply quantization to fluid Program.
 
         Args:
             program(Program): the train or test program to be quantized.
             startup_program(Program): the corresponding startup_program.
+            is_test(bool): Whether the program is used for testing.
 
         Returns:
             None
         """
@@ -85,7 +118,7 @@ class QuantizeTranspilerV2(object):
         assert isinstance(startup_program, Program), \
             "startup_program must be the instance of Program"
 
-        quant_dequant_vars = [
+        var_rename_map = [
            collections.OrderedDict() for _ in range(len(program.blocks))
         ]
         with program_guard(program, startup_program):
@@ -94,13 +127,104 @@ class QuantizeTranspilerV2(object):
                 for op in ops:
                     if op.type in self._quantizable_ops and \
                         (not self._is_skip_quant(op)):
-                        self._transform_forward(block, op, quant_dequant_vars)
+                        self._transform_forward(block, op, var_rename_map,
+                                                is_test)
+
             for block in program.blocks:
                 ops = list(block.ops)
                 for op in ops:
                     if op.type in self._quantizable_grad_ops and \
                         (not self._is_skip_quant(op)):
-                        self._transform_backward(block, op, quant_dequant_vars)
+                        self._transform_backward(block, op, var_rename_map)
+
+    def convert(self, test_program, scope=None):
+        """
+        Convert the test program.
+        Get the out scale from the moving_average_abs_max_scale op and save it
+        into the quantized op.
+
+        Args:
+            test_program(Program): the test program to be converted.
+            scope(fluid.Scope, optional): The scope of the program, used to load
+                and save variables. If scope=None, get the scope by global_scope().
+        """
+        scope = global_scope() if scope == None else scope
+
+        for block in test_program.blocks:
+            for op in block.ops:
+                if op.has_attr("quantization_type") \
+                    and op.attr("quantization_type") == "qat_with_weight":
+                    # quant op -> var1 -> fake op -> var2
+                    assert len(op.output_arg_names) == 1
+                    var1_name = op.output_arg_names[0]
+
+                    fake_ops = find_next_ops(block, var1_name)
+                    assert len(fake_ops) == 1
+                    fake_op = fake_ops[0]
+                    assert fake_op.type == "moving_average_abs_max_scale"
+
+                    out_scale_name = fake_op.output("OutScale")
+                    out_threshold = load_variable_data(scope, out_scale_name[0])
+                    op._set_attr("out_threshold", float(out_threshold))
+
+                    var2_name = fake_op.output("Out")[0]
+                    op._rename_output(var1_name, var2_name)
+                    fake_op._rename_output(var2_name, var1_name)
+
+    def _transform_forward(self, block, op, var_rename_map, is_test):
+        """
+        Insert fake quant op before the target ops.
+ """ + op._set_attr("quantization_type", "qat_with_weight") + + # insert fake quant op before the quantized op + for in_name in op.input_arg_names: + block_id = block.idx + idx = block.ops.index(op) + + if in_name in var_rename_map[block_id]: + new_in_name = var_rename_map[block_id][in_name] + else: + in_var = block.var(in_name) + if in_var.dtype != core.VarDesc.VarType.FP32: + continue + + quant_bits = self._weight_bits if in_var.persistable \ + else self._activation_bits + quant_type = self._weight_quantize_type if in_var.persistable \ + else self._activation_quantize_type + + if quant_type == "abs_max": + new_var = self._insert_abs_max_fq_op(block, idx, in_var, + quant_bits) + elif quant_type == "moving_average_abs_max": + new_var = self._insert_ma_abs_max_fq_op(block, idx, in_var, + quant_bits, is_test) + elif quant_type == "channel_wise_abs_max": + ch_axis = 1 if op.type in self._out_ch_axis1_ops else 0 + new_var = self._insert_pc_abs_max_fq_op(block, idx, in_var, + quant_bits, ch_axis) + else: + _logger.error("Don't support the quant_type: %s" % + quant_type) + continue + + new_in_name = new_var.name + var_rename_map[block_id][in_name] = new_in_name + + op._rename_input(in_name, new_in_name) + + # insert out scale op followed the quantized op + for out_name in op.output_arg_names: + next_ops = find_next_ops(block, out_name) + + idx = block.ops.index(op) + out_var = block.var(out_name) + new_out_var = self._insert_ma_abs_max_scale_op( + block, idx + 1, out_var, is_test, True) + + for next_op in next_ops: + if "_grad" not in next_op.type: + next_op._rename_input(out_name, new_out_var.name) def _is_skip_quant(self, op): """ @@ -117,49 +241,35 @@ class QuantizeTranspilerV2(object): self._skip_pattern) != -1 return user_skipped - def _transform_forward(self, block, op, quant_dequant_vars): - op._set_attr("quantization_type", "qat_with_weight") - idx = block.ops.index(op) - block_id = block.idx - for in_name in op.input_arg_names: - if in_name in quant_dequant_vars[block_id]: - quant_dequant_var = quant_dequant_vars[block_id][in_name] - else: - in_var = block.var(in_name) - quant_bits = self._weight_bits if in_var.persistable \ - else self._activation_bits - quant_type = self._weight_quantize_type if in_var.persistable \ - else self._activation_quantize_type - if quant_type == "abs_max": - quant_dequant_var = self._insert_quant_dequant_abs_max_op( - block, idx, in_var, quant_bits) - else: - _logger.error("Quant_type only supported to be abs_max") - quant_dequant_vars[block_id][in_name] = quant_dequant_var - op._rename_input(in_name, quant_dequant_var.name) - - def _transform_backward(self, block, op, quant_dequant_vars): + def _transform_backward(self, block, op, var_rename_map): + """ + Update the backword of the target ops. + Note: for the grad ops, only rename the input, skip rename the output. + """ block_id = block.idx no_dequanted_input_vars = True for name in op.input_arg_names: - if name in quant_dequant_vars[block_id]: - dequant_var = quant_dequant_vars[block_id][name] - op._rename_input(name, dequant_var.name) + if name in var_rename_map[block_id]: + new_var_name = var_rename_map[block_id][name] + op._rename_input(name, new_var_name) no_dequanted_input_vars = False if no_dequanted_input_vars: raise ValueError("There is no dequanted inputs for op %s." % (op.type)) - def _insert_quant_dequant_abs_max_op(self, block, idx, in_var, quant_bits): + def _insert_abs_max_fq_op(self, block, idx, in_var, quant_bits): + """ + Inset abs max fake quant op. 
+ """ quant_dequant_var = block.create_var( type=in_var.type, name="{}.quant_dequant".format(in_var.name), shape=in_var.shape, dtype=in_var.dtype) - scale_var = self.helper.create_parameter( + scale_var = self._helper.create_parameter( attr=ParamAttr( name="{}.quant_dequant.scale".format(in_var.name), - initializer=Constant(0.001), + initializer=Constant(0.), trainable=False), shape=[1], dtype=in_var.dtype) @@ -175,3 +285,157 @@ class QuantizeTranspilerV2(object): inputs=inputs, outputs=outputs) return quant_dequant_var + + def _insert_ma_abs_max_fq_op(self, block, idx, in_var, quant_bits, is_test): + """ + Insert moving average abs max fake quant op. + """ + quant_dequant_var = block.create_var( + type=in_var.type, + name="{}.quant_dequant".format(in_var.name), + shape=in_var.shape, + dtype=in_var.dtype) + + scale_var = self._helper.create_parameter( + attr=ParamAttr( + name="{}.quant_dequant.scale".format(in_var.name), + initializer=Constant(0.), + trainable=False), + shape=[1], + dtype=in_var.dtype) + scale_var.stop_gradient = True + + if not is_test: + state_var = self._helper.create_parameter( + attr=ParamAttr( + name="{}.quant_dequant.state".format(in_var.name), + initializer=Constant(0), + trainable=False), + shape=[1], + dtype=in_var.dtype) + state_var.stop_gradient = True + + accum_var = self._helper.create_parameter( + attr=ParamAttr( + name="{}.quant_dequant.accum".format(in_var.name), + initializer=Constant(0), + trainable=False), + shape=[1], + dtype=in_var.dtype) + accum_var.stop_gradient = True + + attrs = { + 'moving_rate': self._moving_rate, + 'bit_length': quant_bits, + 'is_test': is_test + } + inputs = {'X': in_var, 'InScale': scale_var} + outputs = {'Out': quant_dequant_var, 'OutScale': scale_var} + if not is_test: + inputs['InState'] = state_var + inputs['InAccum'] = accum_var + outputs['OutState'] = state_var + outputs['OutAccum'] = accum_var + + block._insert_op( + idx, + type='fake_quantize_dequantize_moving_average_abs_max', + attrs=attrs, + inputs=inputs, + outputs=outputs) + return quant_dequant_var + + def _insert_pc_abs_max_fq_op(self, block, idx, in_var, quant_bits, ch_axis): + """ + Insert per channel abs max fake quant op. + """ + quant_dequant_var = block.create_var( + type=in_var.type, + name="{}.quant_dequant".format(in_var.name), + shape=in_var.shape, + dtype=in_var.dtype) + + scale_var = self._helper.create_parameter( + attr=ParamAttr( + name="{}.quant_dequant.scale".format(in_var.name), + initializer=Constant(0.), + trainable=False), + shape=[in_var.shape[ch_axis]], + dtype=in_var.dtype) + scale_var.stop_gradient = True + + inputs = {'X': in_var} + outputs = {'Out': quant_dequant_var, 'OutScale': scale_var} + attrs = {'bit_length': quant_bits, 'quant_axis': ch_axis} + block._insert_op( + idx, + type='fake_channel_wise_quantize_dequantize_abs_max', + attrs=attrs, + inputs=inputs, + outputs=outputs) + return quant_dequant_var + + def _insert_ma_abs_max_scale_op(self, + block, + idx, + in_var, + is_test, + has_out_var=False): + """ + Insert moving average abs max scale op. 
+ """ + scale_var = self._helper.create_parameter( + attr=ParamAttr( + name="{}.outscale.scale".format(in_var.name), + initializer=Constant(0.), + trainable=False), + shape=[1], + dtype=in_var.dtype) + scale_var.stop_gradient = True + + attrs = {'moving_rate': self._moving_rate, 'is_test': is_test} + inputs = {'X': in_var} + outputs = {'OutScale': scale_var} + + if not is_test: + state_var = self._helper.create_parameter( + attr=ParamAttr( + name="{}.outscale.state".format(in_var.name), + initializer=Constant(0), + trainable=False), + shape=[1], + dtype=in_var.dtype) + state_var.stop_gradient = True + + accum_var = self._helper.create_parameter( + attr=ParamAttr( + name="{}.outscale.accum".format(in_var.name), + initializer=Constant(0), + trainable=False), + shape=[1], + dtype=in_var.dtype) + accum_var.stop_gradient = True + + inputs['InState'] = state_var + inputs['InAccum'] = accum_var + outputs['OutState'] = state_var + outputs['OutAccum'] = accum_var + + if has_out_var: + out_var = block.create_var( + type=in_var.type, + name="{}.tmp".format(in_var.name), + shape=in_var.shape, + dtype=in_var.dtype) + + outputs['Out'] = out_var + + block._insert_op( + idx, + type='moving_average_abs_max_scale', + attrs=attrs, + inputs=inputs, + outputs=outputs) + + if has_out_var: + return out_var diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py b/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py index 00f2b597d934ba9467c7f37fcbbac843a4223ac8..aa9f6a1801cbf642d9f420a36e6ac5df3f84d2b6 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py @@ -79,6 +79,7 @@ class TestQuantizeProgramPass(unittest.TestCase): random.seed(0) np.random.seed(0) + # 1 Define program train_program = fluid.Program() startup_program = fluid.Program() test_program = fluid.Program() @@ -93,15 +94,14 @@ class TestQuantizeProgramPass(unittest.TestCase): test_graph = IrGraph(core.Graph(test_program.desc), for_test=True) test_graph.draw('.', 'test_program_1') + # 2 Apply quantization qt = QuantizeTranspilerV2( activation_quantize_type=activation_quant_type, - weight_quantize_type=weight_quant_type, - quantizable_op_type=[ - 'conv2d', 'depthwise_conv2d', 'mul', 'pool2d' - ]) - qt.apply(train_program, startup_program) - qt.apply(test_program, startup_program) + weight_quantize_type=weight_quant_type) + qt.apply(train_program, startup_program, is_test=False) + qt.apply(test_program, startup_program, is_test=True) + # 3 Train place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) scope = fluid.Scope() @@ -120,28 +120,32 @@ class TestQuantizeProgramPass(unittest.TestCase): build_strategy.fuse_all_reduce_ops = False binary = fluid.CompiledProgram(train_program).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy) - iters = 2 + iters = 5 batch_size = 8 train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=batch_size) feeder = fluid.DataFeeder(feed_list=feeds, place=place) with fluid.scope_guard(scope): - for _ in range(iters): + for idx in range(iters): data = next(train_reader()) loss_v = exe.run(binary, feed=feeder.feed(data), fetch_list=[loss]) - if not for_ci: - print('{}: {}'.format('loss', loss_v)) + if not for_ci and idx % 20 == 0: + print('{}: {}'.format('loss', np.mean(loss_v))) + print('{}: {}'.format('loss', np.mean(loss_v))) + + # 4 Convert + qt.convert(test_program, scope) if not for_ci: with 
fluid.scope_guard(scope): fluid.io.save_inference_model('./infer_model', ['image', 'label'], [loss], exe, test_program) - def test_quantize_program_gpu(self): + def test_gpu_1(self): if fluid.core.is_compiled_with_cuda(): self.quantize_program( use_cuda=True, @@ -150,7 +154,16 @@ class TestQuantizeProgramPass(unittest.TestCase): weight_quant_type='abs_max', for_ci=True) - def test_quantize_program_cpu(self): + def test_gpu_2(self): + if fluid.core.is_compiled_with_cuda(): + self.quantize_program( + use_cuda=True, + seed=1, + activation_quant_type='moving_average_abs_max', + weight_quant_type='channel_wise_abs_max', + for_ci=True) + + def test_cpu_1(self): self.quantize_program( use_cuda=False, seed=2, @@ -158,6 +171,14 @@ class TestQuantizeProgramPass(unittest.TestCase): weight_quant_type='abs_max', for_ci=True) + def test_cpu_2(self): + self.quantize_program( + use_cuda=False, + seed=2, + activation_quant_type='moving_average_abs_max', + weight_quant_type='channel_wise_abs_max', + for_ci=True) + if __name__ == '__main__': unittest.main()
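
For reviewers who want to try the new API end to end, below is a minimal usage sketch (not part of the patch) that mirrors the updated unit test: build the programs, call apply, run a few iterations, then convert and save the inference model. It assumes the legacy paddle.fluid static-graph API of this branch, that QuantizeTranspilerV2 is importable from paddle.fluid.contrib.slim.quantization as in the test, and the toy network and variable names are illustrative only.

import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.contrib.slim.quantization import QuantizeTranspilerV2

paddle.enable_static()


def build_net(main_prog, startup_prog):
    # A toy conv + fc classifier; any conv2d/depthwise_conv2d/mul based network
    # is handled the same way by the transpiler.
    with fluid.unique_name.guard():
        with fluid.program_guard(main_prog, startup_prog):
            image = fluid.data(name='image', shape=[None, 1, 28, 28], dtype='float32')
            label = fluid.data(name='label', shape=[None, 1], dtype='int64')
            conv = fluid.layers.conv2d(image, num_filters=6, filter_size=3, act='relu')
            pred = fluid.layers.fc(conv, size=10, act='softmax')
            loss = fluid.layers.mean(fluid.layers.cross_entropy(pred, label))
    return loss


train_prog, startup_prog, test_prog = fluid.Program(), fluid.Program(), fluid.Program()
train_loss = build_net(train_prog, startup_prog)
test_loss = build_net(test_prog, fluid.Program())

# 1 Insert fake quant / out-scale ops into both programs.
qt = QuantizeTranspilerV2(
    activation_quantize_type='moving_average_abs_max',
    weight_quantize_type='channel_wise_abs_max')
qt.apply(train_prog, startup_prog, is_test=False)
qt.apply(test_prog, startup_prog, is_test=True)

# 2 Run a few forward passes so the moving-average scales are collected
#   (a real run would add an optimizer and train for many iterations).
place = fluid.CPUPlace()
exe = fluid.Executor(place)
scope = fluid.Scope()
with fluid.scope_guard(scope):
    exe.run(startup_prog)
    feed = {'image': np.random.rand(8, 1, 28, 28).astype('float32'),
            'label': np.random.randint(0, 10, (8, 1)).astype('int64')}
    exe.run(train_prog, feed=feed, fetch_list=[train_loss])

# 3 Fold the collected out scales into the quantized ops and save the model.
qt.convert(test_prog, scope)
with fluid.scope_guard(scope):
    fluid.io.save_inference_model('./infer_model', ['image', 'label'],
                                  [test_loss], exe, test_prog)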
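As background on what the inserted operators compute, here is a rough numpy illustration of per-tensor 'abs_max' fake quant-dequant and of the moving-average scale statistics kept in the State/Accum parameters above. The function names are made up for the example, and the 0.9 rate simply mirrors the self._moving_rate default; treat this as a sketch of the math rather than the operator implementations.

import numpy as np


def fake_quant_dequant_abs_max(x, bits=8):
    # Per-tensor symmetric quantization: the scale is max(|x|); values are
    # rounded onto the integer grid [-127, 127] (for 8 bits) and mapped back.
    bnt = (1 << (bits - 1)) - 1
    scale = np.max(np.abs(x))
    return np.round(x / scale * bnt) * scale / bnt


def moving_average_scale(x, state, accum, rate=0.9):
    # Running estimate of the activation range: accum keeps a decayed sum of
    # batch abs-max values, state keeps the decayed count, scale = accum / state.
    accum = rate * accum + np.max(np.abs(x))
    state = rate * state + 1.0
    return accum / state, state, accum


x = np.random.randn(4, 8).astype('float32')
print("max quant-dequant error:", np.max(np.abs(x - fake_quant_dequant_abs_max(x))))
scale, state, accum = moving_average_scale(x, state=0.0, accum=0.0)
print("initial moving-average scale:", scale)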