diff --git a/imperative/python/megengine/functional/quantized.py b/imperative/python/megengine/functional/quantized.py
index 3872c159d74138fafcdeeee363af4170dc55ba68..0ae082b2d14f901adc6880f13972ae791c962e87 100644
--- a/imperative/python/megengine/functional/quantized.py
+++ b/imperative/python/megengine/functional/quantized.py
@@ -45,8 +45,6 @@ def conv_bias_activation(
     :param conv_mode: supports 'CROSS_CORRELATION' or 'CONVOLUTION'. Default:
         'CROSS_CORRELATION'
     :param dtype: support for ``np.dtype``, Default: np.int8
-    :param scale: scale if use quantization, Default: 0.0
-    :param zero_point: scale if use quantization quint8, Default: 0.0
     :type compute_mode: string or
         :class:`P.Convolution.ComputeMode`.
     :param compute_mode: when set to "DEFAULT", no special requirements will be
@@ -75,3 +73,63 @@ def conv_bias_activation(
     )
     (outputs,) = apply(op, inp, weight, bias)
     return outputs
+
+
+def batch_conv_bias_activation(
+    inp: Tensor,
+    weight: Tensor,
+    bias: Tensor,
+    dtype=None,
+    stride: Union[int, Tuple[int, int]] = 1,
+    padding: Union[int, Tuple[int, int]] = 0,
+    dilation: Union[int, Tuple[int, int]] = 1,
+    groups: int = 1,
+    nonlinear_mode="IDENTITY",
+    conv_mode="CROSS_CORRELATION",
+    compute_mode="DEFAULT",
+) -> Tensor:
+    """
+    Batch convolution bias with activation operation, only for inference.
+
+    :param inp: feature map of the convolution operation.
+    :param weight: convolution kernel in batched way.
+    :param bias: bias added to the result of convolution
+    :param stride: stride of the 2D convolution operation. Default: 1
+    :param padding: size of the paddings added to the input on both sides of its spatial dimensions. Only zero-padding is supported. Default: 0
+    :param dilation: dilation of the 2D convolution operation. Default: 1
+    :param groups: number of groups into which the input and output channels are divided, so as to perform a "grouped convolution". When ``groups`` is not 1,
+        ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
+        and the shape of weight should be `(groups, out_channel // groups,
+        in_channels // groups, height, width)`.
+    :type conv_mode: string or :class:`P.Convolution.Mode`.
+    :param conv_mode: supports 'CROSS_CORRELATION' or 'CONVOLUTION'. Default:
+        'CROSS_CORRELATION'
+    :param dtype: support for ``np.dtype``, Default: np.int8
+    :type compute_mode: string or
+        :class:`P.Convolution.ComputeMode`.
+    :param compute_mode: when set to "DEFAULT", no special requirements will be
+        placed on the precision of intermediate results. When set to "FLOAT32",
+        "Float32" would be used for accumulator and intermediate result, but only effective when input and output are of Float16 dtype.
+
+    """
+    ph, pw = _pair(padding)
+    sh, sw = _pair_nonzero(stride)
+    dh, dw = _pair_nonzero(dilation)
+    sparse_type = "DENSE" if groups == 1 else "GROUP"
+    op = builtin.BatchConvBiasForward(
+        stride_h=sh,
+        stride_w=sw,
+        pad_h=ph,
+        pad_w=pw,
+        dilate_h=dh,
+        dilate_w=dw,
+        dtype=dtype,
+        format="NCHW",
+        strategy=get_conv_execution_strategy(),
+        nonlineMode=nonlinear_mode,
+        mode=conv_mode,
+        compute_mode=compute_mode,
+        sparse=sparse_type,
+    )
+    (outputs,) = apply(op, inp, weight, bias)
+    return outputs
diff --git a/imperative/python/megengine/module/__init__.py b/imperative/python/megengine/module/__init__.py
index 35fd875cdc6a6a61ff4e0178859648345d36d9bb..46493ee7bf738831b92c23dd36fa5b6d815c8d66 100644
--- a/imperative/python/megengine/module/__init__.py
+++ b/imperative/python/megengine/module/__init__.py
@@ -9,6 +9,7 @@
 from .activation import LeakyReLU, PReLU, ReLU, Sigmoid, Softmax
 from .adaptive_pooling import AdaptiveAvgPool2d, AdaptiveMaxPool2d
+from .batch_matmul_activation import BatchMatMulActivation
 from .batchnorm import BatchNorm1d, BatchNorm2d, SyncBatchNorm
 from .concat import Concat
 from .conv import Conv1d, Conv2d, ConvRelu2d, ConvTranspose2d, LocalConv2d
diff --git a/imperative/python/megengine/module/batch_matmul_activation.py b/imperative/python/megengine/module/batch_matmul_activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfb303bd92badba6a9e027fe1b4d21951a59d01b
--- /dev/null
+++ b/imperative/python/megengine/module/batch_matmul_activation.py
@@ -0,0 +1,67 @@
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import numpy as np
+
+from ..functional import matmul, relu
+from ..tensor import Parameter
+from . import init
+from .module import Module
+
+
+class BatchMatMulActivation(Module):
+    r"""
+    Batched MatMul with activation (only ReLU supported), no transpose anywhere.
+    """
+
+    def __init__(
+        self,
+        batch: int,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        nonlinear_mode="IDENTITY",
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.batch = batch
+        self.out_features = out_features
+        self.in_features = in_features
+        w_shape = (batch, out_features, in_features)
+        self.weight = Parameter(np.zeros(w_shape, dtype=np.float32))
+        self.bias = None
+        if bias:
+            b_shape = (out_features,)
+            self.bias = Parameter(np.zeros(b_shape, dtype=np.float32))
+        self.nonlinear_mode = nonlinear_mode
+        self.reset_parameters()
+
+    def _get_fanin(self):
+        return self.in_features
+
+    def reset_parameters(self) -> None:
+        fanin = self._get_fanin()
+        std = np.sqrt(1 / fanin)
+        init.normal_(self.weight, 0.0, std)
+        if self.bias is not None:
+            init.zeros_(self.bias)
+
+    def _calc_linear(self, x, weight, bias):
+        res = matmul(weight, x)
+        if self.bias is not None:
+            res += bias
+        if self.nonlinear_mode == "RELU":
+            res = relu(res)
+        return res
+
+    def forward(self, x):
+        return self._calc_linear(x, self.weight, self.bias)
+
+    def _module_info_string(self) -> str:
+        return "batch={}, in_features={}, out_features={}, bias={}".format(
+            self.batch, self.in_features, self.out_features, self.bias is not None
+        )
diff --git a/imperative/python/megengine/module/qat/__init__.py b/imperative/python/megengine/module/qat/__init__.py
index b6adab4dc687a322fba6dd5652bdf8975933ad3a..044cf61d63970517df17dbdc4315a1aaccbc17c9 100644
--- a/imperative/python/megengine/module/qat/__init__.py
+++ b/imperative/python/megengine/module/qat/__init__.py
@@ -5,6 +5,7 @@
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from .batch_matmul_activation import BatchMatMulActivation
 from .concat import Concat
 from .conv import Conv2d, ConvRelu2d
 from .conv_bn import ConvBn2d, ConvBnRelu2d
diff --git a/imperative/python/megengine/module/qat/batch_matmul_activation.py b/imperative/python/megengine/module/qat/batch_matmul_activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..774ea85a3095c00bfda0d93f7e11617cf452658b
--- /dev/null
+++ b/imperative/python/megengine/module/qat/batch_matmul_activation.py
@@ -0,0 +1,30 @@
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+from ...quantization.utils import fake_quant_bias
+from .. import batch_matmul_activation as Float
+from .module import QATModule
+
+
+class BatchMatMulActivation(Float.BatchMatMulActivation, QATModule):
+    def forward(self, inp):
+        w_qat = self.apply_quant_weight(self.weight)
+        b_qat = fake_quant_bias(self.bias, inp, w_qat)
+        return self.apply_quant_activation(self._calc_linear(inp, w_qat, b_qat))
+
+    @classmethod
+    def from_float_module(cls, float_module: Float.BatchMatMulActivation):
+        qat_module = cls(
+            float_module.batch,
+            float_module.in_features,
+            float_module.out_features,
+            float_module.bias is not None,
+        )
+        qat_module.weight = float_module.weight
+        qat_module.bias = float_module.bias
+        return qat_module
diff --git a/imperative/python/megengine/module/quantized/__init__.py b/imperative/python/megengine/module/quantized/__init__.py
index e641476d6a363a609660fb2495bf946e91b7b6c8..d7691eede74ca15b6ebe38b08f91d3bea91c085e 100644
--- a/imperative/python/megengine/module/quantized/__init__.py
+++ b/imperative/python/megengine/module/quantized/__init__.py
@@ -5,6 +5,7 @@
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from .batch_matmul_activation import BatchMatMulActivation
 from .concat import Concat
 from .conv import Conv2d, ConvRelu2d
 from .conv_bn import ConvBn2d, ConvBnRelu2d
diff --git a/imperative/python/megengine/module/quantized/batch_matmul_activation.py b/imperative/python/megengine/module/quantized/batch_matmul_activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cdb7463ead5272eedb68145e73d405aa2ce8325
--- /dev/null
+++ b/imperative/python/megengine/module/quantized/batch_matmul_activation.py
@@ -0,0 +1,76 @@
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from typing import Tuple, Union
+
+import numpy as np
+
+from ... import module as Float
+from ...core.tensor import dtype
+from ...functional import expand_dims, squeeze
+from ...functional.quantized import batch_conv_bias_activation
+from ...tensor import Parameter
+from ..qat import batch_matmul_activation as QAT
+from .module import QuantizedModule
+
+
+class BatchMatMulActivation(Float.BatchMatMulActivation, QuantizedModule):
+    def __init__(
+        self,
+        batch: int,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        nonlinear_mode="IDENTITY",
+        dtype=None,
+        **kwargs
+    ):
+        super().__init__(batch, in_features, out_features, bias, **kwargs)
+        self.output_dtype = dtype
+
+    def calc_bmm_quantized(self, inp):
+        inp_scale = dtype.get_scale(inp.dtype)
+        w_scale = dtype.get_scale(self.weight.dtype)
+        bias_scale = inp_scale * w_scale
+        inp = expand_dims(inp, [-1])
+        res = batch_conv_bias_activation(
+            inp,
+            self.weight,
+            self.bias.astype(dtype.qint32(bias_scale)),
+            dtype=self.output_dtype,
+            stride=1,
+            padding=0,
+            dilation=1,
+            groups=1,
+            nonlinear_mode=self.nonlinear_mode,
+        )
+        return squeeze(res, -1)
+
+    @classmethod
+    def from_qat_module(cls, qat_module: QAT.BatchMatMulActivation):
+        output_dtype = qat_module.get_activation_dtype()
+        qbmm = cls(
+            qat_module.batch,
+            qat_module.in_features,
+            qat_module.out_features,
+            qat_module.bias is not None,
+            dtype=output_dtype,
+        )
+        weight = qat_module.weight.astype(qat_module.get_weight_dtype())
+        weight = expand_dims(weight, [-1, -2])
+        qbmm.weight = Parameter(weight.numpy())
+        if qat_module.bias is not None:
+            bias = qat_module.bias.reshape((1, qbmm.out_features, 1, 1))
+            qbmm.bias = Parameter(bias.numpy())
+        else:
+            qbmm.bias = Parameter(
+                np.zeros((1, qbmm.out_features, 1, 1), dtype=np.float32)
+            )
+        return qbmm
+
+    def forward(self, inp):
+        return self.calc_bmm_quantized(inp)
diff --git a/imperative/python/test/unit/functional/test_functional.py b/imperative/python/test/unit/functional/test_functional.py
index 7d5488f91806f53af3d98307c8fbbf8049e66716..da894936a8ab14a88592cf7f372a14f01b42300f 100644
--- a/imperative/python/test/unit/functional/test_functional.py
+++ b/imperative/python/test/unit/functional/test_functional.py
@@ -20,6 +20,7 @@ from megengine import Parameter, Tensor, is_cuda_available, tensor
 from megengine.core._trace_option import use_symbolic_shape
 from megengine.core.autodiff.grad import Grad
 from megengine.core.tensor.utils import make_shape_tuple
+from megengine.distributed.helper import get_device_count_by_fork


 def test_where():
@@ -420,7 +421,9 @@ def test_nms():
     np.testing.assert_equal(result.numpy(), np.array([2, 1, 3], dtype=np.int32))


-@pytest.mark.skip(reason="cuda does not support nchw int8")
+@pytest.mark.skipif(
+    get_device_count_by_fork("gpu") > 0, reason="cuda does not support nchw int8"
+)
 def test_conv_bias():
     inp_scale = 1.5
     w_scale = 2.5
@@ -446,7 +449,7 @@ def test_conv_bias():
         nonlinear_mode="IDENTITY",
     ):
         inp_v = np.random.normal(size=(N, IC, IH, IW))
-        w_v = np.random.normal(size=(OC, IC, KW, KW))
+        w_v = np.random.normal(size=(OC, IC, KH, KW))
         b_v = np.random.normal(size=(1, OC, 1, 1))
         inp_scale = dtype.get_scale(inp_dtype)
         w_scale = dtype.get_scale(w_dtype)
@@ -486,13 +489,12 @@ def test_conv_bias():
                 inp = convert_to_nchw4(inp)
                 w = convert_to_nchw4(w)
                 b = convert_to_nchw4(b)
-            return F.nn.conv_bias_activation(
+            return F.quantized.conv_bias_activation(
                 inp,
                 w,
                 b,
                 stride=(SH, SW),
                 padding=(PH, PW),
-                format=format,
                 dtype=out_dtype,
                 nonlinear_mode=nonlinear_mode,
             )
@@ -522,6 +524,59 @@ def test_conv_bias():
     run(10, 36, 8, 46, 26, 2, 2, 2, 1, 1, 2, True, "RELU")
+
+
+@pytest.mark.skipif(
+    get_device_count_by_fork("gpu") > 0, reason="no int8 algorithm on cuda"
+)
+def test_batch_conv_bias():
+    inp_scale = 1.5
+    w_scale = 2.5
+    outp_scale = 1.5
+    inp_dtype = dtype.qint8(inp_scale)
+    w_dtype = dtype.qint8(w_scale)
+    b_dtype = dtype.qint32(inp_scale * w_scale)
+    out_dtype = dtype.qint8(outp_scale)
+
+    def run(
+        N, IC, OC, IH, IW, KH, KW, PH, PW, SH, SW, has_bias=True,
+    ):
+        inp_v = np.random.normal(size=(N, IC, IH, IW))
+        w_v = np.random.normal(size=(N, OC, IC, KH, KW))
+        b_v = np.random.normal(size=(1, OC, 1, 1))
+        inp_scale = dtype.get_scale(inp_dtype)
+        w_scale = dtype.get_scale(w_dtype)
+        b_scale = dtype.get_scale(b_dtype)
+
+        inpv = dtype.convert_to_qint8(inp_v * inp_scale, inp_dtype)
+        wv = dtype.convert_to_qint8(w_v * w_scale, w_dtype)
+        bv = dtype.convert_to_qint32(b_v * b_scale, b_dtype)
+
+        inp_int8 = tensor(inpv, dtype=inp_dtype)
+        w_int8 = Parameter(wv, dtype=w_dtype)
+        b_int32 = Parameter(bv, dtype=b_dtype)
+
+        inp_fp32 = inp_int8.astype("float32")
+        w_fp32 = w_int8.astype("float32")
+        b_fp32 = b_int32.astype("float32")
+
+        def run_batch_conv_bias(inp, w, b):
+            b = b if has_bias else Parameter(np.zeros_like(b.numpy()))
+            result = F.quantized.batch_conv_bias_activation(
+                inp, w, b, stride=(SH, SW), padding=(PH, PW), dtype=out_dtype,
+            )
+            return result.astype("float32")
+
+        expected = F.conv2d(inp_fp32, w_fp32[0], b_fp32 if has_bias else None)[0]
+        expected = expected.astype(out_dtype).astype("float32")
+        expected = F.flatten(expected)
+
+        result = run_batch_conv_bias(inp_int8, w_int8, b_int32)
+        result = F.flatten(result)
+
+        np.testing.assert_allclose(result.numpy(), expected.numpy(), atol=outp_scale)
+
+    run(1, 4, 4, 5, 5, 3, 3, 0, 0, 1, 1, True)


 def test_zero_stride_numpy_array():
     inp = np.random.randn(3, 224, 224).astype(np.float32)
     inp = inp[np.newaxis, :]
diff --git a/imperative/python/test/unit/module/test_qat.py b/imperative/python/test/unit/module/test_qat.py
index 34176a3a8e5b9b70e4c6fc37de8e72f9b548db9d..37fdc3c58165b4466105a047bfcd3bfaa41ba6e5 100644
--- a/imperative/python/test/unit/module/test_qat.py
+++ b/imperative/python/test/unit/module/test_qat.py
@@ -1,9 +1,15 @@
+import io
 from itertools import product

 import numpy as np
+import pytest

-from megengine import tensor
+import megengine.utils.comp_graph_tools as cgtools
+from megengine import jit, tensor
+from megengine.distributed.helper import get_device_count_by_fork
+from megengine.functional import expand_dims
 from megengine.module import (
+    BatchMatMulActivation,
     Conv2d,
     ConvBn2d,
     ConvRelu2d,
@@ -11,7 +17,12 @@ from megengine.module import (
     Module,
     QuantStub,
 )
-from megengine.quantization.quantize import disable_fake_quant, quantize_qat
+from megengine.quantization.quantize import (
+    disable_fake_quant,
+    enable_fake_quant,
+    quantize,
+    quantize_qat,
+)


 def test_qat_convbn2d():
@@ -88,3 +99,107 @@ def test_qat_conv():
     qat_net.eval()
     qat_outputs = qat_net(inputs)
     np.testing.assert_allclose(normal_outputs.numpy(), qat_outputs.numpy())
+
+
+@pytest.mark.skipif(
+    get_device_count_by_fork("gpu") > 0, reason="no int8 algorithm on cuda"
+)
+def test_qat_batchmatmul_activation():
+    batch = 4
+    in_features = 8
+    out_features = 4
+
+    class TestNet(Module):
+        def __init__(self, bias):
+            super().__init__()
+            self.quant = QuantStub()
+            self.dequant = DequantStub()
+            self.batch_mm = BatchMatMulActivation(
+                batch, in_features, out_features, bias=bias
+            )
+
+        def forward(self, inp):
+            out = self.quant(inp)
+            out = self.batch_mm(out)
+            out = self.dequant(out)
+            return out
+
+    inputs = tensor(
+        np.random.randn(batch, in_features, out_features).astype(np.float32)
+    )
+    for bias in (True, False):
+        net = TestNet(bias)
+        net.train()
+        qat_net = quantize_qat(net, inplace=False)
+        disable_fake_quant(qat_net)
+        normal_outputs = net(inputs)
+        qat_outputs = qat_net(inputs)
+        np.testing.assert_allclose(normal_outputs.numpy(), qat_outputs.numpy())
+
+        net.eval()
+        normal_outputs = net(inputs)
+        qat_net.eval()
+        qat_outputs = qat_net(inputs)
+        np.testing.assert_allclose(normal_outputs.numpy(), qat_outputs.numpy())
+
+
+@pytest.mark.skip(reason="FIXME: abnormal exit")
+def test_quantize_batchmatmul_activation():
+    batch = 4
+    in_features = 8
+    out_features = 4
+
+    class TestNet(Module):
+        def __init__(self, bias):
+            super().__init__()
+            self.quant = QuantStub()
+            self.dequant = DequantStub()
+            self.batch_mm = BatchMatMulActivation(
+                batch, in_features, out_features, bias=bias
+            )
+
+        def forward(self, inp):
+            out = self.quant(inp)
+            out = self.batch_mm(out)
+            out = expand_dims(out, -1)
+            out = self.dequant(out)
+            return out
+
+    inputs = tensor(
+        np.random.randn(batch, in_features, out_features).astype(np.float32)
+    )
+    for bias in (True, False):
+        net = TestNet(bias)
+        net.train()
+        qat_net = quantize_qat(net, inplace=False)
+        disable_fake_quant(qat_net)
+        normal_outputs = net(inputs)
+        qat_outputs = qat_net(inputs)
+        np.testing.assert_allclose(normal_outputs.numpy(), qat_outputs.numpy())
+
+        net.eval()
+        normal_outputs = net(inputs)
+        qat_net.eval()
+        qat_outputs = qat_net(inputs)
+        np.testing.assert_allclose(normal_outputs.numpy(), qat_outputs.numpy())
+
+        enable_fake_quant(qat_net)
+        qat_outputs = qat_net(inputs)
+        qnet = quantize(qat_net, inplace=False)
+        qnet.eval()
+        quantize_outputs = qnet(inputs)
+        np.testing.assert_allclose(
+            qat_outputs.numpy(), quantize_outputs.numpy(), atol=1e-6
+        )
+
+        @jit.trace(capture_as_const=True)
+        def f(x):
+            qnet.eval()
+            return qnet(x)
+
+        f(inputs)
+        file = io.BytesIO()
+        f.dump(file, enable_nchw4=True)
+        file.seek(0)
+        dumped_outputs = cgtools.load_and_inference(file, [inputs])[0]
+        np.testing.assert_allclose(quantize_outputs.numpy(), dumped_outputs, atol=1e-6)
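
For reference, a minimal usage sketch of the new module, mirroring the float-to-QAT flow exercised by test_qat_batchmatmul_activation above. The Demo wrapper class and all variable names are illustrative only; BatchMatMulActivation, QuantStub, DequantStub, and quantize_qat come from this diff and the existing MegEngine API.

import numpy as np

from megengine import tensor
from megengine.module import BatchMatMulActivation, DequantStub, Module, QuantStub
from megengine.quantization.quantize import quantize_qat

batch, in_features, out_features = 4, 8, 4


class Demo(Module):  # hypothetical wrapper, as in the tests above
    def __init__(self):
        super().__init__()
        self.quant = QuantStub()
        self.dequant = DequantStub()
        # weight has shape (batch, out_features, in_features); forward computes
        # matmul(weight, x) per batch entry, adds bias, and applies ReLU when
        # nonlinear_mode="RELU".
        self.batch_mm = BatchMatMulActivation(batch, in_features, out_features, bias=True)

    def forward(self, x):
        return self.dequant(self.batch_mm(self.quant(x)))


x = tensor(np.random.randn(batch, in_features, out_features).astype("float32"))
net = Demo()
net.train()
y_float = net(x)  # float path, output shape (batch, out_features, out_features)

# quantize_qat swaps in qat.BatchMatMulActivation via from_float_module; after
# calibration/training, quantize() converts it to the quantized module, which
# dispatches to functional.quantized.batch_conv_bias_activation.
qat_net = quantize_qat(net, inplace=False)
y_qat = qat_net(x)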