From 3eb0505f9b9db06160fc1f4d2d8724856da9d46b Mon Sep 17 00:00:00 2001
From: Megvii Engine Team <megengine@megvii.com>
Date: Fri, 23 Jul 2021 13:45:25 +0800
Subject: [PATCH] feat(imperative): add support for quantized conv transpose2d

GitOrigin-RevId: ffd6431299b2ae008fbdd1eed6458437e6b6a45f
---
 CMakeLists.txt                                |  8 --
 cmake/Halide.cmake                            |  1 +
 dnn/CMakeLists.txt                            |  2 -
 dnn/include/megdnn/config/config.h            |  1 -
 dnn/include/megdnn/handle.h                   |  1 -
 dnn/src/CMakeLists.txt                        |  4 -
 dnn/src/common/flag_warn.cpp                  |  1 -
 dnn/src/common/handle.cpp                     |  4 -
 .../megcore/common/computing_context.cpp      |  2 -
 dnn/src/common/warp_common.h                  |  1 -
 dnn/src/cuda/conv_bias/algo.cpp               |  1 -
 dnn/src/cuda/conv_bias/algo.h                 |  1 -
 dnn/src/fallback/convolution/img2col_helper.h |  1 -
 dnn/test/CMakeLists.txt                       |  4 -
 dnn/test/common/conv_bias.cpp                 |  4 -
 dnn/test/common/conv_bias.h                   |  1 -
 dnn/test/common/small_vector.cpp              |  1 -
 dnn/test/common/test_basic_types.cpp          |  2 -
 dnn/test/cuda/conv_bias_int8.cpp              |  2 -
 dnn/test/cuda/elemwise_multi_type.cpp         |  1 -
 dnn/test/cuda/sleep.cpp                       |  2 -
 dnn/test/fallback/conv_bias.cpp               |  2 -
 .../python/megengine/functional/quantized.py  | 49 ++++++++++
 imperative/python/megengine/module/conv.py    |  9 +-
 .../python/megengine/module/qat/__init__.py   |  2 +-
 .../python/megengine/module/qat/conv.py       | 39 ++++++++
 .../megengine/module/quantized/__init__.py    |  2 +-
 .../python/megengine/module/quantized/conv.py | 96 +++++++++++++++++++
 .../quantization/internal_fake_quant.py       |  2 -
 .../megengine/utils/persistent_cache.py       |  3 -
 imperative/python/setup.py                    |  4 -
 .../test/unit/functional/test_functional.py   |  2 -
 .../python/test/unit/module/test_qat.py       | 38 ++++++++
 .../test/unit/quantization/test_fake_quant.py |  2 -
 .../python/test/unit/quantization/test_op.py  | 92 ++++++++++++++++++
 .../test/unit/utils/test_network_node.py      |  4 -
 imperative/python/version_template.py         |  1 -
 imperative/src/impl/ops/convolution.cpp       |  5 +
 scripts/whl/macos/macos_build_whl.sh          |  4 -
 scripts/whl/manylinux2014/do_build_common.sh  |  4 -
 scripts/whl/windows/windows_build_whl.sh      |  7 --
 src/core/impl/comp_node_env.cpp               |  8 --
 src/core/impl/exception.cpp                   |  3 -
 src/core/impl/graph/cg_impl.cpp               |  2 -
 src/core/impl/graph/var_node_mem_mgr.h        |  1 -
 src/core/include/megbrain/comp_node.h         |  1 -
 src/core/include/megbrain/comp_node_env.h     | 12 ---
 src/core/include/megbrain/exception.h         |  4 -
 src/core/include/megbrain/ir/ops.td           |  6 +-
 src/core/test/comp_node.cpp                   |  7 --
 src/core/test/mem_alloc.cpp                   |  2 -
 src/core/test/tensor.cpp                      |  1 -
 src/gopt/test/inference.cpp                   |  2 -
 src/megbrain_build_config.h.in                |  3 -
 src/opr/impl/basic_arith.sereg.h              |  1 -
 src/opr/impl/blas.sereg.h                     |  1 -
 src/opr/impl/dnn/dnn.sereg.h                  |  1 -
 src/opr/impl/imgproc.sereg.h                  |  1 -
 src/opr/impl/misc.sereg.h                     |  1 -
 src/opr/impl/rand.sereg.h                     |  1 -
 src/opr/impl/tensor_manip.sereg.h             |  1 -
 src/opr/test/dnn/convolution.cpp              |  1 -
 src/serialization/impl/serializer.cpp         |  1 -
 src/serialization/test/extern_c_opr.cpp       |  1 -
 test/CMakeLists.txt                           |  1 -
 tools/param_defs/mgb_opr_param_defs.py        |  1 -
 66 files changed, 333 insertions(+), 143 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3af836e38..3429e3ee5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,7 +67,6 @@ option(MGE_WITH_ROCM "Enable ROCM support" OFF)
 option(MGE_WITH_LARGE_ARCHIVE "Enable big archive link support" OFF)
 option(MGE_BUILD_WITH_ASAN "Enable build with ASAN, need compiler support" OFF)
 
-
 if(MSVC OR WIN32)
     message(STATUS "windows force cudnn static link")
     set(MGE_WITH_CUDNN_SHARED OFF)
@@ -332,7 +331,6 @@ set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${MGE_COMMON_LINKER_
 set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}")
 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}")
 
-
 if(NOT MGE_WITH_JIT)
     if(MGE_WITH_HALIDE)
         message(WARNING "MGE_WITH_HALIDE is set to OFF with MGE_WITH_JIT disabled")
@@ -728,7 +726,6 @@ if (MGE_WITH_ROCM)
     include(cmake/rocm.cmake)
 endif ()
 
-
 if(MGE_WITH_ATLAS)
     add_subdirectory(dnn/atlas-stub)
     list(APPEND MGE_ATLAS_LIBS atlas-stub)
@@ -736,7 +733,6 @@ if(MGE_WITH_ATLAS)
     set(MGB_ATLAS ${MGE_WITH_ATLAS})
 endif()
 
-
 find_program(CCACHE_BIN ccache)
 if(CCACHE_BIN)
     set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_BIN})
@@ -834,12 +830,10 @@ endif()
 set(MGB_CUDA ${MGE_WITH_CUDA})
 set(MEGDNN_WITH_CUDA ${MGE_WITH_CUDA})
 
-
 #ROCM
 set(MGB_ROCM ${MGE_WITH_ROCM})
 set(MEGDNN_WITH_ROCM ${MGE_WITH_ROCM})
 
-
 # CAMBRICON
 set(MGB_CAMBRICON ${MGE_WITH_CAMBRICON})
 set(MEGDNN_WITH_CAMBRICON ${MGE_WITH_CAMBRICON})
@@ -1029,7 +1023,6 @@ if(MGE_BUILD_SDK)
     add_subdirectory(sdk/load-and-run)
 endif()
 
-
 if(MGE_BUILD_IMPERATIVE_RT)
     add_subdirectory(imperative)
     message(STATUS "Enable imperative python wrapper runtime")
@@ -1117,4 +1110,3 @@ if(MGE_WITH_CUDA AND MGE_CUDA_USE_STATIC AND("${CUDNN_VERSION}" VERSION_GREATER
     message(WARNING "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ")
     message(WARNING "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ")
 endif()
-
diff --git a/cmake/Halide.cmake b/cmake/Halide.cmake
index 4b145daf9..2dc8ecab9 100644
--- a/cmake/Halide.cmake
+++ b/cmake/Halide.cmake
@@ -1,3 +1,4 @@
+
 include(ExternalProject)
 find_package(LLVM 6.0 REQUIRED CONFIG)
 
diff --git a/dnn/CMakeLists.txt b/dnn/CMakeLists.txt
index 939006d26..6270da986 100644
--- a/dnn/CMakeLists.txt
+++ b/dnn/CMakeLists.txt
@@ -38,7 +38,6 @@ list(APPEND OPR_PARAM_DEFS_OUTS
 )
 list(APPEND OPR_PARAM_DEFS_INC ${OPR_PARAM_DEFS_OUT_DIR})
 
-
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/megdnn DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} FILES_MATCHING PATTERN "*.h")
 
 add_custom_target(_opr_param_defs DEPENDS ${OPR_PARAM_DEFS_OUTS})
@@ -56,7 +55,6 @@ endforeach()
 add_dependencies(opr_param_defs _opr_param_defs)
 install(TARGETS opr_param_defs EXPORT ${MGE_EXPORT_TARGETS})
 
-
 if(MGE_WITH_CUDA)
     add_library(cutlass INTERFACE)
     target_include_directories(cutlass 
diff --git a/dnn/include/megdnn/config/config.h b/dnn/include/megdnn/config/config.h
index 619f786a6..4d7ec4074 100644
--- a/dnn/include/megdnn/config/config.h
+++ b/dnn/include/megdnn/config/config.h
@@ -13,7 +13,6 @@
 
 #if !defined(__CUDACC__) && !defined(__HIPCC__)
 
-
 #endif  // !defined(__CUDACC__)
 
 // vim: syntax=cpp.doxygen
diff --git a/dnn/include/megdnn/handle.h b/dnn/include/megdnn/handle.h
index c72bc120d..f938d1299 100644
--- a/dnn/include/megdnn/handle.h
+++ b/dnn/include/megdnn/handle.h
@@ -90,7 +90,6 @@ class Handle {
         std::unique_ptr<opr> create_rocm_operator();
 #endif
 
-
         virtual ~Handle();
 
         /*!
diff --git a/dnn/src/CMakeLists.txt b/dnn/src/CMakeLists.txt
index fb1d59c90..ddd7f3549 100644
--- a/dnn/src/CMakeLists.txt
+++ b/dnn/src/CMakeLists.txt
@@ -137,11 +137,9 @@ if(MGE_WITH_CUDA)
     gen_cutlass_kimpl(conv2d tensorop8832)
     file(GLOB_RECURSE CUTLASS_SOURCES ${CUTLASS_GEN_DIR}/*.cu)
     list(APPEND SOURCES ${CUTLASS_SOURCES})
-
     list(APPEND SOURCES ${CUSOURCES})
 endif()
 
-
 if(MGE_WITH_CAMBRICON)
     file(GLOB_RECURSE SOURCES_ cambricon/*.cpp)
     list(APPEND SOURCES ${SOURCES_})
@@ -161,7 +159,6 @@ if(MGE_WITH_ATLAS)
     list(APPEND LIBMEGDNN_DEF -DMEGDNN_WITH_ATLAS=1)
 endif()
 
-
 add_definitions(${LIBMEGDNN_DEF})
 add_library(megdnn EXCLUDE_FROM_ALL OBJECT ${SOURCES})
 
@@ -186,7 +183,6 @@ if(MGE_WITH_ROCM)
         ${AMDOCL_LIBRARY_DIR})
 endif()
 
-
 if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386" OR ${MGE_ARCH} STREQUAL "armv7" OR ${MGE_ARCH} STREQUAL "aarch64")
     if(MGE_ENABLE_CPUINFO)
         target_link_libraries(megdnn PRIVATE $<BUILD_INTERFACE:cpuinfo>)
diff --git a/dnn/src/common/flag_warn.cpp b/dnn/src/common/flag_warn.cpp
index 01a8745ea..a19d9ff73 100644
--- a/dnn/src/common/flag_warn.cpp
+++ b/dnn/src/common/flag_warn.cpp
@@ -15,5 +15,4 @@
  #pragma message "Mangling is disabled."
 #endif // MEGDNN_ENABLE_MANGLING
 
-
 // vim: syntax=cpp.doxygen
diff --git a/dnn/src/common/handle.cpp b/dnn/src/common/handle.cpp
index ea9208b43..75b282d8d 100644
--- a/dnn/src/common/handle.cpp
+++ b/dnn/src/common/handle.cpp
@@ -31,13 +31,10 @@
 #include "src/aarch64/handle.h"
 #endif
 
-
-
 #if MEGDNN_WITH_CUDA
 #include "src/cuda/handle.h"
 #endif
 
-
 #if MEGDNN_WITH_CAMBRICON
 #include "src/cambricon/handle.h"
 #endif
@@ -128,7 +125,6 @@ std::unique_ptr<Handle> Handle::make(megcoreComputingHandle_t computing_handle,
     return nullptr;
 }
 
-
 void Handle::set_destructor(const thin_function<void()>& d) {
     megdnn_assert(!m_destructor, "destructor can be set only once");
     m_destructor = d;
diff --git a/dnn/src/common/megcore/common/computing_context.cpp b/dnn/src/common/megcore/common/computing_context.cpp
index df191d75b..35d118fd1 100644
--- a/dnn/src/common/megcore/common/computing_context.cpp
+++ b/dnn/src/common/megcore/common/computing_context.cpp
@@ -17,8 +17,6 @@
 #include "src/cuda/megcore/cuda_computing_context.hpp"
 #endif
 
-
-
 #if MEGDNN_WITH_ROCM
 #include "src/rocm/megcore/computing_context.hpp"
 #endif
diff --git a/dnn/src/common/warp_common.h b/dnn/src/common/warp_common.h
index 6db4642b4..02e0298ce 100644
--- a/dnn/src/common/warp_common.h
+++ b/dnn/src/common/warp_common.h
@@ -880,7 +880,6 @@ void remap(const Mat<T>& src, Mat<T>& dst, Mat<short>& map1, Mat<ushort>& map2,
                     for (; x1 <= bcols - 8; x1 += 8)
                         vst1q_u16(A + x1,
                                   vandq_u16(vld1q_u16(sA + x1), v_scale));
-
 #endif
                     for (; x1 < bcols; ++x1)
                         A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2 - 1));
diff --git a/dnn/src/cuda/conv_bias/algo.cpp b/dnn/src/cuda/conv_bias/algo.cpp
index fd290e08a..80f2846ec 100644
--- a/dnn/src/cuda/conv_bias/algo.cpp
+++ b/dnn/src/cuda/conv_bias/algo.cpp
@@ -287,7 +287,6 @@ void ConvBiasForwardImpl::AlgoPack::fill_dp4a_algos() {
     int8_nchw4_dotprod.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8, 2});
 }
 
-
 ConvBiasForwardImpl::AlgoBase*
 ConvBiasForwardImpl::AlgoPack::cudnn_conv_from_enum(
         cudnnConvolutionFwdAlgo_t algo) {
diff --git a/dnn/src/cuda/conv_bias/algo.h b/dnn/src/cuda/conv_bias/algo.h
index 7349ae1e4..e892ff8cd 100644
--- a/dnn/src/cuda/conv_bias/algo.h
+++ b/dnn/src/cuda/conv_bias/algo.h
@@ -1037,7 +1037,6 @@ private:
     WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
 };
 
-
 class ConvBiasForwardImpl::AlgoPack : NonCopyableObj {
 private:
     AlgoBase::Mapper m_all_algos_map;
diff --git a/dnn/src/fallback/convolution/img2col_helper.h b/dnn/src/fallback/convolution/img2col_helper.h
index 9317c5eb6..00379fa4e 100644
--- a/dnn/src/fallback/convolution/img2col_helper.h
+++ b/dnn/src/fallback/convolution/img2col_helper.h
@@ -10,7 +10,6 @@
  */
 #include "src/common/utils.h"
 
-
 namespace {
 
 template <bool is_xcorr, typename dtype>
diff --git a/dnn/test/CMakeLists.txt b/dnn/test/CMakeLists.txt
index 0cc8fc7c4..298acef5d 100644
--- a/dnn/test/CMakeLists.txt
+++ b/dnn/test/CMakeLists.txt
@@ -34,7 +34,6 @@ if(MGE_WITH_CAMBRICON)
     list(APPEND SOURCES ${SOURCES_})
 endif()
 
-
 if(MGE_WITH_ATLAS)
     file(GLOB_RECURSE SOURCES_ atlas/*.cpp)
     list(APPEND SOURCES ${SOURCES_})
@@ -45,8 +44,6 @@ if (MGE_WITH_ROCM)
     list (APPEND SOURCES ${SOURCES_})
 endif()
 
-
-
 add_executable(megdnn_test ${SOURCES})
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
 target_link_libraries(megdnn_test gtest)
@@ -60,7 +57,6 @@ if(MGE_WITH_ATLAS)
     target_link_libraries(megdnn_test atlas-stub)
 endif()
 
-
 target_include_directories(megdnn_test
     PRIVATE
         ${PROJECT_SOURCE_DIR}/third_party/midout/src
diff --git a/dnn/test/common/conv_bias.cpp b/dnn/test/common/conv_bias.cpp
index 9ffe92b3a..fe2022358 100644
--- a/dnn/test/common/conv_bias.cpp
+++ b/dnn/test/common/conv_bias.cpp
@@ -494,7 +494,6 @@ std::vector<TestArg> get_int8_nchw44_args(size_t kernel_size, size_t pack_size,
     return args;
 }
 
-
 std::vector<TestArg> get_int8_nchw4_args_check_bounds(size_t kernel_size) {
     std::vector<TestArg> args;
     param::ConvBias cur_param;
@@ -530,7 +529,6 @@ std::vector<TestArg> get_int8_nchw4_args_check_bounds(size_t kernel_size) {
     return args;
 }
 
-
 std::vector<TestArg> get_int8_nchw4_args_small_batch(size_t kernel_size) {
     std::vector<TestArg> args;
     param::ConvBias cur_param;
@@ -974,7 +972,6 @@ void benchmark_winograd(const char* algo_name, Handle* handle, size_t kernel,
 }
 #endif  // MEGDNN_WITH_BENCHMARK
 
-
 std::vector<conv_bias::TestArg> get_conv_bias_args(
         std::vector<size_t> kernel, size_t stride, bool no_pad, bool no_bias,
         bool no_nonlinemode, bool quantized_nlmod, bool only_broadcast_bias) {
@@ -1188,7 +1185,6 @@ void check_conv_bias_preprocess(std::vector<conv_bias::TestArg> args,
     }
 }
 
-
 void checker_conv_bias_common(std::vector<conv_bias::TestArg> args, Handle* handle,
                        RNG* rng, float epsilon, DType type0, DType type1,
                        DType type2, DType type3, const char* algo_name) {
diff --git a/dnn/test/common/conv_bias.h b/dnn/test/common/conv_bias.h
index 573efe64c..fbc9edf41 100644
--- a/dnn/test/common/conv_bias.h
+++ b/dnn/test/common/conv_bias.h
@@ -93,7 +93,6 @@ void check_conv_bias(std::vector<megdnn::test::conv_bias::TestArg> args,
 void checker_conv_bias_int8x8x16(
         std::vector<megdnn::test::conv_bias::TestArg> args,
         megdnn::Handle* handle, const char* algo_name);
-
 void checker_conv_bias_common(std::vector<conv_bias::TestArg> args,
                               Handle* handle, RNG* rng, float epsilon,
                               DType type0, DType type1, DType type2,
diff --git a/dnn/test/common/small_vector.cpp b/dnn/test/common/small_vector.cpp
index 667d260ea..3a88aa186 100644
--- a/dnn/test/common/small_vector.cpp
+++ b/dnn/test/common/small_vector.cpp
@@ -1145,7 +1145,6 @@ TEST(SmallVectorTest, SwapMoveOnly) {
         }
     }
 }
-
 }  // anonymous namespace
 
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/dnn/test/common/test_basic_types.cpp b/dnn/test/common/test_basic_types.cpp
index c377fde50..e88768b0c 100644
--- a/dnn/test/common/test_basic_types.cpp
+++ b/dnn/test/common/test_basic_types.cpp
@@ -40,7 +40,6 @@ TensorLayout make_layout(std::initializer_list<size_t> shape,
 }
 
 } // anonymous namespace
-
 #if MEGDNN_64_BIT
 TEST(BASIC_TYPES, TOTAL_NR_ELEMS) {
     TensorShape shp{1u<<31, 1u<<31};
@@ -340,5 +339,4 @@ TEST(BASIC_TYPES, TENSOR_LAYOUT_FMT_LOW_BITS_VALID) {
                               LowbitsAlignedToBytesTensorFormat::make(4_z)),
                  MegDNNError);
 }
-
 // vim: syntax=cpp.doxygen
diff --git a/dnn/test/cuda/conv_bias_int8.cpp b/dnn/test/cuda/conv_bias_int8.cpp
index b4f3ebaed..3c593dcc6 100644
--- a/dnn/test/cuda/conv_bias_int8.cpp
+++ b/dnn/test/cuda/conv_bias_int8.cpp
@@ -697,7 +697,6 @@ TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) {
             conv_bias::get_int8_chwn4_args_small_batch(1));
 }
 
-
 TEST_F(CUDA, FALLBACK_CONV_QS8) {
     require_compute_capability_eq(7, 5);
     Checker<ConvBiasForward> checker(handle_cuda());
@@ -1100,7 +1099,6 @@ TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_NCHW) {
     run({{16, 16, 46, 80, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
 }
 
-
 #if CUDA_VERSION >= 10020
 TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
     require_compute_capability(7, 5);
diff --git a/dnn/test/cuda/elemwise_multi_type.cpp b/dnn/test/cuda/elemwise_multi_type.cpp
index b57037b56..ca771a5ac 100644
--- a/dnn/test/cuda/elemwise_multi_type.cpp
+++ b/dnn/test/cuda/elemwise_multi_type.cpp
@@ -32,7 +32,6 @@ TYPED_TEST(CUDA_ELEMWISE_MULTI_TYPE, run) {
     elemwise_multi_type::run_test<TypeParam>(this->handle_cuda());
 }
 
-
 using Mode = ElemwiseMultiType::Param::Mode;
 static void run_test(int arity, Checker<ElemwiseMultiType>& checker, Mode mode) {
     for (auto type : std::vector<std::pair<DType, DType>>{
diff --git a/dnn/test/cuda/sleep.cpp b/dnn/test/cuda/sleep.cpp
index c53073730..c32cb5964 100644
--- a/dnn/test/cuda/sleep.cpp
+++ b/dnn/test/cuda/sleep.cpp
@@ -22,7 +22,6 @@
 using namespace megdnn;
 using namespace test;
 
-
 TEST_F(CUDA, SLEEP) {
     auto opr = this->handle_cuda()->create_operator<megdnn::SleepForward>();
 
@@ -53,6 +52,5 @@ TEST_F(CUDA, SLEEP) {
 }
 
 
-
 // vim: syntax=cpp.doxygen
 
diff --git a/dnn/test/fallback/conv_bias.cpp b/dnn/test/fallback/conv_bias.cpp
index ee0226528..78a19cb8a 100644
--- a/dnn/test/fallback/conv_bias.cpp
+++ b/dnn/test/fallback/conv_bias.cpp
@@ -75,7 +75,6 @@ TEST_F(FALLBACK, CONV_BIAS_FORWARD) {
                 .execs({src_shape, filter_shape, bias_shape, {}, {}})
                 .execs({src_shape, filter_shape, bias_shape_channel, {}, {}});
     }
-
 }
 
 std::vector<conv_bias::TestArg> get_conv_bias_args(
@@ -236,7 +235,6 @@ TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD_QUANTIZED) {
                       "FALLBACK_NAIVE");
 }
 
-
 #if MEGDNN_WITH_BENCHMARK
 TEST_F(FALLBACK, BENCHMARK_CONVBIAS) {
     constexpr size_t RUNS = 10;
diff --git a/imperative/python/megengine/functional/quantized.py b/imperative/python/megengine/functional/quantized.py
index 17a45d04d..16975c7a6 100644
--- a/imperative/python/megengine/functional/quantized.py
+++ b/imperative/python/megengine/functional/quantized.py
@@ -139,3 +139,52 @@ def batch_conv_bias_activation(
     )
     (outputs,) = apply(op, inp, weight, bias)
     return outputs
+
+
+def conv_transpose2d(
+    inp: Tensor,
+    weight: Tensor,
+    bias: Tensor = None,
+    dtype=None,
+    stride: Union[int, Tuple[int, int]] = 1,
+    padding: Union[int, Tuple[int, int]] = 0,
+    dilation: Union[int, Tuple[int, int]] = 1,
+    groups: int = 1,
+    conv_mode="cross_correlation",
+    compute_mode="default",
+) -> Tensor:
+
+    assert (
+        conv_mode.lower() == "cross_correlation"
+        or conv_mode.name == "CROSS_CORRELATION"
+    )
+    assert compute_mode.lower() == "default" or compute_mode.name == "DEFAULT"
+
+    if groups != 1:
+        raise NotImplementedError(
+            "group quantized transposed conv2d is not supported yet."
+        )
+    if bias is not None:
+        raise NotImplementedError(
+            "bias of quantized transposed conv2d is not supported yet."
+        )
+
+    pad_h, pad_w = _pair(padding)
+    stride_h, stride_w = _pair_nonzero(stride)
+    dilate_h, dilate_w = _pair_nonzero(dilation)
+
+    # should be replaced by Op with bias such as ConvolutionBackwardDataBias
+    op = builtin.ConvolutionBackwardData(
+        stride_h=stride_h,
+        stride_w=stride_w,
+        pad_h=pad_h,
+        pad_w=pad_w,
+        dilate_h=dilate_h,
+        dilate_w=dilate_w,
+        strategy=get_execution_strategy(),
+        dtype=dtype,
+        compute_mode=compute_mode,
+        mode=conv_mode,
+    )
+    (output,) = apply(op, weight, inp)
+    return output
diff --git a/imperative/python/megengine/module/conv.py b/imperative/python/megengine/module/conv.py
index 861c92698..183356db3 100644
--- a/imperative/python/megengine/module/conv.py
+++ b/imperative/python/megengine/module/conv.py
@@ -651,11 +651,11 @@ class ConvTranspose2d(_ConvNd):
         # Assume format is NCHW
         return (1, self.out_channels, 1, 1)
 
-    def forward(self, inp):
+    def calc_conv_transpose2d(self, inp, weight, bias):
         return conv_transpose2d(
             inp,
-            self.weight,
-            self.bias,
+            weight,
+            bias,
             self.stride,
             self.padding,
             self.dilation,
@@ -664,6 +664,9 @@ class ConvTranspose2d(_ConvNd):
             self.compute_mode,
         )
 
+    def forward(self, inp):
+        return self.calc_conv_transpose2d(inp, self.weight, self.bias)
+
 
 class LocalConv2d(Conv2d):
     r"""
diff --git a/imperative/python/megengine/module/qat/__init__.py b/imperative/python/megengine/module/qat/__init__.py
index 9a9295f9f..86ff72352 100644
--- a/imperative/python/megengine/module/qat/__init__.py
+++ b/imperative/python/megengine/module/qat/__init__.py
@@ -7,7 +7,7 @@
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 from .batch_matmul_activation import BatchMatMulActivation
 from .concat import Concat
-from .conv import Conv2d, ConvRelu2d
+from .conv import Conv2d, ConvRelu2d, ConvTranspose2d
 from .conv_bn import ConvBn2d, ConvBnRelu2d
 from .elemwise import Elemwise
 from .linear import Linear
diff --git a/imperative/python/megengine/module/qat/conv.py b/imperative/python/megengine/module/qat/conv.py
index c3608d591..c8465f8c7 100644
--- a/imperative/python/megengine/module/qat/conv.py
+++ b/imperative/python/megengine/module/qat/conv.py
@@ -57,3 +57,42 @@ class ConvRelu2d(Conv2d):
 
     def forward(self, inp):
         return self.apply_quant_activation(F.relu(self.calc_conv_qat(inp)))
+
+
+class ConvTranspose2d(Float.ConvTranspose2d, QATModule):
+    r"""
+    A :class:`~.QATModule` :class:`~.module.ConvTranspose2d` with QAT support.
+    Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`.
+    """
+
+    def calc_conv_transpose2d_qat(self, inp):
+        w_qat = self.apply_quant_weight(self.weight)
+        b_qat = self.apply_quant_bias(self.bias, inp, w_qat)
+        conv = self.calc_conv_transpose2d(inp, w_qat, b_qat)
+        return conv
+
+    @classmethod
+    def from_float_module(cls, float_module: Float.ConvTranspose2d):
+        r"""
+        Return a :class:`~.QATModule` instance converted from
+        a float :class:`~.Module` instance.
+        """
+        qat_module = cls(
+            float_module.in_channels,
+            float_module.out_channels,
+            float_module.kernel_size,
+            float_module.stride,
+            float_module.padding,
+            float_module.dilation,
+            float_module.groups,
+            float_module.bias is not None,
+            float_module.conv_mode,
+            float_module.compute_mode,
+            name=float_module.name,
+        )
+        qat_module.weight = float_module.weight
+        qat_module.bias = float_module.bias
+        return qat_module
+
+    def forward(self, inp):
+        return self.apply_quant_activation(self.calc_conv_transpose2d_qat(inp))
diff --git a/imperative/python/megengine/module/quantized/__init__.py b/imperative/python/megengine/module/quantized/__init__.py
index 5c21359e0..ed9866f1e 100644
--- a/imperative/python/megengine/module/quantized/__init__.py
+++ b/imperative/python/megengine/module/quantized/__init__.py
@@ -7,7 +7,7 @@
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 from .batch_matmul_activation import BatchMatMulActivation
 from .concat import Concat
-from .conv import Conv2d, ConvRelu2d
+from .conv import Conv2d, ConvRelu2d, ConvTranspose2d
 from .conv_bn import ConvBn2d, ConvBnRelu2d
 from .elemwise import Elemwise
 from .linear import Linear
diff --git a/imperative/python/megengine/module/quantized/conv.py b/imperative/python/megengine/module/quantized/conv.py
index 0230dda5d..4d7a5b5f6 100644
--- a/imperative/python/megengine/module/quantized/conv.py
+++ b/imperative/python/megengine/module/quantized/conv.py
@@ -12,6 +12,7 @@ import numpy as np
 from ... import module as Float
 from ...core.tensor import dtype
 from ...functional.nn import conv_bias_activation
+from ...functional.quantized import conv_transpose2d
 from ...tensor import Parameter
 from ..qat import conv as QAT
 from .module import QuantizedModule
@@ -108,3 +109,98 @@ class ConvRelu2d(Conv2d):
 
     def forward(self, inp):
         return self.calc_conv_quantized(inp, nonlinear_mode="relu")
+
+
+class ConvTranspose2d(Float.ConvTranspose2d, QuantizedModule):
+    r"""Quantized version of :class:`~.qat.ConvTranspose2d`.
+
+    Applies a 2D transposed convolution over a quantized input tensor, used
+    for inference only.
+
+    The parameter is same with :class:`~.module.ConvTranspose2d` but dtype.
+
+    :param dtype: data type of the output, should be qint8.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: Union[int, Tuple[int, int]],
+        stride: Union[int, Tuple[int, int]] = 1,
+        padding: Union[int, Tuple[int, int]] = 0,
+        dilation: Union[int, Tuple[int, int]] = 1,
+        groups: int = 1,
+        bias: bool = True,
+        conv_mode: str = "cross_correlation",
+        compute_mode: str = "default",
+        dtype=None,
+        **kwargs
+    ):
+        super().__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            conv_mode=conv_mode,
+            compute_mode=compute_mode,
+        )
+        self.output_dtype = dtype
+
+    @classmethod
+    def from_qat_module(cls, qat_module: QAT.ConvTranspose2d):
+        r"""
+        return a :class:`~.QuantizedModule` instance converted from a
+        :class:`~.QATModule` instance.
+        """
+        output_dtype = qat_module.get_activation_dtype()
+        qconv = cls(
+            qat_module.in_channels,
+            qat_module.out_channels,
+            qat_module.kernel_size,
+            qat_module.stride,
+            qat_module.padding,
+            qat_module.dilation,
+            qat_module.groups,
+            qat_module.bias is not None,
+            qat_module.conv_mode,
+            qat_module.compute_mode,
+            dtype=output_dtype,
+            name=qat_module.name,
+        )
+        weight = qat_module.weight.astype(qat_module.get_weight_dtype())
+        qconv.weight = Parameter(weight.numpy(), name=qat_module.weight.name)
+        qconv.bias = (
+            Parameter(qat_module.bias.numpy(), name=qat_module.bias.name)
+            if qat_module.bias is not None
+            else None
+        )
+        return qconv
+
+    def calc_conv_transpose2d_quantized(self, inp):
+        if self.bias is not None:
+            inp_scale = dtype.get_scale(inp.dtype)
+            w_scale = dtype.get_scale(self.weight.dtype)
+            bias_scale = inp_scale * w_scale
+
+        return conv_transpose2d(
+            inp=inp,
+            weight=self.weight,
+            bias=self.bias.astype(dtype.qint32(bias_scale))
+            if self.bias is not None
+            else None,
+            dtype=self.output_dtype,
+            stride=self.stride,
+            padding=self.padding,
+            dilation=self.dilation,
+            groups=self.groups,
+            conv_mode=self.conv_mode,
+            compute_mode=self.compute_mode,
+        )
+
+    def forward(self, inp):
+        return self.calc_conv_transpose2d_quantized(inp)
diff --git a/imperative/python/megengine/quantization/internal_fake_quant.py b/imperative/python/megengine/quantization/internal_fake_quant.py
index 7972c6225..afec8fe92 100644
--- a/imperative/python/megengine/quantization/internal_fake_quant.py
+++ b/imperative/python/megengine/quantization/internal_fake_quant.py
@@ -13,5 +13,3 @@ from .fake_quant import _FakeQuantize
 from .observer import MinMaxObserver
 from .qconfig import QConfig
 from .utils import QParams
-
-
diff --git a/imperative/python/megengine/utils/persistent_cache.py b/imperative/python/megengine/utils/persistent_cache.py
index eda132c2a..b0bd351d4 100644
--- a/imperative/python/megengine/utils/persistent_cache.py
+++ b/imperative/python/megengine/utils/persistent_cache.py
@@ -69,7 +69,6 @@ class PersistentCacheOnServer(_PersistentCache):
     def make_user_prefix(cls):
         return "mgbcache:{}".format(getpass.getuser())
 
-
     def _make_key(self, category, key):
         prefix_with_version = "{}:MGB{}".format(self._prefix, __version__)
         return b"@".join(
@@ -86,5 +85,3 @@ class PersistentCacheOnServer(_PersistentCache):
         key = self._make_key(category, key)
         self._prev_get_refkeep = conn.get(key)
         return self._prev_get_refkeep
-
-
diff --git a/imperative/python/setup.py b/imperative/python/setup.py
index 21c5365de..7441c5abe 100644
--- a/imperative/python/setup.py
+++ b/imperative/python/setup.py
@@ -38,7 +38,6 @@ class build_ext(_build_ext):
             modpath = str(pathlib.Path(*modpath).resolve())
 
             copy_file(modpath, fullpath, verbose=self.verbose, dry_run=self.dry_run)
-
 package_name = 'MegEngine'
 
 v = {}
@@ -79,7 +78,6 @@ megengine_data += [
     for f in pathlib.Path('megengine', 'core', 'lib').glob('**/*')
 ]
 
-
 with open('requires.txt') as f:
     requires = f.read().splitlines()
 with open('requires-style.txt') as f:
@@ -108,8 +106,6 @@ setup_kwargs = dict(
     cmdclass={'build_ext': build_ext},
     scripts = ['./megengine/tools/mge'],
 )
-
-
 setup_kwargs.update(dict(
     classifiers=[
     'Development Status :: 3 - Alpha',
diff --git a/imperative/python/test/unit/functional/test_functional.py b/imperative/python/test/unit/functional/test_functional.py
index 612f6612f..36f186977 100644
--- a/imperative/python/test/unit/functional/test_functional.py
+++ b/imperative/python/test/unit/functional/test_functional.py
@@ -876,8 +876,6 @@ def test_nms_is_same():
     assert op3 != op4
 
 
-
-
 def test_argmxx_on_inf():
     def run_argmax():
         x = F.zeros((100, 100))
diff --git a/imperative/python/test/unit/module/test_qat.py b/imperative/python/test/unit/module/test_qat.py
index 51b5206e5..2cb3f5a87 100644
--- a/imperative/python/test/unit/module/test_qat.py
+++ b/imperative/python/test/unit/module/test_qat.py
@@ -13,6 +13,7 @@ from megengine.module import (
     Conv2d,
     ConvBn2d,
     ConvRelu2d,
+    ConvTranspose2d,
     DequantStub,
     Module,
     QuantStub,
@@ -202,3 +203,40 @@ def test_quantize_batchmatmul_activation():
         infer_cg = cgtools.GraphInference(file)[0]
         dumped_outputs = list(infer_cg.run(inputs.numpy()).values())[0]
         np.testing.assert_allclose(quantize_outputs.numpy(), dumped_outputs, atol=1e-6)
+
+
+def test_qat_conv_transpose2d():
+    in_channels = 32
+    out_channels = 64
+    kernel_size = 3
+
+    class TestNet(Module):
+        def __init__(self, bias):
+            super().__init__()
+            self.quant = QuantStub()
+            self.dequant = DequantStub()
+            self.conv = ConvTranspose2d(
+                in_channels, out_channels, kernel_size, bias=bias
+            )
+
+        def forward(self, inp):
+            out = self.quant(inp)
+            out = self.conv(out)
+            out = self.dequant(out)
+            return out
+
+    inputs = tensor(np.random.randn(4, in_channels, 32, 32).astype(np.float32))
+    for bias in [True, False]:
+        net = TestNet(bias)
+        net.train()
+        qat_net = quantize_qat(net, inplace=False)
+        disable_fake_quant(qat_net)
+        normal_outputs = net(inputs)
+        qat_outputs = qat_net(inputs)
+        np.testing.assert_allclose(normal_outputs.numpy(), qat_outputs.numpy())
+
+        net.eval()
+        normal_outputs = net(inputs)
+        qat_net.eval()
+        qat_outputs = qat_net(inputs)
+        np.testing.assert_allclose(normal_outputs.numpy(), qat_outputs.numpy())
diff --git a/imperative/python/test/unit/quantization/test_fake_quant.py b/imperative/python/test/unit/quantization/test_fake_quant.py
index 612e03ab7..8ee961ed7 100644
--- a/imperative/python/test/unit/quantization/test_fake_quant.py
+++ b/imperative/python/test/unit/quantization/test_fake_quant.py
@@ -92,8 +92,6 @@ def test_tqt():
     np.testing.assert_allclose(g_s.numpy(), g_s_np, rtol=5e-5, atol=5e-5)
 
 
-
-
 def _save_to(self, name="grad"):
     def callback(grad):
         setattr(self, name, grad)
diff --git a/imperative/python/test/unit/quantization/test_op.py b/imperative/python/test/unit/quantization/test_op.py
index beb151d9e..9b07ac02a 100644
--- a/imperative/python/test/unit/quantization/test_op.py
+++ b/imperative/python/test/unit/quantization/test_op.py
@@ -14,6 +14,7 @@ import megengine.functional as F
 from megengine.core.tensor import dtype
 from megengine.device import get_device_count
 from megengine.functional.elemwise import _elemwise_multi_type, _elwise
+from megengine.module.quantized.conv import ConvTranspose2d
 from megengine.quantization import QuantMode, create_qparams
 
 
@@ -168,3 +169,94 @@ def test_conv_bias():
 
     run(10, 36, 8, 46, 26, 2, 2, 2, 1, 1, 2, False, "relu")
     run(10, 36, 8, 46, 26, 2, 2, 2, 1, 1, 2, True, "relu")
+
+
+def test_conv_transpose2d():
+    rng = np.random.RandomState(seed=2021)
+
+    def test_func(
+        N,
+        IC,
+        IH,
+        IW,
+        OC,
+        KH,
+        KW,
+        SH,
+        SW,
+        PH,
+        PW,
+        DH,
+        DW,
+        groups=1,
+        has_bias=True,
+        conv_mode: str = "cross_correlation",
+        compute_mode: str = "default",
+    ):
+        inp_scale = np.float32(rng.uniform(low=0.04, high=0.06))
+        weight_scale = np.float32(rng.uniform(low=0.04, high=0.06))
+        bias_scale = inp_scale * weight_scale
+        out_scale = np.float32(rng.uniform(low=0.04, high=0.06))
+
+        inp_dtype = dtype.qint8(inp_scale)
+        weight_dtype = dtype.qint8(weight_scale)
+        bias_dtype = dtype.qint32(bias_scale)
+        out_dtype = dtype.qint8(out_scale)
+
+        inp_fp32 = rng.uniform(low=-1, high=1, size=(N, IC, IH, IW)).astype(np.float32)
+        weight_fp32 = rng.uniform(low=-1, high=1, size=(IC, OC, KH, KW)).astype(
+            np.float32
+        )
+        bias_fp32 = rng.uniform(low=-1, high=1, size=(1, OC, 1, 1)).astype(np.float32)
+
+        inp_int8 = dtype.convert_to_qint8(inp_fp32, inp_dtype)
+        weight_int8 = dtype.convert_to_qint8(weight_fp32, weight_dtype)
+        bias_int32 = dtype.convert_to_qint32(bias_fp32, bias_dtype)
+
+        inp_int8 = mge.tensor(inp_int8, dtype=inp_dtype)
+        weight_int8 = mge.Parameter(weight_int8, dtype=weight_dtype)
+        bias_int32 = mge.Parameter(bias_int32, dtype=bias_dtype)
+
+        inp_fp32 = inp_int8.astype("float32")
+        weight_fp32 = weight_int8.astype("float32")
+        bias_fp32 = bias_int32.astype("float32")
+
+        expected = F.conv_transpose2d(
+            inp_fp32,
+            weight_fp32,
+            bias_fp32 if has_bias else None,
+            stride=(SH, SW),
+            padding=(PH, PW),
+            dilation=(DH, DW),
+            groups=groups,
+            conv_mode=conv_mode,
+            compute_mode=compute_mode,
+        )
+        expected = dtype.convert_to_qint8(expected.numpy(), out_dtype)
+        expected = dtype.convert_from_qint8(expected)
+
+        conv_transpose2d = ConvTranspose2d(
+            in_channels=IC,
+            out_channels=OC,
+            kernel_size=(KH, KW),
+            stride=(SH, SW),
+            padding=(PH, PW),
+            dilation=(DH, DW),
+            groups=groups,
+            bias=has_bias,
+            conv_mode=conv_mode,
+            compute_mode=compute_mode,
+            dtype=out_dtype,
+        )
+
+        conv_transpose2d.weight = mge.Parameter(weight_int8)
+        if has_bias:
+            conv_transpose2d.bias = mge.Parameter(bias_int32)
+        result = conv_transpose2d.forward(inp_int8).numpy()
+        result = dtype.convert_from_qint8(result)
+        np.testing.assert_allclose(result, expected, atol=out_scale)
+
+    test_func(1, 4, 1, 1, 4, 1, 1, 1, 1, 0, 0, 1, 1, 1, False)
+    test_func(2, 4, 3, 1, 8, 1, 1, 1, 1, 0, 0, 1, 1, 1, False)
+    test_func(4, 4, 16, 16, 8, 3, 3, 1, 1, 1, 1, 1, 1, 1, False)
+    test_func(32, 64, 36, 28, 16, 3, 2, 1, 3, 1, 0, 1, 1, 1, False)
diff --git a/imperative/python/test/unit/utils/test_network_node.py b/imperative/python/test/unit/utils/test_network_node.py
index 15cb7c1d1..dc2fa6d0d 100644
--- a/imperative/python/test/unit/utils/test_network_node.py
+++ b/imperative/python/test/unit/utils/test_network_node.py
@@ -486,8 +486,6 @@ def test_topk():
     check_pygraph_dump(fwd, [x], [top, indices])
 
 
-
-
 def test_random():
     @trace(symbolic=True, capture_as_const=True)
     def fwd():
@@ -723,8 +721,6 @@ def test_elemwise_multitype():
     check_pygraph_dump(fwd, [x, y], [result])
 
 
-
-
 def test_cvtcolor():
     inp = np.random.randn(3, 3, 3, 3).astype(np.float32)
     x = Tensor(inp)
diff --git a/imperative/python/version_template.py b/imperative/python/version_template.py
index e5d40075a..f45d43992 100644
--- a/imperative/python/version_template.py
+++ b/imperative/python/version_template.py
@@ -7,4 +7,3 @@
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 __version__ = "1.6.0.dev"
-
diff --git a/imperative/src/impl/ops/convolution.cpp b/imperative/src/impl/ops/convolution.cpp
index 8a517712b..90c5a5f60 100644
--- a/imperative/src/impl/ops/convolution.cpp
+++ b/imperative/src/impl/ops/convolution.cpp
@@ -43,6 +43,11 @@ auto apply_on_var_node(
         const VarNodeArray& inputs) {
     auto&& conv = static_cast<const ConvolutionBackwardData&>(def);
     OperatorNodeConfig config{conv.make_name()};
+    DType output_dtype = conv.dtype;
+    if (output_dtype.valid()) {
+        config.output_dtype(output_dtype);
+    }
+
     if (inputs.size() == 2) {
         return opr::ConvolutionBackwardData::make(inputs[0], inputs[1], conv.param(), conv.policy(), config);
     } else {
diff --git a/scripts/whl/macos/macos_build_whl.sh b/scripts/whl/macos/macos_build_whl.sh
index 9533b7c36..63ff3b0b8 100755
--- a/scripts/whl/macos/macos_build_whl.sh
+++ b/scripts/whl/macos/macos_build_whl.sh
@@ -192,7 +192,6 @@ function do_build() {
         #handle dlopen path
         install_name_tool -change @rpath/libmegengine_export.dylib @loader_path/lib/libmegengine_export.dylib _imperative_rt.so
 
-
         #copy megbrain_export lib
         DEPEND_LIB=${BUILD_DIR}/staging/megengine/core/lib/
         rm -rf ${DEPEND_LIB}
@@ -209,7 +208,6 @@ function do_build() {
         echo "comapt whl name: ${compat_whl_name}"
         cp ${BUILD_DIR}/staging/dist/Meg*.whl ${MACOS_WHL_HOME}/${compat_whl_name}
 
-
         cd ${SRC_DIR}
         echo ""
         echo "##############################################################################################"
@@ -220,12 +218,10 @@ function do_build() {
     done
 }
 
-
 function third_party_prepare() {
     echo "init third_party..."
     ${SRC_DIR}/third_party/prepare.sh
 
-
     if [[ -z ${ALREADY_INSTALL_MKL} ]]
     then
         echo "init third_party..."
diff --git a/scripts/whl/manylinux2014/do_build_common.sh b/scripts/whl/manylinux2014/do_build_common.sh
index 5fc2081dc..0f1fc771e 100755
--- a/scripts/whl/manylinux2014/do_build_common.sh
+++ b/scripts/whl/manylinux2014/do_build_common.sh
@@ -55,13 +55,11 @@ function patch_elf_depend_lib_mgb_mge() {
     patchelf --force-rpath --set-rpath '$ORIGIN/.' ${LIBS_DIR}/libmegengine_export.so
     handle_strip ${LIBS_DIR}/libmegengine_export.so
 
-
     # as some version of cudnn/trt libs have dlopen libs, so we can not use auditwheel
     # TODO: PR for auditwheel to support args for dlopen libs
     handle_copy_cuda_libs ${LIBS_DIR}
 }
 
-
 SRC_DIR=$(readlink -f "`dirname $0`/../../../")
 source ${SRC_DIR}/scripts/whl/utils/utils.sh
 
@@ -142,7 +140,6 @@ do
     mkdir -p staging
     cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
 
-
     cd ${BUILD_DIR}/staging/megengine/core
     mkdir -p lib/ucx
     patch_elf_depend_lib_mgb_mge
@@ -158,7 +155,6 @@ do
     echo "comapt whl name: ${compat_whl_name}"
     mv ${org_whl_name} ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${SDK_NAME}/${compat_whl_name}
 
-
     cd /home/output
     chown -R ${UID}.${UID} .
     # compat for root-less docker env to remove output at host side
diff --git a/scripts/whl/windows/windows_build_whl.sh b/scripts/whl/windows/windows_build_whl.sh
index 3d0045f8b..c917c4895 100755
--- a/scripts/whl/windows/windows_build_whl.sh
+++ b/scripts/whl/windows/windows_build_whl.sh
@@ -70,7 +70,6 @@ then
     BUILD_WHL_CPU_ONLY="OFF"
 fi
 
-
 # config NVIDIA libs
 TRT_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/TensorRT-6.0.1.5/lib/nvinfer.dll"
 CUDNN_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/cudnn-10.1-windows10-x64-v7.6.5.32/cuda/bin/cudnn64_7.dll"
@@ -102,14 +101,11 @@ function copy_more_dll() {
     # empty.file to triger setup.py to create a null empty
     echo "empty" > ${CP_WHL_DST_IMP}/empty.file
 
-
     if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then
         echo "copy nvidia lib to whl use...."
         depend_real_copy ${CP_WHL_DST_IMP}
-
     fi
 }
-
 BUILD_DIR=${SRC_DIR}/build_dir/host/build/
 
 # here we just treat cu file should not in the increment build file list
@@ -194,14 +190,12 @@ function do_build() {
         llvm-strip -s ${rt_file}
         mv ${rt_file} _imperative_rt.pyd
 
-
         copy_more_dll
         cd ${BUILD_DIR}/staging
         echo "call setup.py now"
         ${PYTHON_DIR}/python3 setup.py bdist_wheel
         cp ${BUILD_DIR}/staging/dist/Meg*.whl ${WINDOWS_WHL_HOME}/
 
-
         echo ""
         echo "##############################################################################################"
         echo "windows whl package location: ${WINDOWS_WHL_HOME}"
@@ -215,7 +209,6 @@ function third_party_prepare() {
     echo "init third_party..."
     ${SRC_DIR}/third_party/prepare.sh
 
-
     if [[ -z ${ALREADY_INSTALL_MKL} ]]
     then
         echo "init third_party..."
diff --git a/src/core/impl/comp_node_env.cpp b/src/core/impl/comp_node_env.cpp
index 1c6546415..de6a0de69 100644
--- a/src/core/impl/comp_node_env.cpp
+++ b/src/core/impl/comp_node_env.cpp
@@ -35,8 +35,6 @@
 #include "megcore_atlas.h"
 #endif
 
-
-
 using namespace mgb;
 
 /* =================== MegDNNHandle =================== */
@@ -102,7 +100,6 @@ MegDNNHandle::MegDNNHandle(const CompNodeEnv& env) {
     }
 #endif
 
-
     if (env.property().type == CompNode::DeviceType::CPU) {
         megcoreCreateDeviceHandle(&m_dev_hdl, megcorePlatformCPU);
         megcoreCreateComputingHandleWithCPUDispatcher(&m_comp_hdl, m_dev_hdl,
@@ -234,7 +231,6 @@ void CompNodeEnv::init_cuda_async(int dev, CompNode comp_node,
 }
 #endif
 
-
 #if MGB_ATLAS
 
 void mgb::_on_atlas_error(const char* expr, int err, const char* file,
@@ -258,8 +254,6 @@ void CompNodeEnv::init_atlas(CompNode comp_node, const AtlasEnv& env) {
 }
 #endif
 
-
-
 #if MGB_ROCM
 
 void mgb::_on_hip_error(const char* expr, hipError_t err, const char* file,
@@ -381,7 +375,6 @@ void CompNodeEnv::init_cpu(const CpuEnv& env, CompNode comp_node) {
             MegDNNHandle::get(*this).handle()->alignment_requirement();
 }
 
-
 #if MGB_CAMBRICON
 void CompNodeEnv::init_cnrt(int dev, CompNode comp_node,
                             const ContinuationCtx<cnrtQueue_t>& cont) {
@@ -446,7 +439,6 @@ void CompNodeEnv::fini() {
         MGB_ATLAS_CHECK(aclrtDestroyStream(m_atlas_env.stream));
     }
 #endif
-
 }
 
 #if MGB_ENABLE_COMP_NODE_ASYNC_INIT
diff --git a/src/core/impl/exception.cpp b/src/core/impl/exception.cpp
index 7d990a180..7cb61de04 100644
--- a/src/core/impl/exception.cpp
+++ b/src/core/impl/exception.cpp
@@ -73,14 +73,11 @@ std::string CudaError::get_cuda_extra_info() {
 #endif
 }
 
-
 AtlasError::AtlasError(const std::string &msg):
     SystemError(msg)
 {
 }
 
-
-
 ROCmError::ROCmError(const std::string &msg):
     SystemError(msg)
 {
diff --git a/src/core/impl/graph/cg_impl.cpp b/src/core/impl/graph/cg_impl.cpp
index 43b9a5a4e..515637312 100644
--- a/src/core/impl/graph/cg_impl.cpp
+++ b/src/core/impl/graph/cg_impl.cpp
@@ -23,7 +23,6 @@
 #include "megbrain/graph/helper.h"
 #include "megbrain/opr/utility.h"
 
-
 #if MGB_ENABLE_TENSOR_RT
 #include "megbrain/tensorrt/opr_replace.h"
 #endif
@@ -554,7 +553,6 @@ ComputingGraphImpl::CompileState ComputingGraphImpl::compile_prepare(
     }
 #endif
 
-
 #if MGB_JIT
    if (std::abs(options().graph_opt_level) == 0 &&
         (options().graph_opt.jit || options().graph_opt.jit_config.enabled())) {
diff --git a/src/core/impl/graph/var_node_mem_mgr.h b/src/core/impl/graph/var_node_mem_mgr.h
index d657a0b9c..953f23c1b 100644
--- a/src/core/impl/graph/var_node_mem_mgr.h
+++ b/src/core/impl/graph/var_node_mem_mgr.h
@@ -445,7 +445,6 @@ class VarNodeMemManager {
 
         SyncableCounter m_cpu_async_release_barrier;
 
-
 #if MGB_CUDA || MGB_ATLAS || MGB_CAMBRICON  || MGB_ROCM
         //! release dynamic var on after compnode event finishes
         class AsyncVarReleaser;
diff --git a/src/core/include/megbrain/comp_node.h b/src/core/include/megbrain/comp_node.h
index db8076b26..87c362ed0 100644
--- a/src/core/include/megbrain/comp_node.h
+++ b/src/core/include/megbrain/comp_node.h
@@ -508,7 +508,6 @@ class CompNode {
          */
         static bool enable_affinity_for_cpu(bool flag);
 
-
     protected:
         //! ImplBase with env(); defined in CompNodeEnv
         class Impl;
diff --git a/src/core/include/megbrain/comp_node_env.h b/src/core/include/megbrain/comp_node_env.h
index 3b9f1af8b..af8575dcd 100644
--- a/src/core/include/megbrain/comp_node_env.h
+++ b/src/core/include/megbrain/comp_node_env.h
@@ -19,8 +19,6 @@
 
 #include "megdnn/handle.h"
 
-
-
 #if MGB_CUDA
 #include <cuda_runtime.h>
 #include <cuda.h>
@@ -90,8 +88,6 @@
 
 #endif // MGB_ATLAS
 
-
-
 #if MGB_ROCM
 #include "hcc_detail/hcc_defs_prologue.h"
 #include "megcore_rocm.h"
@@ -196,7 +192,6 @@ namespace mgb {
                                   const char* file, const char* func, int line);
 #endif
 
-
 #if MGB_CUDA
 [[noreturn]] void _on_cuda_error(const char* expr, cudaError_t err,
                                  const char* file, const char* func, int line);
@@ -205,7 +200,6 @@ namespace mgb {
                                     int line);
 #endif
 
-
 #if MGB_ROCM
 [[noreturn]] void _on_hip_error(const char* expr, hipError_t err,
                                 const char* file, const char* func, int line);
@@ -232,7 +226,6 @@ public:
         mgb_assert(0, "The CompNode set_affinity is not implement");
     }
 };
-
 using AtlasDispatcher = CPUDispatcher;
 
 /*!
@@ -328,7 +321,6 @@ public:
         }
 #endif
 
-
     }
 
     /*!
@@ -370,7 +362,6 @@ public:
                          const ContinuationCtx<cudaStream_t>& cont);
 #endif
 
-
 #if MGB_ATLAS
     struct AtlasEnv {
         int device = -1;
@@ -431,8 +422,6 @@ public:
     void init_atlas(CompNode comp_node, const AtlasEnv& env);
 #endif
 
-
-
 #if MGB_ROCM
     struct ROCmEnv {
         int device = -1;
@@ -547,7 +536,6 @@ private:
     CompNode m_comp_node;
     Property m_property;
     MemEventHandler m_mem_event_handler;
-
 #if MGB_CUDA
     CudaEnv m_cuda_env;
 #endif
diff --git a/src/core/include/megbrain/exception.h b/src/core/include/megbrain/exception.h
index a58bd8433..701f056d9 100644
--- a/src/core/include/megbrain/exception.h
+++ b/src/core/include/megbrain/exception.h
@@ -71,7 +71,6 @@
     })                                                                        \
     do {                                                                      \
     } while (0)
-
 namespace mgb {
 
 //! the most general MegBrain exception type; also base class for all megbrain
@@ -149,7 +148,6 @@ public:
     AtlasError(const std::string& msg);
 };
 
-
 class ROCmError final : public SystemError {
 public:
     /*!
@@ -224,7 +222,6 @@ public:
     using MegBrainError::MegBrainError;
 };
 
-
 }  // namespace mgb
 
 namespace mgb {
@@ -233,5 +230,4 @@ bool has_uncaught_exception();
 
 }  // namespace mgb
 
-
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/core/include/megbrain/ir/ops.td b/src/core/include/megbrain/ir/ops.td
index 14aa42f83..07223e7f5 100644
--- a/src/core/include/megbrain/ir/ops.td
+++ b/src/core/include/megbrain/ir/ops.td
@@ -49,7 +49,11 @@ def SVD: MgbHashableOp<"SVD", [SVDParam]>;
 
 def Convolution : MgbHashableOp<"Convolution", [ConvolutionParam, ExecutionPolicyParamBase<"policy">]>;
 
-def ConvolutionBackwardData: MgbHashableOp<"ConvolutionBackwardData", [ConvolutionParam, ExecutionPolicyParamBase<"policy">]>;
+def ConvolutionBackwardData: MgbHashableOp<"ConvolutionBackwardData", [ConvolutionParam, ExecutionPolicyParamBase<"policy">]> {
+  let extraArguments = (ins
+    MgbDTypeAttr:$dtype
+  );
+}
 
 def Convolution3D: MgbHashableOp<"Convolution3D", [Convolution3DParam, ExecutionPolicyParamBase<"policy">]>;
 
diff --git a/src/core/test/comp_node.cpp b/src/core/test/comp_node.cpp
index 05bd4c658..1e45711e2 100644
--- a/src/core/test/comp_node.cpp
+++ b/src/core/test/comp_node.cpp
@@ -40,7 +40,6 @@ TEST(TestCompNode, Parse) {
     ASSERT_EQ(L::parse("cpu2:23"), make_lc(D::CPU, 2, 23));
     ASSERT_EQ(L::parse("cpu21:23"), make_lc(D::CPU, 21, 23));
 
-
     ASSERT_EQ(L::parse("rocmx"), make_lc(D::ROCM, -1, 0));
     ASSERT_EQ(L::parse("rocm2"), make_lc(D::ROCM, 2, 0));
     ASSERT_EQ(L::parse("rocm2:3"), make_lc(D::ROCM, 2, 3));
@@ -62,7 +61,6 @@ TEST(TestCompNode, Parse) {
     ASSERT_EQ(L::parse("multithread:default:2"),
               make_lc(D::MULTITHREAD, L::DEVICE_MULTITHREAD_DEFAULT, 2));
 
-
     ASSERT_THROW(L::parse("apu"), MegBrainError);
     ASSERT_THROW(L::parse("fpgbx"), MegBrainError);
     ASSERT_THROW(L::parse("cab0"), MegBrainError);
@@ -165,8 +163,6 @@ TEST(TestCompNode, Load) {
     auto atlas1 = CompNode::load("atlas1");
     ASSERT_NE(atlas0, atlas1);
 #endif
-
-
 }
 
 TEST(TestCompNode, FreeAfterFinalize) {
@@ -355,7 +351,6 @@ TEST(TestCompNodeAtlas, MemNode) {
 }
 #endif
 
-
 TEST(TestCompNodeCPU, PhysicalDispatch) {
     constexpr int ID = 0x2a6453e0;
     using L = CompNode::Locator;
@@ -754,7 +749,6 @@ TEST(TestCompNodeCambricon, P2PCopy) {
 #endif
 #endif // MGB_CAMBRICON
 
-
 #if MGB_ATLAS
 
 TEST(TestCompNodeAtlas, D2DCopy) {
@@ -780,7 +774,6 @@ TEST(TestCompNodeAtlas, D2DCopy) {
 }
 #endif
 
-
 namespace {
 class CompNodeDepedentObjectInst final : public CompNodeDepedentObject {
     int *m_dst, *m_timer;
diff --git a/src/core/test/mem_alloc.cpp b/src/core/test/mem_alloc.cpp
index eea865861..b6f494474 100644
--- a/src/core/test/mem_alloc.cpp
+++ b/src/core/test/mem_alloc.cpp
@@ -634,7 +634,6 @@ void test_gather_other(CompNode cn0, CompNode cn1) {
     opr::Sleep::sleep(cn1, 0.7);
     func->execute();
 }
-
 }  // namespace
 
 #if MGB_CUDA
@@ -668,5 +667,4 @@ TEST(TestCudaMemAlloc, FreeMem) {
 }
 #endif  // MGB_CUDA
 
-
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/core/test/tensor.cpp b/src/core/test/tensor.cpp
index 23736404a..e2fbae64d 100644
--- a/src/core/test/tensor.cpp
+++ b/src/core/test/tensor.cpp
@@ -340,7 +340,6 @@ TEST(TestTensor, ValueDump) {
     auto val = debug::dump_tensor(*gen({23, 45}), "test");
     debug::write_to_file(output_file("TestTensor.ValueDump.bin").c_str(), val);
 }
-
 template <class Src, class Dst>
 void run_negative_index_test() {
     constexpr size_t S0 = 200, S1 = 200;
diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp
index ad1d17206..817f6384e 100644
--- a/src/gopt/test/inference.cpp
+++ b/src/gopt/test/inference.cpp
@@ -1912,7 +1912,6 @@ TEST_PASS(FuseConvBiasNonlinPass, Basic) {
     }
 }
 
-
 #if MGB_CUDA
 
 TEST(TestEnableTensorCore, SmallInputShape) {
@@ -4735,7 +4734,6 @@ TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) {
     MGB_ASSERT_TENSOR_EQ(t1, t2);
 }
 
-
 #endif
 
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/megbrain_build_config.h.in b/src/megbrain_build_config.h.in
index 4407817b1..49de6b1d2 100644
--- a/src/megbrain_build_config.h.in
+++ b/src/megbrain_build_config.h.in
@@ -67,7 +67,6 @@
 #define MGB_CUDA    1
 #endif
 
-
 // whether to include file/line location for assert message
 #ifndef MGB_ASSERT_LOC
 #define MGB_ASSERT_LOC          1
@@ -162,7 +161,6 @@
 #define MGB_JIT_HALIDE 0
 #endif
 
-
 #ifndef MEGDNN_WITH_CAMBRICON
 #define MEGDNN_WITH_CAMBRICON 0
 #endif
@@ -182,7 +180,6 @@
 #define MGB_ENABLE_FASTRUN 1
 #endif
 
-
 /* ================= following are more finegrind controls ================= */
 
 // whether to enable json dumper
diff --git a/src/opr/impl/basic_arith.sereg.h b/src/opr/impl/basic_arith.sereg.h
index d8180c3ae..97e87e0f0 100644
--- a/src/opr/impl/basic_arith.sereg.h
+++ b/src/opr/impl/basic_arith.sereg.h
@@ -162,7 +162,6 @@ namespace opr {
     using ReduceV2 = opr::Reduce;
     MGB_SEREG_OPR(ReduceV2, 0);
 } // namespace opr
-
 using TypeCvtV2 = opr::TypeCvt;
 MGB_SEREG_OPR(TypeCvtV2, 1);
 
diff --git a/src/opr/impl/blas.sereg.h b/src/opr/impl/blas.sereg.h
index e959d8889..59dd928dc 100644
--- a/src/opr/impl/blas.sereg.h
+++ b/src/opr/impl/blas.sereg.h
@@ -97,7 +97,6 @@ MGB_SEREG_OPR(SVD, 1);
 
 }  // namespace opr
 
-
 }  // namespace mgb
 
 // vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/opr/impl/dnn/dnn.sereg.h b/src/opr/impl/dnn/dnn.sereg.h
index 396446cba..bdaa9b067 100644
--- a/src/opr/impl/dnn/dnn.sereg.h
+++ b/src/opr/impl/dnn/dnn.sereg.h
@@ -613,7 +613,6 @@ MGB_SEREG_OPR(LSQ, 4);
 MGB_SEREG_OPR(LSQBackward, 5);
 }  // namespace opr
 
-
 }  // namespace mgb
 
 // vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/opr/impl/imgproc.sereg.h b/src/opr/impl/imgproc.sereg.h
index effacd3da..7dac473b7 100644
--- a/src/opr/impl/imgproc.sereg.h
+++ b/src/opr/impl/imgproc.sereg.h
@@ -196,7 +196,6 @@ using DctChannelSelectV1 = opr::DctChannelSelect;
 MGB_SEREG_OPR(DctChannelSelectV1, 0);
 }  // namespace opr
 
-
 }  // namespace mgb
 
 // vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/opr/impl/misc.sereg.h b/src/opr/impl/misc.sereg.h
index 15fc8aaea..9da06d1bf 100644
--- a/src/opr/impl/misc.sereg.h
+++ b/src/opr/impl/misc.sereg.h
@@ -57,7 +57,6 @@ namespace serialization {
 
 } // namespace serialization
 
-
 namespace opr {
 
     MGB_SEREG_OPR(Argmax, 1);
diff --git a/src/opr/impl/rand.sereg.h b/src/opr/impl/rand.sereg.h
index 8c5c6c22a..68b16ea1f 100644
--- a/src/opr/impl/rand.sereg.h
+++ b/src/opr/impl/rand.sereg.h
@@ -14,7 +14,6 @@
 
 namespace mgb {
 
-
 namespace opr {
 
 using UniformRNGV1 = opr::UniformRNG;
diff --git a/src/opr/impl/tensor_manip.sereg.h b/src/opr/impl/tensor_manip.sereg.h
index f64c1b571..db3c8d4fe 100644
--- a/src/opr/impl/tensor_manip.sereg.h
+++ b/src/opr/impl/tensor_manip.sereg.h
@@ -120,7 +120,6 @@ namespace serialization {
 #endif
 } // namespace serialization
 
-
 namespace opr {
     MGB_SEREG_OPR(Broadcast, 2);
     MGB_SEREG_OPR(Dimshuffle, 1);
diff --git a/src/opr/test/dnn/convolution.cpp b/src/opr/test/dnn/convolution.cpp
index 3fd50d80f..88645a48e 100644
--- a/src/opr/test/dnn/convolution.cpp
+++ b/src/opr/test/dnn/convolution.cpp
@@ -2401,7 +2401,6 @@ TEST(TestOprDNN, ConvolutionMultiCompNode) {
     worker0.join();
     worker1.join();
 }
-
 #endif
 
 }  // anonymous namespace
diff --git a/src/serialization/impl/serializer.cpp b/src/serialization/impl/serializer.cpp
index 504abdfe5..e24d6132e 100644
--- a/src/serialization/impl/serializer.cpp
+++ b/src/serialization/impl/serializer.cpp
@@ -37,7 +37,6 @@ GraphLoader::shared_tensor_name_map() {
     }
     return ret;
 }
-
 std::unique_ptr<GraphLoader> make_fbs_loader(std::unique_ptr<InputFile> file);
 std::unique_ptr<GraphDumper> make_fbs_dumper(std::unique_ptr<OutputFile> file);
 bool is_fbs_file(InputFile& file);
diff --git a/src/serialization/test/extern_c_opr.cpp b/src/serialization/test/extern_c_opr.cpp
index dc153ff85..351410d23 100644
--- a/src/serialization/test/extern_c_opr.cpp
+++ b/src/serialization/test/extern_c_opr.cpp
@@ -502,5 +502,4 @@ TEST(TestExternCOpr, Dedup) {
     ASSERT_EQ(0, MGBOprDescImpl<>::nr_inst);
 }
 
-
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index fd4dde506..2b13c9b45 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -15,7 +15,6 @@ if (MGE_WITH_CUDA AND MGE_WITH_TRT)
     list(APPEND SOURCES ${SOURCES_})
 endif()
 
-
 add_executable(megbrain_test ${SOURCES})
 target_link_libraries(megbrain_test gtest gmock)
 target_link_libraries(megbrain_test megbrain megdnn ${MGE_CUDA_LIBS})
diff --git a/tools/param_defs/mgb_opr_param_defs.py b/tools/param_defs/mgb_opr_param_defs.py
index b32d2087d..16adfcb00 100644
--- a/tools/param_defs/mgb_opr_param_defs.py
+++ b/tools/param_defs/mgb_opr_param_defs.py
@@ -63,7 +63,6 @@ pdef('PersistentOutputStorage').add_fields(
             'false')
  )
 
-
 (pdef('CollectiveComm', 'collective communication between multiple computing '
       'nodes on localhost')
  .add_enum(Doc('Mode', 'mode of collective communication'),
-- 
GitLab