diff --git a/dnn/src/cuda/elemwise_helper.cpp b/dnn/src/cuda/elemwise_helper.cpp index e64cc593fcc55a38214738328013a9babb9f6168..a285782f760aa07bc3222cdc8ea2302ab486ee2b 100644 --- a/dnn/src/cuda/elemwise_helper.cpp +++ b/dnn/src/cuda/elemwise_helper.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "src/cuda/elemwise_helper.cuh" @@ -21,7 +22,7 @@ #define _cb_check_ndim(n) megdnn::TensorShape::MAX_NDIM == n || static_assert(MEGDNN_FOREACH_TENSOR_NDIM(_cb_check_ndim) false, - "bad foreach ndim"); + "bad foreach ndim"); #undef _cb_check_ndim namespace megdnn { @@ -32,28 +33,30 @@ namespace elemwise_intl { #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Warray-bounds" -template -void ParamElemVisitor::host_init( - const TensorND &rv, int /*grid_size*/, int /*block_size*/) { +template +void ParamElemVisitor::host_init(const TensorND& rv, + int /*grid_size*/, + int /*block_size*/) { megdnn_assert(rv.layout.ndim && rv.layout.ndim <= ndim); m_ptr = rv.ptr(); - for (size_t i = 0; i < rv.layout.ndim; ++ i) { + for (size_t i = 0; i < rv.layout.ndim; ++i) { m_stride[i] = rv.layout.stride[i]; if (i + 1 < rv.layout.ndim) m_shape_highdim[i] = rv.layout.shape[i + 1]; } - for (int i = rv.layout.ndim - 1; i < ndim - 1; ++ i) { + for (int i = rv.layout.ndim - 1; i < ndim - 1; ++i) { m_shape_highdim[i] = 1; } - for (int i = rv.layout.ndim; i < ndim; ++ i) { + for (int i = rv.layout.ndim; i < ndim; ++i) { m_stride[i] = 0; } } #pragma GCC diagnostic pop template -void ParamElemVisitor<3, ctype, BCAST_101>::host_init( - const TensorND& rv, int grid_size, int block_size) { +void ParamElemVisitor<3, ctype, BCAST_101>::host_init(const TensorND& rv, + int grid_size, + int block_size) { uint32_t shape2, shape1; int stride1; if (rv.layout.ndim == 3) { @@ -74,8 +77,8 @@ void ParamElemVisitor<3, ctype, BCAST_101>::host_init( template void ParamElemVisitor<2, ctype, BCAST_10>::host_init(const TensorND& rv, - int grid_size, - int block_size) { + int grid_size, + int block_size) { megdnn_assert(rv.layout.ndim == NDIM && !rv.layout.stride[0]); m_ptr = rv.ptr(); m_stride1 = rv.layout.stride[1]; @@ -85,8 +88,8 @@ void ParamElemVisitor<2, ctype, BCAST_10>::host_init(const TensorND& rv, template void ParamElemVisitor<2, ctype, BCAST_01>::host_init(const TensorND& rv, - int grid_size, - int block_size) { + int grid_size, + int block_size) { megdnn_assert(rv.layout.ndim == NDIM && !rv.layout.stride[1]); m_ptr = rv.ptr(); m_stride0 = rv.layout.stride[0]; @@ -94,9 +97,10 @@ void ParamElemVisitor<2, ctype, BCAST_01>::host_init(const TensorND& rv, rv.layout.shape[1]); } -template -void ParamElemVisitor<1, ctype, BCAST_FULL>::host_init( - const TensorND &rv, int /*grid_size*/, int /*block_size*/) { +template +void ParamElemVisitor<1, ctype, BCAST_FULL>::host_init(const TensorND& rv, + int /*grid_size*/, + int /*block_size*/) { megdnn_assert(rv.layout.ndim == NDIM && !rv.layout.stride[0]); m_ptr = rv.ptr(); } @@ -119,14 +123,13 @@ void ParamVectVisitor<4, ctype, BCAST_1010>::host_init(const TensorND& rv, } #define INST(ndim, ctype, brd) template class ParamElemVisitor -#define INST_FOR_CTYPE \ +#define INST_FOR_CTYPE \ MEGDNN_FOREACH_TENSOR_NDIM(ndim_cb) \ - INST(3, ct, BCAST_101); \ - INST(2, ct, BCAST_10); \ - INST(2, ct, BCAST_01); 
\ + INST(3, ct, BCAST_101); \ + INST(2, ct, BCAST_10); \ + INST(2, ct, BCAST_01); \ INST(1, ct, BCAST_FULL); - #define ndim_cb(_ndim) INST(_ndim, ct, BCAST_OTHER); #define ct dt_byte @@ -175,11 +178,10 @@ INST(dt_qint8); INST(dt_quint8); #undef dt_ibyte -} // namespace elemwise_intl - +} // namespace elemwise_intl -void elemwise_intl::get_launch_spec( - const void *kern, size_t size, int *grid_size, int *block_size) { +void elemwise_intl::get_launch_spec(const void* kern, size_t size, + int* grid_size, int* block_size) { safe_size_in_kern(size); auto config = query_launch_config_for_kernel(kern); *block_size = config.block_size; @@ -202,11 +204,8 @@ void elemwise_intl::get_launch_spec( void elemwise_intl::on_bad_ndim(int ndim) { megdnn_throw(ssprintf("invalid ndim: %d", ndim)); MEGDNN_MARK_USED_VAR(ndim); - } -} // namespace cuda -} // namespace megdnn - +} // namespace cuda +} // namespace megdnn // vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} - diff --git a/dnn/src/cuda/elemwise_helper.cuh b/dnn/src/cuda/elemwise_helper.cuh index d9c5e5d169ba90542800ed52c6ec346adba360b2..47ed82d01111fcdcc0f6b60c69c5a7378eca9041 100644 --- a/dnn/src/cuda/elemwise_helper.cuh +++ b/dnn/src/cuda/elemwise_helper.cuh @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #pragma once @@ -28,56 +29,56 @@ namespace cuda { namespace elemwise_intl { #define devfunc __device__ __forceinline__ - /*! - * \brief get cuda launch specs for element-wise kernel - * \param kern kernel function address - * \param size total size of elements - */ - void get_launch_spec( - const void *kern, size_t size, int *grid_size, int *block_size); - - MEGDNN_NORETURN void on_bad_ndim(int ndim); - - /*! - * \brief broadcast type - * BCAST_x[0]x[1]...: x[i] == !stride[i] - */ - enum BcastType { - BCAST_OTHER, - BCAST_1010, - BCAST_101, - BCAST_10, - BCAST_01, - BCAST_FULL - }; +/*! + * \brief get cuda launch specs for element-wise kernel + * \param kern kernel function address + * \param size total size of elements + */ +void get_launch_spec(const void* kern, size_t size, int* grid_size, + int* block_size); - /*! - * \brief read and write type trait for byte width integer type - */ - template - class VectTypeTrait; +MEGDNN_NORETURN void on_bad_ndim(int ndim); - struct __attribute__((aligned(8))) half4 { - dt_float16 x, y, z, w; - }; +/*! + * \brief broadcast type + * BCAST_x[0]x[1]...: x[i] == !stride[i] + */ +enum BcastType { + BCAST_OTHER, + BCAST_1010, + BCAST_101, + BCAST_10, + BCAST_01, + BCAST_FULL +}; - __device__ __forceinline__ half4 make_half4(dt_float16 x, dt_float16 y, - dt_float16 z, dt_float16 w) { - half4 t; - t.x = x, t.y = y, t.z = z, t.w = w; - return t; - } +/*! 
+ * \brief read and write type trait for byte width integer type + */ +template +class VectTypeTrait; + +struct __attribute__((aligned(8))) half4 { + dt_float16 x, y, z, w; +}; + +__device__ __forceinline__ half4 make_half4(dt_float16 x, dt_float16 y, + dt_float16 z, dt_float16 w) { + half4 t; + t.x = x, t.y = y, t.z = z, t.w = w; + return t; +} - struct __attribute__((aligned(8))) bhalf4 { - dt_bfloat16 x, y, z, w; - }; +struct __attribute__((aligned(8))) bhalf4 { + dt_bfloat16 x, y, z, w; +}; - __device__ __forceinline__ bhalf4 make_bhalf4(dt_bfloat16 x, dt_bfloat16 y, - dt_bfloat16 z, dt_bfloat16 w) { - bhalf4 t; - t.x = x, t.y = y, t.z = z, t.w = w; - return t; - } +__device__ __forceinline__ bhalf4 make_bhalf4(dt_bfloat16 x, dt_bfloat16 y, + dt_bfloat16 z, dt_bfloat16 w) { + bhalf4 t; + t.x = x, t.y = y, t.z = z, t.w = w; + return t; +} #define INST(_ctype, _vect_type) \ template <> \ @@ -94,245 +95,245 @@ namespace elemwise_intl { } \ } #define as_raw(x) x - INST(dt_int8, char4); - INST(dt_uint8, uchar4); - INST(dt_float32, float4); - INST(dt_float16, half4); - INST(dt_bfloat16, bhalf4); - INST(dt_int32, int4); - INST(dt_int16, short4); +INST(dt_int8, char4); +INST(dt_uint8, uchar4); +INST(dt_float32, float4); +INST(dt_float16, half4); +INST(dt_bfloat16, bhalf4); +INST(dt_int32, int4); +INST(dt_int16, short4); #undef as_raw #define as_raw(x) x.as_int8() - INST(dt_qint8, char4); +INST(dt_qint8, char4); #undef as_raw #define as_raw(x) x.as_uint8() - INST(dt_quint8, uchar4); +INST(dt_quint8, uchar4); #undef as_raw #define as_raw(x) x.as_int32() - INST(dt_qint32, int4); +INST(dt_qint32, int4); #undef as_raw #undef INST - /*! - * \brief visitor to access an elemeent in a tensor at given logic index - * \tparam ctype plain element ctype (i.e. ctype in DTypeTrait) - * \tparam brdcast_mask bit mask for broadcast of params; (i.e. stride[i] is - * 0 iff (brdcast_mask & (1<<(ndim-1-i))) is 1. - * - * host interface: - * void host_init( - * const TensorND &tensor, int grid_size, int block_size) - * - * device interface: - * void thread_init(uint32_t idx) - * called on thread entrance, with logical indexing; the index may - * go beyond buffer range - * - * ctype* ptr() - * return buffer pointer; can be used by specialized OpCaller - * - * void next() - * called before moving to next chunk on each thread - * - * int offset(uint32_t idx) - * get physical offset from logical index - * - * ctype& at(uint32_t idx) - * ptr()[offset(idx)] - * - */ - template - class ParamElemVisitor; - - /*! - * \brief visitor to access vector element in a tensor at given logic index - * \tparam ctype same as ParamElemVisitor, vect_type packed vector type of - * element ctype (i.e. vect_type in VectTypeTrait) \tparam brdcast_mask same - * as ParamElemVisitor - * - * - * device interface: - * vect_type& at(uint32_t idx) - * ptr()[offset(idx)] - * - */ - template - class ParamVectVisitor; - - /* f{{{ ParamElemVisitor specializations */ +/*! + * \brief visitor to access an elemeent in a tensor at given logic index + * \tparam ctype plain element ctype (i.e. ctype in DTypeTrait) + * \tparam brdcast_mask bit mask for broadcast of params; (i.e. stride[i] is + * 0 iff (brdcast_mask & (1<<(ndim-1-i))) is 1. 
+ * + * host interface: + * void host_init( + * const TensorND &tensor, int grid_size, int block_size) + * + * device interface: + * void thread_init(uint32_t idx) + * called on thread entrance, with logical indexing; the index may + * go beyond buffer range + * + * ctype* ptr() + * return buffer pointer; can be used by specialized OpCaller + * + * void next() + * called before moving to next chunk on each thread + * + * int offset(uint32_t idx) + * get physical offset from logical index + * + * ctype& at(uint32_t idx) + * ptr()[offset(idx)] + * + */ +template +class ParamElemVisitor; + +/*! + * \brief visitor to access vector element in a tensor at given logic index + * \tparam ctype same as ParamElemVisitor, vect_type packed vector type of + * element ctype (i.e. vect_type in VectTypeTrait) \tparam brdcast_mask same + * as ParamElemVisitor + * + * + * device interface: + * vect_type& at(uint32_t idx) + * ptr()[offset(idx)] + * + */ +template +class ParamVectVisitor; + +/* f{{{ ParamElemVisitor specializations */ #define PARAM_ELEM_VISITOR_COMMON_DEV \ devfunc ctype* ptr() { return m_ptr; } \ devfunc ctype& at(uint32_t idx) { return m_ptr[offset(idx)]; } #define PARAM_ELEM_VISITOR_COMMON_HOST static const int packed_size = 1; - //! specialization for BCAST_OTHER - template - class ParamElemVisitor { - protected: - ctype* __restrict m_ptr; +//! specialization for BCAST_OTHER +template +class ParamElemVisitor { +protected: + ctype* __restrict m_ptr; - private: - int m_stride[ndim]; +private: + int m_stride[ndim]; - //! m_shape_highdim[i] = original_shape[i + 1] + //! m_shape_highdim[i] = original_shape[i + 1] #ifdef _MSC_VER - Uint32Fastdiv m_shape_highdim[ndim > 1 ? ndim - 1 : 1]; + Uint32Fastdiv m_shape_highdim[ndim > 1 ? ndim - 1 : 1]; #else - Uint32Fastdiv m_shape_highdim[ndim]; + Uint32Fastdiv m_shape_highdim[ndim]; #endif - public: - static const int NDIM = ndim; - PARAM_ELEM_VISITOR_COMMON_HOST +public: + static const int NDIM = ndim; + PARAM_ELEM_VISITOR_COMMON_HOST - void host_init(const TensorND& rv, int grid_size, int block_size); + void host_init(const TensorND& rv, int grid_size, int block_size); #if MEGDNN_CC_CUDA - devfunc void thread_init(uint32_t) {} + devfunc void thread_init(uint32_t) {} - devfunc void next() {} + devfunc void next() {} - devfunc int offset(uint32_t idx) { - int offset = 0; + devfunc int offset(uint32_t idx) { + int offset = 0; #pragma unroll - for (int i = ndim - 1; i >= 1; --i) { - Uint32Fastdiv& shp = m_shape_highdim[i - 1]; - uint32_t idx_div = idx / shp; - offset += (idx - idx_div * shp.divisor()) * m_stride[i]; - idx = idx_div; - } - offset += idx * m_stride[0]; - return offset; + for (int i = ndim - 1; i >= 1; --i) { + Uint32Fastdiv& shp = m_shape_highdim[i - 1]; + uint32_t idx_div = idx / shp; + offset += (idx - idx_div * shp.divisor()) * m_stride[i]; + idx = idx_div; } + offset += idx * m_stride[0]; + return offset; + } - PARAM_ELEM_VISITOR_COMMON_DEV + PARAM_ELEM_VISITOR_COMMON_DEV #endif - }; - - /*! - * \brief specialization for ndim == 3 and BCAST_101 - * (for dimshuffle 'x', 0, 'x') - * - * visit: idx / m_shape2 % m_shape1 - */ - template - class ParamElemVisitor<3, ctype, BCAST_101> { - StridedDivSeq2 m_shape12; - int m_stride1; - - protected: - ctype* __restrict m_ptr; - - public: - static const int NDIM = 3; - PARAM_ELEM_VISITOR_COMMON_HOST - - void host_init(const TensorND& rv, int grid_size, int block_size); +}; + +/*! 
+ * \brief specialization for ndim == 3 and BCAST_101 + * (for dimshuffle 'x', 0, 'x') + * + * visit: idx / m_shape2 % m_shape1 + */ +template +class ParamElemVisitor<3, ctype, BCAST_101> { + StridedDivSeq2 m_shape12; + int m_stride1; + +protected: + ctype* __restrict m_ptr; + +public: + static const int NDIM = 3; + PARAM_ELEM_VISITOR_COMMON_HOST + + void host_init(const TensorND& rv, int grid_size, int block_size); #if MEGDNN_CC_CUDA - devfunc void thread_init(uint32_t idx) { m_shape12.device_init(idx); } + devfunc void thread_init(uint32_t idx) { m_shape12.device_init(idx); } - devfunc void next() { m_shape12.next(); } + devfunc void next() { m_shape12.next(); } - devfunc int offset(uint32_t idx) { return m_shape12.get() * m_stride1; } + devfunc int offset(uint32_t idx) { return m_shape12.get() * m_stride1; } - PARAM_ELEM_VISITOR_COMMON_DEV + PARAM_ELEM_VISITOR_COMMON_DEV #endif - }; +}; - /*! - * \brief specialization for ndim == 2 and BCAST_10 - * - * visit: idx % m_shape1 - */ - template - class ParamElemVisitor<2, ctype, BCAST_10> { - StridedDivSeq m_shape1; - int m_stride1; +/*! + * \brief specialization for ndim == 2 and BCAST_10 + * + * visit: idx % m_shape1 + */ +template +class ParamElemVisitor<2, ctype, BCAST_10> { + StridedDivSeq m_shape1; + int m_stride1; - protected: - ctype* __restrict m_ptr; +protected: + ctype* __restrict m_ptr; - public: - static const int NDIM = 2; - PARAM_ELEM_VISITOR_COMMON_HOST +public: + static const int NDIM = 2; + PARAM_ELEM_VISITOR_COMMON_HOST - void host_init(const TensorND& rv, int grid_size, int block_size); + void host_init(const TensorND& rv, int grid_size, int block_size); #if MEGDNN_CC_CUDA - devfunc void thread_init(uint32_t idx) { m_shape1.device_init(idx); } + devfunc void thread_init(uint32_t idx) { m_shape1.device_init(idx); } - devfunc void next() { m_shape1.next(); } + devfunc void next() { m_shape1.next(); } - devfunc int offset(uint32_t idx) { return m_shape1.r() * m_stride1; } + devfunc int offset(uint32_t idx) { return m_shape1.r() * m_stride1; } - PARAM_ELEM_VISITOR_COMMON_DEV + PARAM_ELEM_VISITOR_COMMON_DEV #endif - }; +}; - /*! - * \brief specialization for ndim == 2 and BCAST_01 - * - * visit: idx / shape1 - */ - template - class ParamElemVisitor<2, ctype, BCAST_01> { - StridedDivSeq m_shape1; - int m_stride0; +/*! + * \brief specialization for ndim == 2 and BCAST_01 + * + * visit: idx / shape1 + */ +template +class ParamElemVisitor<2, ctype, BCAST_01> { + StridedDivSeq m_shape1; + int m_stride0; - protected: - ctype* __restrict m_ptr; +protected: + ctype* __restrict m_ptr; - public: - static const int NDIM = 2; - PARAM_ELEM_VISITOR_COMMON_HOST +public: + static const int NDIM = 2; + PARAM_ELEM_VISITOR_COMMON_HOST - void host_init(const TensorND& rv, int grid_size, int block_size); + void host_init(const TensorND& rv, int grid_size, int block_size); #if MEGDNN_CC_CUDA - devfunc void thread_init(uint32_t idx) { m_shape1.device_init(idx); } + devfunc void thread_init(uint32_t idx) { m_shape1.device_init(idx); } - devfunc void next() { m_shape1.next(); } + devfunc void next() { m_shape1.next(); } - devfunc int offset(uint32_t idx) { return m_shape1.q() * m_stride0; } + devfunc int offset(uint32_t idx) { return m_shape1.q() * m_stride0; } - PARAM_ELEM_VISITOR_COMMON_DEV + PARAM_ELEM_VISITOR_COMMON_DEV #endif - }; +}; - //! specialization for ndim == 1 and BCAST_FULL - template - class ParamElemVisitor<1, ctype, BCAST_FULL> { - protected: - ctype* __restrict m_ptr; +//! 
specialization for ndim == 1 and BCAST_FULL +template +class ParamElemVisitor<1, ctype, BCAST_FULL> { +protected: + ctype* __restrict m_ptr; - public: - static const int NDIM = 1; - PARAM_ELEM_VISITOR_COMMON_HOST +public: + static const int NDIM = 1; + PARAM_ELEM_VISITOR_COMMON_HOST - void host_init(const TensorND& rv, int grid_size, int block_size); + void host_init(const TensorND& rv, int grid_size, int block_size); #if MEGDNN_CC_CUDA - devfunc void thread_init(uint32_t) {} + devfunc void thread_init(uint32_t) {} - devfunc void next() {} + devfunc void next() {} - devfunc int offset(uint32_t idx) { - MEGDNN_MARK_USED_VAR(idx); - return 0; - } + devfunc int offset(uint32_t idx) { + MEGDNN_MARK_USED_VAR(idx); + return 0; + } - PARAM_ELEM_VISITOR_COMMON_DEV + PARAM_ELEM_VISITOR_COMMON_DEV #endif - }; +}; #undef PARAM_ELEM_VISITOR_COMMON_DEV #undef PARAM_ELEM_VISITOR_COMMON_HOST - /* f}}} */ +/* f}}} */ - /* f{{{ ParamVectVisitor specializations */ +/* f{{{ ParamVectVisitor specializations */ #if MEGDNN_CC_CUDA #define DEVICE_WRAPPER(x) x @@ -352,16 +353,16 @@ namespace elemwise_intl { }) \ }; #define _brdcast_mask BCAST_OTHER - INST_PARAM_VECT_VISITOR; +INST_PARAM_VECT_VISITOR; #undef _brdcast_mask #define _brdcast_mask BCAST_01 - INST_PARAM_VECT_VISITOR; +INST_PARAM_VECT_VISITOR; #undef _brdcast_mask #define _brdcast_mask BCAST_10 - INST_PARAM_VECT_VISITOR; +INST_PARAM_VECT_VISITOR; #undef _brdcast_mask #define _brdcast_mask BCAST_101 - INST_PARAM_VECT_VISITOR; +INST_PARAM_VECT_VISITOR; #undef _brdcast_mask #define INST_DT_IBYTE(ctype) \ template \ @@ -379,463 +380,453 @@ namespace elemwise_intl { return vect_scalar; \ }) \ } - INST_DT_IBYTE(dt_int8); - INST_DT_IBYTE(dt_uint8); - INST_DT_IBYTE(dt_qint8); - INST_DT_IBYTE(dt_quint8); +INST_DT_IBYTE(dt_int8); +INST_DT_IBYTE(dt_uint8); +INST_DT_IBYTE(dt_qint8); +INST_DT_IBYTE(dt_quint8); #undef INST_DT_IBYTE #undef DEVICE_WRAPPER #undef INST_PARAM_VECT_VISITOR - /*! - * \brief specialization for ndim == 4 and BCAST_1010 - * - * visit: (idx % m_shape3) * m_stride3 + (idx / m_shape23 % m_shape1) * - * m_stride1 - */ - template - class ParamVectVisitor<4, ctype, BCAST_1010> { - StridedDivSeq2 m_shape123; - StridedDivSeq m_shape3; - int m_stride3, m_stride1; - ctype* __restrict m_ptr; - - public: - static const int NDIM = 4; - using rwtype = typename VectTypeTrait::vect_type; - static const int packed_size = sizeof(rwtype) / sizeof(ctype); - - void host_init(const TensorND& rv, int grid_size, int block_size); +/*! 
+ * \brief specialization for ndim == 4 and BCAST_1010 + * + * visit: (idx % m_shape3) * m_stride3 + (idx / m_shape23 % m_shape1) * + * m_stride1 + */ +template +class ParamVectVisitor<4, ctype, BCAST_1010> { + StridedDivSeq2 m_shape123; + StridedDivSeq m_shape3; + int m_stride3, m_stride1; + ctype* __restrict m_ptr; + +public: + static const int NDIM = 4; + using rwtype = typename VectTypeTrait::vect_type; + static const int packed_size = sizeof(rwtype) / sizeof(ctype); + + void host_init(const TensorND& rv, int grid_size, int block_size); #if MEGDNN_CC_CUDA - devfunc void thread_init(uint32_t idx) { - m_shape123.device_init(idx); - m_shape3.device_init(idx); - } + devfunc void thread_init(uint32_t idx) { + m_shape123.device_init(idx); + m_shape3.device_init(idx); + } - devfunc void next() { - m_shape123.next(); - m_shape3.next(); - } + devfunc void next() { + m_shape123.next(); + m_shape3.next(); + } - devfunc int offset(uint32_t idx) { - return m_shape3.r() * m_stride3 + m_shape123.get() * m_stride1; - } + devfunc int offset(uint32_t idx) { + return m_shape3.r() * m_stride3 + m_shape123.get() * m_stride1; + } - devfunc ctype* ptr() { return m_ptr; } - devfunc rwtype& at(uint32_t idx) { - return *(rwtype*)(&m_ptr[offset(idx)]); - } + devfunc ctype* ptr() { return m_ptr; } + devfunc rwtype& at(uint32_t idx) { return *(rwtype*)(&m_ptr[offset(idx)]); } #endif - }; - - /* f}}} */ +}; +/* f}}} */ #if MEGDNN_CC_CUDA - /* f{{{ user operator callers */ - - /* - * OpCaller is used to invoke user operator with loaded element arguments. - * - * device interface: - * void thread_init(uint32_t idx); - * - * void on(uint32_t idx); - * - * void next(); - */ - - /*! - * \brief call user op directly without visiting any params (i.e. arity == - * 0) - */ - template - struct OpCallerNull { - Op op; - - devfunc void thread_init(uint32_t) { - } +/* f{{{ user operator callers */ - devfunc void on(uint32_t idx) { - op(idx); - } +/* + * OpCaller is used to invoke user operator with loaded element arguments. + * + * device interface: + * void thread_init(uint32_t idx); + * + * void on(uint32_t idx); + * + * void next(); + */ - devfunc void next() { - } - }; +/*! + * \brief call user op directly without visiting any params (i.e. arity == + * 0) + */ +template +struct OpCallerNull { + Op op; - /*! - * \brief call an operator whose each param are promted to the same ndim and - * brdcast_mask - * \tparam PVis ParamElemVisitor class - */ - template - struct OpCallerUniform; - - //! specialization for arity == 1 - template - struct OpCallerUniform { - Op op; - PVis par[1]; - static const uint32_t packed_size = PVis::packed_size; - - devfunc void thread_init(uint32_t idx) { - idx = idx * packed_size; - par[0].thread_init(idx); - } + devfunc void thread_init(uint32_t) {} - devfunc void on(uint32_t idx) { - idx = idx * packed_size; - op(idx, par[0].at(idx)); - } + devfunc void on(uint32_t idx) { op(idx); } - devfunc void on(uint32_t idx, uint32_t remain) { - idx = idx * packed_size; - if (remain >= packed_size) { - op(idx, par[0].at(idx)); - } else { - auto ptr0 = par[0].ptr(); - for (int i = 0; i < remain; i++) { - op(idx + i, ptr0[par[0].offset(idx + i)]); - } - } - } + devfunc void next() {} +}; - devfunc void next() { - par[0].next(); - } - }; - //! 
specialization for arity == 2 - template - struct OpCallerUniform { - Op op; - PVis par[2]; - static const uint32_t packed_size = PVis::packed_size; - - devfunc void thread_init(uint32_t idx) { - idx = idx * packed_size; - par[0].thread_init(idx); - par[1].thread_init(idx); - } +/*! + * \brief call an operator whose each param are promted to the same ndim and + * brdcast_mask + * \tparam PVis ParamElemVisitor class + */ +template +struct OpCallerUniform; + +//! specialization for arity == 1 +template +struct OpCallerUniform { + Op op; + PVis par[1]; + static const uint32_t packed_size = PVis::packed_size; + + devfunc void thread_init(uint32_t idx) { + idx = idx * packed_size; + par[0].thread_init(idx); + } - devfunc void on(uint32_t idx) { - idx = idx * packed_size; - op(idx, par[0].at(idx), par[1].at(idx)); - } + devfunc void on(uint32_t idx) { + idx = idx * packed_size; + op(idx, par[0].at(idx)); + } - devfunc void on(uint32_t idx, uint32_t remain) { - idx = idx * packed_size; - if (remain >= packed_size) { - op(idx, par[0].at(idx), par[1].at(idx)); - } else { - auto ptr0 = par[0].ptr(); - auto ptr1 = par[1].ptr(); - for (int i = 0; i < remain; i++) { - op(idx + i, ptr0[par[0].offset(idx + i)], - ptr1[par[1].offset(idx + i)]); - } + devfunc void on(uint32_t idx, uint32_t remain) { + idx = idx * packed_size; + if (remain >= packed_size) { + op(idx, par[0].at(idx)); + } else { + auto ptr0 = par[0].ptr(); + for (int i = 0; i < remain; i++) { + op(idx + i, ptr0[par[0].offset(idx + i)]); } } + } - devfunc void next() { - par[0].next(); - par[1].next(); - } - }; - //! specialization for arity == 3 - template - struct OpCallerUniform { - Op op; - PVis par[3]; - static const uint32_t packed_size = PVis::packed_size; - - devfunc void thread_init(uint32_t idx) { - idx = idx * packed_size; - par[0].thread_init(idx); - par[1].thread_init(idx); - par[2].thread_init(idx); - } + devfunc void next() { par[0].next(); } +}; +//! specialization for arity == 2 +template +struct OpCallerUniform { + Op op; + PVis par[2]; + static const uint32_t packed_size = PVis::packed_size; + + devfunc void thread_init(uint32_t idx) { + idx = idx * packed_size; + par[0].thread_init(idx); + par[1].thread_init(idx); + } - devfunc void on(uint32_t idx) { - idx = idx * packed_size; - op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx)); - } + devfunc void on(uint32_t idx) { + idx = idx * packed_size; + op(idx, par[0].at(idx), par[1].at(idx)); + } - devfunc void on(uint32_t idx, uint32_t remain) { - idx = idx * packed_size; - if (remain >= packed_size) { - op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx)); - } else { - auto ptr0 = par[0].ptr(); - auto ptr1 = par[1].ptr(); - auto ptr2 = par[2].ptr(); - for (int i = 0; i < remain; i++) { - op(idx + i, ptr0[par[0].offset(idx + i)], - ptr1[par[1].offset(idx + i)], - ptr2[par[2].offset(idx + i)]); - } + devfunc void on(uint32_t idx, uint32_t remain) { + idx = idx * packed_size; + if (remain >= packed_size) { + op(idx, par[0].at(idx), par[1].at(idx)); + } else { + auto ptr0 = par[0].ptr(); + auto ptr1 = par[1].ptr(); + for (int i = 0; i < remain; i++) { + op(idx + i, ptr0[par[0].offset(idx + i)], + ptr1[par[1].offset(idx + i)]); } } + } - devfunc void next() { - par[0].next(); - par[1].next(); - par[2].next(); - } - }; + devfunc void next() { + par[0].next(); + par[1].next(); + } +}; +//! 
specialization for arity == 3 +template +struct OpCallerUniform { + Op op; + PVis par[3]; + static const uint32_t packed_size = PVis::packed_size; + + devfunc void thread_init(uint32_t idx) { + idx = idx * packed_size; + par[0].thread_init(idx); + par[1].thread_init(idx); + par[2].thread_init(idx); + } - /*! - * \brief call binary (i.e. arity == 2) operator with different param - * visitors - */ - template - struct OpCallerBinary { - Op op; - PVis0 par0; - PVis1 par1; - MEGDNN_STATIC_ASSERT(PVis0::packed_size == PVis1::packed_size, - "vector size mismatch") - - static const uint32_t packed_size = PVis0::packed_size; - - devfunc void thread_init(uint32_t idx) { - idx = idx * packed_size; - par0.thread_init(idx); - par1.thread_init(idx); - } + devfunc void on(uint32_t idx) { + idx = idx * packed_size; + op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx)); + } - devfunc void on(uint32_t idx) { - idx = idx * packed_size; - op(idx, par0.at(idx), par1.at(idx)); + devfunc void on(uint32_t idx, uint32_t remain) { + idx = idx * packed_size; + if (remain >= packed_size) { + op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx)); + } else { + auto ptr0 = par[0].ptr(); + auto ptr1 = par[1].ptr(); + auto ptr2 = par[2].ptr(); + for (int i = 0; i < remain; i++) { + op(idx + i, ptr0[par[0].offset(idx + i)], + ptr1[par[1].offset(idx + i)], ptr2[par[2].offset(idx + i)]); + } } + } - devfunc void next() { - par0.next(); - par1.next(); - } - }; + devfunc void next() { + par[0].next(); + par[1].next(); + par[2].next(); + } +}; + +/*! + * \brief call binary (i.e. arity == 2) operator with different param + * visitors + */ +template +struct OpCallerBinary { + Op op; + PVis0 par0; + PVis1 par1; + MEGDNN_STATIC_ASSERT(PVis0::packed_size == PVis1::packed_size, + "vector size mismatch") + + static const uint32_t packed_size = PVis0::packed_size; + + devfunc void thread_init(uint32_t idx) { + idx = idx * packed_size; + par0.thread_init(idx); + par1.thread_init(idx); + } - /* f}}} */ + devfunc void on(uint32_t idx) { + idx = idx * packed_size; + op(idx, par0.at(idx), par1.at(idx)); + } - template - __global__ void cuda_kern(OpCaller op_caller, uint32_t size) { - uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x, - delta = blockDim.x * gridDim.x; - // each thread works on at most 3 elements; see get_launch_spec - op_caller.thread_init(idx); + devfunc void next() { + par0.next(); + par1.next(); + } +}; + +/* f}}} */ + +template +__global__ void cuda_kern(OpCaller op_caller, uint32_t size) { + uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x, + delta = blockDim.x * gridDim.x; + // each thread works on at most 3 elements; see get_launch_spec + op_caller.thread_init(idx); + if (idx < size) { + op_caller.on(idx); + idx += delta; if (idx < size) { + op_caller.next(); op_caller.on(idx); idx += delta; if (idx < size) { op_caller.next(); op_caller.on(idx); - idx += delta; - if (idx < size) { - op_caller.next(); - op_caller.on(idx); - } } } } +} - template - __global__ void cuda_kern(OpCallerUniform op_caller, - uint32_t size) { - constexpr uint32_t packed_size = PVis::packed_size; - const uint32_t size_packed = DIVUP(size, packed_size); - uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x, - delta = blockDim.x * gridDim.x; +template +__global__ void cuda_kern(OpCallerUniform op_caller, + uint32_t size) { + constexpr uint32_t packed_size = PVis::packed_size; + const uint32_t size_packed = DIVUP(size, packed_size); + uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x, + delta = blockDim.x * gridDim.x; + if (idx < 
size_packed) { + op_caller.on(idx, size - packed_size * idx); + idx += delta; if (idx < size_packed) { op_caller.on(idx, size - packed_size * idx); idx += delta; if (idx < size_packed) { op_caller.on(idx, size - packed_size * idx); - idx += delta; - if (idx < size_packed) { - op_caller.on(idx, size - packed_size * idx); - } } } } +} - //! invoke a user Op passed to run_elemwise - template - class UserOpInvoker; +//! invoke a user Op passed to run_elemwise +template +class UserOpInvoker; - /* f{{{ UserOpInvoker specializations */ +/* f{{{ UserOpInvoker specializations */ - //! run op by promoting all params to same ndim - template - class UserOpInvokerToSameNdim { - const ElemwiseOpParamN &m_param; - cudaStream_t m_stream; - const Op &m_op; +//! run op by promoting all params to same ndim +template +class UserOpInvokerToSameNdim { + const ElemwiseOpParamN& m_param; + cudaStream_t m_stream; + const Op& m_op; - void dispatch0() { - switch(m_param.max_ndim) { + void dispatch0() { + switch (m_param.max_ndim) { #define cb(ndim) \ - case ndim: return dispatch1(); - MEGDNN_FOREACH_TENSOR_NDIM(cb) + case ndim: \ + return dispatch1(); + MEGDNN_FOREACH_TENSOR_NDIM(cb) #undef cb - } - on_bad_ndim(m_param.max_ndim); } + on_bad_ndim(m_param.max_ndim); + } - template - void dispatch1() { - typedef OpCallerUniform< - Op, arity, - ParamElemVisitor> - Caller; - size_t size = m_param.size; - int grid_size, block_size; - void (*fptr)(Caller, uint32_t) = cuda_kern; - get_launch_spec(reinterpret_cast(fptr), size, - &grid_size, &block_size); - - Caller caller; - caller.op = m_op; - for (int i = 0; i < arity; ++i) - caller.par[i].host_init(m_param[i], grid_size, block_size); - (*fptr)<<>>(caller, size); - after_kernel_launch(); - } + template + void dispatch1() { + typedef OpCallerUniform> + Caller; + size_t size = m_param.size; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + get_launch_spec(reinterpret_cast(fptr), size, &grid_size, + &block_size); + + Caller caller; + caller.op = m_op; + for (int i = 0; i < arity; ++i) + caller.par[i].host_init(m_param[i], grid_size, block_size); + (*fptr)<<>>(caller, size); + after_kernel_launch(); + } - public: - UserOpInvokerToSameNdim(const ElemwiseOpParamN& param, - cudaStream_t stream, const Op& op) - : m_param(param), m_stream(stream), m_op(op) { - dispatch0(); - } - }; +public: + UserOpInvokerToSameNdim(const ElemwiseOpParamN& param, + cudaStream_t stream, const Op& op) + : m_param(param), m_stream(stream), m_op(op) { + dispatch0(); + } +}; - template - class UserOpInvokerToSameNdimIByteHelper { - public: - UserOpInvokerToSameNdimIByteHelper(const ElemwiseOpParamN& param, - cudaStream_t stream, const Op& op) - : m_rw_size(param.size), - m_param(param), - m_stream(stream), - m_op(op) { - if (!try_vect_load_store_contiguous() && !try_vect_load_store()) { - dispatch0(); - } +template +class UserOpInvokerToSameNdimIByteHelper { +public: + UserOpInvokerToSameNdimIByteHelper(const ElemwiseOpParamN& param, + cudaStream_t stream, const Op& op) + : m_rw_size(param.size), + m_param(param), + m_stream(stream), + m_op(op) { + if (!try_vect_load_store_contiguous() && !try_vect_load_store()) { + dispatch0(); } + } - private: - const ElemwiseOpParamN& m_param; - size_t m_rw_size; - cudaStream_t m_stream; - const Op& m_op; - using vect_type = typename VectTypeTrait::vect_type; - static const size_t packed_size = VectTypeTrait::packed_size; +private: + const ElemwiseOpParamN& m_param; + size_t m_rw_size; + cudaStream_t m_stream; + const Op& m_op; + using 
vect_type = typename VectTypeTrait::vect_type; + static const size_t packed_size = VectTypeTrait::packed_size; - void dispatch0() { - switch (m_param.max_ndim) { + void dispatch0() { + switch (m_param.max_ndim) { #define cb(ndim) \ case ndim: \ return dispatch1(); - MEGDNN_FOREACH_TENSOR_NDIM(cb) + MEGDNN_FOREACH_TENSOR_NDIM(cb) #undef cb - } - on_bad_ndim(m_param.max_ndim); } + on_bad_ndim(m_param.max_ndim); + } - void dispatch0_vect() { - switch (m_param.max_ndim) { + void dispatch0_vect() { + switch (m_param.max_ndim) { #define cb(ndim) \ case ndim: \ return dispatch1_vect(); - MEGDNN_FOREACH_TENSOR_NDIM(cb) + MEGDNN_FOREACH_TENSOR_NDIM(cb) #undef cb - } - on_bad_ndim(m_param.max_ndim); } + on_bad_ndim(m_param.max_ndim); + } - void dispatch_contiguous() { - typedef ParamVectVisitor<1, ctype, BCAST_OTHER> PVis; - typedef OpCallerUniform Caller; - size_t size = m_rw_size; - int grid_size, block_size; - void (*fptr)(Caller, uint32_t) = cuda_kern; - get_launch_spec(reinterpret_cast(fptr), size, - &grid_size, &block_size); - - Caller caller; - caller.op = m_op; - for (int i = 0; i < arity; ++i) - caller.par[i].host_init(m_param[i], grid_size, block_size); - (*fptr)<<>>(caller, - m_param.size); - after_kernel_launch(); - } + void dispatch_contiguous() { + typedef ParamVectVisitor<1, ctype, BCAST_OTHER> PVis; + typedef OpCallerUniform Caller; + size_t size = m_rw_size; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + get_launch_spec(reinterpret_cast(fptr), size, &grid_size, + &block_size); + + Caller caller; + caller.op = m_op; + for (int i = 0; i < arity; ++i) + caller.par[i].host_init(m_param[i], grid_size, block_size); + (*fptr)<<>>(caller, m_param.size); + after_kernel_launch(); + } - template - void dispatch1() { - typedef ParamElemVisitor PVis; - typedef OpCallerUniform Caller; - size_t size = m_rw_size; - int grid_size, block_size; - void (*fptr)(Caller, uint32_t) = cuda_kern; - get_launch_spec(reinterpret_cast(fptr), size, - &grid_size, &block_size); - Caller caller; - caller.op = m_op; - for (int i = 0; i < arity; ++i) - caller.par[i].host_init(m_param[i], grid_size, block_size); - (*fptr)<<>>(caller, size); - after_kernel_launch(); - } + template + void dispatch1() { + typedef ParamElemVisitor PVis; + typedef OpCallerUniform Caller; + size_t size = m_rw_size; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + get_launch_spec(reinterpret_cast(fptr), size, &grid_size, + &block_size); + Caller caller; + caller.op = m_op; + for (int i = 0; i < arity; ++i) + caller.par[i].host_init(m_param[i], grid_size, block_size); + (*fptr)<<>>(caller, size); + after_kernel_launch(); + } - template - void dispatch1_vect() { - typedef ParamVectVisitor PVis; - typedef OpCallerUniform Caller; - size_t size = m_rw_size; - int grid_size, block_size; - void (*fptr)(Caller, uint32_t) = cuda_kern; - get_launch_spec(reinterpret_cast(fptr), size, - &grid_size, &block_size); - Caller caller; - caller.op = m_op; - for (int i = 0; i < arity; ++i) - caller.par[i].host_init(m_param[i], grid_size, block_size); - (*fptr)<<>>(caller, size); - after_kernel_launch(); - } + template + void dispatch1_vect() { + typedef ParamVectVisitor PVis; + typedef OpCallerUniform Caller; + size_t size = m_rw_size; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + get_launch_spec(reinterpret_cast(fptr), size, &grid_size, + &block_size); + Caller caller; + caller.op = m_op; + for (int i = 0; i < arity; ++i) + caller.par[i].host_init(m_param[i], grid_size, 
block_size); + (*fptr)<<>>(caller, size); + after_kernel_launch(); + } - bool try_vect_load_store() { - auto try_last_contig = [](const TensorLayout& layout) { - return layout.stride[layout.ndim - 1] == 1 && - layout[layout.ndim - 1] % packed_size == 0; - }; - /* - * \NOTE: remove try_scalar() to adapt multi-type tenary op - */ - for (int i = 0; i < arity; ++i) { - if (!try_last_contig(m_param[i].layout)) return false; - } - m_rw_size /= packed_size; - dispatch0_vect(); - return true; + bool try_vect_load_store() { + auto try_last_contig = [](const TensorLayout& layout) { + return layout.stride[layout.ndim - 1] == 1 && + layout[layout.ndim - 1] % packed_size == 0; + }; + /* + * \NOTE: remove try_scalar() to adapt multi-type tenary op + */ + for (int i = 0; i < arity; ++i) { + if (!try_last_contig(m_param[i].layout)) + return false; } + m_rw_size /= packed_size; + dispatch0_vect(); + return true; + } - bool try_vect_load_store_contiguous() { - auto try_contig = [](const TensorLayout& layout) { - return (layout.is_contiguous()); - }; - for (int i = 0; i < arity; ++i) { - if (!try_contig(m_param[i].layout)) - return false; - } - m_rw_size = DIVUP(m_rw_size, packed_size); - dispatch_contiguous(); - return true; + bool try_vect_load_store_contiguous() { + auto try_contig = [](const TensorLayout& layout) { + return (layout.is_contiguous()); + }; + for (int i = 0; i < arity; ++i) { + if (!try_contig(m_param[i].layout)) + return false; } - }; + m_rw_size = DIVUP(m_rw_size, packed_size); + dispatch_contiguous(); + return true; + } +}; #define INST_DT_IBYTE(ctype) \ template \ @@ -848,154 +839,148 @@ namespace elemwise_intl { cudaStream_t stream, const Op& op) \ : Super{param, stream, op} {} \ } - INST_DT_IBYTE(dt_int8); - INST_DT_IBYTE(dt_uint8); - INST_DT_IBYTE(dt_qint8); - INST_DT_IBYTE(dt_quint8); +INST_DT_IBYTE(dt_int8); +INST_DT_IBYTE(dt_uint8); +INST_DT_IBYTE(dt_qint8); +INST_DT_IBYTE(dt_quint8); #undef INST_DT_IBYTE - //! implement general case by UserOpInvokerToSameNdim - template - class UserOpInvoker: public UserOpInvokerToSameNdim { - public: - UserOpInvoker( - const ElemwiseOpParamN ¶m, - cudaStream_t stream, - const Op &op): - UserOpInvokerToSameNdim(param, stream, op) - { - } - }; - - //! specialization for arity == 0 - template - class UserOpInvoker { - public: - UserOpInvoker( - const ElemwiseOpParamN<0> ¶m, - cudaStream_t stream, - const Op &op) { - size_t size = param.size; - typedef OpCallerNull Caller; - Caller caller; - caller.op = op; - int grid_size, block_size; - void (*fptr)(Caller, uint32_t) = cuda_kern; - get_launch_spec(reinterpret_cast(fptr), size, - &grid_size, &block_size); - (*fptr) <<< grid_size, block_size, 0, stream >>> (caller, size); - after_kernel_launch(); - } - }; +//! implement general case by UserOpInvokerToSameNdim +template +class UserOpInvoker : public UserOpInvokerToSameNdim { +public: + UserOpInvoker(const ElemwiseOpParamN& param, cudaStream_t stream, + const Op& op) + : UserOpInvokerToSameNdim(param, stream, op) {} +}; + +//! 
specialization for arity == 0 +template +class UserOpInvoker { +public: + UserOpInvoker(const ElemwiseOpParamN<0>& param, cudaStream_t stream, + const Op& op) { + size_t size = param.size; + typedef OpCallerNull Caller; + Caller caller; + caller.op = op; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + get_launch_spec(reinterpret_cast(fptr), size, &grid_size, + &block_size); + (*fptr)<<>>(caller, size); + after_kernel_launch(); + } +}; #define DEFINE_BRDCAST_DISPATCH_RECEIVERS(_cb_header, _cb_dispatch, _stride) \ - _cb_header(1) { \ - const ptrdiff_t *stride = _stride; \ - if (!stride[0]) { \ - return _cb_dispatch(1, BCAST_FULL); \ - } \ - _cb_dispatch(1, BCAST_OTHER); \ - } \ - _cb_header(2) { \ - const ptrdiff_t *stride = _stride; \ - if (!stride[0] && stride[1]) { \ - return _cb_dispatch(2, BCAST_10); \ - } \ - if (stride[0] && !stride[1]) { \ - return _cb_dispatch(2, BCAST_01); \ - } \ - _cb_dispatch(2, BCAST_OTHER); \ - } \ - _cb_header(3) { \ - const ptrdiff_t *stride = _stride; \ - if (!stride[0] && stride[1] && !stride[2]) { \ - return _cb_dispatch(3, BCAST_101); \ - } \ - _cb_dispatch(3, BCAST_OTHER); \ + _cb_header(1) { \ + const ptrdiff_t* stride = _stride; \ + if (!stride[0]) { \ + return _cb_dispatch(1, BCAST_FULL); \ + } \ + _cb_dispatch(1, BCAST_OTHER); \ + } \ + _cb_header(2) { \ + const ptrdiff_t* stride = _stride; \ + if (!stride[0] && stride[1]) { \ + return _cb_dispatch(2, BCAST_10); \ + } \ + if (stride[0] && !stride[1]) { \ + return _cb_dispatch(2, BCAST_01); \ + } \ + _cb_dispatch(2, BCAST_OTHER); \ + } \ + _cb_header(3) { \ + const ptrdiff_t* stride = _stride; \ + if (!stride[0] && stride[1] && !stride[2]) { \ + return _cb_dispatch(3, BCAST_101); \ + } \ + _cb_dispatch(3, BCAST_OTHER); \ } - //! specialization for binary opr - template - class UserOpInvoker { - bool m_invoked; - const ElemwiseOpParamN<2> &m_param; - cudaStream_t m_stream; - const Op &m_op; - - void fallback() { - megdnn_assert(!m_invoked); - UserOpInvokerToSameNdim(m_param, m_stream, m_op); - m_invoked = true; - } +//! 
specialization for binary opr +template +class UserOpInvoker { + bool m_invoked; + const ElemwiseOpParamN<2>& m_param; + cudaStream_t m_stream; + const Op& m_op; + + void fallback() { + megdnn_assert(!m_invoked); + UserOpInvokerToSameNdim(m_param, m_stream, m_op); + m_invoked = true; + } - void dispatch0() { - switch(m_param[0].layout.ndim) { + void dispatch0() { + switch (m_param[0].layout.ndim) { #define cb(ndim) \ - case ndim: return dispatch1_##ndim(); - MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) + case ndim: \ + return dispatch1_##ndim(); + MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) #undef cb - } - fallback(); } + fallback(); + } #define cb_header(ndim) void dispatch1_##ndim() #define cb_dispatch(ndim, brdcast_mask) \ - dispatch2 >() -DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, - m_param[0].layout.stride) + dispatch2>() + DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, + m_param[0].layout.stride) #undef cb_header #undef cb_dispatch - - template - void dispatch2() { - switch(m_param[1].layout.ndim) { + template + void dispatch2() { + switch (m_param[1].layout.ndim) { #define cb(ndim) \ - case ndim: return dispatch3_##ndim(); - MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) + case ndim: \ + return dispatch3_##ndim(); + MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) #undef cb - } - fallback(); } + fallback(); + } -#define cb_header(ndim) \ - template \ +#define cb_header(ndim) \ + template \ void dispatch3_##ndim() #define cb_dispatch(ndim, brdcast_mask) \ - do_run >() -DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, - m_param[1].layout.stride) + do_run>() + DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, + m_param[1].layout.stride) #undef cb_header #undef cb_dispatch - template - void do_run() { - megdnn_assert(!m_invoked); - m_invoked = true; - typedef OpCallerBinary Caller; - int grid_size, block_size; - void (*fptr)(Caller, uint32_t) = cuda_kern; - size_t size = m_param.size; - get_launch_spec(reinterpret_cast(fptr), - size, &grid_size, &block_size); - Caller caller; - caller.op = m_op; - caller.par0.host_init(m_param[0], grid_size, block_size); - caller.par1.host_init(m_param[1], grid_size, block_size); - (*fptr) <<< grid_size, block_size, 0, m_stream >>> (caller, size); - after_kernel_launch(); - } + template + void do_run() { + megdnn_assert(!m_invoked); + m_invoked = true; + typedef OpCallerBinary Caller; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + size_t size = m_param.size; + get_launch_spec(reinterpret_cast(fptr), size, &grid_size, + &block_size); + Caller caller; + caller.op = m_op; + caller.par0.host_init(m_param[0], grid_size, block_size); + caller.par1.host_init(m_param[1], grid_size, block_size); + (*fptr)<<>>(caller, size); + after_kernel_launch(); + } - public: - UserOpInvoker(const ElemwiseOpParamN<2> ¶m, cudaStream_t stream, - const Op &op): - m_param(param), m_stream(stream), m_op(op) - { - m_invoked = false; - dispatch0(); - megdnn_assert(m_invoked); - } - }; +public: + UserOpInvoker(const ElemwiseOpParamN<2>& param, cudaStream_t stream, + const Op& op) + : m_param(param), m_stream(stream), m_op(op) { + m_invoked = false; + dispatch0(); + megdnn_assert(m_invoked); + } +}; #define DEFINE_VECT_BRDCAST_DISPATCH_RECEIVERS(_cb_header, _cb_dispatch, \ _stride) \ @@ -1008,142 +993,141 @@ DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, _cb_dispatch(4, BCAST_OTHER); \ } - template - class UserOpInvokerBinaryIByteHelper { - private: - bool m_invoked; - size_t m_rw_size; - const ElemwiseOpParamN<2>& m_param; - cudaStream_t 
m_stream; - const Op& m_op; - using vect_type = typename VectTypeTrait::vect_type; - static const size_t packed_size = VectTypeTrait::packed_size; - bool try_vect_load_store() { - auto try_last_contig_or_scalar = [](const TensorLayout& layout) { - return (layout.stride[layout.ndim - 1] == 1 && - layout[layout.ndim - 1] % packed_size == 0) || - (layout.ndim == 1 && layout.stride[0] == 0); - }; - for (int i = 0; i < 2; ++i) { - if (!try_last_contig_or_scalar(m_param[i].layout)) - return false; - } - m_rw_size /= packed_size; - dispatch0_vect(); - return true; +template +class UserOpInvokerBinaryIByteHelper { +private: + bool m_invoked; + size_t m_rw_size; + const ElemwiseOpParamN<2>& m_param; + cudaStream_t m_stream; + const Op& m_op; + using vect_type = typename VectTypeTrait::vect_type; + static const size_t packed_size = VectTypeTrait::packed_size; + bool try_vect_load_store() { + auto try_last_contig_or_scalar = [](const TensorLayout& layout) { + return (layout.stride[layout.ndim - 1] == 1 && + layout[layout.ndim - 1] % packed_size == 0) || + (layout.ndim == 1 && layout.stride[0] == 0); + }; + for (int i = 0; i < 2; ++i) { + if (!try_last_contig_or_scalar(m_param[i].layout)) + return false; } + m_rw_size /= packed_size; + dispatch0_vect(); + return true; + } - bool try_vect_load_store_contiguous() { - auto try_contig = [](const TensorLayout& layout) { - return (layout.is_contiguous()); - }; - for (int i = 0; i < 2; ++i) { - if (!try_contig(m_param[i].layout)) - return false; - } - m_rw_size = DIVUP(m_rw_size, packed_size); - dispatch_contiguous(); - return true; + bool try_vect_load_store_contiguous() { + auto try_contig = [](const TensorLayout& layout) { + return (layout.is_contiguous()); + }; + for (int i = 0; i < 2; ++i) { + if (!try_contig(m_param[i].layout)) + return false; } + m_rw_size = DIVUP(m_rw_size, packed_size); + dispatch_contiguous(); + return true; + } - void dispatch0() { - switch (m_param[0].layout.ndim) { + void dispatch0() { + switch (m_param[0].layout.ndim) { #define cb(ndim) \ case ndim: \ return dispatch1_##ndim(); - MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) + MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) #undef cb - } - fallback(); } + fallback(); + } - void dispatch0_vect() { - switch (m_param[0].layout.ndim) { + void dispatch0_vect() { + switch (m_param[0].layout.ndim) { #define cb(ndim) \ case ndim: \ return dispatch1_vect_##ndim(); - MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) + MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) #undef cb - case 4: - return dispatch1_vect_4(); - } - fallback(); + case 4: + return dispatch1_vect_4(); } + fallback(); + } - void dispatch_contiguous() { - m_invoked = true; - typedef ParamVectVisitor<1, ctype, BCAST_OTHER> PVis; - typedef OpCallerUniform Caller; - size_t size = m_rw_size; - int grid_size, block_size; - void (*fptr)(Caller, uint32_t) = cuda_kern; - get_launch_spec(reinterpret_cast(fptr), size, - &grid_size, &block_size); - - Caller caller; - caller.op = m_op; - for (int i = 0; i < 2; ++i) - caller.par[i].host_init(m_param[i], grid_size, block_size); - (*fptr)<<>>(caller, - m_param.size); - after_kernel_launch(); - } + void dispatch_contiguous() { + m_invoked = true; + typedef ParamVectVisitor<1, ctype, BCAST_OTHER> PVis; + typedef OpCallerUniform Caller; + size_t size = m_rw_size; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + get_launch_spec(reinterpret_cast(fptr), size, &grid_size, + &block_size); + + Caller caller; + caller.op = m_op; + for (int i = 0; i < 2; ++i) + caller.par[i].host_init(m_param[i], grid_size, 
block_size); + (*fptr)<<>>(caller, m_param.size); + after_kernel_launch(); + } - void fallback() { - megdnn_assert(!m_invoked); - UserOpInvokerToSameNdim(m_param, m_stream, m_op); - m_invoked = true; - } + void fallback() { + megdnn_assert(!m_invoked); + UserOpInvokerToSameNdim(m_param, m_stream, m_op); + m_invoked = true; + } #define cb_header(ndim) void dispatch1_##ndim() #define cb_dispatch(ndim, brdcast_mask) \ dispatch2>() - DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, - m_param[0].layout.stride) + DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, + m_param[0].layout.stride) #undef cb_header #undef cb_dispatch #define cb_header(ndim) void dispatch1_vect_##ndim() #define cb_dispatch(ndim, brdcast_mask) \ dispatch2_vect>() - DEFINE_VECT_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, - m_param[0].layout.stride) + DEFINE_VECT_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, + m_param[0].layout.stride) #undef cb_header #undef cb_dispatch - template - void dispatch2() { - switch (m_param[1].layout.ndim) { + template + void dispatch2() { + switch (m_param[1].layout.ndim) { #define cb(ndim) \ case ndim: \ return dispatch3_##ndim(); - MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) + MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) #undef cb - } - fallback(); } + fallback(); + } - template - void dispatch2_vect() { - switch (m_param[1].layout.ndim) { + template + void dispatch2_vect() { + switch (m_param[1].layout.ndim) { #define cb(ndim) \ case ndim: \ return dispatch3_vect_##ndim(); - MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) + MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) #undef cb - case 4: - return dispatch3_vect_4(); - } - fallback(); + case 4: + return dispatch3_vect_4(); } + fallback(); + } #define cb_header(ndim) \ template \ void dispatch3_##ndim() #define cb_dispatch(ndim, brdcast_mask) \ do_run>() - DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, - m_param[1].layout.stride) + DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, + m_param[1].layout.stride) #undef cb_header #undef cb_dispatch @@ -1152,43 +1136,43 @@ DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, void dispatch3_vect_##ndim() #define cb_dispatch(ndim, brdcast_mask) \ do_run>() - DEFINE_VECT_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, - m_param[1].layout.stride) + DEFINE_VECT_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, + m_param[1].layout.stride) #undef cb_header #undef cb_dispatch - template - void do_run() { - megdnn_assert(!m_invoked); - m_invoked = true; - typedef OpCallerBinary Caller; - int grid_size, block_size; - void (*fptr)(Caller, uint32_t) = cuda_kern; - size_t size = m_rw_size; - get_launch_spec(reinterpret_cast(fptr), size, - &grid_size, &block_size); - Caller caller; - caller.op = m_op; - caller.par0.host_init(m_param[0], grid_size, block_size); - caller.par1.host_init(m_param[1], grid_size, block_size); - (*fptr)<<>>(caller, size); - after_kernel_launch(); - } + template + void do_run() { + megdnn_assert(!m_invoked); + m_invoked = true; + typedef OpCallerBinary Caller; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + size_t size = m_rw_size; + get_launch_spec(reinterpret_cast(fptr), size, &grid_size, + &block_size); + Caller caller; + caller.op = m_op; + caller.par0.host_init(m_param[0], grid_size, block_size); + caller.par1.host_init(m_param[1], grid_size, block_size); + (*fptr)<<>>(caller, size); + after_kernel_launch(); + } - public: - UserOpInvokerBinaryIByteHelper(const ElemwiseOpParamN<2>& param, - cudaStream_t stream, const Op& op) - : 
m_rw_size(param.size), - m_param(param), - m_stream(stream), - m_op(op) { - m_invoked = false; - if (!try_vect_load_store_contiguous() && !try_vect_load_store()) { - dispatch0(); - } - megdnn_assert(m_invoked); +public: + UserOpInvokerBinaryIByteHelper(const ElemwiseOpParamN<2>& param, + cudaStream_t stream, const Op& op) + : m_rw_size(param.size), + m_param(param), + m_stream(stream), + m_op(op) { + m_invoked = false; + if (!try_vect_load_store_contiguous() && !try_vect_load_store()) { + dispatch0(); } - }; + megdnn_assert(m_invoked); + } +}; #define INST_DT_IBYTE(ctype) \ template \ @@ -1201,20 +1185,20 @@ DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, const Op& op) \ : Super{param, stream, op} {} \ } - INST_DT_IBYTE(dt_int8); - INST_DT_IBYTE(dt_uint8); - INST_DT_IBYTE(dt_qint8); - INST_DT_IBYTE(dt_quint8); +INST_DT_IBYTE(dt_int8); +INST_DT_IBYTE(dt_uint8); +INST_DT_IBYTE(dt_qint8); +INST_DT_IBYTE(dt_quint8); #undef INST_DT_IBYTE #endif #undef DEFINE_BRDCAST_DISPATCH_RECEIVERS #undef DEFINE_VECT_BRDCAST_DISPATCH_RECEIVERS - /* f}}} */ +/* f}}} */ #undef devfunc -} // namespace elemwise_intl +} // namespace elemwise_intl /*! * \brief general element-wise kernel launcher @@ -1236,10 +1220,9 @@ void run_elemwise(const ElemwiseOpParamN& param, cudaStream_t stream, const Op& op = Op()); #if MEGDNN_CC_CUDA -template -void run_elemwise( - const ElemwiseOpParamN ¶m, cudaStream_t stream, - const Op &op) { +template +void run_elemwise(const ElemwiseOpParamN& param, cudaStream_t stream, + const Op& op) { param.assert_initialized(); elemwise_intl::UserOpInvoker(param, stream, op); } @@ -1248,15 +1231,13 @@ void run_elemwise( * \brief explicit instantialization of run_elemwise for given template params; * used in .cu files, so corresponding run_elemwise can be called from .cpp */ -#define INST_RUN_ELEMWISE(Op, ctype, arity) \ -template void run_elemwise( \ - const ElemwiseOpParamN&, cudaStream_t, const Op&) +#define INST_RUN_ELEMWISE(Op, ctype, arity) \ + template void run_elemwise( \ + const ElemwiseOpParamN&, cudaStream_t, const Op&) #endif -} // namespace cuda -} // namespace megdnn +} // namespace cuda +} // namespace megdnn // vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} - - diff --git a/dnn/test/cuda/elemwise.cpp b/dnn/test/cuda/elemwise.cpp index 1ae79d3b0a669cafa0b01bd6078079e5dc67a9cf..d74afa37dd6ab46b88609e667f64488e9e218de5 100644 --- a/dnn/test/cuda/elemwise.cpp +++ b/dnn/test/cuda/elemwise.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
*/ #include "test/common/elemwise.h" @@ -26,66 +27,61 @@ using namespace test; #define cudnn_check(e) megdnn_assert((e) == CUDNN_STATUS_SUCCESS) namespace { - __attribute__((unused)) - cudnnTensorDescriptor_t make_cudnn_tensor_desc(const TensorLayout &ly) { - megdnn_assert(ly.ndim && ly.ndim <= 4 && ly.is_contiguous()); - int dim[4] = {1, 1, 1, 1}, stride[4] = {1, 1, 1, 1}; - for (size_t i = 0; i < ly.ndim; ++ i) { - dim[i] = ly.shape[i]; - stride[i] = ly.stride[i]; - } - cudnnTensorDescriptor_t ret; - cudnn_check(cudnnCreateTensorDescriptor(&ret)); - // cudnn requires tensors to be at-least 4D - cudnn_check(cudnnSetTensor4dDescriptorEx(ret, - CUDNN_DATA_FLOAT, - dim[0], dim[1], dim[2], dim[3], - stride[0], stride[1], stride[2], stride[3])); - - return ret; +__attribute__((unused)) cudnnTensorDescriptor_t make_cudnn_tensor_desc( + const TensorLayout& ly) { + megdnn_assert(ly.ndim && ly.ndim <= 4 && ly.is_contiguous()); + int dim[4] = {1, 1, 1, 1}, stride[4] = {1, 1, 1, 1}; + for (size_t i = 0; i < ly.ndim; ++i) { + dim[i] = ly.shape[i]; + stride[i] = ly.stride[i]; } + cudnnTensorDescriptor_t ret; + cudnn_check(cudnnCreateTensorDescriptor(&ret)); + // cudnn requires tensors to be at-least 4D + cudnn_check(cudnnSetTensor4dDescriptorEx(ret, CUDNN_DATA_FLOAT, dim[0], + dim[1], dim[2], dim[3], stride[0], + stride[1], stride[2], stride[3])); + + return ret; +} - void run_tensor_add( - Handle *handle_cuda, - const TensorND &a, const TensorND &b, - const TensorND &c) { +void run_tensor_add(Handle* handle_cuda, const TensorND& a, const TensorND& b, + const TensorND& c) { #if 1 - cudnnHandle_t cudnn_handle; - cudnn_check(cudnnCreate(&cudnn_handle)); - cuda_check(cudaDeviceSynchronize()); - cuda_check(cudaMemcpy(c.raw_ptr, a.raw_ptr, a.layout.span().dist_byte(), - cudaMemcpyDeviceToDevice)); - - auto bdesc = make_cudnn_tensor_desc(b.layout), - cdesc = make_cudnn_tensor_desc(c.layout); - - float alpha = 1, beta = 1; - cudaProfilerStart(); - cudnn_check(cudnnAddTensor(cudnn_handle, - &alpha, bdesc, b.raw_ptr, - &beta, cdesc, c.raw_ptr)); - cudaProfilerStop(); - - cudnn_check(cudnnDestroyTensorDescriptor(cdesc)); - cudnn_check(cudnnDestroyTensorDescriptor(bdesc)); - cudnn_check(cudnnDestroy(cudnn_handle)); - - cuda_check(cudaMemset(c.raw_ptr, 0, c.layout.span().dist_byte())); - cuda_check(cudaDeviceSynchronize()); + cudnnHandle_t cudnn_handle; + cudnn_check(cudnnCreate(&cudnn_handle)); + cuda_check(cudaDeviceSynchronize()); + cuda_check(cudaMemcpy(c.raw_ptr, a.raw_ptr, a.layout.span().dist_byte(), + cudaMemcpyDeviceToDevice)); + + auto bdesc = make_cudnn_tensor_desc(b.layout), + cdesc = make_cudnn_tensor_desc(c.layout); + + float alpha = 1, beta = 1; + cudaProfilerStart(); + cudnn_check(cudnnAddTensor(cudnn_handle, &alpha, bdesc, b.raw_ptr, &beta, + cdesc, c.raw_ptr)); + cudaProfilerStop(); + + cudnn_check(cudnnDestroyTensorDescriptor(cdesc)); + cudnn_check(cudnnDestroyTensorDescriptor(bdesc)); + cudnn_check(cudnnDestroy(cudnn_handle)); + + cuda_check(cudaMemset(c.raw_ptr, 0, c.layout.span().dist_byte())); + cuda_check(cudaDeviceSynchronize()); #endif - auto opr = handle_cuda->create_operator(); - opr->param().mode = ElemwiseForward::Mode::ADD; - cudaProfilerStart(); - opr->exec({a, b}, c); - cudaProfilerStop(); - } + auto opr = handle_cuda->create_operator(); + opr->param().mode = ElemwiseForward::Mode::ADD; + cudaProfilerStart(); + opr->exec({a, b}, c); + cudaProfilerStop(); +} -} // anonymous namespace +} // anonymous namespace -template -class CUDA_ELEMWISE: public CUDA { -}; +template +class 
CUDA_ELEMWISE : public CUDA {}; TYPED_TEST_CASE(CUDA_ELEMWISE, elemwise::test_types); TYPED_TEST(CUDA_ELEMWISE, run) { elemwise::run_test(this->handle_cuda()); @@ -275,18 +271,17 @@ TEST_F(CUDA, ELEMWISE_BFLOAT16) { //! the memory of this test case is too large, sometimes will fail on tx1 TEST_F(CUDA, ELEMWISE_BENCHMARK_DENSE) { - constexpr size_t A = 256 * 1024 * 64, - S0 = 16, S1 = 256, S2 = 64, S3 = 64; + constexpr size_t A = 256 * 1024 * 64, S0 = 16, S1 = 256, S2 = 64, S3 = 64; static_assert(A == S0 * S1 * S2 * S3, "bad value"); - SyncedTensor<> - t0(handle_cuda(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()}), - t1(handle_cuda(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()}); + SyncedTensor<> t0(handle_cuda(), + {TensorShape{S0, S1, S2, S3}, dtype::Float32()}), + t1(handle_cuda(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()}); UniformFloatRNG rng{-2.f, 2.f}; rng.gen(t0.tensornd_host()); - run_tensor_add(handle_cuda(), - t0.tensornd_dev(), t0.tensornd_dev(), t1.tensornd_dev()); + run_tensor_add(handle_cuda(), t0.tensornd_dev(), t0.tensornd_dev(), + t1.tensornd_dev()); auto p0 = t0.ptr_host(), p1 = t1.ptr_host(); - for (size_t i = 0; i < A; ++ i) { + for (size_t i = 0; i < A; ++i) { ASSERT_EQ(p0[i] + p0[i], p1[i]) << "at index " << i << "/" << A; } } @@ -294,19 +289,19 @@ TEST_F(CUDA, ELEMWISE_BENCHMARK_DENSE) { #if MEGDNN_WITH_BENCHMARK TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_101) { constexpr size_t A = 511, B = 509, C0 = 23, C1 = 23, C = C0 * C1; - SyncedTensor<> - t0(handle_cuda(), {TensorShape{A, B, C0, C1}, dtype::Float32()}), - t1(handle_cuda(), {TensorShape{1, B, 1, 1}, dtype::Float32()}), - t2(handle_cuda(), {TensorShape{A, B, C0, C1}, dtype::Float32()}); + SyncedTensor<> t0(handle_cuda(), + {TensorShape{A, B, C0, C1}, dtype::Float32()}), + t1(handle_cuda(), {TensorShape{1, B, 1, 1}, dtype::Float32()}), + t2(handle_cuda(), {TensorShape{A, B, C0, C1}, dtype::Float32()}); UniformFloatRNG rng{-2.f, 2.f}; rng.gen(t0.tensornd_host()); rng.gen(t1.tensornd_host()); - run_tensor_add(handle_cuda(), - t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev()); + run_tensor_add(handle_cuda(), t0.tensornd_dev(), t1.tensornd_dev(), + t2.tensornd_dev()); auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host(); - for (size_t i = 0; i < A; ++ i) { - for (size_t j = 0; j < B; ++ j) { - for (size_t k = 0; k < C; ++ k) { + for (size_t i = 0; i < A; ++i) { + for (size_t j = 0; j < B; ++j) { + for (size_t k = 0; k < C; ++k) { auto off = i * B * C + j * C + k; ASSERT_EQ(p0[off] + p1[j], p2[off]); } @@ -317,16 +312,16 @@ TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_101) { TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_10) { constexpr size_t A = 11583, B = 11587; SyncedTensor<> t0(handle_cuda(), {TensorShape{A, B}, dtype::Float32()}), - t1(handle_cuda(), {TensorShape{1, B}, dtype::Float32()}), - t2(handle_cuda(), {TensorShape{A, B}, dtype::Float32()}); + t1(handle_cuda(), {TensorShape{1, B}, dtype::Float32()}), + t2(handle_cuda(), {TensorShape{A, B}, dtype::Float32()}); UniformFloatRNG rng{-2.f, 2.f}; rng.gen(t0.tensornd_host()); rng.gen(t1.tensornd_host()); - run_tensor_add(handle_cuda(), - t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev()); + run_tensor_add(handle_cuda(), t0.tensornd_dev(), t1.tensornd_dev(), + t2.tensornd_dev()); auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host(); - for (size_t i = 0; i < A; ++ i) { - for (size_t j = 0; j < B; ++ j) { + for (size_t i = 0; i < A; ++i) { + for (size_t j = 0; j < B; ++j) { auto off = i * B + j; ASSERT_EQ(p0[off] + p1[j], p2[off]); } @@ 
-336,16 +331,16 @@ TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_10) { TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_01) { constexpr size_t A = 11583, B = 11587; SyncedTensor<> t0(handle_cuda(), {TensorShape{1, A, B}, dtype::Float32()}), - t1(handle_cuda(), {TensorShape{1, A, 1}, dtype::Float32()}), - t2(handle_cuda(), {TensorShape{1, A, B}, dtype::Float32()}); + t1(handle_cuda(), {TensorShape{1, A, 1}, dtype::Float32()}), + t2(handle_cuda(), {TensorShape{1, A, B}, dtype::Float32()}); UniformFloatRNG rng{-2.f, 2.f}; rng.gen(t0.tensornd_host()); rng.gen(t1.tensornd_host()); - run_tensor_add(handle_cuda(), - t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev()); + run_tensor_add(handle_cuda(), t0.tensornd_dev(), t1.tensornd_dev(), + t2.tensornd_dev()); auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host(); - for (size_t i = 0; i < A; ++ i) { - for (size_t j = 0; j < B; ++ j) { + for (size_t i = 0; i < A; ++i) { + for (size_t j = 0; j < B; ++j) { auto off = i * B + j; ASSERT_EQ(p0[off] + p1[i], p2[off]); } @@ -361,8 +356,9 @@ TEST_F(CUDA, BENCHMARK_ELEMWISE_IBYTE) { .set_param(Mode::FUSE_ADD_RELU) .set_dtype(0, dtype::Int8()) .set_dtype(1, dtype::Int8()); - auto time = bencher.execs({{N * C * H * W + 1}, {N * C * H * W + 1}, {}}) / - nr_times; + auto time = + bencher.execs({{N * C * H * W + 1}, {N * C * H * W + 1}, {}}) / + nr_times; printf("time = %.2fms, bandwidth = %.2fGB/s\n", time, (3.0 * (N * C * H * W + 1)) / (time * 1e6)); time = bencher.execs({{N, C / 4, H, W, 4}, {N, C / 4, H, W, 4}, {}}) / @@ -395,7 +391,6 @@ TEST_F(CUDA, BENCHMARK_ELEMWISE_IBYTE) { nr_times; printf("time = %.2fms, bandwidth = %.2fGB/s\n", time, (C + 2.0 * N * C * H * W) / (time * 1e6)); - }; run_bench(256, 256, 56, 56); } @@ -428,4 +423,3 @@ TEST_F(CUDA, BENCHMARK_ELEMWISE_MIN_MAX) { #endif // vim: syntax=cpp.doxygen -
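Reviewer note, not part of the patch: the trailing comment block in `elemwise_helper.cuh` states that `run_elemwise` is explicitly instantiated in `.cu` files through `INST_RUN_ELEMWISE` so the corresponding symbol can be called from `.cpp`. Below is a minimal sketch of that usage for orientation only. `HalveOp` and `halve_fwd` are hypothetical names introduced here; only `run_elemwise`, `ElemwiseOpParamN`, `dt_float32` and `INST_RUN_ELEMWISE` come from the code touched by this diff, and the `operator()` signature follows how `OpCallerUniform<Op, 1, PVis>` invokes `op(idx, par[0].at(idx))` for arity == 1.

// --- illustrative sketch only; not part of this patch ---
// hypothetical .cu file showing the intended use of run_elemwise /
// INST_RUN_ELEMWISE declared in elemwise_helper.cuh
#include "src/cuda/elemwise_helper.cuh"

namespace megdnn {
namespace cuda {

//! hypothetical unary op: writes dst[i] = src[i] * 0.5f
struct HalveOp {
    dt_float32* dst;  //!< contiguous output buffer supplied by the caller

    // signature matches OpCallerUniform<Op, 1, PVis>::on(), which passes the
    // flat logical index together with the element loaded by the visitor
    __device__ void operator()(uint32_t idx, dt_float32 src) {
        dst[idx] = src * 0.5f;
    }
};

// emitted from the .cu file so a .cpp caller can link against
// run_elemwise<HalveOp, dt_float32, 1>
INST_RUN_ELEMWISE(HalveOp, dt_float32, 1);

// hypothetical launcher: param describes the single input tensor; broadcast
// handling, ndim dispatch and launch configuration all happen inside
// run_elemwise via UserOpInvoker / get_launch_spec
void halve_fwd(const ElemwiseOpParamN<1>& param, dt_float32* dst,
               cudaStream_t stream) {
    HalveOp op;
    op.dst = dst;
    run_elemwise<HalveOp, dt_float32, 1>(param, stream, op);
}

}  // namespace cuda
}  // namespace megdnn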