Commit b55942a9 authored by Megvii Engine Team

feat(dnn/naive/norm, dnn/cuda/norm, dnn/test/norm): add norm dnn opr,

fwd only

GitOrigin-RevId: 989474168d45c55ab9a45983b93e54cd3526e191
Parent 7a7af8d7
......@@ -1475,6 +1475,35 @@ protected:
using LAMB = LAMBUpdate;
class NormBase : public OperatorBase {
DEF_OPR_PARAM(Norm); // bind the Norm param struct generated from the Python param definitions
DEF_OPR_IMPL(NormBase, OperatorBase, 1, 1); // constructor and static members
public:
virtual void deduce_layout(const TensorLayout& src, TensorLayout& dst) = 0;
virtual size_t get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst) = 0;
protected:
void check_exec(
const TensorLayout& src, const TensorLayout& dst,
size_t workspace_in_bytes);
};
class NormForward : public NormBase {
DEF_OPR_IMPL(NormForward, NormBase, 1, 1);
using Mode = Param::Mode;
public:
virtual void exec(
_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace) = 0;
virtual void deduce_layout(const TensorLayout& src, TensorLayout& dst);
virtual size_t get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst) = 0;
};
using Norm = NormForward;
} // namespace megdnn
#include "megdnn/internal/opr_header_epilogue.h"
......
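For context, a minimal usage sketch of the new operator (a hypothetical caller, assuming a valid megdnn::Handle* handle and an input TensorND src; none of these caller-side names come from the diff itself):

auto opr = handle->create_operator<megdnn::Norm>();
opr->param().mode = megdnn::param::Norm::Mode::P_NORM;
opr->param().p = 2.f;  // L2 norm
opr->param().dim = 1;  // reduce along axis 1
megdnn::TensorLayout dst_layout;
opr->deduce_layout(src.layout, dst_layout);
size_t ws_size = opr->get_workspace_in_bytes(src.layout, dst_layout);
// allocate a dst tensor with dst_layout and a workspace of ws_size bytes,
// then run: opr->exec(src, dst, workspace);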
......@@ -1277,3 +1277,11 @@ PADDING_MODES = [Doc('REPLICATE = 0', 'aaaaaa|abcdefgh|hhhhhhh'),
add_fields('bool', Doc('bias_correction', 'whether correct bias'), 'true').
add_fields('bool', Doc('always_adapt', 'apply adaptive lr to 0.0'), 'false')
)
(pdef("Norm").
add_enum('Mode',
Doc('P_NORM=0', 'calculate the p-norm; parameter p is ignored in the other modes'),
Doc('INF_NORM=1', 'infinity norm'),
Doc('NEG_INF_NORM=2', 'negative infinity norm'), name_field="mode").
add_fields('float32', Doc('p', 'the order of the norm'), '2').
add_fields('int32', Doc('dim', 'the dim along which the norm is performed'), '-1'),
)
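For reference, the three modes compute the following standard reductions along the chosen dim (textbook definitions, stated here for clarity rather than taken from the diff); the p = 0 case is additionally special-cased in the kernels as a count of non-zero elements:

\[ \lVert x \rVert_p = \Big( \sum_i \lvert x_i \rvert^p \Big)^{1/p}, \qquad \lVert x \rVert_\infty = \max_i \lvert x_i \rvert, \qquad \lVert x \rVert_{-\infty} = \min_i \lvert x_i \rvert \]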
......@@ -212,7 +212,8 @@ private:
cb(LAMBUpdate) \
cb(LSTMBackward) \
cb(SoftmaxForward) \
cb(SoftmaxBackward)
cb(SoftmaxBackward) \
cb(NormForward)
// clang-format on
/*!
......
#include "megdnn/oprs.h"
#include "src/common/utils.h"
namespace megdnn {
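// deduce_layout keeps every axis of src and collapses the reduced axis to
// extent 1; e.g. src (1, 2, 3, 4) with dim = 1 deduces dst (1, 1, 3, 4).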
void NormForward::deduce_layout(const TensorLayout& src, TensorLayout& dst) {
megdnn_assert(
param().dim > -1 && param().dim < static_cast<dt_int32>(src.ndim),
"dim params must be passed and cannot be -1.");
SmallVector<size_t> shapeList;
for (size_t i = 0; i < src.ndim; ++i) {
if (static_cast<dt_int32>(i) != param().dim) {
shapeList.append(1, static_cast<size_t>(src.shape[i]));
} else {
shapeList.append(1, static_cast<size_t>(1));
}
}
dst = TensorLayout{TensorShape(shapeList), src.dtype};
return;
}
void NormBase::check_exec(
const TensorLayout& src, const TensorLayout& dst, size_t workspace_in_bytes) {
megdnn_assert_eq_dtype(src, dst);
#if !MEGDNN_DISABLE_FLOAT16
megdnn_assert(
src.dtype.enumv() == DTypeEnum::Float16 ||
src.dtype.enumv() == DTypeEnum::Float32,
"Float16 or Float32 is only supported.");
#else
megdnn_assert(
src.dtype.enumv() == DTypeEnum::Float32, "only Float32 is supported.");
#endif
TensorLayout dst_expected;
deduce_layout(src, dst_expected);
megdnn_assert_eq_layout(dst_expected, dst);
auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst);
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes);
}
} // namespace megdnn
......@@ -16,6 +16,7 @@ struct OprTrait {};
static const bool can_deduce_layout = CanDeduceLayout; \
}
DEF(Norm, 2, true, true);
DEF(Padding, 2, false, true);
DEF(PaddingBackward, 2, false, false);
DEF(ConvolutionForward, 3, true, true);
......
......@@ -47,6 +47,7 @@
#include "src/cuda/matrix_mul/opr_impl.h"
#include "src/cuda/max_tensor_diff/opr_impl.h"
#include "src/cuda/mesh_indexing/opr_impl.h"
#include "src/cuda/norm/opr_impl.h"
#include "src/cuda/padding/opr_impl.h"
#include "src/cuda/param_pack/opr_impl.h"
#include "src/cuda/pooling/opr_impl.h"
......@@ -216,6 +217,7 @@ MEGDNN_SPECIALIZE_CREATE_OPERATOR(DropoutForward);
MEGDNN_SPECIALIZE_CREATE_OPERATOR(DropoutBackward);
MEGDNN_SPECIALIZE_CREATE_OPERATOR(SoftmaxForward);
MEGDNN_SPECIALIZE_CREATE_OPERATOR(SoftmaxBackward);
MEGDNN_SPECIALIZE_CREATE_OPERATOR(NormForward);
template <typename Opr>
std::unique_ptr<Opr> HandleImpl::create_operator() {
......
#include "helper.h"
#include "megdnn/dtype.h"
#include "src/cuda/reduce_helper.cuh"
namespace megdnn {
namespace cuda {
using namespace device_reduce;
#define COMMA ,
INST_REDUCE(NormOp<dt_float32 COMMA dt_float32 COMMA dt_float32>, false);
INST_REDUCE(NormOp<dt_float16 COMMA dt_float16 COMMA dt_float16>, false);
INST_REDUCE(NormZeroOp<dt_float32 COMMA dt_float32 COMMA dt_float32>, false);
INST_REDUCE(NormZeroOp<dt_float16 COMMA dt_float16 COMMA dt_float16>, false);
INST_REDUCE(NormOneOp<dt_float32 COMMA dt_float32 COMMA dt_float32>, false);
INST_REDUCE(NormOneOp<dt_float16 COMMA dt_float16 COMMA dt_float16>, false);
INST_REDUCE(NormTwoOp<dt_float32 COMMA dt_float32 COMMA dt_float32>, false);
INST_REDUCE(NormTwoOp<dt_float16 COMMA dt_float16 COMMA dt_float16>, false);
#undef COMMA
} // namespace cuda
} // namespace megdnn
\ No newline at end of file
#pragma once
#include "megdnn/dtype.h"
#if MEGDNN_CC_HOST
#include "megdnn/basic_types.h"
#endif
namespace megdnn {
namespace device_reduce {
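// Every op below follows the contract assumed by run_reduce: INIT is the
// identity element of the reduction, read(i) maps one input element into the
// accumulation domain, apply(lhs, rhs) combines partial results, and
// write(i, val) finalizes one output element. B is the extent of the reduced
// axis.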
template <typename src_ctype, typename dst_ctype, typename wtype_>
struct NormOp;
template <>
struct NormOp<dt_float32, dt_float32, dt_float32> {
typedef dt_float32 wtype;
typedef dt_float32 src_ctype;
typedef dt_float32 dst_ctype;
typedef wtype p_type;
const wtype INIT;
src_ctype* src;
dst_ctype* dst;
const size_t B;
const p_type p;
MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) {
return powf(fabsf(src[idx]), p);
}
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) {
dst[idx] = powf(val, 1.f / p);
}
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
return lhs + rhs;
}
MEGDNN_HOST MEGDNN_DEVICE NormOp(src_ctype* src, dst_ctype* dst, size_t B, p_type p)
: INIT(wtype(0)), src(src), dst(dst), B(B), p(static_cast<wtype>(p)) {}
};
#if !MEGDNN_DISABLE_FLOAT16
template <>
struct NormOp<dt_float16, dt_float16, dt_float16> {
typedef dt_float16 wtype;
typedef dt_float16 src_ctype;
typedef dt_float16 dst_ctype;
const wtype INIT;
src_ctype* src;
dst_ctype* dst;
const size_t B;
const wtype p;
// the half_float API dispatches between host and device implementations.
MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) {
return half_float::detail::pow(half_float::detail::abs(src[idx]), p);
}
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) {
dst[idx] = half_float::detail::pow(val, static_cast<wtype>(1.f) / p);
}
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
return lhs + rhs;
}
MEGDNN_HOST MEGDNN_DEVICE
NormOp(src_ctype* src, dst_ctype* dst, size_t B, dt_float32 p)
: INIT(wtype(0)), src(src), dst(dst), B(B), p(static_cast<wtype>(p)) {}
};
#endif
// TODO: implementing the 0-norm here properly requires a deeper understanding of ReduceOp
template <typename src_ctype, typename dst_ctype, typename wtype_>
struct NormZeroOp;
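// The 0-"norm" counts the elements whose absolute value exceeds epsilon.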
template <>
struct NormZeroOp<dt_float32, dt_float32, dt_float32> {
typedef dt_float32 wtype;
typedef dt_float32 src_ctype;
typedef dt_float32 dst_ctype;
const wtype INIT;
src_ctype* src;
dst_ctype* dst;
const size_t B;
const wtype epsilon = 0.00001f;
MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) {
return fabsf(src[idx] - 0.0f) <= epsilon ? 0.0f : 1.0f;
}
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
return lhs + rhs;
}
MEGDNN_HOST MEGDNN_DEVICE NormZeroOp(src_ctype* src, dst_ctype* dst, size_t B)
: INIT(wtype(0)), src(src), dst(dst), B(B) {}
};
#if !MEGDNN_DISABLE_FLOAT16
template <>
struct NormZeroOp<dt_float16, dt_float16, dt_float16> {
typedef dt_float16 wtype;
typedef dt_float16 src_ctype;
typedef dt_float16 dst_ctype;
const wtype INIT;
src_ctype* src;
dst_ctype* dst;
const size_t B;
const wtype epsilon = half_float::half(0.00001f);
MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) {
return half_float::detail::fabs(src[idx] - half_float::half()) <= epsilon
? half_float::half(0.0f)
: half_float::half(1.0f);
}
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
return lhs + rhs;
}
MEGDNN_HOST MEGDNN_DEVICE NormZeroOp(src_ctype* src, dst_ctype* dst, size_t B)
: INIT(wtype(0)), src(src), dst(dst), B(B) {}
};
#endif
template <typename src_ctype, typename dst_ctype, typename wtype_>
struct NormOneOp;
template <>
struct NormOneOp<dt_float32, dt_float32, dt_float32> {
typedef dt_float32 wtype;
typedef dt_float32 src_ctype;
typedef dt_float32 dst_ctype;
const wtype INIT;
src_ctype* src;
dst_ctype* dst;
const size_t B;
MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return fabsf(src[idx]); }
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
return lhs + rhs;
}
MEGDNN_HOST MEGDNN_DEVICE NormOneOp(src_ctype* src, dst_ctype* dst, size_t B)
: INIT(wtype(0)), src(src), dst(dst), B(B) {}
};
#if !MEGDNN_DISABLE_FLOAT16
template <>
struct NormOneOp<dt_float16, dt_float16, dt_float16> {
typedef dt_float16 wtype;
typedef dt_float16 src_ctype;
typedef dt_float16 dst_ctype;
const wtype INIT;
src_ctype* src;
dst_ctype* dst;
const size_t B;
MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) {
return half_float::detail::abs(src[idx]);
}
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) { dst[idx] = val; }
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
return lhs + rhs;
}
MEGDNN_HOST MEGDNN_DEVICE NormOneOp(src_ctype* src, dst_ctype* dst, size_t B)
: INIT(wtype(0)), src(src), dst(dst), B(B) {}
};
#endif
template <typename src_ctype, typename dst_ctype, typename wtype_>
struct NormTwoOp;
template <>
struct NormTwoOp<dt_float32, dt_float32, dt_float32> {
typedef dt_float32 wtype;
typedef dt_float32 src_ctype;
typedef dt_float32 dst_ctype;
const wtype INIT;
src_ctype* src;
dst_ctype* dst;
const size_t B;
MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx] * src[idx]; }
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) {
dst[idx] = sqrtf(val);
}
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
return lhs + rhs;
}
MEGDNN_HOST MEGDNN_DEVICE NormTwoOp(src_ctype* src, dst_ctype* dst, size_t B)
: INIT(wtype(0)), src(src), dst(dst), B(B) {}
};
#if !MEGDNN_DISABLE_FLOAT16
template <>
struct NormTwoOp<dt_float16, dt_float16, dt_float16> {
typedef dt_float16 wtype;
typedef dt_float16 src_ctype;
typedef dt_float16 dst_ctype;
const wtype INIT;
src_ctype* src;
dst_ctype* dst;
const size_t B;
MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx] * src[idx]; }
MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) {
dst[idx] = half_float::detail::sqrt(val);
}
static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
return lhs + rhs;
}
MEGDNN_HOST MEGDNN_DEVICE NormTwoOp(src_ctype* src, dst_ctype* dst, size_t B)
: INIT(wtype(0)), src(src), dst(dst), B(B) {}
};
#endif
} // namespace device_reduce
} // namespace megdnn
#include "src/cuda/norm/opr_impl.h"
#include "helper.h"
#include "src/common/reduce_helper_device.h"
#include "src/common/utils.h"
#include "src/cuda/handle.h"
#include "src/cuda/reduce_helper.cuh"
#include "src/cuda/utils.h"
namespace megdnn {
namespace cuda {
using namespace device_reduce;
using Mode = Norm::Mode;
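// INF_NORM / NEG_INF_NORM reuse the generic MaxOp / MinOp reduce ops; only
// P_NORM needs the Norm*Op specializations defined above in this directory.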
template <>
void NormForwardImpl::dispatch_mode<Mode::NEG_INF_NORM>(
_megdnn_tensor_inout src, _megdnn_tensor_inout dst, _megdnn_workspace workspace,
size_t A, size_t B, size_t C, cudaStream_t stream) {
#define CASE(dt) \
case DTypeTrait<dt>::enumv: { \
using ctype = DTypeTrait<dt>::ctype; \
auto reduceOp = \
MinOp<ctype, ctype, ctype>(src.ptr<ctype>(), dst.ptr<ctype>(), B); \
run_reduce<MinOp<ctype, ctype, ctype>, false>( \
workspace.ptr<ctype>(), A, B, C, stream, reduceOp); \
break; \
};
switch (src.layout.dtype.enumv()) {
CASE(::megdnn::dtype::Float32)
#if !MEGDNN_DISABLE_FLOAT16
CASE(::megdnn::dtype::Float16)
#endif
default:
megdnn_assert_internal(false);
}
#undef CASE
}
template <>
void NormForwardImpl::dispatch_mode<Mode::INF_NORM>(
_megdnn_tensor_inout src, _megdnn_tensor_inout dst, _megdnn_workspace workspace,
size_t A, size_t B, size_t C, cudaStream_t stream) {
#define CASE(dt) \
case DTypeTrait<dt>::enumv: { \
using ctype = DTypeTrait<dt>::ctype; \
auto reduceOp = \
MaxOp<ctype, ctype, ctype>(src.ptr<ctype>(), dst.ptr<ctype>(), B); \
run_reduce<MaxOp<ctype, ctype, ctype>, false>( \
workspace.ptr<ctype>(), A, B, C, stream, reduceOp); \
break; \
};
switch (src.layout.dtype.enumv()) {
CASE(::megdnn::dtype::Float32)
#if !MEGDNN_DISABLE_FLOAT16
CASE(::megdnn::dtype::Float16)
#endif
default:
megdnn_assert_internal(false);
}
#undef CASE
}
template <>
void NormForwardImpl::dispatch_mode<Mode::P_NORM>(
_megdnn_tensor_inout src, _megdnn_tensor_inout dst, _megdnn_workspace workspace,
size_t A, size_t B, size_t C, cudaStream_t stream) {
typedef dt_float32 p_type;
#define CASE(dt) \
case DTypeTrait<dt>::enumv: { \
using ctype = DTypeTrait<dt>::ctype; \
p_type epsilon = 0.000001f; \
if (fabs(param().p - 0.0f) < epsilon) { \
run_reduce<NormZeroOp<ctype, ctype, ctype>, false>( \
workspace.ptr<ctype>(), A, B, C, stream, \
NormZeroOp<ctype, ctype, ctype>( \
src.ptr<ctype>(), dst.ptr<ctype>(), B)); \
} else if (fabs(param().p - 1.0f) < epsilon) { \
run_reduce<NormOneOp<ctype, ctype, ctype>, false>( \
workspace.ptr<ctype>(), A, B, C, stream, \
NormOneOp<ctype, ctype, ctype>( \
src.ptr<ctype>(), dst.ptr<ctype>(), B)); \
} else if (fabs(param().p - 2.0f) < epsilon) { \
run_reduce<NormTwoOp<ctype, ctype, ctype>, false>( \
workspace.ptr<ctype>(), A, B, C, stream, \
NormTwoOp<ctype, ctype, ctype>( \
src.ptr<ctype>(), dst.ptr<ctype>(), B)); \
} else { \
run_reduce<NormOp<ctype, ctype, ctype>, false>( \
workspace.ptr<ctype>(), A, B, C, stream, \
NormOp<ctype, ctype, ctype>( \
src.ptr<ctype>(), dst.ptr<ctype>(), B, param().p)); \
} \
break; \
};
switch (src.layout.dtype.enumv()) {
CASE(::megdnn::dtype::Float32)
#if !MEGDNN_DISABLE_FLOAT16
CASE(::megdnn::dtype::Float16)
#endif
default:
megdnn_assert_internal(false);
}
#undef CASE
}
void NormForwardImpl::exec(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
check_exec(src.layout, dst.layout, workspace.size);
size_t A, B, C;
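// reduce::get_ABC flattens src into shape (A, B, C), where B is the extent
// of param().dim; the norm then reduces over the middle axis.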
reduce::get_ABC(src.layout, A, B, C, param().dim);
auto stream = cuda_stream(this->handle());
#define CASE(mode) \
case mode: { \
dispatch_mode<mode>(src, dst, workspace, A, B, C, stream); \
break; \
};
switch (param().mode) {
CASE(Mode::P_NORM)
CASE(Mode::INF_NORM)
CASE(Mode::NEG_INF_NORM)
default:
megdnn_assert_internal(false);
}
#undef CASE
return;
}
size_t NormForwardImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst) {
using namespace device_reduce;
size_t A, B, C;
reduce::get_ABC(src, A, B, C, param().dim);
#define cb(dt, op) \
case DTypeTrait<dt>::enumv: { \
using ctype = DTypeTrait<dt>::ctype; \
return get_reduce_workspace_in_bytes<op<ctype, ctype, ctype>>(A, B, C); \
break; \
};
#if !MEGDNN_DISABLE_FLOAT16
#define CASE(mode, op) \
case mode: { \
switch (src.dtype.enumv()) { \
cb(::megdnn::dtype::Float32, op) cb(::megdnn::dtype::Float16, op) default \
: megdnn_assert_internal(false); \
} \
};
#else
#define CASE(mode, op) \
case mode: { \
switch (src.dtype.enumv()) { \
cb(::megdnn::dtype::Float32, op) default : megdnn_assert_internal(false); \
} \
};
#endif
// XXX: the 0/1 norms are dispatched to different Ops, but their workspace
// size is the same as NormOp's
switch (param().mode) {
CASE(Mode::INF_NORM, MaxOp)
CASE(Mode::NEG_INF_NORM, MinOp)
CASE(Mode::P_NORM, NormOp)
default:
megdnn_assert_internal(false);
}
#undef CASE
#undef cb
}
} // namespace cuda
} // namespace megdnn
#pragma once
#include "megdnn/oprs.h"
#include "src/cuda/utils.h"
namespace megdnn {
namespace cuda {
class NormForwardImpl : public NormForward {
public:
using Norm::Norm;
void exec(
_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace) override;
size_t get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst) override;
protected:
template <Mode mode>
void dispatch_mode(
_megdnn_tensor_inout src, _megdnn_tensor_inout dst,
_megdnn_workspace workspace, size_t A, size_t B, size_t C,
cudaStream_t stream);
};
} // namespace cuda
} // namespace megdnn
......@@ -51,6 +51,7 @@
#include "src/naive/matrix_mul/opr_impl.h"
#include "src/naive/max_tensor_diff/opr_impl.h"
#include "src/naive/mesh_indexing/opr_impl.h"
#include "src/naive/norm/opr_impl.h"
#include "src/naive/padding/opr_impl.h"
#include "src/naive/param_pack/opr_impl.h"
#include "src/naive/pooling/opr_impl.h"
......
#pragma once
#include <algorithm>
#include <cmath>
#include <numeric>
#include "megdnn/basic_types.h"
#include "megdnn/dtype.h"
#include "src/common/utils.h"
using namespace megdnn;
/* anonymous namespace */
namespace {
using Mode = Reduce::Mode;
/* Reduce Trait */
template <Mode mode, typename ctype>
struct Trait;
template <typename ctype>
struct Trait<Mode::SUM, ctype> {
static const ctype INIT;
static ctype apply(ctype x, ctype y) { return x + y; }
static ctype visit(ctype x) { return x; }
static ctype write(ctype x, size_t) { return x; }
};
template <typename ctype>
const ctype Trait<Mode::SUM, ctype>::INIT = ctype(0);
template <typename ctype>
struct Trait<Mode::MEAN, ctype> {
static const ctype INIT;
static ctype apply(ctype x, ctype y) { return x + y; }
static ctype visit(ctype x) { return x; }
static ctype write(ctype x, size_t B) { return x / (ctype)B; }
};
template <typename ctype>
const ctype Trait<Mode::MEAN, ctype>::INIT = ctype(0);
template <typename ctype>
struct Trait<Mode::SUM_SQR, ctype> {
static const ctype INIT;
static ctype apply(ctype x, ctype y) { return x + y; }
static ctype visit(ctype x) { return x * x; }
static ctype write(ctype x, size_t) { return x; }
};
template <typename ctype>
const ctype Trait<Mode::SUM_SQR, ctype>::INIT = ctype(0);
template <typename ctype>
struct Trait<Mode::PRODUCT, ctype> {
static const ctype INIT;
static ctype apply(ctype x, ctype y) { return x * y; }
static ctype visit(ctype x) { return x; }
static ctype write(ctype x, size_t) { return x; }
};
template <typename ctype>
const ctype Trait<Mode::PRODUCT, ctype>::INIT = ctype(1);
template <typename ctype>
struct Trait<Mode::MIN, ctype> {
static ctype apply(ctype x, ctype y) { return x < y ? x : y; }
static ctype visit(ctype x) { return x; }
static ctype write(ctype x, size_t) { return x; }
};
template <>
struct Trait<Mode::MIN, dt_float32> {
using ctype = dt_float32;
static ctype apply(ctype x, ctype y) { return (std::isnan(x) || x < y) ? x : y; }
static ctype visit(ctype x) { return x; }
static ctype write(ctype x, size_t) { return x; }
};
template <typename ctype>
struct Trait<Mode::MAX, ctype> {
static ctype apply(ctype x, ctype y) { return x > y ? x : y; }
static ctype visit(ctype x) { return x; }
static ctype write(ctype x, size_t) { return x; }
};
template <>
struct Trait<Mode::MAX, dt_float32> {
using ctype = dt_float32;
static ctype apply(ctype x, ctype y) { return (std::isnan(x) || x > y) ? x : y; }
static ctype visit(ctype x) { return x; }
static ctype write(ctype x, size_t) { return x; }
};
/* NormOp */
template <typename ctype>
struct NormOp;
template <>
struct NormOp<dt_float32> {
typedef dt_float32 ctype;
static const ctype INIT;
static ctype apply(ctype x, ctype y) { return x + y; }
static ctype visit(ctype x, dt_float32 p) { return powf(fabs(x), p); }
static ctype write(ctype x, size_t, dt_float32 p) { return powf(x, 1.f / p); }
};
#if !MEGDNN_DISABLE_FLOAT16
template <>
struct NormOp<dt_float16> {
typedef dt_float16 ctype;
static const ctype INIT;
static ctype apply(ctype x, ctype y) { return x + y; }
static ctype visit(ctype x, dt_float32 p) {
return half_float::pow(half_float::abs(x), half_float::half(p));
}
static ctype write(ctype x, size_t, dt_float32 p) {
return half_float::pow(x, half_float::half(1.f / p));
}
};
#endif
template <typename ctype>
struct NormZeroOp;
template <>
struct NormZeroOp<dt_float32> {
typedef dt_float32 ctype;
static const ctype INIT;
static ctype apply(ctype x, ctype y) { return x + y; }
static ctype visit(ctype x) { return std::fabs(x) <= 0.00001f ? 0.f : 1.f; }
static ctype write(ctype x, size_t) { return x; }
};
#if !MEGDNN_DISABLE_FLOAT16
template <>
struct NormZeroOp<dt_float16> {
typedef dt_float16 ctype;
static const ctype INIT;
static ctype apply(ctype x, ctype y) { return x + y; }
static ctype visit(ctype x) {
return half_float::abs(x) <= half_float::half(0.00001f)
? half_float::half(0.f)
: half_float::half(1.f);
}
static ctype write(ctype x, size_t) { return x; }
};
#endif
} // namespace
#include "src/naive/norm/opr_impl.h"
#include "helper.h"
#include "src/common/utils.h"
#include "src/naive/handle.h"
namespace megdnn {
namespace naive {
using Mode = Norm::Mode;
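// Each dispatch_mode below reduces over the B axis with a recursive pairwise
// (divide-and-conquer) split, which bounds floating-point error better than a
// sequential left-to-right accumulation.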
template <>
void NormForwardImpl::dispatch_mode<Mode::NEG_INF_NORM>(
_megdnn_tensor_in src, _megdnn_tensor_out dst, size_t A, size_t B, size_t C) {
#define CASE(dt) \
case DTypeTrait<dt>::enumv: { \
using ctype = DTypeTrait<dt>::ctype; \
const ctype* __restrict sptr = src.ptr<ctype>(); \
ctype* __restrict dptr = dst.ptr<ctype>(); \
std::function<ctype(size_t, size_t, size_t, size_t)> func; \
func = [&](size_t a, size_t c, size_t bl, size_t br) -> ctype { \
if (bl + 1 < br) { \
size_t mid = bl + (br - bl) / 2; \
return Trait<ReduceForward::Mode::MIN, ctype>::apply( \
func(a, c, bl, mid), func(a, c, mid, br)); \
} else { \
return Trait<ReduceForward::Mode::MIN, ctype>::visit( \
sptr[a * B * C + bl * C + c]); \
} \
}; \
for (size_t a = 0; a < A; ++a) \
for (size_t c = 0; c < C; ++c) { \
dptr[a * C + c] = Trait<ReduceForward::Mode::MIN, ctype>::write( \
func(a, c, 0, B), B); \
} \
break; \
};
switch (src.layout.dtype.enumv()) {
CASE(::megdnn::dtype::Float32)
#if !MEGDNN_DISABLE_FLOAT16
CASE(::megdnn::dtype::Float16)
#endif
default:
megdnn_assert_internal(false);
}
#undef CASE
}
template <>
void NormForwardImpl::dispatch_mode<Mode::INF_NORM>(
_megdnn_tensor_in src, _megdnn_tensor_out dst, size_t A, size_t B, size_t C) {
#define CASE(dt) \
case DTypeTrait<dt>::enumv: { \
using ctype = DTypeTrait<dt>::ctype; \
const ctype* __restrict sptr = src.ptr<ctype>(); \
ctype* __restrict dptr = dst.ptr<ctype>(); \
std::function<ctype(size_t, size_t, size_t, size_t)> func; \
func = [&](size_t a, size_t c, size_t bl, size_t br) -> ctype { \
if (bl + 1 < br) { \
size_t mid = bl + (br - bl) / 2; \
return Trait<ReduceForward::Mode::MAX, ctype>::apply( \
func(a, c, bl, mid), func(a, c, mid, br)); \
} else { \
return Trait<ReduceForward::Mode::MAX, ctype>::visit( \
sptr[a * B * C + bl * C + c]); \
} \
}; \
for (size_t a = 0; a < A; ++a) \
for (size_t c = 0; c < C; ++c) { \
dptr[a * C + c] = Trait<ReduceForward::Mode::MAX, ctype>::write( \
func(a, c, 0, B), B); \
} \
break; \
};
switch (src.layout.dtype.enumv()) {
CASE(::megdnn::dtype::Float32)
#if !MEGDNN_DISABLE_FLOAT16
CASE(::megdnn::dtype::Float16)
#endif
default:
megdnn_assert_internal(false);
}
#undef CASE
}
template <>
void NormForwardImpl::dispatch_mode<Mode::P_NORM>(
_megdnn_tensor_in src, _megdnn_tensor_out dst, size_t A, size_t B, size_t C) {
#define CASE(dt) \
case DTypeTrait<dt>::enumv: { \
using ctype = DTypeTrait<dt>::ctype; \
const ctype* __restrict sptr = src.ptr<ctype>(); \
ctype* __restrict dptr = dst.ptr<ctype>(); \
std::function<ctype(size_t, size_t, size_t, size_t)> func; \
if (std::fabs(param().p - 0.f) < 0.00001f) { \
func = [&](size_t a, size_t c, size_t bl, size_t br) -> ctype { \
if (bl + 1 < br) { \
size_t mid = bl + (br - bl) / 2; \
return NormZeroOp<ctype>::apply( \
func(a, c, bl, mid), func(a, c, mid, br)); \
} else { \
return NormZeroOp<ctype>::visit(sptr[a * B * C + bl * C + c]); \
} \
}; \
for (size_t a = 0; a < A; ++a) { \
for (size_t c = 0; c < C; ++c) { \
dptr[a * C + c] = NormZeroOp<ctype>::write(func(a, c, 0, B), B); \
} \
} \
} else { \
func = [&](size_t a, size_t c, size_t bl, size_t br) -> ctype { \
if (bl + 1 < br) { \
size_t mid = bl + (br - bl) / 2; \
return NormOp<ctype>::apply( \
func(a, c, bl, mid), func(a, c, mid, br)); \
} else { \
return NormOp<ctype>::visit( \
sptr[a * B * C + bl * C + c], param().p); \
} \
}; \
for (size_t a = 0; a < A; ++a) { \
for (size_t c = 0; c < C; ++c) { \
dptr[a * C + c] = \
NormOp<ctype>::write(func(a, c, 0, B), B, param().p); \
} \
} \
} \
break; \
};
switch (src.layout.dtype.enumv()) {
CASE(::megdnn::dtype::Float32)
#if !MEGDNN_DISABLE_FLOAT16
CASE(::megdnn::dtype::Float16)
#endif
default:
megdnn_assert_internal(false);
}
#undef CASE
}
void NormForwardImpl::exec(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
check_exec(src.layout, dst.layout, workspace.size);
using namespace reduce;
size_t A, B, C;
reduce::get_ABC(src.layout, A, B, C, param().dim);
auto make_tensor = [&](DType comp_dtype, _megdnn_tensor_inout tensor,
dt_byte*& workspace_ptr) {
if (comp_dtype == tensor.layout.dtype)
return tensor;
auto layout = TensorLayout(tensor.layout, comp_dtype);
TensorND new_tensor{workspace_ptr, layout};
workspace_ptr += layout.span().dist_byte();
return new_tensor;
};
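// make_tensor would stage a dtype-converted copy in the workspace, but since
// it is called below with each tensor's own dtype, it currently returns the
// tensor unchanged and copy_to is a no-op (presumably scaffolding for a
// future mixed-precision path).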
auto typecvt = handle()->create_operator<TypeCvt>();
auto copy_to = [&typecvt](const TensorND& from, const TensorND& to) {
if (from.raw_ptr() != to.raw_ptr())
typecvt->exec(from, to);
};
auto workspace_ptr = workspace.ptr<dt_byte>();
auto new_src = make_tensor(src.layout.dtype, src, workspace_ptr);
auto new_dst = make_tensor(dst.layout.dtype, dst, workspace_ptr);
#define CASE(mode) \
case mode: { \
copy_to(src, new_src); \
::megdnn::naive::HandleImpl* handlePtr = static_cast<HandleImpl*>(handle()); \
MEGDNN_DISPATCH_CPU_KERN( \
handlePtr, dispatch_mode<mode>(new_src, new_dst, A, B, C)); \
copy_to(new_dst, dst); \
break; \
};
switch (param().mode) {
CASE(Mode::P_NORM)
CASE(Mode::INF_NORM)
CASE(Mode::NEG_INF_NORM)
default:
megdnn_assert_internal(false);
}
#undef CASE
}
size_t NormForwardImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst) {
MEGDNN_MARK_USED_VAR(src);
MEGDNN_MARK_USED_VAR(dst);
return 0;
}
} // namespace naive
} // namespace megdnn
#pragma once
#include "megdnn/oprs.h"
#include "src/common/reduce_helper.h"
#include "src/naive/reduce/opr_impl.h"
namespace megdnn {
namespace naive {
class NormForwardImpl : public Norm {
public:
using Norm::Norm;
void exec(
_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace) override;
size_t get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst) override;
protected:
template <Mode mode>
void dispatch_mode(
_megdnn_tensor_in src, _megdnn_tensor_out dst, size_t, size_t, size_t);
};
} // namespace naive
} // namespace megdnn
#pragma once
#include <iostream>
#include "megdnn/basic_types.h"
#include "megdnn/opr_param_defs.h"
namespace megdnn {
namespace test {
namespace norm {
struct TestArg {
param::Norm param;
TensorShape src;
TestArg(param::Norm param, TensorShape src) : param(param), src(src) {}
};
} // namespace norm
} // namespace test
} // namespace megdnn
#include "test/common/norm.h"
#include "megdnn/dtype.h"
#include "megdnn/oprs.h"
#include "test/common/checker.h"
// #include "test/naive/fixture.h"
// #include "test/common/benchmarker.h"
#include <iostream>
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
#include "test/cuda/utils.h"
namespace megdnn {
namespace test {
// correctness tests
// L2, fp32, dim
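// With dim = 0 of extent 1, each reduced slice holds a single element, so its
// L2 norm is |x| and the expected output equals the (non-negative) input.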
TEST_F(CUDA, L2NORM_FP32_DIM0) {
Checker<Norm> checker(handle_cuda());
Norm::Param param;
param.p = 2;
param.dim = 0;
checker.set_param(param);
checker.exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float32(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue(
{1, 2, 3, 4}, dtype::Float32(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
});
}
TEST_F(CUDA, L2NORM_FP32_DIM1) {
Checker<Norm> checker(handle_cuda());
Norm::Param param;
param.p = 2;
param.dim = 1;
checker.set_param(param);
checker.exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float32(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue(
{1, 1, 3, 4}, dtype::Float32(),
{12.000, 13.0384, 14.1421, 15.2971, 16.4924, 17.7200,
18.9737, 20.2485, 21.5407, 22.8473, 24.1661, 25.4951}),
});
}
TEST_F(CUDA, L2NORM_FP32_DIM3) {
Checker<Norm> checker(handle_cuda());
Norm::Param param;
param.p = 2;
param.dim = 3;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float32(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue(
{1, 2, 3, 1}, dtype::Float32(),
{3.7417, 11.2250, 19.1311, 27.0924, 35.0714, 43.0581})});
}
// TODO: support the -1 dim param, or add a test for the assertion
// l2, fp16
TEST_F(CUDA, L2NORM_FP16_DIM3) {
Checker<Norm> checker(handle_cuda());
Norm::Param param;
param.p = 2;
param.dim = 3;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float16(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue(
{1, 2, 3, 1}, dtype::Float16(),
{3.7422, 11.2266, 19.1250, 27.0938, 35.0625, 43.0625})});
}
// l1, fp32,fp16
TEST_F(CUDA, L1NORM_FP32_DIM3) {
Checker<Norm> checker(handle_cuda());
Norm::Param param;
param.p = 1;
param.dim = 3;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float32(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue(
{1, 2, 3, 1}, dtype::Float32(), {6, 22, 38, 54, 70, 86}),
});
}
TEST_F(CUDA, L1NORM_FP16_DIM3) {
Checker<Norm> checker(handle_cuda());
Norm::Param param;
param.p = 1;
param.dim = 3;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float16(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue(
{1, 2, 3, 1}, dtype::Float16(), {6, 22, 38, 54, 70, 86}),
});
}
// l0, fp32,fp16
TEST_F(CUDA, L0NORM_FP32_DIM3) {
Checker<Norm> checker(handle_cuda());
Norm::Param param;
param.p = 0;
param.dim = 3;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float32(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue({1, 2, 3, 1}, dtype::Float32(), {3, 4, 4, 4, 4, 4}),
});
}
TEST_F(CUDA, L0NORM_FP16_DIM3) {
Checker<Norm> checker(handle_cuda());
Norm::Param param;
param.p = 0;
param.dim = 3;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float16(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue({1, 2, 3, 1}, dtype::Float16(), {3, 4, 4, 4, 4, 4}),
});
}
// inf
TEST_F(CUDA, INF_NORM_FP32_DIM3) {
Checker<Norm> checker(handle_cuda());
Norm::Param param;
using Mode = Norm::Param::Mode;
param.dim = 3;
param.mode = Mode::INF_NORM;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float32(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue({1, 2, 3, 1}, dtype::Float32(), {3, 7, 11, 15, 19, 23}),
});
}
TEST_F(CUDA, INF_NORM_FP16_DIM3) {
Checker<Norm> checker(handle_cuda());
Norm::Param param;
using Mode = Norm::Param::Mode;
param.dim = 3;
param.mode = Mode::INF_NORM;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float16(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue({1, 2, 3, 1}, dtype::Float16(), {3, 7, 11, 15, 19, 23}),
});
}
// -inf
TEST_F(CUDA, NEG_INF_NORM_FP32_DIM3) {
Checker<Norm> checker(handle_cuda());
Norm::Param param;
param.mode = Norm::Param::Mode::NEG_INF_NORM;
param.dim = 3;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float32(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue({1, 2, 3, 1}, dtype::Float32(), {0, 4, 8, 12, 16, 20}),
});
}
TEST_F(CUDA, NEG_INF_NORM_FP16_DIM3) {
Checker<Norm> checker(handle_cuda());
Norm::Param param;
param.mode = Norm::Param::Mode::NEG_INF_NORM;
param.dim = 3;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float16(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue({1, 2, 3, 1}, dtype::Float16(), {0, 4, 8, 12, 16, 20}),
});
}
// performance benchmarks
TEST_F(CUDA, L2NORM_SPEED_FP32) {
auto benchmarker = Benchmarker<Norm>(handle_cuda());
benchmarker.set_dtype(0, dtype::Float32());
benchmarker.set_dtype(1, dtype::Float32());
Norm::Param param;
param.mode = Norm::Param::Mode::P_NORM;
param.dim = 0;
param.p = 2;
SmallVector<TensorShape> shapes{{4194304}, {}};
NormalRNG rng(0, 1);
float eachTime;
float totalTime = 0.f;
#define ITER 10
for (auto i = 0; i < ITER; i++) {
eachTime = benchmarker.set_param(param).set_rng(0, &rng).exec(shapes);
// printf("PNORM_SPEED_FP32 cuda time: %.6fms\n", eachTime);
totalTime += eachTime;
}
totalTime /= ITER;
printf("PNORM_SPEED_FP32 AVG TIME: %.6fms\n", totalTime);
#undef ITER
}
TEST_F(CUDA, INFNORM_SPEED_FP32) {
auto benchmarker = Benchmarker<Norm>(handle_cuda());
benchmarker.set_dtype(0, dtype::Float32());
benchmarker.set_dtype(1, dtype::Float32());
Norm::Param param;
param.mode = Norm::Param::Mode::INF_NORM;
param.dim = 0;
SmallVector<TensorShape> shapes{{4194304}, {}};
NormalRNG rng(0, 1);
float time_fp32 = benchmarker.set_param(param).set_rng(0, &rng).exec(shapes);
printf("INF_SPEED_FP32 cuda time: float=%.6fms\n", time_fp32);
}
TEST_F(CUDA, NEG_INFNORM_SPEED_FP32) {
auto benchmarker = Benchmarker<Norm>(handle_cuda());
benchmarker.set_dtype(0, dtype::Float32());
benchmarker.set_dtype(1, dtype::Float32());
Norm::Param param;
param.mode = Norm::Param::Mode::NEG_INF_NORM;
param.dim = 0;
SmallVector<TensorShape> shapes{{4194304}, {}};
NormalRNG rng(0, 1);
float time_fp32 = benchmarker.set_param(param).set_rng(0, &rng).exec(shapes);
printf("NEG_INF_SPEED_FP32 cuda time: float=%.6fms\n", time_fp32);
}
} // namespace test
} // namespace megdnn
#include "test/common/norm.h"
#include "megdnn/dtype.h"
#include "megdnn/oprs.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/naive/fixture.h"
namespace megdnn {
namespace test {
TEST_F(NAIVE, L2NORM_FP32_DIM0) {
Checker<Norm> checker(handle(), false);
Norm::Param param;
param.p = 2;
param.dim = 0;
checker.set_param(param);
checker.exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float32(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue(
{1, 2, 3, 4}, dtype::Float32(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
});
}
TEST_F(NAIVE, L2NORM_FP32_DIM1) {
Checker<Norm> checker(handle());
Norm::Param param;
param.p = 2;
param.dim = 1;
checker.set_param(param);
checker.exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float32(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue(
{1, 1, 3, 4}, dtype::Float32(),
{12.000, 13.0384, 14.1421, 15.2971, 16.4924, 17.7200,
18.9737, 20.2485, 21.5407, 22.8473, 24.1661, 25.4951}),
});
}
TEST_F(NAIVE, L2NORM_FP32_DIM3) {
Checker<Norm> checker(handle());
Norm::Param param;
param.p = 2;
param.dim = 3;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float32(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue(
{1, 2, 3, 1}, dtype::Float32(),
{3.7417, 11.2250, 19.1311, 27.0924, 35.0714, 43.0581})});
}
// l2, fp16
TEST_F(NAIVE, L2NORM_FP16_DIM3) {
Checker<Norm> checker(handle());
Norm::Param param;
param.p = 2;
param.dim = 3;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float16(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue(
{1, 2, 3, 1}, dtype::Float16(),
{3.7422, 11.2266, 19.1250, 27.0938, 35.0625, 43.0625})});
}
// l1, fp32,fp16
TEST_F(NAIVE, L1NORM_FP32_DIM3) {
Checker<Norm> checker(handle());
Norm::Param param;
param.p = 1;
param.dim = 3;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float32(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue(
{1, 2, 3, 1}, dtype::Float32(), {6, 22, 38, 54, 70, 86}),
});
}
TEST_F(NAIVE, L1NORM_FP16_DIM3) {
Checker<Norm> checker(handle());
Norm::Param param;
param.p = 1;
param.dim = 3;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float16(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue(
{1, 2, 3, 1}, dtype::Float16(), {6, 22, 38, 54, 70, 86}),
});
}
// l0, fp32,fp16
TEST_F(NAIVE, L0NORM_FP32_DIM3) {
Checker<Norm> checker(handle());
Norm::Param param;
param.p = 0;
param.dim = 3;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float32(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue({1, 2, 3, 1}, dtype::Float32(), {3, 4, 4, 4, 4, 4}),
});
}
TEST_F(NAIVE, L0NORM_FP16_DIM3) {
Checker<Norm> checker(handle());
Norm::Param param;
param.p = 0;
param.dim = 3;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float16(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue({1, 2, 3, 1}, dtype::Float16(), {3, 4, 4, 4, 4, 4}),
});
}
// inf
TEST_F(NAIVE, INF_NORM_FP32_DIM3) {
Checker<Norm> checker(handle());
Norm::Param param;
using Mode = Norm::Param::Mode;
param.dim = 3;
param.mode = Mode::INF_NORM;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float32(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue({1, 2, 3, 1}, dtype::Float32(), {3, 7, 11, 15, 19, 23}),
});
}
TEST_F(NAIVE, INF_NORM_FP16_DIM3) {
Checker<Norm> checker(handle());
Norm::Param param;
using Mode = Norm::Param::Mode;
param.dim = 3;
param.mode = Mode::INF_NORM;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float16(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue({1, 2, 3, 1}, dtype::Float16(), {3, 7, 11, 15, 19, 23}),
});
}
// -inf
TEST_F(NAIVE, NEG_INF_NORM_FP32_DIM3) {
Checker<Norm> checker(handle());
Norm::Param param;
param.mode = Norm::Param::Mode::NEG_INF_NORM;
param.dim = 3;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float32(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue({1, 2, 3, 1}, dtype::Float32(), {0, 4, 8, 12, 16, 20}),
});
}
TEST_F(NAIVE, NEG_INF_NORM_FP16_DIM3) {
Checker<Norm> checker(handle());
Norm::Param param;
param.mode = Norm::Param::Mode::NEG_INF_NORM;
param.dim = 3;
checker.set_param(param).exect(
Testcase{
TensorValue(
{1, 2, 3, 4}, dtype::Float16(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}),
{}},
Testcase{
{},
TensorValue({1, 2, 3, 1}, dtype::Float16(), {0, 4, 8, 12, 16, 20}),
});
}
} // namespace test
} // namespace megdnn
\ No newline at end of file