Commit 1fa3449a authored by Megvii Engine Team

feat(opr): add general normalization cuda naive implementation

GitOrigin-RevId: e42f3c2df8317d83da62454e2cedcfd29a460fc6
Parent 5b7e6a80
......@@ -1272,8 +1272,6 @@ PADDING_MODES = [Doc('REPLICATE = 0', 'aaaaaa|abcdefgh|hhhhhhh'),
(pdef('GeneralNorm')
.add_fields('bool', 'affine', 'true')
.add_fields('float32', 'eps', '1e-5f')
.add_fields('uint64', 'normalized_dim', '1')
.add_fields('uint64', 'normalized_size', '1')
.add_fields('uint64', 'normalized_axis', '0')
)
......
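With normalized_dim/normalized_size dropped, GeneralNorm is parameterized by a single normalized_axis: the input is viewed as (A, B, C) with B the normalized axis, statistics are computed over B for every (a, c) pair, and weight/bias are applied per position along B. A minimal reference sketch of that computation (a hypothetical helper, not the kernel itself, written to match the indexing used by the naive CPU code later in this diff):

```cpp
// Hypothetical single-axis GeneralNorm forward reference.
// Assumes a contiguous layout flattened as [A, B, C] around normalized_axis,
// weight/bias of length B, and mean/rstd of length A*C.
#include <cmath>
#include <cstdint>

template <typename T>
void general_norm_forward_ref(
        const T* X, const T* gamma, const T* beta, T* Y, float* mean,
        float* rstd, float eps, int64_t A, int64_t B, int64_t C) {
    for (int64_t a = 0; a < A; ++a) {
        for (int64_t c = 0; c < C; ++c) {
            // statistics over the normalized axis for this (a, c) slice
            float mu = 0.f, var = 0.f;
            for (int64_t b = 0; b < B; ++b)
                mu += static_cast<float>(X[a * B * C + b * C + c]);
            mu /= B;
            for (int64_t b = 0; b < B; ++b) {
                float d = static_cast<float>(X[a * B * C + b * C + c]) - mu;
                var += d * d;
            }
            float r = 1.f / std::sqrt(var / B + eps);
            mean[a * C + c] = mu;
            rstd[a * C + c] = r;
            // normalize, then apply the per-axis affine transform
            for (int64_t b = 0; b < B; ++b) {
                float g = gamma ? static_cast<float>(gamma[b]) : 1.f;
                float bt = beta ? static_cast<float>(beta[b]) : 0.f;
                Y[a * B * C + b * C + c] = static_cast<T>(
                        (static_cast<float>(X[a * B * C + b * C + c]) - mu) * r * g + bt);
            }
        }
    }
}
```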
......@@ -7,14 +7,15 @@ namespace general_norm {
template <typename T, typename T_ACC>
void forward(
T* X, T* gamma, T* beta, int64_t M, int64_t N, T_ACC eps, T* Y, T_ACC* mean,
T_ACC* rstd, cudaStream_t stream);
T* X_data, T* weight_data, T* bias_data, T* Y_data, T_ACC* mean_data,
T_ACC* rstd_data, T_ACC eps, int64_t A, int64_t B, int64_t C,
cudaStream_t stream);
template <typename T, typename T_ACC>
void backward(
const T* dY_data, const T* X_data, const T_ACC* mean_data,
const T_ACC* rstd_data, const T* gamma_data, int64_t M, int64_t N, T* dX_data,
T* dgamma_data, T* dbeta_data, cudaStream_t stream);
const T* dY_data, const T* X_data, const T* gamma_data, const T_ACC* mean_data,
const T_ACC* rstd_data, T* dX_data, T* dgamma_data,
T* dbeta_data, int64_t A, int64_t B, int64_t C, cudaStream_t stream);
} // namespace general_norm
} // namespace cuda
......
......@@ -16,12 +16,9 @@ void GeneralNormForwardImpl::exec(
auto p = param();
float eps = p.eps;
bool affine = p.affine;
uint64_t slice_length = p.normalized_size;
uint64_t slice_dim = p.normalized_dim;
uint64_t n_slices = 1;
for (size_t i = 0; i < data.layout.ndim - slice_dim; ++i) {
n_slices = n_slices * data.layout.shape[i];
}
uint64_t axis = p.normalized_axis;
uint64_t A, B, C;
megdnn::reduce::get_ABC(data.layout, A, B, C, axis);
auto stream = cuda_stream(handle());
using namespace ::megdnn::cuda::general_norm;
......@@ -32,9 +29,9 @@ void GeneralNormForwardImpl::exec(
using T_ACC = float; \
forward<T, T_ACC>( \
data.ptr<T>(), affine ? weight.ptr<T>() : nullptr, \
affine ? bias.ptr<T>() : nullptr, static_cast<int64_t>(n_slices), \
static_cast<int64_t>(slice_length), static_cast<T_ACC>(eps), \
dst.ptr<T>(), mean.ptr<T_ACC>(), rstd.ptr<T_ACC>(), stream); \
affine ? bias.ptr<T>() : nullptr, dst.ptr<T>(), mean.ptr<T_ACC>(), \
rstd.ptr<T_ACC>(), static_cast<T_ACC>(eps), A, B, \
C, stream); \
return; \
}
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
......@@ -52,12 +49,9 @@ void GeneralNormBackwardImpl::exec(
ddata.layout, dweight.layout, dbias.layout, workspace.size);
auto p = param();
bool affine = p.affine;
uint64_t slice_length = p.normalized_size;
uint64_t slice_dim = p.normalized_dim;
uint64_t n_slices = 1;
for (size_t i = 0; i < data.layout.ndim - slice_dim; ++i) {
n_slices = n_slices * data.layout.shape[i];
}
uint64_t axis = p.normalized_axis;
uint64_t A, B, C;
megdnn::reduce::get_ABC(data.layout, A, B, C, axis);
auto stream = cuda_stream(handle());
using namespace ::megdnn::cuda::general_norm;
......@@ -66,10 +60,12 @@ void GeneralNormBackwardImpl::exec(
using T = typename DTypeTrait<DType>::ctype; \
using T_ACC = float; \
backward<T, T_ACC>( \
diff.ptr<T>(), data.ptr<T>(), mean.ptr<T_ACC>(), rstd.ptr<T_ACC>(), \
affine ? weight.ptr<T>() : nullptr, n_slices, slice_length, \
ddata.ptr<T>(), affine ? dweight.ptr<T>() : nullptr, \
affine ? dbias.ptr<T>() : nullptr, stream); \
diff.ptr<T>(), data.ptr<T>(), affine ? weight.ptr<T>() : nullptr, \
mean.ptr<T_ACC>(), rstd.ptr<T_ACC>(), \
ddata.ptr<T>(), \
affine ? dweight.ptr<T>() : nullptr, \
affine ? dbias.ptr<T>() : nullptr, A, B, C, \
stream); \
return; \
}
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
......
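Both exec paths above delegate the layout bookkeeping to megdnn::reduce::get_ABC. Assuming it follows the usual reduce-helper convention of collapsing everything before the axis into A and everything after it into C (a reading inferred from how A, B, C are consumed here, not a verified copy of that helper), its effect can be sketched as:

```cpp
// Hedged sketch of the assumed get_ABC behaviour: A collects the dimensions
// before `axis`, B is the normalized axis, C collects the dimensions after it,
// so element (a, b, c) maps to offset a*B*C + b*C + c.
#include <cstddef>
#include <cstdint>
#include <vector>

void get_abc_sketch(
        const std::vector<size_t>& shape, size_t axis, uint64_t& A, uint64_t& B,
        uint64_t& C) {
    A = 1;
    for (size_t i = 0; i < axis; ++i)
        A *= shape[i];
    B = shape[axis];
    C = 1;
    for (size_t i = axis + 1; i < shape.size(); ++i)
        C *= shape[i];
}
```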
#pragma once
#include "megdnn/oprs.h"
#include "src/common/reduce_helper.h"
#include "src/cuda/cudnn_wrapper.h"
......
......@@ -16,7 +16,7 @@ void forward(
_megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias,
_megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd,
const Param& param) {
printf("general forward\n");
printf("Cpu general forward\n");
float eps = param.eps;
bool affine = param.affine;
uint64_t axis = param.normalized_axis;
......@@ -105,7 +105,7 @@ void backward(
btmp = (db * mean.ptr<T_ACC>()[a * C + c] - ds) * atmp * atmp * atmp / B;
ctmp = -btmp * mean.ptr<T_ACC>()[a * C + c] - db * atmp / B;
for (uint64_t b = 0; b < B; b++) {
for (size_t b = 0; b < B; b++) {
auto weight_v = affine ? weight.ptr<T>()[b] : static_cast<T>(1.0f);
ddata.ptr<T>()[a * B * C + b * C + c] =
diff.ptr<T>()[a * B * C + b * C + c] * atmp * weight_v +
......
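The backward hunk above is truncated right after the dX update begins. A hedged reconstruction of the per-(a, c) step it implements follows; the ds/db reductions are inferred from the standard normalization backward and from the visible btmp/ctmp lines, so treat this as a sketch rather than the file's exact code:

```cpp
// Per-(a, c) backward sketch: ds = sum_b dY*gamma*X, db = sum_b dY*gamma,
// atmp = rstd; then dX = dY*gamma*rstd + btmp*X + ctmp, matching the
// btmp/ctmp expressions shown in the hunk above. Uses the same [A, B, C]
// flattening as the forward sketch earlier in this diff.
#include <cstdint>

template <typename T>
void general_norm_backward_slice_ref(
        const T* dY, const T* X, const T* gamma, const float* mean,
        const float* rstd, T* dX, int64_t a, int64_t c, int64_t B, int64_t C) {
    float ds = 0.f, db = 0.f;
    for (int64_t b = 0; b < B; ++b) {
        float g = gamma ? static_cast<float>(gamma[b]) : 1.f;
        float dy = static_cast<float>(dY[a * B * C + b * C + c]) * g;
        ds += dy * static_cast<float>(X[a * B * C + b * C + c]);
        db += dy;
    }
    float atmp = rstd[a * C + c];
    float btmp = (db * mean[a * C + c] - ds) * atmp * atmp * atmp / B;
    float ctmp = -btmp * mean[a * C + c] - db * atmp / B;
    for (int64_t b = 0; b < B; ++b) {
        float g = gamma ? static_cast<float>(gamma[b]) : 1.f;
        dX[a * B * C + b * C + c] = static_cast<T>(
                static_cast<float>(dY[a * B * C + b * C + c]) * atmp * g +
                btmp * static_cast<float>(X[a * B * C + b * C + c]) + ctmp);
    }
}
```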
#include "test/cuda/fixture.h"
#include "test/common/checker.h"
#include "test/cuda/benchmark.h"
namespace megdnn {
namespace test {
TEST_F(CUDA, GeneralNorm_FORWARD) {
TEST_F(CUDA, GENERALNORM_FORWARD) {
using Param = GeneralNormForward::Param;
Param param;
param.affine = true;
param.eps = 1e-6;
param.normalized_dim = 1;
Checker<GeneralNormForward> checker(handle_cuda());
checker.set_epsilon(1e-2);
auto run = [&](DType d) {
for (size_t n_slices : {10, 30})
for (size_t slice_len : {10, 30}) {
param.normalized_size = slice_len;
param.normalized_axis = 0;
checker.set_param(param)
.set_dtype(0, d)
.set_dtype(1, d)
.set_dtype(2, d)
.set_dtype(3, d)
.set_dtype(4, dtype::Float32())
.set_dtype(5, dtype::Float32())
.execs({{n_slices, slice_len},
{n_slices},
{n_slices},
{n_slices, slice_len},
{slice_len},
{slice_len}});
param.normalized_axis = 1;
checker.set_param(param)
.set_dtype(0, d)
.set_dtype(1, d)
......@@ -39,19 +53,76 @@ TEST_F(CUDA, GeneralNorm_FORWARD) {
run(dtype::BFloat16());
}
TEST_F(CUDA, GeneralNorm_BACKWARD) {
TEST_F(CUDA, GENERALNORM_SPEED_FP32) {
using Param = GeneralNormForward::Param;
auto benchmarker = Benchmarker<GeneralNormForward>(handle_cuda());
benchmarker.set_dtype(0, dtype::Float32());
benchmarker.set_dtype(1, dtype::Float32());
Param param;
param.affine = true;
float eachTime;
float totalTime = 0.f;
#define ITER 10
param.normalized_axis = 0;
for (auto i = 0; i < ITER; i++) {
eachTime = benchmarker.set_param(param).exec({{100, 2000},
{100},
{100},
{},
{},
{}});
totalTime += eachTime;
}
totalTime /= ITER;
printf("PGENERALNORM_SPEED_FP32 AVG TIME: %.6fms\n", totalTime);
totalTime = 0.f;
param.normalized_axis = 1;
for (auto i = 0; i < ITER; i++) {
eachTime = benchmarker.set_param(param).exec({{2000, 100},
{100},
{100},
{},
{},
{}});
totalTime += eachTime;
}
totalTime /= ITER;
printf("PGENERALNORM_SPEED_FP32 AVG TIME: %.6fms\n", totalTime);
#undef ITER
}
TEST_F(CUDA, GENERALNORM_BACKWARD) {
using Param = GeneralNormBackward::Param;
Param param;
param.affine = true;
param.eps = 1e-6;
param.normalized_dim = 1;
Checker<GeneralNormBackward> checker(handle_cuda());
checker.set_epsilon(1e-1);
auto run = [&](DType d) {
for (size_t n_slices : {10, 30})
for (size_t slice_len : {10, 30}) {
param.normalized_size = slice_len;
param.normalized_axis = 0;
checker.set_param(param)
.set_dtype(0, d)
.set_dtype(1, d)
.set_dtype(2, d)
.set_dtype(3, dtype::Float32())
.set_dtype(4, dtype::Float32())
.set_dtype(5, d)
.set_dtype(6, d)
.set_dtype(7, d)
.execs({{n_slices, slice_len},
{n_slices, slice_len},
{n_slices},
{slice_len},
{slice_len},
{n_slices, slice_len},
{n_slices},
{n_slices}});
param.normalized_axis = 1;
checker.set_param(param)
.set_dtype(0, d)
.set_dtype(1, d)
......
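For reference, the checker tensor order in the forward tests is (data, weight, bias, dst, mean, rstd) and in the backward tests (diff, data, weight, mean, rstd, ddata, dweight, dbias); weight/bias always follow the normalized axis, while mean/rstd follow the remaining dimensions. A small hypothetical helper (not part of the diff) that captures the shape rule used by these test cases:

```cpp
// Hypothetical helper mirroring the shapes used by the tests above:
// weight/bias match the normalized axis, mean/rstd match everything else.
#include <cstddef>
#include <vector>

struct GeneralNormShapes {
    std::vector<size_t> weight_bias;  // {shape[axis]}
    std::vector<size_t> mean_rstd;    // shape with the normalized axis removed
};

GeneralNormShapes expected_shapes(std::vector<size_t> shape, size_t axis) {
    GeneralNormShapes out;
    out.weight_bias = {shape[axis]};
    shape.erase(shape.begin() + axis);
    out.mean_rstd = shape;
    return out;
}
```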
......@@ -1136,7 +1136,6 @@ def layer_norm(
def general_norm(
inp: Tensor,
normalized_shape: tuple,
normalized_axis: int,
affine: bool,
weight: Optional[Tensor] = None,
......@@ -1158,21 +1157,11 @@ def general_norm(
See :math:`\beta` in :class:`~.GeneralNorm`.
eps: a value added to the denominator for numerical stability. Default: 1e-5
"""
if isinstance(normalized_shape, int):
normalized_shape = [normalized_shape]
normalized_dim = len(normalized_shape)
assert normalized_dim > 0
normalized_size = 1
for i in range(normalized_dim):
normalized_size = normalized_size * normalized_shape[i]
assert normalized_axis >= 0 and normalized_axis < inp.ndim
op = builtin.GeneralNorm(
affine=affine,
eps=eps,
normalized_dim=normalized_dim,
normalized_size=normalized_size,
normalized_axis = normalized_axis,
)
if affine:
......
......@@ -231,7 +231,7 @@ class GeneralNorm(Module):
(2, 3, 4, 4)
"""
def __init__(self, normalized_shape, normalized_axis, eps=1e-05, affine=True, **kwargs):
def __init__(self, inp_shape, normalized_axis, eps=1e-05, affine=True, **kwargs):
super().__init__(**kwargs)
if isinstance(normalized_shape, int):
normalized_shape = (normalized_shape,)
......@@ -241,9 +241,9 @@ class GeneralNorm(Module):
self.affine = affine
if self.affine:
self.weight = Parameter(
np.ones(self.normalized_shape, dtype="float32"))
np.ones(inp_shape[normalized_axis], dtype="float32"))
self.bias = Parameter(
np.zeros(self.normalized_shape, dtype="float32"))
np.zeros(inp_shape[normalized_axis], dtype="float32"))
else:
self.weight = None
self.bias = None
......@@ -257,10 +257,10 @@ class GeneralNorm(Module):
def forward(self, x):
x = F.nn.general_norm(
x, self.normalized_shape, self.normalized_axis, self.affine, self.weight, self.bias, self.eps
x, self.normalized_axis, self.affine, self.weight, self.bias, self.eps
)
return x
def _module_info_string(self) -> str:
s = "normalized_shape={normalized_shape}, normalized_axis={normalized_axis}, eps={eps}, affine={affine}"
s = "normalized_axis={normalized_axis}, eps={eps}, affine={affine}"
return s.format(**self.__dict__)
......@@ -66,7 +66,6 @@ SymbolVarArray GeneralNormForward::make(
void GeneralNormForward::get_output_var_shape(
const TensorShapeArray& inp_shape, TensorShapeArray& out_shape) const {
uint64_t normalized_dim = param().normalized_dim;
out_shape[0] = inp_shape[0];
TensorShape unnormalized_shape = inp_shape[0];
unnormalized_shape.ndim -= 1;
......
......@@ -23,8 +23,6 @@ void run_forward(bool is_affine, size_t normalized_size, size_t normalized_axis)
Param param;
param.eps = 1e-5;
param.affine = is_affine;
param.normalized_dim = 1;
param.normalized_size = normalized_size;
param.normalized_axis = normalized_axis;
auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
......