未验证 提交 8b914906 编写于 作者: M mapingshuo 提交者: GitHub

restrict block num of layer_norm_grad cuda kernel to 128, test=develop (#23986)

上级 b312cb89
...@@ -141,7 +141,7 @@ class LayerNormGradOp : public framework::OperatorWithKernel { ...@@ -141,7 +141,7 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
} }
if (ctx->HasOutput(framework::GradVarName("Bias"))) { if (ctx->HasOutput(framework::GradVarName("Bias"))) {
ctx->SetOutputDim(framework::GradVarName("Bias"), ctx->SetOutputDim(framework::GradVarName("Bias"),
ctx->GetInputDim("Scale")); ctx->GetInputDim("Bias"));
} }
} }
...@@ -182,6 +182,7 @@ class LayerNormGradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -182,6 +182,7 @@ class LayerNormGradOpMaker : public framework::SingleGradOpMaker<T> {
} }
if (this->HasInput("Bias")) { if (this->HasInput("Bias")) {
op->SetInput("Bias", this->Input("Bias"));
op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias")); op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
} }
...@@ -191,6 +192,9 @@ class LayerNormGradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -191,6 +192,9 @@ class LayerNormGradOpMaker : public framework::SingleGradOpMaker<T> {
} }
}; };
DECLARE_NO_NEED_BUFFER_VARS_INFERER(LayerNormGradNoNeedBufferVarInference,
"Bias");
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -198,7 +202,8 @@ namespace ops = paddle::operators; ...@@ -198,7 +202,8 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
ops::LayerNormGradOpMaker<paddle::framework::OpDesc>, ops::LayerNormGradOpMaker<paddle::framework::OpDesc>,
ops::LayerNormGradOpMaker<paddle::imperative::OpBase>); ops::LayerNormGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp); REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp,
ops::LayerNormGradNoNeedBufferVarInference);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
layer_norm, ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>, layer_norm, ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::LayerNormKernel<paddle::platform::CPUDeviceContext, double>); ops::LayerNormKernel<paddle::platform::CPUDeviceContext, double>);
......
...@@ -45,6 +45,37 @@ inline static int GetDesiredBlockDim(int block_dim) { ...@@ -45,6 +45,37 @@ inline static int GetDesiredBlockDim(int block_dim) {
FIXED_BLOCK_DIM_CASE_BASE(2, ##__VA_ARGS__); \ FIXED_BLOCK_DIM_CASE_BASE(2, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(1, ##__VA_ARGS__) FIXED_BLOCK_DIM_CASE_BASE(1, ##__VA_ARGS__)
#define FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE( \
log2_block_dim, feature_size, kMaxBlockNum, ...) \
case (1 << (log2_block_dim)): { \
for (int i = 0; i < std::ceil(feature_size / (1.0 * kMaxBlockNum)); i++) { \
int col_offset = i * kMaxBlockNum; \
int block_num = std::min(feature_size - col_offset, kMaxBlockNum); \
constexpr auto kBlockDim = (1 << (log2_block_dim)); \
__VA_ARGS__; \
} \
} break
#define FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE(feature_size, kMaxBlockNum, ...) \
FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE(9, feature_size, kMaxBlockNum, \
##__VA_ARGS__); \
FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE(8, feature_size, kMaxBlockNum, \
##__VA_ARGS__); \
FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE(7, feature_size, kMaxBlockNum, \
##__VA_ARGS__); \
FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE(6, feature_size, kMaxBlockNum, \
##__VA_ARGS__); \
FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE(5, feature_size, kMaxBlockNum, \
##__VA_ARGS__); \
FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE(4, feature_size, kMaxBlockNum, \
##__VA_ARGS__); \
FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE(3, feature_size, kMaxBlockNum, \
##__VA_ARGS__); \
FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE(2, feature_size, kMaxBlockNum, \
##__VA_ARGS__); \
FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE(1, feature_size, kMaxBlockNum, \
##__VA_ARGS__)
static __device__ __forceinline__ float real_sqrt(float x) { return sqrtf(x); } static __device__ __forceinline__ float real_sqrt(float x) { return sqrtf(x); }
static __device__ __forceinline__ double real_sqrt(double x) { return sqrt(x); } static __device__ __forceinline__ double real_sqrt(double x) { return sqrt(x); }
...@@ -131,12 +162,13 @@ __global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y, ...@@ -131,12 +162,13 @@ __global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y,
T *d_scale, T *d_bias, T *d_x, T *d_scale, T *d_bias, T *d_x,
const T *mean, const T *var, const T *mean, const T *var,
const T *scale, float epsilon, const T *scale, float epsilon,
int batch_size, int feature_size) { int batch_size, int feature_size,
int col_offset) {
using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>; using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
__shared__ typename BlockReduce::TempStorage temp_storage; __shared__ typename BlockReduce::TempStorage temp_storage;
int beg_idx = threadIdx.x * feature_size + blockIdx.x; int beg_idx = threadIdx.x * feature_size + (blockIdx.x + col_offset);
int end_idx = batch_size * feature_size + blockIdx.x; int end_idx = batch_size * feature_size + (blockIdx.x + col_offset);
int stride = BlockDim * feature_size; int stride = BlockDim * feature_size;
T d_scale_partial = 0, d_bias_partial = 0; T d_scale_partial = 0, d_bias_partial = 0;
...@@ -147,7 +179,7 @@ __global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y, ...@@ -147,7 +179,7 @@ __global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y,
d_scale_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val; d_scale_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val;
d_bias_partial += d_y[i]; d_bias_partial += d_y[i];
if (HasDx) { if (HasDx) {
d_x[i] = d_y[i] * scale[blockIdx.x] / var_val; d_x[i] = d_y[i] * scale[blockIdx.x + col_offset] / var_val;
} }
} }
...@@ -156,8 +188,8 @@ __global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y, ...@@ -156,8 +188,8 @@ __global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y,
PairForLayerNormAddFunctor<T>()); PairForLayerNormAddFunctor<T>());
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
d_scale[blockIdx.x] = pair.first_; d_scale[blockIdx.x + col_offset] = pair.first_;
d_bias[blockIdx.x] = pair.second_; d_bias[blockIdx.x + col_offset] = pair.second_;
} }
} }
...@@ -168,11 +200,11 @@ template <typename T, int BlockDim, bool HasDx, bool HasDScale> ...@@ -168,11 +200,11 @@ template <typename T, int BlockDim, bool HasDx, bool HasDScale>
__global__ void LayerNormBackwardGradientScaleOrBias( __global__ void LayerNormBackwardGradientScaleOrBias(
const T *x, const T *d_y, T *d_scale, T *d_bias, T *d_x, const T *mean, const T *x, const T *d_y, T *d_scale, T *d_bias, T *d_x, const T *mean,
const T *var, const T *scale, float epsilon, int batch_size, const T *var, const T *scale, float epsilon, int batch_size,
int feature_size) { int feature_size, int col_offset) {
using BlockReduce = cub::BlockReduce<T, BlockDim>; using BlockReduce = cub::BlockReduce<T, BlockDim>;
__shared__ typename BlockReduce::TempStorage temp_storage; __shared__ typename BlockReduce::TempStorage temp_storage;
int beg_idx = threadIdx.x * feature_size + blockIdx.x; int beg_idx = threadIdx.x * feature_size + blockIdx.x + col_offset;
int end_idx = batch_size * feature_size + blockIdx.x; int end_idx = batch_size * feature_size + blockIdx.x + col_offset;
int stride = BlockDim * feature_size; int stride = BlockDim * feature_size;
T d_scale_or_d_bias_partial = 0; T d_scale_or_d_bias_partial = 0;
...@@ -187,7 +219,7 @@ __global__ void LayerNormBackwardGradientScaleOrBias( ...@@ -187,7 +219,7 @@ __global__ void LayerNormBackwardGradientScaleOrBias(
if (HasDx) { if (HasDx) {
if (scale != nullptr) { if (scale != nullptr) {
d_x[i] = d_y[i] * scale[blockIdx.x] / var_val; d_x[i] = d_y[i] * scale[blockIdx.x + col_offset] / var_val;
} else { } else {
d_x[i] = d_y[i] / var_val; d_x[i] = d_y[i] / var_val;
} }
...@@ -199,9 +231,9 @@ __global__ void LayerNormBackwardGradientScaleOrBias( ...@@ -199,9 +231,9 @@ __global__ void LayerNormBackwardGradientScaleOrBias(
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
if (HasDScale) { if (HasDScale) {
d_scale[blockIdx.x] = d_scale_or_d_bias_partial; d_scale[blockIdx.x + col_offset] = d_scale_or_d_bias_partial;
} else { } else {
d_bias[blockIdx.x] = d_scale_or_d_bias_partial; d_bias[blockIdx.x + col_offset] = d_scale_or_d_bias_partial;
} }
} }
} }
...@@ -322,6 +354,7 @@ static void LayerNormBackward(const T *x, const T *d_y, const T *scale, ...@@ -322,6 +354,7 @@ static void LayerNormBackward(const T *x, const T *d_y, const T *scale,
T *d_bias, float epsilon, int batch_size, T *d_bias, float epsilon, int batch_size,
int feature_size, cudaStream_t stream) { int feature_size, cudaStream_t stream) {
const int kMaxBlockDim = 512; const int kMaxBlockDim = 512;
const int kMaxBlockNum = 128;
int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) | int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) |
((d_scale != nullptr ? 1 : 0) << 1) | ((d_scale != nullptr ? 1 : 0) << 1) |
((d_bias != nullptr ? 1 : 0)); ((d_bias != nullptr ? 1 : 0));
...@@ -347,29 +380,33 @@ static void LayerNormBackward(const T *x, const T *d_y, const T *scale, ...@@ -347,29 +380,33 @@ static void LayerNormBackward(const T *x, const T *d_y, const T *scale,
switch (gradient_flag) { switch (gradient_flag) {
case 1: // d_x == nulptr, d_scale == nullptr, d_bias != nullptr case 1: // d_x == nulptr, d_scale == nullptr, d_bias != nullptr
switch (block_dim) { switch (block_dim) {
FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias< FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE(
feature_size, kMaxBlockNum,
LayerNormBackwardGradientScaleOrBias<
T, kBlockDim, false, T, kBlockDim, false,
false><<<feature_size, kBlockDim, 0, stream>>>( false><<<block_num, kBlockDim, 0, stream>>>(
x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon,
feature_size)); batch_size, feature_size, col_offset));
} }
break; break;
case 2: // d_x == nullptr, d_scale != nullptr, d_bias == nullptr case 2: // d_x == nullptr, d_scale != nullptr, d_bias == nullptr
switch (block_dim) { switch (block_dim) {
FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias< FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE(
T, kBlockDim, false, feature_size, kMaxBlockNum,
true><<<feature_size, kBlockDim, 0, stream>>>( LayerNormBackwardGradientScaleOrBias<
x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, T, kBlockDim, false, true><<<block_num, kBlockDim, 0, stream>>>(
feature_size)); x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon,
batch_size, feature_size, col_offset));
} }
break; break;
case 3: // d_x == nullptr, d_scale != nulptr, d_bias != nullptr case 3: // d_x == nullptr, d_scale != nulptr, d_bias != nullptr
switch (block_dim) { switch (block_dim) {
FIXED_BLOCK_DIM_CASE( FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE(
feature_size, kMaxBlockNum,
LayerNormBackwardGradientAll< LayerNormBackwardGradientAll<
T, kBlockDim, false><<<feature_size, kBlockDim, 0, stream>>>( T, kBlockDim, false><<<block_num, kBlockDim, 0, stream>>>(
x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon,
batch_size, feature_size)); batch_size, feature_size, col_offset));
} }
break; break;
case 4: // d_x != nullptr, d_scale == nullptr, d_bias == nullptr case 4: // d_x != nullptr, d_scale == nullptr, d_bias == nullptr
...@@ -382,11 +419,12 @@ static void LayerNormBackward(const T *x, const T *d_y, const T *scale, ...@@ -382,11 +419,12 @@ static void LayerNormBackward(const T *x, const T *d_y, const T *scale,
break; break;
case 5: // d_x != nulptr, d_scale == nullptr, d_bias != nullptr case 5: // d_x != nulptr, d_scale == nullptr, d_bias != nullptr
switch (block_dim) { switch (block_dim) {
FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias< FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE(
T, kBlockDim, true, feature_size, kMaxBlockNum,
false><<<feature_size, kBlockDim, 0, stream>>>( LayerNormBackwardGradientScaleOrBias<
x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, T, kBlockDim, true, false><<<block_num, kBlockDim, 0, stream>>>(
feature_size)); x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon,
batch_size, feature_size, col_offset));
} }
switch (GetDesiredBlockDim(feature_size)) { switch (GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE( FIXED_BLOCK_DIM_CASE(
...@@ -397,11 +435,12 @@ static void LayerNormBackward(const T *x, const T *d_y, const T *scale, ...@@ -397,11 +435,12 @@ static void LayerNormBackward(const T *x, const T *d_y, const T *scale,
break; break;
case 6: // d_x != nullptr, d_scale != nullptr, d_bias == nullptr case 6: // d_x != nullptr, d_scale != nullptr, d_bias == nullptr
switch (block_dim) { switch (block_dim) {
FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias< FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE(
T, kBlockDim, true, feature_size, kMaxBlockNum,
true><<<feature_size, kBlockDim, 0, stream>>>( LayerNormBackwardGradientScaleOrBias<
x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, T, kBlockDim, true, true><<<block_num, kBlockDim, 0, stream>>>(
feature_size)); x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon,
batch_size, feature_size, col_offset));
} }
switch (GetDesiredBlockDim(feature_size)) { switch (GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE( FIXED_BLOCK_DIM_CASE(
...@@ -412,11 +451,12 @@ static void LayerNormBackward(const T *x, const T *d_y, const T *scale, ...@@ -412,11 +451,12 @@ static void LayerNormBackward(const T *x, const T *d_y, const T *scale,
break; break;
case 7: // d_x != nullptr, d_scale != nullptr, d_bias != nullptr case 7: // d_x != nullptr, d_scale != nullptr, d_bias != nullptr
switch (block_dim) { switch (block_dim) {
FIXED_BLOCK_DIM_CASE( FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE(
feature_size, kMaxBlockNum,
LayerNormBackwardGradientAll< LayerNormBackwardGradientAll<
T, kBlockDim, true><<<feature_size, kBlockDim, 0, stream>>>( T, kBlockDim, true><<<block_num, kBlockDim, 0, stream>>>(
x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon,
batch_size, feature_size)); batch_size, feature_size, col_offset));
} }
switch (GetDesiredBlockDim(feature_size)) { switch (GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE( FIXED_BLOCK_DIM_CASE(
...@@ -539,6 +579,8 @@ class LayerNormGradKernel<platform::CUDADeviceContext, T> ...@@ -539,6 +579,8 @@ class LayerNormGradKernel<platform::CUDADeviceContext, T>
} }
}; };
template class LayerNormDirectCUDAFunctor<float>; template class LayerNormDirectCUDAFunctor<float>;
#undef FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE
#undef FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE
#undef FIXED_BLOCK_DIM_CASE_BASE #undef FIXED_BLOCK_DIM_CASE_BASE
#undef FIXED_BLOCK_DIM_CASE #undef FIXED_BLOCK_DIM_CASE
} // namespace operators } // namespace operators
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
......
...@@ -36,42 +36,72 @@ def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1): ...@@ -36,42 +36,72 @@ def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
mean = np.mean(x, axis=1) mean = np.mean(x, axis=1)
var = np.var(x, axis=1) + epsilon var = np.var(x, axis=1) + epsilon
output = scale.reshape([1, D]) * np.divide( output = np.divide((x - mean.reshape([N, 1])),
(x - mean.reshape([N, 1])), (np.sqrt(var)).reshape([N, 1]))
(np.sqrt(var)).reshape([N, 1])) + beta.reshape([1, D]) if scale is not None:
output = scale.reshape([1, D]) * output
if beta is not None:
output = output + beta.reshape([1, D])
x.shape, output.shape = x_shape, x_shape x.shape, output.shape = x_shape, x_shape
return output, mean, var return output, mean, var
def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1): def _reference_layer_norm_grad(x,
grad_y,
scale,
bias,
mean,
var,
begin_norm_axis=1):
x_shape = x.shape x_shape = x.shape
scale_shape = scale.shape
N = reduce(mul, x_shape[0:begin_norm_axis], 1) N = reduce(mul, x_shape[0:begin_norm_axis], 1)
D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1) D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
if scale is not None:
scale_shape = scale.shape
scale.shape = [1, D]
x.shape, grad_y.shape = [N, D], [N, D] x.shape, grad_y.shape = [N, D], [N, D]
var.shape, mean.shape = [N, 1], [N, 1] var.shape, mean.shape = [N, 1], [N, 1]
scale.shape = [1, D]
# d_bias # d_bias
if bias is not None:
d_bias = np.sum(grad_y, axis=0).reshape([1, D]) d_bias = np.sum(grad_y, axis=0).reshape([1, D])
else:
d_bias = None
# d_scale # d_scale
if scale is not None:
d_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y, d_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y,
axis=0).reshape([1, D]) axis=0).reshape([1, D])
else:
d_scale = None
# dx # dx
if scale is not None:
dx_end = scale * np.sqrt(1.0 / var) * grad_y dx_end = scale * np.sqrt(1.0 / var) * grad_y
d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape( d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape(
[N, 1]) # the second part equals to zero. [N, 1]) # the second part equals to zero.
d_mean = 1.0 / D * d_mean_0 d_mean = 1.0 / D * d_mean_0
d_std = np.sum( d_std = np.sum(-(1.0 / var) * (x - mean) * grad_y * scale,
-(1.0 / var) * (x - mean) * grad_y * scale, axis=1).reshape([N, 1]) * ( axis=1).reshape([N, 1]) * (
1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean)) 1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) *
(x - mean))
else:
dx_end = 1.0 * np.sqrt(1.0 / var) * grad_y
d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * 1.0, axis=1).reshape(
[N, 1]) # the second part equals to zero.
d_mean = 1.0 / D * d_mean_0
d_std = np.sum(-(1.0 / var) * (x - mean) * grad_y * 1.0,
axis=1).reshape([N, 1]) * (
1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) *
(x - mean))
grad_x = dx_end + d_mean + d_std grad_x = dx_end + d_mean + d_std
grad_x.shape, x.shape, grad_y.shape = x_shape, x_shape, x_shape grad_x.shape, x.shape, grad_y.shape = x_shape, x_shape, x_shape
scale.shape = scale_shape
var.shape, mean.shape = [N, ], [N, ] var.shape, mean.shape = [N, ], [N, ]
if scale is not None:
scale.shape = scale_shape
return grad_x, d_scale, d_bias return grad_x, d_scale, d_bias
...@@ -82,7 +112,12 @@ class TestLayerNormOp(unittest.TestCase): ...@@ -82,7 +112,12 @@ class TestLayerNormOp(unittest.TestCase):
def __assert_close(self, tensor, np_array, msg, atol=1e-4): def __assert_close(self, tensor, np_array, msg, atol=1e-4):
self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
def check_forward_backward(self, shape, begin_norm_axis): def check_forward_backward(self,
shape,
begin_norm_axis,
has_scale=True,
has_bias=True,
y_grad_scale=1.0):
def test_with_place(place, shape, begin_norm_axis): def test_with_place(place, shape, begin_norm_axis):
# attr # attr
epsilon = 0.00001 epsilon = 0.00001
...@@ -92,21 +127,26 @@ class TestLayerNormOp(unittest.TestCase): ...@@ -92,21 +127,26 @@ class TestLayerNormOp(unittest.TestCase):
np.random.seed(123) np.random.seed(123)
x = np.random.random_sample(x_shape).astype(np.float32) x = np.random.random_sample(x_shape).astype(np.float32)
scale = np.random.random_sample(scale_shape).astype(np.float32) scale = np.random.random_sample(scale_shape).astype(
bias = np.random.random_sample(scale_shape).astype(np.float32) np.float32) if has_scale else None
y_grad = np.random.random_sample(x_shape).astype(np.float32) bias = np.random.random_sample(scale_shape).astype(
np.float32) if has_bias else None
y_grad = (np.random.random_sample(x_shape) *
y_grad_scale).astype(np.float32)
# reference forward & backward # reference forward & backward
y, mean, variance = _reference_layer_norm_naive( y, mean, variance = _reference_layer_norm_naive(
x, scale, bias, epsilon, begin_norm_axis) x, scale, bias, epsilon, begin_norm_axis)
x_grad, scale_grad, bias_grad = _reference_layer_norm_grad( x_grad, scale_grad, bias_grad = _reference_layer_norm_grad(
x, y_grad, scale, mean, variance, begin_norm_axis) x, y_grad, scale, bias, mean, variance, begin_norm_axis)
var_dict = locals() var_dict = locals()
var_dict['y@GRAD'] = y_grad var_dict['y@GRAD'] = y_grad
var_names = [ var_names = ['x', 'mean', 'variance', 'y', 'y@GRAD']
'x', 'scale', 'bias', 'mean', 'variance', 'y', 'y@GRAD' if has_scale:
] var_names += ['scale']
if has_bias:
var_names += ['bias']
ground_truth = {name: var_dict[name] for name in var_names} ground_truth = {name: var_dict[name] for name in var_names}
program = fluid.Program() program = fluid.Program()
...@@ -117,13 +157,22 @@ class TestLayerNormOp(unittest.TestCase): ...@@ -117,13 +157,22 @@ class TestLayerNormOp(unittest.TestCase):
name=name, name=name,
dtype='float32', dtype='float32',
shape=ground_truth[name].shape) shape=ground_truth[name].shape)
inputs = {"X": block.var('x')}
fetch_list = [
'y',
'mean',
'variance',
'x@GRAD',
]
if has_scale:
inputs["Scale"] = block.var('scale')
fetch_list += ['scale@GRAD']
if has_bias:
inputs["Bias"] = block.var('bias')
fetch_list += ['bias@GRAD']
layer_norm_op = block.append_op( layer_norm_op = block.append_op(
type="layer_norm", type="layer_norm",
inputs={ inputs=inputs,
"X": block.var('x'),
"Scale": block.var('scale'),
"Bias": block.var('bias'),
},
outputs={ outputs={
"Y": block.var('y'), "Y": block.var('y'),
"Mean": block.var('mean'), # share the same memory "Mean": block.var('mean'), # share the same memory
...@@ -134,7 +183,6 @@ class TestLayerNormOp(unittest.TestCase): ...@@ -134,7 +183,6 @@ class TestLayerNormOp(unittest.TestCase):
"epsilon": epsilon, "epsilon": epsilon,
"begin_norm_axis": begin_norm_axis "begin_norm_axis": begin_norm_axis
}) })
# generate backward op_desc # generate backward op_desc
grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
layer_norm_op.desc, set(), []) layer_norm_op.desc, set(), [])
...@@ -150,23 +198,25 @@ class TestLayerNormOp(unittest.TestCase): ...@@ -150,23 +198,25 @@ class TestLayerNormOp(unittest.TestCase):
grad_var.set_dtype(core.VarDesc.VarType.FP32) grad_var.set_dtype(core.VarDesc.VarType.FP32)
program._sync_with_cpp() program._sync_with_cpp()
exe = fluid.Executor(place) exe = fluid.Executor(place)
out = exe.run(program, out = exe.run(program,
feed={ feed={
name: var_dict[name] name: var_dict[name]
for name in ['x', 'scale', 'bias', 'y@GRAD'] for name in ['x', 'scale', 'bias', 'y@GRAD']
}, },
fetch_list=[ fetch_list=fetch_list)
'y', 'mean', 'variance', 'x@GRAD',
'scale@GRAD', 'bias@GRAD'
])
self.__assert_close(y, out[0], "y") self.__assert_close(y, out[0], "y")
self.__assert_close(mean, out[1], "mean") self.__assert_close(mean, out[1], "mean")
self.__assert_close(variance, out[2], "variance", 1e-3) self.__assert_close(variance, out[2], "variance", 1e-3)
self.__assert_close(x_grad, out[3], "x_grad") self.__assert_close(x_grad, out[3], "x_grad")
self.__assert_close(scale_grad, out[4], "scale_grad", 1e-3) if has_scale:
self.__assert_close(bias_grad, out[5], "bias_grad") self.__assert_close(scale_grad,
out[fetch_list.index('scale@GRAD')],
"scale_grad", 1e-3)
if has_bias:
self.__assert_close(bias_grad,
out[fetch_list.index('bias@GRAD')],
"bias_grad")
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu( if core.is_compiled_with_cuda() and core.op_support_gpu(
...@@ -178,7 +228,45 @@ class TestLayerNormOp(unittest.TestCase): ...@@ -178,7 +228,45 @@ class TestLayerNormOp(unittest.TestCase):
def test_check_forward_backward_with_scale_and_bias(self): def test_check_forward_backward_with_scale_and_bias(self):
self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1) self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
self.check_forward_backward(
shape=[2, 3, 4, 5],
begin_norm_axis=1,
has_scale=False,
has_bias=True)
self.check_forward_backward(
shape=[2, 3, 4, 5],
begin_norm_axis=1,
has_scale=True,
has_bias=False)
self.check_forward_backward(
shape=[2, 3, 4, 5],
begin_norm_axis=1,
has_scale=False,
has_bias=False)
self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3) self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
self.check_forward_backward(
shape=[92, 513, 129], begin_norm_axis=2, y_grad_scale=0.1)
self.check_forward_backward(shape=[3, 34, 1134], begin_norm_axis=2)
self.check_forward_backward(
shape=[92, 513, 1134], begin_norm_axis=2, y_grad_scale=0.1)
self.check_forward_backward(
shape=[92, 513, 1134],
begin_norm_axis=2,
has_scale=False,
has_bias=True,
y_grad_scale=0.1)
self.check_forward_backward(
shape=[92, 513, 1134],
begin_norm_axis=2,
has_scale=True,
has_bias=False,
y_grad_scale=0.1)
self.check_forward_backward(
shape=[92, 513, 1134],
begin_norm_axis=2,
has_scale=False,
has_bias=False,
y_grad_scale=0.1)
class TestLayerNormAPI(unittest.TestCase): class TestLayerNormAPI(unittest.TestCase):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册