Unverified commit 1bc47c84, authored by Yao Zihang, committed by GitHub

Optimize batchnorm1d using 2D kernel (#43530)

Parent a2c4c86b
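Every kernel added in this commit computes the same per-channel statistics and differs only in how the partial sums are parallelized. As a reading aid, here is a minimal host-side sketch of those formulas, assuming channel-last data of shape [m, C]; the function name and signature are illustrative and not part of the commit.

#include <cmath>
#include <vector>

// Reference for the math in the CompStat kernels below: mean, biased variance
// E[x^2] - E[x]^2, inverse standard deviation, and the running-average update.
void CpuBatchNormStats(const std::vector<float> &x, int m, int C,
                       double epsilon, double exponential_average_factor,
                       std::vector<float> *mean, std::vector<float> *inv_std,
                       std::vector<float> *running_mean,
                       std::vector<float> *running_var) {
  for (int c = 0; c < C; ++c) {
    double sum = 0.0, sq_sum = 0.0;
    for (int j = 0; j < m; ++j) {
      double v = x[static_cast<size_t>(j) * C + c];
      sum += v;
      sq_sum += v * v;
    }
    double mu = sum / m;
    double var = sq_sum / m - mu * mu;
    (*mean)[c] = static_cast<float>(mu);
    (*inv_std)[c] = static_cast<float>(1.0 / std::sqrt(var + epsilon));
    // new_running = (1 - factor) * batch_stat + factor * old_running
    (*running_mean)[c] =
        static_cast<float>((1 - exponential_average_factor) * mu +
                           exponential_average_factor * (*running_mean)[c]);
    (*running_var)[c] =
        static_cast<float>((1 - exponential_average_factor) * var +
                           exponential_average_factor * (*running_var)[c]);
  }
}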
......@@ -591,10 +591,12 @@ void BatchNormGradRawKernel(const Context &ctx,
// ctx.GetPlace()),
// epsilon, saved_mean_data, saved_var_data));
#else
// CUDNN PER_ACTIVATION mode only supports small batch sizes
// CUDNN only supports small batch sizes
const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070;
const size_t CUDNN_SPATIAL_THRESHOLD = 880801;
const bool use_native_kernel =
(x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD);
((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
(x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD));
if (use_native_kernel) {
if (compute_format == DataLayout::kNCHW) {
BNBackward<T, block, DataLayout::kNCHW>
......
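Stated on its own, the rule this hunk adds is: fall back to the hand-written kernels whenever cuDNN's per-activation (rank-2 input) or spatial (rank-3 input) path would exceed its batch-size limit. A minimal restatement, with an illustrative helper name that is not part of the commit:

// Hypothetical helper restating the dispatch condition above; N is the batch
// size and rank is x_dims.size().
inline bool UseNativeBatchNormKernel(int rank, size_t N) {
  constexpr size_t kPerActivationThreshold = 131070;  // rank-2 ([N, C]) inputs
  constexpr size_t kSpatialThreshold = 880801;        // rank-3 ([N, C, L]) inputs
  return (rank == 2 && N >= kPerActivationThreshold) ||
         (rank == 3 && N >= kSpatialThreshold);
}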
......@@ -31,6 +31,7 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/norm_utils.h"
#include "paddle/phi/kernels/funcs/reduce_function.h"
#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
#ifdef __HIPCC__
......@@ -137,6 +138,398 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining(
}
}
template <typename T>
__device__ __forceinline__ void merge_block_vertical(
BatchNormParamType<T> x_sum,
BatchNormParamType<T> x_square_sum,
BatchNormParamType<T> *smem_sum,
BatchNormParamType<T> *smem_square_sum,
BatchNormParamType<T> *x_sum_out,
BatchNormParamType<T> *x_square_sum_out) {
int tid = threadIdx.x + threadIdx.y * blockDim.x;
#pragma unroll
for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) {
if (threadIdx.y < offset * 2) {
smem_sum[tid] = x_sum;
smem_square_sum[tid] = x_square_sum;
}
__syncthreads();
if (threadIdx.y < offset) {
int pair_tid = tid + offset * blockDim.x;
x_sum += smem_sum[pair_tid];
x_square_sum += smem_square_sum[pair_tid];
}
}
if (threadIdx.y == 0) {
*x_sum_out = x_sum;
*x_square_sum_out = x_square_sum;
}
}
template <typename T>
__device__ __forceinline__ void merge_block_horizontal(
BatchNormParamType<T> x_sum,
BatchNormParamType<T> x_square_sum,
BatchNormParamType<T> *smem_sum,
BatchNormParamType<T> *smem_square_sum,
BatchNormParamType<T> *x_sum_out,
BatchNormParamType<T> *x_square_sum_out) {
int tid = threadIdx.x + threadIdx.y * blockDim.x;
#pragma unroll
for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
if (threadIdx.x < offset * 2) {
smem_sum[tid] = x_sum;
smem_square_sum[tid] = x_square_sum;
}
__syncthreads();
if (threadIdx.x < offset) {
int pair_tid = tid + offset;
x_sum += smem_sum[pair_tid];
x_square_sum += smem_square_sum[pair_tid];
}
}
if (threadIdx.x == 0) {
*x_sum_out = x_sum;
*x_square_sum_out = x_square_sum;
}
}
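// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this commit: the same shared-memory tree
// reduction as merge_block_vertical, applied to a plain per-column float sum
// over a row-major [rows, cols] matrix. Assumes blockDim.x * blockDim.y ==
// kBlockSize and blockDim.y is a power of two; launch with a 1-D grid over
// columns, e.g. ColumnSumSketch<256><<<(cols + 31) / 32, dim3(32, 8)>>>(...).
// ---------------------------------------------------------------------------
template <int kBlockSize>
__global__ void ColumnSumSketch(const float *in, float *out, int rows,
                                int cols) {
  __shared__ float smem[kBlockSize];
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  float sum = 0.f;
  if (col < cols) {
    // each thread strides over the rows of its own column
    for (int r = threadIdx.y; r < rows; r += blockDim.y) {
      sum += in[r * cols + col];
    }
  }
  // merge the partial sums along threadIdx.y, as merge_block_vertical does
  int tid = threadIdx.y * blockDim.x + threadIdx.x;
  for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) {
    smem[tid] = sum;
    __syncthreads();
    if (threadIdx.y < offset) {
      sum += smem[tid + offset * blockDim.x];
    }
    __syncthreads();
  }
  if (threadIdx.y == 0 && col < cols) {
    out[col] = sum;
  }
}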
template <typename T, int BlockDim>
static __global__ void BNForwardTraining2DChannelLastCompStat(
const T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const int C,
const int N,
const int HxW,
const double epsilon,
double exponentialAverageFactor,
T *y,
BatchNormParamType<T> *global_mean,
BatchNormParamType<T> *global_variance,
BatchNormParamType<T> *save_mean,
BatchNormParamType<T> *save_inv_variance,
BatchNormParamType<T> *compute_mean,
BatchNormParamType<T> *compute_inv_var,
BatchNormParamType<T> *block_data_ptr,
int *flag_ptr) {
int outer_size = C;
int inner_size = N * HxW;
__shared__ BatchNormParamType<T> smem_sum[BlockDim];
__shared__ BatchNormParamType<T> smem_square_sum[BlockDim];
int outer_loop_stride = gridDim.x * blockDim.x;
int inner_loop_stride = gridDim.y * blockDim.y;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size;
i += outer_loop_stride) {
BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> x_square_sum = static_cast<BatchNormParamType<T>>(0);
for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size;
j += inner_loop_stride) {
const int index = j * outer_size + i;
BatchNormParamType<T> x_i = static_cast<BatchNormParamType<T>>(x[index]);
x_sum += x_i;
x_square_sum += x_i * x_i;
}
// vertical block sum
merge_block_vertical<T>(x_sum,
x_square_sum,
&smem_sum[0],
&smem_square_sum[0],
&x_sum,
&x_square_sum);
if (gridDim.y > 1) {
volatile BatchNormParamType<T> *staging_sum = block_data_ptr;
volatile BatchNormParamType<T> *staging_square_sum =
&block_data_ptr[C * gridDim.y];
// write block data to global memory
if (threadIdx.y == 0) {
staging_sum[i + blockIdx.y * C] = x_sum;
staging_square_sum[i + blockIdx.y * C] = x_square_sum;
}
// make sure write is visible to all blocks
__threadfence();
__syncthreads();
__shared__ bool is_last_block_done;
// mark block done
if (threadIdx.x == 0 && threadIdx.y == 0) {
int old = atomicAdd(&flag_ptr[blockIdx.x], 1);
is_last_block_done = (old == (gridDim.y - 1));
}
__syncthreads();
if (is_last_block_done) {
x_sum = static_cast<BatchNormParamType<T>>(0);
x_square_sum = static_cast<BatchNormParamType<T>>(0);
// thread sum
for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) {
x_sum += staging_sum[i + y * C];
x_square_sum += staging_square_sum[i + y * C];
}
// vertical block sum
merge_block_vertical<T>(x_sum,
x_square_sum,
&smem_sum[0],
&smem_square_sum[0],
&x_sum,
&x_square_sum);
// final compute
if (threadIdx.y == 0) {
BatchNormParamType<T> compute_mean_val = x_sum / inner_size;
BatchNormParamType<T> variance_val =
x_square_sum / inner_size - compute_mean_val * compute_mean_val;
BatchNormParamType<T> compute_inv_var_val =
1 / sqrt(variance_val + epsilon);
if (save_mean && save_inv_variance) {
save_mean[i] = compute_mean_val;
save_inv_variance[i] = compute_inv_var_val;
}
global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val +
exponentialAverageFactor * global_mean[i];
global_variance[i] = (1 - exponentialAverageFactor) * variance_val +
exponentialAverageFactor * global_variance[i];
compute_mean[i] = compute_mean_val;
compute_inv_var[i] = compute_inv_var_val;
}
}
} else {
if (blockIdx.y == 0 && threadIdx.y == 0) {
BatchNormParamType<T> compute_mean_val = x_sum / inner_size;
BatchNormParamType<T> variance_val =
x_square_sum / inner_size - compute_mean_val * compute_mean_val;
BatchNormParamType<T> compute_inv_var_val =
1 / sqrt(variance_val + epsilon);
if (save_mean && save_inv_variance) {
save_mean[i] = compute_mean_val;
save_inv_variance[i] = compute_inv_var_val;
}
global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val +
exponentialAverageFactor * global_mean[i];
global_variance[i] = (1 - exponentialAverageFactor) * variance_val +
exponentialAverageFactor * global_variance[i];
compute_mean[i] = compute_mean_val;
compute_inv_var[i] = compute_inv_var_val;
}
}
}
}
template <typename T>
static __global__ void BNForwardTraining2DChannelLastWriteRes(
const T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const int C,
const int N,
const int HxW,
T *y,
BatchNormParamType<T> *compute_mean,
BatchNormParamType<T> *compute_inv_var) {
int outer_size = C;
int inner_size = N * HxW;
int outer_loop_stride = gridDim.x * blockDim.x;
int inner_loop_stride = gridDim.y * blockDim.y;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size;
i += outer_loop_stride) {
BatchNormParamType<T> mean_val = compute_mean[i];
BatchNormParamType<T> inv_var_val = compute_inv_var[i];
BatchNormParamType<T> scale_val = scale[i];
BatchNormParamType<T> bias_val = bias[i];
for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size;
j += inner_loop_stride) {
const int index = j * outer_size + i;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[index]) - mean_val;
y[index] = scale_val * x_sub_mean * inv_var_val + bias_val;
}
}
}
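// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this commit, of the cross-block
// finalization used by the two CompStat kernels: every block writes its
// partial result to a global staging buffer, publishes it with
// __threadfence(), and the block whose atomicAdd on the flag returns
// gridDim.x - 1 does the final merge. Names are hypothetical; assumes
// blockDim.x == kBlockDim (a power of two) and a zero-initialized counter.
// ---------------------------------------------------------------------------
template <int kBlockDim>
__global__ void GridSumSketch(const float *in, float *staging, int *counter,
                              float *out, int n) {
  __shared__ float smem[kBlockDim];
  __shared__ bool is_last_block_done;
  volatile float *vstaging = staging;
  // grid-stride partial sum per thread
  float sum = 0.f;
  for (int i = blockIdx.x * kBlockDim + threadIdx.x; i < n;
       i += gridDim.x * kBlockDim) {
    sum += in[i];
  }
  // in-block tree reduction into smem[0]
  smem[threadIdx.x] = sum;
  __syncthreads();
  for (int offset = kBlockDim / 2; offset > 0; offset >>= 1) {
    if (threadIdx.x < offset) smem[threadIdx.x] += smem[threadIdx.x + offset];
    __syncthreads();
  }
  if (threadIdx.x == 0) {
    vstaging[blockIdx.x] = smem[0];
    __threadfence();                   // make the partial visible to all blocks
    int done = atomicAdd(counter, 1);  // counter must start at zero
    is_last_block_done = (done == gridDim.x - 1);
  }
  __syncthreads();
  if (is_last_block_done) {
    // only the last block to finish reduces the per-block partials
    float total = 0.f;
    for (int b = threadIdx.x; b < gridDim.x; b += kBlockDim) {
      total += vstaging[b];
    }
    smem[threadIdx.x] = total;
    __syncthreads();
    for (int offset = kBlockDim / 2; offset > 0; offset >>= 1) {
      if (threadIdx.x < offset) smem[threadIdx.x] += smem[threadIdx.x + offset];
      __syncthreads();
    }
    if (threadIdx.x == 0) *out = smem[0];
  }
}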
template <typename T, int BlockDim>
static __global__ void BNForwardTraining2DCompStat(
const T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const int C,
const int N,
const int HxW,
const double epsilon,
double exponentialAverageFactor,
T *y,
BatchNormParamType<T> *global_mean,
BatchNormParamType<T> *global_variance,
BatchNormParamType<T> *save_mean,
BatchNormParamType<T> *save_inv_variance,
BatchNormParamType<T> *compute_mean,
BatchNormParamType<T> *compute_inv_var,
BatchNormParamType<T> *block_data_ptr,
int *flag_ptr) {
int outer_size = C;
int inner_size = N * HxW;
__shared__ BatchNormParamType<T> smem_sum[BlockDim];
__shared__ BatchNormParamType<T> smem_square_sum[BlockDim];
int outer_loop_stride = gridDim.y * blockDim.y;
int inner_loop_stride = gridDim.x * blockDim.x;
for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < outer_size;
i += outer_loop_stride) {
BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> x_square_sum = static_cast<BatchNormParamType<T>>(0);
for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < inner_size;
j += inner_loop_stride) {
const int index = (j / HxW * C + i) * HxW + j % HxW;
BatchNormParamType<T> x_i = static_cast<BatchNormParamType<T>>(x[index]);
x_sum += x_i;
x_square_sum += x_i * x_i;
}
// horizontal block sum
merge_block_horizontal<T>(x_sum,
x_square_sum,
&smem_sum[0],
&smem_square_sum[0],
&x_sum,
&x_square_sum);
if (gridDim.x > 1) {
volatile BatchNormParamType<T> *staging_sum = block_data_ptr;
volatile BatchNormParamType<T> *staging_square_sum =
&block_data_ptr[C * gridDim.x];
// write block data to global memory
if (threadIdx.x == 0) {
staging_sum[i + blockIdx.x * C] = x_sum;
staging_square_sum[i + blockIdx.x * C] = x_square_sum;
}
// make sure write is visible to all blocks
__threadfence();
__syncthreads();
__shared__ bool is_last_block_done;
// mark block done
if (threadIdx.x == 0 && threadIdx.y == 0) {
int old = atomicAdd(&flag_ptr[blockIdx.y], 1);
is_last_block_done = (old == (gridDim.x - 1));
}
__syncthreads();
if (is_last_block_done) {
x_sum = static_cast<BatchNormParamType<T>>(0);
x_square_sum = static_cast<BatchNormParamType<T>>(0);
// thread sum
for (int x = threadIdx.x; x < gridDim.x; x += blockDim.x) {
x_sum += staging_sum[i + x * C];
x_square_sum += staging_square_sum[i + x * C];
}
// horizontal block sum
merge_block_horizontal<T>(x_sum,
x_square_sum,
&smem_sum[0],
&smem_square_sum[0],
&x_sum,
&x_square_sum);
// final compute
if (threadIdx.x == 0) {
BatchNormParamType<T> compute_mean_val = x_sum / inner_size;
BatchNormParamType<T> variance_val =
x_square_sum / inner_size - compute_mean_val * compute_mean_val;
BatchNormParamType<T> compute_inv_var_val =
1 / sqrt(variance_val + epsilon);
if (save_mean && save_inv_variance) {
save_mean[i] = compute_mean_val;
save_inv_variance[i] = compute_inv_var_val;
}
global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val +
exponentialAverageFactor * global_mean[i];
global_variance[i] = (1 - exponentialAverageFactor) * variance_val +
exponentialAverageFactor * global_variance[i];
compute_mean[i] = compute_mean_val;
compute_inv_var[i] = compute_inv_var_val;
}
}
} else {
if (blockIdx.x == 0 && threadIdx.x == 0) {
BatchNormParamType<T> compute_mean_val = x_sum / inner_size;
BatchNormParamType<T> variance_val =
x_square_sum / inner_size - compute_mean_val * compute_mean_val;
BatchNormParamType<T> compute_inv_var_val =
1 / sqrt(variance_val + epsilon);
if (save_mean && save_inv_variance) {
save_mean[i] = compute_mean_val;
save_inv_variance[i] = compute_inv_var_val;
}
global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val +
exponentialAverageFactor * global_mean[i];
global_variance[i] = (1 - exponentialAverageFactor) * variance_val +
exponentialAverageFactor * global_variance[i];
compute_mean[i] = compute_mean_val;
compute_inv_var[i] = compute_inv_var_val;
}
}
}
}
template <typename T>
static __global__ void BNForwardTraining2DWriteRes(
const T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const int C,
const int N,
const int HxW,
T *y,
BatchNormParamType<T> *compute_mean,
BatchNormParamType<T> *compute_inv_var) {
int outer_size = C;
int inner_size = N * HxW;
int outer_loop_stride = gridDim.y * blockDim.y;
int inner_loop_stride = gridDim.x * blockDim.x;
for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < outer_size;
i += outer_loop_stride) {
BatchNormParamType<T> mean_val = compute_mean[i];
BatchNormParamType<T> inv_var_val = compute_inv_var[i];
BatchNormParamType<T> scale_val = scale[i];
BatchNormParamType<T> bias_val = bias[i];
for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < inner_size;
j += inner_loop_stride) {
const int index = (j / HxW * C + i) * HxW + j % HxW;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[index]) - mean_val;
y[index] = scale_val * x_sub_mean * inv_var_val + bias_val;
}
}
}
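// ---------------------------------------------------------------------------
// Illustrative standalone check, not part of this commit: the NCHW kernels
// above address channel i through index = (j / HxW * C + i) * HxW + j % HxW,
// i.e. (n * C + i) * HxW + hw with n = j / HxW and hw = j % HxW. The host
// snippet below verifies that this mapping visits every element of an
// [N, C, HxW] tensor exactly once.
// ---------------------------------------------------------------------------
#include <cassert>
#include <vector>

void CheckNchwIndexing(int N, int C, int HxW) {
  std::vector<int> hits(static_cast<size_t>(N) * C * HxW, 0);
  for (int i = 0; i < C; ++i) {
    for (int j = 0; j < N * HxW; ++j) {
      int index = (j / HxW * C + i) * HxW + j % HxW;
      ++hits[index];
    }
  }
  for (int v : hits) {
    assert(v == 1);  // each element is touched exactly once
  }
}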
template <typename T, typename Context>
void BatchNormKernel(const Context &ctx,
const DenseTensor &x,
......@@ -515,17 +908,63 @@ void BatchNormKernel(const Context &ctx,
// static_cast<void *>(saved_variance->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace()))));
#else
// CUDNN PER_ACTIVATION mode only supports small batch sizes
const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070;
const size_t CUDNN_SPATIAL_THRESHOLD = 880801;
const bool use_native_kernel =
(x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD);
((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
(x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD));
if (use_native_kernel) {
const int block = 512;
const int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
const int grid = std::min(C, max_blocks);
if (compute_format == DataLayout::kNCHW) {
BNForwardTraining<T, block, DataLayout::kNCHW>
dim3 block;
dim3 grid;
const int block_size = 512;
const int MAX_GRID_SIZE = 128;
const int WARP_SIZE = 32;
// init intermediate storage
DenseTensor block_data_tensor;
DenseTensor flag_tensor;
DenseTensor compute_mean_tensor =
phi::Empty<BatchNormParamType<T>, Context>(ctx, {C});
DenseTensor compute_inv_var_tensor =
phi::Empty<BatchNormParamType<T>, Context>(ctx, {C});
BatchNormParamType<T> *block_data_ptr = nullptr;
int *flag_ptr = nullptr;
if (x_dims.size() != 2 && compute_format == DataLayout::kNCHW) {
// init block&grid config
int block_x =
std::min(phi::funcs::details::GetLastPow2(H * W * D), block_size);
int block_y = std::min(phi::funcs::details::GetLastPow2(C),
block_size / block_x);
if (block_x * block_y != block_size) {
block_x =
std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16),
block_size / block_y);
}
int grid_x =
std::min((N * H * W * D + block_x * 16 - 1) / (block_x * 16),
MAX_GRID_SIZE);
int grid_y = (C + block_y - 1) / block_y;
block.x = block_x;
block.y = block_y;
grid.x = grid_x;
grid.y = grid_y;
if (grid.x > 1) {
block_data_tensor = phi::Empty<BatchNormParamType<T>, Context>(
ctx, {2 * C * grid.x});
flag_tensor = phi::Empty<int, Context>(ctx, {grid.y});
block_data_ptr = block_data_tensor.data<BatchNormParamType<T>>();
flag_ptr = flag_tensor.data<int>();
funcs::SetConstant<Context, int> set_zero;
set_zero(ctx, &flag_tensor, static_cast<int>(0));
}
BNForwardTraining2DCompStat<T, block_size>
<<<grid, block, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
......@@ -539,9 +978,54 @@ void BatchNormKernel(const Context &ctx,
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
saved_variance->template data<BatchNormParamType<T>>(),
compute_mean_tensor.data<BatchNormParamType<T>>(),
compute_inv_var_tensor.data<BatchNormParamType<T>>(),
block_data_ptr,
flag_ptr);
BNForwardTraining2DWriteRes<T><<<grid, block, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
transformed_y.template data<T>(),
compute_mean_tensor.data<BatchNormParamType<T>>(),
compute_inv_var_tensor.data<BatchNormParamType<T>>());
} else {
BNForwardTraining<T, block, DataLayout::kNHWC>
// init block&grid config
int block_x =
std::min(phi::funcs::details::GetLastPow2(C), WARP_SIZE);
int block_y =
std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16),
block_size / block_x);
if (block_x * block_y != block_size) {
block_x = std::min(phi::funcs::details::GetLastPow2(C),
block_size / block_y);
}
int grid_x = (C + block_x - 1) / block_x;
int grid_y =
std::min((N * H * W * D + block_y * 16 - 1) / (block_y * 16),
MAX_GRID_SIZE);
block.x = block_x;
block.y = block_y;
grid.x = grid_x;
grid.y = grid_y;
if (grid.y > 1) {
block_data_tensor = phi::Empty<BatchNormParamType<T>, Context>(
ctx, {2 * C * grid.y});
flag_tensor = phi::Empty<int, Context>(ctx, {grid.x});
block_data_ptr = block_data_tensor.data<BatchNormParamType<T>>();
flag_ptr = flag_tensor.data<int>();
funcs::SetConstant<Context, int> set_zero;
set_zero(ctx, &flag_tensor, static_cast<int>(0));
}
BNForwardTraining2DChannelLastCompStat<T, block_size>
<<<grid, block, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
......@@ -555,7 +1039,23 @@ void BatchNormKernel(const Context &ctx,
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
saved_variance->template data<BatchNormParamType<T>>(),
compute_mean_tensor.data<BatchNormParamType<T>>(),
compute_inv_var_tensor.data<BatchNormParamType<T>>(),
block_data_ptr,
flag_ptr);
BNForwardTraining2DChannelLastWriteRes<T>
<<<grid, block, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
transformed_y.template data<T>(),
compute_mean_tensor.data<BatchNormParamType<T>>(),
compute_inv_var_tensor.data<BatchNormParamType<T>>());
}
} else {
#if CUDNN_VERSION_MIN(7, 4, 1)
......
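The launch configuration above leans on phi::funcs::details::GetLastPow2, which, as used here, appears to round its argument down to a power of two. An equivalent standalone helper (illustrative, not the library's implementation):

// Behavior assumed of GetLastPow2 in the block/grid sizing above: the largest
// power of two that does not exceed n (for n >= 1).
inline int LargestPow2NotAbove(int n) {
  int p = 1;
  while (p * 2 <= n) p *= 2;
  return p;  // e.g. 1000 -> 512, 512 -> 512, 1 -> 1
}

Under that reading, the NCHW branch sizes block.x from the spatial extent H * W * D and block.y from C, the channel-last branch does the opposite, and both cap the reduction dimension of the grid at MAX_GRID_SIZE = 128 so that the staging buffer (2 * C values per reducing block) and the flag array stay small.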
......@@ -82,50 +82,58 @@ class TestBatchNorm(unittest.TestCase):
self.assertRaises(ValueError, error2d_dataformat)
self.assertRaises(ValueError, error3d_dataformat)
def test_eager_api(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
shape = [4, 10, 4, 4]
def test_large_batch(self):
def compute_v1(x):
with fluid.dygraph.guard(p):
bn = fluid.dygraph.BatchNorm(shape[1])
#bn = paddle.nn.BatchNorm2D(shape[1])
def compute_baseline(x):
with fluid.dygraph.guard(p):
bn = fluid.dygraph.BatchNorm(shape[1])
x1 = paddle.to_tensor(x)
x1.stop_gradient = False
y = bn(x1)
y.backward()
return y.numpy(), x1.gradient()
def compute_1d(x):
with fluid.dygraph.guard(p):
with _test_eager_guard():
bn = paddle.nn.BatchNorm1D(shape[1])
x1 = paddle.to_tensor(x)
x1.stop_gradient = False
y = bn(x1)
y.backward()
return y.numpy(), x1.gradient()
def compute_v2(x):
with fluid.dygraph.guard(p):
with _test_eager_guard():
print("v2")
bn = paddle.nn.BatchNorm2D(shape[1])
x1 = paddle.to_tensor(x)
x1.stop_gradient = False
y = bn(x1)
y.backward()
return y.numpy(), x1.gradient()
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
# [N, C]
shape = [200000, 4]
x = np.random.randn(*shape).astype("float32")
y1, g1 = compute_baseline(x)
y2, g2 = compute_1d(x)
self.assertTrue(np.allclose(g1, g2))
self.assertTrue(np.allclose(y1, y2))
# [N, C, L]
shape = [1000000, 4, 4]
x = np.random.randn(*shape).astype("float32")
y1, g1 = compute_v1(x)
y2, g2 = compute_v2(x)
y1, g1 = compute_baseline(x)
y2, g2 = compute_1d(x)
self.assertTrue(np.allclose(g1, g2))
self.assertTrue(np.allclose(y1, y2))
def test_eager_api_1d(self):
def test_eager_api(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
shape = [200000, 4]
shape = [4, 10, 4, 4]
def compute_v1(x):
with fluid.dygraph.guard(p):
bn = fluid.dygraph.BatchNorm(shape[1])
#bn = paddle.nn.BatchNorm2D(shape[1])
x1 = paddle.to_tensor(x)
x1.stop_gradient = False
y = bn(x1)
......@@ -135,7 +143,8 @@ class TestBatchNorm(unittest.TestCase):
def compute_v2(x):
with fluid.dygraph.guard(p):
with _test_eager_guard():
bn = paddle.nn.BatchNorm1D(shape[1])
print("v2")
bn = paddle.nn.BatchNorm2D(shape[1])
x1 = paddle.to_tensor(x)
x1.stop_gradient = False
y = bn(x1)
......