Unverified commit 7c98abd9 authored by qizhaoaoe, committed by GitHub

【AMP OP&Test】instance_norm fp16 and bf16 support. (#52241)

* add fp16 and bf16 support for instance_norm

* fix the /= operator, which does not support bf16

* fix instance_norm_grad kernel and unittests.

* fix fp32 unittests.

* fix instance_norm_kernel and unittests.

* fix instance_norm_grad_kernel and unittest threshold.

* add fp16/bf16 for instance_norm_grad_grad op.

* add bf16 dtype check.

* fix conflicts.

* fix cpu support for fp32 op and fix type in instance_norm_grad_kernel.

* fix type in instance_norm_kernel.

* fix bf16 outputs in unittests and refine codes.

* fix dx computation.

* delete unused params and header includes.

* add fp16/bf16 for static graph.

* fix device condition for instance_norm op.

* fix instance_norm_grad_grad and bf16 op tests.

* fix op_test so that bf16 grads can be compared with fp32.

* remove updates.

* add self-defined grad.
Parent commit: de44b3ac
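
The common thread in the kernel diffs below is mixed-precision accumulation: fp16/bf16 stays the storage type T, while every reduction and intermediate is carried in a wider accumulator AccT obtained from phi::dtype::MPTypeTrait<T>::Type, and results are cast back to T only on store. A standalone numpy illustration (nothing Paddle-specific) of why the wide accumulator matters — a naive running sum kept in float16 stalls once the sum outgrows the increment:

import numpy as np

x = np.full(4096, 0.01, dtype=np.float16)

acc16 = np.float16(0.0)
for v in x:                      # running sum kept in float16, i.e. reducing directly in T
    acc16 = np.float16(acc16 + v)

acc32 = np.float32(0.0)
for v in x:                      # running sum widened to float32, i.e. the AccT pattern
    acc32 += np.float32(v)

print(acc16)   # stalls at 32.0: 0.01 is below half the fp16 spacing in [32, 64)
print(acc32)   # ~40.97, which matches 4096 * float16(0.01)
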
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/instance_norm_grad_kernel.h" #include "paddle/phi/kernels/instance_norm_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/layout.h" #include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
...@@ -62,12 +61,12 @@ static __global__ void GradComputeDX(const T *dy, ...@@ -62,12 +61,12 @@ static __global__ void GradComputeDX(const T *dy,
} }
__syncthreads(); __syncthreads();
for (int i = beg_idx; i < end_idx; i += BlockDim) { for (int i = beg_idx; i < end_idx; i += BlockDim) {
dx[i] = dx[i] = static_cast<T>(
(static_cast<BatchNormParamType<T>>(dy[i]) - (static_cast<BatchNormParamType<T>>(dy[i]) -
dy_sum_val / static_cast<BatchNormParamType<T>>(sample_size) - dy_sum_val / static_cast<BatchNormParamType<T>>(sample_size) -
(static_cast<BatchNormParamType<T>>(x[i]) - mean_val) * (static_cast<BatchNormParamType<T>>(x[i]) - mean_val) *
dy_x_sub_mean_sum_val * inv_var_val * inv_var_val / sample_size) * dy_x_sub_mean_sum_val * inv_var_val * inv_var_val / sample_size) *
scale[c] * inv_var_val; scale[c] * inv_var_val);
} }
} }
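
For readers decoding the hunk above: with \gamma_c the scale for channel c, \mu the saved per-instance mean, 1/\sigma the saved inverse standard deviation (inv_var_val), and m = sample_size, the value the updated GradComputeDX writes into dx[i] is

dx_i = \frac{\gamma_c}{\sigma}\Big( dy_i - \frac{1}{m}\sum_j dy_j - \frac{x_i-\mu}{\sigma^2}\cdot\frac{1}{m}\sum_j dy_j\,(x_j-\mu) \Big)

computed in BatchNormParamType<T> (float) and cast back to T only for the final store.
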
...@@ -78,14 +77,14 @@ static __device__ __forceinline__ double real_sqrt(double x) { ...@@ -78,14 +77,14 @@ static __device__ __forceinline__ double real_sqrt(double x) {
return 1. / sqrt(x); return 1. / sqrt(x);
} }
template <typename T, int BlockDim> template <typename T, typename AccT, int BlockDim>
__global__ void DoubleGradComputeDX(const T *x, __global__ void DoubleGradComputeDX(const T *x,
const T *mean, const AccT *mean,
const T *variance, const AccT *variance,
const T *ddx, const T *ddx,
const T *dy, const T *dy,
const T *scale, const AccT *scale,
const T *ddscale, const AccT *ddscale,
int C, int C,
int sample_size, int sample_size,
const double epsilon, const double epsilon,
...@@ -95,30 +94,30 @@ __global__ void DoubleGradComputeDX(const T *x, ...@@ -95,30 +94,30 @@ __global__ void DoubleGradComputeDX(const T *x,
int ncid = blockIdx.x; int ncid = blockIdx.x;
int c = ncid % C; int c = ncid % C;
T mean_val = mean[ncid]; AccT mean_val = mean[ncid];
T var_val = variance[ncid]; AccT var_val = variance[ncid];
typedef cub::BlockReduce<T, BlockDim> BlockReduce; typedef cub::BlockReduce<AccT, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage dy_storage; __shared__ typename BlockReduce::TempStorage dy_storage;
__shared__ typename BlockReduce::TempStorage ddx_storage; __shared__ typename BlockReduce::TempStorage ddx_storage;
__shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage; __shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage;
__shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage;
__shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage;
__shared__ T dy_sum_val; __shared__ AccT dy_sum_val;
__shared__ T ddx_sum_val; __shared__ AccT ddx_sum_val;
__shared__ T dy_mul_ddx_sum_val; __shared__ AccT dy_mul_ddx_sum_val;
__shared__ T dy_mul_x_sub_mean_sum_val; __shared__ AccT dy_mul_x_sub_mean_sum_val;
__shared__ T ddx_mul_x_sub_mean_sum_val; __shared__ AccT ddx_mul_x_sub_mean_sum_val;
T dy_sum = 0; AccT dy_sum = 0;
T ddx_sum = 0; AccT ddx_sum = 0;
T dy_mul_ddx_sum = 0; AccT dy_mul_ddx_sum = 0;
T dy_mul_x_sub_mean_sum = 0; AccT dy_mul_x_sub_mean_sum = 0;
T ddx_mul_x_sub_mean_sum = 0; AccT ddx_mul_x_sub_mean_sum = 0;
for (int i = beg_idx; i < end_idx; i += BlockDim) { for (int i = beg_idx; i < end_idx; i += BlockDim) {
T ddx_i = ddx[i]; AccT ddx_i = static_cast<AccT>(ddx[i]);
T dy_i = dy[i]; AccT dy_i = static_cast<AccT>(dy[i]);
T tmp = x[i] - mean_val; AccT tmp = static_cast<AccT>(x[i]) - mean_val;
dy_sum += dy_i; dy_sum += dy_i;
ddx_sum += ddx_i; ddx_sum += ddx_i;
...@@ -148,37 +147,44 @@ __global__ void DoubleGradComputeDX(const T *x, ...@@ -148,37 +147,44 @@ __global__ void DoubleGradComputeDX(const T *x,
if (ddx != nullptr) { if (ddx != nullptr) {
for (int i = beg_idx; i < end_idx; i += BlockDim) { for (int i = beg_idx; i < end_idx; i += BlockDim) {
dx[i] += AccT tmp = static_cast<AccT>(dx[i]);
((x[i] - mean_val) * var_val * var_val * var_val / sample_size * tmp +=
((static_cast<AccT>(x[i]) - mean_val) * var_val * var_val * var_val /
sample_size *
(ddx_sum_val * dy_sum_val / sample_size - dy_mul_ddx_sum_val + (ddx_sum_val * dy_sum_val / sample_size - dy_mul_ddx_sum_val +
3. * dy_mul_x_sub_mean_sum_val * var_val * 3. * dy_mul_x_sub_mean_sum_val * var_val *
ddx_mul_x_sub_mean_sum_val * var_val / sample_size) + ddx_mul_x_sub_mean_sum_val * var_val / sample_size) +
ddx_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * ddx_mul_x_sub_mean_sum_val * var_val / sample_size * var_val *
var_val * (dy_sum_val / sample_size - dy[i]) + var_val * (dy_sum_val / sample_size - static_cast<AccT>(dy[i])) +
dy_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * dy_mul_x_sub_mean_sum_val * var_val / sample_size * var_val *
var_val * (ddx_sum_val / sample_size - ddx[i])) * var_val *
(ddx_sum_val / sample_size - static_cast<AccT>(ddx[i]))) *
scale[c]; scale[c];
dx[i] = static_cast<T>(tmp);
} }
} }
__syncthreads(); __syncthreads();
if (ddscale != nullptr) { if (ddscale != nullptr) {
for (int i = beg_idx; i < end_idx; i += BlockDim) { for (int i = beg_idx; i < end_idx; i += BlockDim) {
dx[i] += (dy[i] * var_val - dy_sum_val / sample_size * var_val - AccT tmp = static_cast<AccT>(dx[i]);
(x[i] - mean_val) * var_val * dy_mul_x_sub_mean_sum_val * tmp += (static_cast<AccT>(dy[i]) * var_val -
var_val / sample_size) * dy_sum_val / sample_size * var_val -
(static_cast<AccT>(x[i]) - mean_val) * var_val *
dy_mul_x_sub_mean_sum_val * var_val / sample_size) *
ddscale[c]; ddscale[c];
dx[i] = static_cast<T>(tmp);
} }
} }
} }
template <typename T, int BlockDim> template <typename T, typename AccT, int BlockDim>
__global__ void DoubleGradComputeDDY(const T *x, __global__ void DoubleGradComputeDDY(const T *x,
const T *mean, const AccT *mean,
const T *variance, const AccT *variance,
const T *ddscale, const AccT *ddscale,
const T *ddbias, const AccT *ddbias,
const T *ddx, const T *ddx,
const T *scale, const AccT *scale,
int C, int C,
int sample_size, int sample_size,
const double epsilon, const double epsilon,
...@@ -187,20 +193,20 @@ __global__ void DoubleGradComputeDDY(const T *x, ...@@ -187,20 +193,20 @@ __global__ void DoubleGradComputeDDY(const T *x,
int end_idx = (blockIdx.x + 1) * sample_size; int end_idx = (blockIdx.x + 1) * sample_size;
int ncid = blockIdx.x; int ncid = blockIdx.x;
int c = ncid % C; int c = ncid % C;
T mean_val = mean[ncid]; AccT mean_val = mean[ncid];
T var_val = variance[ncid]; AccT var_val = variance[ncid];
typedef cub::BlockReduce<T, BlockDim> BlockReduce; typedef cub::BlockReduce<AccT, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ddx_storage; __shared__ typename BlockReduce::TempStorage ddx_storage;
__shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage;
__shared__ T ddx_sum_val; __shared__ AccT ddx_sum_val;
__shared__ T ddx_mul_x_sub_mean_sum_val; __shared__ AccT ddx_mul_x_sub_mean_sum_val;
T ddx_sum = 0; AccT ddx_sum = 0;
T ddx_mul_x_sub_mean_sum = 0; AccT ddx_mul_x_sub_mean_sum = 0;
for (int i = beg_idx; i < end_idx; i += BlockDim) { for (int i = beg_idx; i < end_idx; i += BlockDim) {
T ddx_i = ddx[i]; AccT ddx_i = static_cast<AccT>(ddx[i]);
ddx_sum += ddx_i; ddx_sum += ddx_i;
ddx_mul_x_sub_mean_sum += (ddx_i * (x[i] - mean_val)); ddx_mul_x_sub_mean_sum += (ddx_i * (static_cast<AccT>(x[i]) - mean_val));
} }
ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum());
ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage)
...@@ -212,55 +218,59 @@ __global__ void DoubleGradComputeDDY(const T *x, ...@@ -212,55 +218,59 @@ __global__ void DoubleGradComputeDDY(const T *x,
__syncthreads(); __syncthreads();
if (ddx != nullptr) { if (ddx != nullptr) {
for (int i = beg_idx; i < end_idx; i += BlockDim) { for (int i = beg_idx; i < end_idx; i += BlockDim) {
ddy[i] += scale[c] * var_val * AccT tmp = static_cast<AccT>(ddy[i]);
(ddx[i] - ddx_sum_val / sample_size - tmp += scale[c] * var_val *
(x[i] - mean_val) * var_val * ddx_mul_x_sub_mean_sum_val * (static_cast<AccT>(ddx[i]) - ddx_sum_val / sample_size -
var_val / sample_size); (static_cast<AccT>(x[i]) - mean_val) * var_val *
ddx_mul_x_sub_mean_sum_val * var_val / sample_size);
ddy[i] = static_cast<T>(tmp);
} }
} }
__syncthreads(); __syncthreads();
if (ddscale != nullptr) { if (ddscale != nullptr) {
for (int i = beg_idx; i < end_idx; i += BlockDim) { for (int i = beg_idx; i < end_idx; i += BlockDim) {
ddy[i] += (x[i] - mean_val) * var_val * ddscale[c]; AccT tmp = static_cast<AccT>(ddy[i]);
tmp += (static_cast<AccT>(x[i]) - mean_val) * var_val * ddscale[c];
ddy[i] = static_cast<T>(tmp);
} }
} }
__syncthreads(); __syncthreads();
if (ddbias != nullptr) { if (ddbias != nullptr) {
for (int i = beg_idx; i < end_idx; i += BlockDim) { for (int i = beg_idx; i < end_idx; i += BlockDim) {
ddy[i] += ddbias[c]; ddy[i] = static_cast<T>(static_cast<AccT>(ddy[i]) + ddbias[c]);
} }
} }
} }
template <typename T, int BlockDim> template <typename T, typename AccT, int BlockDim>
__global__ void DoubleGradComputeDScale(const T *x, __global__ void DoubleGradComputeDScale(const T *x,
const T *mean, const AccT *mean,
const T *variance, const AccT *variance,
const T *ddx, const T *ddx,
const T *dy, const T *dy,
int C, int C,
int sample_size, int sample_size,
const double epsilon, const double epsilon,
T *dscale) { AccT *dscale) {
int beg_idx = blockIdx.x * sample_size + threadIdx.x; int beg_idx = blockIdx.x * sample_size + threadIdx.x;
int end_idx = (blockIdx.x + 1) * sample_size; int end_idx = (blockIdx.x + 1) * sample_size;
int ncid = blockIdx.x; int ncid = blockIdx.x;
int c = ncid % C; int c = ncid % C;
T mean_val = mean[ncid]; AccT mean_val = mean[ncid];
T var_val = variance[ncid]; AccT var_val = variance[ncid];
typedef cub::BlockReduce<T, BlockDim> BlockReduce; typedef cub::BlockReduce<AccT, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage dy_storage; __shared__ typename BlockReduce::TempStorage dy_storage;
__shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage;
__shared__ typename BlockReduce::TempStorage dscale_tmp_storage; __shared__ typename BlockReduce::TempStorage dscale_tmp_storage;
__shared__ T dy_sum_val; __shared__ AccT dy_sum_val;
__shared__ T dy_mul_x_sub_mean_sum_val; __shared__ AccT dy_mul_x_sub_mean_sum_val;
T dy_sum = 0; AccT dy_sum = 0;
T dy_mul_x_sub_mean_sum = 0; AccT dy_mul_x_sub_mean_sum = 0;
for (int i = beg_idx; i < end_idx; i += BlockDim) { for (int i = beg_idx; i < end_idx; i += BlockDim) {
T dy_i = dy[i]; AccT dy_i = static_cast<AccT>(dy[i]);
dy_sum += dy_i; dy_sum += dy_i;
dy_mul_x_sub_mean_sum += (dy_i * (x[i] - mean_val)); dy_mul_x_sub_mean_sum += (dy_i * (static_cast<AccT>(x[i]) - mean_val));
} }
dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage)
...@@ -272,12 +282,13 @@ __global__ void DoubleGradComputeDScale(const T *x, ...@@ -272,12 +282,13 @@ __global__ void DoubleGradComputeDScale(const T *x,
} }
__syncthreads(); __syncthreads();
if (ddx != nullptr) { if (ddx != nullptr) {
T dscale_tmp = 0; AccT dscale_tmp = 0;
for (int i = beg_idx; i < end_idx; i += BlockDim) { for (int i = beg_idx; i < end_idx; i += BlockDim) {
dscale_tmp += ddx[i] * var_val * dscale_tmp +=
(dy[i] - dy_sum_val / sample_size - static_cast<AccT>(ddx[i]) * var_val *
dy_mul_x_sub_mean_sum_val * (x[i] - mean_val) * var_val * (static_cast<AccT>(dy[i]) - dy_sum_val / sample_size -
var_val / sample_size); dy_mul_x_sub_mean_sum_val * (static_cast<AccT>(x[i]) - mean_val) *
var_val * var_val / sample_size);
} }
dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum()); dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum());
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
...@@ -298,6 +309,7 @@ void InstanceNormGradKernel(const Context &dev_ctx, ...@@ -298,6 +309,7 @@ void InstanceNormGradKernel(const Context &dev_ctx,
DenseTensor *d_x, DenseTensor *d_x,
DenseTensor *d_scale, DenseTensor *d_scale,
DenseTensor *d_bias) { DenseTensor *d_bias) {
using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
double epsilon = static_cast<double>(epsilon_f); double epsilon = static_cast<double>(epsilon_f);
const auto *scale_ptr = scale.get_ptr(); const auto *scale_ptr = scale.get_ptr();
...@@ -313,8 +325,8 @@ void InstanceNormGradKernel(const Context &dev_ctx, ...@@ -313,8 +325,8 @@ void InstanceNormGradKernel(const Context &dev_ctx,
dev_ctx.template Alloc<T>(d_x); dev_ctx.template Alloc<T>(d_x);
if (d_scale && d_bias) { if (d_scale && d_bias) {
dev_ctx.template Alloc<T>(d_scale); dev_ctx.template Alloc<AccT>(d_scale);
dev_ctx.template Alloc<T>(d_bias); dev_ctx.template Alloc<AccT>(d_bias);
} }
if (scale_ptr) { if (scale_ptr) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
...@@ -339,7 +351,7 @@ void InstanceNormGradKernel(const Context &dev_ctx, ...@@ -339,7 +351,7 @@ void InstanceNormGradKernel(const Context &dev_ctx,
scale_ptr->dims())); scale_ptr->dims()));
} }
phi::funcs::SetConstant<GPUContext, T> set_constant; phi::funcs::SetConstant<GPUContext, AccT> set_constant;
const int n = x.numel(); const int n = x.numel();
const int block = 512; const int block = 512;
...@@ -350,23 +362,21 @@ void InstanceNormGradKernel(const Context &dev_ctx, ...@@ -350,23 +362,21 @@ void InstanceNormGradKernel(const Context &dev_ctx,
DenseTensor scale_tmp; DenseTensor scale_tmp;
scale_tmp.Resize({NxC}); scale_tmp.Resize({NxC});
dev_ctx.template Alloc<T>(&scale_tmp); dev_ctx.template Alloc<AccT>(&scale_tmp);
DenseTensor d_scale_tmp; DenseTensor d_scale_tmp;
d_scale_tmp.Resize({NxC}); d_scale_tmp.Resize({NxC});
dev_ctx.template Alloc<T>(&d_scale_tmp); dev_ctx.template Alloc<AccT>(&d_scale_tmp);
DenseTensor d_bias_tmp; DenseTensor d_bias_tmp;
d_bias_tmp.Resize({NxC}); d_bias_tmp.Resize({NxC});
dev_ctx.template Alloc<T>(&d_bias_tmp); dev_ctx.template Alloc<AccT>(&d_bias_tmp);
if (scale_ptr) { if (scale_ptr) {
repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>( repeat_param<AccT><<<grid, block, 0, dev_ctx.stream()>>>(
scale_ptr->data<T>(), scale_tmp.data<T>(), N, C); scale_ptr->data<AccT>(), scale_tmp.data<AccT>(), N, C);
} else { } else {
set_constant(dev_ctx, &scale_tmp, static_cast<T>(1)); set_constant(dev_ctx, &scale_tmp, static_cast<AccT>(1));
} }
std::vector<int> dims; std::vector<int> dims;
std::vector<int> strides; std::vector<int> strides;
dims = {1, NxC, H, W, D}; dims = {1, NxC, H, W, D};
...@@ -424,11 +434,11 @@ void InstanceNormGradKernel(const Context &dev_ctx, ...@@ -424,11 +434,11 @@ void InstanceNormGradKernel(const Context &dev_ctx,
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
#endif #endif
const auto *saved_mean_data = const auto *saved_mean_data =
saved_mean.template data<BatchNormParamType<T>>(); saved_mean.template data<BatchNormParamType<T>>();
const auto *saved_var_data = const auto *saved_var_data =
saved_variance.template data<BatchNormParamType<T>>(); saved_variance.template data<BatchNormParamType<T>>();
if (d_scale && d_bias) { if (d_scale && d_bias) {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenBatchNormalizationBackward( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenBatchNormalizationBackward(
...@@ -486,12 +496,11 @@ void InstanceNormGradKernel(const Context &dev_ctx, ...@@ -486,12 +496,11 @@ void InstanceNormGradKernel(const Context &dev_ctx,
d_x->data<T>()); d_x->data<T>());
} }
} }
if (d_scale && d_bias) { if (d_scale && d_bias) {
add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>( add_param<AccT, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
d_scale_tmp.data<T>(), d_scale->data<T>(), N, C); d_scale_tmp.data<AccT>(), d_scale->data<AccT>(), N, C);
add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>( add_param<AccT, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
d_bias_tmp.data<T>(), d_bias->data<T>(), N, C); d_bias_tmp.data<AccT>(), d_bias->data<AccT>(), N, C);
} }
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -521,6 +530,7 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx, ...@@ -521,6 +530,7 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx,
DenseTensor *dx, DenseTensor *dx,
DenseTensor *dscale, DenseTensor *dscale,
DenseTensor *ddy) { DenseTensor *ddy) {
using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
const auto *Scale = scale.get_ptr(); const auto *Scale = scale.get_ptr();
const auto *ddX = ddx.get_ptr(); const auto *ddX = ddx.get_ptr();
const auto *ddScale = ddscale.get_ptr(); const auto *ddScale = ddscale.get_ptr();
...@@ -529,11 +539,15 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx, ...@@ -529,11 +539,15 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx,
const T *x_data = x.data<T>(); const T *x_data = x.data<T>();
const T *dy_data = dy.data<T>(); const T *dy_data = dy.data<T>();
const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data<T>()); const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data<T>());
const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data<T>()); const AccT *ddscale_data =
const T *ddbias_data = (ddScale == nullptr ? nullptr : ddBias->data<T>()); (ddScale == nullptr ? nullptr : ddScale->data<AccT>());
const T *mean_data = saved_mean.data<T>(); const AccT *ddbias_data =
const T *variance_data = saved_variance.data<T>(); (ddScale == nullptr ? nullptr : ddBias->data<AccT>());
const AccT *mean_data = saved_mean.data<AccT>();
const AccT *variance_data = saved_variance.data<AccT>();
phi::funcs::SetConstant<GPUContext, T> set_zero; phi::funcs::SetConstant<GPUContext, T> set_zero;
phi::funcs::SetConstant<GPUContext, AccT> set_zero_AccT;
auto &x_dims = x.dims(); auto &x_dims = x.dims();
int N, C, H, W, D; int N, C, H, W, D;
funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
...@@ -544,10 +558,10 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx, ...@@ -544,10 +558,10 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx,
DenseTensor scale_tmp; DenseTensor scale_tmp;
if (!Scale) { if (!Scale) {
scale_tmp.Resize({C}); scale_tmp.Resize({C});
dev_ctx.template Alloc<T>(&scale_tmp); dev_ctx.template Alloc<AccT>(&scale_tmp);
set_zero(dev_ctx, &scale_tmp, static_cast<T>(1)); set_zero_AccT(dev_ctx, &scale_tmp, static_cast<AccT>(1));
} }
const T *scale_data = Scale ? Scale->data<T>() : scale_tmp.data<T>(); const AccT *scale_data = Scale ? Scale->data<AccT>() : scale_tmp.data<AccT>();
const int block = 512; const int block = 512;
int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1); const int max_blocks = std::max(max_threads / block, 1);
...@@ -557,7 +571,7 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx, ...@@ -557,7 +571,7 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx,
if (dx) { if (dx) {
T *dx_data = dev_ctx.template Alloc<T>(dx); T *dx_data = dev_ctx.template Alloc<T>(dx);
set_zero(dev_ctx, dx, static_cast<T>(0)); set_zero(dev_ctx, dx, static_cast<T>(0));
DoubleGradComputeDX<T, block> DoubleGradComputeDX<T, AccT, block>
<<<grid, block, 0, dev_ctx.stream()>>>(x_data, <<<grid, block, 0, dev_ctx.stream()>>>(x_data,
mean_data, mean_data,
variance_data, variance_data,
...@@ -573,13 +587,13 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx, ...@@ -573,13 +587,13 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx,
if (dscale) { if (dscale) {
DenseTensor dscale_tmp; DenseTensor dscale_tmp;
dscale_tmp.Resize({NxC}); dscale_tmp.Resize({NxC});
dev_ctx.template Alloc<T>(&dscale_tmp); dev_ctx.template Alloc<AccT>(&dscale_tmp);
set_zero(dev_ctx, &dscale_tmp, static_cast<T>(0)); set_zero_AccT(dev_ctx, &dscale_tmp, static_cast<AccT>(0));
T *dscale_tmp_data = dscale_tmp.data<T>(); AccT *dscale_tmp_data = dscale_tmp.data<AccT>();
T *dscale_data = dev_ctx.template Alloc<T>(dscale); AccT *dscale_data = dev_ctx.template Alloc<AccT>(dscale);
set_zero(dev_ctx, dscale, static_cast<T>(0)); set_zero_AccT(dev_ctx, dscale, static_cast<AccT>(0));
DoubleGradComputeDScale<T, block> DoubleGradComputeDScale<T, AccT, block>
<<<grid, block, 0, dev_ctx.stream()>>>(x_data, <<<grid, block, 0, dev_ctx.stream()>>>(x_data,
mean_data, mean_data,
variance_data, variance_data,
...@@ -589,13 +603,13 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx, ...@@ -589,13 +603,13 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx,
sample_size, sample_size,
epsilon, epsilon,
dscale_tmp_data); dscale_tmp_data);
add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>( add_param<AccT, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
dscale_tmp.data<T>(), dscale->data<T>(), N, C); dscale_tmp.data<AccT>(), dscale->data<AccT>(), N, C);
} }
if (ddy) { if (ddy) {
T *ddy_data = dev_ctx.template Alloc<T>(ddy); T *ddy_data = dev_ctx.template Alloc<T>(ddy);
set_zero(dev_ctx, ddy, static_cast<T>(0)); set_zero(dev_ctx, ddy, static_cast<T>(0));
DoubleGradComputeDDY<T, block> DoubleGradComputeDDY<T, AccT, block>
<<<grid, block, 0, dev_ctx.stream()>>>(x_data, <<<grid, block, 0, dev_ctx.stream()>>>(x_data,
mean_data, mean_data,
variance_data, variance_data,
...@@ -613,24 +627,48 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx, ...@@ -613,24 +627,48 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx,
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
// MIOPEN do not support double // MIOPEN do not support double
PD_REGISTER_KERNEL( PD_REGISTER_KERNEL(instance_norm_grad,
instance_norm_grad, GPU, ALL_LAYOUT, phi::InstanceNormGradKernel, float) {} GPU,
ALL_LAYOUT,
phi::InstanceNormGradKernel,
float,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(instance_norm_double_grad, PD_REGISTER_KERNEL(instance_norm_double_grad,
GPU, GPU,
ALL_LAYOUT, ALL_LAYOUT,
phi::InstanceNormDoubleGradKernel, phi::InstanceNormDoubleGradKernel,
float) {} float,
phi::dtype::float16) {}
#elif CUDNN_VERSION_MIN(8, 1, 0)
PD_REGISTER_KERNEL(instance_norm_grad,
GPU,
ALL_LAYOUT,
phi::InstanceNormGradKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(instance_norm_double_grad,
GPU,
ALL_LAYOUT,
phi::InstanceNormDoubleGradKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#else #else
PD_REGISTER_KERNEL(instance_norm_grad, PD_REGISTER_KERNEL(instance_norm_grad,
GPU, GPU,
ALL_LAYOUT, ALL_LAYOUT,
phi::InstanceNormGradKernel, phi::InstanceNormGradKernel,
float, float,
double) {} double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(instance_norm_double_grad, PD_REGISTER_KERNEL(instance_norm_double_grad,
GPU, GPU,
ALL_LAYOUT, ALL_LAYOUT,
phi::InstanceNormDoubleGradKernel, phi::InstanceNormDoubleGradKernel,
float, float,
double) {} double,
phi::dtype::float16) {}
#endif #endif
...@@ -33,6 +33,7 @@ void InstanceNormKernel(const Context &dev_ctx, ...@@ -33,6 +33,7 @@ void InstanceNormKernel(const Context &dev_ctx,
DenseTensor *y, DenseTensor *y,
DenseTensor *saved_mean, DenseTensor *saved_mean,
DenseTensor *saved_variance) { DenseTensor *saved_variance) {
using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
double epsilon = static_cast<double>(epsilon_f); double epsilon = static_cast<double>(epsilon_f);
auto &x_dims = x.dims(); auto &x_dims = x.dims();
PADDLE_ENFORCE_GE(x_dims.size(), PADDLE_ENFORCE_GE(x_dims.size(),
...@@ -113,10 +114,10 @@ void InstanceNormKernel(const Context &dev_ctx, ...@@ -113,10 +114,10 @@ void InstanceNormKernel(const Context &dev_ctx,
DenseTensor scale_tmp; DenseTensor scale_tmp;
scale_tmp.Resize({NxC}); scale_tmp.Resize({NxC});
dev_ctx.template Alloc<T>(&scale_tmp); dev_ctx.template Alloc<AccT>(&scale_tmp);
DenseTensor bias_tmp; DenseTensor bias_tmp;
bias_tmp.Resize({NxC}); bias_tmp.Resize({NxC});
dev_ctx.template Alloc<T>(&bias_tmp); dev_ctx.template Alloc<AccT>(&bias_tmp);
const int n = x.numel(); const int n = x.numel();
const int block = 512; const int block = 512;
...@@ -124,24 +125,25 @@ void InstanceNormKernel(const Context &dev_ctx, ...@@ -124,24 +125,25 @@ void InstanceNormKernel(const Context &dev_ctx,
const int max_blocks = std::max(max_threads / block, 1); const int max_blocks = std::max(max_threads / block, 1);
const int grid = std::min((NxC + block - 1) / block, max_blocks); const int grid = std::min((NxC + block - 1) / block, max_blocks);
phi::funcs::SetConstant<GPUContext, T> set_constant; phi::funcs::SetConstant<GPUContext, AccT> set_constant;
if (scale_ptr) { if (scale_ptr) {
repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>( repeat_param<AccT><<<grid, block, 0, dev_ctx.stream()>>>(
scale_ptr->data<T>(), scale_tmp.data<T>(), N, C); scale_ptr->data<AccT>(), scale_tmp.data<AccT>(), N, C);
} else { } else {
set_constant(dev_ctx, &scale_tmp, static_cast<T>(1)); set_constant(dev_ctx, &scale_tmp, static_cast<AccT>(1));
} }
if (bias_ptr) { if (bias_ptr) {
repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>( repeat_param<AccT><<<grid, block, 0, dev_ctx.stream()>>>(
bias_ptr->data<T>(), bias_tmp.data<T>(), N, C); bias_ptr->data<AccT>(), bias_tmp.data<AccT>(), N, C);
} else { } else {
set_constant(dev_ctx, &bias_tmp, static_cast<T>(0)); set_constant(dev_ctx, &bias_tmp, static_cast<AccT>(0));
} }
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
DenseTensor saved_mean_tmp, saved_variance_tmp; DenseTensor saved_mean_tmp, saved_variance_tmp;
phi::funcs::SetConstant<GPUContext, BatchNormParamType<T>> functor; phi::funcs::SetConstant<GPUContext, BatchNormParamType<T>> functor;
if (saved_mean) { if (saved_mean) {
dev_ctx.template Alloc<BatchNormParamType<T>>(saved_mean); dev_ctx.template Alloc<BatchNormParamType<T>>(saved_mean);
functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0)); functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
...@@ -156,7 +158,6 @@ void InstanceNormKernel(const Context &dev_ctx, ...@@ -156,7 +158,6 @@ void InstanceNormKernel(const Context &dev_ctx,
saved_variance_tmp = phi::Full<BatchNormParamType<T>>( saved_variance_tmp = phi::Full<BatchNormParamType<T>>(
dev_ctx, {NxC}, static_cast<BatchNormParamType<T>>(0)); dev_ctx, {NxC}, static_cast<BatchNormParamType<T>>(0));
} }
auto *saved_mean_data = saved_mean auto *saved_mean_data = saved_mean
? saved_mean->data<BatchNormParamType<T>>() ? saved_mean->data<BatchNormParamType<T>>()
: saved_mean_tmp.data<BatchNormParamType<T>>(); : saved_mean_tmp.data<BatchNormParamType<T>>();
...@@ -225,9 +226,27 @@ void InstanceNormKernel(const Context &dev_ctx, ...@@ -225,9 +226,27 @@ void InstanceNormKernel(const Context &dev_ctx,
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
// MIOPEN do not support double // MIOPEN do not support double
PD_REGISTER_KERNEL( PD_REGISTER_KERNEL(instance_norm,
instance_norm, GPU, ALL_LAYOUT, phi::InstanceNormKernel, float) {} GPU,
ALL_LAYOUT,
phi::InstanceNormKernel,
float,
phi::dtype::float16) {}
#elif CUDNN_VERSION_MIN(8, 1, 0)
PD_REGISTER_KERNEL(instance_norm,
GPU,
ALL_LAYOUT,
phi::InstanceNormKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#else #else
PD_REGISTER_KERNEL( PD_REGISTER_KERNEL(instance_norm,
instance_norm, GPU, ALL_LAYOUT, phi::InstanceNormKernel, float, double) {} GPU,
ALL_LAYOUT,
phi::InstanceNormKernel,
float,
double,
phi::dtype::float16) {}
#endif #endif
...@@ -27,6 +27,7 @@ namespace cub = hipcub; ...@@ -27,6 +27,7 @@ namespace cub = hipcub;
#endif #endif
#include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/common/amp_type_traits.h"
namespace phi { namespace phi {
...@@ -51,22 +52,23 @@ static __global__ void add_param(const T *input, ...@@ -51,22 +52,23 @@ static __global__ void add_param(const T *input,
T *output, T *output,
const int repeat_num, const int repeat_num,
const int C) { const int C) {
typedef cub::BlockReduce<T, BlockDim> BlockReduce; using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
typedef cub::BlockReduce<MPType, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ou_storage; __shared__ typename BlockReduce::TempStorage ou_storage;
for (int i = blockIdx.x; i < C; i += gridDim.x) { for (int i = blockIdx.x; i < C; i += gridDim.x) {
T ou = static_cast<T>(0); MPType ou = static_cast<MPType>(0);
for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) { for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) {
const int index = j * C + i; const int index = j * C + i;
ou += static_cast<T>(input[index]); ou = ou + static_cast<MPType>(input[index]);
} }
ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum()); ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum());
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
output[i] = ou; output[i] = static_cast<T>(ou);
} }
__syncthreads(); __syncthreads();
if (AVG) { if (AVG) {
output[i] /= repeat_num; output[i] = static_cast<T>(static_cast<MPType>(output[i]) / repeat_num);
} }
} }
} }
......
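
The add_param change above is the "/= operator" fix from the commit message: the per-instance (repeat_num × C) parameter gradients are reduced to C entries in the wide MPType and only narrowed back to T for the store, so no compound division on a 16-bit type is ever needed. A rough numpy analogue (add_param_ref is an illustrative name, not Paddle code):

import numpy as np

def add_param_ref(inp, repeat_num, C, avg=False):
    # Collapse a (repeat_num * C,) buffer to (C,), accumulating in float32
    # regardless of the storage dtype, mirroring the MPType change above.
    acc = inp.astype(np.float32).reshape(repeat_num, C).sum(axis=0)
    if avg:
        acc /= repeat_num    # ordinary float32 division; no fp16/bf16 '/=' involved
    return acc.astype(inp.dtype)

print(add_param_ref(np.ones(8, dtype=np.float16), repeat_num=4, C=2, avg=True))  # [1. 1.]
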
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
import unittest import unittest
import numpy as np import numpy as np
from eager_op_test import OpTest, convert_float_to_uint16
import paddle import paddle
from paddle import fluid from paddle import fluid
...@@ -121,5 +122,202 @@ class TestInstanceNorm(unittest.TestCase): ...@@ -121,5 +122,202 @@ class TestInstanceNorm(unittest.TestCase):
np.testing.assert_allclose(y1, y2, rtol=1e-05) np.testing.assert_allclose(y1, y2, rtol=1e-05)
def instance_norm_warpper(
input, weight, bias, epsilon=1e-5, momentum=0.9, data_format='NCHW'
):
if data_format == "AnyLayout":
data_format = "NCDHW"
return paddle._C_ops.instance_norm(
input, weight, bias, epsilon, momentum, data_format
)
def _reference_instance_norm(x, scale, bias, epsilon):
N, C, H, W = x.shape
mean = np.mean(x, axis=(2, 3), keepdims=True)
variance = np.var(x, axis=(2, 3), keepdims=True)
std = np.sqrt(variance) + epsilon
x_norm = (x - mean) / std
scale = scale.reshape([1, C, 1, 1])
bias = bias.reshape([1, C, 1, 1])
x_norm = scale * x_norm + bias
return x_norm, mean.reshape(N * C), std.reshape(N * C)
def _reference_instance_norm_grad(x, scale, mean, var):
n, c, h, w = x.shape
d_y = np.ones(x.shape) / (np.prod(x.shape))
d_bias = np.ones((c,)) / c
mean_tile = np.reshape(mean, (n, c, 1, 1))
mean_tile = np.tile(mean_tile, (1, 1, h, w))
var_tile = np.reshape(var, (n, c, 1, 1))
var_tile = np.tile(var_tile, (1, 1, h, w))
d_scale = np.sum(d_y * (x - mean_tile) * var_tile, axis=(0, 2, 3))
var_inv = var_tile
scale_tile = np.reshape(scale, (1, c, 1, 1))
scale_tile = np.tile(scale_tile, (n, 1, h, w))
d_x = (
scale_tile
* var_inv
* (
d_y
- np.mean(d_y, axis=(2, 3), keepdims=True)
- (x - mean_tile)
* var_inv
* np.mean(
d_y * (x - mean_tile) * var_inv, axis=(2, 3), keepdims=True
)
)
)
return d_x, d_scale, d_bias
class TestInstanceNormFP32OP(OpTest):
def setUp(self):
'''Test instance_norm op with default value'''
self.op_type = "instance_norm"
self.__class__.op_type = self.op_type
self.python_api = instance_norm_warpper
self.data_format = "NCHW"
self.eps = 1e-5
self.init_dtype()
self.init_shape()
self.init_value()
self.set_err_thre()
self.inputs = {'X': self.value, 'Scale': self.scale, 'Bias': self.bias}
self.attrs = {
'epsilon': self.eps,
'momentum': 0.9,
'data_format': self.data_format,
}
y, mean, variance = _reference_instance_norm(
self.value, self.scale, self.bias, self.eps
)
self.python_out_sig = ['Y']
self.outputs = {
'Y': y,
'SavedMean': mean,
'SavedVariance': 1.0 / variance,
}
def test_check_output(self):
self.check_output(atol=self.atol)
def test_check_grad(self):
self.check_grad(
['X', 'Scale', 'Bias'],
'Y',
)
def init_dtype(self):
self.dtype = np.float32
def init_shape(self):
self.shape = [4, 100, 4, 4]
def init_value(self):
np.random.seed(0)
self.value = np.random.random(self.shape).astype(self.dtype)
self.scale = np.random.random([self.shape[1]]).astype(np.float32)
self.bias = np.random.random([self.shape[1]]).astype(np.float32)
def set_err_thre(self):
self.atol = 1e-3
@unittest.skipIf(
not core.is_compiled_with_cuda()
or not core.is_float16_supported(core.CUDAPlace(0)),
"core is not compiled with CUDA or not support the float16",
)
class TestInstanceNormFP16OP(TestInstanceNormFP32OP):
def init_dtype(self):
self.dtype = np.float16
def set_err_thre(self):
self.atol = 0.03125
self.max_relative_error = 8e-3
def test_check_output(self):
place = core.CUDAPlace(0)
self.check_output_with_place(place, atol=self.atol)
def test_check_grad(self):
place = core.CUDAPlace(0)
self.check_grad_with_place(
place,
['X', 'Scale', 'Bias'],
'Y',
max_relative_error=self.max_relative_error,
)
@unittest.skipIf(
not core.is_compiled_with_cuda()
or not core.is_bfloat16_supported(core.CUDAPlace(0)),
"core is not compiled with CUDA or not support the bfloat16",
)
class TestInstanceNormBF16OP(OpTest):
def setUp(self):
self.op_type = "instance_norm"
self.__class__.op_type = self.op_type
self.python_api = instance_norm_warpper
self.eps = 1e-5
self.data_format = "NCHW"
self.dtype = np.uint16
self.init_shape()
self.init_value()
y, mean, variance = _reference_instance_norm(
self.value, self.scale, self.bias, self.eps
)
var_inv = 1.0 / variance
self.user_defined_grads = _reference_instance_norm_grad(
self.value, self.scale, mean, var_inv
)
self.python_out_sig = ['Y']
self.outputs = {
'Y': convert_float_to_uint16(y),
'SavedMean': mean,
'SavedVariance': var_inv,
}
self.inputs = {
'X': convert_float_to_uint16(self.value),
'Scale': self.scale,
'Bias': self.bias,
}
self.attrs = {
'epsilon': self.eps,
'momentum': 0.9,
'data_format': self.data_format,
}
def init_value(self):
np.random.seed(0)
self.value = np.random.random(self.shape).astype(np.float32)
self.scale = np.random.random([self.shape[1]]).astype(np.float32)
self.bias = np.random.random([self.shape[1]]).astype(np.float32)
def init_shape(self):
self.shape = [4, 100, 4, 4]
def test_check_output(self):
place = core.CUDAPlace(0)
self.check_output_with_place(place)
def test_check_grad(self):
place = core.CUDAPlace(0)
self.check_grad_with_place(
place,
['X', 'Scale', 'Bias'],
'Y',
user_defined_grads=self.user_defined_grads,
)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
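
For context on TestInstanceNormBF16OP above: convert_float_to_uint16 (imported from eager_op_test) packs float32 values into bfloat16 bit patterns stored as uint16. A minimal numpy sketch of the round trip, assuming plain truncation (the real helper may round); the size of the round-trip error is why the bf16 test relies on a self-defined reference gradient rather than fp64 numeric gradients:

import numpy as np

def to_bf16_bits(a):
    # Keep the top 16 bits of the fp32 pattern: sign, exponent, 7 mantissa bits.
    return (np.asarray(a, np.float32).view(np.uint32) >> 16).astype(np.uint16)

def bf16_bits_to_float(b):
    return (np.asarray(b, np.uint16).astype(np.uint32) << 16).view(np.float32)

x = np.random.random([4, 100, 4, 4]).astype(np.float32)
print(np.abs(bf16_bits_to_float(to_bf16_bits(x)) - x).max())   # on the order of 1e-3
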
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
# For op in NO_FP64_CHECK_GRAD_OP_LIST, the op test requires check_grad with fp64 precision # For op in NO_FP64_CHECK_GRAD_OP_LIST, the op test requires check_grad with fp64 precision
NO_FP64_CHECK_GRAD_OP_LIST = [ NO_FP64_CHECK_GRAD_OP_LIST = [
'instance_norm',
'affine_grid', 'affine_grid',
'clip', 'clip',
'conv2d', 'conv2d',
......
...@@ -426,7 +426,10 @@ def instance_norm( ...@@ -426,7 +426,10 @@ def instance_norm(
return out return out
else: else:
check_variable_and_dtype( check_variable_and_dtype(
x, 'input', ['float32', 'float64'], "InstanceNorm" x,
'input',
['float32', 'float64', 'float16', 'uint16'],
"InstanceNorm",
) )
attrs = { attrs = {
......
...@@ -306,7 +306,10 @@ def instance_norm( ...@@ -306,7 +306,10 @@ def instance_norm(
hidden2 = paddle.static.nn.instance_norm(hidden1) hidden2 = paddle.static.nn.instance_norm(hidden1)
""" """
check_variable_and_dtype( check_variable_and_dtype(
input, 'input', ['float32', 'float64'], 'instance_norm' input,
'input',
['uint16', 'float16', 'float32', 'float64'],
'instance_norm',
) )
if param_attr is False: if param_attr is False:
assert ( assert (
......
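
The two Python-side changes above widen the static-graph dtype checks to accept float16 and uint16 (bfloat16). A construction-time smoke test, assuming a Paddle build that contains this commit; executing the program additionally needs a CUDA device, since the fp16 kernel is GPU-only and bf16 requires cuDNN 8.1+ per the registration guards earlier in the diff:

import paddle

paddle.enable_static()
with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()):
    x = paddle.static.data('x', [2, 3, 8, 8], dtype='float16')
    # Building the graph is enough to exercise the relaxed check_variable_and_dtype;
    # before this change a float16 input raised a TypeError here.
    y = paddle.static.nn.instance_norm(x)
    print(y.dtype)
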