Unverified commit ebd0f512 authored by H hong, committed by GitHub

Move bn to pten (#39347)

* add bn cpu version; test=develop

* move batch norm to pten

* move batch norm to pten; test=develop

* fix bug; test=develop

* fix func::transpose depend bug; test=develop

* fix compile bugs; test=develop

* fix use_op batch_norm bug; test=develop

* fix cudnn bn add relu test; test=develop

* fix pten context build and double grad bug; test=develop

* remove useless code; test=develop

* add batch norm gpu fp16 support; test=develop

* fix test bn op bug; test=develop

* remove output dtype set; test=develop

* fix bug; test=develop

* fix bug; test=develop

* fix apply pass to program bug; test=develop

* revert to develop; test=develop

* fix rocm bug; test=develop

* revert operator to develop; test=develop

* fix pre_commit; test=develop

* fix static check error; test=develop

* resolve conflict; test=develop

* analyze batch norm bug;

* revert batch norm op

* resolve conflict

* fix nan inf and speed bug; test=develop

* fix bug; test=develop

* fix error; test=develop

* test expand op; test=develop

* fix bug; test=develop

* resolve conflict

* resolve conflict; test=develop

* polish code; test=develop

* polish code; test=develop

* change mutable data to ctx alloc; test=develop

* make format same with ci; test=develop

* fix format error with ci; test=develop
Parent c16f85f9
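The commit series above moves the batch_norm forward, backward, and double-grad kernels out of the fluid operator files and into phi (pten). After the move, fluid code that still needs batch normalization calls the phi kernel directly through a converted device context. A minimal sketch of the forward call, copied from the call sites added later in this diff (variable setup omitted; only the snippet framing is mine):

    // Forward batch_norm via phi, as used by the InplaceABNKernel changes below.
    auto& dev_ctx = ctx.device_context<DeviceContext>();
    phi::BatchNormKernel<T>(
        static_cast<const typename framework::ConvertToPhiContext<
            DeviceContext>::TYPE&>(dev_ctx),
        *x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout,
        is_test, use_global_stats, trainable_statistics, fuse_with_relu, y,
        mean_out, variance_out, saved_mean, saved_variance, reserve_space);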
@@ -12,12 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <string>
-#include <gtest/gtest.h>
-#include <boost/logic/tribool.hpp>
-#include <random>
-#include <unordered_set>
+#include <random>
+#include <string>
+#include <unordered_set>
+#include <boost/logic/tribool.hpp>
+#include "gtest/gtest.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
@@ -25,7 +26,7 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/place.h"
-USE_OP(batch_norm);
+USE_OP_ITSELF(batch_norm);
USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN);
USE_OP(conv2d_transpose);
USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN);
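batch_norm's CPU and CUDA kernels are no longer registered through the fluid REGISTER_OP_*_KERNEL macros (see the batch_norm_op.cc and batch_norm_op.cu hunks below), so tests switch from USE_OP to USE_OP_ITSELF. As a rough sketch of the difference, paraphrased from fluid's op_registry.h (treat the exact expansion as an approximation, not a quote from this diff):

    // USE_OP declares the operator and also forces linkage of its fluid CPU
    // kernel; USE_OP_ITSELF declares only the operator, which is all that is
    // needed once the kernels live in phi.
    #define USE_OP(op_type)   \
      USE_OP_ITSELF(op_type); \
      USE_OP_DEVICE_KERNEL(op_type, CPU)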
......
@@ -2215,8 +2215,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
vector_int_attr.end());
pt_kernel_context->EmplaceBackAttr(vector_int64_attr);
}
-// TODO(YuanRisheng) Need support vector<int64_t> attr
} else if (attr_defs[i].type_index ==
std::type_index(typeid(std::vector<int32_t>))) {
const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr);
......
@@ -1289,15 +1289,3 @@ REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp,
ops::BatchNormDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(batch_norm_grad_grad, ops::BatchNormDoubleGradOp,
ops::BatchNormDoubleGradOpInplaceInferer);
REGISTER_OP_CPU_KERNEL(
batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
batch_norm_grad,
ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
batch_norm_grad_grad,
ops::BatchNormDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchNormDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
@@ -41,1327 +41,5 @@ using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
template <typename T, framework::DataLayout layout>
static __global__ void BNForwardInference(
const T *x, const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance, const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias, const int C, const int N, const int HxW,
const double epsilon, T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
int num = N * C * HxW;
for (int i = gid; i < num; i += stride) {
const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[i]) - mean[c];
BatchNormParamType<T> inv_var = 1 / sqrt(variance[c] + epsilon);
y[i] = static_cast<T>(scale[c] * x_sub_mean * inv_var + bias[c]);
}
}
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining(
const T *x, const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias, const int C, const int N, const int HxW,
const double epsilon, double exponentialAverageFactor, T *y,
BatchNormParamType<T> *mean, BatchNormParamType<T> *variance,
BatchNormParamType<T> *save_mean,
BatchNormParamType<T> *save_inv_variance) {
int outer_size = C;
int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage mean_storage;
__shared__ typename BlockReduce::TempStorage variance_storeage;
__shared__ BatchNormParamType<T> mean_val;
__shared__ BatchNormParamType<T> variance_val;
__shared__ BatchNormParamType<T> inv_var_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> x_square_sum = static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_i = static_cast<BatchNormParamType<T>>(x[index]);
x_sum += x_i;
x_square_sum += x_i * x_i;
}
x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
x_square_sum =
BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum());
if (threadIdx.x == 0) {
mean_val = x_sum / inner_size;
variance_val = x_square_sum / inner_size - mean_val * mean_val;
inv_var_val = 1 / sqrt(variance_val + epsilon);
if (save_mean && save_inv_variance) {
save_mean[i] = mean_val;
save_inv_variance[i] = inv_var_val;
}
mean[i] = (1 - exponentialAverageFactor) * mean_val +
exponentialAverageFactor * mean[i];
variance[i] = (1 - exponentialAverageFactor) * variance_val +
exponentialAverageFactor * variance[i];
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[index]) - mean_val;
y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i];
}
}
}
template <typename T>
class BatchNormKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument("It must use CUDAPlace."));
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
bool test_mode = is_test && (!trainable_stats);
// Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width]
const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims();
PADDLE_ENFORCE_EQ(
x_dims.size() >= 2 && x_dims.size() <= 5, true,
platform::errors::InvalidArgument(
"The size of input's dimensions should be between 2 and 5"
"But received: the size of input's dimensions is [%d]",
x_dims.size()));
auto *y = ctx.Output<Tensor>("Y");
y->mutable_data<T>(ctx.GetPlace());
int N, C, H, W, D;
ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
auto dtype = platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC
: DataLayout::kNCHW;
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// HIP do not support compute format of NHWC
// auto compute_format = DataLayout::kNCHW;
#else
const bool fast_nhwc_batch_norm =
test_mode ||
(dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent);
auto compute_format =
fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
? DataLayout::kNHWC
: DataLayout::kNCHW;
#endif
Tensor transformed_x(x->type());
Tensor transformed_y(y->type());
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW && x_dims.size() > 2) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
&transformed_x);
TransToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
&transformed_x);
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, y,
&transformed_y);
} else {
transformed_x.ShareDataWith(*x);
transformed_y.ShareDataWith(*y);
}
// ------------------- cudnn descriptors ---------------------
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// miopenTensorDescriptor_t data_desc_;
// miopenTensorDescriptor_t bn_param_desc_;
// miopenBatchNormMode_t mode_;
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// mode_ = miopenBNSpatial;
#elif CUDNN_VERSION_MIN(7, 0, 1)
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else
if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#endif // CUDNN_VERSION_MIN(7, 0, 1)
VLOG(3) << "Setting descriptors.";
std::vector<int> dims;
std::vector<int> strides;
if (compute_format == DataLayout::kNCHW) {
dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else {
dims = {N, C, H, W, D};
strides = {H * W * D * C, 1, W * D * C, D * C, C};
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
// data_desc_, CudnnDataType<T>::type,
// x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
// const_cast<int *>(strides.data())));
// Note: PERSISTENT not implemented for inference
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDeriveBNTensorDescriptor(
// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
// Note: PERSISTENT not implemented for inference
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor(
bn_param_desc_, data_desc_,
test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_));
#endif
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
// Now, depending on whether we are running test or not, we have two paths.
// It is training mode when it's not reference AND not using pre-trained
// model.
bool training = !test_mode && !use_global_stats;
if (!training) {
// only when test we use input to do computation.
const auto *est_mean = ctx.Input<Tensor>("Mean");
const auto *est_var = ctx.Input<Tensor>("Variance");
// Run inference mode.
PADDLE_ENFORCE_EQ(
est_mean->dims().size(), 1UL,
platform::errors::InvalidArgument(
"The size of mean's dimensions must equal to 1."
"But received: the size of mean's dimensions mean is [%d],"
"the dimensions of mean is [%s].",
est_mean->dims().size(), est_mean->dims()));
PADDLE_ENFORCE_EQ(
est_var->dims().size(), 1UL,
platform::errors::InvalidArgument(
"The size of variance's dimensions must equal to 1."
"But received: the size of variance's dimensions is [%d],"
"the dimensions of variance is [%s].",
est_var->dims().size(), est_var->dims()));
PADDLE_ENFORCE_EQ(
est_mean->dims()[0], C,
platform::errors::InvalidArgument(
"The first dimension of mean must equal to the number of "
"Channels, which is [%d]. But received: the first dimension"
"of mean is [%d], the dimensions of mean is [%s].",
C, est_mean->dims()[0], est_mean->dims()));
PADDLE_ENFORCE_EQ(
est_var->dims()[0], C,
platform::errors::InvalidArgument(
"The first dimension of variance must equal to the number"
"of Channels, which is [%d]. But received: the first dimension of"
"variance is [%d], the dimensions of variance is [%s].",
C, est_var->dims()[0], est_var->dims()));
#ifdef PADDLE_WITH_HIP
const int block_size = 256;
const int grid_size = (N * C * H * W * D + block_size - 1) / block_size;
if (compute_format == DataLayout::kNCHW) {
BNForwardInference<
T,
DataLayout::kNCHW><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
epsilon, transformed_y.template data<T>());
} else {
BNForwardInference<
T,
DataLayout::kNHWC><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
epsilon, transformed_y.template data<T>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardInference(
// handle, miopenBNSpatial,
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// est_mean->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// est_var->template data<BatchNormParamType<T>>())),
// epsilon));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationForwardInference(
handle,
// Note: PERSISTENT not implemented for inference
CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_,
transformed_x.template data<T>(), data_desc_,
transformed_y.template mutable_data<T>(ctx.GetPlace()),
bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(), epsilon));
#endif
} else {
// if MomentumTensor is set, use MomentumTensor value, momentum
// is only used in this training branch
if (ctx.HasInput("MomentumTensor")) {
const auto *mom_tensor = ctx.Input<Tensor>("MomentumTensor");
Tensor mom_cpu;
paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(),
&mom_cpu);
momentum = mom_cpu.data<float>()[0];
}
// Run training mode.
// obtain running mean and running inv var, and there is no need
// to initialize them.
auto *mean_out = ctx.Output<Tensor>("MeanOut");
auto *variance_out = ctx.Output<Tensor>("VarianceOut");
mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
if ((N * H * W * D) == 1) {
// Only 1 element in normalization dimension,
// skip the batch norm calculation, let y = x.
framework::TensorCopy(*x, ctx.GetPlace(), y);
} else {
double this_factor = 1. - momentum;
bool called = false;
#if CUDNN_VERSION_MIN(7, 4, 1)
called = true;
size_t workspace_size = 0;
size_t reserve_space_size = 0;
void *reserve_space_ptr = nullptr;
void *workspace_ptr = nullptr;
Tensor workspace_tensor;
// Create reserve space and workspace for batch norm.
// Create tensor for each batchnorm op, it will be used in the
// backward. Thus this tensor shouldn't be temp.
auto *reserve_space = ctx.Output<Tensor>("ReserveSpace");
PADDLE_ENFORCE_NOT_NULL(
reserve_space,
platform::errors::NotFound(
"The argument ReserveSpace of batch_norm op is not found."));
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::
cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnIps=*/CUDNN_BATCHNORM_OPS_BN,
/*xDesc=*/data_desc_,
/*zDesc=*/nullptr,
/*yDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/nullptr,
/*sizeInBytes=*/&workspace_size));
// -------------- cudnn batchnorm reserve space --------------
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::
cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*activationDesc=*/nullptr,
/*xDesc=*/data_desc_,
/*sizeInBytes=*/&reserve_space_size));
reserve_space_ptr = reserve_space->mutable_data(
ctx.GetPlace(), transformed_x.type(), reserve_space_size);
workspace_ptr = workspace_tensor.mutable_data(
ctx.GetPlace(), transformed_x.type(), workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
handle, mode_, CUDNN_BATCHNORM_OPS_BN, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_,
transformed_x.template data<T>(), nullptr, nullptr, data_desc_,
transformed_y.template data<T>(), bn_param_desc_,
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), this_factor,
mean_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
variance_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
nullptr, workspace_ptr, workspace_size, reserve_space_ptr,
reserve_space_size));
#endif // CUDNN_VERSION_MIN(7, 4, 1)
if (!called) {
#ifdef PADDLE_WITH_HIP
const int num = transformed_x.numel();
const int block = 256;
const int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
const int grid = std::min(C, max_blocks);
if (compute_format == DataLayout::kNCHW) {
BNForwardTraining<
T, block,
DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
transformed_x.template data<T>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
epsilon, this_factor, transformed_y.template data<T>(),
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
} else {
BNForwardTraining<
T, block,
DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
transformed_x.template data<T>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
epsilon, this_factor, transformed_y.template data<T>(),
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardTraining(
// handle, mode_, const_cast<void *>(static_cast<const void *>(
// CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// this_factor,
// static_cast<void *>(
// mean_out->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(variance_out->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace())),
// epsilon,
// static_cast<void *>(
// saved_mean->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(saved_variance->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace()))));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationForwardTraining(
handle, mode_, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_,
transformed_x.template data<T>(), data_desc_,
transformed_y.template mutable_data<T>(ctx.GetPlace()),
bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), this_factor,
mean_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
variance_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace())));
#endif
}
}
}
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW && x_dims.size() > 2) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_y, y);
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// clean when exit.
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
#else
// clean when exit.
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
#endif
}
};
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias(
const T *dy, const T *x, const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance, const double epsilon, const int N,
const int C, const int HxW, BatchNormParamType<T> *dscale,
BatchNormParamType<T> *dbias) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> inv_var_i = 1.0 / sqrt(variance[i] + epsilon);
BatchNormParamType<T> mean_i = mean[i];
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
ds_sum += static_cast<BatchNormParamType<T>>(dy[index]) *
(static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
db_sum += static_cast<BatchNormParamType<T>>(dy[index]);
}
ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
if (threadIdx.x == 0) {
dscale[i] = ds_sum * inv_var_i;
dbias[i] = db_sum;
}
__syncthreads();
}
}
template <typename T, framework::DataLayout layout>
static __global__ void KeBNBackwardData(const T *dy,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *variance,
const double epsilon, const int C,
const int HxW, const int num, T *dx) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
scale[c] * inv_var);
}
}
template <typename T>
static __global__ void KeBNRestoreData(const framework::DataLayout layout, T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
double epsilon, int C, int M,
const int num, const T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == framework::DataLayout::kNCHW ? (i / M) % C : i % C;
auto y_i = static_cast<BatchNormParamType<T>>(y[i]);
auto x_i = (y_i - bias[c]) / scale[c] / variance[c] + mean[c];
x[i] = static_cast<T>(x_i);
}
}
template <typename T>
class InplaceHelper {
public:
void operator()(const framework::DataLayout layout, T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance, double epsilon, int C,
int M, const int num, const T *y, int grid2, const int block,
const gpuStream_t &stream) {
PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument(
"X and Y should be inplaced in inplace mode"));
KeBNRestoreData<<<grid2, block, 0, stream>>>(
layout, x, scale, bias, mean, variance, epsilon, C, M, num, y);
}
};
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward(
const T *dy, const T *x, const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *saved_mean,
const BatchNormParamType<T> *saved_inv_variance, const int C, const int N,
const int HxW, const double epsilon, T *dx, BatchNormParamType<T> *dscale,
BatchNormParamType<T> *dbias) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage;
__shared__ typename BlockReduce::TempStorage mean_storage;
__shared__ typename BlockReduce::TempStorage variance_storeage;
__shared__ BatchNormParamType<T> inv_var_val;
__shared__ BatchNormParamType<T> mean_val;
__shared__ BatchNormParamType<T> dscale_val;
__shared__ BatchNormParamType<T> dbias_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
if (saved_mean && saved_inv_variance) {
if (threadIdx.x == 0) {
inv_var_val = saved_inv_variance[i];
mean_val = saved_mean[i];
}
} else {
BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> x_square_sum =
static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_i =
static_cast<BatchNormParamType<T>>(x[index]);
x_sum += x_i;
x_square_sum += x_i * x_i;
}
x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
x_square_sum =
BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum());
if (threadIdx.x == 0) {
mean_val = x_sum / inner_size;
inv_var_val =
1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon);
}
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> dy_i =
static_cast<BatchNormParamType<T>>(dy[index]);
ds_sum +=
dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_val);
db_sum += dy_i;
}
ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
if (threadIdx.x == 0) {
dscale_val = ds_sum * inv_var_val;
dbias_val = db_sum;
dscale[i] = dscale_val;
dbias[i] = dbias_val;
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
dx[index] = scale[i] * inv_var_val *
(static_cast<BatchNormParamType<T>>(dy[index]) -
dbias_val / static_cast<BatchNormParamType<T>>(inner_size) -
(static_cast<BatchNormParamType<T>>(x[index]) - mean_val) *
inv_var_val * dscale_val / inner_size);
}
}
}
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData(
const T *dy, const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *mean, const T *x,
const BatchNormParamType<T> *variance, const int C, const int N,
const int HxW, T *dx) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage dy_storage;
__shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage;
__shared__ BatchNormParamType<T> dy_sum_val;
__shared__ BatchNormParamType<T> dy_x_sub_mean_sum_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> inv_var_i = variance[i];
BatchNormParamType<T> mean_i = mean[i];
BatchNormParamType<T> dy_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> dy_x_sub_mean_sum =
static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> dy_i =
static_cast<BatchNormParamType<T>>(dy[index]);
dy_sum += dy_i;
dy_x_sub_mean_sum +=
dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
}
dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage)
.Reduce(dy_x_sub_mean_sum, cub::Sum());
if (threadIdx.x == 0) {
dy_sum_val = dy_sum;
dy_x_sub_mean_sum_val = dy_x_sub_mean_sum;
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
dx[index] =
(static_cast<BatchNormParamType<T>>(dy[index]) -
dy_sum_val / static_cast<BatchNormParamType<T>>(inner_size) -
(static_cast<BatchNormParamType<T>>(x[index]) - mean_i) *
dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) *
scale[i] * inv_var_i;
}
}
}
template <typename T>
class BatchNormGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument("It must use CUDAPlace."));
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
// batch_norm with inplace as false will take X as grad input, which
// is same as cuDNN batch_norm backward calculation, batch_norm
// with inplace as true only take Y as input and X should be calculate
// by inverse operation of batch_norm on Y
const Tensor *x;
bool is_inplace;
if (ctx.HasInput("Y")) {
x = ctx.Input<Tensor>("Y");
is_inplace = true;
if (d_x) {
PADDLE_ENFORCE_EQ(d_x, d_y,
platform::errors::InvalidArgument(
"X@GRAD and Y@GRAD not inplace in inplace mode"));
}
} else {
x = ctx.Input<Tensor>("X");
is_inplace = false;
if (d_x) {
PADDLE_ENFORCE_NE(
d_x, d_y, platform::errors::InvalidArgument(
"X@GRAD and Y@GRAD inplaced in non-inplace mode"));
}
}
const bool is_test = ctx.Attr<bool>("is_test");
use_global_stats = is_test || use_global_stats;
const auto &x_dims = x->dims();
PADDLE_ENFORCE_EQ(
x_dims.size() >= 2 && x_dims.size() <= 5, true,
platform::errors::InvalidArgument(
"The size of input's dimensions should be between 2 and 5."
"But received: the size of input's dimensions is [%d],"
"the dimensions of input is [%s]",
x_dims.size(), x_dims));
int N, C, H, W, D;
ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
// init output
if (d_x) {
d_x->mutable_data<T>(ctx.GetPlace());
}
if (d_scale && d_bias) {
d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
}
PADDLE_ENFORCE_EQ(
scale->dims().size(), 1UL,
platform::errors::InvalidArgument(
"The size of scale's dimensions must equal to 1. But received: "
"the size of scale's dimensions is [%d], the dimensions of scale "
"is [%s].",
scale->dims().size(), scale->dims()));
PADDLE_ENFORCE_EQ(
scale->dims()[0], C,
platform::errors::InvalidArgument(
"The first dimension of scale must equal to Channels[%d]. But "
"received: the first dimension of scale is [%d]",
C, scale->dims()[0]));
auto dtype = platform::CudnnDataType<T>::type;
const auto *reserve_space = ctx.Input<Tensor>("ReserveSpace");
#ifdef PADDLE_WITH_HIP
auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC
: DataLayout::kNCHW;
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// HIP do not support compute format of NHWC
// auto compute_format = DataLayout::kNCHW;
#else
const bool fast_nhwc_batch_norm =
dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent &&
reserve_space != nullptr;
auto compute_format =
fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
? DataLayout::kNHWC
: DataLayout::kNCHW;
#endif
Tensor transformed_x(x->type());
Tensor transformed_d_y(d_y->type());
Tensor transformed_d_x;
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW && x_dims.size() > 2) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
&transformed_x);
TransToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
&transformed_x);
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, d_y,
&transformed_d_y);
TransToChannelFirst<platform::CUDADeviceContext, T>(ctx, d_y,
&transformed_d_y);
if (d_x) {
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, d_x,
&transformed_d_x);
}
} else {
transformed_x.ShareDataWith(*x);
transformed_d_y.ShareDataWith(*d_y);
if (d_x) {
transformed_d_x.ShareDataWith(*d_x);
}
}
std::vector<int> dims;
std::vector<int> strides;
if (compute_format == DataLayout::kNCHW) {
dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else {
dims = {N, C, H, W, D};
strides = {H * W * C * D, 1, W * D * C, D * C, C};
}
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
const int num = transformed_x.numel();
#ifdef HIPCC
const int block = 256;
#else
const int block = 512;
#endif
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
int grid1 = (num + block - 1) / block;
int grid2 = std::min(C, max_blocks);
auto stream = dev_ctx.stream();
InplaceHelper<T> inplace_functor;
if (!use_global_stats) {
if ((N * H * W * D) == 1) {
if (d_x) {
framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
}
phi::funcs::SetConstant<platform::CUDADeviceContext,
BatchNormParamType<T>>
functor;
functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
return;
}
// ------------------- cudnn descriptors ---------------------
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// miopenTensorDescriptor_t data_desc_;
// miopenTensorDescriptor_t bn_param_desc_;
// miopenBatchNormMode_t mode_;
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// mode_ = miopenBNSpatial;
#elif CUDNN_VERSION_MIN(7, 0, 1)
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else
if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#endif // CUDNN_VERSION_MIN(7, 0, 1)
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
// data_desc_, CudnnDataType<T>::type,
// x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
// const_cast<int *>(strides.data())));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_,
// data_desc_, mode_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
data_desc_, mode_));
#endif
const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
const auto *saved_mean_data =
saved_mean->template data<BatchNormParamType<T>>();
const auto *saved_var_data =
saved_var->template data<BatchNormParamType<T>>();
if (is_inplace) {
inplace_functor(compute_format, transformed_x.data<T>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(),
saved_mean_data, saved_var_data, epsilon, C, H * W * D,
num, transformed_x.data<T>(), grid2, block, stream);
}
// This branch calls CUDNN APIs
if (d_x && d_scale && d_bias) {
bool called = false;
#if CUDNN_VERSION_MIN(7, 4, 1)
called = true;
size_t workspace_size = 0;
void *workspace_ptr = nullptr;
Tensor workspace_tensor;
auto reserve_space_size = reserve_space->memory_size();
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::
cudnnGetBatchNormalizationBackwardExWorkspaceSize(
/*handle=*/dev_ctx.cudnn_handle(),
/*mode=*/mode_,
/*bnIps=*/CUDNN_BATCHNORM_OPS_BN,
/*xDesc=*/data_desc_,
/*yDesc=*/data_desc_,
/*dyDesc=*/data_desc_,
/*dzDesc=*/nullptr,
/*dxDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/nullptr,
/*sizeInBytes=*/&workspace_size));
workspace_ptr = workspace_tensor.mutable_data(
ctx.GetPlace(), transformed_x.type(), workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationBackwardEx(
/*handle=*/dev_ctx.cudnn_handle(),
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*alphaDataDiff=*/CudnnDataType<T>::kOne(),
/*betaDataDiff=*/CudnnDataType<T>::kZero(),
/*alphaParamDiff=*/CudnnDataType<T>::kOne(),
/*betaParamDiff=*/CudnnDataType<T>::kZero(),
/*xDesc=*/data_desc_,
/*xData=*/transformed_x.template data<T>(),
/*yDesc=*/nullptr,
/*yData=*/nullptr,
/*dyDesc=*/data_desc_,
/*dyData=*/transformed_d_y.template data<T>(),
/*dzDesc=*/nullptr,
/*dzData=*/nullptr,
/*dxDesc=*/data_desc_,
/*dxData=*/transformed_d_x.template mutable_data<T>(
ctx.GetPlace()),
/*dBnScaleBiasDesc=*/bn_param_desc_,
/*bnScaleData=*/scale->template data<BatchNormParamType<T>>(),
/*bnBiasData=*/nullptr,
/*dBnScaleData=*/d_scale
->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
/*dBnBiasData=*/d_bias
->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
/*epsilon=*/epsilon,
/*savedMean=*/saved_mean_data,
/*savedInvVariance=*/saved_var_data,
/*activationDesc=*/nullptr,
/*workspace=*/workspace_ptr,
/*workSpaceSizeInBytes=*/workspace_size,
/*reserveSpace=*/const_cast<T *>(
reserve_space->template data<T>()),
/*reserveSpaceSizeInBytes=*/reserve_space_size));
#endif // CUDNN_VERSION_MIN(7, 4, 1)
if (!called) {
#ifdef PADDLE_WITH_HIP
if (compute_format == DataLayout::kNCHW) {
BNBackward<
T, block,
DataLayout::kNCHW><<<grid2, block, 0, dev_ctx.stream()>>>(
transformed_d_y.template data<T>(),
transformed_x.template data<T>(),
scale->template data<BatchNormParamType<T>>(), saved_mean_data,
saved_var_data, C, N, H * W * D, epsilon,
transformed_d_x.template data<T>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()));
} else {
BNBackward<
T, block,
DataLayout::kNHWC><<<grid2, block, 0, dev_ctx.stream()>>>(
transformed_d_y.template data<T>(),
transformed_x.template data<T>(),
scale->template data<BatchNormParamType<T>>(), saved_mean_data,
saved_var_data, C, N, H * W * D, epsilon,
transformed_d_x.template data<T>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()));
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationBackward(
// dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
// CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
// CudnnDataType<T>::kZero(), data_desc_,
// transformed_x.template data<T>(), data_desc_,
// transformed_d_y.template data<T>(), data_desc_,
// transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
// bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
// d_scale->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace()),
// d_bias->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace()),
// epsilon, saved_mean_data, saved_var_data));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationBackward(
dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_,
transformed_x.template data<T>(), data_desc_,
transformed_d_y.template data<T>(), data_desc_,
transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon, saved_mean_data, saved_var_data));
#endif
}
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_d_x, d_x);
}
} else {
// This branch call CUDA kernels
if (compute_format == DataLayout::kNCHW) {
if (d_x) {
BNBackwardData<T, block, framework::DataLayout::kNCHW><<<
grid2, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
saved_mean_data, x->data<T>(), saved_var_data, C, N, H * W * D,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T, block,
framework::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
d_y->data<T>(), x->data<T>(), saved_mean_data, saved_var_data,
epsilon, N, C, H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
} else {
if (d_x) {
BNBackwardData<T, block, framework::DataLayout::kNHWC><<<
grid2, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
saved_mean_data, x->data<T>(), saved_var_data, C, N, H * W * D,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T, block,
framework::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
d_y->data<T>(), x->data<T>(), saved_mean_data, saved_var_data,
epsilon, N, C, H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
}
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// clean when exit.
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
#else
// clean when exit.
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
#endif
} else {
const auto *running_mean = ctx.Input<Tensor>("Mean");
const auto *running_var = ctx.Input<Tensor>("Variance");
const auto *running_mean_data =
running_mean->template data<BatchNormParamType<T>>();
const auto *running_var_data =
running_var->template data<BatchNormParamType<T>>();
if (is_inplace) {
auto px = *x;
inplace_functor(data_layout, px.mutable_data<T>(ctx.GetPlace()),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(),
running_mean_data, running_var_data, epsilon, C,
H * W * D, num, x->data<T>(), grid2, block, stream);
}
if (compute_format == DataLayout::kNCHW) {
if (d_x) {
KeBNBackwardData<
T, framework::DataLayout::kNCHW><<<grid1, block, 0, stream>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
running_var_data, epsilon, C, H * W, num, d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T, block,
framework::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
} else {
if (d_x) {
KeBNBackwardData<
T, framework::DataLayout::kNHWC><<<grid1, block, 0, stream>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
running_var_data, epsilon, C, H * W, num, d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T, block,
framework::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
}
}
}
};
template <typename T>
class BatchNormDoubleGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *X = ctx.Input<Tensor>("X");
const auto *Scale = ctx.Input<Tensor>("Scale");
const auto *dY = ctx.Input<Tensor>("DY");
const auto *Saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *Saved_variance = ctx.Input<Tensor>("SavedVariance");
const double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool is_test = ctx.Attr<bool>("is_test");
PADDLE_ENFORCE_EQ(
is_test, false,
platform::errors::InvalidArgument(
"`is_test = True` CANNOT be used in train program. If "
"you want to use global status in pre_train model, "
"please set `use_global_stats = True`"));
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const auto *ddX = ctx.Input<Tensor>("DDX");
const auto *ddScale = ctx.Input<Tensor>("DDScale");
const auto *ddBias = ctx.Input<Tensor>("DDBias");
auto *dX = ctx.Output<Tensor>("DX");
auto *dScale = ctx.Output<Tensor>("DScale");
auto *ddY = ctx.Output<Tensor>("DDY");
NormDoubleGradFunctor<platform::CUDADeviceContext, T>(
ctx, data_layout, X, Scale, dY, Saved_mean, Saved_variance, epsilon,
use_global_stats, ddX, ddScale, ddBias, dX, dScale, ddY);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#ifdef PADDLE_WITH_HIP
// MIOPEN do not support double
REGISTER_OP_CUDA_KERNEL(
batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>,
ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
ops::BatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
batch_norm_grad_grad,
ops::BatchNormDoubleGradKernel<plat::CUDADeviceContext, float>);
#else
REGISTER_OP_CUDA_KERNEL(
batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>,
ops::BatchNormKernel<plat::CUDADeviceContext, double>,
ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
ops::BatchNormGradKernel<plat::CUDADeviceContext, double>,
ops::BatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
batch_norm_grad_grad,
ops::BatchNormDoubleGradKernel<plat::CUDADeviceContext, float>,
ops::BatchNormDoubleGradKernel<plat::CUDADeviceContext, double>);
#endif
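All of the CUDA kernels and REGISTER_OP_CUDA_KERNEL registrations deleted above are re-implemented under paddle/phi/kernels (the headers paddle/phi/kernels/batch_norm_kernel.h and batch_norm_grad_kernel.h included further down). The replacement files are not part of this excerpt; as a hedged sketch of what a phi-side GPU registration typically looks like (the macro name, backend tag, and dtype list are assumptions, not taken from this diff):

    // Hypothetical phi registration replacing the fluid macros removed above.
    PD_REGISTER_KERNEL(batch_norm,            // kernel name
                       GPU,                   // backend
                       ALL_LAYOUT,            // supported layouts
                       phi::BatchNormKernel,  // kernel function template
                       float,
                       double,
                       phi::dtype::float16) {}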
@@ -32,7 +32,7 @@ namespace platform = paddle::platform;
namespace op = paddle::operators;
using Tensor = paddle::framework::Tensor;
-USE_OP(batch_norm);
+USE_OP_ITSELF(batch_norm);
USE_CUDA_ONLY_OP(fused_bn_add_activation);
USE_CUDA_ONLY_OP(fused_bn_add_activation_grad);
......
@@ -17,6 +17,8 @@
#include <string>
#include <vector>
#include "paddle/fluid/operators/batch_norm_op.h"
+#include "paddle/phi/kernels/batch_norm_grad_kernel.h"
+#include "paddle/phi/kernels/batch_norm_kernel.h"
namespace paddle {
namespace operators {
@@ -202,8 +204,7 @@ class InplaceABNOpGradMaker : public framework::SingleGradOpMaker<T> {
};
template <typename DeviceContext, typename T>
-class InplaceABNKernel
-: public paddle::operators::BatchNormKernel<DeviceContext, T> {
+class InplaceABNKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X");
@@ -213,7 +214,33 @@ class InplaceABNKernel
auto activation =
GetInplaceABNActivationType(ctx.Attr<std::string>("activation"));
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-BatchNormKernel<DeviceContext, T>::Compute(ctx);
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* mean = ctx.Input<Tensor>("Mean");
auto* variance = ctx.Input<Tensor>("Variance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* mean_out = ctx.Output<Tensor>("MeanOut");
auto* variance_out = ctx.Output<Tensor>("VarianceOut");
auto* saved_mean = ctx.Output<Tensor>("SavedMean");
auto* saved_variance = ctx.Output<Tensor>("SavedVariance");
auto* reserve_space = ctx.Output<Tensor>("ReserveSpace");
auto& dev_ctx = ctx.device_context<DeviceContext>();
phi::BatchNormKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout,
is_test, use_global_stats, trainable_statistics, fuse_with_relu, y,
mean_out, variance_out, saved_mean, saved_variance, reserve_space);
auto cur_y = EigenVector<T>::Flatten(*y);
InplaceABNActivation<DeviceContext, T> functor;
@@ -222,8 +249,7 @@ class InplaceABNKernel
};
template <typename DeviceContext, typename T>
-class InplaceABNGradKernel
-: public paddle::operators::BatchNormGradKernel<DeviceContext, T> {
+class InplaceABNGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* y = ctx.Input<Tensor>("Y");
@@ -244,7 +270,52 @@ class InplaceABNGradKernel
InplaceABNActivation<DeviceContext, T> functor;
functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy);
-BatchNormGradKernel<DeviceContext, T>::Compute(ctx);
+// BatchNormGradKernel<DeviceContext, T>::Compute(ctx);
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* saved_mean = ctx.Input<Tensor>("SavedMean");
auto* saved_variance = ctx.Input<Tensor>("SavedVariance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* scale_grad = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto* bias_grad = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto* reserve_space = ctx.Input<Tensor>("ReserveSpace");
    auto* mean = ctx.Input<Tensor>("Mean");
    auto* variance = ctx.Input<Tensor>("Variance");
paddle::optional<const Tensor&> space_opt = paddle::none;
paddle::optional<const Tensor&> mean_opt = paddle::none;
paddle::optional<const Tensor&> variance_opt = paddle::none;
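    // ReserveSpace, Mean and Variance are dispensable inputs, so they are
    // passed to the phi kernel as paddle::optional values that stay none when
    // the corresponding input is absent.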
if (reserve_space != nullptr) {
space_opt = *reserve_space;
}
if (mean != nullptr) {
mean_opt = *mean;
}
if (variance != nullptr) {
variance_opt = *variance;
}
auto& dev_ctx = ctx.device_context<DeviceContext>();
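    // The trailing `true` argument is is_inplace: in inplace_abn X and Y share
    // storage, so the raw grad kernel first reconstructs X from Y before it
    // computes the gradients.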
phi::BatchNormGradRawKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt,
mean_opt, variance_opt, momentum, epsilon, data_layout, is_test,
use_global_stats, trainable_statistics, fuse_with_relu, true, d_x,
scale_grad, bias_grad);
  }
};
......
@@ -15,14 +15,15 @@ limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/inplace_abn_op.h"
#include "paddle/fluid/operators/sync_batch_norm_op.cu.h"
#include "paddle/phi/kernels/batch_norm_grad_kernel.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class InplaceABNKernel
    : public paddle::operators::SyncBatchNormKernel<DeviceContext, T>,
      public paddle::operators::BatchNormKernel<DeviceContext, T> {
    : public paddle::operators::SyncBatchNormKernel<DeviceContext, T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* y = ctx.Output<Tensor>("Y");
@@ -36,7 +37,33 @@ class InplaceABNKernel
    if (ctx.Attr<bool>("use_sync_bn")) {
      SyncBatchNormKernel<DeviceContext, T>::Compute(ctx);
    } else {
      // BatchNormKernel<DeviceContext, T>::Compute(ctx);
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* mean = ctx.Input<Tensor>("Mean");
auto* variance = ctx.Input<Tensor>("Variance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* mean_out = ctx.Output<Tensor>("MeanOut");
auto* variance_out = ctx.Output<Tensor>("VarianceOut");
auto* saved_mean = ctx.Output<Tensor>("SavedMean");
auto* saved_variance = ctx.Output<Tensor>("SavedVariance");
auto* reserve_space = ctx.Output<Tensor>("ReserveSpace");
auto& dev_ctx = ctx.device_context<DeviceContext>();
phi::BatchNormKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout,
is_test, use_global_stats, trainable_statistics, fuse_with_relu, y,
mean_out, variance_out, saved_mean, saved_variance, reserve_space);
    }
    auto cur_y = EigenVector<T>::Flatten(*y);
@@ -49,8 +76,7 @@ class InplaceABNKernel
// https://kevinzakka.github.io/2016/09/14/batch_normalization/
template <typename DeviceContext, typename T>
class InplaceABNGradKernel
    : public paddle::operators::SyncBatchNormGradKernel<DeviceContext, T>,
      public paddle::operators::BatchNormGradKernel<DeviceContext, T> {
    : public paddle::operators::SyncBatchNormGradKernel<DeviceContext, T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto* y = ctx.Input<Tensor>("Y");
@@ -74,7 +100,50 @@ class InplaceABNGradKernel
    if (ctx.Attr<bool>("use_sync_bn")) {
      SyncBatchNormGradKernel<DeviceContext, T>::Compute(ctx);
    } else {
      BatchNormGradKernel<DeviceContext, T>::Compute(ctx);
      auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* saved_mean = ctx.Input<Tensor>("SavedMean");
auto* saved_variance = ctx.Input<Tensor>("SavedVariance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* scale_grad = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto* bias_grad = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto* reserve_space = ctx.Input<Tensor>("ReserveSpace");
      auto* mean = ctx.Input<Tensor>("Mean");
      auto* variance = ctx.Input<Tensor>("Variance");
paddle::optional<const Tensor&> space_opt = paddle::none;
paddle::optional<const Tensor&> mean_opt = paddle::none;
paddle::optional<const Tensor&> variance_opt = paddle::none;
if (reserve_space != nullptr) {
space_opt = *reserve_space;
}
if (mean != nullptr) {
mean_opt = *mean;
}
if (variance != nullptr) {
variance_opt = *variance;
}
auto& dev_ctx = ctx.device_context<DeviceContext>();
phi::BatchNormGradRawKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt,
mean_opt, variance_opt, momentum, epsilon, data_layout, is_test,
use_global_stats, trainable_statistics, fuse_with_relu, true, d_x,
scale_grad, bias_grad);
    }
  }
};
......
@@ -389,11 +389,12 @@ __global__ void DoubleGradComputeDDYWithGlobal(
}
template <typename DeviceContext, typename T>
void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
void NormDoubleGradFunctor(const DeviceContext &ctx,
                           const DataLayout data_layout, const Tensor *X,
                           const Tensor *Scale, const Tensor *dY,
                           const Tensor *Saved_mean,
                           const Tensor *Saved_variance, const double epsilon,
                           const Tensor *Saved_variance, const Tensor *Mean,
                           const Tensor *Variance, const double epsilon,
                           const bool use_global_stats, const Tensor *ddX,
                           const Tensor *ddScale, const Tensor *ddBias,
                           Tensor *dX, Tensor *dScale, Tensor *ddY) {
@@ -404,8 +405,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
  const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data<T>());
  const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data<T>());
  auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
  phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_constant;
  phi::funcs::SetConstant<DeviceContext, T> set_constant;
  auto &x_dims = X->dims();
  const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
@@ -416,7 +416,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
  Tensor scale_tmp;
  if (!Scale) {
    scale_tmp.mutable_data<T>({C}, ctx.GetPlace());
    set_constant(dev_ctx, &scale_tmp, static_cast<T>(1));
    set_constant(ctx, &scale_tmp, static_cast<T>(1));
  }
  const T *scale_data = Scale ? Scale->data<T>() : scale_tmp.data<T>();
#ifdef __HIPCC__
@@ -424,15 +424,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
#else
  const int block = 512;
#endif
  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
  int max_threads = ctx.GetMaxPhysicalThreadCount();
  const int max_blocks = std::max(max_threads / block, 1);
  int grid = std::min(C, max_blocks);
  int grid1 = (num + block - 1) / block;
  const T *mean_data, *variance_data;
  if (use_global_stats) {
    const auto *running_mean = ctx.Input<Tensor>("Mean");
    const auto *running_mean = Mean;
    const auto *running_var = ctx.Input<Tensor>("Variance");
    const auto *running_var = Variance;
    const auto *running_mean_data = running_mean->template data<T>();
    const auto *running_var_data = running_var->template data<T>();
    mean_data = running_mean_data;
@@ -440,34 +440,35 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
  } else {
    const T *smean_data = Saved_mean->data<T>();
    const T *svariance_data = Saved_variance->data<T>();
    mean_data = smean_data;
    variance_data = svariance_data;
  }
  if (dX) {
    T *dx_data = dX->mutable_data<T>(ctx.GetPlace());
    set_constant(dev_ctx, dX, static_cast<T>(0));
    set_constant(ctx, dX, static_cast<T>(0));
    if (use_global_stats) {
      if (data_layout == DataLayout::kNHWC) {
        DoubleGradComputeDXWithGlobal<
            T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>(
            T, DataLayout::kNHWC><<<grid1, block, 0, ctx.stream()>>>(
            dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
            dx_data);
      } else {
        DoubleGradComputeDXWithGlobal<
            T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>(
            T, DataLayout::kNCHW><<<grid1, block, 0, ctx.stream()>>>(
            dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
            dx_data);
      }
    } else {
      if (data_layout == DataLayout::kNHWC) {
        DoubleGradComputeDX<
            T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
            T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
            x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
            ddscale_data, N, C, sample_size, epsilon, dx_data);
      } else {
        DoubleGradComputeDX<
            T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
            T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
            x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
            ddscale_data, N, C, sample_size, epsilon, dx_data);
      }
@@ -475,28 +476,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
  }
  if (dScale) {
    T *dscale_data = dScale->mutable_data<T>(ctx.GetPlace());
    set_constant(dev_ctx, dScale, static_cast<T>(0));
    set_constant(ctx, dScale, static_cast<T>(0));
    if (use_global_stats) {
      if (data_layout == DataLayout::kNHWC) {
        DoubleGradComputeDScaleWithGlobal<
            T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
            T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
            ddx_data, variance_data, dy_data, epsilon, N, C, sample_size,
            dscale_data);
      } else {
        DoubleGradComputeDScaleWithGlobal<
            T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
            T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
            ddx_data, variance_data, dy_data, epsilon, N, C, sample_size,
            dscale_data);
      }
    } else {
      if (data_layout == DataLayout::kNHWC) {
        DoubleGradComputeDScale<
            T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
            T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
            x_data, mean_data, variance_data, ddx_data, dy_data, N, C,
            sample_size, epsilon, dscale_data);
      } else {
        DoubleGradComputeDScale<
            T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
            T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
            x_data, mean_data, variance_data, ddx_data, dy_data, N, C,
            sample_size, epsilon, dscale_data);
      }
@@ -504,28 +505,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
  }
  if (ddY) {
    T *ddy_data = ddY->mutable_data<T>(ctx.GetPlace());
    set_constant(dev_ctx, ddY, static_cast<T>(0));
    set_constant(ctx, ddY, static_cast<T>(0));
    if (use_global_stats) {
      if (data_layout == DataLayout::kNHWC) {
        DoubleGradComputeDDYWithGlobal<
            T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>(
            T, DataLayout::kNHWC><<<grid1, block, 0, ctx.stream()>>>(
            ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
            ddscale_data, epsilon, C, sample_size, num, ddy_data);
      } else {
        DoubleGradComputeDDYWithGlobal<
            T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>(
            T, DataLayout::kNCHW><<<grid1, block, 0, ctx.stream()>>>(
            ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
            ddscale_data, epsilon, C, sample_size, num, ddy_data);
      }
    } else {
      if (data_layout == DataLayout::kNHWC) {
        DoubleGradComputeDDY<
            T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
            T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
            x_data, mean_data, variance_data, ddscale_data, ddbias_data,
            ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data);
      } else {
        DoubleGradComputeDDY<
            T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
            T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
            x_data, mean_data, variance_data, ddscale_data, ddbias_data,
            ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data);
      }
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void BatchNormGradRawKernel(const Context& dev_ctx,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> reserve_space,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
bool is_inplace,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad);
template <typename T, typename Context>
void BatchNormGradKernel(const Context& dev_ctx,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> reserve_space,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad);
template <typename T, typename Context>
void BatchNormDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x_grad_grad,
const DenseTensor& scale_grad_grad,
const DenseTensor& bias_grad_grad,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* y_grad_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void BatchNormKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& mean,
const DenseTensor& variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out,
DenseTensor* saved_mean,
DenseTensor* saved_variance,
DenseTensor* reserve_space);
} // namespace phi
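// A minimal usage sketch (illustrative only, not part of this header): the
// caller is assumed to already hold a phi::CPUContext `dev_ctx` and prepared
// input DenseTensors `x`, `scale`, `bias`, `mean` and `variance` with the
// correct shapes.
//
//   phi::DenseTensor y, mean_out, variance_out, saved_mean, saved_variance,
//       reserve_space;
//   phi::BatchNormKernel<float, phi::CPUContext>(
//       dev_ctx, x, scale, bias, mean, variance,
//       /*momentum=*/0.9f, /*epsilon=*/1e-5f, /*data_layout=*/"NCHW",
//       /*is_test=*/false, /*use_global_stats=*/false,
//       /*trainable_statistics=*/false, /*fuse_with_relu=*/false,
//       &y, &mean_out, &variance_out, &saved_mean, &saved_variance,
//       &reserve_space);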
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
namespace phi {
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T, typename Context>
void BatchNormGradRawKernel(const Context& ctx,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> reserve_space,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
bool is_inplace,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad) {
const auto* d_y = &y_grad;
DataLayout data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
auto* d_x = x_grad;
auto* d_scale = scale_grad;
auto* d_bias = bias_grad;
use_global_stats = is_test || use_global_stats;
  // batch_norm with inplace as false takes X as its grad input, which is the
  // same as the cuDNN batch_norm backward calculation; with inplace as true it
  // only takes Y as input, and X has to be reconstructed by inverting the
  // batch_norm transform on Y.
if (is_inplace) {
if (d_x) {
PADDLE_ENFORCE_EQ(d_x,
d_y,
phi::errors::InvalidArgument(
"X@GRAD and Y@GRAD inplaced in non-inplace mode"));
}
} else {
if (d_x) {
PADDLE_ENFORCE_NE(d_x,
d_y,
phi::errors::InvalidArgument(
"X@GRAD and Y@GRAD inplaced in non-inplace mode"));
}
}
// Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width]
const auto& x_dims = x.dims();
PADDLE_ENFORCE_GE(
x_dims.size(),
2,
phi::errors::InvalidArgument(
"The size of input X's dimensions should be larger than 1."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
PADDLE_ENFORCE_LE(
x_dims.size(),
5,
phi::errors::InvalidArgument(
"The size of input X's dimensions should be less than 6."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
const int N = x_dims[0];
const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int sample_size = x.numel() / N / C;
// input dimension is 2 and the format is NCHW. The input can be regarded as
// NHWC format
if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) {
data_layout = DataLayout::kNHWC;
}
// init output
if (d_x) {
ctx.template Alloc<T>(d_x);
}
const T* mean_data = saved_mean.data<T>();
const T* inv_var_data = saved_variance.data<T>();
DenseTensor inv_var_tensor;
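  // In global-stats mode the saved batch statistics are not used; the running
  // mean/variance are taken instead and inv_std = 1/sqrt(var + epsilon) is
  // recomputed here.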
if (use_global_stats) {
const auto* running_mean = mean.get_ptr();
const auto* running_variance = variance.get_ptr();
mean_data = running_mean->data<T>();
inv_var_tensor.Resize({C});
T* running_inv_var_data = ctx.template Alloc<T>(&inv_var_tensor);
EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
inv_var_tmp = (var_arr + epsilon).sqrt().inverse();
inv_var_data = running_inv_var_data;
}
ConstEigenVectorArrayMap<T> scale_arr(scale.data<T>(), C);
ConstEigenVectorArrayMap<T> bias_arr(bias.data<T>(), C);
ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
T* d_bias_data = nullptr;
T* d_scale_data = nullptr;
if (d_scale && d_bias) {
d_bias_data = ctx.template Alloc<T>(d_bias);
d_scale_data = ctx.template Alloc<T>(d_scale);
}
// d_bias = np.sum(d_y, axis=0)
// d_scale = np.sum((X - mean) / inv_std * dy, axis=0)
// d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0)
// - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0))
EigenVectorArrayMap<T> d_bias_arr(d_bias_data, C);
EigenVectorArrayMap<T> d_scale_arr(d_scale_data, C);
if (d_scale && d_bias) {
d_bias_arr.setZero();
d_scale_arr.setZero();
}
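  // Mirrors the forward kernel: with only one element per normalization group
  // the forward pass copies x to y unchanged, so the gradient is passed
  // straight through.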
if (d_x && (N * sample_size) == 1 && !use_global_stats) {
paddle::framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
return;
}
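  // scale_coefff is the normalization-group size N*H*W in training mode and 1
  // with global stats, so scale_inv_var_nhw is scale * inv_std / (N*H*W), or
  // just scale * inv_std when no batch averaging is needed.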
int scale_coefff = use_global_stats ? 1 : N * sample_size;
const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff;
DenseTensor dy_sum;
dy_sum.Resize({C});
auto dy_sum_data = ctx.template Alloc<T>(&dy_sum);
EigenVectorArrayMap<T> dy_sum_arr(dy_sum_data, C);
DenseTensor dy_mul_x_sub_mean_mul_invstd_sum;
dy_mul_x_sub_mean_mul_invstd_sum.Resize({C});
auto dy_mul_x_sub_mean_mul_invstd_sum_data =
ctx.template Alloc<T>(&dy_mul_x_sub_mean_mul_invstd_sum);
EigenVectorArrayMap<T> dy_mul_x_sub_mean_mul_invstd_sum_arr(
dy_mul_x_sub_mean_mul_invstd_sum_data, C);
dy_sum_arr.setZero();
dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero();
// inplace calculation
// Y: ((x - est_mean) * (inv_var) * scale + bias
// formula transform ====>
// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
// X: (y - bias) / scale / (inv_var) + est_mean
// formula transform ====>
// (y - bias) / (scale * inv_var) + est_mean
switch (data_layout) {
case DataLayout::kNCHW: {
if (is_inplace) {
auto px = x;
EigenArrayMap<T> x_data(ctx.template Alloc<T>(&px), sample_size, N * C);
ConstEigenArrayMap<T> y_data(x.data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) {
x_data.col(nc) = (y_data.col(nc) - bias_arr(nc % C)) /
scale_inv_var_nhw(nc % C) / scale_coefff +
mean_arr(nc % C);
}
}
ConstEigenArrayMap<T> x_arr(x.data<T>(), sample_size, N * C);
ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
dy_sum_arr(c) += d_y_arr.col(nc).sum();
dy_mul_x_sub_mean_mul_invstd_sum_arr(c) +=
((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc))
.sum();
}
if (d_scale && d_bias) {
d_bias_arr = dy_sum_arr;
d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr;
}
if (d_x) {
EigenArrayMap<T> d_x_arr(
ctx.template Alloc<T>(d_x), sample_size, N * C);
if (!use_global_stats) {
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
d_x_arr.col(nc) =
scale_inv_var_nhw(c) *
(d_y_arr.col(nc) * N * sample_size - dy_sum_arr(c) -
(x_arr.col(nc) - mean_arr[c]) *
dy_mul_x_sub_mean_mul_invstd_sum_arr(c) * inv_var_arr(c));
}
} else {
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
d_x_arr.col(nc) = scale_inv_var_nhw(c) * d_y_arr.col(nc);
}
}
}
break;
}
case DataLayout::kNHWC: {
if (is_inplace) {
auto px = x;
EigenArrayMap<T> x_data(ctx.template Alloc<T>(&px), C, N * sample_size);
ConstEigenArrayMap<T> y_data(x.data<T>(), C, N * sample_size);
for (int nhw = 0; nhw < N * sample_size; nhw++) {
x_data.col(nhw) =
(y_data.col(nhw) - bias_arr) / scale_inv_var_nhw / scale_coefff +
mean_arr;
}
}
ConstEigenArrayMap<T> x_arr(x.data<T>(), C, N * sample_size);
ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), C, N * sample_size);
for (int nhw = 0; nhw < N * sample_size; ++nhw) {
dy_sum_arr += d_y_arr.col(nhw);
dy_mul_x_sub_mean_mul_invstd_sum_arr +=
(x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw);
}
if (d_scale && d_bias) {
d_bias_arr = dy_sum_arr;
d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr;
}
if (d_x) {
EigenArrayMap<T> d_x_arr(
ctx.template Alloc<T>(d_x), C, N * sample_size);
if (!use_global_stats) {
for (int nhw = 0; nhw < N * sample_size; ++nhw) {
d_x_arr.col(nhw) =
scale_inv_var_nhw *
(d_y_arr.col(nhw) * N * sample_size - dy_sum_arr -
(x_arr.col(nhw) - mean_arr) *
dy_mul_x_sub_mean_mul_invstd_sum_arr * inv_var_arr);
}
} else {
for (int nhw = 0; nhw < N * sample_size; ++nhw) {
d_x_arr.col(nhw) = scale_inv_var_nhw * d_y_arr.col(nhw);
}
}
}
break;
}
default:
PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %s",
data_layout_str));
}
}
template <typename T, typename Context>
void BatchNormGradKernel(const Context& dev_ctx,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> reserve_space,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad) {
BatchNormGradRawKernel<T, Context>(dev_ctx,
y_grad,
x,
scale,
bias,
saved_mean,
saved_variance,
reserve_space,
mean,
variance,
momentum,
epsilon,
data_layout,
is_test,
use_global_stats,
trainable_statistics,
fuse_with_relu,
false,
x_grad,
scale_grad,
bias_grad);
}
template <typename T, typename Context>
void BatchNormDoubleGradKernel(const Context& ctx,
const DenseTensor& x_grad_grad,
const DenseTensor& scale_grad_grad,
const DenseTensor& bias_grad_grad,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* y_grad_grad) {
const auto* X = &x;
const auto* Scale = &scale;
const auto* dY = &y_grad;
const auto* Saved_mean = &saved_mean;
const auto* Saved_variance = &saved_variance;
PADDLE_ENFORCE_EQ(is_test,
false,
phi::errors::InvalidArgument(
"`is_test = True` CANNOT be used in train program. If "
"you want to use global status in pre_train model, "
"please set `use_global_stats = True`"));
const auto data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
const auto* ddX = &x_grad_grad;
const auto* ddScale = &scale_grad_grad;
const auto* ddBias = &bias_grad_grad;
auto* dX = x_grad;
auto* dScale = scale_grad;
auto* ddY = y_grad_grad;
ctx.template Alloc<T>(dX);
ctx.template Alloc<T>(ddY);
const auto& x_dims = X->dims();
const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int sample_size = X->numel() / C;
phi::funcs::SetConstant<Context, T> set_constant;
const T* mean_data = Saved_mean->data<T>();
const T* inv_var_data = Saved_variance->data<T>();
DenseTensor inv_var_tensor;
if (use_global_stats) {
const auto* running_mean = mean.get_ptr();
const auto* running_variance = variance.get_ptr();
mean_data = running_mean->data<T>();
inv_var_tensor.Resize({C});
T* running_inv_var_data = ctx.template Alloc<T>(&inv_var_tensor);
EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
inv_var_tmp = (var_arr + epsilon).sqrt().inverse();
inv_var_data = running_inv_var_data;
}
// transpose NCHW -> NHWC for easy calculate
DenseTensor transformed_x(X->type());
DenseTensor transformed_dy(dY->type());
DenseTensor transformed_ddx(ddX->type());
DenseTensor transformed_dx(dX->type());
DenseTensor transformed_ddy(ddY->type());
if (data_layout == DataLayout::kNCHW && x_dims.size() > 2) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
// Input Tensor
ResizeToChannelLast<Context, T>(ctx, X, &transformed_x);
TransToChannelLast<Context, T>(ctx, X, &transformed_x);
ResizeToChannelLast<Context, T>(ctx, dY, &transformed_dy);
TransToChannelLast<Context, T>(ctx, dY, &transformed_dy);
ResizeToChannelLast<Context, T>(ctx, ddX, &transformed_ddx);
TransToChannelLast<Context, T>(ctx, ddX, &transformed_ddx);
// Output Tensor
ResizeToChannelLast<Context, T>(ctx, dX, &transformed_dx);
ResizeToChannelLast<Context, T>(ctx, ddY, &transformed_ddy);
} else {
transformed_x.ShareDataWith(*X);
transformed_dy.ShareDataWith(*dY);
transformed_ddx.ShareDataWith(*ddX);
transformed_dx.ShareDataWith(*dX);
transformed_ddy.ShareDataWith(*ddY);
}
ConstEigenArrayMap<T> x_arr(transformed_x.data<T>(), C, sample_size);
ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
Tensor mean_tile;
mean_tile.Resize({C, sample_size});
EigenArrayMap<T> mean_tile_data(
ctx.template Alloc<T>(&mean_tile), C, sample_size);
DenseTensor inv_var_tile;
inv_var_tile.Resize({C, sample_size});
EigenArrayMap<T> inv_var_tile_data(
ctx.template Alloc<T>(&inv_var_tile), C, sample_size);
mean_tile_data = mean_arr.replicate(1, sample_size);
inv_var_tile_data = inv_var_arr.replicate(1, sample_size);
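  // Tile the per-channel mean and inv_std to shape [C, sample_size] so the
  // remaining computation can be written as elementwise Eigen array ops.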
DenseTensor Scale_data;
if (!Scale) {
Scale_data.Resize({C});
ctx.template Alloc<T>(&Scale_data);
set_constant(ctx, &Scale_data, static_cast<T>(1));
}
ConstEigenVectorArrayMap<T> scale_arr(
Scale ? Scale->data<T>() : Scale_data.data<T>(), C);
Tensor scale_tile;
scale_tile.Resize({C, sample_size});
EigenArrayMap<T> scale_tile_data(
ctx.template Alloc<T>(&scale_tile), C, sample_size);
scale_tile_data = scale_arr.replicate(1, sample_size);
ConstEigenArrayMap<T> dy_arr(transformed_dy.data<T>(), C, sample_size);
ConstEigenArrayMap<T> ddx_arr(transformed_ddx.data<T>(), C, sample_size);
DenseTensor x_sub_mean_mul_invstd;
x_sub_mean_mul_invstd.Resize({C, sample_size});
EigenArrayMap<T> x_sub_mean_mul_invstd_arr(
ctx.template Alloc<T>(&x_sub_mean_mul_invstd), C, sample_size);
x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data;
if (dX) {
ctx.template Alloc<T>(dX);
EigenArrayMap<T> dx_arr(
ctx.template Alloc<T>(&transformed_dx), C, sample_size);
dx_arr.setZero();
if (use_global_stats) {
// math: dx = (ddscale * dy) * inv_var
if (ddScale) {
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
Tensor ddscale_tile;
ddscale_tile.Resize({C, sample_size});
EigenArrayMap<T> ddscale_tile_data(
ctx.template Alloc<T>(&ddscale_tile), C, sample_size);
ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
dx_arr = dy_arr * ddscale_tile_data * inv_var_tile_data;
}
} else {
// math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx,
// axis=(n,h,w)) *
// np.sum(dy, axis=(n,h,w)) -
// np.sum(dy * ddx, axis=(n,h,w)) + 3 * np.mean(dy * (x -
// mean),
// axis=(n,h,w)) * inv_var.pow(2) *
// np.sum(ddx * (x - mean), axis=(n,h,w))) + inv_var.pow(3) /
// NxHxW *
// np.sum(ddx * (x - mean)) *
// (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
// np.sum(dy,
// axis=(n,h,w)) * (x - mean) *
// (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var -
// inv_var
// *
// np.mean(dy, axis=(n,h,w)) -
// inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
// axis=(n,h,w)))
if (ddX) {
dx_arr +=
(x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data /
sample_size)
.colwise() *
(ddx_arr.rowwise().sum() * dy_arr.rowwise().sum() / sample_size -
(dy_arr * ddx_arr).rowwise().sum() +
3. * (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() *
(ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
sample_size);
dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() *
(ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
sample_size * (dy_arr.rowwise().sum() / sample_size - dy_arr);
dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() *
(dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
sample_size *
(ddx_arr.rowwise().sum() / sample_size - ddx_arr);
dx_arr = scale_tile_data * dx_arr;
}
if (ddScale) {
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
Tensor ddscale_tile;
ddscale_tile.Resize({C, sample_size});
EigenArrayMap<T> ddscale_tile_data(
ctx.template Alloc<T>(&ddscale_tile), C, sample_size);
ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
dx_arr +=
(dy_arr * inv_var_tile_data -
(dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size) *
inv_var_tile_data -
x_sub_mean_mul_invstd_arr * inv_var_tile_data *
(dy_arr * x_sub_mean_mul_invstd_arr)
.rowwise()
.sum()
.replicate(1, sample_size) /
sample_size) *
ddscale_tile_data;
}
}
if (data_layout == DataLayout::kNCHW) {
VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
TransToChannelFirst<Context, T>(ctx, &transformed_dx, dX);
}
}
if (dScale) {
EigenVectorArrayMap<T> dscale_arr(ctx.template Alloc<T>(dScale), C);
dscale_arr.setZero();
if (use_global_stats) {
// math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var
if (ddX) {
dscale_arr = (ddx_arr * dy_arr * inv_var_tile_data).rowwise().sum();
}
} else {
// math: dscale = inv_var * (dy - np.mean(dy, axis=(n,h,w) - (x-mean) *
// inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) *
// ddx
if (ddX) {
Tensor first_grad;
first_grad.Resize({C, sample_size});
EigenArrayMap<T> first_grad_arr(
ctx.template Alloc<T>(&first_grad), C, sample_size);
first_grad_arr.setZero();
first_grad_arr +=
inv_var_tile_data *
(dy_arr -
dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size -
x_sub_mean_mul_invstd_arr *
(dy_arr * x_sub_mean_mul_invstd_arr)
.rowwise()
.sum()
.replicate(1, sample_size) /
sample_size);
dscale_arr = (first_grad_arr * ddx_arr).rowwise().sum();
}
}
}
if (ddY) {
ctx.template Alloc<T>(ddY);
EigenArrayMap<T> ddy_arr(
ctx.template Alloc<T>(&transformed_ddy), C, sample_size);
ddy_arr.setZero();
if (use_global_stats) {
// math: ddy = r * ddx * inv_var + ddbias +
// ddscale * (x - mean) * inv_var
if (ddX) {
ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data;
}
} else {
// math: ddy = (x - mean) * inv_var * ddscale + ddbias +
// scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) *
// np.mean(ddx * (x - mean), axis=(n,h,w)))
if (ddX) {
ddy_arr +=
scale_tile_data * inv_var_tile_data *
(ddx_arr -
ddx_arr.rowwise().sum().replicate(1, sample_size) / sample_size -
x_sub_mean_mul_invstd_arr *
(ddx_arr * x_sub_mean_mul_invstd_arr)
.rowwise()
.sum()
.replicate(1, sample_size) /
sample_size);
}
}
if (ddScale) {
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
Tensor ddscale_tile;
ddscale_tile.Resize({C, sample_size});
EigenArrayMap<T> ddscale_tile_data(
ctx.template Alloc<T>(&ddscale_tile), C, sample_size);
ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data;
}
if (ddBias) {
ConstEigenVectorArrayMap<T> ddbias_arr(ddBias->data<T>(), C);
Tensor ddbias_tile;
ddbias_tile.Resize({C, sample_size});
EigenArrayMap<T> ddbias_tile_data(
ctx.template Alloc<T>(&ddbias_tile), C, sample_size);
ddbias_tile_data = ddbias_arr.replicate(1, sample_size);
ddy_arr += ddbias_tile_data;
}
if (data_layout == DataLayout::kNCHW) {
VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
TransToChannelFirst<Context, T>(ctx, &transformed_ddy, ddY);
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(
batch_norm_grad, CPU, ALL_LAYOUT, phi::BatchNormGradKernel, float, double) {
}
PD_REGISTER_KERNEL(batch_norm_grad_raw,
CPU,
ALL_LAYOUT,
phi::BatchNormGradRawKernel,
float,
double) {}
PD_REGISTER_KERNEL(batch_norm_grad_grad,
CPU,
ALL_LAYOUT,
phi::BatchNormDoubleGradKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace phi {
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T, typename Context>
void BatchNormKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& mean,
const DenseTensor& variance,
float momentum,
float epsilon,
const std::string& data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out,
DenseTensor* saved_mean,
DenseTensor* saved_variance,
DenseTensor* reserve_space) {
bool test_mode = is_test && (!trainable_statistics);
bool global_stats = test_mode || use_global_stats;
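  // global_stats: normalize with the running mean/variance instead of the
  // statistics of the current mini-batch (inference, or use_global_stats).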
auto data_layout = paddle::framework::StringToDataLayout(data_layout_str);
const auto& x_dims = x.dims();
PADDLE_ENFORCE_GE(
x_dims.size(),
2,
phi::errors::InvalidArgument(
"The size of input X's dimensions should be larger than 1."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
PADDLE_ENFORCE_LE(
x_dims.size(),
5,
phi::errors::InvalidArgument(
"The size of input X's dimensions should be less than 6."
"But received: the size of input X's dimensionss is [%d]",
x_dims.size()));
const int N = x_dims[0];
const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int sample_size = x.numel() / N / C;
// alloc memory
ctx.template Alloc<T>(y);
ctx.template Alloc<T>(mean_out);
ctx.template Alloc<T>(variance_out);
ctx.template Alloc<T>(saved_mean);
ctx.template Alloc<T>(saved_variance);
// input dimension is 2 and the format is NCHW. The input can be regarded
// as NHWC format
if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) {
data_layout = DataLayout::kNHWC;
}
if (!global_stats) {
    // saved_xx is used just for this batch of data
EigenVectorArrayMap<T> saved_mean_e(ctx.template Alloc<T>(saved_mean), C);
EigenVectorArrayMap<T> saved_variance_e(
ctx.template Alloc<T>(saved_variance), C);
saved_mean_e.setZero();
saved_variance_e.setZero();
EigenVectorArrayMap<T> running_mean_arr(ctx.template Alloc<T>(mean_out), C);
EigenVectorArrayMap<T> running_var_arr(ctx.template Alloc<T>(variance_out),
C);
if ((N * sample_size) == 1) {
// Only 1 element in normalization dimension,
// we skip the batch norm calculation, let y = x.
paddle::framework::TensorCopy(x, ctx.GetPlace(), y);
return;
}
switch (data_layout) {
case DataLayout::kNCHW: {
ConstEigenArrayMap<T> x_arr(x.data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) {
saved_mean_e(nc % C) += x_arr.col(nc).sum();
}
saved_mean_e /= N * sample_size;
for (int nc = 0; nc < N * C; ++nc) {
saved_variance_e(nc % C) +=
(x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm();
}
saved_variance_e /= N * sample_size;
break;
}
case DataLayout::kNHWC: {
ConstEigenArrayMap<T> x_arr(x.data<T>(), C, N * sample_size);
for (int i = 0; i < N * sample_size; ++i) {
saved_mean_e += x_arr.col(i);
}
saved_mean_e /= N * sample_size;
for (int i = 0; i < N * sample_size; ++i) {
saved_variance_e +=
(x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e);
}
saved_variance_e /= N * sample_size;
break;
}
default:
PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %s",
data_layout_str));
}
// if MomentumTensor is set, use MomentumTensor value, momentum
// is only used in this training branch
running_mean_arr =
running_mean_arr * momentum + saved_mean_e * (1. - momentum);
running_var_arr =
running_var_arr * momentum + saved_variance_e * (1. - momentum);
}
// use SavedMean and SavedVariance to do normalize
Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
if (global_stats) {
ConstEigenVectorArrayMap<T> var_arr(variance.data<T>(), C);
inv_std = (var_arr + epsilon).sqrt().inverse();
} else {
EigenVectorArrayMap<T> saved_inv_std(saved_variance->data<T>(), C);
// inverse SavedVariance first, gradient will use it too.
saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt();
inv_std = saved_inv_std;
}
ConstEigenVectorArrayMap<T> mean_arr(
global_stats ? mean.data<T>() : saved_mean->data<T>(), C);
// ((x - est_mean) * (inv_var) * scale + bias
// formula transform ====>
// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
ConstEigenVectorArrayMap<T> scale_arr(scale.data<T>(), C);
ConstEigenVectorArrayMap<T> bias_arr(bias.data<T>(), C);
Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr;
Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
bias_arr - mean_arr * inv_std * scale_arr;
switch (data_layout) {
case DataLayout::kNCHW: {
EigenArrayMap<T> y_arr(ctx.template Alloc<T>(y), sample_size, N * C);
ConstEigenArrayMap<T> x_arr(x.data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) {
y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
}
break;
}
case DataLayout::kNHWC: {
EigenArrayMap<T>(ctx.template Alloc<T>(y), C, N * sample_size) =
(ConstEigenArrayMap<T>(x.data<T>(), C, N * sample_size).colwise() *
new_scale)
.colwise() +
new_bias;
break;
}
default:
PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %d",
data_layout));
}
}
} // namespace phi
PD_REGISTER_KERNEL(
batch_norm, CPU, ALL_LAYOUT, phi::BatchNormKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/fluid/operators/norm_utils.cu.h"
#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/layout_utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
#ifdef __HIPCC__
#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim)
#else
#define LAUNCH_BOUNDS(BlockDim)
#endif
DECLARE_bool(cudnn_batchnorm_spatial_persistent);
namespace phi {
template <typename T>
using CudnnDataType = paddle::platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
template <typename T, int BlockDim, phi::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias(
const T *dy,
const T *x,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
const double epsilon,
const int N,
const int C,
const int HxW,
BatchNormParamType<T> *dscale,
BatchNormParamType<T> *dbias) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> inv_var_i = 1.0 / sqrt(variance[i] + epsilon);
BatchNormParamType<T> mean_i = mean[i];
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
ds_sum += static_cast<BatchNormParamType<T>>(dy[index]) *
(static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
db_sum += static_cast<BatchNormParamType<T>>(dy[index]);
}
ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
if (threadIdx.x == 0) {
dscale[i] = ds_sum * inv_var_i;
dbias[i] = db_sum;
}
__syncthreads();
}
}
template <typename T, phi::DataLayout layout>
static __global__ void KeBNBackwardData(const T *dy,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *variance,
const double epsilon,
const int C,
const int HxW,
const int num,
T *dx) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == phi::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
scale[c] * inv_var);
}
}
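// Rebuilds the input x from the output y by inverting the batch_norm affine
// transform; used by InplaceHelper below when the op runs with X and Y
// sharing the same buffer.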
template <typename T>
static __global__ void KeBNRestoreData(const phi::DataLayout layout,
T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
double epsilon,
int C,
int M,
const int num,
const T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == phi::DataLayout::kNCHW ? (i / M) % C : i % C;
auto y_i = static_cast<BatchNormParamType<T>>(y[i]);
auto x_i = (y_i - bias[c]) / scale[c] / variance[c] + mean[c];
x[i] = static_cast<T>(x_i);
}
}
template <typename T>
class InplaceHelper {
public:
void operator()(const phi::DataLayout layout,
T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
double epsilon,
int C,
int M,
const int num,
const T *y,
int grid2,
const int block,
const gpuStream_t &stream) {
PADDLE_ENFORCE_EQ(x,
y,
phi::errors::InvalidArgument(
"X and Y should be inplaced in inplace mode"));
KeBNRestoreData<<<grid2, block, 0, stream>>>(
layout, x, scale, bias, mean, variance, epsilon, C, M, num, y);
}
};
template <typename T, int BlockDim, phi::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward(
const T *dy,
const T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *saved_mean,
const BatchNormParamType<T> *saved_inv_variance,
const int C,
const int N,
const int HxW,
const double epsilon,
T *dx,
BatchNormParamType<T> *dscale,
BatchNormParamType<T> *dbias) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage;
__shared__ typename BlockReduce::TempStorage mean_storage;
__shared__ typename BlockReduce::TempStorage variance_storeage;
__shared__ BatchNormParamType<T> inv_var_val;
__shared__ BatchNormParamType<T> mean_val;
__shared__ BatchNormParamType<T> dscale_val;
__shared__ BatchNormParamType<T> dbias_val;
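  // Each block handles one channel: if the saved statistics are not available
  // the block first recomputes mean and inv_std with a block-wide reduction,
  // then reduces dscale/dbias, and finally writes dx for its channel.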
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
if (saved_mean && saved_inv_variance) {
if (threadIdx.x == 0) {
inv_var_val = saved_inv_variance[i];
mean_val = saved_mean[i];
}
} else {
BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> x_square_sum =
static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_i =
static_cast<BatchNormParamType<T>>(x[index]);
x_sum += x_i;
x_square_sum += x_i * x_i;
}
x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
x_square_sum =
BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum());
if (threadIdx.x == 0) {
mean_val = x_sum / inner_size;
inv_var_val =
1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon);
}
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> dy_i =
static_cast<BatchNormParamType<T>>(dy[index]);
ds_sum +=
dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_val);
db_sum += dy_i;
}
ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
if (threadIdx.x == 0) {
dscale_val = ds_sum * inv_var_val;
dbias_val = db_sum;
dscale[i] = dscale_val;
dbias[i] = dbias_val;
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
dx[index] = scale[i] * inv_var_val *
(static_cast<BatchNormParamType<T>>(dy[index]) -
dbias_val / static_cast<BatchNormParamType<T>>(inner_size) -
(static_cast<BatchNormParamType<T>>(x[index]) - mean_val) *
inv_var_val * dscale_val / inner_size);
}
}
}
template <typename T, int BlockDim, phi::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData(
const T *dy,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *mean,
const T *x,
const BatchNormParamType<T> *variance,
const int C,
const int N,
const int HxW,
T *dx) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage dy_storage;
__shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage;
__shared__ BatchNormParamType<T> dy_sum_val;
__shared__ BatchNormParamType<T> dy_x_sub_mean_sum_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> inv_var_i = variance[i];
BatchNormParamType<T> mean_i = mean[i];
BatchNormParamType<T> dy_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> dy_x_sub_mean_sum =
static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> dy_i =
static_cast<BatchNormParamType<T>>(dy[index]);
dy_sum += dy_i;
dy_x_sub_mean_sum +=
dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
}
dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage)
.Reduce(dy_x_sub_mean_sum, cub::Sum());
if (threadIdx.x == 0) {
dy_sum_val = dy_sum;
dy_x_sub_mean_sum_val = dy_x_sub_mean_sum;
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
dx[index] =
(static_cast<BatchNormParamType<T>>(dy[index]) -
dy_sum_val / static_cast<BatchNormParamType<T>>(inner_size) -
(static_cast<BatchNormParamType<T>>(x[index]) - mean_i) *
dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) *
scale[i] * inv_var_i;
}
}
}
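// BatchNormGradRawKernel chooses among several paths: with per-batch
// statistics and all of dx/dscale/dbias requested it uses the cuDNN backward
// API (or the BNBackward kernel on HIP); with only some gradients requested it
// uses the BNBackwardData / KeBNBackwardScaleBias kernels; and with
// use_global_stats (or is_test) set it computes gradients from the running
// mean/variance via KeBNBackwardData / KeBNBackwardScaleBias.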
template <typename T, typename Context>
void BatchNormGradRawKernel(const Context &ctx,
const DenseTensor &y_grad,
const DenseTensor &x,
const DenseTensor &scale,
const DenseTensor &bias,
const DenseTensor &saved_mean,
const DenseTensor &saved_variance,
paddle::optional<const DenseTensor &> reserve_space,
paddle::optional<const DenseTensor &> mean,
paddle::optional<const DenseTensor &> variance,
float momentum,
float epsilon_f,
const std::string &data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
bool is_inplace,
DenseTensor *x_grad,
DenseTensor *scale_grad,
DenseTensor *bias_grad) {
double epsilon = static_cast<double>(epsilon_f);
const DataLayout data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
const auto *d_y = &y_grad;
auto *d_x = x_grad;
auto *d_scale = scale_grad;
auto *d_bias = bias_grad;
use_global_stats = is_test || use_global_stats;
const auto &x_dims = x.dims();
PADDLE_ENFORCE_EQ(
x_dims.size() >= 2 && x_dims.size() <= 5,
true,
phi::errors::InvalidArgument(
"The size of input's dimensions should be between 2 and 5."
"But received: the size of input's dimensions is [%d],"
"the dimensions of input is [%s]",
x_dims.size(),
x_dims));
int N, C, H, W, D;
paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
// init output
if (d_x) {
ctx.template Alloc<T>(d_x);
}
if (d_scale && d_bias) {
d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
}
PADDLE_ENFORCE_EQ(
scale.dims().size(),
1UL,
phi::errors::InvalidArgument(
"The size of scale's dimensions must equal to 1. But received: "
"the size of scale's dimensions is [%d], the dimensions of scale "
"is [%s].",
scale.dims().size(),
scale.dims()));
PADDLE_ENFORCE_EQ(
scale.dims()[0],
C,
phi::errors::InvalidArgument(
"The first dimension of scale must equal to Channels[%d]. But "
"received: the first dimension of scale is [%d]",
C,
scale.dims()[0]));
auto dtype = paddle::platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
auto compute_format =
data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW;
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// HIP do not support compute format of NHWC
// auto compute_format = DataLayout::kNCHW;
#else
const bool fast_nhwc_batch_norm = dtype == CUDNN_DATA_HALF &&
FLAGS_cudnn_batchnorm_spatial_persistent &&
(reserve_space.get_ptr() != nullptr);
auto compute_format = fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
? DataLayout::kNHWC
: DataLayout::kNCHW;
#endif
DenseTensor transformed_x(x.type());
DenseTensor transformed_d_y(d_y->type());
DenseTensor transformed_d_x;
if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW &&
x_dims.size() > 2) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<Context, T>(ctx, &x, &transformed_x);
TransToChannelFirst<Context, T>(ctx, &x, &transformed_x);
ResizeToChannelFirst<Context, T>(ctx, d_y, &transformed_d_y);
TransToChannelFirst<Context, T>(ctx, d_y, &transformed_d_y);
if (d_x) {
ResizeToChannelFirst<Context, T>(ctx, d_x, &transformed_d_x);
}
} else {
transformed_x.ShareDataWith(x);
transformed_d_y.ShareDataWith(*d_y);
if (d_x) {
transformed_d_x.ShareDataWith(*d_x);
}
}
std::vector<int> dims;
std::vector<int> strides;
if (compute_format == DataLayout::kNCHW) {
dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else {
dims = {N, C, H, W, D};
strides = {H * W * C * D, 1, W * D * C, D * C, C};
}
const int num = transformed_x.numel();
#ifdef HIPCC
const int block = 256;
#else
const int block = 512;
#endif
int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
int grid1 = (num + block - 1) / block;
int grid2 = std::min(C, max_blocks);
auto stream = ctx.stream();
InplaceHelper<T> inplace_functor;
if (!use_global_stats) {
if ((N * H * W * D) == 1) {
if (d_x) {
paddle::framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
}
phi::funcs::SetConstant<Context, BatchNormParamType<T>> functor;
functor(ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
functor(ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
return;
}
// ------------------- cudnn descriptors ---------------------
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// miopenTensorDescriptor_t data_desc_;
// miopenTensorDescriptor_t bn_param_desc_;
// miopenBatchNormMode_t mode_;
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(
&bn_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// mode_ = miopenBNSpatial;
#elif CUDNN_VERSION_MIN(7, 0, 1)
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else
if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#endif // CUDNN_VERSION_MIN(7, 0, 1)
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
// data_desc_, CudnnDataType<T>::type,
// x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
// const_cast<int *>(strides.data())));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_,
// data_desc_, mode_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
dims.data(),
strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
bn_param_desc_, data_desc_, mode_));
#endif
const auto *saved_mean_data =
saved_mean.template data<BatchNormParamType<T>>();
const auto *saved_var_data =
saved_variance.template data<BatchNormParamType<T>>();
if (is_inplace) {
inplace_functor(compute_format,
transformed_x.data<T>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
saved_mean_data,
saved_var_data,
epsilon,
C,
H * W * D,
num,
transformed_x.data<T>(),
grid2,
block,
stream);
}
// This branch calls CUDNN APIs
if (d_x && d_scale && d_bias) {
bool called = false;
#if CUDNN_VERSION_MIN(7, 4, 1)
called = true;
size_t workspace_size = 0;
void *workspace_ptr = nullptr;
DenseTensor workspace_tensor;
auto reserve_space_size = reserve_space->memory_size();
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::
cudnnGetBatchNormalizationBackwardExWorkspaceSize(
/*handle=*/ctx.cudnn_handle(),
/*mode=*/mode_,
/*bnIps=*/CUDNN_BATCHNORM_OPS_BN,
/*xDesc=*/data_desc_,
/*yDesc=*/data_desc_,
/*dyDesc=*/data_desc_,
/*dzDesc=*/nullptr,
/*dxDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/nullptr,
/*sizeInBytes=*/&workspace_size));
workspace_ptr = workspace_tensor.mutable_data(
ctx.GetPlace(), transformed_x.type(), workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationBackwardEx(
/*handle=*/ctx.cudnn_handle(),
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*alphaDataDiff=*/CudnnDataType<T>::kOne(),
/*betaDataDiff=*/CudnnDataType<T>::kZero(),
/*alphaParamDiff=*/CudnnDataType<T>::kOne(),
/*betaParamDiff=*/CudnnDataType<T>::kZero(),
/*xDesc=*/data_desc_,
/*xData=*/transformed_x.template data<T>(),
/*yDesc=*/nullptr,
/*yData=*/nullptr,
/*dyDesc=*/data_desc_,
/*dyData=*/transformed_d_y.template data<T>(),
/*dzDesc=*/nullptr,
/*dzData=*/nullptr,
/*dxDesc=*/data_desc_,
/*dxData=*/ctx.template Alloc<T>(&transformed_d_x),
/*dBnScaleBiasDesc=*/bn_param_desc_,
/*bnScaleData=*/scale.template data<BatchNormParamType<T>>(),
/*bnBiasData=*/nullptr,
/*dBnScaleData=*/d_scale
->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
/*dBnBiasData=*/d_bias
->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
/*epsilon=*/epsilon,
/*savedMean=*/saved_mean_data,
/*savedInvVariance=*/saved_var_data,
/*activationDesc=*/nullptr,
/*workspace=*/workspace_ptr,
/*workSpaceSizeInBytes=*/workspace_size,
/*reserveSpace=*/const_cast<T *>(
reserve_space->template data<T>()),
/*reserveSpaceSizeInBytes=*/reserve_space_size));
#endif // CUDNN_VERSION_MIN(7, 4, 1)
if (!called) {
#ifdef PADDLE_WITH_HIP
if (compute_format == DataLayout::kNCHW) {
BNBackward<T,
block,
DataLayout::kNCHW><<<grid2, block, 0, ctx.stream()>>>(
transformed_d_y.template data<T>(),
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
saved_mean_data,
saved_var_data,
C,
N,
H * W * D,
epsilon,
transformed_d_x.template data<T>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()));
} else {
BNBackward<T,
block,
DataLayout::kNHWC><<<grid2, block, 0, ctx.stream()>>>(
transformed_d_y.template data<T>(),
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
saved_mean_data,
saved_var_data,
C,
N,
H * W * D,
epsilon,
transformed_d_x.template data<T>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()));
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationBackward(
// dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
// CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
// CudnnDataType<T>::kZero(), data_desc_,
// transformed_x.template data<T>(), data_desc_,
// transformed_d_y.template data<T>(), data_desc_,
// transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
// bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
// d_scale->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace()),
// d_bias->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace()),
// epsilon, saved_mean_data, saved_var_data));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationBackward(
ctx.cudnn_handle(),
mode_,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
transformed_x.template data<T>(),
data_desc_,
transformed_d_y.template data<T>(),
data_desc_,
ctx.template Alloc<T>(&transformed_d_x),
bn_param_desc_,
scale.template data<BatchNormParamType<T>>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean_data,
saved_var_data));
#endif
}
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
TransToChannelLast<Context, T>(ctx, &transformed_d_x, d_x);
}
} else {
      // This branch calls CUDA kernels
if (compute_format == DataLayout::kNCHW) {
if (d_x) {
BNBackwardData<
T,
block,
phi::DataLayout::kNCHW><<<grid2, block, 0, ctx.stream()>>>(
d_y->data<T>(),
scale.data<BatchNormParamType<T>>(),
saved_mean_data,
x.data<T>(),
saved_var_data,
C,
N,
H * W * D,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T,
block,
phi::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
d_y->data<T>(),
x.data<T>(),
saved_mean_data,
saved_var_data,
epsilon,
N,
C,
H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
} else {
if (d_x) {
BNBackwardData<
T,
block,
phi::DataLayout::kNHWC><<<grid2, block, 0, ctx.stream()>>>(
d_y->data<T>(),
scale.data<BatchNormParamType<T>>(),
saved_mean_data,
x.data<T>(),
saved_var_data,
C,
N,
H * W * D,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T,
block,
phi::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
d_y->data<T>(),
x.data<T>(),
saved_mean_data,
saved_var_data,
epsilon,
N,
C,
H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
}
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// clean when exit.
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
#else
// clean when exit.
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(
bn_param_desc_));
#endif
} else {
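    // use_global_stats path: gradients are computed against the running
    // mean/variance instead of the per-batch saved statistics.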
const auto *running_mean = mean.get_ptr();
const auto *running_var = variance.get_ptr();
const auto *running_mean_data =
running_mean->template data<BatchNormParamType<T>>();
const auto *running_var_data =
running_var->template data<BatchNormParamType<T>>();
if (is_inplace) {
auto px = x;
inplace_functor(data_layout,
ctx.template Alloc<T>(&px),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
running_mean_data,
running_var_data,
epsilon,
C,
H * W * D,
num,
x.data<T>(),
grid2,
block,
stream);
}
if (compute_format == DataLayout::kNCHW) {
if (d_x) {
KeBNBackwardData<T,
phi::DataLayout::kNCHW><<<grid1, block, 0, stream>>>(
d_y->data<T>(),
scale.data<BatchNormParamType<T>>(),
running_var_data,
epsilon,
C,
H * W,
num,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T,
block,
phi::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
d_y->data<T>(),
x.data<T>(),
running_mean_data,
running_var_data,
epsilon,
N,
C,
H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
} else {
if (d_x) {
KeBNBackwardData<T,
phi::DataLayout::kNHWC><<<grid1, block, 0, stream>>>(
d_y->data<T>(),
scale.data<BatchNormParamType<T>>(),
running_var_data,
epsilon,
C,
H * W,
num,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T,
block,
phi::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
d_y->data<T>(),
x.data<T>(),
running_mean_data,
running_var_data,
epsilon,
N,
C,
H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
}
}
}
template <typename T, typename Context>
void BatchNormGradKernel(const Context &dev_ctx,
const DenseTensor &y_grad,
const DenseTensor &x,
const DenseTensor &scale,
const DenseTensor &bias,
const DenseTensor &saved_mean,
const DenseTensor &saved_variance,
paddle::optional<const DenseTensor &> reserve_space,
paddle::optional<const DenseTensor &> mean,
paddle::optional<const DenseTensor &> variance,
float momentum,
float epsilon,
const std::string &data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor *x_grad,
DenseTensor *scale_grad,
DenseTensor *bias_grad) {
BatchNormGradRawKernel<T, Context>(dev_ctx,
y_grad,
x,
scale,
bias,
saved_mean,
saved_variance,
reserve_space,
mean,
variance,
momentum,
epsilon,
data_layout,
is_test,
use_global_stats,
trainable_statistics,
fuse_with_relu,
false,
x_grad,
scale_grad,
bias_grad);
}
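// The double-grad kernel forwards to the shared NormDoubleGradFunctor; the
// running mean/variance are passed through only when use_global_stats is set.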
template <typename T, typename Context>
void BatchNormDoubleGradKernel(const Context &ctx,
const DenseTensor &x_grad_grad,
const DenseTensor &scale_grad_grad,
const DenseTensor &bias_grad_grad,
const DenseTensor &y_grad,
const DenseTensor &x,
const DenseTensor &scale,
const DenseTensor &saved_mean,
const DenseTensor &saved_variance,
paddle::optional<const DenseTensor &> mean,
paddle::optional<const DenseTensor &> variance,
float momentum,
float epsilon,
const std::string &data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor *x_grad,
DenseTensor *scale_grad,
DenseTensor *y_grad_grad) {
PADDLE_ENFORCE_EQ(is_test,
false,
phi::errors::InvalidArgument(
"`is_test = True` CANNOT be used in train program. If "
"you want to use global status in pre_train model, "
"please set `use_global_stats = True`"));
const DataLayout data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
const DenseTensor *running_mean = nullptr;
const DenseTensor *running_variance = nullptr;
if (use_global_stats) {
running_mean = mean.get_ptr();
running_variance = variance.get_ptr();
}
paddle::operators::NormDoubleGradFunctor<Context, T>(ctx,
data_layout,
&x,
&scale,
&y_grad,
&saved_mean,
&saved_variance,
running_mean,
running_variance,
epsilon,
use_global_stats,
&x_grad_grad,
&scale_grad_grad,
&bias_grad_grad,
x_grad,
scale_grad,
y_grad_grad);
}
} // namespace phi
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(batch_norm_grad,
GPU,
ALL_LAYOUT,
phi::BatchNormGradKernel,
float,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(batch_norm_grad_raw,
GPU,
ALL_LAYOUT,
phi::BatchNormGradRawKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(batch_norm_grad,
GPU,
ALL_LAYOUT,
phi::BatchNormGradKernel,
float,
double,
phi::dtype::float16) {
if (kernel_key.dtype() == phi::DataType::FLOAT16) {
kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32);
}
}
PD_REGISTER_KERNEL(batch_norm_grad_raw,
GPU,
ALL_LAYOUT,
phi::BatchNormGradRawKernel,
float,
double,
phi::dtype::float16) {
if (kernel_key.dtype() == phi::DataType::FLOAT16) {
kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32);
}
}
#endif
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(batch_norm_grad_grad,
GPU,
ALL_LAYOUT,
phi::BatchNormDoubleGradKernel,
float,
double) {}
#else
PD_REGISTER_KERNEL(batch_norm_grad_grad,
GPU,
ALL_LAYOUT,
phi::BatchNormDoubleGradKernel,
float,
double) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/fluid/operators/norm_utils.cu.h"
#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/layout_utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
#ifdef __HIPCC__
#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim)
#else
#define LAUNCH_BOUNDS(BlockDim)
#endif
DECLARE_bool(cudnn_batchnorm_spatial_persistent);
namespace phi {
template <typename T>
using CudnnDataType = paddle::platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
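// Inference kernel: one thread per element, normalizing with the provided
// (running) statistics: y = scale * (x - mean) / sqrt(variance + epsilon) + bias.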
template <typename T, phi::DataLayout layout>
static __global__ void BNForwardInference(const T *x,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const int C,
const int N,
const int HxW,
const double epsilon,
T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
int num = N * C * HxW;
for (int i = gid; i < num; i += stride) {
const int c = layout == phi::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[i]) - mean[c];
BatchNormParamType<T> inv_var = 1 / sqrt(variance[c] + epsilon);
y[i] = static_cast<T>(scale[c] * x_sub_mean * inv_var + bias[c]);
}
}
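// Training fallback kernel (used when the cuDNN path is not taken): each block
// handles one channel, block-reduces sum(x) and sum(x^2) to obtain the batch
// mean/variance, updates the running statistics with the exponential-average
// factor, stores the saved mean / saved inverse variance, and then normalizes
// the channel.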
template <typename T, int BlockDim, phi::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining(
const T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const int C,
const int N,
const int HxW,
const double epsilon,
double exponentialAverageFactor,
T *y,
BatchNormParamType<T> *mean,
BatchNormParamType<T> *variance,
BatchNormParamType<T> *save_mean,
BatchNormParamType<T> *save_inv_variance) {
int outer_size = C;
int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage mean_storage;
  __shared__ typename BlockReduce::TempStorage variance_storage;
__shared__ BatchNormParamType<T> mean_val;
__shared__ BatchNormParamType<T> variance_val;
__shared__ BatchNormParamType<T> inv_var_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> x_square_sum = static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_i = static_cast<BatchNormParamType<T>>(x[index]);
x_sum += x_i;
x_square_sum += x_i * x_i;
}
x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
x_square_sum =
        BlockReduce(variance_storage).Reduce(x_square_sum, cub::Sum());
if (threadIdx.x == 0) {
mean_val = x_sum / inner_size;
variance_val = x_square_sum / inner_size - mean_val * mean_val;
inv_var_val = 1 / sqrt(variance_val + epsilon);
if (save_mean && save_inv_variance) {
save_mean[i] = mean_val;
save_inv_variance[i] = inv_var_val;
}
mean[i] = (1 - exponentialAverageFactor) * mean_val +
exponentialAverageFactor * mean[i];
variance[i] = (1 - exponentialAverageFactor) * variance_val +
exponentialAverageFactor * variance[i];
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[index]) - mean_val;
y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i];
}
}
}
template <typename T, typename Context>
void BatchNormKernel(const Context &ctx,
const DenseTensor &x,
const DenseTensor &scale,
const DenseTensor &bias,
const DenseTensor &mean,
const DenseTensor &variance,
float momentum,
float epsilon_f,
const std::string &data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor *y,
DenseTensor *mean_out,
DenseTensor *variance_out,
DenseTensor *saved_mean,
DenseTensor *saved_variance,
DenseTensor *reserve_space) {
double epsilon = epsilon_f;
const bool trainable_stats = trainable_statistics;
const DataLayout data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
bool test_mode = is_test && (!trainable_stats);
// Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width]
const auto &x_dims = x.dims();
PADDLE_ENFORCE_EQ(
x_dims.size() >= 2 && x_dims.size() <= 5,
true,
phi::errors::InvalidArgument(
"The size of input's dimensions should be between 2 and 5"
"But received: the size of input's dimensions is [%d]",
x_dims.size()));
ctx.template Alloc<T>(y);
int N, C, H, W, D;
paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
auto dtype = paddle::platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
auto compute_format =
data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW;
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// HIP do not support compute format of NHWC
// auto compute_format = DataLayout::kNCHW;
#else
const bool fast_nhwc_batch_norm =
test_mode ||
(dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent);
auto compute_format = fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
? DataLayout::kNHWC
: DataLayout::kNCHW;
#endif
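  // If the data layout is NHWC but the compute layout is NCHW, transpose the
  // input to channel-first here and transpose the result back afterwards.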
DenseTensor transformed_x(x.type());
DenseTensor transformed_y(y->type());
if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW &&
x_dims.size() > 2) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<Context, T>(ctx, &x, &transformed_x);
TransToChannelFirst<Context, T>(ctx, &x, &transformed_x);
ResizeToChannelFirst<Context, T>(ctx, y, &transformed_y);
} else {
transformed_x.ShareDataWith(x);
transformed_y.ShareDataWith(*y);
}
// ------------------- cudnn descriptors ---------------------
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// miopenTensorDescriptor_t data_desc_;
// miopenTensorDescriptor_t bn_param_desc_;
// miopenBatchNormMode_t mode_;
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// mode_ = miopenBNSpatial;
#elif CUDNN_VERSION_MIN(7, 0, 1)
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else
if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#endif // CUDNN_VERSION_MIN(7, 0, 1)
VLOG(3) << "Setting descriptors.";
std::vector<int> dims;
std::vector<int> strides;
if (compute_format == DataLayout::kNCHW) {
dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else {
dims = {N, C, H, W, D};
strides = {H * W * D * C, 1, W * D * C, D * C, C};
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
// data_desc_, CudnnDataType<T>::type,
// x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
// const_cast<int *>(strides.data())));
// Note: PERSISTENT not implemented for inference
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDeriveBNTensorDescriptor(
// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
dims.data(),
strides.data()));
// Note: PERSISTENT not implemented for inference
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
bn_param_desc_,
data_desc_,
test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_));
#endif
auto handle = ctx.cudnn_handle();
// Now, depending on whether we are running test or not, we have two paths.
  // It is training mode when it is not inference AND not using pre-trained
  // statistics (use_global_stats).
bool training = !test_mode && !use_global_stats;
if (!training) {
    // Only in test mode do we use the input estimates for computation.
const auto *est_mean = &mean;
const auto *est_var = &variance;
// Run inference mode.
PADDLE_ENFORCE_EQ(
est_mean->dims().size(),
1UL,
phi::errors::InvalidArgument(
"The size of mean's dimensions must equal to 1."
"But received: the size of mean's dimensions mean is [%d],"
"the dimensions of mean is [%s].",
est_mean->dims().size(),
est_mean->dims()));
PADDLE_ENFORCE_EQ(
est_var->dims().size(),
1UL,
phi::errors::InvalidArgument(
"The size of variance's dimensions must equal to 1."
"But received: the size of variance's dimensions is [%d],"
"the dimensions of variance is [%s].",
est_var->dims().size(),
est_var->dims()));
PADDLE_ENFORCE_EQ(
est_mean->dims()[0],
C,
phi::errors::InvalidArgument(
"The first dimension of mean must equal to the number of "
"Channels, which is [%d]. But received: the first dimension"
"of mean is [%d], the dimensions of mean is [%s].",
C,
est_mean->dims()[0],
est_mean->dims()));
PADDLE_ENFORCE_EQ(
est_var->dims()[0],
C,
phi::errors::InvalidArgument(
"The first dimension of variance must equal to the number"
"of Channels, which is [%d]. But received: the first dimension of"
"variance is [%d], the dimensions of variance is [%s].",
C,
est_var->dims()[0],
est_var->dims()));
#ifdef PADDLE_WITH_HIP
const int block_size = 256;
const int grid_size = (N * C * H * W * D + block_size - 1) / block_size;
if (compute_format == DataLayout::kNCHW) {
BNForwardInference<
T,
DataLayout::kNCHW><<<grid_size, block_size, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
transformed_y.template data<T>());
} else {
BNForwardInference<
T,
DataLayout::kNHWC><<<grid_size, block_size, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
transformed_y.template data<T>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardInference(
// handle, miopenBNSpatial,
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// est_mean->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// est_var->template data<BatchNormParamType<T>>())),
// epsilon));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationForwardInference(
handle,
// Note: PERSISTENT not implemented for inference
CUDNN_BATCHNORM_SPATIAL,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
transformed_x.template data<T>(),
data_desc_,
ctx.template Alloc<T>(&transformed_y),
bn_param_desc_,
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
epsilon));
#endif
} else {
    // If a MomentumTensor input is provided, its value should override the
    // `momentum` attribute; momentum is only used in this training branch.
    // TODO: MomentumTensor is not yet handled in the phi kernel, see the
    // commented-out code below.
// if (ctx.HasInput("MomentumTensor")) {
// const auto *mom_tensor = MomentumTensor;
// DenseTensor mom_cpu;
// paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(),
// &mom_cpu);
// momentum = mom_cpu.data<float>()[0];
// }
// Run training mode.
    // Allocate the running mean / variance and saved statistics outputs;
    // there is no need to initialize them here.
mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
if ((N * H * W * D) == 1) {
// Only 1 element in normalization dimension,
// skip the batch norm calculation, let y = x.
paddle::framework::TensorCopy(x, ctx.GetPlace(), y);
} else {
double this_factor = 1. - momentum;
bool called = false;
#if CUDNN_VERSION_MIN(7, 4, 1)
called = true;
size_t workspace_size = 0;
size_t reserve_space_size = 0;
void *reserve_space_ptr = nullptr;
void *workspace_ptr = nullptr;
DenseTensor workspace_tensor;
// Create reserve space and workspace for batch norm.
      // A tensor is created for each batch_norm op and reused in the backward
      // pass, so it must not be a temporary.
// auto *reserve_space = ctx.Output<Tensor>("ReserveSpace");
PADDLE_ENFORCE_NOT_NULL(
reserve_space,
phi::errors::NotFound(
"The argument ReserveSpace of batch_norm op is not found."));
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::
cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnIps=*/CUDNN_BATCHNORM_OPS_BN,
/*xDesc=*/data_desc_,
/*zDesc=*/nullptr,
/*yDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/nullptr,
/*sizeInBytes=*/&workspace_size));
// -------------- cudnn batchnorm reserve space --------------
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::
cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*activationDesc=*/nullptr,
/*xDesc=*/data_desc_,
/*sizeInBytes=*/&reserve_space_size));
reserve_space_ptr = reserve_space->mutable_data(
ctx.GetPlace(), transformed_x.type(), reserve_space_size);
workspace_ptr = workspace_tensor.mutable_data(
ctx.GetPlace(), transformed_x.type(), workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
handle,
mode_,
CUDNN_BATCHNORM_OPS_BN,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
transformed_x.template data<T>(),
nullptr,
nullptr,
data_desc_,
transformed_y.template data<T>(),
bn_param_desc_,
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
this_factor,
mean_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
variance_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
nullptr,
workspace_ptr,
workspace_size,
reserve_space_ptr,
reserve_space_size));
#endif // CUDNN_VERSION_MIN(7, 4, 1)
if (!called) {
#ifdef PADDLE_WITH_HIP
const int num = transformed_x.numel();
const int block = 256;
const int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
const int grid = std::min(C, max_blocks);
if (compute_format == DataLayout::kNCHW) {
BNForwardTraining<
T,
block,
DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
this_factor,
transformed_y.template data<T>(),
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
} else {
BNForwardTraining<
T,
block,
DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
this_factor,
transformed_y.template data<T>(),
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardTraining(
// handle, mode_, const_cast<void *>(static_cast<const void *>(
// CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// this_factor,
// static_cast<void *>(
// mean_out->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(variance_out->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace())),
// epsilon,
// static_cast<void *>(
// saved_mean->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(saved_variance->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace()))));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationForwardTraining(
handle,
mode_,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
transformed_x.template data<T>(),
data_desc_,
ctx.template Alloc<T>(&transformed_y),
bn_param_desc_,
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
this_factor,
mean_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
variance_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace())));
#endif
}
}
}
if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW &&
x_dims.size() > 2) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
TransToChannelLast<Context, T>(ctx, &transformed_y, y);
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// clean when exit.
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
#else
// clean when exit.
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
#endif
}
} // namespace phi
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(batch_norm,
GPU,
ALL_LAYOUT,
phi::BatchNormKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(batch_norm,
GPU,
ALL_LAYOUT,
phi::BatchNormKernel,
float,
double,
phi::dtype::float16) {
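  // For float16 inputs, the statistics outputs (MeanOut, VarianceOut,
  // SavedMean, SavedVariance) are kept in float32.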
if (kernel_key.dtype() == phi::DataType::FLOAT16) {
kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32);
}
}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
using Tensor = DenseTensor;
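// Layout helpers shared by the GPU batch_norm kernels: resize a tensor's dims
// to channel-first/channel-last and transpose data between the two layouts for
// 1D, 2D and 3D spatial inputs.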
template <typename DeviceContext, typename T>
inline void ResizeToChannelFirst(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[4];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
in_dims_vec[4] = input->dims()[3];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[3];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
} else if (dim == 1) {
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
}
}
template <typename DeviceContext, typename T>
inline void ResizeToChannelLast(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[4];
in_dims_vec[4] = input->dims()[1];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[1];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
} else if (dim == 1) {
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelFirst(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
VLOG(5) << "Why am I called?";
int dim = input->dims().size() - 2;
if (dim == 3) {
std::vector<int> axis{0, 4, 1, 2, 3};
funcs::Transpose<DeviceContext, T, 5> trans5;
trans5(context, *input, transformed_input, axis);
} else if (dim == 2) {
std::vector<int> axis{0, 3, 1, 2};
funcs::Transpose<DeviceContext, T, 4> trans4;
trans4(context, *input, transformed_input, axis);
} else if (dim == 1) {
std::vector<int> axis{0, 2, 1};
funcs::Transpose<DeviceContext, T, 3> trans3;
trans3(context, *input, transformed_input, axis);
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelLast(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
std::vector<int> axis{0, 2, 3, 4, 1};
funcs::Transpose<DeviceContext, T, 5> trans5;
trans5(context, *input, transformed_input, axis);
} else if (dim == 2) {
std::vector<int> axis{0, 2, 3, 1};
funcs::Transpose<DeviceContext, T, 4> trans4;
trans4(context, *input, transformed_input, axis);
} else if (dim == 1) {
std::vector<int> axis{0, 2, 1};
funcs::Transpose<DeviceContext, T, 3> trans3;
trans3(context, *input, transformed_input, axis);
}
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
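// Map the legacy fluid batch_norm operator's input/output/attribute names onto
// the phi kernel signatures (batch_norm, batch_norm_grad, batch_norm_grad_grad).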
KernelSignature BatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("batch_norm",
{"X", "Scale", "Bias", "Mean", "Variance"},
{"momentum",
"epsilon",
"data_layout",
"is_test",
"use_global_stats",
"trainable_statistics",
"fuse_with_relu"},
{"Y",
"MeanOut",
"VarianceOut",
"SavedMean",
"SavedVariance",
"ReserveSpace"});
}
KernelSignature BatchNormGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"batch_norm_grad",
{GradVarName("Y"),
"X",
"Scale",
"Bias",
"SavedMean",
"SavedVariance",
"ReserveSpace",
"Mean",
"Variance"},
{"momentum",
"epsilon",
"data_layout",
"is_test",
"use_global_stats",
"trainable_statistics",
"fuse_with_relu"},
{GradVarName("X"), GradVarName("Scale"), GradVarName("Bias")});
}
KernelSignature BatchNormGradGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("batch_norm_grad_grad",
{"DDX",
"DDScale",
"DDBias",
"DY",
"X",
"Scale",
"SavedMean",
"SavedVariance",
"Mean",
"Variance"},
{"momentum",
"epsilon",
"data_layout",
"is_test",
"use_global_stats",
"trainable_statistics",
"fuse_with_relu"},
{"DX", "DScale", "DDY"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(batch_norm, phi::BatchNormOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(batch_norm_grad,
phi::BatchNormGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(batch_norm_grad_grad,
phi::BatchNormGradGradOpArgumentMapping);
...@@ -520,6 +520,7 @@ def predict_static(args, data): ...@@ -520,6 +520,7 @@ def predict_static(args, data):
paddle.enable_static() paddle.enable_static()
exe = fluid.Executor(args.place) exe = fluid.Executor(args.place)
# load inference model # load inference model
[inference_program, feed_target_names, [inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model( fetch_targets] = fluid.io.load_inference_model(
args.model_save_dir, args.model_save_dir,
......
...@@ -162,6 +162,7 @@ class TestIRPassBase(unittest.TestCase): ...@@ -162,6 +162,7 @@ class TestIRPassBase(unittest.TestCase):
for k, v in self.get_strategy().items(): for k, v in self.get_strategy().items():
setattr(build_strategy, k, v) setattr(build_strategy, k, v)
self.check_before_applied(main2, startup2) self.check_before_applied(main2, startup2)
apply_build_strategy(main2, startup2, build_strategy, apply_build_strategy(main2, startup2, build_strategy,
{"use_cuda": self.use_cuda}) {"use_cuda": self.use_cuda})
self.check_after_applied(main2, startup2) self.check_after_applied(main2, startup2)
......
...@@ -320,7 +320,7 @@ class TestBatchNormOpInference(unittest.TestCase): ...@@ -320,7 +320,7 @@ class TestBatchNormOpInference(unittest.TestCase):
def test_check_output(self): def test_check_output(self):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
...@@ -342,13 +342,13 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): ...@@ -342,13 +342,13 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference):
def test_check_output(self): def test_check_output(self):
places = [] places = []
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
if core.is_float16_supported(place): if core.is_float16_supported(place):
places.append(place) places.append(place)
for place in places: for place in places:
for data_format in ["NCHW", "NHWC"]: #for data_format in ["NCHW", "NHWC"]:
for data_format in ["NCHW"]:
self.check_with_place(place, data_format, self.dtype, self.check_with_place(place, data_format, self.dtype,
[2, 3, 4, 5]) [2, 3, 4, 5])
self.check_with_place(place, data_format, self.dtype, [2, 3]) self.check_with_place(place, data_format, self.dtype, [2, 3])
...@@ -517,7 +517,7 @@ class TestBatchNormOpTraining(unittest.TestCase): ...@@ -517,7 +517,7 @@ class TestBatchNormOpTraining(unittest.TestCase):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
...@@ -657,7 +657,7 @@ class TestDygraphBatchNormAPIError(unittest.TestCase): ...@@ -657,7 +657,7 @@ class TestDygraphBatchNormAPIError(unittest.TestCase):
class TestDygraphBatchNormTrainableStats(unittest.TestCase): class TestDygraphBatchNormTrainableStats(unittest.TestCase):
def test_dygraph(self): def test_dygraph(self):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0)) places.append(fluid.CUDAPlace(0))
for p in places: for p in places:
shape = [4, 10, 4, 4] shape = [4, 10, 4, 4]
...@@ -678,7 +678,7 @@ class TestDygraphBatchNormTrainableStats(unittest.TestCase): ...@@ -678,7 +678,7 @@ class TestDygraphBatchNormTrainableStats(unittest.TestCase):
def test_static(self): def test_static(self):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0)) places.append(fluid.CUDAPlace(0))
for p in places: for p in places:
exe = fluid.Executor(p) exe = fluid.Executor(p)
...@@ -716,4 +716,6 @@ class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase): ...@@ -716,4 +716,6 @@ class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase):
if __name__ == '__main__': if __name__ == '__main__':
import paddle
paddle.enable_static()
unittest.main() unittest.main()
...@@ -28,7 +28,7 @@ import paddle ...@@ -28,7 +28,7 @@ import paddle
class TestBatchNorm(unittest.TestCase): class TestBatchNorm(unittest.TestCase):
def test_name(self): def test_name(self):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0)) places.append(fluid.CUDAPlace(0))
for p in places: for p in places:
with fluid.dygraph.guard(p): with fluid.dygraph.guard(p):
...@@ -36,7 +36,7 @@ class TestBatchNorm(unittest.TestCase): ...@@ -36,7 +36,7 @@ class TestBatchNorm(unittest.TestCase):
def test_error(self): def test_error(self):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0)) places.append(fluid.CUDAPlace(0))
for p in places: for p in places:
#paddle.disable_static() #paddle.disable_static()
...@@ -83,7 +83,7 @@ class TestBatchNorm(unittest.TestCase): ...@@ -83,7 +83,7 @@ class TestBatchNorm(unittest.TestCase):
def test_dygraph(self): def test_dygraph(self):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0)) places.append(fluid.CUDAPlace(0))
for p in places: for p in places:
shape = [4, 10, 4, 4] shape = [4, 10, 4, 4]
...@@ -135,7 +135,7 @@ class TestBatchNorm(unittest.TestCase): ...@@ -135,7 +135,7 @@ class TestBatchNorm(unittest.TestCase):
def test_static(self): def test_static(self):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0)) places.append(fluid.CUDAPlace(0))
for p in places: for p in places:
exe = fluid.Executor(p) exe = fluid.Executor(p)
...@@ -177,7 +177,7 @@ class TestBatchNormChannelLast(unittest.TestCase): ...@@ -177,7 +177,7 @@ class TestBatchNormChannelLast(unittest.TestCase):
else: else:
paddle.set_default_dtype("float64") paddle.set_default_dtype("float64")
self.places = [fluid.CPUPlace()] self.places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
self.places.append(fluid.CUDAPlace(0)) self.places.append(fluid.CUDAPlace(0))
def tearDown(self): def tearDown(self):
...@@ -247,7 +247,7 @@ class TestBatchNormChannelLast(unittest.TestCase): ...@@ -247,7 +247,7 @@ class TestBatchNormChannelLast(unittest.TestCase):
class TestBatchNormUseGlobalStats(unittest.TestCase): class TestBatchNormUseGlobalStats(unittest.TestCase):
def setUp(self): def setUp(self):
self.places = [fluid.CPUPlace()] self.places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
self.places.append(fluid.CUDAPlace(0)) self.places.append(fluid.CUDAPlace(0))
self.init_test() self.init_test()
...@@ -300,4 +300,6 @@ class TestBatchNormUseGlobalStatsCase3(TestBatchNormUseGlobalStats): ...@@ -300,4 +300,6 @@ class TestBatchNormUseGlobalStatsCase3(TestBatchNormUseGlobalStats):
if __name__ == '__main__': if __name__ == '__main__':
import paddle
paddle.enable_static()
unittest.main() unittest.main()
...@@ -16,6 +16,7 @@ from __future__ import print_function ...@@ -16,6 +16,7 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
import paddle
import paddle import paddle
import paddle.fluid.core as core import paddle.fluid.core as core
...@@ -1001,4 +1002,5 @@ create_test_cudnn_channel_last_fp16_class( ...@@ -1001,4 +1002,5 @@ create_test_cudnn_channel_last_fp16_class(
TestWithDilation_AsyPadding, grad_check=False) TestWithDilation_AsyPadding, grad_check=False)
if __name__ == '__main__': if __name__ == '__main__':
paddle.enable_static()
unittest.main() unittest.main()
...@@ -231,4 +231,5 @@ class TestExpandV2API(unittest.TestCase): ...@@ -231,4 +231,5 @@ class TestExpandV2API(unittest.TestCase):
if __name__ == "__main__": if __name__ == "__main__":
paddle.enable_static()
unittest.main() unittest.main()
...@@ -23,6 +23,7 @@ import paddle.fluid as fluid ...@@ -23,6 +23,7 @@ import paddle.fluid as fluid
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid import compiler from paddle.fluid import compiler
import paddle.fluid.unique_name as unique_name import paddle.fluid.unique_name as unique_name
import paddle
class TestInplaceANBOpTraining(unittest.TestCase): class TestInplaceANBOpTraining(unittest.TestCase):
...@@ -138,14 +139,14 @@ class TestInplaceANBOpTraining(unittest.TestCase): ...@@ -138,14 +139,14 @@ class TestInplaceANBOpTraining(unittest.TestCase):
outs[0].name if not only_forward else None, outs[0].name if not only_forward else None,
build_strategy=build_strategy, build_strategy=build_strategy,
exec_strategy=exec_strategy) exec_strategy=exec_strategy)
bn_fetches = exe.run(program=comp_prog1, bn_fetches = exe.run(program=main,
feed={'input': data}, feed={'input': data},
fetch_list=fetch_name) fetch_list=fetch_name)
fetch_outs.append(bn_fetches) fetch_outs.append(bn_fetches)
fetch_names.append(fetch_name) fetch_names.append(fetch_name)
for bn_val, inplace_abn_val, name1, name2 in zip(*(fetch_outs + for bn_val, inplace_abn_val, name1, name2 in zip(*(
fetch_names)): fetch_outs + fetch_names)):
self.assertTrue( self.assertTrue(
np.allclose( np.allclose(
bn_val, inplace_abn_val, atol=1e-2), bn_val, inplace_abn_val, atol=1e-2),
...@@ -156,6 +157,7 @@ class TestInplaceANBOpTraining(unittest.TestCase): ...@@ -156,6 +157,7 @@ class TestInplaceANBOpTraining(unittest.TestCase):
def test_op(self): def test_op(self):
use_cudas = [False, True] if core.is_compiled_with_cuda() else [False] use_cudas = [False, True] if core.is_compiled_with_cuda() else [False]
#use_cudas = [False]
for use_cuda in use_cudas: for use_cuda in use_cudas:
place = core.CUDAPlace(0) if use_cuda else core.CPUPlace() place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
layouts = ["NCHW", "NHWC"] layouts = ["NCHW", "NHWC"]
...@@ -186,4 +188,5 @@ class TestInplaceANBOpTraining(unittest.TestCase): ...@@ -186,4 +188,5 @@ class TestInplaceANBOpTraining(unittest.TestCase):
if __name__ == '__main__': if __name__ == '__main__':
paddle.enable_static()
unittest.main() unittest.main()
...@@ -21,6 +21,7 @@ import paddle.fluid as fluid ...@@ -21,6 +21,7 @@ import paddle.fluid as fluid
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
import paddle.fluid.core as core import paddle.fluid.core as core
import gradient_checker import gradient_checker
import paddle
from decorator_helper import prog_scope from decorator_helper import prog_scope
...@@ -167,4 +168,5 @@ class TestBatchNormDoubleGradCheckCase6(TestBatchNormDoubleGradCheckCase5): ...@@ -167,4 +168,5 @@ class TestBatchNormDoubleGradCheckCase6(TestBatchNormDoubleGradCheckCase5):
if __name__ == "__main__": if __name__ == "__main__":
paddle.enable_static()
unittest.main() unittest.main()
...@@ -24,6 +24,7 @@ from simple_nets import init_data, simple_fc_net, fc_with_batchnorm ...@@ -24,6 +24,7 @@ from simple_nets import init_data, simple_fc_net, fc_with_batchnorm
import seresnext_net import seresnext_net
from test_parallel_executor_transformer import transformer, get_feed_data_reader, DeviceType from test_parallel_executor_transformer import transformer, get_feed_data_reader, DeviceType
from fake_reader import fake_imdb_reader from fake_reader import fake_imdb_reader
import paddle
def lstm_net(use_feed): def lstm_net(use_feed):
...@@ -309,4 +310,5 @@ class TestProgramPruneBackward(unittest.TestCase): ...@@ -309,4 +310,5 @@ class TestProgramPruneBackward(unittest.TestCase):
if __name__ == '__main__': if __name__ == '__main__':
paddle.enable_static()
unittest.main() unittest.main()
...@@ -507,4 +507,5 @@ class TestReshapeZeroTensor(unittest.TestCase): ...@@ -507,4 +507,5 @@ class TestReshapeZeroTensor(unittest.TestCase):
if __name__ == "__main__": if __name__ == "__main__":
paddle.enable_static()
unittest.main() unittest.main()