Unverified commit cc272afb, authored by YuanRisheng, committed by GitHub

[Phi]Refactor InstanceNormKernel and InstanceNormGradKernel (#42978)

* move instance_norm

* change mutable_data

* fix compile bugs
Parent 8f7f3ac9
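
The refactor moves each kernel out of the fluid `OpKernel` classes shown below into free functions under `namespace phi`, and the "change mutable_data" step replaces output allocation via `mutable_data` with allocation through the device context. A schematic before/after sketch of that pattern, abridged from the diff that follows (signatures shortened, not compilable on its own):

// Before: the fluid OpKernel fetches outputs from the ExecutionContext and
// allocates them with mutable_data.
void Compute(const framework::ExecutionContext &ctx) const override {
  auto *y = ctx.Output<Tensor>("Y");
  y->mutable_data<T>(ctx.GetPlace());
  // ... compute into y ...
}

// After: the phi kernel receives outputs as DenseTensor* parameters and
// allocates them through the passed-in device context.
template <typename T, typename Context>
void InstanceNormKernel(const Context &dev_ctx,
                        const DenseTensor &x,
                        /* scale, bias, epsilon, ... */
                        DenseTensor *y,
                        DenseTensor *saved_mean,
                        DenseTensor *saved_variance) {
  dev_ctx.template Alloc<T>(y);
  // ... compute into y ...
}
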
@@ -170,104 +170,6 @@ NCHW `[batch, in_channels, in_height, in_width]`
)DOC");
}
template <typename T>
class InstanceNormKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims();
const int N = x_dims[0];
const int C = x_dims[1];
const int NxC = N * C;
const int sample_size = x->numel() / N / C;
auto *y = ctx.Output<Tensor>("Y");
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
auto *place = dev_ctx.eigen_device();
Eigen::DSizes<int, 2> shape(NxC, sample_size);
// Once eigen on Windows is updated, the if branch can be removed.
#ifndef EIGEN_HAS_INDEX_LIST
Eigen::DSizes<int, 2> bcast(1, sample_size);
Eigen::DSizes<int, 2> C_shape(C, 1);
Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
Eigen::DSizes<int, 1> rdims(1);
#else
Eigen::IndexList<Eigen::type2index<1>, int> bcast;
bcast.set(1, sample_size);
Eigen::IndexList<int, Eigen::type2index<1>> C_shape;
C_shape.set(0, C);
Eigen::IndexList<int, Eigen::type2index<1>> NxC_shape;
NxC_shape.set(0, NxC);
Eigen::IndexList<Eigen::type2index<1>> rdims;
#endif
phi::funcs::SetConstant<platform::CPUDeviceContext, T> set_constant;
saved_mean->mutable_data<T>(ctx.GetPlace());
saved_variance->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, saved_mean, static_cast<T>(0));
set_constant(dev_ctx, saved_variance, static_cast<T>(0));
auto saved_mean_a = framework::EigenVector<T>::Flatten(*saved_mean);
auto saved_mean_e = saved_mean_a.reshape(NxC_shape);
auto saved_variance_a = framework::EigenVector<T>::Flatten(*saved_variance);
auto saved_variance_e = saved_variance_a.reshape(NxC_shape);
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto x_arr = x_e.reshape(shape);
saved_mean_e.device(*place) = x_arr.mean(rdims);
auto saved_variance_arr =
(x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon;
saved_variance_e.device(*place) = saved_variance_arr.sqrt().inverse();
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
Tensor scale_data;
Tensor bias_data;
if (!scale) {
scale_data.mutable_data<T>({C}, ctx.GetPlace());
set_constant(dev_ctx, &scale_data, static_cast<T>(1));
}
if (!bias) {
bias_data.mutable_data<T>({C}, ctx.GetPlace());
set_constant(dev_ctx, &bias_data, static_cast<T>(0));
}
auto scale_e = scale
? framework::EigenVector<T>::Flatten(*scale)
: framework::EigenVector<T>::Flatten(
const_cast<const framework::Tensor &>(scale_data));
auto scale_arr = scale_e.reshape(C_shape);
auto bias_e = bias ? framework::EigenVector<T>::Flatten(*bias)
: framework::EigenVector<T>::Flatten(
const_cast<const framework::Tensor &>(bias_data));
auto bias_arr = bias_e.reshape(C_shape);
y->mutable_data<T>(ctx.GetPlace());
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto y_arr = y_e.reshape(shape);
// (x - mean) * inv_std * scale + bias
Eigen::DSizes<int, 2> bcast_param(N, sample_size);
y_arr.device(*place) = (x_arr - saved_mean_e.broadcast(bcast)) *
saved_variance_e.broadcast(bcast) *
scale_arr.broadcast(bcast_param) +
bias_arr.broadcast(bcast_param);
}
};
void InstanceNormGradOp::InferShape(framework::InferShapeContext *ctx) const {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormGrad");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input",
@@ -312,120 +214,6 @@ framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
template <typename T>
class InstanceNormGradKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<Tensor>("X");
const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance");
const auto &x_dims = x->dims();
const int N = x_dims[0];
const int C = x_dims[1];
const int NxC = N * C;
const int sample_size = x->numel() / N / C;
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
d_x->mutable_data<T>(ctx.GetPlace());
auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
auto *place = dev_ctx.eigen_device();
Eigen::DSizes<int, 2> rshape(NxC, sample_size);
Eigen::DSizes<int, 2> param_shape(N, C);
Eigen::DSizes<int, 2> shape(NxC, sample_size);
#ifndef EIGEN_HAS_INDEX_LIST
Eigen::DSizes<int, 1> rdims(0);
Eigen::DSizes<int, 1> mean_rdims(1);
Eigen::DSizes<int, 2> bcast(1, sample_size);
Eigen::DSizes<int, 2> C_shape(C, 1);
Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
#else
Eigen::IndexList<Eigen::type2index<0>> rdims;
Eigen::IndexList<Eigen::type2index<1>> mean_rdims;
Eigen::IndexList<Eigen::type2index<1>, int> bcast;
bcast.set(1, sample_size);
Eigen::IndexList<int, Eigen::type2index<1>> C_shape;
C_shape.set(0, C);
Eigen::IndexList<int, Eigen::type2index<1>> NxC_shape;
NxC_shape.set(0, NxC);
#endif
phi::funcs::SetConstant<platform::CPUDeviceContext, T> set_constant;
Tensor scale_data;
if (!scale) {
scale_data.mutable_data<T>({C}, ctx.GetPlace());
set_constant(dev_ctx, &scale_data, static_cast<T>(1));
}
auto scale_e = scale
? framework::EigenVector<T>::Flatten(*scale)
: framework::EigenVector<T>::Flatten(
const_cast<const framework::Tensor &>(scale_data));
auto mean_e = framework::EigenVector<T>::Flatten(*saved_mean);
auto inv_var_e = framework::EigenVector<T>::Flatten(*saved_inv_variance);
auto dy_e = framework::EigenVector<T>::Flatten(*d_y);
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto scale_arr = scale_e.reshape(C_shape);
auto mean_arr = mean_e.reshape(NxC_shape);
auto inv_var_arr = inv_var_e.reshape(NxC_shape);
auto dy_arr = dy_e.reshape(shape);
auto x_arr = x_e.reshape(shape);
auto tmp = (x_arr - mean_arr.eval().broadcast(bcast)) *
inv_var_arr.eval().broadcast(bcast);
// math: d_bias = np.sum(d_y, axis=(n,h,w))
// math: d_scale = np.sum((X-mean) / inv_std * dy, axis=(n, h,w))
if (d_scale && d_bias) {
d_scale->mutable_data<T>(ctx.GetPlace());
d_bias->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, d_scale, static_cast<T>(0));
set_constant(dev_ctx, d_bias, static_cast<T>(0));
auto d_scale_e = framework::EigenVector<T>::Flatten(*d_scale);
auto d_scale_data = d_scale_e.reshape(C_shape);
auto d_bias_e = framework::EigenVector<T>::Flatten(*d_bias);
auto d_bias_data = d_bias_e.reshape(C_shape);
d_bias_data.device(*place) =
dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims);
d_scale_data.device(*place) =
(tmp * dy_arr).sum(mean_rdims).reshape(param_shape).sum(rdims);
}
auto dy_mean =
dy_arr.mean(mean_rdims).reshape(NxC_shape).eval().broadcast(bcast);
Eigen::DSizes<int, 2> bcast_param(N, sample_size);
set_constant(dev_ctx, d_x, static_cast<T>(0));
// math: d_x = scale * inv_var * d_y - scale * inv_var * np.sum(d_y,
// axis=(h,w))
// - scale * (X - mean) * inv_var.pow(3) * np.sum(d_y * (X -
// mean),
// axis=(h,w))
auto dx_e = framework::EigenVector<T>::Flatten(*d_x);
auto dx_arr = dx_e.reshape(shape);
dx_arr.device(*place) = scale_arr.broadcast(bcast_param) *
inv_var_arr.broadcast(bcast) *
(dy_arr - dy_mean -
tmp *
(dy_arr * tmp)
.mean(mean_rdims)
.reshape(NxC_shape)
.eval()
.broadcast(bcast));
}
};
void InstanceNormDoubleGradOp::InferShape(
framework::InferShapeContext *ctx) const {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormDoubleGrad");
@@ -699,14 +487,6 @@ REGISTER_OPERATOR(instance_norm_grad, ops::InstanceNormGradOp,
REGISTER_OPERATOR(instance_norm_grad_grad, ops::InstanceNormDoubleGradOp,
ops::InstanceNormDoubleGradOpInplaceInferer);
REGISTER_OP_CPU_KERNEL(
instance_norm,
ops::InstanceNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::InstanceNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
instance_norm_grad,
ops::InstanceNormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::InstanceNormGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
instance_norm_grad_grad,
ops::InstanceNormDoubleGradKernel<paddle::platform::CPUDeviceContext,
......
@@ -70,181 +70,6 @@ static __global__ void add_param(const T *input, T *output,
}
}
template <typename T>
class InstanceNormKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::PreconditionNotMet("It must be CUDAPlace."));
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
auto *x = ctx.Input<Tensor>("X");
auto &x_dims = x->dims();
PADDLE_ENFORCE_GE(x_dims.size(), 2,
platform::errors::InvalidArgument(
"The `shape` in InstanceNormOp is invalid: "
"the size of X's dimensions must greater than "
"or equal to 2. But received: "
"the size of X's dimensions is [%d]",
x_dims.size()));
PADDLE_ENFORCE_LE(x_dims.size(), 5,
platform::errors::InvalidArgument(
"The `shape` in InstanceNormOp is invalid: "
"the size of X's dimensions must smaller than"
"or equal to 5. But received: "
"the size of X's dimensions is [%d]",
x_dims.size()));
int N, C, H, W, D;
ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
int NxC = N * C;
Tensor x_tmp;
x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D});
auto *y = ctx.Output<Tensor>("Y");
y->mutable_data<T>(ctx.GetPlace());
#ifdef PADDLE_WITH_HIP
miopenTensorDescriptor_t data_desc_;
miopenTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
VLOG(3) << "Setting descriptors.";
std::vector<int> dims;
std::vector<int> strides;
dims = {1, NxC, H, W, D};
strides = {NxC * H * W * D, H * W * D, W * D, D, 1};
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
const_cast<int *>(strides.data())));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, miopenBNSpatial));
#else
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
#endif
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
Tensor scale_tmp =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
scale_tmp.mutable_data<T>(ctx.GetPlace());
Tensor bias_tmp =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
bias_tmp.mutable_data<T>(ctx.GetPlace());
const int n = x->numel();
const int block = 512;
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
const int grid = std::min((NxC + block - 1) / block, max_blocks);
phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_constant;
if (scale) {
repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
scale->data<T>(), scale_tmp.data<T>(), N, C);
} else {
set_constant(dev_ctx, &scale_tmp, static_cast<T>(1));
}
if (bias) {
repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
bias->data<T>(), bias_tmp.data<T>(), N, C);
} else {
set_constant(dev_ctx, &bias_tmp, static_cast<T>(0));
}
auto handle = dev_ctx.cudnn_handle();
phi::funcs::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
functor;
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenBatchNormalizationForwardTraining(
handle, miopenBNSpatial,
const_cast<void *>(
static_cast<const void *>(CudnnDataType<T>::kOne())),
const_cast<void *>(
static_cast<const void *>(CudnnDataType<T>::kZero())),
data_desc_, static_cast<const void *>(x_tmp.template data<T>()),
data_desc_,
static_cast<void *>(y->template mutable_data<T>(ctx.GetPlace())),
in_param_desc_,
const_cast<void *>(static_cast<const void *>(
scale_tmp.template data<BatchNormParamType<T>>())),
const_cast<void *>(static_cast<const void *>(
bias_tmp.template data<BatchNormParamType<T>>())),
0, nullptr, nullptr, epsilon,
static_cast<void *>(
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace())),
static_cast<void *>(
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()))));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationForwardTraining(
handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_, x_tmp.template data<T>(),
data_desc_, y->template mutable_data<T>(ctx.GetPlace()),
in_param_desc_, scale_tmp.template data<BatchNormParamType<T>>(),
bias_tmp.template data<BatchNormParamType<T>>(), 0, nullptr,
nullptr, epsilon,
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace())));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
#endif
}
};
template <typename T, int BlockDim>
static __global__ void GradComputeDX(const T *dy,
const BatchNormParamType<T> *scale,
@@ -297,203 +122,6 @@ static __global__ void GradComputeDX(const T *dy,
}
}
template <typename T>
class InstanceNormGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::PreconditionNotMet("It must use CUDAPlace."));
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *x = ctx.Input<Tensor>("X");
const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto &x_dims = x->dims();
int N, C, H, W, D;
ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
int NxC = N * C;
Tensor x_tmp, d_y_tmp;
x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D});
d_y_tmp.ShareDataWith(*d_y).Resize({1, NxC, H, W, D});
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
d_x->mutable_data<T>(ctx.GetPlace());
if (d_scale && d_bias) {
d_scale->mutable_data<T>(ctx.GetPlace());
d_bias->mutable_data<T>(ctx.GetPlace());
}
if (scale) {
PADDLE_ENFORCE_EQ(
scale->dims().size(), 1UL,
platform::errors::InvalidArgument(
"The `shape` in InstanceNormOp is invalid: "
"the size of scale's dimensions must be equal to 1. But "
"received: the size of scale's dimensions"
"is [%d]",
scale->dims().size()));
PADDLE_ENFORCE_EQ(scale->dims()[0], C,
platform::errors::InvalidArgument(
"The `shape` in InstanceNormOp is invalid: "
"the first dimension of scale must be equal to "
"Channels([%d]). But received: "
"the first dimension of scale is [%d],"
"the dimensions of scale is [%s], ",
C, scale->dims()[0], scale->dims()));
}
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_constant;
const int n = x->numel();
const int block = 512;
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
const int grid = std::min(NxC, max_blocks);
const int grid1 = (C + block - 1) / block;
Tensor scale_tmp =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
scale_tmp.mutable_data<T>(ctx.GetPlace());
Tensor d_scale_tmp =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
Tensor d_bias_tmp =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
if (scale) {
repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
scale->data<T>(), scale_tmp.data<T>(), N, C);
} else {
set_constant(dev_ctx, &scale_tmp, static_cast<T>(1));
}
std::vector<int> dims;
std::vector<int> strides;
dims = {1, NxC, H, W, D};
strides = {NxC * H * W * D, H * W * D, W * D, D, 1};
if ((H * W * D) == 1) {
framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
phi::funcs::SetConstant<platform::CUDADeviceContext,
BatchNormParamType<T>>
functor;
functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
return;
}
#ifdef PADDLE_WITH_HIP
miopenTensorDescriptor_t data_desc_;
miopenTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
const_cast<int *>(strides.data())));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, miopenBNSpatial));
#else
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
#endif
const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
const auto *saved_mean_data =
saved_mean->template data<BatchNormParamType<T>>();
const auto *saved_var_data =
saved_var->template data<BatchNormParamType<T>>();
if (d_scale && d_bias) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenBatchNormalizationBackward(
dev_ctx.cudnn_handle(), miopenBNSpatial, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_, x_tmp.template data<T>(),
data_desc_, d_y_tmp.template data<T>(), data_desc_,
d_x->template mutable_data<T>(ctx.GetPlace()), in_param_desc_,
scale_tmp.template data<BatchNormParamType<T>>(),
d_scale_tmp.template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias_tmp.template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon, saved_mean_data, saved_var_data));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationBackward(
dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL,
CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(), data_desc_,
x_tmp.template data<T>(), data_desc_, d_y_tmp.template data<T>(),
data_desc_, d_x->template mutable_data<T>(ctx.GetPlace()),
in_param_desc_, scale_tmp.template data<BatchNormParamType<T>>(),
d_scale_tmp.template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias_tmp.template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon, saved_mean_data, saved_var_data));
#endif
} else {
if (d_x) {
GradComputeDX<T, block><<<NxC, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), scale_tmp.data<BatchNormParamType<T>>(),
saved_mean_data, x->data<T>(), saved_var_data, C, H * W * D,
d_x->data<T>());
}
}
if (d_scale && d_bias) {
add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
d_scale_tmp.data<T>(), d_scale->data<T>(), N, C);
add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
d_bias_tmp.data<T>(), d_bias->data<T>(), N, C);
}
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
#endif
}
};
static __device__ __forceinline__ float real_sqrt(float x) {
return 1. / sqrtf(x);
}
@@ -793,22 +421,10 @@ namespace ops = paddle::operators;
namespace plat = paddle::platform;
#ifdef PADDLE_WITH_HIP
// MIOPEN do not support double
REGISTER_OP_CUDA_KERNEL(
instance_norm, ops::InstanceNormKernel<plat::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
instance_norm_grad,
ops::InstanceNormGradKernel<plat::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(instance_norm_grad_grad,
ops::InstanceNormDoubleGradKernel<
paddle::platform::CUDADeviceContext, float>);
#else
REGISTER_OP_CUDA_KERNEL(
instance_norm, ops::InstanceNormKernel<plat::CUDADeviceContext, float>,
ops::InstanceNormKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
instance_norm_grad,
ops::InstanceNormGradKernel<plat::CUDADeviceContext, float>,
ops::InstanceNormGradKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
instance_norm_grad_grad,
ops::InstanceNormDoubleGradKernel<paddle::platform::CUDADeviceContext,
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/instance_norm_grad_kernel.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/extensions.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T, typename Context>
void InstanceNormGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& d_y,
paddle::optional<const DenseTensor&> scale,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
float epsilon,
DenseTensor* d_x,
DenseTensor* d_scale,
DenseTensor* d_bias) {
const auto* scale_ptr = scale.get_ptr();
const auto& x_dims = x.dims();
const int N = x_dims[0];
const int C = x_dims[1];
const int NxC = N * C;
const int sample_size = x.numel() / N / C;
dev_ctx.template Alloc<T>(d_x);
auto* place = dev_ctx.eigen_device();
Eigen::DSizes<int, 2> rshape(NxC, sample_size);
Eigen::DSizes<int, 2> param_shape(N, C);
Eigen::DSizes<int, 2> shape(NxC, sample_size);
#ifndef EIGEN_HAS_INDEX_LIST
Eigen::DSizes<int, 1> rdims(0);
Eigen::DSizes<int, 1> mean_rdims(1);
Eigen::DSizes<int, 2> bcast(1, sample_size);
Eigen::DSizes<int, 2> C_shape(C, 1);
Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
#else
Eigen::IndexList<Eigen::type2index<0>> rdims;
Eigen::IndexList<Eigen::type2index<1>> mean_rdims;
Eigen::IndexList<Eigen::type2index<1>, int> bcast;
bcast.set(1, sample_size);
Eigen::IndexList<int, Eigen::type2index<1>> C_shape;
C_shape.set(0, C);
Eigen::IndexList<int, Eigen::type2index<1>> NxC_shape;
NxC_shape.set(0, NxC);
#endif
phi::funcs::SetConstant<CPUContext, T> set_constant;
DenseTensor scale_data;
if (!scale_ptr) {
scale_data.Resize({C});
dev_ctx.template Alloc<T>(&scale_data);
set_constant(dev_ctx, &scale_data, static_cast<T>(1));
}
auto scale_e =
scale_ptr
? EigenVector<T>::Flatten(*scale_ptr)
: EigenVector<T>::Flatten(const_cast<const DenseTensor&>(scale_data));
auto mean_e = EigenVector<T>::Flatten(saved_mean);
auto inv_var_e = EigenVector<T>::Flatten(saved_variance);
auto dy_e = EigenVector<T>::Flatten(d_y);
auto x_e = EigenVector<T>::Flatten(x);
auto scale_arr = scale_e.reshape(C_shape);
auto mean_arr = mean_e.reshape(NxC_shape);
auto inv_var_arr = inv_var_e.reshape(NxC_shape);
auto dy_arr = dy_e.reshape(shape);
auto x_arr = x_e.reshape(shape);
auto tmp = (x_arr - mean_arr.eval().broadcast(bcast)) *
inv_var_arr.eval().broadcast(bcast);
// math: d_bias = np.sum(d_y, axis=(n,h,w))
  // math: d_scale = np.sum((X - mean) * inv_std * dy, axis=(n, h, w))
if (d_scale && d_bias) {
dev_ctx.template Alloc<T>(d_scale);
dev_ctx.template Alloc<T>(d_bias);
set_constant(dev_ctx, d_scale, static_cast<T>(0));
set_constant(dev_ctx, d_bias, static_cast<T>(0));
auto d_scale_e = EigenVector<T>::Flatten(*d_scale);
auto d_scale_data = d_scale_e.reshape(C_shape);
auto d_bias_e = EigenVector<T>::Flatten(*d_bias);
auto d_bias_data = d_bias_e.reshape(C_shape);
d_bias_data.device(*place) =
dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims);
d_scale_data.device(*place) =
(tmp * dy_arr).sum(mean_rdims).reshape(param_shape).sum(rdims);
}
auto dy_mean =
dy_arr.mean(mean_rdims).reshape(NxC_shape).eval().broadcast(bcast);
Eigen::DSizes<int, 2> bcast_param(N, sample_size);
set_constant(dev_ctx, d_x, static_cast<T>(0));
// math: d_x = scale * inv_var * d_y - scale * inv_var * np.sum(d_y,
// axis=(h,w))
// - scale * (X - mean) * inv_var.pow(3) * np.sum(d_y * (X -
// mean),
// axis=(h,w))
auto dx_e = EigenVector<T>::Flatten(*d_x);
auto dx_arr = dx_e.reshape(shape);
dx_arr.device(*place) = scale_arr.broadcast(bcast_param) *
inv_var_arr.broadcast(bcast) *
(dy_arr - dy_mean -
tmp *
(dy_arr * tmp)
.mean(mean_rdims)
.reshape(NxC_shape)
.eval()
.broadcast(bcast));
}
} // namespace phi
PD_REGISTER_KERNEL(instance_norm_grad,
CPU,
ALL_LAYOUT,
phi::InstanceNormGradKernel,
float,
double) {}
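
For reference, the gradients the CPU kernel above evaluates, matching the in-code math comments, are (per channel c, with \hat{x} = (x - \mu_{nc})/\sigma_{nc} the normalized input and \sigma_{nc} = \sqrt{\mathrm{var}_{nc} + \epsilon}):

\[
d\beta_c = \sum_{n,h,w} dy, \qquad
d\gamma_c = \sum_{n,h,w} dy\,\hat{x}, \qquad
dx = \frac{\gamma_c}{\sigma_{nc}}\left(dy - \overline{dy} - \hat{x}\,\overline{dy\,\hat{x}}\right),
\]

where the bar denotes the mean over the H*W positions of one (n, c) instance; note that SavedVariance feeds in as the inverse standard deviation, so no extra division appears in the code.
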
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/instance_norm_kernel.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/funcs/eigen/extensions.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T, typename Context>
void InstanceNormKernel(const Context& dev_ctx,
const DenseTensor& x,
paddle::optional<const DenseTensor&> scale,
paddle::optional<const DenseTensor&> bias,
float epsilon_f,
DenseTensor* y,
DenseTensor* saved_mean,
DenseTensor* saved_variance) {
const auto& x_dims = x.dims();
T epsilon = static_cast<T>(epsilon_f);
const int N = x_dims[0];
const int C = x_dims[1];
const int NxC = N * C;
const int sample_size = x.numel() / N / C;
auto* place = dev_ctx.eigen_device();
Eigen::DSizes<int, 2> shape(NxC, sample_size);
// Once eigen on Windows is updated, the if branch can be removed.
#ifndef EIGEN_HAS_INDEX_LIST
Eigen::DSizes<int, 2> bcast(1, sample_size);
Eigen::DSizes<int, 2> C_shape(C, 1);
Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
Eigen::DSizes<int, 1> rdims(1);
#else
Eigen::IndexList<Eigen::type2index<1>, int> bcast;
bcast.set(1, sample_size);
Eigen::IndexList<int, Eigen::type2index<1>> C_shape;
C_shape.set(0, C);
Eigen::IndexList<int, Eigen::type2index<1>> NxC_shape;
NxC_shape.set(0, NxC);
Eigen::IndexList<Eigen::type2index<1>> rdims;
#endif
phi::funcs::SetConstant<CPUContext, T> set_constant;
dev_ctx.template Alloc<T>(saved_mean);
dev_ctx.template Alloc<T>(saved_variance);
set_constant(dev_ctx, saved_mean, static_cast<T>(0));
set_constant(dev_ctx, saved_variance, static_cast<T>(0));
auto saved_mean_a = EigenVector<T>::Flatten(*saved_mean);
auto saved_mean_e = saved_mean_a.reshape(NxC_shape);
auto saved_variance_a = EigenVector<T>::Flatten(*saved_variance);
auto saved_variance_e = saved_variance_a.reshape(NxC_shape);
auto x_e = EigenVector<T>::Flatten(x);
auto x_arr = x_e.reshape(shape);
saved_mean_e.device(*place) = x_arr.mean(rdims);
auto saved_variance_arr =
(x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon;
saved_variance_e.device(*place) = saved_variance_arr.sqrt().inverse();
const auto scale_ptr = scale.get_ptr();
const auto bias_ptr = bias.get_ptr();
DenseTensor scale_data;
DenseTensor bias_data;
if (!scale_ptr) {
scale_data.Resize({C});
dev_ctx.template Alloc<T>(&scale_data);
set_constant(dev_ctx, &scale_data, static_cast<T>(1));
}
if (!bias_ptr) {
bias_data.Resize({C});
dev_ctx.template Alloc<T>(&bias_data);
set_constant(dev_ctx, &bias_data, static_cast<T>(0));
}
auto scale_e =
scale_ptr
? EigenVector<T>::Flatten(*scale_ptr)
: EigenVector<T>::Flatten(const_cast<const DenseTensor&>(scale_data));
auto scale_arr = scale_e.reshape(C_shape);
auto bias_e =
bias_ptr
? EigenVector<T>::Flatten(*bias_ptr)
: EigenVector<T>::Flatten(const_cast<const DenseTensor&>(bias_data));
auto bias_arr = bias_e.reshape(C_shape);
dev_ctx.template Alloc<T>(y);
auto y_e = EigenVector<T>::Flatten(*y);
auto y_arr = y_e.reshape(shape);
// (x - mean) * inv_std * scale + bias
Eigen::DSizes<int, 2> bcast_param(N, sample_size);
y_arr.device(*place) = (x_arr - saved_mean_e.broadcast(bcast)) *
saved_variance_e.broadcast(bcast) *
scale_arr.broadcast(bcast_param) +
bias_arr.broadcast(bcast_param);
}
} // namespace phi
PD_REGISTER_KERNEL(
instance_norm, CPU, ALL_LAYOUT, phi::InstanceNormKernel, float, double) {}
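
Written out, the forward pass registered above computes, per (n, c) instance with statistics taken over the spatial positions:

\[
y = \gamma_c\,\frac{x - \mu_{nc}}{\sqrt{\sigma^2_{nc} + \epsilon}} + \beta_c, \qquad
\mu_{nc} = \frac{1}{HW}\sum_{h,w} x, \qquad
\sigma^2_{nc} = \frac{1}{HW}\sum_{h,w}(x - \mu_{nc})^2,
\]

with SavedMean holding \mu_{nc} and SavedVariance holding the inverse standard deviation 1/\sqrt{\sigma^2_{nc} + \epsilon} rather than the raw variance.
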
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/instance_norm_grad_kernel.h"
#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/gpu/instance_norm_utils.h"
namespace phi {
template <typename T, int BlockDim>
static __global__ void GradComputeDX(const T *dy,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *mean,
const T *x,
const BatchNormParamType<T> *variance,
const int C,
const int sample_size,
T *dx) {
int beg_idx = blockIdx.x * sample_size + threadIdx.x;
int end_idx = (blockIdx.x + 1) * sample_size;
int ncid = blockIdx.x;
int c = ncid % C;
BatchNormParamType<T> mean_val = mean[ncid];
BatchNormParamType<T> inv_var_val = variance[ncid];
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage dy_storage;
__shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage;
__shared__ BatchNormParamType<T> dy_sum_val;
__shared__ BatchNormParamType<T> dy_x_sub_mean_sum_val;
BatchNormParamType<T> dy_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> dy_x_sub_mean_sum =
static_cast<BatchNormParamType<T>>(0);
for (int i = beg_idx; i < end_idx; i += BlockDim) {
BatchNormParamType<T> dy_i = static_cast<BatchNormParamType<T>>(dy[i]);
dy_sum += dy_i;
dy_x_sub_mean_sum +=
dy_i * (static_cast<BatchNormParamType<T>>(x[i]) - mean_val);
}
dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
dy_x_sub_mean_sum =
BlockReduce(dy_x_sub_mean_storage).Reduce(dy_x_sub_mean_sum, cub::Sum());
if (threadIdx.x == 0) {
dy_sum_val = dy_sum;
dy_x_sub_mean_sum_val = dy_x_sub_mean_sum;
}
__syncthreads();
for (int i = beg_idx; i < end_idx; i += BlockDim) {
dx[i] =
(static_cast<BatchNormParamType<T>>(dy[i]) -
dy_sum_val / static_cast<BatchNormParamType<T>>(sample_size) -
(static_cast<BatchNormParamType<T>>(x[i]) - mean_val) *
dy_x_sub_mean_sum_val * inv_var_val * inv_var_val / sample_size) *
scale[c] * inv_var_val;
}
}
template <typename T, typename Context>
void InstanceNormGradKernel(const Context &dev_ctx,
const DenseTensor &x,
const DenseTensor &d_y,
paddle::optional<const DenseTensor &> scale,
const DenseTensor &saved_mean,
const DenseTensor &saved_variance,
float epsilon_f,
DenseTensor *d_x,
DenseTensor *d_scale,
DenseTensor *d_bias) {
double epsilon = static_cast<double>(epsilon_f);
const auto *scale_ptr = scale.get_ptr();
const auto &x_dims = x.dims();
int N, C, H, W, D;
paddle::operators::ExtractNCWHD(
x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
int NxC = N * C;
DenseTensor x_tmp, d_y_tmp;
x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D});
d_y_tmp.ShareDataWith(d_y).Resize({1, NxC, H, W, D});
dev_ctx.template Alloc<T>(d_x);
if (d_scale && d_bias) {
dev_ctx.template Alloc<T>(d_scale);
dev_ctx.template Alloc<T>(d_bias);
}
if (scale_ptr) {
PADDLE_ENFORCE_EQ(
scale_ptr->dims().size(),
1UL,
phi::errors::InvalidArgument(
"The `shape` in InstanceNormOp is invalid: "
"the size of scale's dimensions must be equal to 1. But "
"received: the size of scale's dimensions"
"is [%d]",
scale_ptr->dims().size()));
PADDLE_ENFORCE_EQ(scale_ptr->dims()[0],
C,
phi::errors::InvalidArgument(
"The `shape` in InstanceNormOp is invalid: "
"the first dimension of scale must be equal to "
"Channels([%d]). But received: "
"the first dimension of scale is [%d],"
"the dimensions of scale is [%s], ",
C,
scale_ptr->dims()[0],
scale_ptr->dims()));
}
phi::funcs::SetConstant<GPUContext, T> set_constant;
const int n = x.numel();
const int block = 512;
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
const int grid = std::min(NxC, max_blocks);
const int grid1 = (C + block - 1) / block;
DenseTensor scale_tmp;
scale_tmp.Resize({NxC});
dev_ctx.template Alloc<T>(&scale_tmp);
DenseTensor d_scale_tmp;
d_scale_tmp.Resize({NxC});
dev_ctx.template Alloc<T>(&d_scale_tmp);
DenseTensor d_bias_tmp;
d_bias_tmp.Resize({NxC});
dev_ctx.template Alloc<T>(&d_bias_tmp);
if (scale_ptr) {
repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
scale_ptr->data<T>(), scale_tmp.data<T>(), N, C);
} else {
set_constant(dev_ctx, &scale_tmp, static_cast<T>(1));
}
std::vector<int> dims;
std::vector<int> strides;
dims = {1, NxC, H, W, D};
strides = {NxC * H * W * D, H * W * D, W * D, D, 1};
if ((H * W * D) == 1) {
phi::Copy(dev_ctx, d_y, dev_ctx.GetPlace(), false, d_x);
phi::funcs::SetConstant<GPUContext, BatchNormParamType<T>> functor;
functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
return;
}
#ifdef PADDLE_WITH_HIP
miopenTensorDescriptor_t data_desc_;
miopenTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenSetTensorDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
const_cast<int *>(dims.data()),
const_cast<int *>(strides.data())));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, miopenBNSpatial));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
dims.data(),
strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
#endif
const auto *saved_mean_data =
saved_mean.template data<BatchNormParamType<T>>();
const auto *saved_var_data =
saved_variance.template data<BatchNormParamType<T>>();
if (d_scale && d_bias) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenBatchNormalizationBackward(
dev_ctx.cudnn_handle(),
miopenBNSpatial,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
x_tmp.template data<T>(),
data_desc_,
d_y_tmp.template data<T>(),
data_desc_,
d_x->template data<T>(),
in_param_desc_,
scale_tmp.template data<BatchNormParamType<T>>(),
d_scale_tmp.template data<BatchNormParamType<T>>(),
d_bias_tmp.template data<BatchNormParamType<T>>(),
epsilon,
saved_mean_data,
saved_var_data));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationBackward(
dev_ctx.cudnn_handle(),
CUDNN_BATCHNORM_SPATIAL,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
x_tmp.template data<T>(),
data_desc_,
d_y_tmp.template data<T>(),
data_desc_,
d_x->template data<T>(),
in_param_desc_,
scale_tmp.template data<BatchNormParamType<T>>(),
d_scale_tmp.template data<BatchNormParamType<T>>(),
d_bias_tmp.template data<BatchNormParamType<T>>(),
epsilon,
saved_mean_data,
saved_var_data));
#endif
} else {
if (d_x) {
GradComputeDX<T, block><<<NxC, block, 0, dev_ctx.stream()>>>(
d_y.data<T>(),
scale_tmp.data<BatchNormParamType<T>>(),
saved_mean_data,
x.data<T>(),
saved_var_data,
C,
H * W * D,
d_x->data<T>());
}
}
if (d_scale && d_bias) {
add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
d_scale_tmp.data<T>(), d_scale->data<T>(), N, C);
add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
d_bias_tmp.data<T>(), d_bias->data<T>(), N, C);
}
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
#endif
}
} // namespace phi
#ifdef PADDLE_WITH_HIP
// MIOPEN does not support double
PD_REGISTER_KERNEL(
instance_norm_grad, GPU, ALL_LAYOUT, phi::InstanceNormGradKernel, float) {}
#else
PD_REGISTER_KERNEL(instance_norm_grad,
GPU,
ALL_LAYOUT,
phi::InstanceNormGradKernel,
float,
double) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/instance_norm_kernel.h"
#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/gpu/instance_norm_utils.h"
namespace phi {
template <typename T, typename Context>
void InstanceNormKernel(const Context &dev_ctx,
const DenseTensor &x,
paddle::optional<const DenseTensor &> scale,
paddle::optional<const DenseTensor &> bias,
float epsilon_f,
DenseTensor *y,
DenseTensor *saved_mean,
DenseTensor *saved_variance) {
double epsilon = static_cast<double>(epsilon_f);
auto &x_dims = x.dims();
PADDLE_ENFORCE_GE(x_dims.size(),
2,
phi::errors::InvalidArgument(
"The `shape` in InstanceNormOp is invalid: "
"the size of X's dimensions must greater than "
"or equal to 2. But received: "
"the size of X's dimensions is [%d]",
x_dims.size()));
PADDLE_ENFORCE_LE(x_dims.size(),
5,
phi::errors::InvalidArgument(
"The `shape` in InstanceNormOp is invalid: "
"the size of X's dimensions must smaller than"
"or equal to 5. But received: "
"the size of X's dimensions is [%d]",
x_dims.size()));
int N, C, H, W, D;
paddle::operators::ExtractNCWHD(
x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
int NxC = N * C;
DenseTensor x_tmp;
x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D});
dev_ctx.template Alloc<T>(y);
#ifdef PADDLE_WITH_HIP
miopenTensorDescriptor_t data_desc_;
miopenTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
VLOG(3) << "Setting descriptors.";
std::vector<int> dims;
std::vector<int> strides;
dims = {1, NxC, H, W, D};
strides = {NxC * H * W * D, H * W * D, W * D, D, 1};
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenSetTensorDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
const_cast<int *>(dims.data()),
const_cast<int *>(strides.data())));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, miopenBNSpatial));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
dims.data(),
strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
#endif
const auto scale_ptr = scale.get_ptr();
const auto bias_ptr = bias.get_ptr();
DenseTensor scale_tmp;
scale_tmp.Resize({NxC});
dev_ctx.template Alloc<T>(&scale_tmp);
DenseTensor bias_tmp;
bias_tmp.Resize({NxC});
dev_ctx.template Alloc<T>(&bias_tmp);
const int n = x.numel();
const int block = 512;
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
const int grid = std::min((NxC + block - 1) / block, max_blocks);
phi::funcs::SetConstant<GPUContext, T> set_constant;
if (scale_ptr) {
repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
scale_ptr->data<T>(), scale_tmp.data<T>(), N, C);
} else {
set_constant(dev_ctx, &scale_tmp, static_cast<T>(1));
}
if (bias_ptr) {
repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
bias_ptr->data<T>(), bias_tmp.data<T>(), N, C);
} else {
set_constant(dev_ctx, &bias_tmp, static_cast<T>(0));
}
auto handle = dev_ctx.cudnn_handle();
phi::funcs::SetConstant<GPUContext, BatchNormParamType<T>> functor;
dev_ctx.template Alloc<BatchNormParamType<T>>(saved_mean);
dev_ctx.template Alloc<BatchNormParamType<T>>(saved_variance);
functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenBatchNormalizationForwardTraining(
handle,
miopenBNSpatial,
const_cast<void *>(
static_cast<const void *>(CudnnDataType<T>::kOne())),
const_cast<void *>(
static_cast<const void *>(CudnnDataType<T>::kZero())),
data_desc_,
static_cast<const void *>(x_tmp.template data<T>()),
data_desc_,
static_cast<void *>(y->template data<T>()),
in_param_desc_,
const_cast<void *>(static_cast<const void *>(
scale_tmp.template data<BatchNormParamType<T>>())),
const_cast<void *>(static_cast<const void *>(
bias_tmp.template data<BatchNormParamType<T>>())),
0,
nullptr,
nullptr,
epsilon,
static_cast<void *>(
saved_mean->template data<BatchNormParamType<T>>()),
static_cast<void *>(
saved_variance->template data<BatchNormParamType<T>>())));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationForwardTraining(
handle,
CUDNN_BATCHNORM_SPATIAL,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
x_tmp.template data<T>(),
data_desc_,
y->template data<T>(),
in_param_desc_,
scale_tmp.template data<BatchNormParamType<T>>(),
bias_tmp.template data<BatchNormParamType<T>>(),
0,
nullptr,
nullptr,
epsilon,
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>()));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
#endif
}
} // namespace phi
#ifdef PADDLE_WITH_HIP
// MIOPEN does not support double
PD_REGISTER_KERNEL(
instance_norm, GPU, ALL_LAYOUT, phi::InstanceNormKernel, float) {}
#else
PD_REGISTER_KERNEL(
instance_norm, GPU, ALL_LAYOUT, phi::InstanceNormKernel, float, double) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cfloat>
#include <string>
#include <vector>
#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
namespace phi {
template <typename T>
using CudnnDataType = paddle::platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
template <typename T>
static __global__ void repeat_param(const T *input,
T *output,
const int repeat_num,
const int C) {
CUDA_KERNEL_LOOP(i, repeat_num * C) {
int index = i % C;
output[i] = input[index];
}
}
template <typename T, int BlockDim, bool AVG>
static __global__ void add_param(const T *input,
T *output,
const int repeat_num,
const int C) {
typedef cub::BlockReduce<T, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ou_storage;
for (int i = blockIdx.x; i < C; i += gridDim.x) {
T ou = static_cast<T>(0);
for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) {
const int index = j * C + i;
ou += static_cast<T>(input[index]);
}
ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum());
if (threadIdx.x == 0) {
output[i] = ou;
}
__syncthreads();
if (AVG) {
output[i] /= repeat_num;
}
}
}
} // namespace phi
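
As a rough illustration of what the two helper kernels above compute, here is a hypothetical host-side reference in plain C++ (not part of the diff), assuming contiguous row-major buffers:

#include <vector>

// repeat_param: tile a length-C parameter repeat_num times, so that
// output[i] == input[i % C] for every i in [0, repeat_num * C).
std::vector<float> RepeatParamRef(const std::vector<float> &input, int repeat_num) {
  const int C = static_cast<int>(input.size());
  std::vector<float> output(repeat_num * C);
  for (int i = 0; i < repeat_num * C; ++i) output[i] = input[i % C];
  return output;
}

// add_param with AVG == false: reduce a (repeat_num x C) buffer over the
// repeat axis, producing one sum per channel.
std::vector<float> AddParamRef(const std::vector<float> &input, int repeat_num, int C) {
  std::vector<float> output(C, 0.0f);
  for (int j = 0; j < repeat_num; ++j)
    for (int i = 0; i < C; ++i) output[i] += input[j * C + i];
  return output;
}
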
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void InstanceNormGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y_grad,
paddle::optional<const DenseTensor&> scale,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
float epsilon,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void InstanceNormKernel(const Context& dev_ctx,
const DenseTensor& x,
paddle::optional<const DenseTensor&> scale,
paddle::optional<const DenseTensor&> bias,
float epsilon,
DenseTensor* y,
DenseTensor* saved_mean,
DenseTensor* saved_variance);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature InstanceNormOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("instance_norm",
{"X", "Scale", "Bias"},
{"epsilon"},
{"Y", "SavedMean", "SavedVariance"});
}
KernelSignature InstanceNormGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("instance_norm_grad",
{"X", "Y@GRAD", "Scale", "SavedMean", "SavedVariance"},
{"epsilon"},
{"X@GRAD", "Scale@GRAD", "Bias@GRAD"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(instance_norm, phi::InstanceNormOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(instance_norm_grad,
phi::InstanceNormGradOpArgumentMapping);