Unverified commit 4155e625 authored by lvmengsi, committed by GitHub

add instance norm (#19500)

* add instance norm op
Parent c7f36e7c
......@@ -133,6 +133,7 @@ paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'po
paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '52343203de40afe29607397e13aaf0d2'))
paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '55db6ae7275fb9678a6814aebab81a9c'))
paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '404741b5690228c493a2d9f59c6b1122'))
paddle.fluid.layers.instance_norm (ArgSpec(args=['input', 'epsilon', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None)), ('document', 'c124b947a6ac4d01f491275561b9c1ab'))
paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', '2460b30fb87037555208fa8ac6fc1787'))
paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '83e08f21af41ac8bac37aeab1f86fdd0'))
paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '6d3b135bb3834d58ef2cb581ead1487c'))
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <unordered_map>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/norm_utils.h"
namespace paddle {
namespace operators {
......@@ -96,26 +97,5 @@ class BatchNormGradKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext &ctx) const override;
};
inline void ExtractNCWHD(const framework::DDim &dims,
const DataLayout &data_layout, int *N, int *C, int *H,
int *W, int *D) {
*N = dims[0];
if (dims.size() == 2) {
*C = dims[1];
*H = 1;
*W = 1;
*D = 1;
} else {
*C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
*H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
*W = dims.size() > 3
? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2])
: 1;
*D = dims.size() > 4
? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3])
: 1;
}
}
} // namespace operators
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/instance_norm_op.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
void InstanceNormOp::InferShape(framework::InferShapeContext *ctx) const {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
"Input(X) of Instance Norm Op should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("Scale"), true,
"Input(Scale) of Instance Norm Op should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("Bias"), true,
"Input(Bias) of Instance Norm Op should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasOutput("Y"), true,
"Output(Y) of Instance Norm Op should not be null.");
PADDLE_ENFORCE_EQ(
ctx->HasOutput("SavedMean"), true,
"Output(SavedMean) of Instance Norm Op should not be null.");
PADDLE_ENFORCE_EQ(
ctx->HasOutput("SavedVariance"), true,
"Output(SavedVariance) of Instance Norm Op should not be null.");
const auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_GE(x_dims.size(), 2,
"the dimension of input X must be greater than or equal to 2");
PADDLE_ENFORCE_LE(x_dims.size(), 5,
"the dimension of input X must be less than or equal to 5");
auto N = x_dims[0];
auto C = x_dims[1];
auto NxC = N * C;
auto scale_dim = ctx->GetInputDim("Scale");
auto bias_dim = ctx->GetInputDim("Bias");
PADDLE_ENFORCE_EQ(scale_dim.size(), 1UL);
PADDLE_ENFORCE_EQ(bias_dim.size(), 1UL);
bool check = !((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 ||
framework::product(bias_dim) <= 0));
if (check) {
PADDLE_ENFORCE_EQ(scale_dim[0], C);
PADDLE_ENFORCE_EQ(bias_dim[0], C);
}
ctx->SetOutputDim("Y", x_dims);
ctx->SetOutputDim("SavedMean", {NxC});
ctx->SetOutputDim("SavedVariance", {NxC});
ctx->ShareLoD("X", "Y");
}
framework::OpKernelType InstanceNormOp::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
auto input_data_type = ctx.Input<Tensor>("X")->type();
// By default, the type of the scale, bias, mean,
// and var tensors should be float (for float or float16 input tensors)
// or double (for double input tensors).
auto in_param_type = framework::proto::VarType::FP32;
if (input_data_type == framework::proto::VarType::FP64) {
in_param_type = framework::proto::VarType::FP64;
}
PADDLE_ENFORCE_EQ(in_param_type, ctx.Input<Tensor>("Scale")->type(),
"Scale input should be of float type");
PADDLE_ENFORCE_EQ(in_param_type, ctx.Input<Tensor>("Bias")->type(),
"Bias input should be of float type");
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
void InstanceNormOpMaker::Make() {
AddAttr<float>("epsilon", "")
.SetDefault(1e-5)
.AddCustomChecker([](const float &epsilon) {
PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true,
"'epsilon' should be between 0.0 and 0.001.");
});
AddInput("X", "The input tensor");
AddInput("Scale",
"Scale is a 1-dimensional tensor of size C "
"that is applied to the output");
AddInput("Bias",
"Bias is a 1-dimensional tensor of size C "
"that is applied to the output");
AddOutput("Y", "result after normalization");
AddOutput("SavedMean",
"Mean of the current mini batch, "
"will apply to output when training")
.AsIntermediate();
AddOutput("SavedVariance",
"Variance of the current mini batch, "
"will apply to output when training")
.AsIntermediate();
AddComment(R"DOC(
Instance Normalization.
Instance Norm has been implemented as discussed in the paper:
https://arxiv.org/pdf/1607.08022.pdf
Can be used as a normalizer function for conv2d and fully_connected operations.
The required data format for this layer is as follows:
NCHW `[batch, in_channels, in_height, in_width]`
)DOC");
}
template <typename T>
class InstanceNormKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims();
const int N = x_dims[0];
const int C = x_dims[1];
const int NxC = N * C;
const int sample_size = x->numel() / N / C;
auto *y = ctx.Output<Tensor>("Y");
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
auto *place = dev_ctx.eigen_device();
Eigen::DSizes<int, 2> bcast(1, sample_size);
Eigen::DSizes<int, 2> C_shape(C, 1);
Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
Eigen::DSizes<int, 2> shape(NxC, sample_size);
math::SetConstant<platform::CPUDeviceContext, T> set_constant;
saved_mean->mutable_data<T>(ctx.GetPlace());
saved_variance->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, saved_mean, static_cast<T>(0));
set_constant(dev_ctx, saved_variance, static_cast<T>(0));
auto saved_mean_a = framework::EigenVector<T>::Flatten(*saved_mean);
auto saved_mean_e = saved_mean_a.reshape(NxC_shape);
auto saved_variance_a = framework::EigenVector<T>::Flatten(*saved_variance);
auto saved_variance_e = saved_variance_a.reshape(NxC_shape);
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto x_arr = x_e.reshape(shape);
Eigen::DSizes<int, 1> rdims(1);
saved_mean_e.device(*place) = x_arr.mean(rdims);
auto saved_variance_arr =
(x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon;
saved_variance_e.device(*place) = saved_variance_arr.sqrt().inverse();
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
auto scale_e = framework::EigenVector<T>::Flatten(*scale);
auto scale_arr = scale_e.reshape(C_shape);
auto bias_e = framework::EigenVector<T>::Flatten(*bias);
auto bias_arr = bias_e.reshape(C_shape);
y->mutable_data<T>(ctx.GetPlace());
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto y_arr = y_e.reshape(shape);
// (x - mean) * inv_std * scale + bias
Eigen::DSizes<int, 2> bcast_param(N, sample_size);
y_arr.device(*place) = (x_arr - saved_mean_e.broadcast(bcast)) *
saved_variance_e.broadcast(bcast) *
scale_arr.broadcast(bcast_param) +
bias_arr.broadcast(bcast_param);
}
};
void InstanceNormGradOp::InferShape(framework::InferShapeContext *ctx) const {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) should not be null");
PADDLE_ENFORCE_EQ(ctx->HasInput("Scale"), true,
"Input(scale) should not be null");
PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Y")), true,
"Input(Y@GRAD) should not be null");
PADDLE_ENFORCE_EQ(ctx->HasInput("SavedMean"), true,
"Input(SavedMean) should not be null");
PADDLE_ENFORCE_EQ(ctx->HasInput("SavedVariance"), true,
"Input(SavedVariance) should not be null");
// check output
PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true,
"Output(x@GRAD) should not be null");
if (ctx->HasOutput(framework::GradVarName("Scale"))) {
PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Bias")), true,
"Output(Scale@GRAD) and Output(Bias@GRAD) should not be "
"null at the same time");
}
const auto x_dims = ctx->GetInputDim("X");
const int C = x_dims[1];
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
if (ctx->HasOutput(framework::GradVarName("Scale"))) {
ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
}
}
framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
const auto *var = ctx.InputVar(framework::GradVarName("Y"));
if (var == nullptr) {
PADDLE_THROW("cannot find Y@GRAD");
}
const Tensor *t = nullptr;
if (var->IsType<Tensor>()) {
t = &var->Get<Tensor>();
} else if (var->IsType<LoDTensor>()) {
t = &var->Get<LoDTensor>();
}
if (t == nullptr) {
PADDLE_THROW("cannot find Y@GRAD");
}
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.GetPlace());
}
template <typename T>
class InstanceNormGradKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<Tensor>("X");
const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance");
const auto &x_dims = x->dims();
const int N = x_dims[0];
const int C = x_dims[1];
const int NxC = N * C;
const int sample_size = x->numel() / N / C;
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
d_x->mutable_data<T>(ctx.GetPlace());
auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
auto *place = dev_ctx.eigen_device();
Eigen::DSizes<int, 1> rdims(0);
Eigen::DSizes<int, 1> mean_rdims(1);
Eigen::DSizes<int, 2> rshape(NxC, sample_size);
Eigen::DSizes<int, 2> bcast(1, sample_size);
Eigen::DSizes<int, 2> C_shape(C, 1);
Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
Eigen::DSizes<int, 2> param_shape(N, C);
Eigen::DSizes<int, 2> shape(NxC, sample_size);
auto scale_e = framework::EigenVector<T>::Flatten(*scale);
auto mean_e = framework::EigenVector<T>::Flatten(*saved_mean);
auto inv_var_e = framework::EigenVector<T>::Flatten(*saved_inv_variance);
auto dy_e = framework::EigenVector<T>::Flatten(*d_y);
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto scale_arr = scale_e.reshape(C_shape);
auto mean_arr = mean_e.reshape(NxC_shape);
auto inv_var_arr = inv_var_e.reshape(NxC_shape);
auto dy_arr = dy_e.reshape(shape);
auto x_arr = x_e.reshape(shape);
auto tmp =
(x_arr - mean_arr.broadcast(bcast)) * inv_var_arr.broadcast(bcast);
math::SetConstant<platform::CPUDeviceContext, T> set_constant;
// math: d_bias = np.sum(d_y, axis=(n, h, w))
// math: d_scale = np.sum((X - mean) * inv_std * d_y, axis=(n, h, w))
if (d_scale && d_bias) {
d_scale->mutable_data<T>(ctx.GetPlace());
d_bias->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, d_scale, static_cast<T>(0));
set_constant(dev_ctx, d_bias, static_cast<T>(0));
auto d_scale_e = framework::EigenVector<T>::Flatten(*d_scale);
auto d_bias_e = framework::EigenVector<T>::Flatten(*d_bias);
auto d_scale_data = d_scale_e.reshape(C_shape);
auto d_bias_data = d_bias_e.reshape(C_shape);
d_bias_data.device(*place) =
dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims);
d_scale_data.device(*place) =
(tmp * dy_arr).sum(mean_rdims).reshape(param_shape).sum(rdims);
}
auto dy_mean = dy_arr.mean(mean_rdims).reshape(NxC_shape).broadcast(bcast);
Eigen::DSizes<int, 2> bcast_param(N, sample_size);
set_constant(dev_ctx, d_x, static_cast<T>(0));
// math: d_x = scale * inv_var * (d_y - np.mean(d_y, axis=(h,w)) -
//             (X - mean) * inv_var.pow(2) *
//             np.mean(d_y * (X - mean), axis=(h,w)))
auto dx_e = framework::EigenVector<T>::Flatten(*d_x);
auto dx_arr = dx_e.reshape(shape);
dx_arr.device(*place) = scale_arr.broadcast(bcast_param) *
inv_var_arr.broadcast(bcast) *
(dy_arr - dy_mean -
tmp *
(dy_arr * tmp)
.mean(mean_rdims)
.reshape(NxC_shape)
.broadcast(bcast));
}
};
std::unique_ptr<framework::OpDesc> InstanceNormGradMaker::Apply() const {
auto *op = new framework::OpDesc();
op->SetType("instance_norm_grad");
op->SetInput("X", Input("X"));
op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
op->SetInput("Scale", Input("Scale"));
op->SetInput("Bias", Input("Bias"));
op->SetInput("SavedMean", Output("SavedMean"));
op->SetInput("SavedVariance", Output("SavedVariance"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
return std::unique_ptr<framework::OpDesc>(op);
}
void InstanceNormDoubleGradOp::InferShape(
framework::InferShapeContext *ctx) const {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) should not be null");
PADDLE_ENFORCE_EQ(ctx->HasInput("Scale"), true,
"Input(Scale) should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("SavedMean"), true,
"Input(SavedMean) should not be null");
PADDLE_ENFORCE_EQ(ctx->HasInput("SavedVariance"), true,
"Input(SavedVariance) should not be null");
PADDLE_ENFORCE_EQ(ctx->HasInput("DDX"), true,
"Input(DDX) should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("DY"), true,
"Input(Y@GRAD) should not be null");
// check output
PADDLE_ENFORCE_EQ(ctx->HasOutput("DX"), true,
"Output(DX) should not be null");
const auto x_dims = ctx->GetInputDim("X");
const int C = x_dims[1];
if (ctx->HasOutput("DX")) {
ctx->SetOutputDim("DX", x_dims);
}
if (ctx->HasOutput("DScale")) {
ctx->SetOutputDim("DScale", {C});
}
if (ctx->HasOutput("DDY")) {
ctx->ShareDim("X", "DDY");
}
}
framework::OpKernelType InstanceNormDoubleGradOp::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
const auto *var = ctx.InputVar("DY");
if (var == nullptr) {
PADDLE_THROW("cannot find Y@GRAD");
}
const Tensor *t = nullptr;
if (var->IsType<Tensor>()) {
t = &var->Get<Tensor>();
} else if (var->IsType<LoDTensor>()) {
t = &var->Get<LoDTensor>();
}
if (t == nullptr) {
PADDLE_THROW("cannot find Y@GRAD");
}
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.GetPlace());
}
std::unique_ptr<framework::OpDesc> InstanceNormDoubleGradMaker::Apply() const {
auto *op = new framework::OpDesc();
op->SetType("instance_norm_grad_grad");
op->SetInput("X", Input("X"));
op->SetInput("Scale", Input("Scale"));
op->SetInput("SavedMean", Input("SavedMean"));
op->SetInput("SavedVariance", Input("SavedVariance"));
op->SetInput("DDX", OutputGrad(framework::GradVarName("X")));
op->SetInput("DDScale", OutputGrad(framework::GradVarName("Scale")));
op->SetInput("DDBias", OutputGrad(framework::GradVarName("Bias")));
op->SetInput("DY", Input(framework::GradVarName("Y")));
op->SetAttrMap(Attrs());
op->SetOutput("DX", InputGrad("X"));
op->SetOutput("DScale", InputGrad("Scale"));
op->SetOutput("DDY", InputGrad(framework::GradVarName("Y")));
return std::unique_ptr<framework::OpDesc>(op);
}
template <typename T>
class InstanceNormDoubleGradKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *X = ctx.Input<Tensor>("X");
const auto *Scale = ctx.Input<Tensor>("Scale");
const auto *dY = ctx.Input<Tensor>("DY");
const auto *Saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *Saved_variance = ctx.Input<Tensor>("SavedVariance");
const auto *ddX = ctx.Input<Tensor>("DDX");
const auto *ddScale = ctx.Input<Tensor>("DDScale");
const auto *ddBias = ctx.Input<Tensor>("DDBias");
auto *dX = ctx.Output<Tensor>("DX");
auto *dScale = ctx.Output<Tensor>("DScale");
auto *ddY = ctx.Output<Tensor>("DDY");
const auto &x_dims = X->dims();
int N, C, H, W, D;
ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
const int sample_size = X->numel() / N / C;
const int NxC = N * C;
const T *mean_data = Saved_mean->data<T>();
const T *inv_var_data = Saved_variance->data<T>();
Tensor mean_tensor;
Tensor inv_var_tensor;
ConstEigenArrayMap<T> x_arr(X->data<T>(), sample_size, NxC);
ConstEigenVectorArrayMap<T> mean_arr(mean_data, NxC);
ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, NxC);
Tensor mean_tile;
mean_tile.Resize({sample_size, NxC});
mean_tile.mutable_data<T>(ctx.GetPlace());
EigenArrayMap<T> mean_tile_data(mean_tile.mutable_data<T>(ctx.GetPlace()),
sample_size, NxC);
Tensor inv_var_tile;
inv_var_tile.Resize({sample_size, NxC});
inv_var_tile.mutable_data<T>(ctx.GetPlace());
EigenArrayMap<T> inv_var_tile_data(
inv_var_tile.mutable_data<T>(ctx.GetPlace()), sample_size, NxC);
mean_tile_data = mean_arr.transpose().replicate(sample_size, 1);
inv_var_tile_data = inv_var_arr.transpose().replicate(sample_size, 1);
ConstEigenVectorArrayMap<T> scale_arr(Scale->data<T>(), C);
Tensor scale_tile;
scale_tile.Resize({sample_size, NxC});
scale_tile.mutable_data<T>(ctx.GetPlace());
EigenArrayMap<T> scale_tile_data(scale_tile.mutable_data<T>(ctx.GetPlace()),
sample_size, NxC);
scale_tile_data = scale_arr.transpose().replicate(sample_size, N);
ConstEigenArrayMap<T> dy_arr(dY->data<T>(), sample_size, NxC);
ConstEigenArrayMap<T> ddx_arr(ddX->data<T>(), sample_size, NxC);
// math: dx = scale * ((x - mean) * inv_var.pow(3) / HxW *
//               (np.mean(ddx, axis=(h,w)) * np.sum(dy, axis=(h,w)) -
//                np.sum(dy * ddx, axis=(h,w)) +
//                3 * np.mean(dy * (x - mean), axis=(h,w)) * inv_var.pow(2) *
//                np.sum(ddx * (x - mean), axis=(h,w))) +
//              inv_var.pow(3) / HxW * np.sum(ddx * (x - mean), axis=(h,w)) *
//               (np.mean(dy, axis=(h,w)) - dy) +
//              inv_var.pow(3) / HxW * np.sum(dy * (x - mean), axis=(h,w)) *
//               (np.mean(ddx, axis=(h,w)) - ddx)) +
//            ddscale * (dy * inv_var - inv_var * np.mean(dy, axis=(h,w)) -
//               inv_var.pow(3) * (x - mean) *
//               np.mean(dy * (x - mean), axis=(h,w)))
auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
math::SetConstant<platform::CPUDeviceContext, T> set_constant;
Tensor x_sub_mean_mul_invstd;
x_sub_mean_mul_invstd.Resize({sample_size, NxC});
x_sub_mean_mul_invstd.mutable_data<T>(ctx.GetPlace());
EigenArrayMap<T> x_sub_mean_mul_invstd_arr(
x_sub_mean_mul_invstd.mutable_data<T>(ctx.GetPlace()), sample_size,
NxC);
x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data;
if (dX) {
dX->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, dX, static_cast<T>(0));
EigenArrayMap<T> dx_arr(dX->mutable_data<T>(ctx.GetPlace()), sample_size,
NxC);
if (ddX) {
dx_arr +=
x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data /
sample_size *
(ddx_arr.colwise().sum() * dy_arr.colwise().sum() / sample_size -
(dy_arr * ddx_arr).colwise().sum() +
3. * (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() *
(ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() /
sample_size);
dx_arr += (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() /
sample_size * inv_var_tile_data * inv_var_tile_data *
(dy_arr.colwise().sum() / sample_size - dy_arr);
dx_arr += (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() /
sample_size * inv_var_tile_data * inv_var_tile_data *
(ddx_arr.colwise().sum() / sample_size - ddx_arr);
dx_arr = scale_tile_data * dx_arr.eval();
}
if (ddScale) {
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
Tensor ddscale_tile;
ddscale_tile.Resize({sample_size, NxC});
ddscale_tile.mutable_data<T>(ctx.GetPlace());
EigenArrayMap<T> ddscale_tile_data(
ddscale_tile.mutable_data<T>(ctx.GetPlace()), sample_size, NxC);
ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N);
dx_arr += (dy_arr * inv_var_tile_data -
dy_arr.colwise().sum() / sample_size * inv_var_tile_data -
x_sub_mean_mul_invstd_arr * inv_var_tile_data *
(dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() /
sample_size) *
ddscale_tile_data;
}
}
if (dScale) {
// math: dscale = np.sum(ddx * inv_var * (dy - np.mean(dy, axis=(h,w)) -
//                (x - mean) * inv_var.pow(2) *
//                np.mean(dy * (x - mean), axis=(h,w))), axis=(n, h, w))
dScale->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, dScale, static_cast<T>(0));
EigenVectorArrayMap<T> dscale_arr(dScale->mutable_data<T>(ctx.GetPlace()),
C);
if (ddX) {
Tensor first_grad;
first_grad.Resize({sample_size, NxC});
first_grad.mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, &first_grad, static_cast<T>(0));
EigenArrayMap<T> first_grad_arr(
first_grad.mutable_data<T>(ctx.GetPlace()), sample_size, NxC);
first_grad_arr +=
inv_var_tile_data *
(dy_arr - dy_arr.colwise().sum() / sample_size -
x_sub_mean_mul_invstd_arr *
(dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() /
sample_size);
first_grad_arr = first_grad_arr.eval() * ddx_arr;
for (int nc = 0; nc < NxC; ++nc) {
int c = nc % C;
dscale_arr(c) += first_grad_arr.colwise().sum()(nc);
}
}
}
if (ddY) {
// math: ddy = (x - mean) * inv_var * ddscale + ddbias +
//       scale * inv_var * (ddx - np.mean(ddx, axis=(h,w)) -
//       (x - mean) * inv_var.pow(2) * np.mean(ddx * (x - mean), axis=(h,w)))
ddY->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, ddY, static_cast<T>(0));
EigenArrayMap<T> ddy_arr(ddY->mutable_data<T>(ctx.GetPlace()),
sample_size, NxC);
if (ddX) {
ddy_arr += scale_tile_data * inv_var_tile_data *
(ddx_arr - ddx_arr.colwise().sum() / sample_size -
x_sub_mean_mul_invstd_arr *
(ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() /
sample_size);
}
if (ddScale && ddBias) {
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
Tensor ddscale_tile;
ddscale_tile.Resize({sample_size, NxC});
ddscale_tile.mutable_data<T>(ctx.GetPlace());
EigenArrayMap<T> ddscale_tile_data(
ddscale_tile.mutable_data<T>(ctx.GetPlace()), sample_size, NxC);
ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N);
ConstEigenVectorArrayMap<T> ddbias_arr(ddBias->data<T>(), C);
Tensor ddbias_tile;
ddbias_tile.Resize({sample_size, NxC});
ddbias_tile.mutable_data<T>(ctx.GetPlace());
EigenArrayMap<T> ddbias_tile_data(
ddbias_tile.mutable_data<T>(ctx.GetPlace()), sample_size, NxC);
ddbias_tile_data = ddbias_arr.transpose().replicate(sample_size, N);
ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data;
ddy_arr += ddbias_tile_data;
}
}
}
};
DECLARE_INPLACE_OP_INFERER(InstanceNormDoubleGradOpInplaceInference,
{"DY", "DDY"});
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(instance_norm, ops::InstanceNormOp, ops::InstanceNormOpMaker,
ops::InstanceNormOpInferVarType, ops::InstanceNormGradMaker);
REGISTER_OPERATOR(instance_norm_grad, ops::InstanceNormGradOp,
ops::InstanceNormDoubleGradMaker);
REGISTER_OPERATOR(instance_norm_grad_grad, ops::InstanceNormDoubleGradOp,
ops::InstanceNormDoubleGradOpInplaceInference);
REGISTER_OP_CPU_KERNEL(
instance_norm,
ops::InstanceNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::InstanceNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
instance_norm_grad,
ops::InstanceNormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::InstanceNormGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
instance_norm_grad_grad,
ops::InstanceNormDoubleGradKernel<paddle::platform::CPUDeviceContext,
float>,
ops::InstanceNormDoubleGradKernel<paddle::platform::CPUDeviceContext,
double>);
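For reference, the CPU kernel above treats each (n, c) pair as one row of H*W spatial elements, computes its mean and inverse standard deviation, and then applies the per-channel scale and bias. A minimal NumPy sketch of the same forward computation (illustrative only; the name instance_norm_forward is not part of this patch):

import numpy as np

def instance_norm_forward(x, scale, bias, epsilon=1e-5):
    # x: (N, C, H, W); scale, bias: (C,)
    n, c, h, w = x.shape
    x2 = x.reshape(n * c, h * w)                   # one row per (n, c) pair
    mean = x2.mean(axis=1, keepdims=True)          # SavedMean, shape (N*C, 1)
    inv_std = 1.0 / np.sqrt(x2.var(axis=1, keepdims=True) + epsilon)  # SavedVariance
    y = ((x2 - mean) * inv_std).reshape(n, c, h, w)
    y = y * scale.reshape(1, c, 1, 1) + bias.reshape(1, c, 1, 1)
    return y, mean.ravel(), inv_std.ravel()

x = np.random.rand(2, 3, 4, 5).astype(np.float32)
y, saved_mean, saved_inv_std = instance_norm_forward(
    x, np.ones(3, np.float32), np.zeros(3, np.float32))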
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <cfloat>
#include <string>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/instance_norm_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DataLayout = framework::DataLayout;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
template <typename T>
static __global__ void repeat_param(const T *input, T *output,
const int repeat_num, const int C) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < repeat_num * C;
i += blockDim.x * gridDim.x) {
int index = i % C;
output[i] = input[index];
}
}
template <typename T, int BlockDim, bool AVG>
static __global__ void add_param(const T *input, T *output,
const int repeat_num, const int C) {
typedef cub::BlockReduce<T, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ou_storage;
for (int i = blockIdx.x; i < C; i += gridDim.x) {
T ou = static_cast<T>(0);
for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) {
const int index = j * C + i;
ou += static_cast<T>(input[index]);
}
ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum());
if (threadIdx.x == 0) {
output[i] = ou;
}
__syncthreads();
if (AVG) {
output[i] /= repeat_num;
}
}
}
template <typename T>
class InstanceNormKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
"It must be CUDAPlace.");
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
auto *x = ctx.Input<Tensor>("X");
auto &x_dims = x->dims();
PADDLE_ENFORCE_GE(
x_dims.size(), 2,
"the dimension of input X must be greater than or equal to 2");
PADDLE_ENFORCE_LE(
x_dims.size(), 5,
"the dimension of input X must be less than or equal to 5");
int N, C, H, W, D;
ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
int NxC = N * C;
Tensor x_tmp;
x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D});
auto *y = ctx.Output<Tensor>("Y");
y->mutable_data<T>(ctx.GetPlace());
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t in_param_desc_;
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
VLOG(3) << "Setting descriptors.";
std::vector<int> dims;
std::vector<int> strides;
dims = {1, NxC, H, W, D};
strides = {NxC * H * W * D, H * W * D, W * D, D, 1};
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
Tensor scale_tmp =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
scale_tmp.mutable_data<T>(ctx.GetPlace());
Tensor bias_tmp =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
bias_tmp.mutable_data<T>(ctx.GetPlace());
const int n = x->numel();
const int block = 512;
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
const int grid = std::min((NxC + block - 1) / block, max_blocks);
repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
scale->data<T>(), scale_tmp.data<T>(), N, C);
repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
bias->data<T>(), bias_tmp.data<T>(), N, C);
auto handle = dev_ctx.cudnn_handle();
math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
functor;
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining(
handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_, x_tmp.template data<T>(),
data_desc_, y->template mutable_data<T>(ctx.GetPlace()), in_param_desc_,
scale_tmp.template data<BatchNormParamType<T>>(),
bias_tmp.template data<BatchNormParamType<T>>(), 0, nullptr, nullptr,
epsilon, saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace())));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
}
};
template <typename T>
class InstanceNormGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
"It must use CUDAPlace.");
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *x = ctx.Input<Tensor>("X");
const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto &x_dims = x->dims();
int N, C, H, W, D;
ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
int NxC = N * C;
Tensor x_tmp, d_y_tmp;
x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D});
d_y_tmp.ShareDataWith(*d_y).Resize({1, NxC, H, W, D});
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
d_x->mutable_data<T>(ctx.GetPlace());
if (d_scale && d_bias) {
d_scale->mutable_data<T>(ctx.GetPlace());
d_bias->mutable_data<T>(ctx.GetPlace());
}
PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL);
PADDLE_ENFORCE_EQ(scale->dims()[0], C);
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
const int n = x->numel();
const int block = 512;
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
const int grid = std::min(NxC, max_blocks);
const int grid1 = (C + block - 1) / block;
Tensor scale_tmp =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
scale_tmp.mutable_data<T>(ctx.GetPlace());
Tensor d_scale_tmp =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
Tensor d_bias_tmp =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
scale->data<T>(), scale_tmp.data<T>(), N, C);
std::vector<int> dims;
std::vector<int> strides;
dims = {1, NxC, H, W, D};
strides = {NxC * H * W * D, H * W * D, W * D, D, 1};
if ((H * W * D) == 1) {
framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
functor;
if (d_scale && d_bias) {
functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
}
return;
}
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t in_param_desc_;
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
const void *saved_mean_data =
saved_mean->template data<BatchNormParamType<T>>();
const void *saved_var_data =
saved_var->template data<BatchNormParamType<T>>();
CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL,
CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(), data_desc_,
x_tmp.template data<T>(), data_desc_, d_y_tmp.template data<T>(),
data_desc_, d_x->template mutable_data<T>(ctx.GetPlace()),
in_param_desc_, scale_tmp.template data<BatchNormParamType<T>>(),
d_scale_tmp.template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias_tmp.template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
epsilon, saved_mean_data, saved_var_data));
if (d_scale && d_bias) {
add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
d_scale_tmp.data<T>(), d_scale->data<T>(), N, C);
add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
d_bias_tmp.data<T>(), d_bias->data<T>(), N, C);
}
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
}
};
static __device__ __forceinline__ float real_sqrt(float x) {
return 1. / sqrtf(x);
}
static __device__ __forceinline__ double real_sqrt(double x) {
return 1. / sqrt(x);
}
template <typename T, int BlockDim>
__global__ void DoubleGradComputeDX(const T *x, const T *mean,
const T *variance, const T *ddx,
const T *dy, const T *scale,
const T *ddscale, int C, int sample_size,
const double epsilon, T *dx) {
int beg_idx = blockIdx.x * sample_size + threadIdx.x;
int end_idx = (blockIdx.x + 1) * sample_size;
int ncid = blockIdx.x;
int c = ncid % C;
T mean_val = mean[ncid];
T var_val = variance[ncid];
typedef cub::BlockReduce<T, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage dy_storage;
__shared__ typename BlockReduce::TempStorage ddx_storage;
__shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage;
__shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage;
__shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage;
__shared__ T dy_sum_val;
__shared__ T ddx_sum_val;
__shared__ T dy_mul_ddx_sum_val;
__shared__ T dy_mul_x_sub_mean_sum_val;
__shared__ T ddx_mul_x_sub_mean_sum_val;
T dy_sum = 0;
T ddx_sum = 0;
T dy_mul_ddx_sum = 0;
T dy_mul_x_sub_mean_sum = 0;
T ddx_mul_x_sub_mean_sum = 0;
for (int i = beg_idx; i < end_idx; i += BlockDim) {
T ddx_i = ddx[i];
T dy_i = dy[i];
T tmp = x[i] - mean_val;
dy_sum += dy_i;
ddx_sum += ddx_i;
dy_mul_ddx_sum += (ddx_i * dy_i);
dy_mul_x_sub_mean_sum += (dy_i * tmp);
ddx_mul_x_sub_mean_sum += (ddx_i * tmp);
}
dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum());
dy_mul_ddx_sum =
BlockReduce(dy_mul_ddx_storage).Reduce(dy_mul_ddx_sum, cub::Sum());
dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage)
.Reduce(dy_mul_x_sub_mean_sum, cub::Sum());
ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage)
.Reduce(ddx_mul_x_sub_mean_sum, cub::Sum());
if (threadIdx.x == 0) {
dy_sum_val = dy_sum;
ddx_sum_val = ddx_sum;
dy_mul_ddx_sum_val = dy_mul_ddx_sum;
dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum;
ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum;
}
__syncthreads();
if (ddx != nullptr) {
for (int i = beg_idx; i < end_idx; i += BlockDim) {
dx[i] +=
((x[i] - mean_val) * var_val * var_val * var_val / sample_size *
(ddx_sum_val * dy_sum_val / sample_size - dy_mul_ddx_sum_val +
3. * dy_mul_x_sub_mean_sum_val * var_val *
ddx_mul_x_sub_mean_sum_val * var_val / sample_size) +
ddx_mul_x_sub_mean_sum_val * var_val / sample_size * var_val *
var_val * (dy_sum_val / sample_size - dy[i]) +
dy_mul_x_sub_mean_sum_val * var_val / sample_size * var_val *
var_val * (ddx_sum_val / sample_size - ddx[i])) *
scale[c];
}
}
__syncthreads();
if (ddscale != nullptr) {
for (int i = beg_idx; i < end_idx; i += BlockDim) {
dx[i] += (dy[i] * var_val - dy_sum_val / sample_size * var_val -
(x[i] - mean_val) * var_val * dy_mul_x_sub_mean_sum_val *
var_val / sample_size) *
ddscale[c];
}
}
}
template <typename T, int BlockDim>
__global__ void DoubleGradComputeDDY(const T *x, const T *mean,
const T *variance, const T *ddscale,
const T *ddbias, const T *ddx,
const T *scale, int C, int sample_size,
const double epsilon, T *ddy) {
int beg_idx = blockIdx.x * sample_size + threadIdx.x;
int end_idx = (blockIdx.x + 1) * sample_size;
int ncid = blockIdx.x;
int c = ncid % C;
T mean_val = mean[ncid];
T var_val = variance[ncid];
typedef cub::BlockReduce<T, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ddx_storage;
__shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage;
__shared__ T ddx_sum_val;
__shared__ T ddx_mul_x_sub_mean_sum_val;
T ddx_sum = 0;
T ddx_mul_x_sub_mean_sum = 0;
for (int i = beg_idx; i < end_idx; i += BlockDim) {
T ddx_i = ddx[i];
ddx_sum += ddx_i;
ddx_mul_x_sub_mean_sum += (ddx_i * (x[i] - mean_val));
}
ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum());
ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage)
.Reduce(ddx_mul_x_sub_mean_sum, cub::Sum());
if (threadIdx.x == 0) {
ddx_sum_val = ddx_sum;
ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum;
}
__syncthreads();
if (ddx != nullptr) {
for (int i = beg_idx; i < end_idx; i += BlockDim) {
ddy[i] += scale[c] * var_val *
(ddx[i] - ddx_sum_val / sample_size -
(x[i] - mean_val) * var_val * ddx_mul_x_sub_mean_sum_val *
var_val / sample_size);
}
}
__syncthreads();
if (ddscale != nullptr) {
for (int i = beg_idx; i < end_idx; i += BlockDim) {
ddy[i] += (x[i] - mean_val) * var_val * ddscale[c];
}
}
__syncthreads();
if (ddbias != nullptr) {
for (int i = beg_idx; i < end_idx; i += BlockDim) {
ddy[i] += ddbias[c];
}
}
}
template <typename T, int BlockDim>
__global__ void DoubleGradComputeDScale(const T *x, const T *mean,
const T *variance, const T *ddx,
const T *dy, int C, int sample_size,
const double epsilon, T *dscale) {
int beg_idx = blockIdx.x * sample_size + threadIdx.x;
int end_idx = (blockIdx.x + 1) * sample_size;
int ncid = blockIdx.x;
int c = ncid % C;
T mean_val = mean[ncid];
T var_val = variance[ncid];
typedef cub::BlockReduce<T, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage dy_storage;
__shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage;
__shared__ typename BlockReduce::TempStorage dscale_tmp_storage;
__shared__ T dy_sum_val;
__shared__ T dy_mul_x_sub_mean_sum_val;
T dy_sum = 0;
T dy_mul_x_sub_mean_sum = 0;
for (int i = beg_idx; i < end_idx; i += BlockDim) {
T dy_i = dy[i];
dy_sum += dy_i;
dy_mul_x_sub_mean_sum += (dy_i * (x[i] - mean_val));
}
dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage)
.Reduce(dy_mul_x_sub_mean_sum, cub::Sum());
if (threadIdx.x == 0) {
dy_sum_val = dy_sum;
dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum;
}
__syncthreads();
if (ddx != nullptr) {
T dscale_tmp = 0;
for (int i = beg_idx; i < end_idx; i += BlockDim) {
dscale_tmp +=
ddx[i] * var_val * (dy[i] - dy_sum_val / sample_size -
dy_mul_x_sub_mean_sum_val * (x[i] - mean_val) *
var_val * var_val / sample_size);
}
dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum());
if (threadIdx.x == 0) {
dscale[ncid] += dscale_tmp;
}
__syncthreads();
}
}
template <typename T>
class InstanceNormDoubleGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *X = ctx.Input<Tensor>("X");
const auto *Scale = ctx.Input<Tensor>("Scale");
const auto *dY = ctx.Input<Tensor>("DY");
const auto *Saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *Saved_variance = ctx.Input<Tensor>("SavedVariance");
const auto *running_mean = ctx.Input<Tensor>("Mean");
const auto *running_var = ctx.Input<Tensor>("Variance");
const auto *ddX = ctx.Input<Tensor>("DDX");
const auto *ddScale = ctx.Input<Tensor>("DDScale");
const auto *ddBias = ctx.Input<Tensor>("DDBias");
const double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
auto *dX = ctx.Output<Tensor>("DX");
auto *dScale = ctx.Output<Tensor>("DScale");
auto *ddY = ctx.Output<Tensor>("DDY");
const T *x_data = X->data<T>();
const T *scale_data = Scale->data<T>();
const T *dy_data = dY->data<T>();
const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data<T>());
const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data<T>());
const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data<T>());
const T *mean_data = Saved_mean->data<T>();
const T *variance_data = Saved_variance->data<T>();
auto &x_dims = X->dims();
int N, C, H, W, D;
ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
int NxC = N * C;
const int n = X->numel();
int sample_size = n / N / C;
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
const int block = 512;
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
const int grid = NxC;
const int grid1 = (C + block - 1) / block;
math::SetConstant<platform::CUDADeviceContext, T> set_zero;
if (dX) {
T *dx_data = dX->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, dX, static_cast<T>(0));
DoubleGradComputeDX<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
ddscale_data, C, sample_size, epsilon, dx_data);
}
if (dScale) {
Tensor dscale_tmp =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
set_zero(dev_ctx, &dscale_tmp, static_cast<T>(0));
T *dscale_tmp_data = dscale_tmp.mutable_data<T>(ctx.GetPlace());
T *dscale_data = dScale->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, dScale, static_cast<T>(0));
DoubleGradComputeDScale<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
x_data, mean_data, variance_data, ddx_data, dy_data, C, sample_size,
epsilon, dscale_tmp_data);
add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
dscale_tmp.data<T>(), dScale->data<T>(), N, C);
}
if (ddY) {
T *ddy_data = ddY->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, ddY, static_cast<T>(0));
DoubleGradComputeDDY<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data,
scale_data, C, sample_size, epsilon, ddy_data);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
instance_norm, ops::InstanceNormKernel<plat::CUDADeviceContext, float>,
ops::InstanceNormKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
instance_norm_grad,
ops::InstanceNormGradKernel<plat::CUDADeviceContext, float>,
ops::InstanceNormGradKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
instance_norm_grad_grad,
ops::InstanceNormDoubleGradKernel<paddle::platform::CUDADeviceContext,
float>,
ops::InstanceNormDoubleGradKernel<paddle::platform::CUDADeviceContext,
double>);
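The grad kernels registered above implement the formulas written in the comments; as a cross-check, a NumPy sketch of the first-order backward pass (illustrative only; instance_norm_grad and its argument names are assumptions, not part of this patch):

import numpy as np

def instance_norm_grad(x, d_y, scale, epsilon=1e-5):
    # x, d_y: (N, C, H, W); scale: (C,)
    n, c, h, w = x.shape
    mean = x.mean(axis=(2, 3), keepdims=True)
    inv_std = 1.0 / np.sqrt(x.var(axis=(2, 3), keepdims=True) + epsilon)
    x_hat = (x - mean) * inv_std                   # normalized input
    d_bias = d_y.sum(axis=(0, 2, 3))               # sum over n, h, w
    d_scale = (d_y * x_hat).sum(axis=(0, 2, 3))    # sum over n, h, w
    d_x = scale.reshape(1, c, 1, 1) * inv_std * (
        d_y - d_y.mean(axis=(2, 3), keepdims=True)
        - x_hat * (d_y * x_hat).mean(axis=(2, 3), keepdims=True))
    return d_x, d_scale, d_bias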
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/norm_utils.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
class InstanceNormOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override;
};
class InstanceNormGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override;
};
class InstanceNormDoubleGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override;
};
class InstanceNormOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
class InstanceNormGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override;
};
class InstanceNormDoubleGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override;
};
class InstanceNormOpInferVarType
: public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", "Y"}};
}
};
template <typename DeviceContext, typename T>
class InstanceNormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override;
};
template <typename DeviceContext, typename T>
class InstanceNormGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override;
};
template <typename DeviceContext, typename T>
class InstanceNormDoubleGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override;
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using DataLayout = framework::DataLayout;
inline void ExtractNCWHD(const framework::DDim &dims,
const DataLayout &data_layout, int *N, int *C, int *H,
int *W, int *D) {
*N = dims[0];
if (dims.size() == 2) {
*C = dims[1];
*H = 1;
*W = 1;
*D = 1;
} else {
*C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
*H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
*W = dims.size() > 3
? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2])
: 1;
*D = dims.size() > 4
? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3])
: 1;
}
}
} // namespace operators
} // namespace paddle
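ExtractNCWHD is now shared by batch_norm and instance_norm through norm_utils.h. Roughly, it maps the input rank and data layout to (N, C, H, W, D) as in this Python sketch (illustrative only):

def extract_ncwhd(dims, data_layout="NCHW"):
    # dims: list of ints with rank 2 to 5
    n = dims[0]
    if len(dims) == 2:
        return n, dims[1], 1, 1, 1
    c = dims[1] if data_layout == "NCHW" else dims[-1]
    h = dims[2] if data_layout == "NCHW" else dims[1]
    w = (dims[3] if data_layout == "NCHW" else dims[2]) if len(dims) > 3 else 1
    d = (dims[4] if data_layout == "NCHW" else dims[3]) if len(dims) > 4 else 1
    return n, c, h, w, d

assert extract_ncwhd([8, 3, 32, 32]) == (8, 3, 32, 32, 1)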
......@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/nccl_helper.h"
......
......@@ -61,6 +61,7 @@ __all__ = [
'adaptive_pool2d',
'adaptive_pool3d',
'batch_norm',
'instance_norm',
'data_norm',
'beam_search_decode',
'conv2d_transpose',
......@@ -3498,6 +3499,128 @@ def batch_norm(input,
return helper.append_activation(batch_norm_out)
def instance_norm(input,
epsilon=1e-05,
param_attr=None,
bias_attr=None,
name=None):
"""
**Instance Normalization Layer**
Can be used as a normalizer function for conv2d and fully_connected operations.
The required data format for this layer is:
DataLayout: NCHW `[batch, in_channels, in_height, in_width]`
Refer to `Instance Normalization: The Missing Ingredient for
Fast Stylization <https://arxiv.org/pdf/1607.08022.pdf>`_
for more details.
:math:`input` is the input features over a mini-batch.
.. math::
\\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\
\\ mean of one feature map in mini-batch \\\\
\\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\
\\mu_{\\beta})^2 \\qquad &//\ variance of one feature map in mini-batch \\\\
\\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
\\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
Note that :math:`\\mu_{\\beta}` and :math:`\\sigma_{\\beta}^{2}` are computed
per sample and per channel, so no global (running) statistics are kept.
The training and testing (or inference) have the same behavior:
.. math::
\\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
\\sigma_{\\beta}^{2} + \\epsilon}} \\\\
y_i &\\gets \\gamma \\hat{x_i} + \\beta
Args:
input(Variable): The input Tensor. The rank of the input variable can be 2, 3, 4 or 5.
epsilon(float, Default 1e-05): A value added to the denominator for
numerical stability. Default is 1e-5.
param_attr(ParamAttr|None): The parameter attribute for Parameter `scale`
of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm
will create ParamAttr as param_attr, the name of scale can be set in ParamAttr.
If the Initializer of the param_attr is not set, the parameter is initialized
with Xavier. Default: None.
bias_attr(ParamAttr|None): The parameter attribute for the bias of instance_norm.
If it is set to None or one attribute of ParamAttr, instance_norm
will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr.
If the Initializer of the bias_attr is not set, the bias is initialized zero.
Default: None.
name(string, Default None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: A tensor variable which is the result after applying instance normalization on the input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.layers.data(name='x', shape=[3, 7, 3, 7], dtype='float32', append_batch_size=False)
hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
hidden2 = fluid.layers.instance_norm(input=hidden1)
"""
assert bias_attr is not False, "bias_attr should not be False in instance_norm."
helper = LayerHelper('instance_norm', **locals())
dtype = helper.input_dtype()
# use fp32 for in parameter
if dtype == core.VarDesc.VarType.FP16:
dtype = core.VarDesc.VarType.FP32
input_shape = input.shape
channel_num = input_shape[1]
param_shape = [channel_num]
# create parameter
scale = helper.create_parameter(
attr=helper.param_attr,
shape=param_shape,
dtype=dtype,
default_initializer=Constant(1.0))
bias = helper.create_parameter(
attr=helper.bias_attr,
shape=param_shape,
dtype=dtype,
is_bias=True,
default_initializer=Constant(0.0))
# create output
saved_mean = helper.create_variable_for_type_inference(
dtype=dtype, stop_gradient=True)
saved_variance = helper.create_variable_for_type_inference(
dtype=dtype, stop_gradient=True)
instance_norm_out = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type="instance_norm",
inputs={
"X": input,
"Scale": scale,
"Bias": bias,
},
outputs={
"Y": instance_norm_out,
"SavedMean": saved_mean,
"SavedVariance": saved_variance
},
attrs={"epsilon": epsilon, })
return instance_norm_out
def data_norm(input,
act=None,
epsilon=1e-05,
......
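A typical use of the new layer is to normalize a conv2d feature map; a minimal sketch under the fluid 1.x API (the variable names are illustrative, not part of this patch):

import paddle.fluid as fluid

image = fluid.layers.data(name='image', shape=[3, 32, 32], dtype='float32')
conv = fluid.layers.conv2d(input=image, num_filters=16, filter_size=3)
norm = fluid.layers.instance_norm(input=conv)  # per-sample, per-channel normalization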
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid.op import Operator
from op_test import OpTest
def _reference_instance_norm_naive(x, scale, bias, epsilon, mean, var):
x_shape = x.shape
if len(x_shape) == 2:
x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
n, c, h, w = x.shape
mean_tile = np.reshape(mean, (n, c, 1, 1))
mean_tile = np.tile(mean_tile, (1, 1, h, w))
var_tile = np.reshape(var, (n, c, 1, 1))
var_tile = np.tile(var_tile, (1, 1, h, w))
x_norm = (x - mean_tile) / np.sqrt(var_tile + epsilon).astype('float32')
scale_tile = np.reshape(scale, (1, c, 1, 1))
scale_tile = np.tile(scale_tile, (n, 1, h, w))
bias_tile = np.reshape(bias, (1, c, 1, 1))
bias_tile = np.tile(bias_tile, (n, 1, h, w))
y = scale_tile * x_norm + bias_tile
if len(x_shape) == 2:
y = np.reshape(y, x_shape)
return y, mean, var
def _reference_instance_norm_grad(x, d_y, scale, mean, var, epsilon):
# d_scale = sum(d_y * (x-mean) / sqrt(var+epsilon))
# d_offset = sum(d_y)
# d_x = scale / sqrt(var+epsilon) * (d_y - np.mean(d_y, axis=(2,3)) - (x-mean)/sqrt(var+epsilon)* np.mean(y_grad * (x-mean)/sqrt(var+epsilon), axis=(2,3)))
n, c, h, w = x.shape
d_bias = np.sum(d_y, axis=(0, 2, 3))
mean_tile = np.reshape(mean, (n, c, 1, 1))
mean_tile = np.tile(mean_tile, (1, 1, h, w))
var_tile = np.reshape(var, (n, c, 1, 1))
var_tile = np.tile(var_tile, (1, 1, h, w))
d_scale = np.sum(d_y * (x - mean_tile) * var_tile, axis=(0, 2, 3))
var_inv = var_tile
scale_tile = np.reshape(scale, (1, c, 1, 1))
scale_tile = np.tile(scale_tile, (n, 1, h, w))
d_x = scale_tile * var_inv * (d_y - np.mean(
d_y, axis=(2, 3), keepdims=True) - (x - mean_tile) * var_inv * np.mean(
d_y * (x - mean_tile) * var_inv, axis=(2, 3), keepdims=True))
return d_x, d_scale, d_bias
def _cal_mean_variance(x, epsilon, mean_shape):
mean = np.reshape(np.mean(x, axis=(2, 3)), mean_shape)
var = np.reshape(np.var(x, axis=(2, 3)), mean_shape)
return mean, var
class TestInstanceNormOpTraining(unittest.TestCase):
def setUp(self):
self.epsilon = 1e-5
self.init_test_case()
def init_test_case(self):
self.use_global_stats = False
self.no_grad_set = set()
self.fetch_list = [
'y', 'saved_mean', 'saved_variance', 'x@GRAD', 'scale@GRAD',
'bias@GRAD'
]
def __assert_close(self, tensor, np_array, msg, atol=1e-4):
self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
def set_global_mean_var(self, mean_shape, x):
mean, variance = _cal_mean_variance(x, self.epsilon, mean_shape)
return mean, variance
def test_forward_backward(self):
def test_with_place(place, shape):
epsilon = self.epsilon
n, c, h, w = shape[0], shape[1], shape[2], shape[3]
scale_shape = [c]
mean_shape = [n * c]
np.random.seed()
x = np.random.random_sample(shape).astype(np.float32)
scale = np.random.random_sample(scale_shape).astype(np.float32)
bias = np.random.random_sample(scale_shape).astype(np.float32)
mean, variance = self.set_global_mean_var(mean_shape, x)
d_y = np.random.random_sample(shape).astype(np.float32)
y, saved_mean, variance_tmp = _reference_instance_norm_naive(
x, scale, bias, epsilon, mean, variance)
saved_variance = 1 / np.sqrt(variance_tmp + epsilon)
d_x, d_scale, d_bias = _reference_instance_norm_grad(
x, d_y, scale, saved_mean, saved_variance, epsilon)
var_dict = locals()
var_dict['y@GRAD'] = d_y
var_dict['x@GRAD'] = d_x
var_dict['scale@GRAD'] = d_scale
var_dict['bias@GRAD'] = d_bias
var_names = [
'x', 'scale', 'bias', 'y', 'saved_mean', 'saved_variance'
]
ground_truth = {name: var_dict[name] for name in var_names}
program = fluid.Program()
with fluid.program_guard(program):
block = program.global_block()
for name in ground_truth:
block.create_var(
name=name,
dtype='float32',
shape=ground_truth[name].shape)
in_op = block.append_op(
type="instance_norm",
inputs={
"X": block.var("x"),
"Scale": block.var("scale"),
"Bias": block.var("bias"),
},
outputs={
"Y": block.var("y"),
"SavedMean": block.var("saved_mean"),
"SavedVariance": block.var("saved_variance")
},
attrs={"epsilon": epsilon, })
block.create_var(name="y@GRAD", dtype='float32', shape=y.shape)
grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
in_op.desc, self.no_grad_set, [])
grad_op_desc = grad_op_desc_list[0]
new_op_desc = block.desc.append_op()
new_op_desc.copy_from(grad_op_desc)
for var_name in grad_op_desc.output_arg_names():
block.desc.var(var_name.encode("ascii"))
grad_op_desc.infer_var_type(block.desc)
grad_op_desc.infer_shape(block.desc)
for arg in grad_op_desc.output_arg_names():
grad_var = block.desc.find_var(arg.encode("ascii"))
grad_var.set_dtype(core.VarDesc.VarType.FP32)
exe = fluid.Executor(place)
out = exe.run(program,
feed={
name: var_dict[name]
for name in ['x', 'scale', 'bias', 'y@GRAD']
},
fetch_list=self.fetch_list)
for id, name in enumerate(self.fetch_list):
self.__assert_close(var_dict[name], out[id], name)
print("op test forward passes: ", str(place))
places = [core.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu(
"instance_norm"):
places.append(core.CUDAPlace(0))
for place in places:
test_with_place(place, [2, 3, 4, 5])
if __name__ == '__main__':
unittest.main()
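The test above compares against the NumPy reference; a quick standalone check of the property it relies on: with scale = 1 and bias = 0, the normalized output has roughly zero mean and unit variance per (n, c) slice (a sketch, not part of the test suite):

import numpy as np

x = np.random.random_sample((2, 3, 4, 5)).astype(np.float32)
epsilon = 1e-5
mean = x.mean(axis=(2, 3), keepdims=True)
var = x.var(axis=(2, 3), keepdims=True)
y = (x - mean) / np.sqrt(var + epsilon)        # scale = 1, bias = 0
assert np.allclose(y.mean(axis=(2, 3)), 0, atol=1e-5)
assert np.allclose(y.var(axis=(2, 3)), 1, atol=1e-3)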
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle.fluid.core as core
import gradient_checker
from decorator_helper import prog_scope
class TestInstanceNormDoubleGradCheck(unittest.TestCase):
@prog_scope()
def func(self, place):
prog = fluid.Program()
with fluid.program_guard(prog):
np.random.seed()
shape = [2, 3, 4, 5]
dtype = "float32"
eps = 0.005
atol = 1e-4
x = layers.create_parameter(dtype=dtype, shape=shape, name='x')
z = fluid.layers.instance_norm(input=x)
x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
gradient_checker.double_grad_check(
[x], z, x_init=x_arr, atol=atol, place=place, eps=eps)
def test_grad(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
self.func(p)
if __name__ == "__main__":
unittest.main()