diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 7229754cb8ed82ed6f9da427c044bcb5de388bb9..837bca6c2cf4e3f8992325bce651666f8c951ac9 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -33,6 +33,8 @@ namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; USE_OP(batch_norm); +USE_CUDA_ONLY_OP(fused_bn_add_activation); +USE_CUDA_ONLY_OP(fused_bn_add_activation_grad); template void InitRandomTensor(const std::vector &dims, @@ -40,7 +42,7 @@ void InitRandomTensor(const std::vector &dims, T *cpu_out_ptr = cpu_out->mutable_data(framework::make_ddim(dims), platform::CPUPlace()); std::default_random_engine random(0); - std::uniform_real_distribution dis(0.0, 1.0); + std::uniform_real_distribution dis(-1.0, 1.0); for (int i = 0; i < cpu_out->numel(); ++i) { cpu_out_ptr[i] = static_cast(dis(random)); } @@ -89,7 +91,7 @@ void CheckOutput(std::string name, const framework::Tensor &cpu_res, } } std::string error_type = is_relative_atol ? "relative" : "absolute"; - LOG(INFO) << "[" << name << "], The dims is [" << cpu_res.dims() + LOG(INFO) << "[" << name << "] The dims is [" << cpu_res.dims() << "], maximum " << error_type << " error is " << max_diff << ": " << cpu_res_ptr[index] << " vs " << cpu_base_ptr[index]; } @@ -121,13 +123,33 @@ void ComputeSumAndSquareSum(const framework::Tensor &cpu_x, } } -// get paddle batchnorm op results as baseline +template +void ComputeInplaceAdd(const framework::Tensor &cpu_x, + framework::Tensor *cpu_y) { + EXPECT_EQ(cpu_x.dims(), cpu_y->dims()); + + const T *cpu_x_ptr = cpu_x.data(); + T *cpu_y_ptr = cpu_y->data(); + for (int64_t i = 0; i < cpu_x.numel(); ++i) { + cpu_y_ptr[i] += cpu_x_ptr[i]; + } +} + +template +void ComputeInplaceRelu(framework::Tensor *cpu_x) { + T *cpu_x_ptr = cpu_x->data(); + for (int64_t i = 0; i < cpu_x->numel(); ++i) { + cpu_x_ptr[i] = + cpu_x_ptr[i] > static_cast(0) ? cpu_x_ptr[i] : static_cast(0); + } +} + void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, const Tensor &cpu_x, const Tensor &cpu_scale, const Tensor &cpu_bias, Tensor *cpu_mean, Tensor *cpu_var, Tensor *cpu_saved_mean, Tensor *cpu_saved_var, Tensor *cpu_y, - Tensor *cpu_reserve_space) { + Tensor *saved_reserve_space) { framework::Scope scope; auto *x = scope.Var("X")->GetMutable(); auto *scale = scope.Var("Scale")->GetMutable(); @@ -178,68 +200,258 @@ void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, TensorCopySync(*var, platform::CPUPlace(), cpu_var); TensorCopySync(*saved_mean, platform::CPUPlace(), cpu_saved_mean); TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var); - TensorCopySync(*reserve_space, platform::CPUPlace(), cpu_reserve_space); + // reserved_space will stay on GPU and used in grad op. 
+ saved_reserve_space->ShareDataWith(*reserve_space); +} + +void ComputeFusedBNAddReluForward(const platform::CUDADeviceContext &ctx, + const Tensor &cpu_x, const Tensor &cpu_z, + const Tensor &cpu_scale, + const Tensor &cpu_bias, Tensor *cpu_mean, + Tensor *cpu_var, Tensor *cpu_saved_mean, + Tensor *cpu_saved_var, Tensor *cpu_y, + Tensor *saved_reserve_space) { + framework::Scope scope; + auto *x = scope.Var("X")->GetMutable(); + auto *z = scope.Var("Z")->GetMutable(); + auto *scale = scope.Var("Scale")->GetMutable(); + auto *bias = scope.Var("Bias")->GetMutable(); + auto *mean = scope.Var("Mean")->GetMutable(); + auto *var = scope.Var("Variance")->GetMutable(); + auto *y = scope.Var("Y")->GetMutable(); + auto *saved_mean = scope.Var("SavedMean")->GetMutable(); + auto *saved_var = + scope.Var("SavedVariance")->GetMutable(); + auto *reserve_space = + scope.Var("ReserveSpace")->GetMutable(); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_x, place, x); + TensorCopySync(cpu_z, place, z); + TensorCopySync(cpu_scale, place, scale); + TensorCopySync(cpu_bias, place, bias); + TensorCopySync(*cpu_mean, place, mean); + TensorCopySync(*cpu_var, place, var); + + int64_t channels = x->dims()[3]; + scale->Resize({channels}); + bias->Resize({channels}); + mean->Resize({channels}); + var->Resize({channels}); + + framework::AttributeMap attrs; + + auto op = framework::OpRegistry::CreateOp( + "fused_bn_add_activation", + {{"X", {"X"}}, {"Z", {"Z"}}, {"Scale", {"Scale"}}, {"Bias", {"Bias"}}}, + {{"Y", {"Y"}}, + {"MeanOut", {"Mean"}}, + {"VarianceOut", {"Variance"}}, + {"SavedMean", {"SavedMean"}}, + {"SavedVariance", {"SavedVariance"}}, + {"ReserveSpace", {"ReserveSpace"}}}, + attrs); + op->Run(scope, ctx.GetPlace()); + + TensorCopySync(*y, platform::CPUPlace(), cpu_y); + TensorCopySync(*mean, platform::CPUPlace(), cpu_mean); + TensorCopySync(*var, platform::CPUPlace(), cpu_var); + TensorCopySync(*saved_mean, platform::CPUPlace(), cpu_saved_mean); + TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var); + // reserved_space will stay on GPU and used in grad op. 
+ saved_reserve_space->ShareDataWith(*reserve_space); +} + +void ComputeFusedBNAddReluBackward( + const platform::CUDADeviceContext &ctx, const Tensor &cpu_dy, + const Tensor &cpu_x, const Tensor &cpu_scale, const Tensor &cpu_bias, + const Tensor &cpu_saved_mean, const Tensor &cpu_saved_var, + const Tensor &cpu_y, const Tensor &saved_reserve_space, Tensor *cpu_dx, + Tensor *cpu_dz, Tensor *cpu_dscale, Tensor *cpu_dbias) { + framework::Scope scope; + auto *x = scope.Var("X")->GetMutable(); + auto *y = scope.Var("Y")->GetMutable(); + auto *dy = scope.Var("Y@GRAD")->GetMutable(); + auto *scale = scope.Var("Scale")->GetMutable(); + auto *bias = scope.Var("Bias")->GetMutable(); + auto *saved_mean = scope.Var("SavedMean")->GetMutable(); + auto *saved_var = + scope.Var("SavedVariance")->GetMutable(); + auto *reserve_space = + scope.Var("ReserveSpace")->GetMutable(); + auto *dx = scope.Var("X@GRAD")->GetMutable(); + auto *dz = scope.Var("Z@GRAD")->GetMutable(); + auto *dscale = scope.Var("Scale@GRAD")->GetMutable(); + auto *dbias = scope.Var("Bias@GRAD")->GetMutable(); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_x, place, x); + TensorCopySync(cpu_y, place, y); + TensorCopySync(cpu_dy, place, dy); + TensorCopySync(cpu_scale, place, scale); + TensorCopySync(cpu_bias, place, bias); + TensorCopySync(cpu_saved_mean, place, saved_mean); + TensorCopySync(cpu_saved_var, place, saved_var); + reserve_space->ShareDataWith(saved_reserve_space); + + int64_t channels = x->dims()[3]; + scale->Resize({channels}); + bias->Resize({channels}); + saved_mean->Resize({channels}); + saved_var->Resize({channels}); + + framework::AttributeMap attrs; + float momentum = 0.9; + float epsilon = 1e-5; + std::string act_type = "relu"; + attrs.insert({"momentum", momentum}); + attrs.insert({"epsilon", epsilon}); + attrs.insert({"act_type", act_type}); + + auto op = framework::OpRegistry::CreateOp( + "fused_bn_add_activation_grad", {{"X", {"X"}}, + {"Y", {"Y"}}, + {"Y@GRAD", {"Y@GRAD"}}, + {"Scale", {"Scale"}}, + {"Bias", {"Bias"}}, + {"SavedMean", {"SavedMean"}}, + {"SavedVariance", {"SavedVariance"}}, + {"ReserveSpace", {"ReserveSpace"}}}, + {{"X@GRAD", {"X@GRAD"}}, + {"Z@GRAD", {"Z@GRAD"}}, + {"Scale@GRAD", {"Scale@GRAD"}}, + {"Bias@GRAD", {"Bias@GRAD"}}}, + attrs); + op->Run(scope, ctx.GetPlace()); + + TensorCopySync(*dx, platform::CPUPlace(), cpu_dx); + TensorCopySync(*dz, platform::CPUPlace(), cpu_dz); + TensorCopySync(*dscale, platform::CPUPlace(), cpu_dscale); + TensorCopySync(*dbias, platform::CPUPlace(), cpu_dbias); } template class CudnnBNAddReluTester { public: - CudnnBNAddReluTester(int batch_size, int height, int width, int channels) { + CudnnBNAddReluTester(int batch_size, int height, int width, int channels, + std::string act_type, bool fuse_add, bool has_shortcut) { batch_size_ = batch_size; height_ = height; width_ = width; channels_ = channels; ele_count_ = batch_size_ * height_ * width_; + act_type_ = act_type; + fuse_add_ = fuse_add; + has_shortcut_ = has_shortcut; SetUp(); } ~CudnnBNAddReluTester() {} void CheckForward(float diff, bool is_relative_atol = false) { + LOG(INFO) << "[CheckForward, diff=" << diff + << ", is_relative_atol=" << is_relative_atol + << "] act_type=" << act_type_ << ", fuse_add=" << fuse_add_ + << ", has_shortcut=" << has_shortcut_; platform::CUDADeviceContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(0))); - framework::Tensor cpu_mean_base; - framework::Tensor cpu_var_base; - framework::Tensor cpu_saved_mean_base; - 
framework::Tensor cpu_saved_var_base; - framework::Tensor cpu_y_base; - framework::Tensor cpu_reserve_space_base; - BaselineForward(*ctx, &cpu_mean_base, &cpu_var_base, &cpu_saved_mean_base, - &cpu_saved_var_base, &cpu_y_base, &cpu_reserve_space_base); - - framework::Tensor cpu_mean; - framework::Tensor cpu_var; - framework::Tensor cpu_saved_mean; - framework::Tensor cpu_saved_var; - framework::Tensor cpu_y; - framework::Tensor cpu_bitmask; - FusedForward(*ctx, &cpu_mean, &cpu_var, &cpu_saved_mean, &cpu_saved_var, - &cpu_y, &cpu_bitmask); + auto select = [&](Tensor *in) { return has_shortcut_ ? in : nullptr; }; - CheckOutput("Mean", cpu_mean, cpu_mean_base, diff, is_relative_atol); - CheckOutput("Variance", cpu_var, cpu_var_base, diff, + framework::Tensor cpu_mean_base_x; + framework::Tensor cpu_var_base_x; + framework::Tensor cpu_mean_base_z; + framework::Tensor cpu_var_base_z; + if (!has_shortcut_ && fuse_add_ && (act_type_ == "relu")) { + BaselineForwardFusedBNAddRelu( + *ctx, &cpu_mean_base_x, &cpu_var_base_x, &cpu_saved_mean_base_x_, + &cpu_saved_var_base_x_, &cpu_y_base_, &saved_reserve_space_x_); + } else { + BaselineForward( + *ctx, &cpu_mean_base_x, &cpu_var_base_x, &cpu_saved_mean_base_x_, + &cpu_saved_var_base_x_, &cpu_y_base_, &saved_reserve_space_x_, + select(&cpu_mean_base_z), select(&cpu_var_base_z), + select(&cpu_saved_mean_base_z_), select(&cpu_saved_var_base_z_), + select(&saved_reserve_space_z_)); + } + + framework::Tensor cpu_mean_x; + framework::Tensor cpu_var_x; + framework::Tensor cpu_y; + framework::Tensor cpu_mean_z; + framework::Tensor cpu_var_z; + FusedForward(*ctx, &cpu_mean_x, &cpu_var_x, &cpu_saved_mean_x_, + &cpu_saved_var_x_, &cpu_y, &cpu_bitmask_, select(&cpu_mean_z), + select(&cpu_var_z), select(&cpu_saved_mean_z_), + select(&cpu_saved_var_z_)); + + CheckOutput("Mean", cpu_mean_x, cpu_mean_base_x, diff, + is_relative_atol); + CheckOutput("Variance", cpu_var_x, cpu_var_base_x, diff, is_relative_atol); - CheckOutput("SavedMean", cpu_saved_mean, cpu_saved_mean_base, diff, + CheckOutput("SavedMean", cpu_saved_mean_x_, cpu_saved_mean_base_x_, + diff, is_relative_atol); + CheckOutput("SavedVariance", cpu_saved_var_x_, cpu_saved_var_base_x_, + diff, is_relative_atol); + if (has_shortcut_) { + CheckOutput("MeanZ", cpu_mean_z, cpu_mean_base_z, diff, + is_relative_atol); + CheckOutput("VarianceZ", cpu_var_z, cpu_var_base_z, diff, + is_relative_atol); + CheckOutput("SavedMeanZ", cpu_saved_mean_z_, + cpu_saved_mean_base_z_, diff, is_relative_atol); + CheckOutput("SavedVarianceZ", cpu_saved_var_z_, + cpu_saved_var_base_z_, diff, is_relative_atol); + } + CheckOutput("Y", cpu_y, cpu_y_base_, diff, is_relative_atol); + } + + void CheckBackward(float diff, bool is_relative_atol = false) { + platform::CUDADeviceContext *ctx = + static_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(0))); + + framework::Tensor cpu_dx_base; + framework::Tensor cpu_dz_base; + framework::Tensor cpu_dscale_base; + framework::Tensor cpu_dbias_base; + BaselineBackwardFusedBNAddRelu(*ctx, &cpu_dx_base, &cpu_dz_base, + &cpu_dscale_base, &cpu_dbias_base); + + framework::Tensor cpu_dx; + framework::Tensor cpu_dz; + framework::Tensor cpu_dscale; + framework::Tensor cpu_dbias; + FusedBackward(*ctx, &cpu_dx, &cpu_dz, &cpu_dscale, &cpu_dbias); + + CheckOutput("DX", cpu_dx, cpu_dx_base, diff, is_relative_atol); + CheckOutput("DZ", cpu_dz, cpu_dz_base, diff, is_relative_atol); + CheckOutput("DScale", cpu_dscale, cpu_dscale_base, diff, is_relative_atol); - 
CheckOutput("SavedVariance", cpu_saved_var, cpu_saved_var_base, diff, + CheckOutput("DBias", cpu_dbias, cpu_dbias_base, diff, is_relative_atol); - CheckOutput("Y", cpu_y, cpu_y_base, diff, is_relative_atol); } private: void SetUp() { - // Initialize input data InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_x_); - ComputeSumAndSquareSum(cpu_x_, &cpu_sum_, &cpu_sum_of_square_); + InitRandomTensor({channels_}, &cpu_bn_scale_x_); + InitRandomTensor({channels_}, &cpu_bn_bias_x_); - // scale and bias should be initialized randomly. - InitConstantTensor({channels_}, static_cast(1.0f), - &cpu_bn_scale_); - InitConstantTensor({channels_}, static_cast(0.0f), - &cpu_bn_bias_); + if (has_shortcut_) { + InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_z_); + InitRandomTensor({channels_}, &cpu_bn_scale_z_); + InitRandomTensor({channels_}, &cpu_bn_bias_z_); + } else { + if (fuse_add_) { + InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_z_); + } + } + + InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_dy_); } void InitMeanVar(Tensor *cpu_mean, Tensor *cpu_var, Tensor *cpu_saved_mean, @@ -252,71 +464,178 @@ class CudnnBNAddReluTester { cpu_saved_var); } - void BaselineForward(const platform::CUDADeviceContext &ctx, Tensor *cpu_mean, - Tensor *cpu_var, Tensor *cpu_saved_mean, - Tensor *cpu_saved_var, Tensor *cpu_y, - Tensor *cpu_reserve_space) { + void BaselineForward(const platform::CUDADeviceContext &ctx, + Tensor *cpu_mean_x, Tensor *cpu_var_x, + Tensor *cpu_saved_mean_x, Tensor *cpu_saved_var_x, + Tensor *cpu_y, Tensor *saved_reserve_space_x, + Tensor *cpu_mean_z = nullptr, + Tensor *cpu_var_z = nullptr, + Tensor *cpu_saved_mean_z = nullptr, + Tensor *cpu_saved_var_z = nullptr, + Tensor *saved_reserve_space_z = nullptr) { + InitMeanVar(cpu_mean_x, cpu_var_x, cpu_saved_mean_x, cpu_saved_var_x); + ComputeBatchNormForward(ctx, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_, + cpu_mean_x, cpu_var_x, cpu_saved_mean_x, + cpu_saved_var_x, cpu_y, saved_reserve_space_x); + if (has_shortcut_) { + framework::Tensor cpu_z_out; + InitMeanVar(cpu_mean_z, cpu_var_z, cpu_saved_mean_z, cpu_saved_var_z); + ComputeBatchNormForward( + ctx, cpu_z_, cpu_bn_scale_z_, cpu_bn_bias_z_, cpu_mean_z, cpu_var_z, + cpu_saved_mean_z, cpu_saved_var_z, &cpu_z_out, saved_reserve_space_z); + ComputeInplaceAdd(cpu_z_out, cpu_y); + } else { + if (fuse_add_) { + ComputeInplaceAdd(cpu_z_, cpu_y); + } + } + if (act_type_ == "relu") { + ComputeInplaceRelu(cpu_y); + } + } + + void BaselineForwardFusedBNAddRelu(const platform::CUDADeviceContext &ctx, + Tensor *cpu_mean, Tensor *cpu_var, + Tensor *cpu_saved_mean, + Tensor *cpu_saved_var, Tensor *cpu_y, + Tensor *saved_reserve_space) { InitMeanVar(cpu_mean, cpu_var, cpu_saved_mean, cpu_saved_var); - ComputeBatchNormForward(ctx, cpu_x_, cpu_bn_scale_, cpu_bn_bias_, cpu_mean, - cpu_var, cpu_saved_mean, cpu_saved_var, cpu_y, - cpu_reserve_space); + ComputeFusedBNAddReluForward( + ctx, cpu_x_, cpu_z_, cpu_bn_scale_x_, cpu_bn_bias_x_, cpu_mean, cpu_var, + cpu_saved_mean, cpu_saved_var, cpu_y, saved_reserve_space); + } + + void BaselineBackwardFusedBNAddRelu(const platform::CUDADeviceContext &ctx, + Tensor *cpu_dx, Tensor *cpu_dz, + Tensor *cpu_dscale, Tensor *cpu_dbias) { + ComputeFusedBNAddReluBackward( + ctx, cpu_dy_, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_, + cpu_saved_mean_base_x_, cpu_saved_var_base_x_, cpu_y_base_, + saved_reserve_space_x_, cpu_dx, cpu_dz, cpu_dscale, cpu_dbias); + } + + void ComputeFusedBNStatsFinalize(const 
platform::CUDADeviceContext &ctx, + const Tensor &cpu_x, + const Tensor &cpu_bn_scale, + const Tensor &cpu_bn_bias, Tensor *sum, + Tensor *sum_of_square, Tensor *bn_scale, + Tensor *bn_bias, Tensor *mean, Tensor *var, + Tensor *saved_mean, Tensor *saved_var, + Tensor *equiv_scale, Tensor *equiv_bias) { + framework::Tensor cpu_sum; + framework::Tensor cpu_sum_of_square; + ComputeSumAndSquareSum(cpu_x, &cpu_sum, &cpu_sum_of_square); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_sum, place, sum); + TensorCopySync(cpu_sum_of_square, place, sum_of_square); + TensorCopySync(cpu_bn_scale, place, bn_scale); + TensorCopySync(cpu_bn_bias, place, bn_bias); + + bn_scale->Resize({1, 1, 1, channels_}); + bn_bias->Resize({1, 1, 1, channels_}); + + // input + float *sum_ptr = sum->data(); + float *sum_of_square_ptr = sum_of_square->data(); + float *bn_scale_ptr = bn_scale->data(); + float *bn_bias_ptr = bn_bias->data(); + + mean->Resize({1, 1, 1, channels_}); + var->Resize({1, 1, 1, channels_}); + + // output + float *mean_ptr = mean->data(); + float *var_ptr = var->data(); + float *saved_mean_ptr = + saved_mean->mutable_data({1, 1, 1, channels_}, place); + float *saved_var_ptr = + saved_var->mutable_data({1, 1, 1, channels_}, place); + T *equiv_scale_ptr = + equiv_scale->mutable_data({1, 1, 1, channels_}, place); + T *equiv_bias_ptr = + equiv_bias->mutable_data({1, 1, 1, channels_}, place); + + auto param_shape = framework::vectorize(bn_scale->dims()); + op::CudnnBNStatsFinalize bn_op(ctx, param_shape); + bn_op.Forward(ctx, sum_ptr, sum_of_square_ptr, bn_scale_ptr, bn_bias_ptr, + saved_mean_ptr, saved_var_ptr, mean_ptr, var_ptr, + equiv_scale_ptr, equiv_bias_ptr, eps_, momentum_, ele_count_, + true); } // Get forward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu - void FusedForward(const platform::CUDADeviceContext &ctx, Tensor *cpu_mean, - Tensor *cpu_var, Tensor *cpu_saved_mean, - Tensor *cpu_saved_var, Tensor *cpu_y, Tensor *cpu_bitmask) { + void FusedForward(const platform::CUDADeviceContext &ctx, Tensor *cpu_mean_x, + Tensor *cpu_var_x, Tensor *cpu_saved_mean_x, + Tensor *cpu_saved_var_x, Tensor *cpu_y, Tensor *cpu_bitmask, + Tensor *cpu_mean_z = nullptr, Tensor *cpu_var_z = nullptr, + Tensor *cpu_saved_mean_z = nullptr, + Tensor *cpu_saved_var_z = nullptr) { framework::Tensor x; - framework::Tensor sum; - framework::Tensor sum_of_square; - framework::Tensor bn_scale; - framework::Tensor bn_bias; + framework::Tensor sum_x; + framework::Tensor sum_of_square_x; + framework::Tensor bn_scale_x; + framework::Tensor bn_bias_x; + + framework::Tensor z; + framework::Tensor sum_z; + framework::Tensor sum_of_square_z; + framework::Tensor bn_scale_z; + framework::Tensor bn_bias_z; auto place = ctx.GetPlace(); TensorCopySync(cpu_x_, place, &x); - TensorCopySync(cpu_sum_, place, &sum); - TensorCopySync(cpu_sum_of_square_, place, &sum_of_square); - TensorCopySync(cpu_bn_scale_, place, &bn_scale); - TensorCopySync(cpu_bn_bias_, place, &bn_bias); + if (fuse_add_ || has_shortcut_) { + TensorCopySync(cpu_z_, place, &z); + } - bn_scale.Resize({1, 1, 1, channels_}); - bn_bias.Resize({1, 1, 1, channels_}); + framework::Tensor mean_x; + framework::Tensor var_x; + framework::Tensor saved_mean_x; + framework::Tensor saved_var_x; + framework::Tensor equiv_scale_x; + framework::Tensor equiv_bias_x; - T *x_ptr = x.data(); - float *sum_ptr = sum.data(); - float *sum_of_square_ptr = sum_of_square.data(); - float *bn_scale_ptr = bn_scale.data(); - float *bn_bias_ptr = bn_bias.data(); + framework::Tensor mean_z; 
+ framework::Tensor var_z; + framework::Tensor saved_mean_z; + framework::Tensor saved_var_z; + framework::Tensor equiv_scale_z; + framework::Tensor equiv_bias_z; - framework::Tensor mean; - framework::Tensor var; - framework::Tensor saved_mean; - framework::Tensor saved_var; - framework::Tensor equiv_scale; - framework::Tensor equiv_bias; framework::Tensor y; framework::Tensor bitmask; - InitMeanVar(cpu_mean, cpu_var, cpu_saved_mean, cpu_saved_var); - TensorCopySync(*cpu_mean, place, &mean); - TensorCopySync(*cpu_var, place, &var); + InitMeanVar(cpu_mean_x, cpu_var_x, cpu_saved_mean_x, cpu_saved_var_x); + TensorCopySync(*cpu_mean_x, place, &mean_x); + TensorCopySync(*cpu_var_x, place, &var_x); + if (has_shortcut_) { + InitMeanVar(cpu_mean_z, cpu_var_z, cpu_saved_mean_z, cpu_saved_var_z); + TensorCopySync(*cpu_mean_z, place, &mean_z); + TensorCopySync(*cpu_var_z, place, &var_z); + } - mean.Resize({1, 1, 1, channels_}); - var.Resize({1, 1, 1, channels_}); + // 1. BN Stats Finalize + ComputeFusedBNStatsFinalize(ctx, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_, + &sum_x, &sum_of_square_x, &bn_scale_x, + &bn_bias_x, &mean_x, &var_x, &saved_mean_x, + &saved_var_x, &equiv_scale_x, &equiv_bias_x); + if (has_shortcut_) { + ComputeFusedBNStatsFinalize(ctx, cpu_z_, cpu_bn_scale_z_, cpu_bn_bias_z_, + &sum_z, &sum_of_square_z, &bn_scale_z, + &bn_bias_z, &mean_z, &var_z, &saved_mean_z, + &saved_var_z, &equiv_scale_z, &equiv_bias_z); + } - float *mean_ptr = mean.data(); - float *var_ptr = var.data(); - float *saved_mean_ptr = - saved_mean.mutable_data({1, 1, 1, channels_}, place); - float *saved_var_ptr = - saved_var.mutable_data({1, 1, 1, channels_}, place); - T *equiv_scale_ptr = - equiv_scale.mutable_data({1, 1, 1, channels_}, place); - T *equiv_bias_ptr = equiv_bias.mutable_data({1, 1, 1, channels_}, place); + T *x_ptr = x.data(); + T *z_ptr = (fuse_add_ || has_shortcut_) ? z.data() : nullptr; + T *equiv_scale_x_ptr = equiv_scale_x.data(); + T *equiv_bias_x_ptr = equiv_bias_x.data(); + T *equiv_scale_z_ptr = has_shortcut_ ? equiv_scale_z.data() : nullptr; + T *equiv_bias_z_ptr = has_shortcut_ ? equiv_bias_z.data() : nullptr; T *y_ptr = y.mutable_data({batch_size_, height_, width_, channels_}, place); - // bitmask int c = channels_; int64_t nhw = ele_count_; int32_t c_int32_elems = ((c + 63) & ~63) / 32; @@ -325,31 +644,90 @@ class CudnnBNAddReluTester { {nhw_int32_elems, c_int32_elems, 1}, place); auto data_shape = framework::vectorize(x.dims()); - auto param_shape = framework::vectorize(bn_scale.dims()); + auto param_shape = framework::vectorize(bn_scale_x.dims()); auto bitmask_shape = framework::vectorize(bitmask.dims()); - // 1. BN Stats Finalize - op::CudnnBNStatsFinalize bn_op(ctx, param_shape); - bn_op.Forward(ctx, sum_ptr, sum_of_square_ptr, bn_scale_ptr, bn_bias_ptr, - saved_mean_ptr, saved_var_ptr, mean_ptr, var_ptr, - equiv_scale_ptr, equiv_bias_ptr, eps_, momentum_, ele_count_, - true); - - // 2. Scale Bias + Relu (not fused add) - std::string act_type = ""; - op::CudnnScaleBiasAddRelu sbar_op( - ctx, act_type, false, false, data_shape, param_shape, bitmask_shape); - sbar_op.Forward(ctx, x_ptr, equiv_scale_ptr, equiv_bias_ptr, y_ptr, - bitmask_ptr); - - TensorCopySync(mean, platform::CPUPlace(), cpu_mean); - TensorCopySync(var, platform::CPUPlace(), cpu_var); - TensorCopySync(saved_mean, platform::CPUPlace(), cpu_saved_mean); - TensorCopySync(saved_var, platform::CPUPlace(), cpu_saved_var); + // 2. 
Scale Bias + Relu + op::CudnnScaleBiasAddRelu sbar_op(ctx, act_type_, fuse_add_, + has_shortcut_, data_shape, param_shape, + bitmask_shape); + sbar_op.Forward(ctx, x_ptr, equiv_scale_x_ptr, equiv_bias_x_ptr, y_ptr, + bitmask_ptr, z_ptr, equiv_scale_z_ptr, equiv_bias_z_ptr); + + TensorCopySync(mean_x, platform::CPUPlace(), cpu_mean_x); + TensorCopySync(var_x, platform::CPUPlace(), cpu_var_x); + TensorCopySync(saved_mean_x, platform::CPUPlace(), cpu_saved_mean_x); + TensorCopySync(saved_var_x, platform::CPUPlace(), cpu_saved_var_x); + if (has_shortcut_) { + TensorCopySync(mean_z, platform::CPUPlace(), cpu_mean_z); + TensorCopySync(var_z, platform::CPUPlace(), cpu_var_z); + TensorCopySync(saved_mean_z, platform::CPUPlace(), cpu_saved_mean_z); + TensorCopySync(saved_var_z, platform::CPUPlace(), cpu_saved_var_z); + } TensorCopySync(y, platform::CPUPlace(), cpu_y); TensorCopySync(bitmask, platform::CPUPlace(), cpu_bitmask); } + // Get backward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu + void FusedBackward(const platform::CUDADeviceContext &ctx, Tensor *cpu_dx, + Tensor *cpu_dz, Tensor *cpu_dscale, Tensor *cpu_dbias) { + framework::Tensor dy; + framework::Tensor x; + framework::Tensor bn_scale; + framework::Tensor bn_bias; + framework::Tensor saved_mean; + framework::Tensor saved_var; + framework::Tensor bitmask; + framework::Tensor dx; + framework::Tensor dz; + framework::Tensor dscale; + framework::Tensor dbias; + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_dy_, place, &dy); + TensorCopySync(cpu_x_, place, &x); + TensorCopySync(cpu_bn_scale_x_, place, &bn_scale); + TensorCopySync(cpu_bn_bias_x_, place, &bn_bias); + TensorCopySync(cpu_saved_mean_x_, place, &saved_mean); + TensorCopySync(cpu_saved_var_x_, place, &saved_var); + TensorCopySync(cpu_bitmask_, place, &bitmask); + + bn_scale.Resize({1, 1, 1, channels_}); + bn_bias.Resize({1, 1, 1, channels_}); + saved_mean.Resize({1, 1, 1, channels_}); + saved_var.Resize({1, 1, 1, channels_}); + + T *dy_ptr = dy.data(); + T *x_ptr = x.data(); + float *bn_scale_ptr = bn_scale.data(); + float *bn_bias_ptr = bn_bias.data(); + float *saved_mean_ptr = saved_mean.data(); + float *saved_var_ptr = saved_var.data(); + int32_t *bitmask_ptr = bitmask.data(); + T *dx_ptr = + dx.mutable_data({batch_size_, height_, width_, channels_}, place); + T *dz_ptr = + dz.mutable_data({batch_size_, height_, width_, channels_}, place); + float *dscale_ptr = dscale.mutable_data({1, 1, 1, channels_}, place); + float *dbias_ptr = dbias.mutable_data({1, 1, 1, channels_}, place); + + auto data_shape = framework::vectorize(x.dims()); + auto param_shape = framework::vectorize(bn_scale.dims()); + auto bitmask_shape = framework::vectorize(bitmask.dims()); + + std::string act_type = "relu"; + op::CudnnScaleBiasAddRelu sbar_op(ctx, act_type, true, false, data_shape, + param_shape, bitmask_shape); + sbar_op.Backward(ctx, dy_ptr, x_ptr, bn_scale_ptr, bn_bias_ptr, + saved_mean_ptr, saved_var_ptr, bitmask_ptr, dx_ptr, dz_ptr, + dscale_ptr, dbias_ptr, eps_); + + TensorCopySync(dx, platform::CPUPlace(), cpu_dx); + TensorCopySync(dz, platform::CPUPlace(), cpu_dz); + TensorCopySync(dscale, platform::CPUPlace(), cpu_dscale); + TensorCopySync(dbias, platform::CPUPlace(), cpu_dbias); + } + private: int batch_size_; int height_; @@ -357,24 +735,80 @@ class CudnnBNAddReluTester { int channels_; int ele_count_; + std::string act_type_; + bool fuse_add_; + bool has_shortcut_; + // Forward input framework::Tensor cpu_x_; - framework::Tensor cpu_sum_; - framework::Tensor 
cpu_sum_of_square_; - framework::Tensor cpu_bn_scale_; - framework::Tensor cpu_bn_bias_; + framework::Tensor cpu_bn_scale_x_; + framework::Tensor cpu_bn_bias_x_; + framework::Tensor cpu_z_; + framework::Tensor cpu_bn_scale_z_; + framework::Tensor cpu_bn_bias_z_; + + // Backward input + framework::Tensor cpu_dy_; + framework::Tensor cpu_bitmask_; + framework::Tensor cpu_saved_mean_x_; + framework::Tensor cpu_saved_var_x_; + framework::Tensor cpu_saved_mean_z_; + framework::Tensor cpu_saved_var_z_; + framework::Tensor cpu_saved_mean_base_x_; + framework::Tensor cpu_saved_var_base_x_; + framework::Tensor saved_reserve_space_x_; + framework::Tensor cpu_saved_mean_base_z_; + framework::Tensor cpu_saved_var_base_z_; + framework::Tensor saved_reserve_space_z_; + framework::Tensor cpu_y_base_; double eps_ = 1e-5; float momentum_ = 0.9; }; -TEST(CudnnBNAddReluForward, GPUCudnnBNAddReluForwardFp16) { +TEST(CudnnBNAddReluFp16, BNAdd) { + int batch_size = 4; + int height = 8; + int width = 8; + int channels = 64; + std::string act_type = ""; + bool has_shortcut = false; + FLAGS_cudnn_batchnorm_spatial_persistent = true; + for (auto fuse_add : {false, true}) { + CudnnBNAddReluTester test( + batch_size, height, width, channels, act_type, fuse_add, has_shortcut); + test.CheckForward(2e-3); + } +} + +TEST(CudnnBNAddReluFp16, BNAddRelu) { + int batch_size = 4; + int height = 8; + int width = 8; + int channels = 64; + std::string act_type = "relu"; + bool has_shortcut = false; + FLAGS_cudnn_batchnorm_spatial_persistent = true; + for (auto fuse_add : {false, true}) { + CudnnBNAddReluTester test( + batch_size, height, width, channels, act_type, fuse_add, has_shortcut); + test.CheckForward(2e-3); + if (fuse_add) { + test.CheckBackward(2e-4); + } + } +} + +TEST(CudnnBNAddReluFp16, HasShortcut) { int batch_size = 4; int height = 8; int width = 8; int channels = 64; + std::string act_type = ""; + bool fuse_add = false; + bool has_shortcut = true; FLAGS_cudnn_batchnorm_spatial_persistent = true; - CudnnBNAddReluTester test(batch_size, height, - width, channels); - test.CheckForward(2e-3); + CudnnBNAddReluTester test( + batch_size, height, width, channels, act_type, fuse_add, has_shortcut); + test.CheckForward(5e-3); } diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index fff7b327f3f2ecc5b44dd098f0029ba6843a9ec5..4c14029b99c69cb07a40faf19633783a22b78583 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -92,10 +92,9 @@ void CheckOutput(const framework::Tensor &cpu_res, } // Use Paddle conv2d op results as baseline -template void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, const Tensor &cpu_input, const Tensor &cpu_filter, - Tensor *cpu_output) { + Tensor *cpu_output, int stride, int padding) { framework::Scope scope; auto *input = scope.Var("Input")->GetMutable(); auto *filter = scope.Var("Filter")->GetMutable(); @@ -108,10 +107,12 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, framework::AttributeMap attrs; bool use_cudnn = true; std::string data_format = "NHWC"; - std::string padding_algorithm = "SAME"; + std::vector strides = {stride, stride}; + std::vector paddings = {padding, padding}; + attrs.insert({"strides", strides}); + attrs.insert({"paddings", paddings}); attrs.insert({"use_cudnn", use_cudnn}); attrs.insert({"data_format", data_format}); - attrs.insert({"padding_algorithm", padding_algorithm}); auto op = 
framework::OpRegistry::CreateOp( "conv2d", {{"Input", {"Input"}}, {"Filter", {"Filter"}}}, @@ -122,7 +123,6 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, } // Use Paddle conv2d_grad op results as baseline -template void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, const Tensor &cpu_input, const Tensor &cpu_filter, const Tensor &cpu_output_grad, @@ -147,7 +147,7 @@ void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, framework::AttributeMap attrs; bool use_cudnn = true; std::string data_format = "NHWC"; - std::string padding_algorithm = "SAME"; + std::string padding_algorithm = "EXPLICIT"; std::vector strides = {stride, stride}; std::vector paddings = {padding, padding}; std::vector dilations = {dilation, dilation}; @@ -216,6 +216,8 @@ class CudnnNormConvolutionTester { kernel_size_ = kernel_size; stride_ = stride; padding_ = (kernel_size_ - 1) / 2; + out_height_ = (height_ + 2 * padding_ - kernel_size_) / stride_ + 1; + out_width_ = (width_ + 2 * padding_ - kernel_size_) / stride_ + 1; SetUp(); } @@ -227,6 +229,15 @@ class CudnnNormConvolutionTester { platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(0))); + if (!Support(*ctx)) { + LOG(INFO) + << "Current test is only supported in the platforms with " + << "compatiblity greater than or equal to 70 and the kernel size " + << "must be equal to 1 or 3. Besides, when the kernel size is 1, " + << "the stride must be 1 if the compatiblity is equal to 70."; + return; + } + framework::Tensor cpu_output_base; framework::Tensor cpu_sum_base; framework::Tensor cpu_sum_of_square_base; @@ -277,15 +288,17 @@ class CudnnNormConvolutionTester { &cpu_filter_nchw_); // transpoes for filter, NCHW -> NHWC TransposeNchwToNhwc(cpu_filter_nchw_, &cpu_filter_nhwc_); - InitRandomTensor({batch_size_, height_, width_, output_channels_}, - &cpu_output_grad_); + InitRandomTensor( + {batch_size_, out_height_, out_width_, output_channels_}, + &cpu_output_grad_); } void BaselineForward(const platform::CUDADeviceContext &ctx, framework::Tensor *cpu_output_base, framework::Tensor *cpu_sum_base, framework::Tensor *cpu_sum_of_square_base) { - ComputeConv2DForward(ctx, cpu_input_, cpu_filter_nchw_, cpu_output_base); + ComputeConv2DForward(ctx, cpu_input_, cpu_filter_nchw_, cpu_output_base, + stride_, padding_); ComputeSumAndSquareSum(*cpu_output_base, cpu_sum_base, cpu_sum_of_square_base); } @@ -293,10 +306,9 @@ class CudnnNormConvolutionTester { void BaselineBackward(const platform::CUDADeviceContext &ctx, framework::Tensor *cpu_input_grad_base, framework::Tensor *cpu_filter_grad_base) { - ComputeConv2DBackward(ctx, cpu_input_, cpu_filter_nchw_, - cpu_output_grad_, cpu_input_grad_base, - cpu_filter_grad_base, stride_, padding_, - dilation_); + ComputeConv2DBackward(ctx, cpu_input_, cpu_filter_nchw_, cpu_output_grad_, + cpu_input_grad_base, cpu_filter_grad_base, stride_, + padding_, dilation_); } // get forward results of cudnn_norm_conv @@ -316,7 +328,7 @@ class CudnnNormConvolutionTester { T *input_ptr = input.data(); T *filter_ptr = filter_nhwc.data(); T *output_ptr = output.mutable_data( - {batch_size_, height_, width_, output_channels_}, place); + {batch_size_, out_height_, out_width_, output_channels_}, place); float *sum_ptr = sum.mutable_data({1, 1, 1, output_channels_}, place); float *sum_of_square_ptr = @@ -369,10 +381,25 @@ class CudnnNormConvolutionTester { TensorCopySync(filter_grad, platform::CPUPlace(), cpu_filter_grad); } + bool Support(const platform::CUDADeviceContext &ctx) { + if 
(ctx.GetComputeCapability() == 70) { + if ((kernel_size_ == 3) || ((kernel_size_ == 1) && (stride_ == 1))) { + return true; + } + } else if (ctx.GetComputeCapability() > 70) { + if ((kernel_size_ == 3) || (kernel_size_ == 1)) { + return true; + } + } + return false; + } + private: int batch_size_; int height_; int width_; + int out_height_; + int out_width_; int input_channels_; int output_channels_; int kernel_size_; @@ -437,3 +464,19 @@ TEST(CudnnNormConvFp16, K1S1O4) { test.CheckForward(1e-3, true); test.CheckBackward(1e-3, true); } + +// test for fp16, kernel = 1, stride = 2, output_channels = input_channels * 4 +TEST(CudnnNormConvFp16, K1S2O4) { + int batch_size = 4; + int height = 8; + int width = 8; + int input_channels = 32; + int output_channels = 128; + int kernel_size = 1; + int stride = 2; + CudnnNormConvolutionTester<paddle::platform::float16> test( + batch_size, height, width, input_channels, output_channels, kernel_size, + stride); + test.CheckForward(1e-3, true); + test.CheckBackward(1e-3); +}
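
For readers of this patch, the following is a minimal standalone sketch of the CPU reference that CheckForward effectively compares against in the non-shortcut cases: batch norm on X, an optional element-wise add of Z (ComputeInplaceAdd), and an optional ReLU (ComputeInplaceRelu). The names ReferenceBN and ReferenceBNAddRelu are illustrative only; the test itself obtains its baseline by running the batch_norm / fused_bn_add_activation operators, and the has_shortcut branch (a second batch norm applied to Z before the add) is omitted here.

    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Per-channel batch norm over an NHWC buffer, using the biased batch
    // statistics (the same convention as the saved mean/variance in the test).
    void ReferenceBN(const std::vector<float> &x, int64_t n, int64_t h,
                     int64_t w, int64_t c, const std::vector<float> &scale,
                     const std::vector<float> &bias, float eps,
                     std::vector<float> *y) {
      const int64_t nhw = n * h * w;
      y->resize(x.size());
      for (int64_t ci = 0; ci < c; ++ci) {
        double sum = 0.0, sum_of_square = 0.0;
        for (int64_t i = 0; i < nhw; ++i) {
          double v = x[i * c + ci];
          sum += v;
          sum_of_square += v * v;
        }
        double mean = sum / nhw;
        double var = sum_of_square / nhw - mean * mean;
        double inv_std = 1.0 / std::sqrt(var + eps);
        for (int64_t i = 0; i < nhw; ++i) {
          (*y)[i * c + ci] = static_cast<float>(
              scale[ci] * (x[i * c + ci] - mean) * inv_std + bias[ci]);
        }
      }
    }

    // Mirrors BaselineForward for the non-shortcut cases:
    //   y = BN(x); if (fuse_add) y += z; if (act == "relu") y = max(y, 0).
    void ReferenceBNAddRelu(const std::vector<float> &x,
                            const std::vector<float> &z, int64_t n, int64_t h,
                            int64_t w, int64_t c,
                            const std::vector<float> &scale,
                            const std::vector<float> &bias, bool fuse_add,
                            bool with_relu, float eps, std::vector<float> *y) {
      ReferenceBN(x, n, h, w, c, scale, bias, eps, y);
      if (fuse_add) {
        for (size_t i = 0; i < y->size(); ++i) {
          (*y)[i] += z[i];  // same effect as ComputeInplaceAdd
        }
      }
      if (with_relu) {
        for (float &v : *y) {
          v = v > 0.0f ? v : 0.0f;  // same effect as ComputeInplaceRelu
        }
      }
    }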
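
The new K1S2O4 case is the first one with stride > 1, which is why the tester now computes out_height_ and out_width_ instead of assuming the output keeps the input spatial size. A small sketch of that arithmetic (padding of (k - 1) / 2 and integer division, exactly as written in the tester) for the shapes used by these tests:

    #include <cstdio>

    // Same formula as the tester:
    //   padding = (kernel_size - 1) / 2
    //   out     = (in + 2 * padding - kernel_size) / stride + 1  (integer division)
    int ConvOutSize(int in, int kernel_size, int stride) {
      int padding = (kernel_size - 1) / 2;
      return (in + 2 * padding - kernel_size) / stride + 1;
    }

    int main() {
      // For the 8x8 inputs used in these tests:
      //   kernel 3, stride 1: (8 + 2 - 3) / 1 + 1 = 8  -> output stays 8x8
      //   kernel 1, stride 1: (8 + 0 - 1) / 1 + 1 = 8  -> output stays 8x8
      //   kernel 1, stride 2: (8 + 0 - 1) / 2 + 1 = 4  -> K1S2O4 produces 4x4,
      //     which is also the shape now used to initialize cpu_output_grad_.
      std::printf("%d %d %d\n", ConvOutSize(8, 3, 1), ConvOutSize(8, 1, 1),
                  ConvOutSize(8, 1, 2));
      return 0;
    }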