// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <algorithm>
#include <cfloat>
#include <string>
#include <vector>

#include "cub/cub.cuh"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/fused/fused_bn_activation_op.h"
#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/kernels/funcs/math_function.h"

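// FLAGS_cudnn_batchnorm_spatial_persistent is defined elsewhere in Paddle and
// toggles cuDNN's spatial-persistent batch norm algorithm; the fused kernels
// below always use the spatial-persistent mode.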
DECLARE_bool(cudnn_batchnorm_spatial_persistent);

namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;

template <typename T>
class FusedBatchNormActKernel<phi::GPUContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx.GetPlace()),
        true,
        platform::errors::PreconditionNotMet("It must use CUDAPlace."));
    auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
    float momentum = ctx.Attr<float>("momentum");
    std::string act_type = ctx.Attr<std::string>("act_type");

    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
      LOG(ERROR) << "Provided epsilon is smaller than "
                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
                 << "CUDNN_BN_MIN_EPSILON instead.";
    }
    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);

    // Get the size for each dimension.
    // NHWC [batch_size, in_height, in_width, in_channels]
    const auto *x = ctx.Input<Tensor>("X");
    const auto &x_dims = x->dims();
    PADDLE_ENFORCE_EQ(x_dims.size() >= 2 && x_dims.size() <= 5,
                      true,
                      platform::errors::PreconditionNotMet(
                          "The Input dim size should be between 2 and 5"));

    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *bias = ctx.Input<Tensor>("Bias");

    // Run training mode.
    // obtain running mean and running inv var, and see if we need to
    // initialize them.
    auto *mean_out = ctx.Output<Tensor>("MeanOut");
    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
    dev_ctx.Alloc<BatchNormParamType<T>>(
        mean_out, mean_out->numel() * sizeof(BatchNormParamType<T>));
    dev_ctx.Alloc<BatchNormParamType<T>>(
        variance_out, variance_out->numel() * sizeof(BatchNormParamType<T>));

    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
    dev_ctx.Alloc<BatchNormParamType<T>>(
        saved_mean, saved_mean->numel() * sizeof(BatchNormParamType<T>));
    dev_ctx.Alloc<BatchNormParamType<T>>(
        saved_variance,
        saved_variance->numel() * sizeof(BatchNormParamType<T>));

    auto *y = ctx.Output<Tensor>("Y");
    dev_ctx.Alloc<T>(y, y->numel() * sizeof(T));

    int N, C, H, W, D;
    const DataLayout data_layout = DataLayout::kNHWC;
    ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);

    if ((N * H * W * D) == 1) {
      // Only 1 element in the normalization dimension, so skip the batch norm
      // calculation and let y = act(x).
      auto x_v = framework::EigenVector<T>::Flatten(*x);
      auto y_v = framework::EigenVector<T>::Flatten(*y);
      auto &dev = *dev_ctx.eigen_device();
      if (act_type == "relu") {
        ReluCUDAFunctor<T>()(dev, x_v, y_v);
      } else {
        PADDLE_THROW(
            platform::errors::Unimplemented("Unsupported activation type"));
      }
      return;
    }

    // ------------------- cudnn descriptors ---------------------
    auto handle = dev_ctx.cudnn_handle();
    cudnnTensorDescriptor_t data_desc_;
    cudnnTensorDescriptor_t bn_param_desc_;
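    // cuDNN supports the fused BN + activation path
    // (CUDNN_BATCHNORM_OPS_BN_ACTIVATION) only in spatial-persistent mode with
    // NHWC input, so the mode is fixed here.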
    cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;

    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));

    VLOG(3) << "Setting descriptors.";
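    // Describe the NHWC(D) input to cuDNN: dimension sizes are passed in NCHW
    // order and the channels-last memory layout is encoded via the strides.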
    std::vector<int> dims = {N, C, H, W, D};
    std::vector<int> strides = {H * W * D * C, 1, W * D * C, D * C, C};

    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
        data_desc_,
        CudnnDataType<T>::type,
        x_dims.size() > 3 ? x_dims.size() : 4,
        dims.data(),
        strides.data()));

    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor(
        bn_param_desc_, data_desc_, mode_));

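    // cuDNN updates the running statistics as
    //   running = (1 - factor) * running + factor * batch_stat,
    // so Paddle's momentum translates to factor = 1 - momentum.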
    double this_factor = 1. - momentum;
    cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION;
    platform::ScopedActivationDescriptor scope_act_desc;
    cudnnActivationDescriptor_t activation_desc_ =
        scope_act_desc.descriptor<T>(act_type);
    size_t workspace_size = 0;
    size_t reserve_space_size = 0;
    void *reserve_space_ptr = nullptr;
    void *workspace_ptr = nullptr;
    Tensor workspace_tensor;
    // Create the reserve space and workspace for batch norm.
    // The reserve space is created per batch_norm op and consumed by the
    // backward pass, so it must not be a temporary tensor.
    auto *reserve_space = ctx.Output<Tensor>("ReserveSpace");
    PADDLE_ENFORCE_NOT_NULL(
        reserve_space,
        platform::errors::NotFound(
            "The argument ReserveSpace of batch_norm op is not found."));

    // --------------- cudnn batchnorm workspace ---------------
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::
            cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
                /*handle=*/handle,
                /*mode=*/mode_,
                /*bnOps=*/bnOps_,
                /*xDesc=*/data_desc_,
                /*zDesc=*/nullptr,
                /*yDesc=*/data_desc_,
                /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
                /*activationDesc=*/activation_desc_,
                /*sizeInBytes=*/&workspace_size));

    // -------------- cudnn batchnorm reserve space --------------
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
            /*handle=*/handle,
            /*mode=*/mode_,
            /*bnOps=*/bnOps_,
            /*activationDesc=*/activation_desc_,
            /*xDesc=*/data_desc_,
            /*sizeInBytes=*/&reserve_space_size));

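    // Size the reserve space and workspace in elements of X's dtype, rounding
    // the byte counts reported by cuDNN up to whole elements.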
    reserve_space->Resize({static_cast<int64_t>(
        (reserve_space_size + experimental::SizeOf(x->dtype()) - 1) /
        experimental::SizeOf(x->dtype()))});
    reserve_space_ptr =
        dev_ctx.Alloc<T>(reserve_space, reserve_space->numel() * sizeof(T));
    workspace_tensor.Resize({static_cast<int64_t>(
        (workspace_size + experimental::SizeOf(x->dtype()) - 1) /
        experimental::SizeOf(x->dtype()))});
    workspace_ptr = dev_ctx.Alloc<T>(&workspace_tensor,
                                     workspace_tensor.numel() * sizeof(T));

    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
            handle,
            mode_,
            bnOps_,
            CudnnDataType<T>::kOne(),
            CudnnDataType<T>::kZero(),
            data_desc_,
            x->template data<T>(),
            nullptr,
            nullptr,
            data_desc_,
            y->template data<T>(),
            bn_param_desc_,
            scale->template data<BatchNormParamType<T>>(),
            bias->template data<BatchNormParamType<T>>(),
            this_factor,
            dev_ctx.template Alloc<BatchNormParamType<T>>(
                mean_out, mean_out->numel() * sizeof(BatchNormParamType<T>)),
            dev_ctx.template Alloc<BatchNormParamType<T>>(
                variance_out,
                variance_out->numel() * sizeof(BatchNormParamType<T>)),
            epsilon,
            dev_ctx.template Alloc<BatchNormParamType<T>>(
                saved_mean,
                saved_mean->numel() * sizeof(BatchNormParamType<T>)),
            dev_ctx.template Alloc<BatchNormParamType<T>>(
                saved_variance,
                saved_variance->numel() * sizeof(BatchNormParamType<T>)),
            activation_desc_,
            workspace_ptr,
            workspace_size,
            reserve_space_ptr,
            reserve_space_size));

    // Clean up on exit.
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
  }
};

template <typename T>
class FusedBatchNormActGradKernel<phi::GPUContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx.GetPlace()),
        true,
        platform::errors::PreconditionNotMet("It must use CUDAPlace."));
    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
    std::string act_type = ctx.Attr<std::string>("act_type");
    auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
    const auto *x = ctx.Input<Tensor>("X");
    const auto *y = ctx.Input<Tensor>("Y");
    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *bias = ctx.Input<Tensor>("Bias");
    const auto *reserve_space = ctx.Input<Tensor>("ReserveSpace");

    const auto &x_dims = x->dims();

    PADDLE_ENFORCE_EQ(x_dims.size() >= 2 && x_dims.size() <= 5,
                      true,
                      platform::errors::PreconditionNotMet(
                          "The Input dim size should be between 2 and 5"));
    int N, C, H, W, D;
    const DataLayout data_layout = DataLayout::kNHWC;
    ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);

    // init output
    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));

    dev_ctx.Alloc<T>(d_x, d_x->numel() * sizeof(T));
    PADDLE_ENFORCE_EQ(
        d_scale && d_bias,
        true,
        platform::errors::PreconditionNotMet(
            "Both the scale grad and the bias grad must not be null."));
    dev_ctx.Alloc<BatchNormParamType<T>>(
        d_scale, d_scale->numel() * sizeof(BatchNormParamType<T>));
    dev_ctx.Alloc<BatchNormParamType<T>>(
        d_bias, d_bias->numel() * sizeof(BatchNormParamType<T>));
    PADDLE_ENFORCE_EQ(scale->dims().size(),
                      1UL,
                      platform::errors::PreconditionNotMet(
                          "The scale only has one dimension."));
    PADDLE_ENFORCE_EQ(
        scale->dims()[0],
        C,
        platform::errors::PreconditionNotMet(
            "The size of scale is equal to the channel of Input(X)."));

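    // Mirror the forward special case: with a single element in the
    // normalization dimension, y = act(x), so only the activation gradient is
    // propagated and the scale/bias gradients are set to zero.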
    if ((N * H * W * D) == 1) {
      if (act_type == "relu") {
        auto x_v = framework::EigenVector<T>::Flatten(*x);
        auto y_v = framework::EigenVector<T>::Flatten(*y);
        auto dx_v = framework::EigenVector<T>::Flatten(*d_x);
        auto dy_v = framework::EigenVector<T>::Flatten(*d_y);
        auto &dev = *dev_ctx.eigen_device();
        ReluGradFunctor<T>()(dev, x_v, y_v, dy_v, dx_v);
      } else {
        PADDLE_THROW(
            platform::errors::Unimplemented("Unsupported activation type"));
      }
      phi::funcs::SetConstant<phi::GPUContext, BatchNormParamType<T>> functor;
      functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
      functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
      return;
    }

    std::vector<int> dims = {N, C, H, W, D};
    std::vector<int> strides = {H * W * C * D, 1, W * D * C, D * C, C};
    // ------------------- cudnn descriptors ---------------------
    cudnnTensorDescriptor_t data_desc_;
    cudnnTensorDescriptor_t bn_param_desc_;
    cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;

    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
      LOG(ERROR) << "Provided epsilon is smaller than "
                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
                 << "CUDNN_BN_MIN_EPSILON instead.";
    }
    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);

    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
        data_desc_,
        CudnnDataType<T>::type,
        x_dims.size() > 3 ? x_dims.size() : 4,
        dims.data(),
        strides.data()));
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor(
        bn_param_desc_, data_desc_, mode_));

    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
    const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
    const auto *saved_mean_data =
        saved_mean->template data<BatchNormParamType<T>>();
    const auto *saved_var_data =
        saved_var->template data<BatchNormParamType<T>>();

    size_t workspace_size = 0;
    void *workspace_ptr = nullptr;
    Tensor workspace_tensor;
    auto reserve_space_size = reserve_space->memory_size();
    cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION;
    platform::ScopedActivationDescriptor scope_act_desc;
    cudnnActivationDescriptor_t activation_desc_ =
        scope_act_desc.descriptor<T>(act_type);
    // --------------- cudnn batchnorm workspace ---------------
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize(
            /*handle=*/dev_ctx.cudnn_handle(),
            /*mode=*/mode_,
            /*bnOps=*/bnOps_,
            /*xDesc=*/data_desc_,
            /*yDesc=*/data_desc_,
            /*dyDesc=*/data_desc_,
            /*dzDesc=*/nullptr,
            /*dxDesc=*/data_desc_,
            /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
            /*activationDesc=*/activation_desc_,
            /*sizeInBytes=*/&workspace_size));

    workspace_tensor.Resize({static_cast<int64_t>(
        (workspace_size + experimental::SizeOf(x->dtype()) - 1) /
        experimental::SizeOf(x->dtype()))});
    workspace_ptr = dev_ctx.Alloc<T>(&workspace_tensor,
                                     workspace_tensor.numel() * sizeof(T));

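    // Fused backward: computes dX, dScale and dBias in a single call, applying
    // the activation gradient with the help of the reserve space saved by the
    // forward pass.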
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnBatchNormalizationBackwardEx(
            /*handle=*/dev_ctx.cudnn_handle(),
            /*mode=*/mode_,
            /*bnOps=*/bnOps_,
            /*alphaDataDiff=*/CudnnDataType<T>::kOne(),
            /*betaDataDiff=*/CudnnDataType<T>::kZero(),
            /*alphaParamDiff=*/CudnnDataType<T>::kOne(),
            /*betaParamDiff=*/CudnnDataType<T>::kZero(),
            /*xDesc=*/data_desc_,
            /*xData=*/x->template data<T>(),
            /*yDesc=*/data_desc_,
            /*yData=*/y->template data<T>(),
            /*dyDesc=*/data_desc_,
            /*dyData=*/d_y->template data<T>(),
            /*dzDesc=*/nullptr,
            /*dzData=*/nullptr,
            /*dxDesc=*/data_desc_,
            /*dxData=*/
            dev_ctx.template Alloc<T>(d_x, d_x->numel() * sizeof(T)),
            /*dBnScaleBiasDesc=*/bn_param_desc_,
            /*bnScaleData=*/scale->template data<BatchNormParamType<T>>(),
            /*bnBiasData=*/bias->template data<BatchNormParamType<T>>(),
            /*dBnScaleData=*/
            dev_ctx.template Alloc<BatchNormParamType<T>>(
                d_scale, d_scale->numel() * sizeof(BatchNormParamType<T>)),
            /*dBnBiasData=*/
            dev_ctx.template Alloc<BatchNormParamType<T>>(
                d_bias, d_bias->numel() * sizeof(BatchNormParamType<T>)),
            /*epsilon=*/epsilon,
            /*savedMean=*/saved_mean_data,
            /*savedInvVariance=*/saved_var_data,
            /*activationDesc=*/activation_desc_,
            /*workspace=*/workspace_ptr,
            /*workSpaceSizeInBytes=*/workspace_size,
            /*reserveSpace=*/const_cast<T *>(reserve_space->template data<T>()),
            /*reserveSpaceSizeInBytes=*/reserve_space_size));

    // Clean up on exit.
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
  }
};

}  // namespace operators
}  // namespace paddle

#if CUDNN_VERSION >= 7401
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
    fused_batch_norm_act,
    ops::FusedBatchNormActKernel<phi::GPUContext, float>,
    ops::FusedBatchNormActKernel<phi::GPUContext, double>,
    ops::FusedBatchNormActKernel<phi::GPUContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
    fused_batch_norm_act_grad,
    ops::FusedBatchNormActGradKernel<phi::GPUContext, float>,
    ops::FusedBatchNormActGradKernel<phi::GPUContext, double>,
    ops::FusedBatchNormActGradKernel<phi::GPUContext, plat::float16>);
#endif