/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/float16.h"

namespace ops = paddle::operators;
namespace plat = paddle::platform;

namespace paddle {
namespace operators {

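// Gradient kernel for z = x - y when dout, dx and dy all share one shape:
// dx = dout and dy = -dout, elementwise. The grid-stride loop lets a single
// launch cover tensors larger than the grid; dx may be nullptr when only dy
// is requested.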
template <typename T>
static __global__ void SimpleElemwiseSubGradCUDAKernel(const T* dout,
                                                       int64_t size, T* dx,
                                                       T* dy) {
  int col = blockIdx.x * blockDim.x + threadIdx.x;

  while (col < size) {
    if (dx != nullptr) {
      dx[col] = dout[col];
    }
    dy[col] = -dout[col];
    col += blockDim.x * gridDim.x;
  }
}

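// General (broadcast-aware) backward pass, enabled via SFINAE only for the
// CUDA device context. Where a gradient has the same shape as dout it is
// produced directly (copy for dx, elementwise negation for dy); otherwise
// dout is reduce-summed over the broadcast dimensions.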
template <typename DeviceContext, typename T>
typename std::enable_if<
    std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
default_elementwise_sub_grad(const framework::ExecutionContext& ctx,
                             const framework::Tensor* x,
                             const framework::Tensor* y,
                             const framework::Tensor* out,
                             const framework::Tensor* dout,
                             framework::Tensor* dx, framework::Tensor* dy) {
  int axis = ctx.Attr<int>("axis");
  auto* dout_data = dout->data<T>();
  // Gradient w.r.t. x: dx = dout.
  if (dx != nullptr) {
    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
    if (dx->dims() == dout->dims()) {
      if (dx_data != dout_data) {
        framework::TensorCopy(
            *dout, ctx.GetPlace(),
            ctx.template device_context<platform::DeviceContext>(), dx);
      }
    } else {
      // Under the inplace strategy dx shares dout's buffer, so reducing into
      // dx would corrupt dout and make the dy result wrong; give dx fresh
      // storage first.
      if (dx->IsSharedBufferWith(*dout)) {
        dx->clear();
        dx->mutable_data<T>(x->dims(), ctx.GetPlace());
      }
      std::vector<int> reduce_dims = GetReduceDim(x->dims(), out->dims(), axis);
      gpuStream_t stream = ctx.cuda_device_context().stream();
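      // dx = sum(dout) over the dimensions along which x was broadcast.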
      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
          *dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
    }
  }
  // Gradient w.r.t. y: dy = -dout.
  if (dy != nullptr) {
    auto* dy_data = dy->mutable_data<T>(ctx.GetPlace());
    if (dy->dims() == dout->dims()) {
      if (dy_data != dout_data) {
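        // Pass nullptr for dx so the kernel only writes dy = -dout.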
        dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1);
        auto size = dy->numel();
        dim3 grid_size = dim3(
            (size + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE, 1);
        SimpleElemwiseSubGradCUDAKernel<T><<<
            grid_size, block_size, 0,
            ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
            dout->data<T>(), size, nullptr,
            dy->mutable_data<T>(ctx.GetPlace()));
      }
    } else {
      std::vector<int> reduce_dims = GetReduceDim(y->dims(), out->dims(), axis);
      gpuStream_t stream = ctx.cuda_device_context().stream();
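      // dy = -sum(dout) over the broadcast dimensions; kps::InverseFunctor
      // supplies the elementwise negation.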
      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::InverseFunctor<T>>(
          *dout, dy, kps::InverseFunctor<T>(), reduce_dims, stream);
    }
  }
}

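// Fast path used when x, y and dout share the same shape: one kernel launch
// writes both dx = dout and dy = -dout, with no reduction needed.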
template <typename DeviceContext, typename T>
typename std::enable_if<
    std::is_same<DeviceContext, plat::CUDADeviceContext>::value>::type
elementwise_sub_grad(const framework::ExecutionContext& ctx,
                     const framework::Tensor* x, const framework::Tensor* y,
                     const framework::Tensor* out,
                     const framework::Tensor* dout, framework::Tensor* dx,
                     framework::Tensor* dy) {
  dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1);
  auto size = x->numel();
  dim3 grid_size =
      dim3((size + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE, 1);
  SimpleElemwiseSubGradCUDAKernel<
      T><<<grid_size, block_size, 0,
           ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
      dout->data<T>(), size, dx->mutable_data<T>(ctx.GetPlace()),
      dy->mutable_data<T>(ctx.GetPlace()));
}

}  // namespace operators
}  // namespace paddle

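// Register the forward, gradient and double-gradient CUDA kernels for each
// supported element type.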
REGISTER_OP_CUDA_KERNEL(
    elementwise_sub,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
                              paddle::platform::float16>,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
                              paddle::platform::complex<float>>,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
                              paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
    elementwise_sub_grad,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
                                  paddle::platform::float16>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
                                  paddle::platform::complex<float>>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
                                  paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
    elementwise_sub_grad_grad,
    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CUDADeviceContext,
                                        float>,
    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CUDADeviceContext,
                                        double>,
    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CUDADeviceContext,
                                        int>,
    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CUDADeviceContext,
                                        int64_t>,
    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CUDADeviceContext,
                                        paddle::platform::complex<float>>,
    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CUDADeviceContext,
                                        paddle::platform::complex<double>>);