elementwise_div_op.cu 7.7 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
G
gongweibao 已提交
2

L
Luo Tao 已提交
3 4 5
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
G
gongweibao 已提交
6

L
Luo Tao 已提交
7
    http://www.apache.org/licenses/LICENSE-2.0
G
gongweibao 已提交
8

L
Luo Tao 已提交
9 10 11 12 13
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
W
Wu Yi 已提交
14
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
15
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
16
#include "paddle/fluid/platform/complex.h"
W
Wu Yi 已提交
17
#include "paddle/fluid/platform/float16.h"
G
gongweibao 已提交
18 19

namespace ops = paddle::operators;
20 21 22 23 24
namespace plat = paddle::platform;

namespace paddle {
namespace operators {

25 26 27 28 29 30 31
template <typename T, typename Enable = void>
struct CudaDivFunctor {
  inline HOSTDEVICE T operator()(const T* args) const {
    return args[0] / args[1];
  }
};

32
template <typename T>
33 34 35 36 37 38 39
struct CudaDivFunctor<T,
                      typename std::enable_if_t<std::is_integral<T>::value>> {
  inline HOSTDEVICE T operator()(const T* args) const {
    PADDLE_ENFORCE(args[1] != 0,
                   "Invalid Argument Error: Integer division by zero "
                   "encountered in divide. Please check the input value.");
    return args[0] / args[1];
40 41 42
  }
};

43 44 45 46 47 48 49 50 51 52 53 54 55
template <typename T>
class ElementwiseDivKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    std::vector<const framework::Tensor*> ins;
    std::vector<framework::Tensor*> outs;
    const auto& cuda_ctx =
        ctx.template device_context<platform::CUDADeviceContext>();

    int axis = PackTensorsIntoVector<T>(ctx, &ins, &outs);
    LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
        cuda_ctx, ins, &outs, axis, CudaDivFunctor<T>());
56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
  }
};

template <typename T>
static __global__ void SimpleElemwiseDivGradCUDAKernel(const T* x, const T* y,
                                                       const T* out,
                                                       const T* dout,
                                                       int64_t size, T* dx,
                                                       T* dy) {
  int col = blockIdx.x * blockDim.x + threadIdx.x;

  while (col < size) {
    T o = dout[col];
    dx[col] = o / y[col];
    dy[col] = -o * out[col] / y[col];
    col += blockDim.x * gridDim.x;
  }
}

75
template <>
76 77 78 79 80 81 82 83
__global__ void
SimpleElemwiseDivGradCUDAKernel<paddle::platform::complex<float>>(
    const paddle::platform::complex<float>* x,
    const paddle::platform::complex<float>* y,
    const paddle::platform::complex<float>* out,
    const paddle::platform::complex<float>* dout, int64_t size,
    paddle::platform::complex<float>* dx,
    paddle::platform::complex<float>* dy) {
84 85 86
  int col = blockIdx.x * blockDim.x + threadIdx.x;

  while (col < size) {
87 88 89 90
    paddle::platform::complex<float> o = dout[col];
    paddle::platform::complex<float> y_conj(y[col].real, -y[col].imag);
    paddle::platform::complex<float> out_div_y_conj((out[col] / y[col]).real,
                                                    -(out[col] / y[col]).imag);
91 92 93 94 95 96 97
    dx[col] = o / y_conj;
    dy[col] = -o * out_div_y_conj;
    col += blockDim.x * gridDim.x;
  }
}

template <>
98 99 100 101 102 103 104 105
__global__ void
SimpleElemwiseDivGradCUDAKernel<paddle::platform::complex<double>>(
    const paddle::platform::complex<double>* x,
    const paddle::platform::complex<double>* y,
    const paddle::platform::complex<double>* out,
    const paddle::platform::complex<double>* dout, int64_t size,
    paddle::platform::complex<double>* dx,
    paddle::platform::complex<double>* dy) {
106 107 108
  int col = blockIdx.x * blockDim.x + threadIdx.x;

  while (col < size) {
109 110 111 112
    paddle::platform::complex<double> o = dout[col];
    paddle::platform::complex<double> y_conj(y[col].real, -y[col].imag);
    paddle::platform::complex<double> out_div_y_conj((out[col] / y[col]).real,
                                                     -(out[col] / y[col]).imag);
113 114 115 116 117 118
    dx[col] = o / y_conj;
    dy[col] = -o * out_div_y_conj;
    col += blockDim.x * gridDim.x;
  }
}

119 120 121 122 123 124 125 126 127 128
template <typename DeviceContext, typename T>
typename std::enable_if<
    std::is_same<DeviceContext, plat::CUDADeviceContext>::value>::type
elementwise_div_grad(const framework::ExecutionContext& ctx,
                     const framework::Tensor* x, const framework::Tensor* y,
                     const framework::Tensor* out,
                     const framework::Tensor* dout, framework::Tensor* dx,
                     framework::Tensor* dy) {
  dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
  auto size = x->numel();
129
  dim3 grid_size =
130 131
      dim3((size + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1);
  SimpleElemwiseDivGradCUDAKernel<
132
      T><<<grid_size, block_size, 0,
133 134 135 136 137 138 139
           ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
      x->data<T>(), y->data<T>(), out->data<T>(), dout->data<T>(), size,
      dx->mutable_data<T>(ctx.GetPlace()), dy->mutable_data<T>(ctx.GetPlace()));
}

}  // namespace operators
}  // namespace paddle
G
gongweibao 已提交
140

Q
QI JUN 已提交
141
REGISTER_OP_CUDA_KERNEL(
G
gongweibao 已提交
142
    elementwise_div,
Q
QI JUN 已提交
143
    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
W
Wu Yi 已提交
144 145
    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
                              paddle::platform::float16>,
Q
QI JUN 已提交
146 147
    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
148 149
    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>,
    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
150
                              paddle::platform::complex<float>>,
151
    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
152
                              paddle::platform::complex<double>>);
Q
QI JUN 已提交
153
REGISTER_OP_CUDA_KERNEL(
G
gongweibao 已提交
154
    elementwise_div_grad,
Q
QI JUN 已提交
155
    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
W
Wu Yi 已提交
156 157
    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
                                  paddle::platform::float16>,
Q
QI JUN 已提交
158 159
    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
160 161
    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
162
                                  paddle::platform::complex<float>>,
Q
QI JUN 已提交
163
    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
164
                                  paddle::platform::complex<double>>);
165 166 167 168
REGISTER_OP_CUDA_KERNEL(
    elementwise_div_grad_grad,
    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
                                        float>,
169 170
    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
                                        paddle::platform::float16>,
171 172 173 174 175
    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
                                        double>,
    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
                                        int>,
    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
176 177
                                        int64_t>,
    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
178
                                        paddle::platform::complex<float>>,
179
    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
180
                                        paddle::platform::complex<double>>);