math_function_impl.h 9.8 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
2 3 4 5 6 7 8 9 10 11 12 13 14

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

15
#pragma once
16
#include <memory>
17
#include <vector>
18

19
#include "paddle/phi/common/data_type.h"
20
#include "paddle/phi/kernels/funcs/math_function.h"
21

22
namespace phi {
23
namespace funcs {
24

25
using phi::To32BitIndex;
26

Q
QI JUN 已提交
27
template <typename DeviceContext, typename T>
28 29 30
void SetConstant<DeviceContext, T>::operator()(const DeviceContext& context,
                                               phi::DenseTensor* tensor,
                                               T num) {
31
  auto t = phi::EigenVector<T>::Flatten(*tensor);
32 33 34
  t.device(*context.eigen_device()) = t.constant(static_cast<T>(num));
}

35
#ifdef PADDLE_WITH_XPU
36 37
// SetConstant for XPUContext: builds a TensorSetConstantXPU<T> functor from
// (tensor, num, place) and hands it to phi::VisitDataType together with the
// tensor's dtype.
//
// context: XPU context; only its place is read here.
// tensor:  tensor handed to the functor.
// num:     value handed to the functor.
template <typename T>
void SetConstant<XPUContext, T>::operator()(const XPUContext& context,
                                            phi::DenseTensor* tensor,
                                            T num) {
  const auto dtype = tensor->dtype();
  const auto& place = context.GetPlace();
  phi::VisitDataType(dtype, TensorSetConstantXPU<T>(tensor, num, place));
}
43 44 45
// SetConstant for paddle::platform::XPUDeviceContext: same dispatch as the
// XPUContext variant -- a TensorSetConstantXPU<T> functor built from
// (tensor, num, place) is passed to phi::VisitDataType with the tensor's
// dtype.
//
// context: XPU device context; only its place is read here.
// tensor:  tensor handed to the functor.
// num:     value handed to the functor.
template <typename T>
void SetConstant<paddle::platform::XPUDeviceContext, T>::operator()(
    const paddle::platform::XPUDeviceContext& context,
    phi::DenseTensor* tensor,
    T num) {
  const auto dtype = tensor->dtype();
  const auto& place = context.GetPlace();
  phi::VisitDataType(dtype, TensorSetConstantXPU<T>(tensor, num, place));
}
#endif
52

Q
QI JUN 已提交
53 54
// Transposes `in` into `out` by shuffling its dimensions with Eigen.
//
// context: device context; supplies the Eigen device and the place.
// in:      source tensor, viewed as a Rank-dimensional Eigen tensor.
// out:     destination tensor, viewed as a Rank-dimensional Eigen tensor.
// axis:    permutation; entries [0, Rank) are copied into the Eigen
//          shuffle array. Must contain at least Rank entries.
//
// Raises InvalidArgument when `axis` has fewer than Rank entries (the loop
// below would otherwise read past the end of the vector).
template <typename DeviceContext, typename T, int Rank>
void Transpose<DeviceContext, T, Rank>::operator()(
    const DeviceContext& context,
    const phi::DenseTensor& in,
    phi::DenseTensor* out,
    const std::vector<int>& axis) {
  PADDLE_ENFORCE_GE(
      axis.size(),
      static_cast<size_t>(Rank),
      phi::errors::InvalidArgument(
          "The size of axis should not be less than the rank of the "
          "tensor. Expected at least %d, but received %d",
          Rank,
          axis.size()));

  Eigen::array<int, Rank> permute;
  for (int i = 0; i < Rank; i++) {
    permute[i] = axis[i];
  }
  auto eigen_in = phi::EigenTensor<T, Rank>::From(in);
  auto eigen_out = phi::EigenTensor<T, Rank>::From(*out);
  auto* dev = context.eigen_device();
  // use 32bit index to speed up computation
  bool use_32bit_index = eigen_out.size() < Eigen::NumTraits<int>::highest();
  bool is_gpu_place = paddle::platform::is_gpu_place(context.GetPlace());
  if (use_32bit_index && is_gpu_place) {
    // Only taken on GPU places and when the element count fits in an int.
    To32BitIndex(eigen_out).device(*dev) =
        To32BitIndex(eigen_in).shuffle(permute);
  } else {
    eigen_out.device(*dev) = eigen_in.shuffle(permute);
  }
}
76

Q
QI JUN 已提交
77
template <typename DeviceContext, typename T>
78 79 80
void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
                                              const phi::DenseTensor& input,
                                              phi::DenseTensor* out) {
81 82
  auto in_dims = input.dims();
  auto size = input.numel() / in_dims[0];
83 84
  PADDLE_ENFORCE_EQ(out->numel(),
                    size,
85
                    phi::errors::InvalidArgument(
86 87 88
                        "The size of output tensor "
                        "should be equal to the size of input tensor column"
                        " dimension. Expected output size=%d, but received %d",
89 90
                        size,
                        out->numel()));
91

92 93
  auto in = phi::EigenMatrix<T>::From(input);
  auto vec = phi::EigenVector<T>::Flatten(*out);
Y
Yu Yang 已提交
94 95

  vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{0}}));
96
}
97

Y
Yu Yang 已提交
98 99 100 101
// Specialize for CPU, since Eigen implement a general reduce. However,
// colwise-sum can be easily implemented. General reduce has a huge overhead in
// CPU
template <typename T>
L
Leo Chen 已提交
102
class ColwiseSum<phi::CPUContext, T> {
Y
Yu Yang 已提交
103
 public:
L
Leo Chen 已提交
104
  void operator()(const phi::CPUContext& context,
105 106
                  const phi::DenseTensor& input,
                  phi::DenseTensor* out) {
Y
Yu Yang 已提交
107 108 109
    auto& in_dims = input.dims();
    auto height = in_dims[0];
    auto size = in_dims[1];
110
    PADDLE_ENFORCE_EQ(
111 112
        out->numel(),
        size,
113
        phi::errors::InvalidArgument(
114 115 116
            "The size of output tensor "
            "should be equal to the size of input tensor column"
            " dimension. Expected output size=%d, but received %d",
117 118
            size,
            out->numel()));
Y
Yu Yang 已提交
119

120
    T* out_buf = context.template Alloc<T>(out);
Y
Yu Yang 已提交
121 122
    const T* in_buf = input.data<T>();

Q
qiaolongfei 已提交
123 124
    for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
      for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
Y
Yu Yang 已提交
125 126 127 128 129 130 131 132 133 134
        if (i == 0) {
          out_buf[j] = in_buf[i * size + j];
        } else {
          out_buf[j] += in_buf[i * size + j];
        }
      }
    }
  }
};

C
chengduoZH 已提交
135
template <typename DeviceContext, typename T>
136 137 138
void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context,
                                               const phi::DenseTensor& input,
                                               phi::DenseTensor* out) {
C
chengduoZH 已提交
139
  auto in_dims = input.dims();
140 141 142 143 144
  PADDLE_ENFORCE_EQ(in_dims.size(),
                    2U,
                    phi::errors::InvalidArgument("The rank of input tensor "
                                                 "should be 2, but received %d",
                                                 in_dims.size()));
145 146
  PADDLE_ENFORCE_EQ(out->numel(),
                    in_dims[0],
147
                    phi::errors::InvalidArgument(
148 149 150
                        "The size of output tensor "
                        "should be equal to the size of input tensor row"
                        " dimension. Expected output size=%d, but received %d",
151 152
                        in_dims[0],
                        out->numel()));
C
chengduoZH 已提交
153

154 155
  auto in = phi::EigenMatrix<T>::From(input);
  auto vec = phi::EigenVector<T>::Flatten(*out);
C
chengduoZH 已提交
156 157 158 159 160 161 162 163

  vec.device(*context.eigen_device()) = in.mean(Eigen::array<int, 1>({{1}}));
}
// TODO(zcd): Following ColwiseSum format, need to confirm.
// Specialize for CPU, since Eigen implement a general reduce. However,
// rowwise-sum can be easily implemented. General reduce has a huge overhead in
// CPU
template <typename T>
L
Leo Chen 已提交
164
class RowwiseMean<phi::CPUContext, T> {
C
chengduoZH 已提交
165
 public:
L
Leo Chen 已提交
166
  void operator()(const phi::CPUContext& context,
167 168
                  const phi::DenseTensor& input,
                  phi::DenseTensor* out) {
C
chengduoZH 已提交
169
    auto& in_dims = input.dims();
170 171 172 173 174 175
    PADDLE_ENFORCE_EQ(
        in_dims.size(),
        2U,
        phi::errors::InvalidArgument("The rank of input tensor "
                                     "should be 2, but received %d",
                                     in_dims.size()));
C
chengduoZH 已提交
176 177
    auto height = in_dims[0];
    auto size = in_dims[1];
178
    PADDLE_ENFORCE_EQ(
179 180
        out->numel(),
        height,
181
        phi::errors::InvalidArgument(
182 183 184
            "The size of output tensor "
            "should be equal to the size of input tensor row"
            " dimension. Expected output size=%d, but received %d",
185 186
            height,
            out->numel()));
C
chengduoZH 已提交
187
    auto inv_size = 1.0 / size;
188
    T* out_buf = context.template Alloc<T>(out);
C
chengduoZH 已提交
189 190 191 192 193 194 195 196 197 198 199 200 201
    const T* in_buf = input.data<T>();

    for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
      T sum = 0;
      for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
        sum += in_buf[i * size + j];
      }
      out_buf[i] = sum * inv_size;
    }
  }
};

template <typename DeviceContext, typename T>
202 203 204
void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
                                              const phi::DenseTensor& input,
                                              phi::DenseTensor* out) {
C
chengduoZH 已提交
205
  auto in_dims = input.dims();
206 207 208 209 210
  PADDLE_ENFORCE_EQ(in_dims.size(),
                    2U,
                    phi::errors::InvalidArgument("The rank of input tensor "
                                                 "should be 2, but received %d",
                                                 in_dims.size()));
211 212
  PADDLE_ENFORCE_EQ(out->numel(),
                    in_dims[0],
213
                    phi::errors::InvalidArgument(
214 215 216
                        "The size of output tensor "
                        "should be equal to the size of input tensor row"
                        " dimension. Expected output size=%d, but received %d",
217 218
                        in_dims[0],
                        out->numel()));
C
chengduoZH 已提交
219

220 221
  auto in = phi::EigenMatrix<T>::From(input);
  auto vec = phi::EigenVector<T>::Flatten(*out);
C
chengduoZH 已提交
222 223 224 225 226 227 228 229

  vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{1}}));
}
// TODO(zcd): Following ColwiseSum format, need to confirm.
// Specialize for CPU, since Eigen implement a general reduce. However,
// rowwise-sum can be easily implemented. General reduce has a huge overhead in
// CPU
template <typename T>
L
Leo Chen 已提交
230
class RowwiseSum<phi::CPUContext, T> {
C
chengduoZH 已提交
231
 public:
L
Leo Chen 已提交
232
  void operator()(const phi::CPUContext& context,
233 234
                  const phi::DenseTensor& input,
                  phi::DenseTensor* out) {
C
chengduoZH 已提交
235
    auto& in_dims = input.dims();
236 237 238 239 240 241
    PADDLE_ENFORCE_EQ(
        in_dims.size(),
        2U,
        phi::errors::InvalidArgument("The rank of input tensor "
                                     "should be 2, but received %d",
                                     in_dims.size()));
C
chengduoZH 已提交
242 243
    auto height = in_dims[0];
    auto size = in_dims[1];
244
    PADDLE_ENFORCE_EQ(
245 246
        out->numel(),
        height,
247
        phi::errors::InvalidArgument(
248 249 250
            "The size of output tensor "
            "should be equal to the size of input tensor row"
            " dimension. Expected output size=%d, but received %d",
251 252
            height,
            out->numel()));
C
chengduoZH 已提交
253

254
    T* out_buf = context.template Alloc<T>(out);
C
chengduoZH 已提交
255 256 257 258 259 260 261 262 263 264 265 266
    const T* in_buf = input.data<T>();

    for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
      T sum = 0;
      for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
        sum += in_buf[i * size + j];
      }
      out_buf[i] = sum;
    }
  }
};

267
}  // namespace funcs
268
}  // namespace phi