/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/math/math_function.h"
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle {
namespace operators {
namespace math {

25 26 27 28 29 30 31 32 33 34 35
// Short local alias for the half-precision type declared in
// paddle/fluid/platform/float16.h.
using float16 = paddle::platform::float16;

// float16 specialization of matmul for CPUDeviceContext.
// The body unconditionally raises through PADDLE_THROW; none of the
// arguments are read and matrix_out is never written.
template <>
void matmul<platform::CPUDeviceContext, float16>(
    const platform::CPUDeviceContext& context,
    const framework::Tensor& matrix_a, bool trans_a,
    const framework::Tensor& matrix_b, bool trans_b, float16 alpha,
    framework::Tensor* matrix_out, float16 beta) {
  PADDLE_THROW("float16 matmul not supported on CPU");
}

Q
qijun 已提交
36
template <>
Q
QI JUN 已提交
37 38 39 40
void matmul<platform::CPUDeviceContext, float>(
    const platform::CPUDeviceContext& context,
    const framework::Tensor& matrix_a, bool trans_a,
    const framework::Tensor& matrix_b, bool trans_b, float alpha,
41
    framework::Tensor* matrix_out, float beta) {
Q
qijun 已提交
42 43 44 45 46 47 48 49 50
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
                 "The input and output of matmul be matrix");

  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
                     platform::is_cpu_place(matrix_b.place()) &&
                     platform::is_cpu_place(matrix_out->place()),
Q
qijun 已提交
51 52
                 "Matrix must all be in CPUPlace");

Q
qijun 已提交
53 54 55
  int M = dim_out[0];
  int N = dim_out[1];
  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
Q
qijun 已提交
56

Q
qijun 已提交
57 58
  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
Q
qijun 已提交
59

Y
Yu Yang 已提交
60 61
  Blas<platform::CPUDeviceContext>(context).GEMM(
      transA, transB, M, N, K, alpha, matrix_a.data<float>(),
62
      matrix_b.data<float>(), beta, matrix_out->data<float>());
Q
qijun 已提交
63 64 65
}

template <>
Q
QI JUN 已提交
66 67 68 69
void matmul<platform::CPUDeviceContext, double>(
    const platform::CPUDeviceContext& context,
    const framework::Tensor& matrix_a, bool trans_a,
    const framework::Tensor& matrix_b, bool trans_b, double alpha,
70
    framework::Tensor* matrix_out, double beta) {
Q
qijun 已提交
71 72 73 74 75 76 77 78 79
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
                 "The input and output of matmul be matrix");

  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
                     platform::is_cpu_place(matrix_b.place()) &&
                     platform::is_cpu_place(matrix_out->place()),
Q
qijun 已提交
80 81
                 "Matrix must all be in CPUPlace");

Q
qijun 已提交
82 83 84 85 86 87
  int M = dim_out[0];
  int N = dim_out[1];
  int K = (trans_a == false) ? dim_a[1] : dim_a[0];

  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
Q
qijun 已提交
88

Y
Yu Yang 已提交
89 90
  Blas<platform::CPUDeviceContext>(context).GEMM(
      transA, transB, M, N, K, alpha, matrix_a.data<double>(),
91
      matrix_b.data<double>(), beta, matrix_out->data<double>());
Q
qijun 已提交
92 93
}

94 95 96 97 98
// float16 specialization of batched_gemm for CPUDeviceContext.
// Always raises through PADDLE_THROW: no argument is read and C is never
// written.
template <>
void batched_gemm<platform::CPUDeviceContext, float16>(
    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
    const float16 alpha, const float16* A, const float16* B, const float16 beta,
    float16* C, const int batchCount, const int64_t strideA,
    const int64_t strideB) {
  PADDLE_THROW("float16 batched_gemm not supported on CPU");
}

T
tensor-tang 已提交
104
#ifdef PADDLE_WITH_MKLML
M
Markus Kliegl 已提交
105 106
// Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize.
template <>
Q
QI JUN 已提交
107 108
void batched_gemm<platform::CPUDeviceContext, float>(
    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
M
Markus Kliegl 已提交
109 110
    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
    const float alpha, const float* A, const float* B, const float beta,
Y
Yu Yang 已提交
111 112
    float* C, const int batchCount, const int64_t strideA,
    const int64_t strideB) {
M
Markus Kliegl 已提交
113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
  auto a_array = std::vector<const float*>(batchCount);
  auto b_array = std::vector<const float*>(batchCount);
  auto c_array = std::vector<float*>(batchCount);
  for (int k = 0; k < batchCount; ++k) {
    a_array[k] = &A[k * strideA];
    b_array[k] = &B[k * strideB];
    c_array[k] = &C[k * M * N];
  }
  cblas_sgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha,
                    a_array.data(), &lda, b_array.data(), &ldb, &beta,
                    c_array.data(), &ldc, 1 /* group_count */, &batchCount);
}

template <>
Q
QI JUN 已提交
130 131
void batched_gemm<platform::CPUDeviceContext, double>(
    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
M
Markus Kliegl 已提交
132 133
    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
    const double alpha, const double* A, const double* B, const double beta,
Y
Yu Yang 已提交
134 135
    double* C, const int batchCount, const int64_t strideA,
    const int64_t strideB) {
M
Markus Kliegl 已提交
136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
  auto a_array = std::vector<const double*>(batchCount);
  auto b_array = std::vector<const double*>(batchCount);
  auto c_array = std::vector<double*>(batchCount);
  for (int k = 0; k < batchCount; ++k) {
    a_array[k] = &A[k * strideA];
    b_array[k] = &B[k * strideB];
    c_array[k] = &C[k * M * N];
  }
  cblas_dgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha,
                    a_array.data(), &lda, b_array.data(), &ldb, &beta,
                    c_array.data(), &ldc, 1 /* group_count */, &batchCount);
}
#else
// The below is a naive but correct serial implementation that just loops
// over the batch dimension. This is a fallback for when the batched gemm
// functions of Intel MKL are not available. In the future, this computation
// should be parallelized.
template <>
Q
QI JUN 已提交
157 158
void batched_gemm<platform::CPUDeviceContext, float>(
    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
M
Markus Kliegl 已提交
159 160
    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
    const float alpha, const float* A, const float* B, const float beta,
Y
Yu Yang 已提交
161 162
    float* C, const int batchCount, const int64_t strideA,
    const int64_t strideB) {
M
Markus Kliegl 已提交
163 164 165 166
  for (int k = 0; k < batchCount; ++k) {
    const float* Ak = &A[k * strideA];
    const float* Bk = &B[k * strideB];
    float* Ck = &C[k * M * N];
Y
Yu Yang 已提交
167 168
    Blas<platform::CPUDeviceContext>(context).GEMM(transA, transB, M, N, K,
                                                   alpha, Ak, Bk, beta, Ck);
M
Markus Kliegl 已提交
169 170 171 172
  }
}

template <>
Q
QI JUN 已提交
173 174
void batched_gemm<platform::CPUDeviceContext, double>(
    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
M
Markus Kliegl 已提交
175 176
    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
    const double alpha, const double* A, const double* B, const double beta,
Y
Yu Yang 已提交
177 178
    double* C, const int batchCount, const int64_t strideA,
    const int64_t strideB) {
M
Markus Kliegl 已提交
179 180 181 182
  for (int k = 0; k < batchCount; ++k) {
    const double* Ak = &A[k * strideA];
    const double* Bk = &B[k * strideB];
    double* Ck = &C[k * M * N];
Y
Yu Yang 已提交
183 184
    Blas<platform::CPUDeviceContext>(context).GEMM(transA, transB, M, N, K,
                                                   alpha, Ak, Bk, beta, Ck);
M
Markus Kliegl 已提交
185 186 187 188
  }
}
#endif

189
template <>
Q
QI JUN 已提交
190 191 192 193
void gemv<platform::CPUDeviceContext, float>(
    const platform::CPUDeviceContext& context, const bool trans_a, const int M,
    const int N, const float alpha, const float* A, const float* B,
    const float beta, float* C) {
194 195 196 197 198
  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
  cblas_sgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
}

template <>
Q
QI JUN 已提交
199 200 201 202
void gemv<platform::CPUDeviceContext, double>(
    const platform::CPUDeviceContext& context, const bool trans_a, const int M,
    const int N, const double alpha, const double* A, const double* B,
    const double beta, double* C) {
203 204 205 206
  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
  cblas_dgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
}

207
template <>
Q
QI JUN 已提交
208 209 210
void axpy<platform::CPUDeviceContext, float>(
    const platform::CPUDeviceContext& context, const int n, const float alpha,
    const float* x, float* y) {
211 212 213 214
  cblas_saxpy(n, alpha, x, 1, y, 1);
}

template <>
Q
QI JUN 已提交
215 216 217
void axpy<platform::CPUDeviceContext, double>(
    const platform::CPUDeviceContext& context, const int n, const double alpha,
    const double* x, double* y) {
218 219 220
  cblas_daxpy(n, alpha, x, 1, y, 1);
}

K
Kexin Zhao 已提交
221
// Explicit instantiations of the CPU SetConstant functor, one per element
// type that is emitted from this translation unit.
template struct SetConstant<platform::CPUDeviceContext, platform::float16>;
template struct SetConstant<platform::CPUDeviceContext, float>;
template struct SetConstant<platform::CPUDeviceContext, double>;
template struct SetConstant<platform::CPUDeviceContext, int>;
template struct SetConstant<platform::CPUDeviceContext, int64_t>;
template struct SetConstant<platform::CPUDeviceContext, bool>;
227

228 229 230 231 232 233 234
// Explicitly instantiates the CPU Transpose functor for every element type
// emitted from this file at the given tensor rank.
//
// The macro body intentionally has no trailing ';': the semicolon written
// after each use below terminates the last instantiation, so no stray empty
// declaration is produced.
#define DEFINE_CPU_TRANS(RANK)                                             \
  template struct Transpose<platform::CPUDeviceContext, platform::float16, \
                            RANK>;                                         \
  template struct Transpose<platform::CPUDeviceContext, float, RANK>;      \
  template struct Transpose<platform::CPUDeviceContext, double, RANK>;     \
  template struct Transpose<platform::CPUDeviceContext, int, RANK>;        \
  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>;    \
  template struct Transpose<platform::CPUDeviceContext, bool, RANK>

// Ranks 1 through 6 are instantiated.
DEFINE_CPU_TRANS(1);
DEFINE_CPU_TRANS(2);
DEFINE_CPU_TRANS(3);
DEFINE_CPU_TRANS(4);
DEFINE_CPU_TRANS(5);
DEFINE_CPU_TRANS(6);
Q
qijun 已提交
243

244 245
// Visitor used with framework::VisitDataType: fills every element of a
// tensor, allocated on CPUPlace, with one constant.
//
// The constant is stored as float and converted to the element type T with
// static_cast at fill time.
struct TensorSetConstantCPU {
  TensorSetConstantCPU(framework::Tensor* tensor, float value)
      : tensor_(tensor), value_(value) {}

  // Invoked with the tensor's element type as T.
  template <typename T>
  void operator()() const {
    // mutable_data yields the tensor's buffer on the CPU place.
    T* const first = tensor_->mutable_data<T>(platform::CPUPlace());
    T* const last = first + tensor_->numel();
    std::fill(first, last, static_cast<T>(value_));
  }

  framework::Tensor* tensor_;  // Tensor to overwrite; not owned.
  float value_;                // Constant written to every element.
};

// CPUPlace specialization: dispatches on the tensor's runtime element type
// and fills it through TensorSetConstantCPU. `context` is not used.
template <>
void set_constant_with_place<platform::CPUPlace>(
    const platform::DeviceContext& context, framework::Tensor* tensor,
    float value) {
  TensorSetConstantCPU filler(tensor, value);
  framework::VisitDataType(framework::ToDataType(tensor->type()), filler);
}

C
chengduoZH 已提交
265 266 267 268 269 270 271 272
// CUDAPinnedPlace specialization: handled exactly like the CPUPlace case,
// i.e. the fill is performed by TensorSetConstantCPU after dispatching on
// the tensor's runtime element type. `context` is not used.
template <>
void set_constant_with_place<platform::CUDAPinnedPlace>(
    const platform::DeviceContext& context, framework::Tensor* tensor,
    float value) {
  TensorSetConstantCPU filler(tensor, value);
  framework::VisitDataType(framework::ToDataType(tensor->type()), filler);
}

273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289
// boost static visitor over a place variant: for whichever concrete Place
// type it is invoked with, forwards the stored context, tensor and value to
// set_constant_with_place<Place>.
struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
  TensorSetConstantWithPlace(const platform::DeviceContext& context,
                             framework::Tensor* tensor, float value)
      : context_(context), tensor_(tensor), value_(value) {}

  // Only the static type of the argument matters; its value is not read.
  template <typename Place>
  void operator()(Place /*place*/) const {
    set_constant_with_place<Place>(context_, tensor_, value_);
  }

  const platform::DeviceContext& context_;  // Borrowed for the visitor's life.
  framework::Tensor* tensor_;               // Tensor to fill; not owned.
  float value_;                             // Constant to write.
};

// Fills `tensor` with `value`.
//
// In CUDA builds the fill routine is selected by visiting the tensor's place;
// otherwise the CPUPlace routine is called directly.
void set_constant(const platform::DeviceContext& context,
                  framework::Tensor* tensor, float value) {
  TensorSetConstantWithPlace dispatcher(context, tensor, value);
#ifdef PADDLE_WITH_CUDA
  tensor->place().apply_visitor(dispatcher);
#else
  dispatcher(platform::CPUPlace());
#endif
}

Q
qingqing01 已提交
298 299 300 301 302 303 304 305 306 307
// CPU implementation of RowwiseAdd: adds `vector` to every row of `input`
// and stores the result in `output`.
//
// Requirements enforced below:
//   * vector.numel() equals input.numel() / input.dims()[0];
//   * output has the same dims as input.
template <typename T>
struct RowwiseAdd<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& vector, framework::Tensor* output) {
    auto in_dims = input.dims();
    auto size = input.numel() / in_dims[0];
    PADDLE_ENFORCE_EQ(vector.numel(), size);
    PADDLE_ENFORCE_EQ(output->dims(), in_dims);

    auto src = framework::EigenMatrix<T>::From(input);
    auto addend = framework::EigenVector<T>::Flatten(vector);
    auto dst = framework::EigenMatrix<T>::From(*output);

    // Process one slice along dimension 0 at a time.
    const int64_t row_count = in_dims[0];
    for (int64_t row = 0; row < row_count; ++row) {
      dst.chip(row, 0) = src.chip(row, 0) + addend;
    }
  }
};

// Element types for which the CPU RowwiseAdd is emitted from this file.
template struct RowwiseAdd<platform::CPUDeviceContext, float>;
template struct RowwiseAdd<platform::CPUDeviceContext, double>;
Q
qingqing01 已提交
320

Q
QI JUN 已提交
321 322
// Explicit CPU instantiations of the reduction functors emitted from this
// file.

// ColwiseSum: floating point and integer element types.
template struct ColwiseSum<platform::CPUDeviceContext, float>;
template struct ColwiseSum<platform::CPUDeviceContext, double>;
template struct ColwiseSum<platform::CPUDeviceContext, int>;
template struct ColwiseSum<platform::CPUDeviceContext, int64_t>;

// RowwiseSum: floating point element types only.
template struct RowwiseSum<platform::CPUDeviceContext, float>;
template struct RowwiseSum<platform::CPUDeviceContext, double>;

// RowwiseMean: floating point element types only.
template struct RowwiseMean<platform::CPUDeviceContext, float>;
template struct RowwiseMean<platform::CPUDeviceContext, double>;

Q
qijun 已提交
332 333 334
}  // namespace math
}  // namespace operators
}  // namespace paddle