/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/math/math_function.h"
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle {
namespace operators {
namespace math {

using float16 = paddle::platform::float16;

template <>
void gemm<platform::CPUDeviceContext, float16>(
    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
    const float16 alpha, const float16* A, const float16* B, const float16 beta,
    float16* C) {
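  // CBLAS has no half-precision GEMM, so the CPU float16 specializations in
  // this file simply report the operation as unsupported.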
  PADDLE_THROW("float16 GEMM not supported on CPU");
}

template <>
void gemm<platform::CPUDeviceContext, float>(
    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
    const float alpha, const float* A, const float* B, const float beta,
    float* C) {
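  // Row-major leading dimensions: the row stride of A is K unless A is
  // transposed (then M), the row stride of B is N unless B is transposed
  // (then K), and the row stride of C is always N.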
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
  cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
              beta, C, ldc);
}
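
// Illustrative call of the specialization above (a minimal sketch; ctx, a, b
// and c are assumed to be a CPUDeviceContext and row-major float buffers):
//   // c (2x4) = 1.0f * a (2x3) * b (3x4) + 0.0f * c
//   gemm<platform::CPUDeviceContext, float>(ctx, CblasNoTrans, CblasNoTrans,
//                                           2, 4, 3, 1.0f, a, b, 0.0f, c);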

template <>
void gemm<platform::CPUDeviceContext, double>(
    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
    const double alpha, const double* A, const double* B, const double beta,
    double* C) {
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
  cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
              beta, C, ldc);
}

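// The overloads below take caller-supplied leading dimensions and plain bool
// transpose flags, which allows them to operate on sub-matrices of a larger
// row-major buffer.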
template <>
void gemm<platform::CPUDeviceContext, float16>(
    const platform::CPUDeviceContext& context, const bool transA,
    const bool transB, const int M, const int N, const int K,
    const float16 alpha, const float16* A, const int lda, const float16* B,
    const int ldb, const float16 beta, float16* C, const int ldc) {
  PADDLE_THROW("float16 GEMM not supported on CPU");
}

template <>
void gemm<platform::CPUDeviceContext, float>(
    const platform::CPUDeviceContext& context, const bool transA,
    const bool transB, const int M, const int N, const int K, const float alpha,
    const float* A, const int lda, const float* B, const int ldb,
    const float beta, float* C, const int ldc) {
  cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
              lda, B, ldb, beta, C, ldc);
}

template <>
void gemm<platform::CPUDeviceContext, double>(
    const platform::CPUDeviceContext& context, const bool transA,
    const bool transB, const int M, const int N, const int K,
    const double alpha, const double* A, const int lda, const double* B,
    const int ldb, const double beta, double* C, const int ldc) {
  cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
              lda, B, ldb, beta, C, ldc);
}

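// matmul is a Tensor-level wrapper over gemm: it checks that all operands are
// 2-D tensors resident on the CPU and derives M, N and K from the output and
// input shapes before dispatching to the typed gemm above.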
template <>
void matmul<platform::CPUDeviceContext, float16>(
    const platform::CPUDeviceContext& context,
    const framework::Tensor& matrix_a, bool trans_a,
    const framework::Tensor& matrix_b, bool trans_b, float16 alpha,
    framework::Tensor* matrix_out, float16 beta) {
  PADDLE_THROW("float16 matmul not supported on CPU");
}

template <>
void matmul<platform::CPUDeviceContext, float>(
    const platform::CPUDeviceContext& context,
    const framework::Tensor& matrix_a, bool trans_a,
    const framework::Tensor& matrix_b, bool trans_b, float alpha,
    framework::Tensor* matrix_out, float beta) {
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
                 "The input and output of matmul must be matrices");

  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
                     platform::is_cpu_place(matrix_b.place()) &&
                     platform::is_cpu_place(matrix_out->place()),
                 "All matrices must be in CPUPlace");

  int M = dim_out[0];
  int N = dim_out[1];
  int K = (trans_a == false) ? dim_a[1] : dim_a[0];

  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;

  gemm<platform::CPUDeviceContext, float>(
      context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
      matrix_b.data<float>(), beta, matrix_out->data<float>());
}

template <>
void matmul<platform::CPUDeviceContext, double>(
    const platform::CPUDeviceContext& context,
    const framework::Tensor& matrix_a, bool trans_a,
    const framework::Tensor& matrix_b, bool trans_b, double alpha,
    framework::Tensor* matrix_out, double beta) {
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
                 "The input and output of matmul must be matrices");

  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
                     platform::is_cpu_place(matrix_b.place()) &&
                     platform::is_cpu_place(matrix_out->place()),
                 "All matrices must be in CPUPlace");

  int M = dim_out[0];
  int N = dim_out[1];
  int K = (trans_a == false) ? dim_a[1] : dim_a[0];

  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;

  gemm<platform::CPUDeviceContext, double>(
      context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
      matrix_b.data<double>(), beta, matrix_out->data<double>());
}

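// batched_gemm performs batchCount independent GEMMs; consecutive A and B
// operands are strideA and strideB elements apart, and each result occupies
// M * N contiguous elements of C.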
template <>
void batched_gemm<platform::CPUDeviceContext, float16>(
    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
    const float16 alpha, const float16* A, const float16* B, const float16 beta,
    float16* C, const int batchCount, const int64_t strideA,
    const int64_t strideB) {
  PADDLE_THROW("float16 batched_gemm not supported on CPU");
}

#ifdef PADDLE_WITH_MKLML
// Use MKL's cblas_{s,d}gemm_batch if available: run one group of batchCount
// GEMMs.
template <>
void batched_gemm<platform::CPUDeviceContext, float>(
    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
    const float alpha, const float* A, const float* B, const float beta,
    float* C, const int batchCount, const int64_t strideA,
    const int64_t strideB) {
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
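  // MKL's batch interface takes arrays of per-matrix pointers, so collect the
  // A, B and C pointers for every batch; all GEMMs share one parameter group.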
  auto a_array = std::vector<const float*>(batchCount);
  auto b_array = std::vector<const float*>(batchCount);
  auto c_array = std::vector<float*>(batchCount);
  for (int k = 0; k < batchCount; ++k) {
    a_array[k] = &A[k * strideA];
    b_array[k] = &B[k * strideB];
    c_array[k] = &C[k * M * N];
  }
  cblas_sgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha,
                    a_array.data(), &lda, b_array.data(), &ldb, &beta,
                    c_array.data(), &ldc, 1 /* group_count */, &batchCount);
}

template <>
void batched_gemm<platform::CPUDeviceContext, double>(
    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
    const double alpha, const double* A, const double* B, const double beta,
    double* C, const int batchCount, const int64_t strideA,
    const int64_t strideB) {
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
  auto a_array = std::vector<const double*>(batchCount);
  auto b_array = std::vector<const double*>(batchCount);
  auto c_array = std::vector<double*>(batchCount);
  for (int k = 0; k < batchCount; ++k) {
    a_array[k] = &A[k * strideA];
    b_array[k] = &B[k * strideB];
    c_array[k] = &C[k * M * N];
  }
  cblas_dgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha,
                    a_array.data(), &lda, b_array.data(), &ldb, &beta,
                    c_array.data(), &ldc, 1 /* group_count */, &batchCount);
}
#else
// The below is a naive but correct serial implementation that just loops
// over the batch dimension. This is a fallback for when the batched gemm
// functions of Intel MKL are not available. In the future, this computation
// should be parallelized.
template <>
void batched_gemm<platform::CPUDeviceContext, float>(
    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
    const float alpha, const float* A, const float* B, const float beta,
    float* C, const int batchCount, const int64_t strideA,
    const int64_t strideB) {
  for (int k = 0; k < batchCount; ++k) {
    const float* Ak = &A[k * strideA];
    const float* Bk = &B[k * strideB];
    float* Ck = &C[k * M * N];
    gemm<platform::CPUDeviceContext, float>(context, transA, transB, M, N, K,
                                            alpha, Ak, Bk, beta, Ck);
  }
}

template <>
void batched_gemm<platform::CPUDeviceContext, double>(
    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
    const double alpha, const double* A, const double* B, const double beta,
    double* C, const int batchCount, const int64_t strideA,
    const int64_t strideB) {
  for (int k = 0; k < batchCount; ++k) {
    const double* Ak = &A[k * strideA];
    const double* Bk = &B[k * strideB];
    double* Ck = &C[k * M * N];
    gemm<platform::CPUDeviceContext, double>(context, transA, transB, M, N, K,
                                             alpha, Ak, Bk, beta, Ck);
  }
}
#endif

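// gemv computes C = alpha * op(A) * B + beta * C for a row-major M x N
// matrix A (leading dimension N) and unit-stride vectors B and C.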
template <>
void gemv<platform::CPUDeviceContext, float>(
    const platform::CPUDeviceContext& context, const bool trans_a, const int M,
    const int N, const float alpha, const float* A, const float* B,
    const float beta, float* C) {
  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
  cblas_sgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
}

template <>
void gemv<platform::CPUDeviceContext, double>(
    const platform::CPUDeviceContext& context, const bool trans_a, const int M,
    const int N, const double alpha, const double* A, const double* B,
    const double beta, double* C) {
  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
  cblas_dgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
}

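// axpy computes y = alpha * x + y over n contiguous elements.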
template <>
void axpy<platform::CPUDeviceContext, float>(
    const platform::CPUDeviceContext& context, const int n, const float alpha,
    const float* x, float* y) {
  cblas_saxpy(n, alpha, x, 1, y, 1);
}

template <>
void axpy<platform::CPUDeviceContext, double>(
    const platform::CPUDeviceContext& context, const int n, const double alpha,
    const double* x, double* y) {
  cblas_daxpy(n, alpha, x, 1, y, 1);
}

template struct SetConstant<platform::CPUDeviceContext, platform::float16>;
template struct SetConstant<platform::CPUDeviceContext, float>;
template struct SetConstant<platform::CPUDeviceContext, double>;
template struct SetConstant<platform::CPUDeviceContext, int>;
template struct SetConstant<platform::CPUDeviceContext, int64_t>;
template struct SetConstant<platform::CPUDeviceContext, bool>;

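// Instantiate Transpose for the supported element types at every rank the
// framework uses (1-D through 6-D).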
#define DEFINE_CPU_TRANS(RANK)                                             \
  template struct Transpose<platform::CPUDeviceContext, platform::float16, \
                            RANK>;                                         \
  template struct Transpose<platform::CPUDeviceContext, float, RANK>;      \
  template struct Transpose<platform::CPUDeviceContext, double, RANK>;     \
  template struct Transpose<platform::CPUDeviceContext, int, RANK>;        \
  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>;    \
  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;

DEFINE_CPU_TRANS(1);
DEFINE_CPU_TRANS(2);
DEFINE_CPU_TRANS(3);
DEFINE_CPU_TRANS(4);
DEFINE_CPU_TRANS(5);
DEFINE_CPU_TRANS(6);

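// TensorSetConstantCPU fills a CPU tensor with a single value; the element
// type is resolved at runtime via framework::VisitDataType in the
// set_constant_with_place specializations below.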
struct TensorSetConstantCPU {
  TensorSetConstantCPU(framework::Tensor* tensor, float value)
      : tensor_(tensor), value_(value) {}
  template <typename T>
  void operator()() const {
    auto cpu = platform::CPUPlace();
    auto* begin = tensor_->mutable_data<T>(cpu);
    std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
  }
  framework::Tensor* tensor_;
  float value_;
};

template <>
void set_constant_with_place<platform::CPUPlace>(
    const platform::DeviceContext& context, framework::Tensor* tensor,
    float value) {
  framework::VisitDataType(framework::ToDataType(tensor->type()),
                           TensorSetConstantCPU(tensor, value));
}

template <>
void set_constant_with_place<platform::CUDAPinnedPlace>(
    const platform::DeviceContext& context, framework::Tensor* tensor,
    float value) {
  framework::VisitDataType(framework::ToDataType(tensor->type()),
                           TensorSetConstantCPU(tensor, value));
}

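// TensorSetConstantWithPlace forwards set_constant to the place-specific
// specialization; when built with CUDA the place is visited at runtime,
// otherwise CPUPlace is assumed (see set_constant below).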
struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
  TensorSetConstantWithPlace(const platform::DeviceContext& context,
                             framework::Tensor* tensor, float value)
      : context_(context), tensor_(tensor), value_(value) {}

  template <typename Place>
  void operator()(Place place) const {
    set_constant_with_place<Place>(context_, tensor_, value_);
  }

  const platform::DeviceContext& context_;
  framework::Tensor* tensor_;
  float value_;
};

void set_constant(const platform::DeviceContext& context,
                  framework::Tensor* tensor, float value) {
  TensorSetConstantWithPlace func(context, tensor, value);
#ifdef PADDLE_WITH_CUDA
  tensor->place().apply_visitor(func);
#else
  func(platform::CPUPlace());
#endif
}
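
// Illustrative use of set_constant (a minimal sketch; `cpu_ctx` is an assumed
// platform::CPUDeviceContext instance):
//   framework::Tensor t;
//   t.Resize(framework::make_ddim({2, 3}));
//   t.mutable_data<float>(platform::CPUPlace());
//   set_constant(cpu_ctx, &t, 1.0f);  // every element of t becomes 1.0f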

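// RowwiseAdd adds a vector to every row of a 2-D input and writes the result
// to output; the vector length must match the row width and the output shape
// must match the input shape.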
template <typename T>
struct RowwiseAdd<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& vector, framework::Tensor* output) {
    auto in_dims = input.dims();
    auto size = input.numel() / in_dims[0];
    PADDLE_ENFORCE_EQ(vector.numel(), size);
    PADDLE_ENFORCE_EQ(output->dims(), in_dims);

    auto in = framework::EigenMatrix<T>::From(input);
    auto vec = framework::EigenVector<T>::Flatten(vector);
    auto out = framework::EigenMatrix<T>::From(*output);

    for (int64_t i = 0; i < in_dims[0]; ++i) {
      out.chip(i, 0) = in.chip(i, 0) + vec;
    }
  }
};

template struct RowwiseAdd<platform::CPUDeviceContext, float>;
template struct RowwiseAdd<platform::CPUDeviceContext, double>;

template struct ColwiseSum<platform::CPUDeviceContext, float>;
template struct ColwiseSum<platform::CPUDeviceContext, double>;
template struct ColwiseSum<platform::CPUDeviceContext, int>;
template struct ColwiseSum<platform::CPUDeviceContext, int64_t>;

template struct RowwiseSum<platform::CPUDeviceContext, float>;
template struct RowwiseSum<platform::CPUDeviceContext, double>;

template struct RowwiseMean<platform::CPUDeviceContext, float>;
template struct RowwiseMean<platform::CPUDeviceContext, double>;

}  // namespace math
}  // namespace operators
}  // namespace paddle