blas_impl.h 32.5 KB
Newer Older
Y
Yu Yang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
S
ShenLiang 已提交
15
#include <algorithm>
T
tensor-tang 已提交
16
#include <cmath>
T
tensor-tang 已提交
17
#include <limits>
Y
Yu Yang 已提交
18
#include <vector>
Y
Yu Yang 已提交
19 20 21 22 23 24 25 26 27
#include "paddle/fluid/operators/math/math_function.h"

namespace paddle {
namespace operators {
namespace math {

template <typename T>
struct CBlas;

28 29 30 31
template <>
struct CBlas<int8_t> {
  template <typename... ARGS>
  static void VCOPY(ARGS... args) {
32 33
    PADDLE_THROW(platform::errors::Unimplemented(
        "Blas VCOPY do not supported on CPU, please check your code"));
34 35 36
  }
};

37
#ifdef PADDLE_WITH_MKLML
Y
Yu Yang 已提交
38 39
template <>
struct CBlas<float> {
Y
Yu Yang 已提交
40 41
  template <typename... ARGS>
  static void GEMM(ARGS... args) {
42
    platform::dynload::cblas_sgemm(args...);
Y
Yu Yang 已提交
43
  }
Y
Yu Yang 已提交
44

T
tensor-tang 已提交
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
  template <typename... ARGS>
  static float *GEMM_ALLOC(ARGS... args) {
    return platform::dynload::cblas_sgemm_alloc(args...);
  }

  template <typename... ARGS>
  static void GEMM_PACK(ARGS... args) {
    platform::dynload::cblas_sgemm_pack(args...);
  }

  template <typename... ARGS>
  static void GEMM_COMPUTE(ARGS... args) {
    platform::dynload::cblas_sgemm_compute(args...);
  }

  template <typename... ARGS>
  static void GEMM_FREE(ARGS... args) {
    platform::dynload::cblas_sgemm_free(args...);
  }

T
tensor-tang 已提交
65 66 67 68 69 70
#ifdef PADDLE_WITH_LIBXSMM
  template <typename... ARGS>
  static void SMM_GEMM(ARGS... args) {
    libxsmm_sgemm(args...);
  }
#endif
T
tensor-tang 已提交
71

Y
Yu Yang 已提交
72 73
  template <typename... ARGS>
  static void AXPY(ARGS... args) {
74 75 76 77 78 79 80 81 82 83 84 85 86
    platform::dynload::cblas_saxpy(args...);
  }

  template <typename... ARGS>
  static void VCOPY(ARGS... args) {
    platform::dynload::cblas_scopy(args...);
  }

  template <typename... ARGS>
  static void GEMV(ARGS... args) {
    platform::dynload::cblas_sgemv(args...);
  }

T
tensor-tang 已提交
87 88 89 90 91
  template <typename... ARGS>
  static float DOT(ARGS... args) {
    return platform::dynload::cblas_sdot(args...);
  }

T
tensor-tang 已提交
92 93 94 95 96
  template <typename... ARGS>
  static void SCAL(ARGS... args) {
    platform::dynload::cblas_sscal(args...);
  }

J
Jacek Czaja 已提交
97 98 99 100 101
  template <typename... ARGS>
  static float ASUM(ARGS... args) {
    return platform::dynload::cblas_sasum(args...);
  }

102 103 104
  template <typename... ARGS>
  static void GEMM_BATCH(ARGS... args) {
    platform::dynload::cblas_sgemm_batch(args...);
Y
Yu Yang 已提交
105 106
  }

107 108
  template <typename... ARGS>
  static void VADD(ARGS... args) {
109 110
    platform::dynload::vsAdd(args...);
  }
T
tensor-tang 已提交
111

112 113 114 115 116
  template <typename... ARGS>
  static void VSUB(ARGS... args) {
    platform::dynload::vsSub(args...);
  }

T
tensor-tang 已提交
117 118 119 120
  template <typename... ARGS>
  static void VMUL(ARGS... args) {
    platform::dynload::vsMul(args...);
  }
T
tensor-tang 已提交
121

122 123 124 125 126
  template <typename... ARGS>
  static void VDIV(ARGS... args) {
    platform::dynload::vsDiv(args...);
  }

T
tensor-tang 已提交
127 128 129 130
  template <typename... ARGS>
  static void VEXP(ARGS... args) {
    platform::dynload::vsExp(args...);
  }
T
tensor-tang 已提交
131 132

  template <typename... ARGS>
T
tensor-tang 已提交
133
  static void VSQUARE(ARGS... args) {
T
tensor-tang 已提交
134 135 136 137 138 139 140
    platform::dynload::vsSqr(args...);
  }

  template <typename... ARGS>
  static void VPOW(ARGS... args) {
    platform::dynload::vsPowx(args...);
  }
Y
Use mkl  
Yu Yang 已提交
141 142 143 144 145

  template <typename... ARGS>
  static void VINV(ARGS... args) {
    platform::dynload::vsInv(args...);
  }
Y
Yihua Xu 已提交
146 147 148 149 150

  template <typename... ARGS>
  static void VMERF(ARGS... args) {
    platform::dynload::vmsErf(args...);
  }
151
#if !defined(_WIN32)
152 153 154 155
  template <typename... ARGS>
  static void CSRMM(ARGS... args) {
    platform::dynload::mkl_scsrmm(args...);
  }
156
#endif
G
Guo Sheng 已提交
157 158 159 160 161

  template <typename... ARGS>
  static void TRSM(ARGS... args) {
    platform::dynload::cblas_strsm(args...);
  }
162 163 164 165 166 167 168 169 170
};

template <>
struct CBlas<double> {
  template <typename... ARGS>
  static void GEMM(ARGS... args) {
    platform::dynload::cblas_dgemm(args...);
  }

T
tensor-tang 已提交
171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
  template <typename... ARGS>
  static double *GEMM_ALLOC(ARGS... args) {
    return platform::dynload::cblas_dgemm_alloc(args...);
  }

  template <typename... ARGS>
  static void GEMM_PACK(ARGS... args) {
    platform::dynload::cblas_dgemm_pack(args...);
  }

  template <typename... ARGS>
  static void GEMM_COMPUTE(ARGS... args) {
    platform::dynload::cblas_dgemm_compute(args...);
  }

  template <typename... ARGS>
  static void GEMM_FREE(ARGS... args) {
    platform::dynload::cblas_dgemm_free(args...);
  }

T
tensor-tang 已提交
191 192 193 194 195 196
#ifdef PADDLE_WITH_LIBXSMM
  template <typename... ARGS>
  static void SMM_GEMM(ARGS... args) {
    libxsmm_dgemm(args...);
  }
#endif
T
tensor-tang 已提交
197

198 199 200
  template <typename... ARGS>
  static void AXPY(ARGS... args) {
    platform::dynload::cblas_daxpy(args...);
201 202 203 204
  }

  template <typename... ARGS>
  static void VCOPY(ARGS... args) {
205
    platform::dynload::cblas_dcopy(args...);
206 207
  }

Y
Yu Yang 已提交
208 209
  template <typename... ARGS>
  static void GEMV(ARGS... args) {
210
    platform::dynload::cblas_dgemv(args...);
Y
Yu Yang 已提交
211 212
  }

T
tensor-tang 已提交
213 214 215 216 217
  template <typename... ARGS>
  static double DOT(ARGS... args) {
    return platform::dynload::cblas_ddot(args...);
  }

T
tensor-tang 已提交
218 219 220 221 222
  template <typename... ARGS>
  static void SCAL(ARGS... args) {
    platform::dynload::cblas_dscal(args...);
  }

J
Jacek Czaja 已提交
223 224 225 226 227
  template <typename... ARGS>
  static double ASUM(ARGS... args) {
    return platform::dynload::cblas_dasum(args...);
  }

Y
Yu Yang 已提交
228 229
  template <typename... ARGS>
  static void GEMM_BATCH(ARGS... args) {
230 231 232 233 234 235 236
    platform::dynload::cblas_dgemm_batch(args...);
  }

  template <typename... ARGS>
  static void VADD(ARGS... args) {
    platform::dynload::vdAdd(args...);
  }
T
tensor-tang 已提交
237

238 239 240 241 242
  template <typename... ARGS>
  static void VSUB(ARGS... args) {
    platform::dynload::vdSub(args...);
  }

T
tensor-tang 已提交
243 244 245 246
  template <typename... ARGS>
  static void VMUL(ARGS... args) {
    platform::dynload::vdMul(args...);
  }
T
tensor-tang 已提交
247

248 249 250 251 252
  template <typename... ARGS>
  static void VDIV(ARGS... args) {
    platform::dynload::vdDiv(args...);
  }

T
tensor-tang 已提交
253 254 255 256
  template <typename... ARGS>
  static void VEXP(ARGS... args) {
    platform::dynload::vdExp(args...);
  }
T
tensor-tang 已提交
257 258

  template <typename... ARGS>
T
tensor-tang 已提交
259
  static void VSQUARE(ARGS... args) {
T
tensor-tang 已提交
260 261 262 263 264 265 266
    platform::dynload::vdSqr(args...);
  }

  template <typename... ARGS>
  static void VPOW(ARGS... args) {
    platform::dynload::vdPowx(args...);
  }
Y
Use mkl  
Yu Yang 已提交
267 268 269 270 271

  template <typename... ARGS>
  static void VINV(ARGS... args) {
    platform::dynload::vdInv(args...);
  }
Y
Yihua Xu 已提交
272 273 274 275 276

  template <typename... ARGS>
  static void VMERF(ARGS... args) {
    platform::dynload::vmdErf(args...);
  }
277
#if !defined(_WIN32)
278 279 280 281
  template <typename... ARGS>
  static void CSRMM(ARGS... args) {
    platform::dynload::mkl_dcsrmm(args...);
  }
282
#endif
G
Guo Sheng 已提交
283 284 285 286 287

  template <typename... ARGS>
  static void TRSM(ARGS... args) {
    platform::dynload::cblas_dtrsm(args...);
  }
288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311
};

#else

template <>
struct CBlas<float> {
  template <typename... ARGS>
  static void GEMM(ARGS... args) {
    cblas_sgemm(args...);
  }

  template <typename... ARGS>
  static void AXPY(ARGS... args) {
    cblas_saxpy(args...);
  }

  template <typename... ARGS>
  static void VCOPY(ARGS... args) {
    cblas_scopy(args...);
  }

  template <typename... ARGS>
  static void GEMV(ARGS... args) {
    cblas_sgemv(args...);
Y
Yu Yang 已提交
312
  }
G
Guo Sheng 已提交
313 314 315 316 317

  template <typename... ARGS>
  static void TRSM(ARGS... args) {
    cblas_strsm(args...);
  }
Y
Yu Yang 已提交
318 319 320 321
};

template <>
struct CBlas<double> {
Y
Yu Yang 已提交
322 323 324 325
  template <typename... ARGS>
  static void GEMM(ARGS... args) {
    cblas_dgemm(args...);
  }
Y
Yu Yang 已提交
326 327 328 329 330 331

  template <typename... ARGS>
  static void AXPY(ARGS... args) {
    cblas_daxpy(args...);
  }

332 333 334 335 336
  template <typename... ARGS>
  static void VCOPY(ARGS... args) {
    cblas_dcopy(args...);
  }

Y
Yu Yang 已提交
337 338 339 340
  template <typename... ARGS>
  static void GEMV(ARGS... args) {
    cblas_dgemv(args...);
  }
G
Guo Sheng 已提交
341 342 343 344 345

  template <typename... ARGS>
  static void TRSM(ARGS... args) {
    cblas_dtrsm(args...);
  }
Y
Yu Yang 已提交
346
};
347
#endif
T
tensor-tang 已提交
348

Y
Yu Yang 已提交
349 350
template <>
struct CBlas<platform::float16> {
351 352 353 354 355
  static void GEMM(...) {
    PADDLE_THROW(platform::errors::Unimplemented(
        "float16 GEMM not supported on CPU, please check your code"));
  }

T
tensor-tang 已提交
356
  static void SMM_GEMM(...) {
357 358
    PADDLE_THROW(platform::errors::Unimplemented(
        "float16 SMM_GEMM not supported on CPU, please check your code"));
T
tensor-tang 已提交
359
  }
360 361 362
  static void VMUL(...) {
    PADDLE_THROW(platform::errors::Unimplemented(
        "float16 VMUL not supported on CPU, please check your code"));
T
tensor-tang 已提交
363
  }
364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387
  static void VEXP(...) {
    PADDLE_THROW(platform::errors::Unimplemented(
        "float16 VEXP not supported on CPU, please check your code"));
  }
  static void VSQUARE(...) {
    PADDLE_THROW(platform::errors::Unimplemented(
        "float16 VSQUARE not supported on CPU, please check your code"));
  }
  static void VPOW(...) {
    PADDLE_THROW(platform::errors::Unimplemented(
        "float16 VPOW not supported on CPU, please check your code"));
  }
  static void DOT(...) {
    PADDLE_THROW(platform::errors::Unimplemented(
        "float16 DOT not supported on CPU, please check your code"));
  };
  static void SCAL(...) {
    PADDLE_THROW(platform::errors::Unimplemented(
        "float16 SCAL not supported on CPU, please check your code"));
  };
  static void ASUM(...) {
    PADDLE_THROW(platform::errors::Unimplemented(
        "float16 ASUM not supported on CPU, please check your code"));
  };
Y
Yu Yang 已提交
388 389
#ifdef PADDLE_WITH_MKLML
  static void GEMM_BATCH(...) {
390 391
    PADDLE_THROW(platform::errors::Unimplemented(
        "float16 GEMM_BATCH not supported on CPU, please check your code"));
Y
Yu Yang 已提交
392 393
  }
#endif
Y
Yu Yang 已提交
394
};
T
tensor-tang 已提交
395

T
tensor-tang 已提交
396
#ifdef PADDLE_WITH_MKLML
T
tensor-tang 已提交
397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428
template <>
template <typename T>
T *Blas<platform::CPUDeviceContext>::GEMM_ALLOC(const CBLAS_IDENTIFIER id,
                                                const int M, const int N,
                                                const int K) const {
  return CBlas<T>::GEMM_ALLOC(id, M, N, K);
}

template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMM_PACK(const CBLAS_IDENTIFIER id,
                                                 const CBLAS_TRANSPOSE trans,
                                                 int M, int N, int K,
                                                 const T alpha, const T *src,
                                                 const int ld, T *dst) const {
  CBlas<T>::GEMM_PACK(CblasRowMajor, id, trans, M, N, K, alpha, src, ld, dst);
}

template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMM_COMPUTE(
    int transA, int transB, int M, int N, int K, const T *A, const int lda,
    const T *B, const int ldb, T beta, T *C, const int ldc) const {
  CBlas<T>::GEMM_COMPUTE(CblasRowMajor, transA, transB, M, N, K, A, lda, B, ldb,
                         beta, C, ldc);
}

template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMM_FREE(T *data) const {
  CBlas<T>::GEMM_FREE(data);
}
T
tensor-tang 已提交
429
#endif
T
tensor-tang 已提交
430

T
tensor-tang 已提交
431 432 433 434 435 436 437 438 439
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
                                            CBLAS_TRANSPOSE transB, int M,
                                            int N, int K, T alpha, const T *A,
                                            const T *B, T beta, T *C) const {
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
T
tensor-tang 已提交
440 441
  CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
                 beta, C, ldc);
Y
Yu Yang 已提交
442 443 444 445
}

template <>
template <typename T>
Y
Yu Yang 已提交
446 447 448 449
void Blas<platform::CPUDeviceContext>::GEMM(bool transA, bool transB, int M,
                                            int N, int K, T alpha, const T *A,
                                            int lda, const T *B, int ldb,
                                            T beta, T *C, int ldc) const {
T
tensor-tang 已提交
450 451 452 453 454 455 456 457 458 459 460 461 462 463
  CBlas<T>::GEMM(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
                 transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
                 lda, B, ldb, beta, C, ldc);
}

template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
                                            CBLAS_TRANSPOSE transB, int M,
                                            int N, int K, T alpha, const T *A,
                                            int lda, const T *B, int ldb,
                                            T beta, T *C, int ldc) const {
  CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
                 beta, C, ldc);
Y
Yu Yang 已提交
464 465
}

Y
Yu Yang 已提交
466 467 468 469 470 471 472 473 474
template <typename DeviceContext>
template <typename T>
void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a, bool trans_a,
                                 const framework::Tensor &mat_b, bool trans_b,
                                 T alpha, framework::Tensor *mat_out,
                                 T beta) const {
  auto dim_a = mat_a.dims();
  auto dim_b = mat_b.dims();
  auto dim_out = mat_out->dims();
475 476 477 478 479 480 481 482 483 484 485 486
  PADDLE_ENFORCE_EQ(
      dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, true,
      platform::errors::InvalidArgument(
          "The input and output of matmul should be matrix, the dim size must "
          "be 2,"
          "but received dim size input_a:%d, input_b:%d, output:%d",
          dim_a.size(), dim_b.size(), dim_out.size()));
  PADDLE_ENFORCE_EQ(
      mat_a.place() == mat_b.place() && mat_a.place() == mat_out->place(), true,
      platform::errors::InvalidArgument("The places of matrices in the matmul "
                                        "should be same, please check your "
                                        "code."));
Y
Yu Yang 已提交
487 488 489 490 491 492 493 494 495 496 497 498

  int M = dim_out[0];
  int N = dim_out[1];
  int K = !trans_a ? dim_a[1] : dim_a[0];

  CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans;
  CBLAS_TRANSPOSE transB = !trans_b ? CblasNoTrans : CblasTrans;

  this->GEMM(transA, transB, M, N, K, alpha, mat_a.data<T>(), mat_b.data<T>(),
             beta, mat_out->data<T>());
}

Y
Yu Yang 已提交
499 500 501 502 503 504 505
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::AXPY(int n, T alpha, const T *x,
                                            T *y) const {
  CBlas<T>::AXPY(n, alpha, x, 1, y, 1);
}

506 507 508 509 510 511 512 513 514 515 516 517 518
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::VCOPY(int n, const T *x, T *y) const {
  CBlas<T>::VCOPY(n, x, 1, y, 1);
}

template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::VADD(int n, const T *x, const T *y,
                                            T *z) const {
#ifdef PADDLE_WITH_MKLML
  CBlas<T>::VADD(n, x, y, z);
#else
519 520 521 522 523 524
  if (x == z) {
    this->template AXPY<T>(n, 1., y, z);
  } else {
    this->template VCOPY<T>(n, y, z);
    this->template AXPY<T>(n, 1., x, z);
  }
525 526 527
#endif
}

528 529 530 531 532 533 534 535 536 537 538 539 540 541
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::VSUB(int n, const T *x, const T *y,
                                            T *z) const {
#ifdef PADDLE_WITH_MKLML
  CBlas<T>::VSUB(n, x, y, z);
#else
  // try to find if openblas support vsub
  for (int i = 0; i < n; ++i) {
    z[i] = x[i] - y[i];
  }
#endif
}

T
tensor-tang 已提交
542 543 544 545 546 547 548 549 550 551
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::VMUL(int n, const T *x, const T *y,
                                            T *z) const {
#ifdef PADDLE_WITH_MKLML
  CBlas<T>::VMUL(n, x, y, z);
#else
  // try to find if openblas support vmul
  for (int i = 0; i < n; ++i) {
    z[i] = x[i] * y[i];
552 553 554 555 556 557 558 559 560 561 562 563 564 565
  }
#endif
}

template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::VDIV(int n, const T *x, const T *y,
                                            T *z) const {
#ifdef PADDLE_WITH_MKLML
  CBlas<T>::VDIV(n, x, y, z);
#else
  // try to find if openblas support vdiv
  for (int i = 0; i < n; ++i) {
    z[i] = x[i] / y[i];
T
tensor-tang 已提交
566 567 568 569
  }
#endif
}

T
tensor-tang 已提交
570 571 572 573 574 575 576 577 578 579 580 581 582
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::VEXP(int n, const T *x, T *y) const {
#ifdef PADDLE_WITH_MKLML
  CBlas<T>::VEXP(n, x, y);
#else
  // try to find if openblas support vexp
  for (int i = 0; i < n; ++i) {
    y[i] = std::exp(x[i]);
  }
#endif
}

T
tensor-tang 已提交
583 584
template <>
template <typename T>
T
tensor-tang 已提交
585
void Blas<platform::CPUDeviceContext>::VSQUARE(int n, const T *x, T *y) const {
T
tensor-tang 已提交
586
#ifdef PADDLE_WITH_MKLML
T
tensor-tang 已提交
587
  CBlas<T>::VSQUARE(n, x, y);
T
tensor-tang 已提交
588 589
#else
  for (int i = 0; i < n; ++i) {
T
tensor-tang 已提交
590
    y[i] = x[i] * x[i];
T
tensor-tang 已提交
591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607
  }
#endif
}

template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::VPOW(int n, const T *x, T a,
                                            T *y) const {
#ifdef PADDLE_WITH_MKLML
  CBlas<T>::VPOW(n, x, a, y);
#else
  for (int i = 0; i < n; ++i) {
    y[i] = std::pow(x[i], a);
  }
#endif
}

T
tensor-tang 已提交
608 609 610 611
template <>
template <typename T>
T Blas<platform::CPUDeviceContext>::DOT(int n, const T *x, const T *y) const {
#ifdef PADDLE_WITH_MKLML
T
tensor-tang 已提交
612
  return CBlas<T>::DOT(n, x, 1, y, 1);
T
tensor-tang 已提交
613 614 615 616 617 618 619 620 621 622
#else
  // try to find if openblas support cblas_dot
  T sum = 0;
  for (int i = 0; i < n; ++i) {
    sum += x[i] * y[i];
  }
  return sum;
#endif
}

T
tensor-tang 已提交
623 624
template <>
template <typename T>
T
tensor-tang 已提交
625
void Blas<platform::CPUDeviceContext>::SCAL(int n, const T a, T *x) const {
T
tensor-tang 已提交
626 627 628 629 630 631 632 633 634 635
#ifdef PADDLE_WITH_MKLML
  CBlas<T>::SCAL(n, a, x, 1);
#else
  // try to find if openblas support cblas_scal
  for (int i = 0; i < n; ++i) {
    x[i] = a * x[i];
  }
#endif
}

J
Jacek Czaja 已提交
636 637 638 639 640
template <>
template <typename T>
T Blas<platform::CPUDeviceContext>::ASUM(int n, T *x, int inc) const {
  auto sum = static_cast<T>(0.0);
#ifdef PADDLE_WITH_MKLML
641
  sum = CBlas<T>::ASUM(n, x, inc);
J
Jacek Czaja 已提交
642
#else
J
Jacek Czaja 已提交
643
  // TODO(jczaja): check if openblas does provide cblas_sasum/cblas_dasum
J
Jacek Czaja 已提交
644 645 646 647 648 649 650
  for (int c = 0; c < n; ++c) {
    sum += x[c];
  }
#endif
  return sum;
}

Y
Yu Yang 已提交
651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMV(bool trans_a, int M, int N, T alpha,
                                            const T *A, const T *B, T beta,
                                            T *C) const {
  CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans;
  CBlas<T>::GEMV(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
}

template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::BatchedGEMM(
    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
    T alpha, const T *A, const T *B, T beta, T *C, int batchCount,
    int64_t strideA, int64_t strideB) const {
#ifdef PADDLE_WITH_MKLML
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
  auto a_array = std::vector<const T *>(batchCount);
  auto b_array = std::vector<const T *>(batchCount);
  auto c_array = std::vector<T *>(batchCount);
  for (int k = 0; k < batchCount; ++k) {
    a_array[k] = &A[k * strideA];
    b_array[k] = &B[k * strideB];
    c_array[k] = &C[k * M * N];
  }

  CBlas<T>::GEMM_BATCH(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha,
                       a_array.data(), &lda, b_array.data(), &ldb, &beta,
                       c_array.data(), &ldc, 1 /* group_count */, &batchCount);
#else
  for (int k = 0; k < batchCount; ++k) {
Y
yuyang18 已提交
684 685 686
    auto *Ak = &A[k * strideA];
    auto *Bk = &B[k * strideB];
    auto *Ck = &C[k * M * N];
Y
Yu Yang 已提交
687 688 689 690 691
    this->template GEMM<T>(transA, transB, M, N, K, alpha, Ak, Bk, beta, Ck);
  }
#endif
}

S
ShenLiang 已提交
692 693 694 695 696 697
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::BatchedGEMM(
    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
    T alpha, const T **A, const T **B, T beta, T **C, int batchCount) const {
#ifdef PADDLE_WITH_MKLML
W
wanghuancoder 已提交
698 699 700
  const int lda = (std::max)((transA == CblasNoTrans) ? K : M, 1);
  const int ldb = (std::max)((transB == CblasNoTrans) ? N : K, 1);
  const int ldc = (std::max)(N, 1);
S
ShenLiang 已提交
701 702 703 704 705 706 707 708 709 710 711
  CBlas<T>::GEMM_BATCH(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha, A,
                       &lda, B, &ldb, &beta, C, &ldc, 1 /* group_count */,
                       &batchCount);
#else
  for (int k = 0; k < batchCount; ++k) {
    this->template GEMM<T>(transA, transB, M, N, K, alpha, A[k], B[k], beta,
                           C[k]);
  }
#endif
}

712 713 714 715
#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA)
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::BatchedGEMMWithHead(
716 717 718 719 720 721
    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int W1, int H1, int W2,
    int H2, T alpha, const T *A, const T *B, T beta, T *C, int batchCount,
    int64_t strideA, int64_t strideB, int64_t head_number,
    bool split_b_vertical) const {
  int lda = (transA == CblasNoTrans) ? W1 : H1;
  int ldb = (transB == CblasNoTrans) ? W2 : H2;
722 723 724 725
  auto a_array = std::vector<const T *>(batchCount);
  auto b_array = std::vector<const T *>(batchCount);
  auto c_array = std::vector<T *>(batchCount);

726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747
  if (split_b_vertical) {
    int ldc = W2;
    int sub_width = W2 / head_number;

    for (int i = 0; i < head_number; i++) {
      int sub_matA_offset = (transA == CblasNoTrans)
                                ? i * (W1 / head_number)
                                : i * (W1 / head_number) * H1;
      int sub_matB_offset = (transB == CblasNoTrans)
                                ? i * (W2 / head_number)
                                : i * (W2 / head_number) * H2;
      int sub_matC_offset = i * W2 / head_number;
      for (int k = 0; k < batchCount; ++k) {
        a_array[k] = &A[k * strideA] + sub_matA_offset;
        b_array[k] = &B[k * strideB] + sub_matB_offset;
        c_array[k] = &C[k * H1 * W2] + sub_matC_offset;
      }

      CBlas<T>::GEMM_BATCH(CblasRowMajor, &transA, &transB, &H1, &sub_width,
                           &H2, &alpha, a_array.data(), &lda, b_array.data(),
                           &ldb, &beta, c_array.data(), &ldc,
                           1 /* group_count */, &batchCount);
748 749
    }

750
  } else {
751 752 753 754 755 756 757
    PADDLE_ENFORCE_EQ(
        W1, H2,
        platform::errors::InvalidArgument(
            "The fisrt matrix width should be same as second matrix height,"
            "but received fisrt matrix width %d"
            ", second matrix height %d",
            W1, H2));
758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779
    int ldc = W2 * head_number;
    int sub_width = W1 / head_number;

    for (int i = 0; i < head_number; i++) {
      int sub_matA_offset = (transA == CblasNoTrans)
                                ? i * (W1 / head_number)
                                : i * (W1 / head_number) * H1;
      int sub_matB_offset = (transB == CblasNoTrans)
                                ? i * (W1 / head_number) * W2
                                : i * (W1 / head_number);
      int sub_matC_offset = i * W2;
      for (int k = 0; k < batchCount; ++k) {
        a_array[k] = &A[k * strideA] + sub_matA_offset;
        b_array[k] = &B[k * strideB] + sub_matB_offset;
        c_array[k] = &C[k * H1 * head_number * W2] + sub_matC_offset;
      }

      CBlas<T>::GEMM_BATCH(CblasRowMajor, &transA, &transB, &H1, &W2,
                           &sub_width, &alpha, a_array.data(), &lda,
                           b_array.data(), &ldb, &beta, c_array.data(), &ldc,
                           1 /* group_count */, &batchCount);
    }
780 781 782 783
  }
}
#endif

T
tensor-tang 已提交
784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819
template <typename DeviceContext>
template <typename T>
void Blas<DeviceContext>::MatMul(const int M, const int N, const int K,
                                 const T *A, const T *B, T *C) const {
  this->template GEMM<T>(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K,
                         static_cast<T>(1), A, K, B, N, static_cast<T>(0), C,
                         N);
}

template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::MatMul(const int M, const int N,
                                              const int K, const T *A,
                                              const T *B, T *C) const {
#ifdef PADDLE_WITH_LIBXSMM
  // Refer to https://github.com/hfp/libxsmm/blob/master/README.md
  // But the threshold is custom constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20;

  // Since the matrix is very small,
  // so the unit of calculation is already very fast,
  // and the if( M*N*K < LIBXSMM_THRESHOLD) would be overhead,
  // use xsmm directly.
  // Note: SMM use ColMajor
  const char transa = 'N';
  const char transb = 'N';
  const T alpha = static_cast<T>(1);
  const T beta = static_cast<T>(0);
  CBlas<T>::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &N, A, &K, &beta,
                     C, &N);
  return;
#endif

  CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K,
                 static_cast<T>(1), A, K, B, N, static_cast<T>(0), C, N);
}

Y
Yu Yang 已提交
820 821 822 823 824 825 826
template <typename DeviceContext>
template <typename T>
void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
                                 const MatDescriptor &dim_a,
                                 const framework::Tensor &mat_b,
                                 const MatDescriptor &dim_b, T alpha,
                                 framework::Tensor *mat_out, T beta) const {
827 828 829 830 831 832 833 834
  PADDLE_ENFORCE_EQ(
      dim_a.width_, dim_b.height_,
      platform::errors::InvalidArgument(
          "The fisrt matrix width should be same as second matrix height,"
          "but received fisrt matrix width %d"
          ", second matrix height %d",
          dim_a.width_, dim_b.height_));

Y
Yu Yang 已提交
835 836 837 838 839 840 841
  CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
  CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
  if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
    this->template GEMM<T>(transA, transB, dim_a.height_, dim_b.width_,
                           dim_a.width_, alpha, mat_a.data<T>(),
                           mat_b.data<T>(), beta, mat_out->data<T>());
  } else {
842 843 844 845 846 847 848 849
    PADDLE_ENFORCE_EQ(
        dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 ||
            dim_b.batch_size_ == 0,
        true, platform::errors::InvalidArgument(
                  "dim_a.batch_size should be equal to dim_b.batch_size, or "
                  "one of dim_a.batch_size and dim_b.batch_size should be 0. "
                  "But got dim_a.batch_size = %d, dim_b.batch_size = %d.",
                  dim_a.batch_size_, dim_b.batch_size_));
Y
Yu Yang 已提交
850 851 852 853 854 855 856
    this->template BatchedGEMM<T>(
        transA, transB, dim_a.height_, dim_b.width_, dim_a.width_, alpha,
        mat_a.data<T>(), mat_b.data<T>(), beta, mat_out->data<T>(),
        dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_,
        dim_a.stride_, dim_b.stride_);
  }
}
857 858 859 860 861 862 863 864 865 866 867 868

#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA)
/*
 * Multiple two matrixes with multiple heads
 *
 * A new parameter, i.e head_number is added compared to normal MatMul.
 * The head_number describes the number of heads a matrix is vertically
 * split.
 *
 * When user calls this API, the multiplication of two big matrixes is split
 * into multiplication of several (head_number_) small matrixes. e.g. if Mat A
 * is [3, 24] and Mat B is [24, 4], when multiple A and B with head_number as
T
tianshuo78520a 已提交
869 870
 * 4, Mat A will be split as 4 matrix of [3, 6] and Mat B will be
 * (horizontally) split as 4 matrix of [6, 4]. The result of final matrix
871 872
 * will be 4 matrix of [3, 4], i.e. [3, 16].
 * Another example is A is [3, 8], B is [2, 16], head_number is 4. In this
T
tianshuo78520a 已提交
873
 * case, A will be split as [3, 2], B will be (vertically) split as
874
 * [2, 4]. The final result will be 4 matrix of 4 matrix of [3,4], i.e. [3, 16]
875 876 877
 */
template <typename DeviceContext>
template <typename T>
878 879 880 881 882 883 884
void Blas<DeviceContext>::MatMulWithHead(const framework::Tensor &mat_a,
                                         const MatDescriptor &dim_a,
                                         const framework::Tensor &mat_b,
                                         const MatDescriptor &dim_b, T alpha,
                                         int head_number,
                                         framework::Tensor *mat_out, T beta,
                                         bool mat_b_split_vertical) const {
885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903
  PADDLE_ENFORCE_EQ(
      dim_a.width_ % head_number, 0,
      platform::errors::InvalidArgument(
          "The first input width must be some times the head number"
          "but received first input width %d"
          ",  head_number %d",
          dim_a.width_, head_number));
  PADDLE_ENFORCE_GE(head_number, 1,
                    platform::errors::InvalidArgument(
                        "The head number should be greater equal 1,"
                        "but received head number %d",
                        head_number));
  PADDLE_ENFORCE_LE(
      head_number, dim_a.width_,
      platform::errors::InvalidArgument(
          "The head number should be less equal first input width,"
          "but received first input width %d"
          ",  head_number %d",
          dim_a.width_, head_number));
904 905 906
  CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
  CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;

907
  if (mat_b_split_vertical) {
908 909 910 911 912 913 914 915 916 917 918 919 920
    PADDLE_ENFORCE_EQ(
        dim_b.height_, dim_a.width_ / head_number,
        platform::errors::InvalidArgument(
            "The second input height should be equal than first input width,"
            "but received second input height %d, first input width %d",
            dim_b.height_, dim_a.width_ / head_number));
    PADDLE_ENFORCE_EQ(
        dim_a.width_ % head_number, 0,
        platform::errors::InvalidArgument(
            "The second input width should be some times the head number"
            "but received second input width %d"
            ",  head_number %d",
            dim_b.width_, head_number));
921 922
  }

923
  if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
924 925 926 927 928 929 930 931 932 933
    int lda = !dim_a.trans_ ? dim_a.width_ : dim_a.height_;
    int ldb = !dim_b.trans_ ? dim_b.width_ : dim_b.height_;
    int sub_matA_offset;
    int sub_matB_offset;
    int sub_matC_offset;
    int sub_mat_M = dim_a.height_;
    int sub_mat_N;
    int sub_mat_K;
    int ldc;

934
    for (int i = 0; i < head_number; i++) {
935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961
      sub_matA_offset = dim_a.trans_
                            ? i * (dim_a.width_ / head_number) * dim_a.height_
                            : i * (dim_a.width_ / head_number);
      if (mat_b_split_vertical) {
        sub_matB_offset = dim_b.trans_
                              ? i * (dim_b.width_ / head_number) * dim_b.height_
                              : i * (dim_b.width_ / head_number);
        sub_matC_offset = i * dim_b.width_ / head_number;

        sub_mat_N = dim_b.width_ / head_number;
        sub_mat_K = dim_b.height_;

        ldc = dim_b.width_;
      } else {
        sub_matB_offset =
            dim_b.trans_ ? i * (dim_b.height_ / head_number)
                         : i * (dim_b.height_ / head_number) * dim_b.width_;
        sub_matC_offset = i * dim_b.width_;

        sub_mat_N = dim_b.width_;
        sub_mat_K = dim_a.width_ / head_number;

        ldc = head_number * dim_b.width_;
      }

      this->template GEMM<T>(transA, transB, sub_mat_M, sub_mat_N, sub_mat_K,
                             alpha, mat_a.data<T>() + sub_matA_offset, lda,
962 963 964 965
                             mat_b.data<T>() + sub_matB_offset, ldb, beta,
                             mat_out->data<T>() + sub_matC_offset, ldc);
    }
  } else {
966 967 968 969 970 971 972 973 974 975
    PADDLE_ENFORCE_EQ(
        (dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 ||
         dim_b.batch_size_ == 0),
        true,
        platform::errors::InvalidArgument(
            "The first input batch size should be equal than second input,"
            "either two input batch size is 0, but received first input batch "
            "size"
            " %d, second input batch size %d",
            dim_a.batch_size_, dim_b.batch_size_));
976 977

    this->template BatchedGEMMWithHead<T>(
978 979 980
        transA, transB, dim_a.width_, dim_a.height_, dim_b.width_,
        dim_b.height_, alpha, mat_a.data<T>(), mat_b.data<T>(), beta,
        mat_out->data<T>(),
981
        dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_,
982
        dim_a.stride_, dim_b.stride_, head_number, mat_b_split_vertical);
983 984 985 986
  }
}
#endif

Y
Use mkl  
Yu Yang 已提交
987 988 989 990 991 992 993 994 995 996 997
template <typename DeviceContext>
template <typename T>
void Blas<DeviceContext>::VINV(int n, const T *a, T *y) const {
#ifdef PADDLE_WITH_MKLML
  CBlas<T>::VINV(n, a, y);
#else
  for (int i = 0; i < n; ++i) {
    y[i] = 1.0 / a[i];
  }
#endif
}
Y
Yu Yang 已提交
998

Y
Yihua Xu 已提交
999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::VMERF(int n, const T *a, T *y,
                                             int64_t mode) const {
#ifdef PADDLE_WITH_MKLML
  CBlas<T>::VMERF(n, a, y, mode);
#else
  for (int i = 0; i < n; ++i) {
    y[i] = std::erf(a[i]);
  }
#endif
}

1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024
#ifdef PADDLE_WITH_MKLML
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::CSRMM(
    const char *transa, const int *m, const int *n, const int *k,
    const T *alpha, const char *matdescra, const T *val, const int *indx,
    const int *pntrb, const int *pntre, const T *b, const int *ldb,
    const T *beta, T *c, const int *ldc) const {
  CBlas<T>::CSRMM(transa, m, n, k, alpha, matdescra, val, indx, pntrb, pntre, b,
                  ldb, beta, c, ldc);
}
#endif

G
Guo Sheng 已提交
1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
                                            CBLAS_TRANSPOSE transA,
                                            CBLAS_DIAG diag, int M, int N,
                                            T alpha, const T *A, int lda, T *B,
                                            int ldb) const {
  CBlas<T>::TRSM(CblasRowMajor, side, uplo, transA, diag, M, N, alpha, A, lda,
                 B, ldb);
}

Y
Yu Yang 已提交
1036 1037 1038
}  // namespace math
}  // namespace operators
}  // namespace paddle