/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#include "paddle/fluid/operators/jit/more/mkl/mkl.h"
#include "paddle/fluid/operators/jit/refer/refer.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/dynload/mklml.h"

namespace paddle {
namespace operators {
namespace jit {
namespace more {
namespace mkl {

template <>
void MatMul<float>(const float* a, const float* b, float* c, int m, int n,
                   int k) {
  // c = a * b, all matrices row-major and untransposed; plain product
  // (alpha = 1, beta = 0), with leading dimensions k, n, n respectively.
  const float alpha = 1.f;
  const float beta = 0.f;
  platform::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m,
                                 n, k, alpha, a, k, b, n, beta, c, n);
}

template <>
void MatMul<double>(const double* a, const double* b, double* c, int m, int n,
                    int k) {
  // Same contract as the float specialization, via double-precision GEMM.
  const double alpha = 1.0;
  const double beta = 0.0;
  platform::dynload::cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m,
                                 n, k, alpha, a, k, b, n, beta, c, n);
}

// Element-wise product: z[i] = x[i] * y[i] for i in [0, n), via MKL VM.
template <>
void VMul<float>(const float* x, const float* y, float* z, int n) {
  platform::dynload::vsMul(n, x, y, z);
}

template <>
void VMul<double>(const double* x, const double* y, double* z, int n) {
  platform::dynload::vdMul(n, x, y, z);
}

// Element-wise sum: z[i] = x[i] + y[i] for i in [0, n), via MKL VM.
template <>
void VAdd<float>(const float* x, const float* y, float* z, int n) {
  platform::dynload::vsAdd(n, x, y, z);
}

template <>
void VAdd<double>(const double* x, const double* y, double* z, int n) {
  platform::dynload::vdAdd(n, x, y, z);
}

// y = (*a) * x. BLAS ?scal scales a vector in place, so it is only usable
// when x and y alias the same buffer; otherwise defer to the reference
// (out-of-place) implementation.
template <>
void VScal<float>(const float* a, const float* x, float* y, int n) {
  if (x != y) {
    refer::VScal<float>(a, x, y, n);
  } else {
    platform::dynload::cblas_sscal(n, *a, y, 1);
  }
}

template <>
void VScal<double>(const double* a, const double* x, double* y, int n) {
  if (x != y) {
    refer::VScal<double>(a, x, y, n);
  } else {
    platform::dynload::cblas_dscal(n, *a, y, 1);
  }
}

// Element-wise exponential: y[i] = exp(x[i]) for i in [0, n), via MKL VM.
template <>
void VExp<float>(const float* x, float* y, int n) {
  platform::dynload::vsExp(n, x, y);
}

template <>
void VExp<double>(const double* x, double* y, int n) {
  platform::dynload::vdExp(n, x, y);
}

// Element-wise square: y[i] = x[i] * x[i] for i in [0, n), via MKL VM.
template <>
void VSquare<float>(const float* x, float* y, int n) {
  platform::dynload::vsSqr(n, x, y);
}

template <>
void VSquare<double>(const double* x, double* y, int n) {
  platform::dynload::vdSqr(n, x, y);
}

// Contiguous copy: y[i] = x[i] for i in [0, n) (unit stride on both sides).
template <>
void VCopy<float>(const float* x, float* y, int n) {
  platform::dynload::cblas_scopy(n, x, 1, y, 1);
}

template <>
void VCopy<double>(const double* x, double* y, int n) {
  platform::dynload::cblas_dcopy(n, x, 1, y, 1);
}

// y[i] += a * x[i] for i in [0, n), via BLAS axpy with unit strides.
template <>
void VAXPY<float>(float a, const float* x, float* y, int n) {
  const int inc = 1;
  platform::dynload::cblas_saxpy(n, a, x, inc, y, inc);
}

template <>
void VAXPY<double>(double a, const double* x, double* y, int n) {
  const int inc = 1;
  platform::dynload::cblas_daxpy(n, a, x, inc, y, inc);
}

119
// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
120 121 122 123 124
template <>
bool MatMulKernel<float>::UseMe(const int& d) const {
  return platform::MayIUse(platform::avx);
}

125
template <>
T
tensor-tang 已提交
126
bool VMulKernel<float>::UseMe(const int& d) const {
127 128 129 130
  return platform::MayIUse(platform::avx512f) && d > 512;
}

template <>
T
tensor-tang 已提交
131
bool VAddKernel<float>::UseMe(const int& d) const {
132 133 134 135
  return platform::MayIUse(platform::avx512f) && d > 512;
}

template <>
T
tensor-tang 已提交
136
bool VScalKernel<float>::UseMe(const int& d) const {
137 138 139
  return platform::MayIUse(platform::avx512f) && d > 512;
}

140
template <>
T
tensor-tang 已提交
141
bool VExpKernel<float>::UseMe(const int& d) const {
142 143 144
  return d > 7;
}

T
tensor-tang 已提交
145 146 147 148 149
template <>
bool VSquareKernel<float>::UseMe(const int& d) const {
  return d > 7;
}

150
template <>
T
tensor-tang 已提交
151
bool VSigmoidKernel<float>::UseMe(const int& d) const {
152 153 154 155
  return d > 7;
}

template <>
T
tensor-tang 已提交
156
bool VTanhKernel<float>::UseMe(const int& d) const {
157 158 159
  return d > 7;
}

// The MKL sequence-pooling kernel is usable for any attribute configuration.
template <>
bool SeqPoolKernel<float>::UseMe(const seq_pool_attr_t& attr) const {
  return true;
}

template <>
bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const {
  return true;
}

T
tensor-tang 已提交
170 171 172 173
#define AWALYS_USE_ME_WITH_DOUBLE(func)                  \
  template <>                                            \
  bool func##Kernel<double>::UseMe(const int& d) const { \
    return true;                                         \
174 175
  }

176
AWALYS_USE_ME_WITH_DOUBLE(MatMul);
177 178 179
AWALYS_USE_ME_WITH_DOUBLE(VMul);
AWALYS_USE_ME_WITH_DOUBLE(VAdd);
AWALYS_USE_ME_WITH_DOUBLE(VScal);
180 181 182
AWALYS_USE_ME_WITH_DOUBLE(VExp);
AWALYS_USE_ME_WITH_DOUBLE(VSigmoid);
AWALYS_USE_ME_WITH_DOUBLE(VTanh);
T
tensor-tang 已提交
183
AWALYS_USE_ME_WITH_DOUBLE(VSquare);
184 185

#undef AWALYS_USE_ME_WITH_DOUBLE
}  // namespace mkl
}  // namespace more
}  // namespace jit
}  // namespace operators
}  // namespace paddle

T
tensor-tang 已提交
192
namespace mkl = paddle::operators::jit::more::mkl;
T
tensor-tang 已提交
193

194 195 196 197
#define REGISTER_MKL_KERNEL(key, func)                        \
  REGISTER_JITKERNEL_MORE(key, mkl, mkl::func##Kernel<float>, \
                          mkl::func##Kernel<double>)

198
REGISTER_MKL_KERNEL(kMatMul, MatMul);
T
tensor-tang 已提交
199 200 201 202
REGISTER_MKL_KERNEL(kVMul, VMul);
REGISTER_MKL_KERNEL(kVAdd, VAdd);
REGISTER_MKL_KERNEL(kVScal, VScal);
REGISTER_MKL_KERNEL(kVExp, VExp);
T
tensor-tang 已提交
203
REGISTER_MKL_KERNEL(kVSquare, VSquare);
T
tensor-tang 已提交
204 205
REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
REGISTER_MKL_KERNEL(kVTanh, VTanh);
206
REGISTER_MKL_KERNEL(kSeqPool, SeqPool);
207 208

#undef REGISTER_MKL_KERNEL