/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#include "paddle/fluid/operators/jit/more/mkl/mkl.h"
#include "paddle/fluid/operators/jit/refer/refer.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/dynload/mklml.h"

namespace paddle {
namespace operators {
namespace jit {
namespace more {
namespace mkl {
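
// MKL-backed ("more") implementations of the jit kernels. Each template
// specialization below forwards to a routine loaded dynamically from
// MKL/MKLML; the CanBeUsed() overloads further down decide when these
// versions are preferred over the reference implementations.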

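// Row-major, non-transposed GEMM: c[m x n] = 1.0 * a[m x k] * b[k x n] + 0.0 * c.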
template <>
void MatMul<float>(const float* a, const float* b, float* c,
                   const matmul_attr_t* attr) {
  platform::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                                 attr->m, attr->n, attr->k, 1.f, a, attr->k, b,
                                 attr->n, 0.f, c, attr->n);
}

template <>
void MatMul<double>(const double* a, const double* b, double* c,
                    const matmul_attr_t* attr) {
  platform::dynload::cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                                 attr->m, attr->n, attr->k, 1.0, a, attr->k, b,
                                 attr->n, 0.0, c, attr->n);
}

template <>
void VMul<float>(const float* x, const float* y, float* z, int n) {
  platform::dynload::vsMul(n, x, y, z);
}

template <>
void VMul<double>(const double* x, const double* y, double* z, int n) {
  platform::dynload::vdMul(n, x, y, z);
}

template <>
void VAdd<float>(const float* x, const float* y, float* z, int n) {
  platform::dynload::vsAdd(n, x, y, z);
}

template <>
void VAdd<double>(const double* x, const double* y, double* z, int n) {
  platform::dynload::vdAdd(n, x, y, z);
}

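// MKL's cblas_?scal only scales a buffer in place, so it is used when the
// output y aliases the input x; otherwise fall back to the reference VScal.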
template <>
void VScal<float>(const float* a, const float* x, float* y, int n) {
  if (x == y) {
    platform::dynload::cblas_sscal(n, *a, y, 1);
  } else {
    refer::VScal<float>(a, x, y, n);
  }
}

template <>
void VScal<double>(const double* a, const double* x, double* y, int n) {
  if (x == y) {
    platform::dynload::cblas_dscal(n, *a, y, 1);
  } else {
    refer::VScal<double>(a, x, y, n);
  }
}

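// StrideScal scales every stride-th element among the first n entries. With
// MKL this becomes a strided cblas_?scal over n / stride elements, again only
// for the in-place (x == y) case.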
template <>
void StrideScal<float>(const float* a, const float* x, float* y, int n,
                       int stride) {
  if (x == y) {
    platform::dynload::cblas_sscal(n / stride, *a, y, stride);
  } else {
    refer::StrideScal<float>(a, x, y, n, stride);
  }
}

template <>
void StrideScal<double>(const double* a, const double* x, double* y, int n,
                        int stride) {
  if (x == y) {
    platform::dynload::cblas_dscal(n / stride, *a, y, stride);
  } else {
    refer::StrideScal<double>(a, x, y, n, stride);
  }
}

template <>
void VExp<float>(const float* x, float* y, int n) {
  platform::dynload::vsExp(n, x, y);
}

template <>
void VExp<double>(const double* x, double* y, int n) {
  platform::dynload::vdExp(n, x, y);
}

template <>
void VSquare<float>(const float* x, float* y, int n) {
  platform::dynload::vsSqr(n, x, y);
}

template <>
void VSquare<double>(const double* x, double* y, int n) {
  platform::dynload::vdSqr(n, x, y);
}

template <>
void VCopy<float>(const float* x, float* y, int n) {
  platform::dynload::cblas_scopy(n, x, 1, y, 1);
}

template <>
void VCopy<double>(const double* x, double* y, int n) {
  platform::dynload::cblas_dcopy(n, x, 1, y, 1);
}

template <>
void VAXPY<float>(float a, const float* x, float* y, int n) {
  platform::dynload::cblas_saxpy(n, a, x, 1, y, 1);
}

template <>
void VAXPY<double>(double a, const double* x, double* y, int n) {
  platform::dynload::cblas_daxpy(n, a, x, 1, y, 1);
}

template <>
void ASum<float>(const float* x, float* res, int n) {
  res[0] = platform::dynload::cblas_sasum(n, x, 1);
}

template <>
void ASum<double>(const double* x, double* res, int n) {
  res[0] = platform::dynload::cblas_dasum(n, x, 1);
}

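// StrideASum: like ASum but over every stride-th element, expressed as a
// strided cblas_?asum over n / stride elements.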
template <>
void StrideASum<float>(const float* x, float* res, int n, int stride) {
  res[0] = platform::dynload::cblas_sasum(n / stride, x, stride);
}

template <>
void StrideASum<double>(const double* x, double* res, int n, int stride) {
  res[0] = platform::dynload::cblas_dasum(n / stride, x, stride);
}

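// CanBeUsed() heuristics: the MKL kernel is only selected when the problem
// size (and the available instruction set) make the library call worthwhile;
// for smaller sizes other implementations are used instead. The size
// thresholds below are empirical cut-offs.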
// TODO(TJ): tune these thresholds carefully on AVX, AVX2 and AVX512
template <>
bool VMulKernel<float>::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx512f) && d > 512;
}

template <>
bool VAddKernel<float>::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx) && d > 512;
}

template <>
bool VScalKernel<float>::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx512f) && d > 512;
}

template <>
bool StrideScalKernel<float>::CanBeUsed(const int& d) const {
  return true;
}

template <>
bool VExpKernel<float>::CanBeUsed(const int& d) const {
  return d > 7;
}

template <>
bool VSquareKernel<float>::CanBeUsed(const int& d) const {
  return d > 7;
}

template <>
bool VCopyKernel<float>::CanBeUsed(const int& d) const {
  return d > 15;
}

template <>
bool VBroadcastKernel<float>::CanBeUsed(const int64_t& d) const {
  return d > 127;
}

template <>
bool VBroadcastKernel<double>::CanBeUsed(const int64_t& attr) const {
  return true;
}

template <>
bool VSigmoidKernel<float>::CanBeUsed(const int& d) const {
  return d > 7;
}

template <>
bool VTanhKernel<float>::CanBeUsed(const int& d) const {
  return d > 7;
}

template <>
bool SeqPoolKernel<float>::CanBeUsed(const seq_pool_attr_t& attr) const {
  return true;
}

template <>
bool SeqPoolKernel<double>::CanBeUsed(const seq_pool_attr_t& attr) const {
  return true;
}

template <>
bool EmbSeqPoolKernel<float>::CanBeUsed(const emb_seq_pool_attr_t& attr) const {
  return true;
}

template <>
bool EmbSeqPoolKernel<double>::CanBeUsed(
    const emb_seq_pool_attr_t& attr) const {
  return true;
}

template <>
bool SgdKernel<float>::CanBeUsed(const sgd_attr_t& attr) const {
  return true;
}

template <>
bool SgdKernel<double>::CanBeUsed(const sgd_attr_t& attr) const {
  return true;
}

template <>
bool MatMulKernel<float>::CanBeUsed(const matmul_attr_t& attr) const {
  return platform::MayIUse(platform::avx);
}

template <>
bool MatMulKernel<double>::CanBeUsed(const matmul_attr_t& attr) const {
  return true;
}

template <>
bool SoftmaxKernel<float>::CanBeUsed(const int& d) const {
  // tuned on avx2
  return platform::MayIUse(platform::avx) && d < 60;
}

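// The double-precision variants of these element-wise kernels are always
// reported as usable; presumably there is no faster alternative for double,
// so the MKL kernel takes precedence over the reference implementation.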
#define AWALYS_USE_ME_WITH_DOUBLE(func)                      \
  template <>                                                \
  bool func##Kernel<double>::CanBeUsed(const int& d) const { \
    return true;                                             \
  }

AWALYS_USE_ME_WITH_DOUBLE(VMul);
AWALYS_USE_ME_WITH_DOUBLE(VAdd);
AWALYS_USE_ME_WITH_DOUBLE(VScal);
AWALYS_USE_ME_WITH_DOUBLE(StrideScal);
AWALYS_USE_ME_WITH_DOUBLE(VExp);
AWALYS_USE_ME_WITH_DOUBLE(VSigmoid);
AWALYS_USE_ME_WITH_DOUBLE(VTanh);
AWALYS_USE_ME_WITH_DOUBLE(VSquare);
AWALYS_USE_ME_WITH_DOUBLE(VCopy);
AWALYS_USE_ME_WITH_DOUBLE(Softmax);

#undef AWALYS_USE_ME_WITH_DOUBLE
}  // namespace mkl
}  // namespace more
}  // namespace jit
}  // namespace operators
}  // namespace paddle

namespace mkl = paddle::operators::jit::more::mkl;

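// Register the float and double MKL kernels above under their kernel types
// (kMatMul, kVMul, ...).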
#define REGISTER_MKL_KERNEL(func)                                 \
  REGISTER_JITKERNEL_MORE(k##func, mkl, mkl::func##Kernel<float>, \
                          mkl::func##Kernel<double>)

REGISTER_MKL_KERNEL(MatMul);
REGISTER_MKL_KERNEL(VMul);
REGISTER_MKL_KERNEL(VAdd);
REGISTER_MKL_KERNEL(VScal);
REGISTER_MKL_KERNEL(StrideScal);
REGISTER_MKL_KERNEL(VExp);
REGISTER_MKL_KERNEL(VSquare);
REGISTER_MKL_KERNEL(VCopy);
REGISTER_MKL_KERNEL(VBroadcast);
REGISTER_MKL_KERNEL(VSigmoid);
REGISTER_MKL_KERNEL(VTanh);
REGISTER_MKL_KERNEL(SeqPool);
REGISTER_MKL_KERNEL(EmbSeqPool);
REGISTER_MKL_KERNEL(Softmax);
REGISTER_MKL_KERNEL(Sgd);

#undef REGISTER_MKL_KERNEL