/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/math/jit_kernel.h"
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#include "paddle/fluid/operators/math/jit_kernel_refer.h"
#include "paddle/fluid/platform/enforce.h"

#ifdef PADDLE_WITH_XBYAK
#include "paddle/fluid/operators/math/jit_code.h"
#endif

#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif

namespace paddle {
namespace operators {
namespace math {
namespace jitkernel {
namespace jit = platform::jit;

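// Each kernel implementation below picks its Compute function once, at
// construction time, in priority order: JIT code generated with xbyak (when
// available and worthwhile for the given size d), an MKL routine, and
// finally the portable reference implementation from jit_kernel_refer.h.
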
#ifdef PADDLE_WITH_MKLML
template <typename T>
void VMulMKL(const T* x, const T* y, T* z, int n);

template <>
void VMulMKL<float>(const float* x, const float* y, float* z, int n) {
  platform::dynload::vsMul(n, x, y, z);
}

template <>
void VMulMKL<double>(const double* x, const double* y, double* z, int n) {
  platform::dynload::vdMul(n, x, y, z);
}

template <typename T>
void VAddMKL(const T* x, const T* y, T* z, int n);

template <>
void VAddMKL<float>(const float* x, const float* y, float* z, int n) {
  platform::dynload::vsAdd(n, x, y, z);
}

template <>
void VAddMKL<double>(const double* x, const double* y, double* z, int n) {
  platform::dynload::vdAdd(n, x, y, z);
}

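// cblas_?scal scales a vector in place, so the MKL path below is used only
// when x and y alias; otherwise fall back to the reference implementation.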
template <typename T>
void VScalMKL(const T* a, const T* x, T* y, int n);

template <>
void VScalMKL<float>(const float* a, const float* x, float* y, int n) {
  if (x == y) {
    platform::dynload::cblas_sscal(n, *a, y, 1);
  } else {
    refer::VScal<float>(a, x, y, n);
  }
}

template <>
void VScalMKL<double>(const double* a, const double* x, double* y, int n) {
  if (x == y) {
    platform::dynload::cblas_dscal(n, *a, y, 1);
  } else {
    refer::VScal<double>(a, x, y, n);
  }
}

#endif

/* VMUL JitKernel */
template <typename T>
class VMulKernelImpl : public VMulKernel<T> {
 public:
  JITKERNEL_DECLARE_STATIC_FUNC;
  explicit VMulKernelImpl(int d) : VMulKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
    if (useJIT(d)) {
      // roughly estimate the size of code
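      // (96 bytes for prologue/epilogue plus roughly 4 instructions of ~8
      // bytes each per YMM-width block; the buffer is floored at 4 KB below)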
      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::mul, 0, false,
                                         sz > 4096 ? sz : 4096));
      this->Compute =
          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
      return;
    }
#endif
#ifdef PADDLE_WITH_MKLML
    if (useMKL(d)) {
      this->Compute = VMulMKL<T>;
      return;
    }
#endif
    this->Compute = refer::VMul<T>;
  }

#ifdef PADDLE_WITH_XBYAK

 private:
  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
#endif
};

#ifdef PADDLE_WITH_XBYAK
template <>
bool VMulKernelImpl<float>::useJIT(int d) {
  return gen::VXXJitCode::init(d);
}
#endif

#ifdef PADDLE_WITH_MKLML
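// For float, MKL pays off only on AVX-512 hardware and fairly long vectors;
// below that, the JIT and reference paths presumably win on call overhead.
// There is no such cutoff for double, which always uses MKL when available.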
template <>
bool VMulKernelImpl<float>::useMKL(int d) {
  return jit::MayIUse(jit::avx512f) && d > 512;
}

template <>
bool VMulKernelImpl<double>::useMKL(int d) {
  return true;
}
#endif

/* VAdd JitKernel */
template <typename T>
class VAddKernelImpl : public VAddKernel<T> {
 public:
  JITKERNEL_DECLARE_STATIC_FUNC;
  explicit VAddKernelImpl(int d) : VAddKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
    if (useJIT(d)) {
      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 0, false,
                                         sz > 4096 ? sz : 4096));
      this->Compute =
          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
      return;
    }
#endif
#ifdef PADDLE_WITH_MKLML
    if (useMKL(d)) {
      this->Compute = VAddMKL<T>;
      return;
    }
#endif
    this->Compute = refer::VAdd<T>;
  }
#ifdef PADDLE_WITH_XBYAK

 private:
  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
#endif
};

#ifdef PADDLE_WITH_XBYAK
template <>
bool VAddKernelImpl<float>::useJIT(int d) {
  return gen::VXXJitCode::init(d);
}
#endif

#ifdef PADDLE_WITH_MKLML
template <>
bool VAddKernelImpl<float>::useMKL(int d) {
  return d > 512;
}

template <>
bool VAddKernelImpl<double>::useMKL(int d) {
  return true;
}
#endif

#ifdef PADDLE_WITH_MKLDNN
/* EltwiseMul for nChw16c & NC inputs JitKernel */
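// (nChw16c is the MKL-DNN channel-blocked layout with 16-channel blocks;
// the plain NC input is presumably broadcast over the blocked tensor's
// spatial dimensions.)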
template <typename T>
class EltwiseMulnChw16cNCKernelImpl
    : public math::jitkernel::EltwiseMulnChw16cNCKernel<T> {
 public:
  JITKERNEL_DECLARE_STATIC_FUNC;
  explicit EltwiseMulnChw16cNCKernelImpl(int d)
      : EltwiseMulnChw16cNCKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
    using mul_func_t = void (*)(const float*, const float*, float*, int, int);
    if (useJIT(d)) {
      // roughly estimate the size of code
      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
      sz = sz > 4096 ? sz : 4096;
      jitcode_.reset(new gen::EltwiseMulnChw16cNC(sz));
      this->Compute = (mul_func_t)jitcode_->getCode();
      return;
    }
#endif
    PADDLE_THROW(
        "This kernel shouldn't be used in a non-Xbyak, non-MKL-DNN "
        "environment");
  }

#ifdef PADDLE_WITH_XBYAK

 private:
  std::unique_ptr<gen::EltwiseMulnChw16cNC> jitcode_{nullptr};
#endif
};

#ifdef PADDLE_WITH_XBYAK
template <>
bool EltwiseMulnChw16cNCKernelImpl<float>::useJIT(int d) {
  return true;
}
#endif
#endif

/* VAddRelu JitKernel */
template <typename T>
class VAddReluKernelImpl : public VAddReluKernel<T> {
 public:
  JITKERNEL_DECLARE_STATIC_FUNC;
  explicit VAddReluKernelImpl(int d) : VAddReluKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
    if (useJIT(d)) {
      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 0, true,
                                         sz > 4096 ? sz : 4096));
      this->Compute =
          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
      return;
    }
#endif
    this->Compute = refer::VAddRelu<T>;
  }
#ifdef PADDLE_WITH_XBYAK

 private:
  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
#endif
};

#ifdef PADDLE_WITH_XBYAK
template <>
bool VAddReluKernelImpl<float>::useJIT(int d) {
  return gen::VXXJitCode::init(d);
}
#endif

/* VScal JitKernel */
template <typename T>
class VScalKernelImpl : public VScalKernel<T> {
 public:
  JITKERNEL_DECLARE_STATIC_FUNC;
  explicit VScalKernelImpl(int d) : VScalKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
    if (useJIT(d)) {
      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::mul, 1, false,
                                         sz > 4096 ? sz : 4096));
      this->Compute =
          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
      return;
    }
#endif
#ifdef PADDLE_WITH_MKLML
    if (useMKL(d)) {
      this->Compute = VScalMKL<T>;
      return;
    }
#endif
    this->Compute = refer::VScal<T>;
  }
#ifdef PADDLE_WITH_XBYAK

 private:
  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
#endif
};

#ifdef PADDLE_WITH_XBYAK
template <>
bool VScalKernelImpl<float>::useJIT(int d) {
  return gen::VXXJitCode::init(d, 1);
}
#endif

#ifdef PADDLE_WITH_MKLML
template <>
bool VScalKernelImpl<float>::useMKL(int d) {
  return d > 512;
}
template <>
bool VScalKernelImpl<double>::useMKL(int d) {
  return true;
}
#endif

/* VAddBias JitKernel */
template <typename T>
class VAddBiasKernelImpl : public VAddBiasKernel<T> {
 public:
  JITKERNEL_DECLARE_STATIC_FUNC;
  explicit VAddBiasKernelImpl(int d) : VAddBiasKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
    if (useJIT(d)) {
      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 1, false,
                                         sz > 4096 ? sz : 4096));
      this->Compute =
          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
      return;
    }
#endif

    this->Compute = refer::VAddBias<T>;
  }
#ifdef PADDLE_WITH_XBYAK

 private:
  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
#endif
};

#ifdef PADDLE_WITH_XBYAK
template <>
bool VAddBiasKernelImpl<float>::useJIT(int d) {
  return gen::VXXJitCode::init(d, 1);
}
#endif

/* VRelu JitKernel */
template <typename T>
class VReluKernelImpl : public VReluKernel<T> {
 public:
  JITKERNEL_DECLARE_STATIC_FUNC;
  explicit VReluKernelImpl(int d) : VReluKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
    if (useJIT(d)) {
      size_t sz = 96 /* init size */ +
                  d / YMM_FLOAT_BLOCK * 4 /* instructions */ *
                      8 /* average bytes for each instruction */;
      jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::relu,
                                          sz > 4096 ? sz : 4096));
      this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>();
      return;
    }
#endif

    this->Compute = refer::VRelu<T>;
  }
#ifdef PADDLE_WITH_XBYAK

 private:
  std::unique_ptr<gen::VActJitCode> jitcode_{nullptr};
#endif
};

#ifdef PADDLE_WITH_XBYAK
template <>
bool VReluKernelImpl<float>::useJIT(int d) {
  return gen::VActJitCode::init(d, gen::operand_type::relu);
}
#endif

/* VIdentity JitKernel: an identity (pass-through) kernel, y = x */
template <typename T>
class VIdentityKernelImpl : public VIdentityKernel<T> {
 public:
  JITKERNEL_DECLARE_STATIC_FUNC;
  explicit VIdentityKernelImpl(int d) : VIdentityKernel<T>() {
    this->Compute = refer::VIdentity<T>;
  }
};

REGISTER_JITKERNEL(vmul, VMulKernel);
REGISTER_JITKERNEL(vadd, VAddKernel);
REGISTER_JITKERNEL(vaddrelu, VAddReluKernel);
REGISTER_JITKERNEL(vscal, VScalKernel);
REGISTER_JITKERNEL(vaddbias, VAddBiasKernel);
REGISTER_JITKERNEL(vrelu, VReluKernel);
REGISTER_JITKERNEL(videntity, VIdentityKernel);
#ifdef PADDLE_WITH_MKLDNN
REGISTER_JITKERNEL(eltwise_mul_nchw16c, EltwiseMulnChw16cNCKernel);
#endif
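
// A minimal usage sketch, assuming the KernelPool interface declared in
// jit_kernel.h (not shown in this file): registered kernels are fetched by
// type and size, then invoked through their Compute pointer, e.g.
//
//   const auto& ker = jitkernel::KernelPool::Instance()
//                         .template Get<jitkernel::VMulKernel<float>>(n);
//   ker->Compute(x, y, z, n);  // z[i] = x[i] * y[i], i in [0, n)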

}  // namespace jitkernel
}  // namespace math
}  // namespace operators
}  // namespace paddle