jit_kernel_blas.cc 8.7 KB
Newer Older
T
tensor-tang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/math/jit_kernel.h"
#include <string>
T
tensor-tang 已提交
17
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
18
#include "paddle/fluid/operators/math/jit_kernel_refer.h"
T
tensor-tang 已提交
19 20
#include "paddle/fluid/platform/enforce.h"

T
tensor-tang 已提交
21 22 23 24
#ifdef PADDLE_WITH_XBYAK
#include "paddle/fluid/operators/math/jit_code.h"
#endif

T
tensor-tang 已提交
25 26 27 28 29 30 31 32 33 34
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif

namespace paddle {
namespace operators {
namespace math {
namespace jitkernel {
namespace jit = platform::jit;

T
tensor-tang 已提交
35 36 37 38 39 40 41 42
#ifdef PADDLE_WITH_MKLML
// Thin wrappers over MKL's VML element-wise multiply (z[i] = x[i] * y[i]),
// specialized per precision so kernel impls can assign a single
// function-pointer type to this->Compute.
template <typename T>
void VMulMKL(const T* x, const T* y, T* z, int n);

template <>
void VMulMKL<float>(const float* x, const float* y, float* z, int n) {
  platform::dynload::vsMul(n, x, y, z);
}

template <>
void VMulMKL<double>(const double* x, const double* y, double* z, int n) {
  platform::dynload::vdMul(n, x, y, z);
}

// Thin wrappers over MKL's VML element-wise add (z[i] = x[i] + y[i]),
// one specialization per precision.
template <typename T>
void VAddMKL(const T* x, const T* y, T* z, int n);

template <>
void VAddMKL<float>(const float* x, const float* y, float* z, int n) {
  platform::dynload::vsAdd(n, x, y, z);
}

template <>
void VAddMKL<double>(const double* x, const double* y, double* z, int n) {
  platform::dynload::vdAdd(n, x, y, z);
}

template <typename T>
void VScalMKL(const T* a, const T* x, T* y, int n);

template <>
void VScalMKL<float>(const float* a, const float* x, float* y, int n) {
  if (x == y) {
    platform::dynload::cblas_sscal(n, *a, y, 1);
  } else {
70
    refer::VScal<float>(a, x, y, n);
T
tensor-tang 已提交
71 72 73 74 75 76 77 78
  }
}

template <>
void VScalMKL<double>(const double* a, const double* x, double* y, int n) {
  if (x == y) {
    platform::dynload::cblas_dscal(n, *a, y, 1);
  } else {
79
    refer::VScal<double>(a, x, y, n);
T
tensor-tang 已提交
80 81 82
  }
}

T
tensor-tang 已提交
83 84
#endif

T
tensor-tang 已提交
85
/* VMUL JitKernel */
T
tensor-tang 已提交
86
template <typename T>
T
tensor-tang 已提交
87 88
class VMulKernelImpl : public VMulKernel<T> {
 public:
T
tensor-tang 已提交
89
  JITKERNEL_DECLARE_STATIC_FUNC;
T
tensor-tang 已提交
90
  explicit VMulKernelImpl(int d) : VMulKernel<T>() {
T
tensor-tang 已提交
91
#ifdef PADDLE_WITH_XBYAK
T
tensor-tang 已提交
92
    if (useJIT(d)) {
T
tensor-tang 已提交
93
      // roughly estimate the size of code
94
      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
T
tensor-tang 已提交
95
      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::mul, 0, false,
T
tensor-tang 已提交
96
                                         sz > 4096 ? sz : 4096));
T
tensor-tang 已提交
97 98 99 100
      this->Compute =
          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
      return;
    }
T
tensor-tang 已提交
101
#endif
T
tensor-tang 已提交
102
#ifdef PADDLE_WITH_MKLML
T
tensor-tang 已提交
103 104 105 106
    if (useMKL(d)) {
      this->Compute = VMulMKL<T>;
      return;
    }
T
tensor-tang 已提交
107
#endif
108
    this->Compute = refer::VMul<T>;
T
tensor-tang 已提交
109 110
  }

T
tensor-tang 已提交
111
#ifdef PADDLE_WITH_XBYAK
T
tensor-tang 已提交
112

T
tensor-tang 已提交
113
 private:
T
tensor-tang 已提交
114
  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
T
tensor-tang 已提交
115
#endif
T
tensor-tang 已提交
116
};
#ifdef PADDLE_WITH_XBYAK
// JIT code generation is only implemented for float.
template <>
bool VMulKernelImpl<float>::useJIT(int d) {
  return gen::VXXJitCode::init(d);
}
#endif
#ifdef PADDLE_WITH_MKLML
// For float, MKL only pays off on long vectors on AVX-512 hardware;
// shorter inputs go through the JIT/reference paths.
template <>
bool VMulKernelImpl<float>::useMKL(int d) {
  return jit::MayIUse(jit::avx512f) && d > 512;
}

// No JIT path exists for double, so always prefer MKL.
template <>
bool VMulKernelImpl<double>::useMKL(int d) {
  return true;
}
#endif
T
tensor-tang 已提交
136

T
tensor-tang 已提交
137 138
/* VAdd JitKernel */
template <typename T>
T
tensor-tang 已提交
139 140
class VAddKernelImpl : public VAddKernel<T> {
 public:
T
tensor-tang 已提交
141
  JITKERNEL_DECLARE_STATIC_FUNC;
T
tensor-tang 已提交
142
  explicit VAddKernelImpl(int d) : VAddKernel<T>() {
T
tensor-tang 已提交
143
#ifdef PADDLE_WITH_XBYAK
T
tensor-tang 已提交
144
    if (useJIT(d)) {
145
      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
T
tensor-tang 已提交
146
      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 0, false,
T
tensor-tang 已提交
147
                                         sz > 4096 ? sz : 4096));
T
tensor-tang 已提交
148 149 150
      this->Compute =
          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
      return;
T
tensor-tang 已提交
151
    }
T
tensor-tang 已提交
152
#endif
T
tensor-tang 已提交
153 154 155 156
#ifdef PADDLE_WITH_MKLML
    if (useMKL(d)) {
      this->Compute = VAddMKL<T>;
      return;
T
tensor-tang 已提交
157
    }
T
tensor-tang 已提交
158
#endif
159
    this->Compute = refer::VAdd<T>;
T
tensor-tang 已提交
160
  }
T
fix mac  
tensor-tang 已提交
161
#ifdef PADDLE_WITH_XBYAK
T
tensor-tang 已提交
162 163

 private:
T
tensor-tang 已提交
164
  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
T
fix mac  
tensor-tang 已提交
165
#endif
T
tensor-tang 已提交
166
};
#ifdef PADDLE_WITH_XBYAK
// JIT code generation is only implemented for float.
template <>
bool VAddKernelImpl<float>::useJIT(int d) {
  return gen::VXXJitCode::init(d);
}
#endif
#ifdef PADDLE_WITH_MKLML
// MKL only pays off for float on long vectors; note this threshold has no
// AVX-512 requirement, unlike VMulKernelImpl<float>::useMKL.
template <>
bool VAddKernelImpl<float>::useMKL(int d) {
  return d > 512;
}

// No JIT path exists for double, so always prefer MKL.
template <>
bool VAddKernelImpl<double>::useMKL(int d) {
  return true;
}
#endif

T
tensor-tang 已提交
187 188 189 190
/* VAddRelu JitKernel */
template <typename T>
class VAddReluKernelImpl : public VAddReluKernel<T> {
 public:
T
tensor-tang 已提交
191
  JITKERNEL_DECLARE_STATIC_FUNC;
T
tensor-tang 已提交
192
  explicit VAddReluKernelImpl(int d) : VAddReluKernel<T>() {
T
tensor-tang 已提交
193
#ifdef PADDLE_WITH_XBYAK
T
tensor-tang 已提交
194
    if (useJIT(d)) {
195
      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
T
tensor-tang 已提交
196
      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 0, true,
T
tensor-tang 已提交
197
                                         sz > 4096 ? sz : 4096));
T
tensor-tang 已提交
198 199 200 201
      this->Compute =
          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
      return;
    }
T
tensor-tang 已提交
202
#endif
203
    this->Compute = refer::VAddRelu<T>;
T
tensor-tang 已提交
204
  }
T
fix mac  
tensor-tang 已提交
205
#ifdef PADDLE_WITH_XBYAK
T
tensor-tang 已提交
206 207

 private:
T
tensor-tang 已提交
208
  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
T
tensor-tang 已提交
209
#endif
T
tensor-tang 已提交
210 211
};

#ifdef PADDLE_WITH_XBYAK
// JIT code generation is only implemented for float.
template <>
bool VAddReluKernelImpl<float>::useJIT(int d) {
  return gen::VXXJitCode::init(d);
}
#endif

T
tensor-tang 已提交
219 220
/* VScal JitKernel */
template <typename T>
T
tensor-tang 已提交
221 222
class VScalKernelImpl : public VScalKernel<T> {
 public:
T
tensor-tang 已提交
223
  JITKERNEL_DECLARE_STATIC_FUNC;
T
tensor-tang 已提交
224 225 226
  explicit VScalKernelImpl(int d) : VScalKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
    if (useJIT(d)) {
227
      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
T
tensor-tang 已提交
228 229
      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::mul, 1, false,
                                         sz > 4096 ? sz : 4096));
T
tensor-tang 已提交
230 231 232
      this->Compute =
          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
      return;
T
tensor-tang 已提交
233
    }
T
tensor-tang 已提交
234
#endif
T
tensor-tang 已提交
235
#ifdef PADDLE_WITH_MKLML
T
tensor-tang 已提交
236 237 238 239
    if (useMKL(d)) {
      this->Compute = VScalMKL<T>;
      return;
    }
T
tensor-tang 已提交
240
#endif
241
    this->Compute = refer::VScal<T>;
T
tensor-tang 已提交
242
  }
T
tensor-tang 已提交
243
#ifdef PADDLE_WITH_XBYAK
T
tensor-tang 已提交
244

T
tensor-tang 已提交
245
 private:
T
tensor-tang 已提交
246
  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
T
tensor-tang 已提交
247
#endif
T
tensor-tang 已提交
248 249 250 251 252
};

#ifdef PADDLE_WITH_XBYAK
// JIT code generation is only implemented for float; the second argument
// marks the scalar operand position (scalar_index = 1).
template <>
bool VScalKernelImpl<float>::useJIT(int d) {
  return gen::VXXJitCode::init(d, 1);
}
#endif

#ifdef PADDLE_WITH_MKLML
// MKL only pays off for float on long vectors.
template <>
bool VScalKernelImpl<float>::useMKL(int d) {
  return d > 512;
}
// No JIT path exists for double, so always prefer MKL.
template <>
bool VScalKernelImpl<double>::useMKL(int d) {
  return true;
}
#endif
T
tensor-tang 已提交
267

T
tensor-tang 已提交
268
/* VAddBias JitKernel */
T
tensor-tang 已提交
269
template <typename T>
T
tensor-tang 已提交
270 271
class VAddBiasKernelImpl : public VAddBiasKernel<T> {
 public:
T
tensor-tang 已提交
272
  JITKERNEL_DECLARE_STATIC_FUNC;
T
tensor-tang 已提交
273 274 275
  explicit VAddBiasKernelImpl(int d) : VAddBiasKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
    if (useJIT(d)) {
276
      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
T
tensor-tang 已提交
277 278 279 280 281
      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 1, false,
                                         sz > 4096 ? sz : 4096));
      this->Compute =
          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
      return;
T
tensor-tang 已提交
282
    }
T
tensor-tang 已提交
283
#endif
T
tensor-tang 已提交
284

285
    this->Compute = refer::VAddBias<T>;
T
tensor-tang 已提交
286
  }
T
tensor-tang 已提交
287
#ifdef PADDLE_WITH_XBYAK
T
tensor-tang 已提交
288

T
tensor-tang 已提交
289 290
 private:
  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
T
tensor-tang 已提交
291
#endif
T
tensor-tang 已提交
292 293 294 295 296 297 298
};

#ifdef PADDLE_WITH_XBYAK
// JIT code generation is only implemented for float; the second argument
// marks the scalar operand position (scalar_index = 1).
template <>
bool VAddBiasKernelImpl<float>::useJIT(int d) {
  return gen::VXXJitCode::init(d, 1);
}
#endif

T
tensor-tang 已提交
301
/* VRelu JitKernel */
T
tensor-tang 已提交
302
template <typename T>
T
tensor-tang 已提交
303 304
class VReluKernelImpl : public VReluKernel<T> {
 public:
T
tensor-tang 已提交
305
  JITKERNEL_DECLARE_STATIC_FUNC;
T
tensor-tang 已提交
306 307 308
  explicit VReluKernelImpl(int d) : VReluKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
    if (useJIT(d)) {
T
tensor-tang 已提交
309
      size_t sz = 96 /* init size */ +
310
                  d / YMM_FLOAT_BLOCK * 4 /* instructions */ *
T
tensor-tang 已提交
311
                      8 /* average bytes for each instruction */;
312 313
      jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::relu,
                                          sz > 4096 ? sz : 4096));
T
tensor-tang 已提交
314 315
      this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>();
      return;
T
tensor-tang 已提交
316
    }
T
tensor-tang 已提交
317
#endif
T
tensor-tang 已提交
318

319
    this->Compute = refer::VRelu<T>;
T
tensor-tang 已提交
320
  }
T
tensor-tang 已提交
321
#ifdef PADDLE_WITH_XBYAK
T
tensor-tang 已提交
322

T
tensor-tang 已提交
323
 private:
324
  std::unique_ptr<gen::VActJitCode> jitcode_{nullptr};
T
tensor-tang 已提交
325
#endif
T
tensor-tang 已提交
326 327 328 329 330
};

#ifdef PADDLE_WITH_XBYAK
// JIT code generation is only implemented for float.
template <>
bool VReluKernelImpl<float>::useJIT(int d) {
  return gen::VActJitCode::init(d, gen::operand_type::relu);
}
#endif

/* An empty JitKernel */
T
tensor-tang 已提交
336
template <typename T>
T
tensor-tang 已提交
337 338
class VIdentityKernelImpl : public VIdentityKernel<T> {
 public:
T
tensor-tang 已提交
339 340
  JITKERNEL_DECLARE_STATIC_FUNC;
  explicit VIdentityKernelImpl(int d) : VIdentityKernel<T>() {
341
    this->Compute = refer::VIdentity<T>;
T
tensor-tang 已提交
342
  }
T
tensor-tang 已提交
343 344
};

// Register each kernel implementation with the kernel pool under its
// lower-case key (macro expands per supported precision).
REGISTER_JITKERNEL(vmul, VMulKernel);
REGISTER_JITKERNEL(vadd, VAddKernel);
REGISTER_JITKERNEL(vaddrelu, VAddReluKernel);
REGISTER_JITKERNEL(vscal, VScalKernel);
REGISTER_JITKERNEL(vaddbias, VAddBiasKernel);
REGISTER_JITKERNEL(vrelu, VReluKernel);
REGISTER_JITKERNEL(videntity, VIdentityKernel);
T
tensor-tang 已提交
352 353 354 355 356

}  // namespace jitkernel
}  // namespace math
}  // namespace operators
}  // namespace paddle