/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <cmath>
#include <functional>
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"

#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif

namespace paddle {
namespace operators {
namespace math {

#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
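// These thresholds clamp sigmoid inputs before exp(): outside roughly
// [-40, 13] a single-precision sigmoid is already saturated near 0 or 1, so
// clamping costs no accuracy while keeping exp() well behaved.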

#define YMM_FLOAT_BLOCK 8
#define AVX_DOUBLE_BLOCK 4
#define AVX2_DOUBLE_BLOCK 4
#define ZMM_FLOAT_BLOCK 16
#define AVX512_DOUBLE_BLOCK 8
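// The block sizes above are the number of elements one vector register holds:
// a 256-bit YMM register fits 8 floats or 4 doubles; a 512-bit ZMM register
// fits 16 floats or 8 doubles.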

template <typename T>
inline void vec_exp(const int n, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = std::exp(x[i]);
  }
}

template <typename T>
inline void vec_scal(const int n, const T a, T* x) {
  for (int i = 0; i < n; ++i) {
    x[i] = a * x[i];
  }
}

#ifdef PADDLE_WITH_MKLML
template <>
inline void vec_exp<float>(const int n, const float* x, float* y) {
  platform::dynload::vsExp(n, x, y);
}

template <>
inline void vec_exp<double>(const int n, const double* x, double* y) {
  platform::dynload::vdExp(n, x, y);
}

template <>
inline void vec_scal<float>(const int n, const float a, float* x) {
  platform::dynload::cblas_sscal(n, a, x, 1);
}

template <>
inline void vec_scal<double>(const int n, const double a, double* x) {
  platform::dynload::cblas_dscal(n, a, x, 1);
}
#endif

// MKL's scal only supports in-place scaling; choose this overload when src
// and dst are different buffers.
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_scal(const int n, const T a, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = a * x[i];
  }
}

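// The AVX specializations below share one pattern: process the bulk of the
// array in full YMM-width blocks with intrinsics, then finish the n % block
// tail with scalar code. The tail cannot simply redo an overlapping vector
// step, because x and y may alias (in-place calls). For example, with n = 20
// and block = 8, the vector loop covers i = 0..15 and the scalar tail handles
// i = 16..19.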
template <>
inline void vec_scal<float, platform::avx>(const int n, const float a,
                                           const float* x, float* y) {
#ifdef __AVX__
  constexpr int block = YMM_FLOAT_BLOCK;
  if (n < block) {
    vec_scal<float, platform::isa_any>(n, a, x, y);
    return;
  }
  const int rest = n % block;
  const int end = n - rest;
  int i = 0;
  __m256 scalar = _mm256_set1_ps(a);
  __m256 tmp;
#define MOVE_ONE_STEP               \
  tmp = _mm256_loadu_ps(x + i);     \
  tmp = _mm256_mul_ps(tmp, scalar); \
  _mm256_storeu_ps(y + i, tmp)
  for (i = 0; i < end; i += block) {
    MOVE_ONE_STEP;
  }
#undef MOVE_ONE_STEP
  if (rest == 0) {
    return;
  }
  // Finish the tail with scalar code: x and y may alias (in-place call), so
  // the last vector step cannot be redone with overlap.
  for (i = n - rest; i < n; ++i) {
    y[i] = a * x[i];
  }
#else
  vec_scal<float, platform::isa_any>(n, a, x, y);
#endif
}

template <>
inline void vec_scal<float, platform::avx2>(const int n, const float a,
                                            const float* x, float* y) {
  vec_scal<float, platform::avx>(n, a, x, y);
}

template <>
inline void vec_scal<float, platform::avx512f>(const int n, const float a,
                                               const float* x, float* y) {
  // TODO(TJ): enable me
  vec_scal<float, platform::avx2>(n, a, x, y);
}

template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = a - x[i];
  }
}

template <>
inline void vec_bias_sub<float, platform::avx>(const int n, const float a,
                                               const float* x, float* y) {
#ifdef __AVX__
  constexpr int block = YMM_FLOAT_BLOCK;
  if (n < block) {
    vec_bias_sub<float, platform::isa_any>(n, a, x, y);
    return;
  }
  const int rest = n % block;
  const int end = n - rest;
  int i = 0;
  __m256 bias = _mm256_set1_ps(a);
  __m256 tmp;
#define MOVE_ONE_STEP             \
  tmp = _mm256_loadu_ps(x + i);   \
  tmp = _mm256_sub_ps(bias, tmp); \
  _mm256_storeu_ps(y + i, tmp)
  for (i = 0; i < end; i += block) {
    MOVE_ONE_STEP;
  }
#undef MOVE_ONE_STEP
  if (rest == 0) {
    return;
  }
  // Scalar tail; x and y may alias, so no overlapping vector step.
  for (i = n - rest; i < n; ++i) {
    y[i] = a - x[i];
  }
#else
  vec_bias_sub<float, platform::isa_any>(n, a, x, y);
#endif
}

template <>
inline void vec_bias_sub<float, platform::avx2>(const int n, const float a,
                                                const float* x, float* y) {
  vec_bias_sub<float, platform::avx>(n, a, x, y);
}

template <>
inline void vec_bias_sub<float, platform::avx512f>(const int n, const float a,
                                                   const float* x, float* y) {
  // TODO(TJ): enable me
  vec_bias_sub<float, platform::avx2>(n, a, x, y);
}

// out = x*y + (1-x)*z, i.e. a convex blend of y and z gated by x
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
  for (int i = 0; i < n; ++i) {
    out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
  }
}

template <>
inline void vec_cross<float, platform::avx>(const int n, const float* x,
                                            const float* y, const float* z,
                                            float* out) {
#ifdef __AVX__
  constexpr int block = YMM_FLOAT_BLOCK;
  if (n < block) {
    vec_cross<float, platform::isa_any>(n, x, y, z, out);
    return;
  }
  const int rest = n % block;
  const int end = n - rest;
  int i = 0;
  __m256 bias = _mm256_set1_ps(1.f);
  __m256 tmpx, tmpy, tmpz;
  for (i = 0; i < end; i += block) {
    tmpx = _mm256_loadu_ps(x + i);
    tmpy = _mm256_loadu_ps(y + i);
    tmpz = _mm256_loadu_ps(z + i);
    tmpy = _mm256_mul_ps(tmpx, tmpy);  // x * y
    tmpx = _mm256_sub_ps(bias, tmpx);  // 1 - x
    tmpz = _mm256_mul_ps(tmpx, tmpz);  // (1 - x) * z
    tmpz = _mm256_add_ps(tmpy, tmpz);  // x*y + (1-x)*z
    _mm256_storeu_ps(out + i, tmpz);
  }
  if (rest == 0) {
    return;
  }
  // Finish the tail with scalar code; the buffers may alias, so the last
  // vector step cannot be redone with overlap.
  for (i = n - rest; i < n; ++i) {
    out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
  }
#else
  vec_cross<float, platform::isa_any>(n, x, y, z, out);
#endif
}

template <>
inline void vec_cross<float, platform::avx2>(const int n, const float* x,
                                             const float* y, const float* z,
                                             float* out) {
  vec_cross<float, platform::avx>(n, x, y, z, out);
}

template <>
inline void vec_cross<float, platform::avx512f>(const int n, const float* x,
                                                const float* y, const float* z,
                                                float* out) {
  // TODO(TJ): enable me
  vec_cross<float, platform::avx>(n, x, y, z, out);
}

template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = x[i] + a;
  }
}

template <>
inline void vec_add_bias<float, platform::avx>(const int n, const float a,
                                               const float* x, float* y) {
#ifdef __AVX__
  constexpr int block = YMM_FLOAT_BLOCK;
  if (n < block) {
    vec_add_bias<float, platform::isa_any>(n, a, x, y);
    return;
  }
  const int rest = n % block;
  const int end = n - rest;
  int i = 0;
  __m256 bias = _mm256_set1_ps(a);
  __m256 tmp;
#define MOVE_ONE_STEP             \
  tmp = _mm256_loadu_ps(x + i);   \
  tmp = _mm256_add_ps(tmp, bias); \
  _mm256_storeu_ps(y + i, tmp)
  for (i = 0; i < end; i += block) {
    MOVE_ONE_STEP;
  }
#undef MOVE_ONE_STEP
  if (rest == 0) {
    return;
  }
  // Scalar tail; x and y may alias, so no overlapping vector step.
  for (i = n - rest; i < n; ++i) {
    y[i] = x[i] + a;
  }
#else
  vec_add_bias<float, platform::isa_any>(n, a, x, y);
#endif
}

template <>
inline void vec_add_bias<float, platform::avx2>(const int n, const float a,
                                                const float* x, float* y) {
  vec_add_bias<float, platform::avx>(n, a, x, y);
}

template <>
inline void vec_add_bias<float, platform::avx512f>(const int n, const float a,
                                                   const float* x, float* y) {
  // TODO(TJ): enable me
  vec_add_bias<float, platform::avx2>(n, a, x, y);
}

template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_identity(const int n, const T* x, T* y) {
  // Intentionally a no-op: y is never written, so "identity" presumably only
  // makes sense for in-place use where the data is already in y.
  return;
}

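// sigmoid(x) = 1 / (1 + exp(-x)); the implementations below clamp x to
// [SIGMOID_THRESHOLD_MIN, SIGMOID_THRESHOLD_MAX], negate, exponentiate, and
// finish with the reciprocal form.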
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_sigmoid(const int n, const T* x, T* y) {
  const T min = SIGMOID_THRESHOLD_MIN;
  const T max = SIGMOID_THRESHOLD_MAX;
  for (int i = 0; i < n; ++i) {
    y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
    y[i] = static_cast<T>(0) - y[i];
  }
  vec_exp<T>(n, y, y);
  for (int i = 0; i < n; ++i) {
    y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
  }
}

template <>
inline void vec_sigmoid<float, platform::avx>(const int n, const float* x,
                                              float* y) {
#ifdef __AVX__
  constexpr int block = YMM_FLOAT_BLOCK;
  if (n < block) {
    vec_sigmoid<float, platform::isa_any>(n, x, y);
    return;
  }
  const int rest = n % block;
  const int end = n - rest;
  int i = 0;
  __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
  __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
  __m256 zeros = _mm256_setzero_ps();
  __m256 tmp;
// First pass: y = -clamp(x) so that a single vec_exp gives exp(-clamp(x)).
#define MOVE_ONE_STEP              \
  tmp = _mm256_loadu_ps(x + i);    \
  tmp = _mm256_max_ps(tmp, min);   \
  tmp = _mm256_min_ps(tmp, max);   \
  tmp = _mm256_sub_ps(zeros, tmp); \
  _mm256_storeu_ps(y + i, tmp)
  for (i = 0; i < end; i += block) {
    MOVE_ONE_STEP;
  }
#undef MOVE_ONE_STEP
  if (rest != 0) {
    // Scalar tail: the src and dst addresses may be equal, so the last vector
    // step cannot be redone with overlap.
    const float xmin = SIGMOID_THRESHOLD_MIN;
    const float xmax = SIGMOID_THRESHOLD_MAX;
    for (i = n - rest; i < n; ++i) {
      y[i] = 0.f - ((x[i] < xmin) ? xmin : ((x[i] > xmax) ? xmax : x[i]));
    }
  }

  vec_exp<float>(n, y, y);  // y = exp(-clamp(x))

  // Second pass: y = 1 / (1 + exp(-clamp(x))).
  __m256 ones = _mm256_set1_ps(1.0f);
#define MOVE_ONE_STEP             \
  tmp = _mm256_loadu_ps(y + i);   \
  tmp = _mm256_add_ps(ones, tmp); \
  tmp = _mm256_div_ps(ones, tmp); \
  _mm256_storeu_ps(y + i, tmp)
  for (i = 0; i < end; i += block) {
    MOVE_ONE_STEP;
  }
#undef MOVE_ONE_STEP
  if (rest == 0) {
    return;
  }
  // Scalar tail for the remaining elements.
  for (i = n - rest; i < n; ++i) {
    y[i] = 1.f / (1.f + y[i]);
  }
#else
  vec_sigmoid<float, platform::isa_any>(n, x, y);
#endif
}

template <>
inline void vec_sigmoid<float, platform::avx2>(const int n, const float* x,
                                               float* y) {
  vec_sigmoid<float, platform::avx>(n, x, y);
}

template <>
inline void vec_sigmoid<float, platform::avx512f>(const int n, const float* x,
                                                  float* y) {
  // TODO(TJ): enable me
  vec_sigmoid<float, platform::avx2>(n, x, y);
}

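// tanh(x) = 2 * sigmoid(2x) - 1, so vec_tanh is composed from the vectorized
// scal, sigmoid, and add_bias kernels above instead of a dedicated kernel.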
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_tanh(const int n, const T* x, T* y) {
  vec_scal<T, isa>(n, static_cast<T>(2), x, y);
  vec_sigmoid<T, isa>(n, y, y);
  vec_scal<T>(n, static_cast<T>(2), y);
  vec_add_bias<T, isa>(n, static_cast<T>(-1), y, y);
}

// TODO(TJ): make relu clip
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_relu(const int n, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = x[i] > 0 ? x[i] : 0;
  }
}

template <>
inline void vec_relu<float, platform::avx>(const int n, const float* x,
                                           float* y) {
#ifdef __AVX__
  constexpr int block = YMM_FLOAT_BLOCK;
  // Use scalar code for short arrays (fewer than four blocks).
  if (n < block * 4) {
    vec_relu<float, platform::isa_any>(n, x, y);
    return;
  }

  const int rest = n % block;
  const int end = n - rest;
  int i = 0;
  __m256 zeros = _mm256_setzero_ps();
  __m256 tmp;
#define MOVE_ONE_STEP              \
  tmp = _mm256_loadu_ps(x + i);    \
  tmp = _mm256_max_ps(tmp, zeros); \
  _mm256_storeu_ps(y + i, tmp)
  for (i = 0; i < end; i += block) {
    MOVE_ONE_STEP;
  }
  if (rest == 0) {
    return;
  }
  // Unlike the kernels above, relu handles the tail with one overlapping
  // vector step at i = n - block: max(x, 0) is idempotent, so recomputing
  // already-stored elements is safe even when x and y alias.
  i = n - block;
  MOVE_ONE_STEP;
#undef MOVE_ONE_STEP

#else
  vec_relu<float, platform::isa_any>(n, x, y);
#endif
}

template <>
inline void vec_relu<float, platform::avx2>(const int n, const float* x,
                                            float* y) {
  vec_relu<float, platform::avx>(n, x, y);
}

template <>
inline void vec_relu<float, platform::avx512f>(const int n, const float* x,
                                               float* y) {
  // TODO(TJ): enable me
  vec_relu<float, platform::avx2>(n, x, y);
}

// TODO(TJ): optimize the double versions of sigmoid, tanh and relu if
// necessary

template <typename T, platform::cpu_isa_t isa = platform::isa_any>
class VecActivations {
 public:
  std::function<void(const int, const T*, T*)> operator()(
      const std::string& type) {
    if (type == "sigmoid") {
      return vec_sigmoid<T, isa>;
    } else if (type == "relu") {
      return vec_relu<T, isa>;
    } else if (type == "tanh") {
      return vec_tanh<T, isa>;
    } else if (type == "identity" || type == "") {
      return vec_identity<T, isa>;
    }
    PADDLE_THROW("Unsupported activation type: %s", type);
  }
};
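
// Example usage (a minimal sketch; `n`, `x_data`, and `y_data` are
// illustrative names for a caller-provided length and float buffers):
//
//   VecActivations<float, platform::avx> act;
//   auto compute = act("tanh");
//   compute(n, x_data, y_data);  // y = tanh(x), elementwise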

}  // namespace math
}  // namespace operators
}  // namespace paddle