Unverified commit 923ad5dc, authored by PuQing, committed by GitHub

add cpu_info.h (#48403)

Parent: fe617f95
@@ -50,6 +50,8 @@ inline void cpuid(int reg[4], int x) {
 #endif
 #endif
 
+#include "paddle/phi/backends/cpu/cpu_info.h"
+
 namespace paddle {
 namespace platform {
@@ -82,18 +84,7 @@ size_t NPUPinnedMinChunkSize();
 //! Get the maximum chunk size for buddy allocator.
 size_t NPUPinnedMaxChunkSize();
 
-typedef enum {
-  isa_any,
-  sse42,
-  avx,
-  avx2,
-  avx512f,
-  avx512_core,
-  avx512_core_vnni,
-  avx512_mic,
-  avx512_mic_4ops,
-  avx512_bf16,
-} cpu_isa_t;  // Instruction set architecture
+using namespace phi::backends::cpu;  // NOLINT
 
 // May I use some instruction
 bool MayIUse(const cpu_isa_t cpu_isa);
...
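The using-directive above is the compatibility shim: code that still spells paddle::platform::avx2 or paddle::platform::isa_any keeps compiling, because the enumerators now live in phi::backends::cpu and are re-exported into paddle::platform. A minimal sketch of an unaffected call site, assuming only the MayIUse(cpu_isa_t) predicate declared above (the helper name BestIsaLabel is illustrative, not part of this commit):

#include "paddle/fluid/platform/cpu_info.h"

// Illustrative helper: report the widest ISA this process may use.
// The old paddle::platform spellings resolve via the new using-directive.
inline const char* BestIsaLabel() {
  namespace plat = paddle::platform;
  if (plat::MayIUse(plat::avx512f)) return "avx512f";
  if (plat::MayIUse(plat::avx2)) return "avx2";
  if (plat::MayIUse(plat::avx)) return "avx";
  return "generic";
}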
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stddef.h>
#ifdef _WIN32
#if defined(__AVX2__)
#include <immintrin.h> // avx2
#elif defined(__AVX__)
#include <intrin.h> // avx
#endif // AVX
#else // WIN32
#ifdef __AVX__
#include <immintrin.h>
#endif
#endif // WIN32
#if defined(_WIN32)
#define ALIGN32_BEG __declspec(align(32))
#define ALIGN32_END
#else
#define ALIGN32_BEG
#define ALIGN32_END __attribute__((aligned(32)))
#endif // _WIN32
namespace phi {
namespace backends {
namespace cpu {
typedef enum {
isa_any,
sse42,
avx,
avx2,
avx512f,
avx512_core,
avx512_core_vnni,
avx512_mic,
avx512_mic_4ops,
avx512_bf16,
} cpu_isa_t; // Instruction set architecture
} // namespace cpu
} // namespace backends
} // namespace phi
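Since cpu_isa_t is a plain enum, its values can be used as non-type template parameters; that is how cpu_vec.h below selects kernel variants at compile time. A small sketch of that dispatch pattern (scale_impl and namespace example are hypothetical names, not part of the header):

#include "paddle/phi/backends/cpu/cpu_info.h"

namespace example {  // hypothetical, for illustration only

// Generic fallback, instantiated for any ISA tag.
template <typename T,
          phi::backends::cpu::cpu_isa_t isa = phi::backends::cpu::isa_any>
void scale_impl(int n, T a, const T* x, T* y) {
  for (int i = 0; i < n; ++i) y[i] = a * x[i];
}

// A full specialization keyed on the enum value can use wider vectors;
// this one simply forwards so the sketch stays self-contained.
template <>
inline void scale_impl<float, phi::backends::cpu::avx>(int n,
                                                       float a,
                                                       const float* x,
                                                       float* y) {
  scale_impl<float, phi::backends::cpu::isa_any>(n, a, x, y);
}

}  // namespace example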
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <functional>
 #include <string>
 
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/core/enforce.h"
 
 #ifdef PADDLE_WITH_MKLML
@@ -81,8 +84,7 @@ inline void vec_scal<double>(const int n, const double a, double* x) {
 #endif
 
 // MKL scal only support inplace, choose this if src and dst are not equal
-template <typename T,
-          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+template <typename T, backends::cpu::cpu_isa_t isa = backends::cpu::isa_any>
 inline void vec_scal(const int n, const T a, const T* x, T* y) {
   for (int i = 0; i < n; ++i) {
     y[i] = a * x[i];
@@ -90,14 +89,14 @@ inline void vec_scal(const int n, const T a, const T* x, T* y) {
 }
 
 template <>
-inline void vec_scal<float, paddle::platform::avx>(const int n,
-                                                   const float a,
-                                                   const float* x,
-                                                   float* y) {
+inline void vec_scal<float, backends::cpu::avx>(const int n,
+                                                const float a,
+                                                const float* x,
+                                                float* y) {
 #ifdef __AVX__
   constexpr int block = YMM_FLOAT_BLOCK;
   if (n < block) {
-    vec_scal<float, paddle::platform::isa_any>(n, a, x, y);
+    vec_scal<float, backends::cpu::isa_any>(n, a, x, y);
     return;
   }
   const int rest = n % block;
@@ -121,29 +120,28 @@ inline void vec_scal<float, paddle::platform::avx>(const int n,
     y[i] = a * x[i];
   }
 #else
-  vec_scal<float, paddle::platform::isa_any>(n, a, x, y);
+  vec_scal<float, backends::cpu::isa_any>(n, a, x, y);
 #endif
 }
 
 template <>
-inline void vec_scal<float, paddle::platform::avx2>(const int n,
-                                                    const float a,
-                                                    const float* x,
-                                                    float* y) {
-  vec_scal<float, paddle::platform::avx>(n, a, x, y);
+inline void vec_scal<float, backends::cpu::avx2>(const int n,
+                                                 const float a,
+                                                 const float* x,
+                                                 float* y) {
+  vec_scal<float, backends::cpu::avx>(n, a, x, y);
 }
 
 template <>
-inline void vec_scal<float, paddle::platform::avx512f>(const int n,
-                                                       const float a,
-                                                       const float* x,
-                                                       float* y) {
+inline void vec_scal<float, backends::cpu::avx512f>(const int n,
+                                                    const float a,
+                                                    const float* x,
+                                                    float* y) {
   // TODO(TJ): enable me
-  vec_scal<float, paddle::platform::avx2>(n, a, x, y);
+  vec_scal<float, backends::cpu::avx2>(n, a, x, y);
 }
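The collapsed region in the hunk above (@@ -121,29 +120,28 @@) holds the AVX body, which follows the usual block/remainder split: process YMM_FLOAT_BLOCK (eight) floats per iteration with 256-bit intrinsics, then finish the tail in scalar code. A self-contained sketch of that shape, not the exact Paddle loop:

#include <immintrin.h>

// Sketch of a vec_scal-style AVX body: y = a * x, 8 floats per step.
inline void scal_avx_sketch(int n, float a, const float* x, float* y) {
  constexpr int block = 8;  // YMM_FLOAT_BLOCK: 256 bits / 32-bit lanes
  const int end = n - n % block;
  const __m256 va = _mm256_set1_ps(a);  // broadcast the scalar
  for (int i = 0; i < end; i += block) {
    __m256 vx = _mm256_loadu_ps(x + i);  // unaligned load
    _mm256_storeu_ps(y + i, _mm256_mul_ps(va, vx));
  }
  for (int i = end; i < n; ++i) {
    y[i] = a * x[i];  // scalar tail for the last n % 8 elements
  }
}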
-template <typename T,
-          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+template <typename T, backends::cpu::cpu_isa_t isa = backends::cpu::isa_any>
 inline void vec_sum(const size_t n, const T* x, T* s) {
   s[0] = x[0];
   for (size_t i = 1; i < n; ++i) {
@@ -152,13 +150,13 @@ inline void vec_sum(const size_t n, const T* x, T* s) {
 }
 
 template <>
-inline void vec_sum<float, paddle::platform::avx>(const size_t n,
-                                                  const float* x,
-                                                  float* s) {
+inline void vec_sum<float, backends::cpu::avx>(const size_t n,
+                                               const float* x,
+                                               float* s) {
 #ifdef __AVX__
   constexpr unsigned int block = YMM_FLOAT_BLOCK;
   if (n < block) {
-    vec_sum<float, paddle::platform::isa_any>(n, x, s);
+    vec_sum<float, backends::cpu::isa_any>(n, x, s);
     return;
   }
@@ -182,12 +180,11 @@ inline void vec_sum<float, paddle::platform::avx>(const size_t n,
     s[0] += x[i];
   }
 #else
-  vec_sum<float, paddle::platform::isa_any>(n, x, s);
+  vec_sum<float, backends::cpu::isa_any>(n, x, s);
 #endif
 }
 
-template <typename T,
-          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+template <typename T, backends::cpu::cpu_isa_t isa = backends::cpu::isa_any>
 inline void vec_mul(const size_t n, const T* x, const T* y, T* z) {
   for (size_t i = 0; i < n; ++i) {
     z[i] = x[i] * y[i];
@@ -195,14 +192,14 @@ inline void vec_mul(const size_t n, const T* x, const T* y, T* z) {
 }
 
 template <>
-inline void vec_mul<float, paddle::platform::avx>(const size_t n,
-                                                  const float* x,
-                                                  const float* y,
-                                                  float* z) {
+inline void vec_mul<float, backends::cpu::avx>(const size_t n,
+                                               const float* x,
+                                               const float* y,
+                                               float* z) {
 #ifdef __AVX__
   constexpr unsigned int block = YMM_FLOAT_BLOCK;
   if (n < block) {
-    vec_mul<float, paddle::platform::isa_any>(n, x, y, z);
+    vec_mul<float, backends::cpu::isa_any>(n, x, y, z);
     return;
   }
@@ -217,12 +214,11 @@ inline void vec_mul<float, paddle::platform::avx>(const size_t n,
     z[i] = x[i] * y[i];
   }
 #else
-  vec_mul<float, paddle::platform::isa_any>(n, x, y, z);
+  vec_mul<float, backends::cpu::isa_any>(n, x, y, z);
 #endif
 }
 
-template <typename T,
-          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+template <typename T, backends::cpu::cpu_isa_t isa = backends::cpu::isa_any>
 inline void vec_mul_reduce(const size_t n, const T* x, const T* y, T* z) {
   z[0] = x[0] * y[0];
   for (size_t i = 1; i < n; ++i) {
@@ -231,14 +227,14 @@ inline void vec_mul_reduce(const size_t n, const T* x, const T* y, T* z) {
 }
 
 template <>
-inline void vec_mul_reduce<float, paddle::platform::avx>(const size_t n,
-                                                         const float* x,
-                                                         const float* y,
-                                                         float* z) {
+inline void vec_mul_reduce<float, backends::cpu::avx>(const size_t n,
+                                                      const float* x,
+                                                      const float* y,
+                                                      float* z) {
 #ifdef __AVX__
   constexpr unsigned int block = YMM_FLOAT_BLOCK;
   if (n < block) {
-    vec_mul_reduce<float, paddle::platform::isa_any>(n, x, y, z);
+    vec_mul_reduce<float, backends::cpu::isa_any>(n, x, y, z);
     return;
   }
@@ -262,12 +258,11 @@ inline void vec_mul_reduce<float, paddle::platform::avx>(const size_t n,
     z[0] += x[i] * y[i];
   }
 #else
-  vec_mul_reduce<float, paddle::platform::isa_any>(n, x, y, z);
+  vec_mul_reduce<float, backends::cpu::isa_any>(n, x, y, z);
 #endif
 }
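vec_sum and vec_mul_reduce also need a horizontal reduction to collapse a YMM accumulator into one scalar before the tail loop. A sketch of that epilogue under the same AVX assumption (again not the exact Paddle code):

#include <immintrin.h>

// Collapse a __m256 accumulator into a single float.
inline float hsum256_sketch(__m256 v) {
  __m128 lo = _mm256_castps256_ps128(v);    // lower four lanes
  __m128 hi = _mm256_extractf128_ps(v, 1);  // upper four lanes
  lo = _mm_add_ps(lo, hi);                  // four partial sums
  lo = _mm_hadd_ps(lo, lo);                 // two partial sums
  lo = _mm_hadd_ps(lo, lo);                 // final sum in lane 0
  return _mm_cvtss_f32(lo);
}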
-template <typename T,
-          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+template <typename T, backends::cpu::cpu_isa_t isa = backends::cpu::isa_any>
 inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
   for (int i = 0; i < n; ++i) {
     y[i] = a - x[i];
@@ -275,14 +270,14 @@ inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
 }
 
 template <>
-inline void vec_bias_sub<float, paddle::platform::avx>(const int n,
-                                                       const float a,
-                                                       const float* x,
-                                                       float* y) {
+inline void vec_bias_sub<float, backends::cpu::avx>(const int n,
+                                                    const float a,
+                                                    const float* x,
+                                                    float* y) {
 #ifdef __AVX__
   constexpr int block = YMM_FLOAT_BLOCK;
   if (n < block) {
-    vec_bias_sub<float, paddle::platform::isa_any>(n, a, x, y);
+    vec_bias_sub<float, backends::cpu::isa_any>(n, a, x, y);
     return;
   }
   const int rest = n % block;
@@ -306,30 +301,29 @@ inline void vec_bias_sub<float, paddle::platform::avx>(const int n,
     y[i] = a - x[i];
   }
 #else
-  vec_bias_sub<float, paddle::platform::isa_any>(n, a, x, y);
+  vec_bias_sub<float, backends::cpu::isa_any>(n, a, x, y);
 #endif
 }
 
 template <>
-inline void vec_bias_sub<float, paddle::platform::avx2>(const int n,
-                                                        const float a,
-                                                        const float* x,
-                                                        float* y) {
-  vec_bias_sub<float, paddle::platform::avx>(n, a, x, y);
+inline void vec_bias_sub<float, backends::cpu::avx2>(const int n,
+                                                     const float a,
+                                                     const float* x,
+                                                     float* y) {
+  vec_bias_sub<float, backends::cpu::avx>(n, a, x, y);
 }
 
 template <>
-inline void vec_bias_sub<float, paddle::platform::avx512f>(const int n,
-                                                           const float a,
-                                                           const float* x,
-                                                           float* y) {
+inline void vec_bias_sub<float, backends::cpu::avx512f>(const int n,
+                                                        const float a,
+                                                        const float* x,
+                                                        float* y) {
   // TODO(TJ): enable me
-  vec_bias_sub<float, paddle::platform::avx2>(n, a, x, y);
+  vec_bias_sub<float, backends::cpu::avx2>(n, a, x, y);
 }
 
 // out = x*y + (1-x)*z
-template <typename T,
-          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+template <typename T, backends::cpu::cpu_isa_t isa = backends::cpu::isa_any>
 inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
   for (int i = 0; i < n; ++i) {
     out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
@@ -337,12 +331,12 @@ inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
 }
 
 template <>
-inline void vec_cross<float, paddle::platform::avx>(
+inline void vec_cross<float, backends::cpu::avx>(
     const int n, const float* x, const float* y, const float* z, float* out) {
 #ifdef __AVX__
   constexpr int block = YMM_FLOAT_BLOCK;
   if (n < block) {
-    vec_cross<float, paddle::platform::isa_any>(n, x, y, z, out);
+    vec_cross<float, backends::cpu::isa_any>(n, x, y, z, out);
     return;
   }
   const int rest = n % block;
@@ -368,25 +362,24 @@ inline void vec_cross<float, paddle::platform::avx>(
     out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
   }
 #else
-  vec_cross<float, paddle::platform::isa_any>(n, x, y, z, out);
+  vec_cross<float, backends::cpu::isa_any>(n, x, y, z, out);
 #endif
 }
 
 template <>
-inline void vec_cross<float, paddle::platform::avx2>(
+inline void vec_cross<float, backends::cpu::avx2>(
     const int n, const float* x, const float* y, const float* z, float* out) {
-  vec_cross<float, paddle::platform::avx>(n, x, y, z, out);
+  vec_cross<float, backends::cpu::avx>(n, x, y, z, out);
 }
 
 template <>
-inline void vec_cross<float, paddle::platform::avx512f>(
+inline void vec_cross<float, backends::cpu::avx512f>(
     const int n, const float* x, const float* y, const float* z, float* out) {
   // TODO(TJ): enable me
-  vec_cross<float, paddle::platform::avx>(n, x, y, z, out);
+  vec_cross<float, backends::cpu::avx>(n, x, y, z, out);
 }
-template <typename T,
-          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+template <typename T, backends::cpu::cpu_isa_t isa = backends::cpu::isa_any>
 inline void vec_clip(const size_t n, const T a, const T* x, T* y) {
   for (size_t i = 0; i < n; ++i) {
     y[i] = x[i] < a ? a : x[i];
@@ -394,14 +387,14 @@ inline void vec_clip(const size_t n, const T a, const T* x, T* y) {
 }
 
 template <>
-inline void vec_clip<float, paddle::platform::avx>(const size_t n,
-                                                   const float a,
-                                                   const float* x,
-                                                   float* y) {
+inline void vec_clip<float, backends::cpu::avx>(const size_t n,
+                                                const float a,
+                                                const float* x,
+                                                float* y) {
 #ifdef __AVX__
   constexpr unsigned int block = YMM_FLOAT_BLOCK;
   if (n < block) {
-    vec_clip<float, paddle::platform::isa_any>(n, a, x, y);
+    vec_clip<float, backends::cpu::isa_any>(n, a, x, y);
     return;
   }
@@ -417,12 +410,11 @@ inline void vec_clip<float, paddle::platform::avx>(const size_t n,
     y[i] = x[i] < a ? a : x[i];
   }
 #else
-  vec_clip<float, paddle::platform::isa_any>(n, a, x, y);
+  vec_clip<float, backends::cpu::isa_any>(n, a, x, y);
 #endif
 }
 
-template <typename T,
-          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+template <typename T, backends::cpu::cpu_isa_t isa = backends::cpu::isa_any>
 inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
   for (int i = 0; i < n; ++i) {
     y[i] = x[i] + a;
@@ -430,14 +422,14 @@ inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
 }
 
 template <>
-inline void vec_add_bias<float, paddle::platform::avx>(const int n,
-                                                       const float a,
-                                                       const float* x,
-                                                       float* y) {
+inline void vec_add_bias<float, backends::cpu::avx>(const int n,
+                                                    const float a,
+                                                    const float* x,
+                                                    float* y) {
 #ifdef __AVX__
   constexpr int block = YMM_FLOAT_BLOCK;
   if (n < block) {
-    vec_add_bias<float, paddle::platform::isa_any>(n, a, x, y);
+    vec_add_bias<float, backends::cpu::isa_any>(n, a, x, y);
     return;
   }
   const int rest = n % block;
@@ -461,36 +453,34 @@ inline void vec_add_bias<float, paddle::platform::avx>(const int n,
     y[i] = x[i] + a;
   }
 #else
-  vec_add_bias<float, paddle::platform::isa_any>(n, a, x, y);
+  vec_add_bias<float, backends::cpu::isa_any>(n, a, x, y);
 #endif
 }
 
 template <>
-inline void vec_add_bias<float, paddle::platform::avx2>(const int n,
-                                                        const float a,
-                                                        const float* x,
-                                                        float* y) {
-  vec_add_bias<float, paddle::platform::avx>(n, a, x, y);
+inline void vec_add_bias<float, backends::cpu::avx2>(const int n,
+                                                     const float a,
+                                                     const float* x,
+                                                     float* y) {
+  vec_add_bias<float, backends::cpu::avx>(n, a, x, y);
 }
 
 template <>
-inline void vec_add_bias<float, paddle::platform::avx512f>(const int n,
-                                                           const float a,
-                                                           const float* x,
-                                                           float* y) {
+inline void vec_add_bias<float, backends::cpu::avx512f>(const int n,
+                                                        const float a,
+                                                        const float* x,
+                                                        float* y) {
   // TODO(TJ): enable me
-  vec_add_bias<float, paddle::platform::avx2>(n, a, x, y);
+  vec_add_bias<float, backends::cpu::avx2>(n, a, x, y);
 }
-template <typename T,
-          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+template <typename T, backends::cpu::cpu_isa_t isa = backends::cpu::isa_any>
 inline void vec_identity(const int n, const T* x, T* y) {
   // do nothing
   return;
 }
 
-template <typename T,
-          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+template <typename T, backends::cpu::cpu_isa_t isa = backends::cpu::isa_any>
 inline void vec_sigmoid(const int n, const T* x, T* y) {
   const T min = SIGMOID_THRESHOLD_MIN;
   const T max = SIGMOID_THRESHOLD_MAX;
@@ -505,13 +495,13 @@ inline void vec_sigmoid(const int n, const T* x, T* y) {
 }
 
 template <>
-inline void vec_sigmoid<float, paddle::platform::avx>(const int n,
-                                                      const float* x,
-                                                      float* y) {
+inline void vec_sigmoid<float, backends::cpu::avx>(const int n,
+                                                   const float* x,
+                                                   float* y) {
 #ifdef __AVX__
   constexpr int block = YMM_FLOAT_BLOCK;
   if (n < block) {
-    vec_sigmoid<float, paddle::platform::isa_any>(n, x, y);
+    vec_sigmoid<float, backends::cpu::isa_any>(n, x, y);
     return;
   }
   const int rest = n % block;
@@ -560,27 +550,26 @@ inline void vec_sigmoid<float, paddle::platform::avx>(const int n,
     y[i] = 1.f / (1.f + y[i]);
   }
 #else
-  vec_sigmoid<float, paddle::platform::isa_any>(n, x, y);
+  vec_sigmoid<float, backends::cpu::isa_any>(n, x, y);
 #endif
 }
 
 template <>
-inline void vec_sigmoid<float, paddle::platform::avx2>(const int n,
-                                                       const float* x,
-                                                       float* y) {
-  vec_sigmoid<float, paddle::platform::avx>(n, x, y);
+inline void vec_sigmoid<float, backends::cpu::avx2>(const int n,
+                                                    const float* x,
+                                                    float* y) {
+  vec_sigmoid<float, backends::cpu::avx>(n, x, y);
 }
 
 template <>
-inline void vec_sigmoid<float, paddle::platform::avx512f>(const int n,
-                                                          const float* x,
-                                                          float* y) {
+inline void vec_sigmoid<float, backends::cpu::avx512f>(const int n,
+                                                       const float* x,
+                                                       float* y) {
   // TODO(TJ): enable me
-  vec_sigmoid<float, paddle::platform::avx2>(n, x, y);
+  vec_sigmoid<float, backends::cpu::avx2>(n, x, y);
 }
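The scalar form that the AVX sigmoid mirrors: clamp the input to the sigmoid thresholds, then evaluate the logistic. A sketch with the threshold constants written out as assumptions (Paddle defines SIGMOID_THRESHOLD_MIN/MAX as macros elsewhere in this header):

#include <cmath>

// Scalar shape of vec_sigmoid; the clamp bounds here are assumed values.
inline void sigmoid_ref_sketch(int n, const float* x, float* y) {
  const float kMin = -40.f;  // assumed SIGMOID_THRESHOLD_MIN
  const float kMax = 13.f;   // assumed SIGMOID_THRESHOLD_MAX
  for (int i = 0; i < n; ++i) {
    float v = x[i] < kMin ? kMin : (x[i] > kMax ? kMax : x[i]);
    y[i] = 1.f / (1.f + std::exp(-v));  // clamped logistic
  }
}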
-template <typename T,
-          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+template <typename T, backends::cpu::cpu_isa_t isa = backends::cpu::isa_any>
 inline void vec_tanh(const int n, const T* x, T* y) {
   vec_scal<T, isa>(n, static_cast<T>(2), x, y);
   vec_sigmoid<T, isa>(n, y, y);
@@ -589,8 +578,7 @@ inline void vec_tanh(const int n, const T* x, T* y) {
 }
 
 // TODO(TJ): make relu clip
-template <typename T,
-          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+template <typename T, backends::cpu::cpu_isa_t isa = backends::cpu::isa_any>
 inline void vec_relu(const int n, const T* x, T* y) {
   for (int i = 0; i < n; ++i) {
     y[i] = x[i] > 0 ? x[i] : 0;
@@ -598,13 +586,13 @@ inline void vec_relu(const int n, const T* x, T* y) {
 }
 
 template <>
-inline void vec_relu<float, paddle::platform::avx>(const int n,
-                                                   const float* x,
-                                                   float* y) {
+inline void vec_relu<float, backends::cpu::avx>(const int n,
+                                                const float* x,
+                                                float* y) {
 #ifdef __AVX__
   constexpr int block = YMM_FLOAT_BLOCK;
   if (n < block * 4) {
-    vec_relu<float, paddle::platform::isa_any>(n, x, y);
+    vec_relu<float, backends::cpu::isa_any>(n, x, y);
     return;
   }
@@ -628,29 +616,28 @@ inline void vec_relu<float, paddle::platform::avx>(const int n,
 #undef MOVE_ONE_STEP
 #else
-  vec_relu<float, paddle::platform::isa_any>(n, x, y);
+  vec_relu<float, backends::cpu::isa_any>(n, x, y);
 #endif
 }
 
 template <>
-inline void vec_relu<float, paddle::platform::avx2>(const int n,
-                                                    const float* x,
-                                                    float* y) {
-  vec_relu<float, paddle::platform::avx>(n, x, y);
+inline void vec_relu<float, backends::cpu::avx2>(const int n,
+                                                 const float* x,
+                                                 float* y) {
+  vec_relu<float, backends::cpu::avx>(n, x, y);
 }
 
 template <>
-inline void vec_relu<float, paddle::platform::avx512f>(const int n,
-                                                       const float* x,
-                                                       float* y) {
+inline void vec_relu<float, backends::cpu::avx512f>(const int n,
+                                                    const float* x,
+                                                    float* y) {
   // TODO(TJ): enable me
-  vec_relu<float, paddle::platform::avx2>(n, x, y);
+  vec_relu<float, backends::cpu::avx2>(n, x, y);
 }
 
 // TODO(TJ): optimize double of sigmoid, tanh and relu if necessary
-template <typename T,
-          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+template <typename T, backends::cpu::cpu_isa_t isa = backends::cpu::isa_any>
 class VecActivations {
  public:
  std::function<void(const int, const T*, T*)> operator()(
...
@@ -18,7 +18,7 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/core/hostdevice.h"
 
 namespace phi {
...
@@ -42,7 +42,7 @@
   (this is the zlib license)
 */
 
 #pragma once
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 
 /* __m128 is ugly to write */
 typedef __m256 v8sf;  // vector of 8 float (avx)
...
@@ -14,15 +14,13 @@ limitations under the License. */
 
 #include "paddle/phi/kernels/sparse/softmax_grad_kernel.h"
 
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/visit_type.h"
 #include "paddle/phi/kernels/funcs/cpu_vec.h"
 #include "paddle/phi/kernels/sparse/empty_kernel.h"
 
-namespace plt = paddle::platform;
-
 namespace phi {
 namespace sparse {
@@ -72,11 +70,11 @@ void SoftmaxCsrGradKernel(const Context& dev_ctx,
                           out_crows_data[crow_idx]);
 
     T sum = 0;
-    phi::funcs::vec_mul_reduce<T, plt::avx>(
+    phi::funcs::vec_mul_reduce<T, backends::cpu::avx>(
         row_nnz, dout_data, out_data, &sum);
-    phi::funcs::vec_add_bias<T, plt::avx>(
+    phi::funcs::vec_add_bias<T, backends::cpu::avx>(
         row_nnz, static_cast<T>(-1) * sum, dout_data, dx_data);
-    phi::funcs::vec_mul<T, plt::avx>(
+    phi::funcs::vec_mul<T, backends::cpu::avx>(
         row_nnz, dx_data, out_data, dx_data);
 
     out_data = out_data + row_nnz;
...
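Per CSR row, the three vec_* calls above implement the softmax backward identity dx = (dout - <dout, out>) * out, where <.,.> is the dot product over the row's non-zeros. A scalar restatement of the same row update, illustrative only:

// Scalar equivalent of one row of SoftmaxCsrGradKernel.
template <typename T>
void softmax_row_grad_sketch(int row_nnz, const T* dout, const T* out, T* dx) {
  T sum = 0;  // <dout, out>, as computed by vec_mul_reduce
  for (int i = 0; i < row_nnz; ++i) sum += dout[i] * out[i];
  // vec_add_bias subtracts the sum, then vec_mul scales by out.
  for (int i = 0; i < row_nnz; ++i) dx[i] = (dout[i] - sum) * out[i];
}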
@@ -14,15 +14,13 @@ limitations under the License. */
 
 #include "paddle/phi/kernels/sparse/softmax_kernel.h"
 
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/visit_type.h"
 #include "paddle/phi/kernels/funcs/cpu_vec.h"
 #include "paddle/phi/kernels/sparse/empty_kernel.h"
 
-namespace plt = paddle::platform;
-
 namespace phi {
 namespace sparse {
@@ -70,14 +68,14 @@ void SoftmaxCsrKernel(const Context& dev_ctx,
                       x_crows_data[crow_idx]);
     row_max_val = *std::max_element(x_data, x_data + row_nnz);
-    phi::funcs::vec_add_bias<T, plt::avx>(
+    phi::funcs::vec_add_bias<T, backends::cpu::avx>(
         row_nnz, static_cast<T>(-1) * row_max_val, x_data, out_data);
 
     phi::funcs::vec_exp<T>(row_nnz, out_data, out_data);
 
     T sum = 0;
-    phi::funcs::vec_sum<T, plt::avx>(row_nnz, out_data, &sum);
+    phi::funcs::vec_sum<T, backends::cpu::avx>(row_nnz, out_data, &sum);
 
-    phi::funcs::vec_scal<T, plt::avx>(
+    phi::funcs::vec_scal<T, backends::cpu::avx>(
         row_nnz, static_cast<T>(1) / sum, out_data, out_data);
 
     x_data = x_data + row_nnz;
...
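The forward kernel applies the standard numerically stable softmax to each CSR row: shift by the row max, exponentiate, normalize by the sum. A scalar restatement of the row update driven by the vec_* calls above, illustrative only:

#include <algorithm>
#include <cmath>

// Scalar equivalent of one row of SoftmaxCsrKernel.
template <typename T>
void softmax_row_sketch(int row_nnz, const T* x, T* out) {
  T row_max = *std::max_element(x, x + row_nnz);
  T sum = 0;
  for (int i = 0; i < row_nnz; ++i) {
    out[i] = std::exp(x[i] - row_max);  // vec_add_bias + vec_exp
    sum += out[i];
  }
  for (int i = 0; i < row_nnz; ++i) {
    out[i] /= sum;  // vec_sum + vec_scal with 1 / sum
  }
}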
@@ -106,42 +106,43 @@ void TestAndBench(const int n,
 }
 
 TEST(CpuVecTest, sigmoid) {
-  namespace platform = paddle::platform;
   using namespace phi::funcs;  // NOLINT
   for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestAndBench<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
     TestAndBench<float>(
-        sz, vec_sigmoid<float, platform::avx>, ref_sigmoid<float>);
+        sz, vec_sigmoid<float, backends::cpu::avx>, ref_sigmoid<float>);
     TestAndBench<float>(
-        sz, vec_sigmoid<float, platform::avx2>, ref_sigmoid<float>);
+        sz, vec_sigmoid<float, backends::cpu::avx2>, ref_sigmoid<float>);
     TestAndBench<float>(
-        sz, vec_sigmoid<float, platform::avx512f>, ref_sigmoid<float>);
+        sz, vec_sigmoid<float, backends::cpu::avx512f>, ref_sigmoid<float>);
   }
   TestAndBench<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
 }
 
 TEST(CpuVecTest, tanh) {
-  namespace platform = paddle::platform;
   using namespace phi::funcs;  // NOLINT
   for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestAndBench<float>(sz, vec_tanh<float>, ref_tanh<float>);
-    TestAndBench<float>(sz, vec_tanh<float, platform::avx>, ref_tanh<float>);
-    TestAndBench<float>(sz, vec_tanh<float, platform::avx2>, ref_tanh<float>);
     TestAndBench<float>(
-        sz, vec_tanh<float, platform::avx512f>, ref_tanh<float>);
+        sz, vec_tanh<float, backends::cpu::avx>, ref_tanh<float>);
+    TestAndBench<float>(
+        sz, vec_tanh<float, backends::cpu::avx2>, ref_tanh<float>);
+    TestAndBench<float>(
+        sz, vec_tanh<float, backends::cpu::avx512f>, ref_tanh<float>);
   }
   TestAndBench<double>(30, vec_tanh<double>, ref_tanh<double>);
 }
 
 TEST(CpuVecTest, relu) {
-  namespace platform = paddle::platform;
   using namespace phi::funcs;  // NOLINT
   for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestAndBench<float>(sz, vec_relu<float>, ref_relu<float>);
-    TestAndBench<float>(sz, vec_relu<float, platform::avx>, ref_relu<float>);
-    TestAndBench<float>(sz, vec_relu<float, platform::avx2>, ref_relu<float>);
     TestAndBench<float>(
-        sz, vec_relu<float, platform::avx512f>, ref_relu<float>);
+        sz, vec_relu<float, backends::cpu::avx>, ref_relu<float>);
+    TestAndBench<float>(
+        sz, vec_relu<float, backends::cpu::avx2>, ref_relu<float>);
+    TestAndBench<float>(
+        sz, vec_relu<float, backends::cpu::avx512f>, ref_relu<float>);
   }
   TestAndBench<double>(30, vec_relu<double>, ref_relu<double>);
 }
@@ -161,14 +162,16 @@ void compare_sum(size_t n,
 }
 
 TEST(CpuVecTest, vec_sum) {
-  namespace platform = paddle::platform;
   using namespace phi::funcs;  // NOLINT
   for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
-    compare_sum<float>(sz, vec_sum<float>, vec_sum<float, platform::isa_any>);
     compare_sum<float>(
-        sz, vec_sum<float, platform::avx>, vec_sum<float, platform::isa_any>);
+        sz, vec_sum<float>, vec_sum<float, backends::cpu::isa_any>);
+    compare_sum<float>(sz,
+                       vec_sum<float, backends::cpu::avx>,
+                       vec_sum<float, backends::cpu::isa_any>);
   }
-  compare_sum<double>(30U, vec_sum<double>, vec_sum<double, platform::isa_any>);
+  compare_sum<double>(
+      30U, vec_sum<double>, vec_sum<double, backends::cpu::isa_any>);
 }
 
 template <typename T>
@@ -192,18 +195,17 @@ void compare_clip(
 }
 
 TEST(CpuVecTest, vec_clip) {
-  namespace platform = paddle::platform;
   using namespace phi::funcs;  // NOLINT
   for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     compare_clip<float>(
-        sz, -4.f, vec_clip<float>, vec_clip<float, platform::isa_any>);
+        sz, -4.f, vec_clip<float>, vec_clip<float, backends::cpu::isa_any>);
     compare_clip<float>(sz,
                         -1.1f,
-                        vec_clip<float, platform::avx>,
-                        vec_clip<float, platform::isa_any>);
+                        vec_clip<float, backends::cpu::avx>,
+                        vec_clip<float, backends::cpu::isa_any>);
   }
   compare_clip<double>(
-      30U, 1.0, vec_clip<double>, vec_clip<double, platform::isa_any>);
+      30U, 1.0, vec_clip<double>, vec_clip<double, backends::cpu::isa_any>);
 }
 
 template <typename T>
@@ -230,14 +232,16 @@ void compare_mul(
 }
 
 TEST(CpuVecTest, vec_mul) {
-  namespace platform = paddle::platform;
   using namespace phi::funcs;  // NOLINT
   for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
-    compare_mul<float>(sz, vec_mul<float>, vec_mul<float, platform::isa_any>);
     compare_mul<float>(
-        sz, vec_mul<float, platform::avx>, vec_mul<float, platform::isa_any>);
+        sz, vec_mul<float>, vec_mul<float, backends::cpu::isa_any>);
+    compare_mul<float>(sz,
+                       vec_mul<float, backends::cpu::avx>,
+                       vec_mul<float, backends::cpu::isa_any>);
   }
-  compare_mul<double>(30U, vec_mul<double>, vec_mul<double, platform::isa_any>);
+  compare_mul<double>(
+      30U, vec_mul<double>, vec_mul<double, backends::cpu::isa_any>);
 }
 
 template <typename T>
@@ -260,17 +264,18 @@ void compare_mul_reduce(
 }
 
 TEST(CpuVecTest, vec_mul_reduce) {
-  namespace platform = paddle::platform;
   using namespace phi::funcs;  // NOLINT
   for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
-    compare_mul_reduce<float>(
-        sz, vec_mul_reduce<float>, vec_mul_reduce<float, platform::isa_any>);
     compare_mul_reduce<float>(sz,
-                              vec_mul_reduce<float, platform::avx>,
-                              vec_mul_reduce<float, platform::isa_any>);
+                              vec_mul_reduce<float>,
+                              vec_mul_reduce<float, backends::cpu::isa_any>);
+    compare_mul_reduce<float>(sz,
+                              vec_mul_reduce<float, backends::cpu::avx>,
+                              vec_mul_reduce<float, backends::cpu::isa_any>);
   }
-  compare_mul_reduce<double>(
-      30U, vec_mul_reduce<double>, vec_mul_reduce<double, platform::isa_any>);
+  compare_mul_reduce<double>(30U,
+                             vec_mul_reduce<double>,
+                             vec_mul_reduce<double, backends::cpu::isa_any>);
 }
 
 template <typename T>
@@ -296,40 +301,43 @@ void TestInplace(const int n,
 }
 
 TEST(CpuVecTest, inplace_sigmoid) {
-  namespace platform = paddle::platform;
   using namespace phi::funcs;  // NOLINT
   for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestInplace<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
     TestInplace<float>(
-        sz, vec_sigmoid<float, platform::avx>, ref_sigmoid<float>);
+        sz, vec_sigmoid<float, backends::cpu::avx>, ref_sigmoid<float>);
     TestInplace<float>(
-        sz, vec_sigmoid<float, platform::avx2>, ref_sigmoid<float>);
+        sz, vec_sigmoid<float, backends::cpu::avx2>, ref_sigmoid<float>);
     TestInplace<float>(
-        sz, vec_sigmoid<float, platform::avx512f>, ref_sigmoid<float>);
+        sz, vec_sigmoid<float, backends::cpu::avx512f>, ref_sigmoid<float>);
   }
   TestInplace<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
 }
 
 TEST(CpuVecTest, inplace_tanh) {
-  namespace platform = paddle::platform;
   using namespace phi::funcs;  // NOLINT
   for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestInplace<float>(sz, vec_tanh<float>, ref_tanh<float>);
-    TestInplace<float>(sz, vec_tanh<float, platform::avx>, ref_tanh<float>);
-    TestInplace<float>(sz, vec_tanh<float, platform::avx2>, ref_tanh<float>);
-    TestInplace<float>(sz, vec_tanh<float, platform::avx512f>, ref_tanh<float>);
+    TestInplace<float>(
+        sz, vec_tanh<float, backends::cpu::avx>, ref_tanh<float>);
+    TestInplace<float>(
+        sz, vec_tanh<float, backends::cpu::avx2>, ref_tanh<float>);
+    TestInplace<float>(
+        sz, vec_tanh<float, backends::cpu::avx512f>, ref_tanh<float>);
   }
   TestInplace<double>(30, vec_tanh<double>, ref_tanh<double>);
 }
 
 TEST(CpuVecTest, inplace_relu) {
-  namespace platform = paddle::platform;
   using namespace phi::funcs;  // NOLINT
   for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestInplace<float>(sz, vec_relu<float>, ref_relu<float>);
-    TestInplace<float>(sz, vec_relu<float, platform::avx>, ref_relu<float>);
-    TestInplace<float>(sz, vec_relu<float, platform::avx2>, ref_relu<float>);
-    TestInplace<float>(sz, vec_relu<float, platform::avx512f>, ref_relu<float>);
+    TestInplace<float>(
+        sz, vec_relu<float, backends::cpu::avx>, ref_relu<float>);
+    TestInplace<float>(
+        sz, vec_relu<float, backends::cpu::avx2>, ref_relu<float>);
+    TestInplace<float>(
+        sz, vec_relu<float, backends::cpu::avx512f>, ref_relu<float>);
   }
   TestInplace<double>(30, vec_relu<double>, ref_relu<double>);
 }
...