Unverified commit 53a5906c, authored by Wilber, committed by GitHub

fix fluid-lite-subgraph x86 compile error test=develop (#2682)

- fix fluid-lite-subgraph x86 compile error
    - Replace FLAGS with environment variables (the replacement pattern is sketched just below)
Parent: 52f325e3
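The change follows one mechanical pattern throughout (sketched here with the first flag this commit touches): each gflags DEFINE_* is commented out and replaced by a namespace-scope global initialized from an environment variable of the same name, via the helpers added in lite/utils/env.h, with the old flag default kept as the fallback.

// Before: the value came from a gflags command-line flag.
DEFINE_double(fraction_of_cpu_memory_to_use,
              1,
              "Default use 100% of CPU memory for PaddlePaddle,"
              "reserve the rest for page tables, etc");

// After: the value is read once from the process environment at start-up,
// falling back to the old default when the variable is unset.
double fraction_of_cpu_memory_to_use =
    paddle::lite::GetDoubleFromEnv("fraction_of_cpu_memory_to_use", 1);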
......@@ -32,26 +32,37 @@
#include <gflags/gflags.h>
#include <algorithm>
DEFINE_double(fraction_of_cpu_memory_to_use,
1,
"Default use 100% of CPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc");
DEFINE_uint64(initial_cpu_memory_in_mb,
500ul,
"Initial CPU memory for PaddlePaddle, in MD unit.");
DEFINE_double(
fraction_of_cuda_pinned_memory_to_use,
0.5,
"Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
"reserve the rest for page tables, etc");
#include "lite/utils/env.h"
// DEFINE_double(fraction_of_cpu_memory_to_use,
// 1,
// "Default use 100% of CPU memory for PaddlePaddle,"
// "reserve the rest for page tables, etc");
double fraction_of_cpu_memory_to_use =
paddle::lite::GetDoubleFromEnv("fraction_of_cpu_memory_to_use", 1);
// DEFINE_uint64(initial_cpu_memory_in_mb,
// 500ul,
// "Initial CPU memory for PaddlePaddle, in MD unit.");
uint64_t initial_cpu_memory_in_mb =
paddle::lite::GetUInt64FromEnv("initial_cpu_memory_in_mb", 500ul);
// DEFINE_double(
// fraction_of_cuda_pinned_memory_to_use,
// 0.5,
// "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
// "reserve the rest for page tables, etc");
double fraction_of_cuda_pinned_memory_to_use = paddle::lite::GetDoubleFromEnv(
"fraction_of_cuda_pinned_memory_to_use", 0.5);
// If use_pinned_memory is true, CPUAllocator calls mlock, which
// returns pinned and locked memory as staging areas for data exchange
// between host and device. Allocates too much would reduce the amount
// of memory available to the system for paging. So, by default, we
// should set false to use_pinned_memory.
DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
// DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
bool use_pinned_memory =
paddle::lite::GetBoolFromEnv("use_pinned_memory", true);
namespace paddle {
namespace lite {
......@@ -81,7 +92,7 @@ size_t CpuTotalPhysicalMemory() {
size_t CpuMaxAllocSize() {
// For distributed systems, it requires configuring and limiting
// the fraction of memory to use.
return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
return fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
}
size_t CpuMinChunkSize() {
......@@ -92,15 +103,14 @@ size_t CpuMinChunkSize() {
size_t CpuMaxChunkSize() {
// Allow to allocate the maximum chunk size is roughly 3% of CPU memory,
// or the initial_cpu_memory_in_mb.
return std::min(
static_cast<size_t>(CpuMaxAllocSize() / 32),
static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
return std::min(static_cast<size_t>(CpuMaxAllocSize() / 32),
static_cast<size_t>(initial_cpu_memory_in_mb * 1 << 20));
}
size_t CUDAPinnedMaxAllocSize() {
// For distributed systems, it requires configuring and limiting
// the fraction of memory to use.
return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory();
return fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory();
}
size_t CUDAPinnedMinChunkSize() {
......
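To make the chunk-size arithmetic above concrete, a worked example under assumed numbers (a host with 16 GiB of physical memory, both variables left at their defaults):

  CpuMaxAllocSize                    = 1.0 * 16 GiB            = 16 GiB
  CpuMaxAllocSize / 32                                         = 512 MiB
  initial_cpu_memory_in_mb * 1 << 20 = 500 * 2^20 bytes        = 500 MiB
  CpuMaxChunkSize                    = min(512 MiB, 500 MiB)   = 500 MiB

The only behavioural change from this commit is where the two tunables come from: exporting fraction_of_cpu_memory_to_use or initial_cpu_memory_in_mb in the environment now plays the role of the former --fraction_of_cpu_memory_to_use and --initial_cpu_memory_in_mb flags.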
......@@ -22,36 +22,46 @@ limitations under the License. */
#include "lite/backends/x86/cupti_lib_path.h"
#include "lite/backends/x86/port.h"
#include "lite/backends/x86/warpctc_lib_path.h"
#include "lite/utils/env.h"
#include "lite/utils/paddle_enforce.h"
DEFINE_string(cudnn_dir,
"",
"Specify path for loading libcudnn.so. For instance, "
"/usr/local/cudnn/lib. If empty [default], dlopen "
"will search cudnn from LD_LIBRARY_PATH");
// DEFINE_string(cudnn_dir,
// "",
// "Specify path for loading libcudnn.so. For instance, "
// "/usr/local/cudnn/lib. If empty [default], dlopen "
// "will search cudnn from LD_LIBRARY_PATH");
std::string cudnn_dir = paddle::lite::GetStringFromEnv("cudnn_dir"); // NOLINT
DEFINE_string(cuda_dir,
"",
"Specify path for loading cuda library, such as libcublas, "
"libcurand. For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH");
// DEFINE_string(cuda_dir,
// "",
// "Specify path for loading cuda library, such as libcublas, "
// "libcurand. For instance, /usr/local/cuda/lib64. If default, "
// "dlopen will search cuda from LD_LIBRARY_PATH");
std::string cuda_dir = paddle::lite::GetStringFromEnv("cuda_dir"); // NOLINT
DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
// DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
std::string f_warpctc_dir = // NOLINT
paddle::lite::GetStringFromEnv("warpctc_dir"); // NOLINT
DEFINE_string(nccl_dir,
"",
"Specify path for loading nccl library, such as libcublas, "
"libcurand. For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH");
// DEFINE_string(nccl_dir,
// "",
// "Specify path for loading nccl library, such as libcublas, "
// "libcurand. For instance, /usr/local/cuda/lib64. If default, "
// "dlopen will search cuda from LD_LIBRARY_PATH");
std::string nccl_dir = paddle::lite::GetStringFromEnv("nccl_dir"); // NOLINT
DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
// DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
std::string cupti_dir = paddle::lite::GetStringFromEnv("cupti_dir"); // NOLINT
DEFINE_string(
tensorrt_dir,
"",
"Specify path for loading tensorrt library, such as libnvinfer.so.");
// DEFINE_string(
// tensorrt_dir,
// "",
// "Specify path for loading tensorrt library, such as libnvinfer.so.");
std::string tensorrt_dir = // NOLINT
paddle::lite::GetStringFromEnv("tensorrt_dir"); // NOLINT
DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
// DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
std::string mklml_dir = paddle::lite::GetStringFromEnv("mklml_dir"); // NOLINT
namespace paddle {
namespace lite {
......@@ -180,28 +190,28 @@ auto error_msg =
void* GetCublasDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
return GetDsoHandleFromSearchPath(cuda_dir, "libcublas.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib);
return GetDsoHandleFromSearchPath(cuda_dir, win_cublas_lib);
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
return GetDsoHandleFromSearchPath(cuda_dir, "libcublas.so");
#endif
}
void* GetCUDNNDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false);
return GetDsoHandleFromSearchPath(cudnn_dir, "libcudnn.dylib", false);
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib);
return GetDsoHandleFromSearchPath(cudnn_dir, win_cudnn_lib);
#else
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false);
return GetDsoHandleFromSearchPath(cudnn_dir, "libcudnn.so", false);
#endif
}
void* GetCUPTIDsoHandle() {
std::string cupti_path = cupti_lib_path;
if (!FLAGS_cupti_dir.empty()) {
cupti_path = FLAGS_cupti_dir;
if (!cupti_dir.empty()) {
cupti_path = cupti_dir;
}
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", false);
......@@ -212,18 +222,18 @@ void* GetCUPTIDsoHandle() {
void* GetCurandDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
return GetDsoHandleFromSearchPath(cuda_dir, "libcurand.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib);
return GetDsoHandleFromSearchPath(cuda_dir, win_curand_lib);
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
return GetDsoHandleFromSearchPath(cuda_dir, "libcurand.so");
#endif
}
void* GetWarpCTCDsoHandle() {
std::string warpctc_dir = warpctc_lib_path;
if (!FLAGS_warpctc_dir.empty()) {
warpctc_dir = FLAGS_warpctc_dir;
if (!f_warpctc_dir.empty()) {
warpctc_dir = f_warpctc_dir;
}
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib");
......@@ -236,27 +246,27 @@ void* GetWarpCTCDsoHandle() {
void* GetNCCLDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib");
return GetDsoHandleFromSearchPath(nccl_dir, "libnccl.dylib");
#else
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so");
return GetDsoHandleFromSearchPath(nccl_dir, "libnccl.so");
#endif
}
void* GetTensorRtDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
return GetDsoHandleFromSearchPath(tensorrt_dir, "libnvinfer.dylib");
#else
return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
return GetDsoHandleFromSearchPath(tensorrt_dir, "libnvinfer.so");
#endif
}
void* GetMKLMLDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll");
return GetDsoHandleFromSearchPath(mklml_dir, "mklml.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.so");
#endif
}
......
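A usage sketch for the loader changes above: the search-path overrides keep their old names but are now plain environment variables (cuda_dir, cudnn_dir, nccl_dir, tensorrt_dir, mklml_dir, cupti_dir, warpctc_dir) rather than --flags. The corresponding globals are initialized when the library is loaded, so the variables must already be exported when the process starts; inside the process the same helper can be used to check what was picked up (the path below is hypothetical):

#include <iostream>
#include "lite/utils/env.h"

int main() {
  // Assuming the process was launched with, e.g.,
  //   cudnn_dir=/usr/local/cudnn/lib
  // in its environment, GetCUDNNDsoHandle() searches that directory
  // before falling back to LD_LIBRARY_PATH.
  std::cout << "cudnn_dir = "
            << paddle::lite::GetStringFromEnv("cudnn_dir", "<unset>")
            << std::endl;
  return 0;
}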
......@@ -21,13 +21,15 @@
// posix_memalign
#include "lite/backends/x86/cpu_info.h"
#include "lite/backends/x86/jit/macro.h"
#include "lite/utils/env.h"
#include "lite/utils/paddle_enforce.h"
#ifndef _WIN32
#define posix_memalign_free free
#endif
DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
// DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode");
namespace paddle {
namespace lite {
......
......@@ -20,7 +20,8 @@
#include <vector>
#include "lite/backends/x86/jit/kernel_base.h"
DECLARE_bool(dump_jitcode);
// DECLARE_bool(dump_jitcode);
extern bool dump_jitcode;
namespace paddle {
namespace lite {
......@@ -36,7 +37,7 @@ class GenBase : public Kernel {
template <typename Func>
Func getCode() const {
const unsigned char* code = this->getCodeInternal();
if (FLAGS_dump_jitcode) {
if (dump_jitcode) {
this->dumpCode(code);
}
// Note: failed to cast with reinterpret_cast<const Func> on Mac clang,
......
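A short note on the dump_jitcode gate above under the new scheme: the value is read once at start-up via GetBoolFromEnv (default false), which treats any set value other than "false" or "0" as true, so roughly:

  dump_jitcode unset                    -> false (no dump)
  dump_jitcode=0 or dump_jitcode=false  -> false (no dump)
  dump_jitcode=1 (or any other value)   -> true  (getCode() calls dumpCode())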
......@@ -41,9 +41,11 @@
(this is the zlib license)
*/
#pragma once
#include "lite/backends/x86/cpu_info.h"
namespace paddle {
namespace lite {
/* __m128 is ugly to write */
typedef __m256 v8sf; // vector of 8 float (avx)
typedef __m256i v8si; // vector of 8 int (avx)
......@@ -134,7 +136,7 @@ typedef union imm_xmm_union {
return (ret); \
}
//#warning "Using SSE2 to perform AVX2 bitshift ops"
// #warning "Using SSE2 to perform AVX2 bitshift ops"
AVX2_BITOP_USING_SSE2(slli_epi32)
AVX2_BITOP_USING_SSE2(srli_epi32)
......@@ -152,7 +154,7 @@ AVX2_BITOP_USING_SSE2(srli_epi32)
return (ret); \
}
//#warning "Using SSE2 to perform AVX2 integer ops"
// #warning "Using SSE2 to perform AVX2 integer ops"
AVX2_INTOP_USING_SSE2(and_si128)
AVX2_INTOP_USING_SSE2(andnot_si128)
AVX2_INTOP_USING_SSE2(cmpeq_epi32)
......@@ -175,23 +177,23 @@ AVX2_INTOP_USING_SSE2(add_epi32)
*/
v8sf log256_ps(v8sf x) {
v8si imm0;
v8sf one = *(v8sf *)_ps256_1;
v8sf one = *(v8sf *)_ps256_1; // NOLINT
// v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
x = _mm256_max_ps(
x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
x = _mm256_max_ps(x, *(v8sf *)_ps256_min_norm_pos); // NOLINT
/* cut off denormalized stuff */ // NOLINT
// can be done with AVX2
imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);
/* keep only the fractional part */
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask); // NOLINT
x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5); // NOLINT
// this is again another AVX2 instruction
imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f); // NOLINT
v8sf e = _mm256_cvtepi32_ps(imm0);
e = _mm256_add_ps(e, one);
......@@ -203,7 +205,8 @@ v8sf log256_ps(v8sf x) {
} else { x = x - 1.0; }
*/
// v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
v8sf mask =
_mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS); // NOLINT
v8sf tmp = _mm256_and_ps(x, mask);
x = _mm256_sub_ps(x, one);
e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
......@@ -211,34 +214,34 @@ v8sf log256_ps(v8sf x) {
v8sf z = _mm256_mul_ps(x, x);
v8sf y = *(v8sf *)_ps256_cephes_log_p0;
v8sf y = *(v8sf *)_ps256_cephes_log_p0; // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_mul_ps(y, z);
tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1); // NOLINT
y = _mm256_add_ps(y, tmp);
tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT
y = _mm256_sub_ps(y, tmp);
tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2); // NOLINT
x = _mm256_add_ps(x, y);
x = _mm256_add_ps(x, tmp);
x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
......@@ -262,14 +265,14 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
v8sf exp256_ps(v8sf x) {
v8sf tmp = _mm256_setzero_ps(), fx;
v8si imm0;
v8sf one = *(v8sf *)_ps256_1;
v8sf one = *(v8sf *)_ps256_1; // NOLINT
x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi); // NOLINT
x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo); // NOLINT
/* express exp(x) as exp(g + n*log(2)) */
fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF); // NOLINT
fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5); // NOLINT
/* how to perform a floorf with SSE: just below */
// imm0 = _mm256_cvttps_epi32(fx);
......@@ -283,24 +286,24 @@ v8sf exp256_ps(v8sf x) {
mask = _mm256_and_ps(mask, one);
fx = _mm256_sub_ps(tmp, mask);
tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1); // NOLINT
v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2); // NOLINT
x = _mm256_sub_ps(x, tmp);
x = _mm256_sub_ps(x, z);
z = _mm256_mul_ps(x, x);
v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
v8sf y = *(v8sf *)_ps256_cephes_exp_p0; // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4); // NOLINT
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5); // NOLINT
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, x);
y = _mm256_add_ps(y, one);
......@@ -308,7 +311,7 @@ v8sf exp256_ps(v8sf x) {
/* build 2^n */
imm0 = _mm256_cvttps_epi32(fx);
// another two AVX2 instructions
imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f); // NOLINT
imm0 = avx2_mm256_slli_epi32(imm0, 23);
v8sf pow2n = _mm256_castsi256_ps(imm0);
y = _mm256_mul_ps(y, pow2n);
......@@ -349,12 +352,12 @@ v8sf sin256_ps(v8sf x) { // any x
sign_bit = x;
/* take the absolute value */
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); // NOLINT
/* extract the sign bit (upper one) */
sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask); // NOLINT
/* scale by 4/Pi */
y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); // NOLINT
/*
Here we start a series of integer operations, which are in the
......@@ -367,12 +370,12 @@ v8sf sin256_ps(v8sf x) { // any x
imm2 = _mm256_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
// another two AVX2 instruction
imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); // NOLINT
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); // NOLINT
y = _mm256_cvtepi32_ps(imm2);
/* get the swap sign flag */
imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); // NOLINT
imm0 = avx2_mm256_slli_epi32(imm0, 29);
/* get the polynom selection mask
there is one polynom for 0 <= x <= Pi/4
......@@ -380,31 +383,31 @@ v8sf sin256_ps(v8sf x) { // any x
Both branches will be computed.
*/
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); // NOLINT
imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); // NOLINT
#else
/* we use SSE2 routines to perform the integer ops */
COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); // NOLINT
imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); // NOLINT
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); // NOLINT
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); // NOLINT
COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
y = _mm256_cvtepi32_ps(imm2);
imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4); // NOLINT
imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4); // NOLINT
imm0_1 = _mm_slli_epi32(imm0_1, 29);
imm0_2 = _mm_slli_epi32(imm0_2, 29);
COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); // NOLINT
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); // NOLINT
imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
......@@ -418,9 +421,9 @@ v8sf sin256_ps(v8sf x) { // any x
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; // NOLINT
xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; // NOLINT
xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; // NOLINT
xmm1 = _mm256_mul_ps(y, xmm1);
xmm2 = _mm256_mul_ps(y, xmm2);
xmm3 = _mm256_mul_ps(y, xmm3);
......@@ -429,26 +432,26 @@ v8sf sin256_ps(v8sf x) { // any x
x = _mm256_add_ps(x, xmm3);
/* Evaluate the first polynom (0 <= x <= Pi/4) */
y = *(v8sf *)_ps256_coscof_p0;
y = *(v8sf *)_ps256_coscof_p0; // NOLINT
v8sf z = _mm256_mul_ps(x, x);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); // NOLINT
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); // NOLINT
y = _mm256_mul_ps(y, z);
y = _mm256_mul_ps(y, z);
v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT
y = _mm256_sub_ps(y, tmp);
y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
y = _mm256_add_ps(y, *(v8sf *)_ps256_1); // NOLINT
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
v8sf y2 = *(v8sf *)_ps256_sincof_p0;
v8sf y2 = *(v8sf *)_ps256_sincof_p0; // NOLINT
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); // NOLINT
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); // NOLINT
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_mul_ps(y2, x);
y2 = _mm256_add_ps(y2, x);
......@@ -475,53 +478,53 @@ v8sf cos256_ps(v8sf x) { // any x
#endif
/* take the absolute value */
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); // NOLINT
/* scale by 4/Pi */
y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); // NOLINT
#ifdef __AVX2__
/* store the integer part of y in mm0 */
imm2 = _mm256_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); // NOLINT
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); // NOLINT
y = _mm256_cvtepi32_ps(imm2);
imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2);
imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2); // NOLINT
/* get the swap sign flag */
imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4);
imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4); // NOLINT
imm0 = avx2_mm256_slli_epi32(imm0, 29);
/* get the polynom selection mask */
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); // NOLINT
imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); // NOLINT
#else
/* we use SSE2 routines to perform the integer ops */
COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); // NOLINT
imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); // NOLINT
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); // NOLINT
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); // NOLINT
COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
y = _mm256_cvtepi32_ps(imm2);
imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2);
imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2);
imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2); // NOLINT
imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2); // NOLINT
imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4);
imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4);
imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4); // NOLINT
imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4); // NOLINT
imm0_1 = _mm_slli_epi32(imm0_1, 29);
imm0_2 = _mm_slli_epi32(imm0_2, 29);
COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); // NOLINT
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); // NOLINT
imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
......@@ -534,9 +537,9 @@ v8sf cos256_ps(v8sf x) { // any x
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; // NOLINT
xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; // NOLINT
xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; // NOLINT
xmm1 = _mm256_mul_ps(y, xmm1);
xmm2 = _mm256_mul_ps(y, xmm2);
xmm3 = _mm256_mul_ps(y, xmm3);
......@@ -545,26 +548,26 @@ v8sf cos256_ps(v8sf x) { // any x
x = _mm256_add_ps(x, xmm3);
/* Evaluate the first polynom (0 <= x <= Pi/4) */
y = *(v8sf *)_ps256_coscof_p0;
y = *(v8sf *)_ps256_coscof_p0; // NOLINT
v8sf z = _mm256_mul_ps(x, x);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); // NOLINT
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); // NOLINT
y = _mm256_mul_ps(y, z);
y = _mm256_mul_ps(y, z);
v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT
y = _mm256_sub_ps(y, tmp);
y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
y = _mm256_add_ps(y, *(v8sf *)_ps256_1); // NOLINT
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
v8sf y2 = *(v8sf *)_ps256_sincof_p0;
v8sf y2 = *(v8sf *)_ps256_sincof_p0; // NOLINT
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); // NOLINT
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); // NOLINT
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_mul_ps(y2, x);
y2 = _mm256_add_ps(y2, x);
......@@ -595,42 +598,43 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
sign_bit_sin = x;
/* take the absolute value */
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); // NOLINT
/* extract the sign bit (upper one) */
sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask);
sign_bit_sin =
_mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask); // NOLINT
/* scale by 4/Pi */
y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); // NOLINT
#ifdef __AVX2__
/* store the integer part of y in imm2 */
imm2 = _mm256_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); // NOLINT
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); // NOLINT
y = _mm256_cvtepi32_ps(imm2);
imm4 = imm2;
/* get the swap sign flag for the sine */
imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); // NOLINT
imm0 = avx2_mm256_slli_epi32(imm0, 29);
// v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
/* get the polynom selection mask for the sine*/
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); // NOLINT
imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); // NOLINT
// v8sf poly_mask = _mm256_castsi256_ps(imm2);
#else
/* we use SSE2 routines to perform the integer ops */
COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); // NOLINT
imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); // NOLINT
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); // NOLINT
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); // NOLINT
COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
y = _mm256_cvtepi32_ps(imm2);
......@@ -638,16 +642,16 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
imm4_1 = imm2_1;
imm4_2 = imm2_2;
imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4); // NOLINT
imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4); // NOLINT
imm0_1 = _mm_slli_epi32(imm0_1, 29);
imm0_2 = _mm_slli_epi32(imm0_2, 29);
COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); // NOLINT
imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); // NOLINT
imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
......@@ -659,9 +663,9 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; // NOLINT
xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; // NOLINT
xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; // NOLINT
xmm1 = _mm256_mul_ps(y, xmm1);
xmm2 = _mm256_mul_ps(y, xmm2);
xmm3 = _mm256_mul_ps(y, xmm3);
......@@ -670,15 +674,15 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
x = _mm256_add_ps(x, xmm3);
#ifdef __AVX2__
imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2);
imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4);
imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2); // NOLINT
imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4); // NOLINT
imm4 = avx2_mm256_slli_epi32(imm4, 29);
#else
imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2);
imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2);
imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2); // NOLINT
imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2); // NOLINT
imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4);
imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4);
imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4); // NOLINT
imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4); // NOLINT
imm4_1 = _mm_slli_epi32(imm4_1, 29);
imm4_2 = _mm_slli_epi32(imm4_2, 29);
......@@ -692,25 +696,25 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
/* Evaluate the first polynom (0 <= x <= Pi/4) */
v8sf z = _mm256_mul_ps(x, x);
y = *(v8sf *)_ps256_coscof_p0;
y = *(v8sf *)_ps256_coscof_p0; // NOLINT
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); // NOLINT
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); // NOLINT
y = _mm256_mul_ps(y, z);
y = _mm256_mul_ps(y, z);
v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT
y = _mm256_sub_ps(y, tmp);
y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
y = _mm256_add_ps(y, *(v8sf *)_ps256_1); // NOLINT
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
v8sf y2 = *(v8sf *)_ps256_sincof_p0;
v8sf y2 = *(v8sf *)_ps256_sincof_p0; // NOLINT
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); // NOLINT
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); // NOLINT
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_mul_ps(y2, x);
y2 = _mm256_add_ps(y2, x);
......@@ -729,3 +733,6 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
*s = _mm256_xor_ps(xmm1, sign_bit_sin);
*c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
} // namespace lite
} // namespace paddle
......@@ -13,10 +13,13 @@
// limitations under the License.
#include "lite/kernels/x86/gru_compute.h"
#include "lite/utils/env.h"
DEFINE_int32(paddle_num_threads,
1,
"Number of threads for each paddle instance.");
// DEFINE_int32(paddle_num_threads,
// 1,
// "Number of threads for each paddle instance.");
int32_t paddle_num_threads =
paddle::lite::GetIntFromEnv("paddle_num_threads", 1);
REGISTER_LITE_KERNEL(gru,
kX86,
......
......@@ -26,7 +26,8 @@
#include "lite/core/types.h"
#include "lite/fluid/eigen.h"
DECLARE_int32(paddle_num_threads);
// DECLARE_int32(paddle_num_threads);
extern int32_t paddle_num_threads;
namespace paddle {
namespace lite {
......@@ -109,7 +110,7 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
#ifdef PADDLE_WITH_MKLML
// use MKL packed to speedup GEMM
if (FLAGS_paddle_num_threads >= 4) {
if (paddle_num_threads >= 4) {
auto blas = lite::x86::math::GetBlas<TARGET(kX86), T>(context);
T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix,
1 /*height of C*/,
......
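For the MKL packed-GEMM branch above, an illustrative sketch of how the thread count now selects the path (the default stays at 1, and the variable must be exported before the process starts because the global in gru_compute.cc is initialized at load time):

  (paddle_num_threads unset)  -> paddle_num_threads == 1 -> packed setup skipped
  paddle_num_threads=2        -> 2                        -> packed setup skipped
  paddle_num_threads=8        -> 8                        -> MKL packed GEMM (GEMM_ALLOC path)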
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
namespace paddle {
namespace lite {
static std::string GetStringFromEnv(const std::string& str,
const std::string& def = "") {
char* variable = std::getenv(str.c_str());
if (!variable) {
return def;
}
return std::string(variable);
}
static bool GetBoolFromEnv(const std::string& str, bool def = false) {
char* variable = std::getenv(str.c_str());
if (!variable) {
return def;
}
if (strcmp(variable, "false") == 0 || strcmp(variable, "0") == 0) {
return false;
} else {
return true;
}
}
static int GetIntFromEnv(const std::string& str, int def = 0) {
char* variable = std::getenv(str.c_str());
if (!variable) {
return def;
}
return atoi(variable);
}
static double GetDoubleFromEnv(const std::string& str, double def = 0.0) {
char* variable = std::getenv(str.c_str());
if (!variable) {
return def;
}
return atof(variable);
}
static uint64_t GetUInt64FromEnv(const std::string& str, uint64_t def = 0ul) {
char* variable = std::getenv(str.c_str());
if (!variable) {
return def;
}
return static_cast<uint64_t>(atol(variable));
}
} // namespace lite
} // namespace paddle
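A minimal, self-contained sketch of the helpers above (variable names and values are illustrative; setenv is the POSIX call, used here only so the example runs in one file — in normal use the variables are exported before the process starts):

#include <cstdlib>
#include <iostream>
#include "lite/utils/env.h"

int main() {
  setenv("initial_cpu_memory_in_mb", "1024", /*overwrite=*/1);
  setenv("dump_jitcode", "false", 1);

  // A set variable is parsed; the stated default is used only when it is unset.
  std::cout << paddle::lite::GetUInt64FromEnv("initial_cpu_memory_in_mb", 500ul)
            << "\n";                                                // prints 1024
  // GetBoolFromEnv maps "false" and "0" to false, anything else to true.
  std::cout << paddle::lite::GetBoolFromEnv("dump_jitcode") << "\n";  // prints 0
  // An unset variable falls back to the provided default.
  std::cout << paddle::lite::GetDoubleFromEnv(
                   "fraction_of_cuda_pinned_memory_to_use", 0.5)
            << "\n";                                                // prints 0.5
  return 0;
}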