From 53a5906c16123d42cef6aee0a54bf585a9ee1178 Mon Sep 17 00:00:00 2001 From: Wilber Date: Thu, 26 Dec 2019 19:22:59 +0800 Subject: [PATCH] fix fluid-lite-subgraph x86 compile error test=develop (#2682) -fix fluid-lite-subgraph x86 compile error - Replace FLAGS with environment variables --- lite/backends/x86/cpu_info.cc | 48 ++-- lite/backends/x86/dynamic_loader.cc | 94 +++---- lite/backends/x86/jit/gen_base.cc | 4 +- lite/backends/x86/jit/gen_base.h | 5 +- lite/backends/x86/math/detail/avx_mathfun.h | 257 ++++++++++---------- lite/kernels/x86/gru_compute.cc | 9 +- lite/kernels/x86/gru_compute.h | 5 +- lite/utils/env.h | 71 ++++++ 8 files changed, 299 insertions(+), 194 deletions(-) create mode 100644 lite/utils/env.h diff --git a/lite/backends/x86/cpu_info.cc b/lite/backends/x86/cpu_info.cc index c2759d6191..aa097f947a 100644 --- a/lite/backends/x86/cpu_info.cc +++ b/lite/backends/x86/cpu_info.cc @@ -32,26 +32,37 @@ #include #include -DEFINE_double(fraction_of_cpu_memory_to_use, - 1, - "Default use 100% of CPU memory for PaddlePaddle," - "reserve the rest for page tables, etc"); -DEFINE_uint64(initial_cpu_memory_in_mb, - 500ul, - "Initial CPU memory for PaddlePaddle, in MD unit."); - -DEFINE_double( - fraction_of_cuda_pinned_memory_to_use, - 0.5, - "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," - "reserve the rest for page tables, etc"); +#include "lite/utils/env.h" + +// DEFINE_double(fraction_of_cpu_memory_to_use, +// 1, +// "Default use 100% of CPU memory for PaddlePaddle," +// "reserve the rest for page tables, etc"); +double fraction_of_cpu_memory_to_use = + paddle::lite::GetDoubleFromEnv("fraction_of_cpu_memory_to_use", 1); + +// DEFINE_uint64(initial_cpu_memory_in_mb, +// 500ul, +// "Initial CPU memory for PaddlePaddle, in MD unit."); +uint64_t initial_cpu_memory_in_mb = + paddle::lite::GetUInt64FromEnv("initial_cpu_memory_in_mb", 500ul); + +// DEFINE_double( +// fraction_of_cuda_pinned_memory_to_use, +// 0.5, +// "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," +// "reserve the rest for page tables, etc"); +double fraction_of_cuda_pinned_memory_to_use = paddle::lite::GetDoubleFromEnv( + "fraction_of_cuda_pinned_memory_to_use", 0.5); // If use_pinned_memory is true, CPUAllocator calls mlock, which // returns pinned and locked memory as staging areas for data exchange // between host and device. Allocates too much would reduce the amount // of memory available to the system for paging. So, by default, we // should set false to use_pinned_memory. -DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory."); +// DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory."); +bool use_pinned_memory = + paddle::lite::GetBoolFromEnv("use_pinned_memory", true); namespace paddle { namespace lite { @@ -81,7 +92,7 @@ size_t CpuTotalPhysicalMemory() { size_t CpuMaxAllocSize() { // For distributed systems, it requires configuring and limiting // the fraction of memory to use. - return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); + return fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); } size_t CpuMinChunkSize() { @@ -92,15 +103,14 @@ size_t CpuMinChunkSize() { size_t CpuMaxChunkSize() { // Allow to allocate the maximum chunk size is roughly 3% of CPU memory, // or the initial_cpu_memory_in_mb. 
- return std::min( - static_cast(CpuMaxAllocSize() / 32), - static_cast(FLAGS_initial_cpu_memory_in_mb * 1 << 20)); + return std::min(static_cast(CpuMaxAllocSize() / 32), + static_cast(initial_cpu_memory_in_mb * 1 << 20)); } size_t CUDAPinnedMaxAllocSize() { // For distributed systems, it requires configuring and limiting // the fraction of memory to use. - return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory(); + return fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory(); } size_t CUDAPinnedMinChunkSize() { diff --git a/lite/backends/x86/dynamic_loader.cc b/lite/backends/x86/dynamic_loader.cc index 75bb528f38..a05a57e93b 100644 --- a/lite/backends/x86/dynamic_loader.cc +++ b/lite/backends/x86/dynamic_loader.cc @@ -22,36 +22,46 @@ limitations under the License. */ #include "lite/backends/x86/cupti_lib_path.h" #include "lite/backends/x86/port.h" #include "lite/backends/x86/warpctc_lib_path.h" +#include "lite/utils/env.h" #include "lite/utils/paddle_enforce.h" -DEFINE_string(cudnn_dir, - "", - "Specify path for loading libcudnn.so. For instance, " - "/usr/local/cudnn/lib. If empty [default], dlopen " - "will search cudnn from LD_LIBRARY_PATH"); +// DEFINE_string(cudnn_dir, +// "", +// "Specify path for loading libcudnn.so. For instance, " +// "/usr/local/cudnn/lib. If empty [default], dlopen " +// "will search cudnn from LD_LIBRARY_PATH"); +std::string cudnn_dir = paddle::lite::GetStringFromEnv("cudnn_dir"); // NOLINT -DEFINE_string(cuda_dir, - "", - "Specify path for loading cuda library, such as libcublas, " - "libcurand. For instance, /usr/local/cuda/lib64. If default, " - "dlopen will search cuda from LD_LIBRARY_PATH"); +// DEFINE_string(cuda_dir, +// "", +// "Specify path for loading cuda library, such as libcublas, " +// "libcurand. For instance, /usr/local/cuda/lib64. If default, " +// "dlopen will search cuda from LD_LIBRARY_PATH"); +std::string cuda_dir = paddle::lite::GetStringFromEnv("cuda_dir"); // NOLINT -DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); +// DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); +std::string f_warpctc_dir = // NOLINT + paddle::lite::GetStringFromEnv("warpctc_dir"); // NOLINT -DEFINE_string(nccl_dir, - "", - "Specify path for loading nccl library, such as libcublas, " - "libcurand. For instance, /usr/local/cuda/lib64. If default, " - "dlopen will search cuda from LD_LIBRARY_PATH"); +// DEFINE_string(nccl_dir, +// "", +// "Specify path for loading nccl library, such as libcublas, " +// "libcurand. For instance, /usr/local/cuda/lib64. 
If default, " +// "dlopen will search cuda from LD_LIBRARY_PATH"); +std::string nccl_dir = paddle::lite::GetStringFromEnv("nccl_dir"); // NOLINT -DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); +// DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); +std::string cupti_dir = paddle::lite::GetStringFromEnv("cupti_dir"); // NOLINT -DEFINE_string( - tensorrt_dir, - "", - "Specify path for loading tensorrt library, such as libnvinfer.so."); +// DEFINE_string( +// tensorrt_dir, +// "", +// "Specify path for loading tensorrt library, such as libnvinfer.so."); +std::string tensorrt_dir = // NOLINT + paddle::lite::GetStringFromEnv("tensorrt_dir"); // NOLINT -DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); +// DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); +std::string mklml_dir = paddle::lite::GetStringFromEnv("mklml_dir"); // NOLINT namespace paddle { namespace lite { @@ -180,28 +190,28 @@ auto error_msg = void* GetCublasDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); + return GetDsoHandleFromSearchPath(cuda_dir, "libcublas.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib); + return GetDsoHandleFromSearchPath(cuda_dir, win_cublas_lib); #else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); + return GetDsoHandleFromSearchPath(cuda_dir, "libcublas.so"); #endif } void* GetCUDNNDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false); + return GetDsoHandleFromSearchPath(cudnn_dir, "libcudnn.dylib", false); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib); + return GetDsoHandleFromSearchPath(cudnn_dir, win_cudnn_lib); #else - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false); + return GetDsoHandleFromSearchPath(cudnn_dir, "libcudnn.so", false); #endif } void* GetCUPTIDsoHandle() { std::string cupti_path = cupti_lib_path; - if (!FLAGS_cupti_dir.empty()) { - cupti_path = FLAGS_cupti_dir; + if (!cupti_dir.empty()) { + cupti_path = cupti_dir; } #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", false); @@ -212,18 +222,18 @@ void* GetCUPTIDsoHandle() { void* GetCurandDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); + return GetDsoHandleFromSearchPath(cuda_dir, "libcurand.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib); + return GetDsoHandleFromSearchPath(cuda_dir, win_curand_lib); #else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); + return GetDsoHandleFromSearchPath(cuda_dir, "libcurand.so"); #endif } void* GetWarpCTCDsoHandle() { std::string warpctc_dir = warpctc_lib_path; - if (!FLAGS_warpctc_dir.empty()) { - warpctc_dir = FLAGS_warpctc_dir; + if (!f_warpctc_dir.empty()) { + warpctc_dir = f_warpctc_dir; } #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib"); @@ -236,27 +246,27 @@ void* GetWarpCTCDsoHandle() { void* GetNCCLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib"); + return 
GetDsoHandleFromSearchPath(nccl_dir, "libnccl.dylib"); #else - return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so"); + return GetDsoHandleFromSearchPath(nccl_dir, "libnccl.so"); #endif } void* GetTensorRtDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib"); + return GetDsoHandleFromSearchPath(tensorrt_dir, "libnvinfer.dylib"); #else - return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so"); + return GetDsoHandleFromSearchPath(tensorrt_dir, "libnvinfer.so"); #endif } void* GetMKLMLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); + return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.dylib"); #elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll"); + return GetDsoHandleFromSearchPath(mklml_dir, "mklml.dll"); #else - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); + return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.so"); #endif } diff --git a/lite/backends/x86/jit/gen_base.cc b/lite/backends/x86/jit/gen_base.cc index 38250d533d..7d051aa6f5 100644 --- a/lite/backends/x86/jit/gen_base.cc +++ b/lite/backends/x86/jit/gen_base.cc @@ -21,13 +21,15 @@ // posix_memalign #include "lite/backends/x86/cpu_info.h" #include "lite/backends/x86/jit/macro.h" +#include "lite/utils/env.h" #include "lite/utils/paddle_enforce.h" #ifndef _WIN32 #define posix_memalign_free free #endif -DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); +// DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); +bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode"); namespace paddle { namespace lite { diff --git a/lite/backends/x86/jit/gen_base.h b/lite/backends/x86/jit/gen_base.h index b5f942615a..4af93c2447 100644 --- a/lite/backends/x86/jit/gen_base.h +++ b/lite/backends/x86/jit/gen_base.h @@ -20,7 +20,8 @@ #include #include "lite/backends/x86/jit/kernel_base.h" -DECLARE_bool(dump_jitcode); +// DECLARE_bool(dump_jitcode); +extern bool dump_jitcode; namespace paddle { namespace lite { @@ -36,7 +37,7 @@ class GenBase : public Kernel { template Func getCode() const { const unsigned char* code = this->getCodeInternal(); - if (FLAGS_dump_jitcode) { + if (dump_jitcode) { this->dumpCode(code); } // Note: failed to cast with reinterpret_cast on Mac clang, diff --git a/lite/backends/x86/math/detail/avx_mathfun.h b/lite/backends/x86/math/detail/avx_mathfun.h index c95c881512..2ad0866d63 100644 --- a/lite/backends/x86/math/detail/avx_mathfun.h +++ b/lite/backends/x86/math/detail/avx_mathfun.h @@ -41,9 +41,11 @@ (this is the zlib license) */ - +#pragma once #include "lite/backends/x86/cpu_info.h" +namespace paddle { +namespace lite { /* __m128 is ugly to write */ typedef __m256 v8sf; // vector of 8 float (avx) typedef __m256i v8si; // vector of 8 int (avx) @@ -134,7 +136,7 @@ typedef union imm_xmm_union { return (ret); \ } -//#warning "Using SSE2 to perform AVX2 bitshift ops" +// #warning "Using SSE2 to perform AVX2 bitshift ops" AVX2_BITOP_USING_SSE2(slli_epi32) AVX2_BITOP_USING_SSE2(srli_epi32) @@ -152,7 +154,7 @@ AVX2_BITOP_USING_SSE2(srli_epi32) return (ret); \ } -//#warning "Using SSE2 to perform AVX2 integer ops" +// #warning "Using SSE2 to perform AVX2 integer ops" AVX2_INTOP_USING_SSE2(and_si128) AVX2_INTOP_USING_SSE2(andnot_si128) AVX2_INTOP_USING_SSE2(cmpeq_epi32) @@ -175,23 +177,23 @@ AVX2_INTOP_USING_SSE2(add_epi32) 
*/ v8sf log256_ps(v8sf x) { v8si imm0; - v8sf one = *(v8sf *)_ps256_1; + v8sf one = *(v8sf *)_ps256_1; // NOLINT // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); - x = _mm256_max_ps( - x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */ + x = _mm256_max_ps(x, *(v8sf *)_ps256_min_norm_pos); // NOLINT + /* cut off denormalized stuff */ // NOLINT // can be done with AVX2 imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23); /* keep only the fractional part */ - x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask); - x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5); + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask); // NOLINT + x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5); // NOLINT // this is again another AVX2 instruction - imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f); + imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f); // NOLINT v8sf e = _mm256_cvtepi32_ps(imm0); e = _mm256_add_ps(e, one); @@ -203,7 +205,8 @@ v8sf log256_ps(v8sf x) { } else { x = x - 1.0; } */ // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); - v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS); + v8sf mask = + _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS); // NOLINT v8sf tmp = _mm256_and_ps(x, mask); x = _mm256_sub_ps(x, one); e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); @@ -211,34 +214,34 @@ v8sf log256_ps(v8sf x) { v8sf z = _mm256_mul_ps(x, x); - v8sf y = *(v8sf *)_ps256_cephes_log_p0; + v8sf y = *(v8sf *)_ps256_cephes_log_p0; // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8); // NOLINT y = _mm256_mul_ps(y, x); y = _mm256_mul_ps(y, z); - tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1); + tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1); // NOLINT y = _mm256_add_ps(y, tmp); - tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); + tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT y = _mm256_sub_ps(y, tmp); - tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2); + tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2); // NOLINT x = _mm256_add_ps(x, y); x = _mm256_add_ps(x, tmp); x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN @@ -262,14 +265,14 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1); v8sf exp256_ps(v8sf x) { v8sf tmp = _mm256_setzero_ps(), fx; v8si imm0; - v8sf one = *(v8sf *)_ps256_1; + v8sf one = *(v8sf *)_ps256_1; // NOLINT - x = 
_mm256_min_ps(x, *(v8sf *)_ps256_exp_hi); - x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo); + x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi); // NOLINT + x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo); // NOLINT /* express exp(x) as exp(g + n*log(2)) */ - fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF); - fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5); + fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF); // NOLINT + fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5); // NOLINT /* how to perform a floorf with SSE: just below */ // imm0 = _mm256_cvttps_epi32(fx); @@ -283,24 +286,24 @@ v8sf exp256_ps(v8sf x) { mask = _mm256_and_ps(mask, one); fx = _mm256_sub_ps(tmp, mask); - tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1); - v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2); + tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1); // NOLINT + v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2); // NOLINT x = _mm256_sub_ps(x, tmp); x = _mm256_sub_ps(x, z); z = _mm256_mul_ps(x, x); - v8sf y = *(v8sf *)_ps256_cephes_exp_p0; + v8sf y = *(v8sf *)_ps256_cephes_exp_p0; // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5); // NOLINT y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, x); y = _mm256_add_ps(y, one); @@ -308,7 +311,7 @@ v8sf exp256_ps(v8sf x) { /* build 2^n */ imm0 = _mm256_cvttps_epi32(fx); // another two AVX2 instructions - imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f); + imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f); // NOLINT imm0 = avx2_mm256_slli_epi32(imm0, 23); v8sf pow2n = _mm256_castsi256_ps(imm0); y = _mm256_mul_ps(y, pow2n); @@ -349,12 +352,12 @@ v8sf sin256_ps(v8sf x) { // any x sign_bit = x; /* take the absolute value */ - x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); // NOLINT /* extract the sign bit (upper one) */ - sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask); + sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask); // NOLINT /* scale by 4/Pi */ - y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); + y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); // NOLINT /* Here we start a series of integer operations, which are in the @@ -367,12 +370,12 @@ v8sf sin256_ps(v8sf x) { // any x imm2 = _mm256_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ // another two AVX2 instruction - imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); + imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); // NOLINT + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); // NOLINT y = _mm256_cvtepi32_ps(imm2); /* get the swap sign flag */ - imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); + imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); // NOLINT imm0 = avx2_mm256_slli_epi32(imm0, 29); /* get the polynom selection 
mask there is one polynom for 0 <= x <= Pi/4 @@ -380,31 +383,31 @@ v8sf sin256_ps(v8sf x) { // any x Both branches will be computed. */ - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); - imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); // NOLINT + imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); // NOLINT #else /* we use SSE2 routines to perform the integer ops */ COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2); - imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); - imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); + imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); // NOLINT + imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); // NOLINT - imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); - imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); + imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); // NOLINT + imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); // NOLINT COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); y = _mm256_cvtepi32_ps(imm2); - imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4); - imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4); + imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4); // NOLINT + imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4); // NOLINT imm0_1 = _mm_slli_epi32(imm0_1, 29); imm0_2 = _mm_slli_epi32(imm0_2, 29); COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); - imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); - imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); + imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); // NOLINT + imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); // NOLINT imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); @@ -418,9 +421,9 @@ v8sf sin256_ps(v8sf x) { // any x /* The magic pass: "Extended precision modular arithmetic" x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; - xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; - xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; + xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; // NOLINT + xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; // NOLINT + xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; // NOLINT xmm1 = _mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); @@ -429,26 +432,26 @@ v8sf sin256_ps(v8sf x) { // any x x = _mm256_add_ps(x, xmm3); /* Evaluate the first polynom (0 <= x <= Pi/4) */ - y = *(v8sf *)_ps256_coscof_p0; + y = *(v8sf *)_ps256_coscof_p0; // NOLINT v8sf z = _mm256_mul_ps(x, x); y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); // NOLINT y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); + y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); // NOLINT y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); - v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); + v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT y = _mm256_sub_ps(y, tmp); - y = _mm256_add_ps(y, *(v8sf *)_ps256_1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_1); // NOLINT /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - v8sf y2 = *(v8sf *)_ps256_sincof_p0; + v8sf y2 = *(v8sf *)_ps256_sincof_p0; // NOLINT y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); + y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); // NOLINT y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); + y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); // NOLINT y2 = 
_mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); @@ -475,53 +478,53 @@ v8sf cos256_ps(v8sf x) { // any x #endif /* take the absolute value */ - x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); // NOLINT /* scale by 4/Pi */ - y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); + y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); // NOLINT #ifdef __AVX2__ /* store the integer part of y in mm0 */ imm2 = _mm256_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ - imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); + imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); // NOLINT + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); // NOLINT y = _mm256_cvtepi32_ps(imm2); - imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2); + imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2); // NOLINT /* get the swap sign flag */ - imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4); + imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4); // NOLINT imm0 = avx2_mm256_slli_epi32(imm0, 29); /* get the polynom selection mask */ - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); - imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); // NOLINT + imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); // NOLINT #else /* we use SSE2 routines to perform the integer ops */ COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2); - imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); - imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); + imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); // NOLINT + imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); // NOLINT - imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); - imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); + imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); // NOLINT + imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); // NOLINT COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); y = _mm256_cvtepi32_ps(imm2); - imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2); - imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2); + imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2); // NOLINT + imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2); // NOLINT - imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4); - imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4); + imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4); // NOLINT + imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4); // NOLINT imm0_1 = _mm_slli_epi32(imm0_1, 29); imm0_2 = _mm_slli_epi32(imm0_2, 29); COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); - imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); - imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); + imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); // NOLINT + imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); // NOLINT imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); @@ -534,9 +537,9 @@ v8sf cos256_ps(v8sf x) { // any x /* The magic pass: "Extended precision modular arithmetic" x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; - xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; - xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; + xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; // NOLINT + xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; // NOLINT + xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; // 
NOLINT xmm1 = _mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); @@ -545,26 +548,26 @@ v8sf cos256_ps(v8sf x) { // any x x = _mm256_add_ps(x, xmm3); /* Evaluate the first polynom (0 <= x <= Pi/4) */ - y = *(v8sf *)_ps256_coscof_p0; + y = *(v8sf *)_ps256_coscof_p0; // NOLINT v8sf z = _mm256_mul_ps(x, x); y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); // NOLINT y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); + y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); // NOLINT y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); - v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); + v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT y = _mm256_sub_ps(y, tmp); - y = _mm256_add_ps(y, *(v8sf *)_ps256_1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_1); // NOLINT /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - v8sf y2 = *(v8sf *)_ps256_sincof_p0; + v8sf y2 = *(v8sf *)_ps256_sincof_p0; // NOLINT y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); + y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); // NOLINT y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); + y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); // NOLINT y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); @@ -595,42 +598,43 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { sign_bit_sin = x; /* take the absolute value */ - x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); // NOLINT /* extract the sign bit (upper one) */ - sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask); + sign_bit_sin = + _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask); // NOLINT /* scale by 4/Pi */ - y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); + y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); // NOLINT #ifdef __AVX2__ /* store the integer part of y in imm2 */ imm2 = _mm256_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ - imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); + imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); // NOLINT + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); // NOLINT y = _mm256_cvtepi32_ps(imm2); imm4 = imm2; /* get the swap sign flag for the sine */ - imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); + imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); // NOLINT imm0 = avx2_mm256_slli_epi32(imm0, 29); // v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0); /* get the polynom selection mask for the sine*/ - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); - imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); // NOLINT + imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); // NOLINT // v8sf poly_mask = _mm256_castsi256_ps(imm2); #else /* we use SSE2 routines to perform the integer ops */ COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2); - imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); - imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); + imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); // NOLINT + imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); // NOLINT - imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); - imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); + imm2_1 = _mm_and_si128(imm2_1, *(v4si 
*)_pi32avx_inv1); // NOLINT + imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); // NOLINT COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); y = _mm256_cvtepi32_ps(imm2); @@ -638,16 +642,16 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { imm4_1 = imm2_1; imm4_2 = imm2_2; - imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4); - imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4); + imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4); // NOLINT + imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4); // NOLINT imm0_1 = _mm_slli_epi32(imm0_1, 29); imm0_2 = _mm_slli_epi32(imm0_2, 29); COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); - imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); - imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); + imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); // NOLINT + imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); // NOLINT imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); @@ -659,9 +663,9 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { /* The magic pass: "Extended precision modular arithmetic" x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; - xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; - xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; + xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; // NOLINT + xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; // NOLINT + xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; // NOLINT xmm1 = _mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); @@ -670,15 +674,15 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { x = _mm256_add_ps(x, xmm3); #ifdef __AVX2__ - imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2); - imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4); + imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2); // NOLINT + imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4); // NOLINT imm4 = avx2_mm256_slli_epi32(imm4, 29); #else - imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2); - imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2); + imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2); // NOLINT + imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2); // NOLINT - imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4); - imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4); + imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4); // NOLINT + imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4); // NOLINT imm4_1 = _mm_slli_epi32(imm4_1, 29); imm4_2 = _mm_slli_epi32(imm4_2, 29); @@ -692,25 +696,25 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { /* Evaluate the first polynom (0 <= x <= Pi/4) */ v8sf z = _mm256_mul_ps(x, x); - y = *(v8sf *)_ps256_coscof_p0; + y = *(v8sf *)_ps256_coscof_p0; // NOLINT y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); // NOLINT y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); + y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); // NOLINT y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); - v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); + v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT y = _mm256_sub_ps(y, tmp); - y = _mm256_add_ps(y, *(v8sf *)_ps256_1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_1); // NOLINT /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - v8sf y2 = *(v8sf *)_ps256_sincof_p0; + v8sf y2 = *(v8sf *)_ps256_sincof_p0; // NOLINT y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); + y2 = _mm256_add_ps(y2, *(v8sf 
*)_ps256_sincof_p1); // NOLINT y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); + y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); // NOLINT y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); @@ -729,3 +733,6 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { *s = _mm256_xor_ps(xmm1, sign_bit_sin); *c = _mm256_xor_ps(xmm2, sign_bit_cos); } + +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/gru_compute.cc b/lite/kernels/x86/gru_compute.cc index d8e70833aa..23842957fa 100644 --- a/lite/kernels/x86/gru_compute.cc +++ b/lite/kernels/x86/gru_compute.cc @@ -13,10 +13,13 @@ // limitations under the License. #include "lite/kernels/x86/gru_compute.h" +#include "lite/utils/env.h" -DEFINE_int32(paddle_num_threads, - 1, - "Number of threads for each paddle instance."); +// DEFINE_int32(paddle_num_threads, +// 1, +// "Number of threads for each paddle instance."); +int32_t paddle_num_threads = + paddle::lite::GetIntFromEnv("paddle_num_threads", 1); REGISTER_LITE_KERNEL(gru, kX86, diff --git a/lite/kernels/x86/gru_compute.h b/lite/kernels/x86/gru_compute.h index e3c6f70fdb..948485105a 100644 --- a/lite/kernels/x86/gru_compute.h +++ b/lite/kernels/x86/gru_compute.h @@ -26,7 +26,8 @@ #include "lite/core/types.h" #include "lite/fluid/eigen.h" -DECLARE_int32(paddle_num_threads); +// DECLARE_int32(paddle_num_threads); +extern int32_t paddle_num_threads; namespace paddle { namespace lite { @@ -109,7 +110,7 @@ class GRUCompute : public KernelLite { #ifdef PADDLE_WITH_MKLML // use MKL packed to speedup GEMM - if (FLAGS_paddle_num_threads >= 4) { + if (paddle_num_threads >= 4) { auto blas = lite::x86::math::GetBlas(context); T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/, diff --git a/lite/utils/env.h b/lite/utils/env.h new file mode 100644 index 0000000000..86af8c9e7e --- /dev/null +++ b/lite/utils/env.h @@ -0,0 +1,71 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+#include <stdlib.h>
+#include <string.h>
+
+#include <cstdlib>
+#include <string>
+
+namespace paddle {
+namespace lite {
+
+static std::string GetStringFromEnv(const std::string& str,
+                                    const std::string& def = "") {
+  char* variable = std::getenv(str.c_str());
+  if (!variable) {
+    return def;
+  }
+  return std::string(variable);
+}
+
+static bool GetBoolFromEnv(const std::string& str, bool def = false) {
+  char* variable = std::getenv(str.c_str());
+  if (!variable) {
+    return def;
+  }
+  if (strcmp(variable, "false") == 0 || strcmp(variable, "0") == 0) {
+    return false;
+  } else {
+    return true;
+  }
+}
+
+static int GetIntFromEnv(const std::string& str, int def = 0) {
+  char* variable = std::getenv(str.c_str());
+  if (!variable) {
+    return def;
+  }
+  return atoi(variable);
+}
+
+static double GetDoubleFromEnv(const std::string& str, double def = 0.0) {
+  char* variable = std::getenv(str.c_str());
+  if (!variable) {
+    return def;
+  }
+  return atof(variable);
+}
+
+static uint64_t GetUInt64FromEnv(const std::string& str, uint64_t def = 0ul) {
+  char* variable = std::getenv(str.c_str());
+  if (!variable) {
+    return def;
+  }
+  return static_cast<uint64_t>(atol(variable));
+}
+
+}  // namespace lite
+}  // namespace paddle
--
GitLab
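Usage note (illustrative, not part of the patch): after this change the former gflags flags become plain process environment variables, read once during static initialization through the helpers added in lite/utils/env.h. The sketch below shows the lookup pattern; the helper functions and the variable names (dump_jitcode, paddle_num_threads, cuda_dir) come from the patch, while the standalone main() harness and the sample values are assumptions for demonstration only.

#include <cstdio>
#include <string>

#include "lite/utils/env.h"

int main() {
  // `export dump_jitcode=1` -> true; unset -> the default (false).
  bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode");

  // `export paddle_num_threads=8` -> 8; unset -> the default of 1.
  int num_threads = paddle::lite::GetIntFromEnv("paddle_num_threads", 1);

  // Path-like settings (e.g. the CUDA library directory) default to "".
  std::string cuda_dir = paddle::lite::GetStringFromEnv("cuda_dir");

  std::printf("dump_jitcode=%d num_threads=%d cuda_dir=%s\n",
              static_cast<int>(dump_jitcode), num_threads, cuda_dir.c_str());
  return 0;
}

Because the globals in cpu_info.cc, dynamic_loader.cc, gen_base.cc and gru_compute.cc are initialized before main() runs, these variables must be exported before the process is launched; changing them at runtime has no effect.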