From 911c314139aa47c0a18f52cfa8d2fdf6b987d072 Mon Sep 17 00:00:00 2001 From: Alex Stark Date: Wed, 21 Aug 2019 13:48:41 -0700 Subject: [PATCH] Ruy: Introduce CPU ID detection on x86. This amounts to disabling Ruy paths for this cpuid instruction results lack selected features. PiperOrigin-RevId: 264683681 --- tensorflow/lite/experimental/ruy/BUILD | 24 ++++- tensorflow/lite/experimental/ruy/context.cc | 23 ++++- .../ruy/{detect_dotprod.cc => detect_arm.cc} | 6 +- .../ruy/{detect_dotprod.h => detect_arm.h} | 6 +- .../lite/experimental/ruy/detect_x86.cc | 90 +++++++++++++++++++ tensorflow/lite/experimental/ruy/detect_x86.h | 42 +++++++++ tensorflow/lite/experimental/ruy/platform.h | 16 ++-- tensorflow/lite/kernels/internal/BUILD | 2 +- .../internal/optimized/neon_tensor_utils.cc | 2 +- 9 files changed, 189 insertions(+), 22 deletions(-) rename tensorflow/lite/experimental/ruy/{detect_dotprod.cc => detect_arm.cc} (98%) rename tensorflow/lite/experimental/ruy/{detect_dotprod.h => detect_arm.h} (83%) create mode 100644 tensorflow/lite/experimental/ruy/detect_x86.cc create mode 100644 tensorflow/lite/experimental/ruy/detect_x86.h diff --git a/tensorflow/lite/experimental/ruy/BUILD b/tensorflow/lite/experimental/ruy/BUILD index 4ffcf229883..48dabde355a 100644 --- a/tensorflow/lite/experimental/ruy/BUILD +++ b/tensorflow/lite/experimental/ruy/BUILD @@ -204,17 +204,32 @@ cc_library( ) cc_library( - name = "detect_dotprod", + name = "detect_arm", srcs = [ - "detect_dotprod.cc", + "detect_arm.cc", ], hdrs = [ - "detect_dotprod.h", + "detect_arm.h", ], copts = RUY_COPTS, visibility = ruy_visibility(), ) +cc_library( + name = "detect_x86", + srcs = [ + "detect_x86.cc", + ], + hdrs = [ + "detect_x86.h", + ], + copts = RUY_COPTS, + visibility = ruy_visibility(), + deps = [ + ":platform", + ], +) + cc_library( name = "path", hdrs = ["path.h"], @@ -256,7 +271,8 @@ cc_library( deps = [ ":allocator", ":check_macros", - ":detect_dotprod", + ":detect_arm", + ":detect_x86", ":path", ":thread_pool", ":trace", diff --git a/tensorflow/lite/experimental/ruy/context.cc b/tensorflow/lite/experimental/ruy/context.cc index cc2c8ee0040..32f222c77f5 100644 --- a/tensorflow/lite/experimental/ruy/context.cc +++ b/tensorflow/lite/experimental/ruy/context.cc @@ -16,7 +16,8 @@ limitations under the License. #include "tensorflow/lite/experimental/ruy/context.h" #include "tensorflow/lite/experimental/ruy/check_macros.h" -#include "tensorflow/lite/experimental/ruy/detect_dotprod.h" +#include "tensorflow/lite/experimental/ruy/detect_arm.h" +#include "tensorflow/lite/experimental/ruy/detect_x86.h" namespace ruy { @@ -41,13 +42,31 @@ Path Context::GetRuntimeEnabledPaths() { // Now selectively disable paths that aren't supported on this machine. if ((runtime_enabled_paths_ & Path::kNeonDotprod) != Path::kNone) { if (!DetectDotprod()) { - runtime_enabled_paths_ = runtime_enabled_paths_ ^ Path::kNeonDotprod; + runtime_enabled_paths_ = runtime_enabled_paths_ & ~Path::kNeonDotprod; // Sanity check. RUY_DCHECK((runtime_enabled_paths_ & Path::kNeonDotprod) == Path::kNone); } } #endif +#if RUY_PLATFORM(X86) + if ((runtime_enabled_paths_ & Path::kAvx2) != Path::kNone) { + if (!DetectCpuAvx2()) { + runtime_enabled_paths_ = runtime_enabled_paths_ & ~Path::kAvx2; + // Sanity check. + RUY_DCHECK((runtime_enabled_paths_ & Path::kAvx2) == Path::kNone); + } + } + + if ((runtime_enabled_paths_ & Path::kAvx512) != Path::kNone) { + if (!DetectCpuAvx512()) { + runtime_enabled_paths_ = runtime_enabled_paths_ & ~Path::kAvx512; + // Sanity check. + RUY_DCHECK((runtime_enabled_paths_ & Path::kAvx512) == Path::kNone); + } + } +#endif + // Sanity check. We can't possibly have disabled all paths, as some paths // are universally available (kReference, kStandardCpp). RUY_DCHECK(runtime_enabled_paths_ != Path::kNone); diff --git a/tensorflow/lite/experimental/ruy/detect_dotprod.cc b/tensorflow/lite/experimental/ruy/detect_arm.cc similarity index 98% rename from tensorflow/lite/experimental/ruy/detect_dotprod.cc rename to tensorflow/lite/experimental/ruy/detect_arm.cc index 35c812e42b5..f40a963ef33 100644 --- a/tensorflow/lite/experimental/ruy/detect_dotprod.cc +++ b/tensorflow/lite/experimental/ruy/detect_arm.cc @@ -70,6 +70,8 @@ bool try_asm_snippet(bool (*asm_snippet)()) { ``` */ +#include "tensorflow/lite/experimental/ruy/detect_arm.h" + #if defined __aarch64__ && defined __linux__ #define RUY_IMPLEMENT_DETECT_DOTPROD #endif @@ -215,8 +217,8 @@ bool DetectDotprod() { return DetectDotprodBySigIllMethod(); } -#else +#else // RUY_IMPLEMENT_DETECT_DOTPROD bool DetectDotprod() { return false; } -#endif +#endif // RUY_IMPLEMENT_DETECT_DOTPROD } // namespace ruy diff --git a/tensorflow/lite/experimental/ruy/detect_dotprod.h b/tensorflow/lite/experimental/ruy/detect_arm.h similarity index 83% rename from tensorflow/lite/experimental/ruy/detect_dotprod.h rename to tensorflow/lite/experimental/ruy/detect_arm.h index 39c73013ba0..e843a684396 100644 --- a/tensorflow/lite/experimental/ruy/detect_dotprod.h +++ b/tensorflow/lite/experimental/ruy/detect_arm.h @@ -15,8 +15,8 @@ limitations under the License. // Temporary dotprod-detection code until we can rely on getauxval. -#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_DETECT_DOTPROD_H_ -#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_DETECT_DOTPROD_H_ +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_DETECT_ARM_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_DETECT_ARM_H_ namespace ruy { @@ -26,4 +26,4 @@ bool DetectDotprod(); } // namespace ruy -#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_DETECT_DOTPROD_H_ +#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_DETECT_ARM_H_ diff --git a/tensorflow/lite/experimental/ruy/detect_x86.cc b/tensorflow/lite/experimental/ruy/detect_x86.cc new file mode 100644 index 00000000000..f96f172ee80 --- /dev/null +++ b/tensorflow/lite/experimental/ruy/detect_x86.cc @@ -0,0 +1,90 @@ +/* Copyright 2019 Google LLC. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/experimental/ruy/detect_x86.h" + +#include + +#if RUY_PLATFORM(X86) && RUY_PLATFORM(X86_ENHANCEMENTS) +#include // IWYU pragma: keep + +#endif + +namespace ruy { +#if RUY_PLATFORM(X86) && RUY_PLATFORM(X86_ENHANCEMENTS) + +namespace { + +// See Intel docs, such as http://goo.gl/c6IkGX. +inline void RunCpuid(std::uint32_t eax, std::uint32_t ecx, + std::uint32_t abcd[4]) { + std::uint32_t ebx, edx; +#if defined(__i386__) && defined(__PIC__) + /* in case of PIC under 32-bit EBX cannot be clobbered */ + asm volatile("movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" + : "=D"(ebx), +#else + asm volatile("cpuid" + : "+b"(ebx), +#endif + "+a"(eax), "+c"(ecx), "=d"(edx)); + abcd[0] = eax; + abcd[1] = ebx; + abcd[2] = ecx; + abcd[3] = edx; +} + +} // namespace + +bool DetectCpuSse42() { + constexpr std::uint32_t kAvx512EcxSse42 = 1u << 20; + constexpr std::uint32_t kAvx512EcxAbm = 1u << 5; + + std::uint32_t abcd[4]; + + RunCpuid(1, 0, abcd); + const bool has_sse4_2_base = (abcd[2] & kAvx512EcxSse42) == kAvx512EcxSse42; + RunCpuid(0x80000001, 0, abcd); + const bool has_abm = (abcd[2] & kAvx512EcxAbm) == kAvx512EcxAbm; + + return has_sse4_2_base && has_abm; +} + +bool DetectCpuAvx2() { + constexpr std::uint32_t kAvx2Ebx = 1u << 5; + + std::uint32_t abcd[4]; + RunCpuid(7, 0, abcd); + + return (abcd[1] & kAvx2Ebx) == kAvx2Ebx; +} + +bool DetectCpuAvx512() { + constexpr std::uint32_t kAvx512EbxF = 1u << 16; + constexpr std::uint32_t kAvx512EbxDq = 1u << 17; + constexpr std::uint32_t kAvx512EbxCd = 1u << 28; + constexpr std::uint32_t kAvx512EbxBw = 1u << 30; + constexpr std::uint32_t kAvx512EbxVl = 1u << 31; + + constexpr std::uint32_t kAvx512EbxMask = + kAvx512EbxF | kAvx512EbxDq | kAvx512EbxCd | kAvx512EbxBw | kAvx512EbxVl; + std::uint32_t abcd[4]; + RunCpuid(7, 0, abcd); + + return (abcd[1] & kAvx512EbxMask) == kAvx512EbxMask; +} + +#endif +} // namespace ruy diff --git a/tensorflow/lite/experimental/ruy/detect_x86.h b/tensorflow/lite/experimental/ruy/detect_x86.h new file mode 100644 index 00000000000..e469bcf8e84 --- /dev/null +++ b/tensorflow/lite/experimental/ruy/detect_x86.h @@ -0,0 +1,42 @@ +/* Copyright 2019 Google LLC. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_DETECT_X86_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_DETECT_X86_H_ + +#include "tensorflow/lite/experimental/ruy/platform.h" + +namespace ruy { + +#if RUY_PLATFORM(X86) +#if RUY_PLATFORM(X86_ENHANCEMENTS) + +// This also checks ABM support, which implies LZCNT and POPCNT. +bool DetectCpuSse42(); +bool DetectCpuAvx2(); +bool DetectCpuAvx512(); + +#else // RUY_PLATFORM(X86_ENHANCEMENTS) + +inline bool DetectCpuSse42() { return false; } +inline bool DetectCpuAvx2() { return false; } +inline bool DetectCpuAvx512() { return false; } + +#endif // !RUY_PLATFORM(X86_ENHANCEMENTS) +#endif // RUY_PLATFORM(X86) + +} // namespace ruy + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_DETECT_X86_H_ diff --git a/tensorflow/lite/experimental/ruy/platform.h b/tensorflow/lite/experimental/ruy/platform.h index 6bd8102e2cf..7dd67bd186d 100644 --- a/tensorflow/lite/experimental/ruy/platform.h +++ b/tensorflow/lite/experimental/ruy/platform.h @@ -80,9 +80,9 @@ limitations under the License. // restriction. #if defined(RUY_FORCE_ENABLE_X86_ENHANCEMENTS) || \ (defined(__clang__) && defined(__linux__)) -#define RUY_USE_X86_ENHANCEMENTS 1 +#define RUY_DONOTUSEDIRECTLY_X86_ENHANCEMENTS 1 #else -#define RUY_USE_X86_ENHANCEMENTS 0 +#define RUY_DONOTUSEDIRECTLY_X86_ENHANCEMENTS 0 #endif // These CPU capabilities will all be true when Skylake, etc, are enabled during @@ -91,15 +91,15 @@ limitations under the License. // TODO(b/138433137) Select x86 enhancements at runtime rather than via compile // options. // -#if RUY_USE_X86_ENHANCEMENTS && RUY_PLATFORM(X86) && defined(__AVX512F__) && \ - defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__) && \ - defined(__AVX512VL__) +#if RUY_PLATFORM(X86_ENHANCEMENTS) && RUY_PLATFORM(X86) && \ + defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && \ + defined(__AVX512BW__) && defined(__AVX512VL__) #define RUY_DONOTUSEDIRECTLY_AVX512 1 #else #define RUY_DONOTUSEDIRECTLY_AVX512 0 #endif -#if defined(RUY_ENABLE_AVX2_ENHANCEMENTS) && RUY_USE_X86_ENHANCEMENTS && \ +#if defined(RUY_ENABLE_AVX2_ENHANCEMENTS) && RUY_PLATFORM(X86_ENHANCEMENTS) && \ RUY_PLATFORM(X86) && defined(__AVX2__) #define RUY_DONOTUSEDIRECTLY_AVX2 1 #else @@ -107,7 +107,7 @@ limitations under the License. #endif // Note does not check for LZCNT or POPCNT. -#if RUY_USE_X86_ENHANCEMENTS && RUY_PLATFORM(X86) && defined(__SSE4_2__) +#if RUY_PLATFORM(X86_ENHANCEMENTS) && RUY_PLATFORM(X86) && defined(__SSE4_2__) #define RUY_DONOTUSEDIRECTLY_SSE4_2 1 #else #define RUY_DONOTUSEDIRECTLY_SSE4_2 0 @@ -120,6 +120,4 @@ limitations under the License. #define RUY_DONOTUSEDIRECTLY_APPLE 0 #endif -#undef RUY_USE_X86_ENHANCEMENTS - #endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_PLATFORM_H_ diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index eb442598c70..639da16cec5 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -535,7 +535,7 @@ cc_library( ":round", ":types", "//tensorflow/lite/c:c_api_internal", - "//tensorflow/lite/experimental/ruy:detect_dotprod", + "//tensorflow/lite/experimental/ruy:detect_arm", "//tensorflow/lite/kernels:activation_functor", "//tensorflow/lite/kernels:cpu_backend_context", "//tensorflow/lite/kernels:op_macros", diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc index 889bb8339e5..86b758fca0c 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc @@ -22,7 +22,7 @@ limitations under the License. #include #include "tensorflow/lite/c/builtin_op_data.h" -#include "tensorflow/lite/experimental/ruy/detect_dotprod.h" +#include "tensorflow/lite/experimental/ruy/detect_arm.h" #include "tensorflow/lite/kernels/activation_functor.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" -- GitLab