提交 8c1064d9 编写于 作者: B Benoit Jacob 提交者: TensorFlower Gardener

Fix performance regression (b/137615815) introduced by new platform

#defines - they were tested directly by #ifdef, and were being defined
by path.h.  As tune.cc did not #include path.h, it did not enable its
platform-specific tuning code, resulting in a performance regression
in cases relying on tuning for maximal performance --- in-order ARM.

To prevent that from happening again, this moves the platform defines
to a new platform.h and forces users to use a RUY_PLATFORM(X) function
macro, so that if they fail to #include platform.h, they get a compilation
error.

PiperOrigin-RevId: 258372624
上级 f2df2c28
......@@ -12,6 +12,11 @@ package(
licenses = ["notice"], # Apache 2.0
)
cc_library(
name = "platform",
hdrs = ["platform.h"],
)
cc_library(
name = "check_macros",
hdrs = ["check_macros.h"],
......@@ -60,6 +65,7 @@ cc_library(
],
deps = [
":opt_set",
":platform",
":time",
],
)
......@@ -164,7 +170,10 @@ cc_library(
name = "path",
hdrs = ["path.h"],
visibility = ruy_visibility(),
deps = [":size_util"],
deps = [
":platform",
":size_util",
],
)
cc_library(
......@@ -238,6 +247,7 @@ cc_library(
":matrix",
":opt_set",
":path",
":platform",
],
)
......@@ -255,6 +265,7 @@ cc_library(
":internal_matrix",
":opt_set",
":path",
":platform",
":size_util",
":spec",
":tune",
......@@ -276,6 +287,7 @@ cc_library(
":internal_matrix",
":opt_set",
":path",
":platform",
":spec",
":tune",
"@gemmlowp//:profiler",
......@@ -371,6 +383,7 @@ cc_library(
":ruy",
":time",
"@com_google_googletest//:gtest",
":platform",
] + ruy_test_ext_deps(),
)
......
......@@ -26,8 +26,9 @@ limitations under the License.
#include "tensorflow/lite/experimental/ruy/matrix.h"
#include "tensorflow/lite/experimental/ruy/opt_set.h"
#include "tensorflow/lite/experimental/ruy/path.h"
#include "tensorflow/lite/experimental/ruy/platform.h"
#if ((defined RUY_NEON_64) || (defined RUY_NEON_32))
#if (RUY_PLATFORM(NEON_64) || RUY_PLATFORM(NEON_32))
#include <arm_neon.h>
#endif
......
......@@ -25,6 +25,7 @@ limitations under the License.
#include "tensorflow/lite/experimental/ruy/internal_matrix.h"
#include "tensorflow/lite/experimental/ruy/opt_set.h"
#include "tensorflow/lite/experimental/ruy/path.h"
#include "tensorflow/lite/experimental/ruy/platform.h"
#include "tensorflow/lite/experimental/ruy/size_util.h"
#include "tensorflow/lite/experimental/ruy/spec.h"
#include "tensorflow/lite/experimental/ruy/tune.h"
......@@ -217,7 +218,7 @@ RUY_INHERIT_KERNEL(Path::kStandardCpp, Path::kNeon)
RUY_INHERIT_KERNEL(Path::kNeon, Path::kNeonDotprod)
// KernelParams are shared across 32-bit and 64-bit NEON code.
#if ((defined RUY_NEON_64) || (defined RUY_NEON_32)) && \
#if (RUY_PLATFORM(NEON_64) || RUY_PLATFORM(NEON_32)) && \
(RUY_OPT_ENABLED(RUY_OPT_ASM))
#define RUY_ASM_FLAG_HAS_BIAS 0x1
......@@ -369,7 +370,7 @@ void Kernel8bitNeonInOrder(const KernelParams8bit<4, 4>& params);
void Kernel8bitNeonDotprodOutOfOrder(const KernelParams8bit<8, 8>& params);
void Kernel8bitNeonDotprodInOrder(const KernelParams8bit<8, 8>& params);
#ifdef RUY_NEON_64
#if RUY_PLATFORM(NEON_64)
template <typename DstScalar>
struct Kernel<Path::kNeon, std::int8_t, std::int8_t, DstScalar,
BasicSpec<std::int32_t, DstScalar>> {
......@@ -489,7 +490,7 @@ void KernelFloatNeonInOrder(const KernelParamsFloat<8, 8>& params);
void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params);
void KernelFloatNeonDotprodInOrder(const KernelParamsFloat<8, 8>& params);
#ifdef RUY_NEON_64
#if RUY_PLATFORM(NEON_64)
// A Float kernel for ARM64 Neon.
template <>
struct Kernel<Path::kNeon, float, float, float, BasicSpec<float, float>> {
......@@ -512,7 +513,7 @@ struct Kernel<Path::kNeon, float, float, float, BasicSpec<float, float>> {
};
#endif
#ifdef RUY_NEON_32
#if RUY_PLATFORM(NEON_32)
// A Float kernel for ARM32 Neon.
template <>
struct Kernel<Path::kNeon, float, float, float, BasicSpec<float, float>> {
......@@ -559,7 +560,7 @@ struct Kernel<Path::kNeonDotprod, float, float, float,
}
};
#endif // ((defined RUY_NEON_64) || (defined RUY_NEON_32)) &&
#endif // (RUY_PLATFORM(NEON_64) || RUY_PLATFORM(NEON_32)) &&
// (RUY_OPT_ENABLED(RUY_OPT_ASM)
} // namespace ruy
......
......@@ -15,10 +15,11 @@ limitations under the License.
#include "profiling/instrumentation.h"
#include "tensorflow/lite/experimental/ruy/kernel.h"
#include "tensorflow/lite/experimental/ruy/platform.h"
namespace ruy {
#if (defined RUY_NEON_32) && RUY_OPT_ENABLED(RUY_OPT_ASM)
#if RUY_PLATFORM(NEON_32) && RUY_OPT_ENABLED(RUY_OPT_ASM)
#define RUY_ASM_LABEL_STORE_UINT8 91
#define RUY_ASM_LABEL_STORE_INT8 92
......@@ -539,5 +540,5 @@ void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params) {
#undef RUY_OFFSET_RHS_BASE_PTR
#undef RUY_OFFSET_DST_BASE_PTR
#endif // (defined RUY_NEON_32) && (RUY_OPT_ENABLED(RUY_OPT_ASM)
#endif // RUY_PLATFORM(NEON_32) && (RUY_OPT_ENABLED(RUY_OPT_ASM)
} // namespace ruy
......@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/experimental/ruy/kernel.h"
#include "profiling/instrumentation.h"
#include "tensorflow/lite/experimental/ruy/kernel.h"
#include "tensorflow/lite/experimental/ruy/platform.h"
namespace ruy {
#if (defined RUY_NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
#if RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
#define RUY_ASM_LABEL_STORE_UINT8 91
#define RUY_ASM_LABEL_STORE_INT8 92
......@@ -6302,6 +6302,6 @@ void KernelFloatNeonDotprodInOrder(const KernelParamsFloat<8, 8>& params) {
#undef RUY_OFFSET_RHS_BASE_PTR
#undef RUY_OFFSET_DST_BASE_PTR
#endif // (defined RUY_NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
#endif // RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
} // namespace ruy
......@@ -15,9 +15,11 @@ limitations under the License.
#include "tensorflow/lite/experimental/ruy/pack.h"
#include "tensorflow/lite/experimental/ruy/platform.h"
namespace ruy {
#if (defined RUY_NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
#if RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
void Pack8bitNeonOutOfOrder(const void* src_ptr0, const void* src_ptr1,
const void* src_ptr2, const void* src_ptr3,
......@@ -1329,6 +1331,6 @@ void PackFloatNeonInOrder(const float* src_ptr0, const float* src_ptr1,
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
#endif // (defined RUY_NEON64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
#endif // RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
} // namespace ruy
......@@ -89,6 +89,7 @@ limitations under the License.
#include "tensorflow/lite/experimental/ruy/common.h"
#include "tensorflow/lite/experimental/ruy/internal_matrix.h"
#include "tensorflow/lite/experimental/ruy/opt_set.h"
#include "tensorflow/lite/experimental/ruy/platform.h"
#include "tensorflow/lite/experimental/ruy/tune.h"
namespace ruy {
......@@ -158,7 +159,7 @@ struct PackImpl<Path::kStandardCpp, FixedKernelLayout, Scalar, PackedScalar,
RUY_INHERIT_PACK(Path::kStandardCpp, Path::kNeon)
RUY_INHERIT_PACK(Path::kNeon, Path::kNeonDotprod)
#if (defined RUY_NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
#if RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
void Pack8bitNeonOutOfOrder(const void* src_ptr0, const void* src_ptr1,
const void* src_ptr2, const void* src_ptr3,
......@@ -384,7 +385,7 @@ struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kRowMajor, 1, 8>, float,
}
};
#endif // (defined RUY_NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
#endif // RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
// Main entry point for packing.
template <Path ThePath, typename FixedKernelLayout, typename Scalar,
......
......@@ -18,29 +18,9 @@ limitations under the License.
#include <cstdint>
#include "tensorflow/lite/experimental/ruy/platform.h"
#include "tensorflow/lite/experimental/ruy/size_util.h"
// Detect ARM, 32-bit or 64-bit
#ifdef __aarch64__
#define RUY_ARM_64
#elif defined(__arm__)
#define RUY_ARM_32
#endif
// Detect NEON.
#if (defined __ARM_NEON) || (defined __ARM_NEON__)
#define RUY_NEON
#endif
// Define 32bit ARM NEON and 64 bit ARM NEON
#if defined(RUY_NEON) && defined(RUY_ARM_32)
#define RUY_NEON_32
#endif
#if defined(RUY_NEON) && defined(RUY_ARM_64)
#define RUY_NEON_64
#endif
namespace ruy {
// A Path is a choice of implementation path, e.g. between reference code
......@@ -119,21 +99,21 @@ inline Path GetMostSignificantPath(Path path_mask) {
// ruy::kAllPaths represents all Path's that make sense to on a given
// base architecture.
#ifdef __linux__
#ifdef RUY_NEON_64
#if RUY_PLATFORM(NEON_64)
constexpr Path kAllPaths =
Path::kReference | Path::kStandardCpp | Path::kNeon | Path::kNeonDotprod;
#elif defined RUY_NEON_32
#elif RUY_PLATFORM(NEON_32)
constexpr Path kAllPaths = Path::kReference | Path::kStandardCpp | Path::kNeon;
#else
constexpr Path kAllPaths = Path::kReference | Path::kStandardCpp;
#endif // RUY_NEON_64
#endif
#else // __linux__
// We don't know how to do runtime dotprod detection outside of linux for now.
#if defined(RUY_NEON_64) || defined(RUY_NEON_32)
#if RUY_PLATFORM(NEON)
constexpr Path kAllPaths = Path::kReference | Path::kStandardCpp | Path::kNeon;
#else
constexpr Path kAllPaths = Path::kReference | Path::kStandardCpp;
#endif // defined(RUY_NEON_64) || defined(RUY_NEON_32)
#endif
#endif // __linux__
} // namespace ruy
......
/* Copyright 2019 Google LLC. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_PLATFORM_H_
#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_PLATFORM_H_
#define RUY_PLATFORM(X) ((RUY_DONOTUSEDIRECTLY_##X) != 0)
// Detect ARM 32-bit
#ifdef __arm__
#define RUY_DONOTUSEDIRECTLY_ARM_32 1
#else
#define RUY_DONOTUSEDIRECTLY_ARM_32 0
#endif
// Detect ARM 64-bit
#ifdef __aarch64__
#define RUY_DONOTUSEDIRECTLY_ARM_64 1
#else
#define RUY_DONOTUSEDIRECTLY_ARM_64 0
#endif
// Detect NEON
#if (defined __ARM_NEON) || (defined __ARM_NEON__)
#define RUY_DONOTUSEDIRECTLY_NEON 1
#else
#define RUY_DONOTUSEDIRECTLY_NEON 0
#endif
// Define ARM 32-bit NEON
#define RUY_DONOTUSEDIRECTLY_NEON_32 \
(RUY_DONOTUSEDIRECTLY_NEON && RUY_DONOTUSEDIRECTLY_ARM_32)
// Define ARM 64-bit NEON
// Note: NEON is implied by ARM64, so this define is redundant.
// It still allows some conveyance of intent.
#define RUY_DONOTUSEDIRECTLY_NEON_64 \
(RUY_DONOTUSEDIRECTLY_NEON && RUY_DONOTUSEDIRECTLY_ARM_64)
#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_PLATFORM_H_
......@@ -29,6 +29,7 @@ limitations under the License.
#include <vector>
#include <gtest/gtest.h>
#include "tensorflow/lite/experimental/ruy/platform.h"
#include "tensorflow/lite/experimental/ruy/pmu.h"
#include "tensorflow/lite/experimental/ruy/ruy.h"
#include "tensorflow/lite/experimental/ruy/ruy_advanced.h"
......@@ -1651,7 +1652,7 @@ void TestSet<LhsScalar, RhsScalar, SpecType>::MakeResultPaths() {
}
// We link against a generic BLAS target that only maps to OpenBLAS on specific
// architectures.
#if defined RUY_ARM_64 || defined RUY_ARM_32
#if RUY_PLATFORM(ARM_32) || RUY_PLATFORM(ARM_64)
// OpenBLAS multi-threading is disabled, so avoid mixing single-threaded
// and multi-threaded benchmark results.
if (max_num_threads == 1) {
......
......@@ -19,11 +19,12 @@ limitations under the License.
#include <cstdint>
#include "tensorflow/lite/experimental/ruy/opt_set.h"
#include "tensorflow/lite/experimental/ruy/platform.h"
#include "tensorflow/lite/experimental/ruy/time.h"
namespace ruy {
#ifdef RUY_NEON_64
#if RUY_PLATFORM(NEON_64)
namespace {
......@@ -130,7 +131,7 @@ Tuning TuningResolver::ResolveNow() {
return is_probably_inorder ? Tuning::kInOrder : Tuning::kOutOfOrder;
}
#else // not defined RUY_NEON_64
#else // not RUY_PLATFORM(NEON_64)
float TuningResolver::EvalRatio() { return 0; }
float TuningResolver::ThresholdRatio() { return 0; }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册