From 294923318968ce1432de56b150dd42ec3838edc2 Mon Sep 17 00:00:00 2001
From: kevinw
Date: Fri, 17 Mar 2017 03:39:23 -0700
Subject: [PATCH] 8049717: expose L1_data_cache_line_size for diagnostic/sanity checks

Summary: Add support for VM_Version::L1_data_cache_line_size().
Reviewed-by: dsimms, kvn, dholmes
---
Notes (ignored by git-am; local review remarks):
 * Line structure and context-line whitespace were reconstructed from a
   whitespace-collapsed copy of this patch. Re-check the context lines
   against the target tree before applying.
 * ObjectMonitor::sanity_checks(): the zero-offset test reuses the already
   computed offset_header instead of recomputing the pointer difference.
 * ObjectSynchronizer::sanity_checks(): the verbose label reads
   offset(hcSequence) so it matches the member name and offset(stwRandom).

 src/cpu/sparc/vm/vm_version_sparc.cpp  | 44 +++++++++++++++
 src/cpu/x86/vm/vm_version_x86.cpp      |  4 ++
 src/cpu/x86/vm/vm_version_x86.hpp      |  8 ++-
 src/share/vm/prims/jni.cpp             |  1 +
 src/share/vm/runtime/objectMonitor.cpp | 64 ++++++++++++++++++++++
 src/share/vm/runtime/objectMonitor.hpp |  4 +-
 src/share/vm/runtime/synchronizer.cpp  | 75 +++++++++++++++++++++-----
 src/share/vm/runtime/synchronizer.hpp  |  3 ++
 src/share/vm/runtime/vm_version.cpp    |  1 +
 src/share/vm/runtime/vm_version.hpp    |  7 ++-
 10 files changed, 194 insertions(+), 17 deletions(-)

diff --git a/src/cpu/sparc/vm/vm_version_sparc.cpp b/src/cpu/sparc/vm/vm_version_sparc.cpp
index b14a6e993..c2ccb581f 100644
--- a/src/cpu/sparc/vm/vm_version_sparc.cpp
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp
@@ -259,6 +259,49 @@ void VM_Version::initialize() {
   // buf is started with ", " or is empty
   _features_str = strdup(strlen(buf) > 2 ? buf + 2 : buf);
 
+  // There are three 64-bit SPARC families that do not overlap, e.g.,
+  // both is_ultra3() and is_sparc64() cannot be true at the same time.
+  // Within these families, there can be more than one chip, e.g.,
+  // is_T4() and is_T7() machines are also is_niagara().
+  if (is_ultra3()) {
+    assert(_L1_data_cache_line_size == 0, "overlap with Ultra3 family");
+    // Ref: UltraSPARC III Cu Processor
+    _L1_data_cache_line_size = 64;
+  }
+  if (is_niagara()) {
+    assert(_L1_data_cache_line_size == 0, "overlap with niagara family");
+    // All Niagara's are sun4v's, but not all sun4v's are Niagaras, e.g.,
+    // Fujitsu SPARC64 is sun4v, but we don't want it in this block.
+    //
+    // Ref: UltraSPARC T1 Supplement to the UltraSPARC Architecture 2005
+    // Appendix F.1.3.1 Cacheable Accesses
+    // -> 16-byte L1 cache line size
+    //
+    // Ref: UltraSPARC T2: A Highly-Threaded, Power-Efficient, SPARC SOC
+    // Section III: SPARC Processor Core
+    // -> 16-byte L1 cache line size
+    //
+    // Ref: Oracle's SPARC T4-1, SPARC T4-2, SPARC T4-4, and SPARC T4-1B Server Architecture
+    // Section SPARC T4 Processor Cache Architecture
+    // -> 32-byte L1 cache line size (no longer see that info on this ref)
+    //
+    // XXX - still need a T7 reference here
+    //
+    if (is_T7()) {  // T7 or newer
+      _L1_data_cache_line_size = 64;
+    } else if (is_T4()) {  // T4 or newer (until T7)
+      _L1_data_cache_line_size = 32;
+    } else {  // T1 or newer (until T4)
+      _L1_data_cache_line_size = 16;
+    }
+  }
+  if (is_sparc64()) {
+    guarantee(_L1_data_cache_line_size == 0, "overlap with SPARC64 family");
+    // Ref: Fujitsu SPARC64 VII Processor
+    // Section 4 Cache System
+    _L1_data_cache_line_size = 64;
+  }
+
   // UseVIS is set to the smallest of what hardware supports and what
   // the command line requires.  I.e., you cannot set UseVIS to 3 on
   // older UltraSparc which do not support it.
@@ -364,6 +407,7 @@ void VM_Version::initialize() {
 
 #ifndef PRODUCT
   if (PrintMiscellaneous && Verbose) {
+    tty->print_cr("L1 data cache line size: %u", L1_data_cache_line_size());
     tty->print_cr("L2 data cache line size: %u", L2_data_cache_line_size());
     tty->print("Allocation");
     if (AllocatePrefetchStyle <= 0) {
diff --git a/src/cpu/x86/vm/vm_version_x86.cpp b/src/cpu/x86/vm/vm_version_x86.cpp
index 134fa2418..fd0a68d10 100644
--- a/src/cpu/x86/vm/vm_version_x86.cpp
+++ b/src/cpu/x86/vm/vm_version_x86.cpp
@@ -406,6 +406,8 @@ void VM_Version::get_processor_features() {
   _stepping = 0;
   _cpuFeatures = 0;
   _logical_processors_per_package = 1;
+  // i486 internal cache is both I&D and has a 16-byte line size
+  _L1_data_cache_line_size = 16;
 
   if (!Use486InstrsOnly) {
     // Get raw processor info
@@ -424,6 +426,7 @@ void VM_Version::get_processor_features() {
       // Logical processors are only available on P4s and above,
       // and only if hyperthreading is available.
       _logical_processors_per_package = logical_processor_count();
+      _L1_data_cache_line_size = L1_line_size();
     }
   }
 
@@ -1034,6 +1037,7 @@ void VM_Version::get_processor_features() {
   if (PrintMiscellaneous && Verbose) {
     tty->print_cr("Logical CPUs per core: %u",
                   logical_processors_per_package());
+    tty->print_cr("L1 data cache line size: %u", L1_data_cache_line_size());
     tty->print("UseSSE=%d", (int) UseSSE);
     if (UseAVX > 0) {
       tty->print("  UseAVX=%d", (int) UseAVX);
diff --git a/src/cpu/x86/vm/vm_version_x86.hpp b/src/cpu/x86/vm/vm_version_x86.hpp
index 49c9dba7e..f01893691 100644
--- a/src/cpu/x86/vm/vm_version_x86.hpp
+++ b/src/cpu/x86/vm/vm_version_x86.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -595,7 +595,7 @@ public:
     return (result == 0 ? 1 : result);
   }
 
-  static intx prefetch_data_size()  {
+  static intx L1_line_size()  {
     intx result = 0;
     if (is_intel()) {
       result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
@@ -607,6 +607,10 @@ public:
     return result;
   }
 
+  static intx prefetch_data_size()  {
+    return L1_line_size();
+  }
+
   //
   // Feature identification
   //
diff --git a/src/share/vm/prims/jni.cpp b/src/share/vm/prims/jni.cpp
index 8a41ca439..95fd4d226 100644
--- a/src/share/vm/prims/jni.cpp
+++ b/src/share/vm/prims/jni.cpp
@@ -5129,6 +5129,7 @@ void execute_internal_vm_tests() {
     run_unit_test(TestKlass_test());
     run_unit_test(Test_linked_list());
     run_unit_test(TestChunkedList_test());
+    run_unit_test(ObjectMonitor::sanity_checks());
 #if INCLUDE_VM_STRUCTS
     run_unit_test(VMStructs::test());
 #endif
diff --git a/src/share/vm/runtime/objectMonitor.cpp b/src/share/vm/runtime/objectMonitor.cpp
index 31f899b12..94f4492d3 100644
--- a/src/share/vm/runtime/objectMonitor.cpp
+++ b/src/share/vm/runtime/objectMonitor.cpp
@@ -2529,6 +2529,10 @@ void ObjectMonitor::DeferredInitialize () {
   SETKNOB(FastHSSEC) ;
   #undef SETKNOB
 
+  if (Knob_Verbose) {
+    sanity_checks();
+  }
+
   if (os::is_MP()) {
      BackOffMask = (1 << Knob_SpinBackOff) - 1 ;
      if (Knob_ReportSettings) ::printf ("BackOffMask=%X\n", BackOffMask) ;
@@ -2549,6 +2553,66 @@ void ObjectMonitor::DeferredInitialize () {
   InitDone = 1 ;
 }
 
+void ObjectMonitor::sanity_checks() {
+  int error_cnt = 0;
+  int warning_cnt = 0;
+  bool verbose = Knob_Verbose != 0 NOT_PRODUCT(|| VerboseInternalVMTests);
+
+  if (verbose) {
+    tty->print_cr("INFO: sizeof(ObjectMonitor)=" SIZE_FORMAT,
+                  sizeof(ObjectMonitor));
+  }
+
+  uint cache_line_size = VM_Version::L1_data_cache_line_size();
+  if (verbose) {
+    tty->print_cr("INFO: L1_data_cache_line_size=%u", cache_line_size);
+  }
+
+  ObjectMonitor dummy;
+  u_char *addr_begin  = (u_char*)&dummy;
+  u_char *addr_header = (u_char*)&dummy._header;
+  u_char *addr_owner  = (u_char*)&dummy._owner;
+
+  uint offset_header = (uint)(addr_header - addr_begin);
+  if (verbose) tty->print_cr("INFO: offset(_header)=%u", offset_header);
+
+  uint offset_owner = (uint)(addr_owner - addr_begin);
+  if (verbose) tty->print_cr("INFO: offset(_owner)=%u", offset_owner);
+
+  if (offset_header != 0) {
+    tty->print_cr("ERROR: offset(_header) must be zero (0).");
+    error_cnt++;
+  }
+
+  if (cache_line_size != 0) {
+    // We were able to determine the L1 data cache line size so
+    // do some cache line specific sanity checks
+
+    if ((offset_owner - offset_header) < cache_line_size) {
+      tty->print_cr("WARNING: the _header and _owner fields are closer "
+                    "than a cache line which permits false sharing.");
+      warning_cnt++;
+    }
+
+    if ((sizeof(ObjectMonitor) % cache_line_size) != 0) {
+      tty->print_cr("WARNING: ObjectMonitor size is not a multiple of "
+                    "a cache line which permits false sharing.");
+      warning_cnt++;
+    }
+  }
+
+  ObjectSynchronizer::sanity_checks(verbose, cache_line_size, &error_cnt,
+                                    &warning_cnt);
+
+  if (verbose || error_cnt != 0 || warning_cnt != 0) {
+    tty->print_cr("INFO: error_cnt=%d", error_cnt);
+    tty->print_cr("INFO: warning_cnt=%d", warning_cnt);
+  }
+
+  guarantee(error_cnt == 0,
+            "Fatal error(s) found in ObjectMonitor::sanity_checks()");
+}
+
 #ifndef PRODUCT
 void ObjectMonitor::verify() {
 }
diff --git a/src/share/vm/runtime/objectMonitor.hpp b/src/share/vm/runtime/objectMonitor.hpp
index 10b3609c0..09180cf70 100644
--- a/src/share/vm/runtime/objectMonitor.hpp
+++ b/src/share/vm/runtime/objectMonitor.hpp
@@ -189,6 +189,8 @@ public:
   bool      check(TRAPS);       // true if the thread owns the monitor.
   void      check_slow(TRAPS);
   void      clear();
+  static void sanity_checks();  // public for -XX:+ExecuteInternalVMTests
+                                // in PRODUCT for -XX:SyncKnobs=Verbose=1
 #ifndef PRODUCT
   void      verify();
   void      print();
@@ -234,8 +236,6 @@ public:
 
   // WARNING: this must be the very first word of ObjectMonitor
   // This means this class can't use any virtual member functions.
-  // TODO-FIXME: assert that offsetof(_header) is 0 or get rid of the
-  // implicit 0 offset in emitted code.
 
   volatile markOop   _header;       // displaced object header word - mark
   void*     volatile _object;       // backward object pointer - strong root
diff --git a/src/share/vm/runtime/synchronizer.cpp b/src/share/vm/runtime/synchronizer.cpp
index b35f86fc2..6bd92333c 100644
--- a/src/share/vm/runtime/synchronizer.cpp
+++ b/src/share/vm/runtime/synchronizer.cpp
@@ -437,19 +437,22 @@ void ObjectSynchronizer::notifyall(Handle obj, TRAPS) {
 // Hash Code handling
 //
 // Performance concern:
-// OrderAccess::storestore() calls release() which STs 0 into the global volatile
-// OrderAccess::Dummy variable.  This store is unnecessary for correctness.
-// Many threads STing into a common location causes considerable cache migration
-// or "sloshing" on large SMP system.  As such, I avoid using OrderAccess::storestore()
-// until it's repaired.  In some cases OrderAccess::fence() -- which incurs local
-// latency on the executing processor -- is a better choice as it scales on SMP
-// systems.  See http://blogs.sun.com/dave/entry/biased_locking_in_hotspot for a
-// discussion of coherency costs.  Note that all our current reference platforms
-// provide strong ST-ST order, so the issue is moot on IA32, x64, and SPARC.
+// OrderAccess::storestore() calls release() which at one time stored 0
+// into the global volatile OrderAccess::dummy variable. This store was
+// unnecessary for correctness. Many threads storing into a common location
+// causes considerable cache migration or "sloshing" on large SMP systems.
+// As such, I avoided using OrderAccess::storestore(). In some cases
+// OrderAccess::fence() -- which incurs local latency on the executing
+// processor -- is a better choice as it scales on SMP systems.
+//
+// See http://blogs.oracle.com/dave/entry/biased_locking_in_hotspot for
+// a discussion of coherency costs. Note that all our current reference
+// platforms provide strong ST-ST order, so the issue is moot on IA32,
+// x64, and SPARC.
 //
 // As a general policy we use "volatile" to control compiler-based reordering
-// and explicit fences (barriers) to control for architectural reordering performed
-// by the CPU(s) or platform.
+// and explicit fences (barriers) to control for architectural reordering
+// performed by the CPU(s) or platform.
 
 struct SharedGlobals {
     // These are highly shared mostly-read variables.
@@ -1636,7 +1639,55 @@ void ObjectSynchronizer::release_monitors_owned_by_thread(TRAPS) {
 }
 
 //------------------------------------------------------------------------------
-// Non-product code
+// Debugging code
+
+void ObjectSynchronizer::sanity_checks(const bool verbose,
+                                       const uint cache_line_size,
+                                       int *error_cnt_ptr,
+                                       int *warning_cnt_ptr) {
+  u_char *addr_begin      = (u_char*)&GVars;
+  u_char *addr_stwRandom  = (u_char*)&GVars.stwRandom;
+  u_char *addr_hcSequence = (u_char*)&GVars.hcSequence;
+
+  if (verbose) {
+    tty->print_cr("INFO: sizeof(SharedGlobals)=" SIZE_FORMAT,
+                  sizeof(SharedGlobals));
+  }
+
+  uint offset_stwRandom = (uint)(addr_stwRandom - addr_begin);
+  if (verbose) tty->print_cr("INFO: offset(stwRandom)=%u", offset_stwRandom);
+
+  uint offset_hcSequence = (uint)(addr_hcSequence - addr_begin);
+  if (verbose) {
+    tty->print_cr("INFO: offset(hcSequence)=%u", offset_hcSequence);
+  }
+
+  if (cache_line_size != 0) {
+    // We were able to determine the L1 data cache line size so
+    // do some cache line specific sanity checks
+
+    if (offset_stwRandom < cache_line_size) {
+      tty->print_cr("WARNING: the SharedGlobals.stwRandom field is closer "
+                    "to the struct beginning than a cache line which permits "
+                    "false sharing.");
+      (*warning_cnt_ptr)++;
+    }
+
+    if ((offset_hcSequence - offset_stwRandom) < cache_line_size) {
+      tty->print_cr("WARNING: the SharedGlobals.stwRandom and "
+                    "SharedGlobals.hcSequence fields are closer than a cache "
+                    "line which permits false sharing.");
+      (*warning_cnt_ptr)++;
+    }
+
+    if ((sizeof(SharedGlobals) - offset_hcSequence) < cache_line_size) {
+      tty->print_cr("WARNING: the SharedGlobals.hcSequence field is closer "
+                    "to the struct end than a cache line which permits false "
+                    "sharing.");
+      (*warning_cnt_ptr)++;
+    }
+  }
+}
 
 #ifndef PRODUCT
 
diff --git a/src/share/vm/runtime/synchronizer.hpp b/src/share/vm/runtime/synchronizer.hpp
index af8df338a..3b2597ddd 100644
--- a/src/share/vm/runtime/synchronizer.hpp
+++ b/src/share/vm/runtime/synchronizer.hpp
@@ -121,6 +121,9 @@ class ObjectSynchronizer : AllStatic {
   static void oops_do(OopClosure* f);
 
   // debugging
+  static void sanity_checks(const bool verbose,
+                            const unsigned int cache_line_size,
+                            int *error_cnt_ptr, int *warning_cnt_ptr);
   static void verify() PRODUCT_RETURN;
   static int  verify_objmon_isinpool(ObjectMonitor *addr) PRODUCT_RETURN0;
 
diff --git a/src/share/vm/runtime/vm_version.cpp b/src/share/vm/runtime/vm_version.cpp
index d95e3a966..50063e6f8 100644
--- a/src/share/vm/runtime/vm_version.cpp
+++ b/src/share/vm/runtime/vm_version.cpp
@@ -50,6 +50,7 @@ bool Abstract_VM_Version::_supports_atomic_getset8 = false;
 bool Abstract_VM_Version::_supports_atomic_getadd4 = false;
 bool Abstract_VM_Version::_supports_atomic_getadd8 = false;
 unsigned int Abstract_VM_Version::_logical_processors_per_package = 1U;
+unsigned int Abstract_VM_Version::_L1_data_cache_line_size = 0;
 int Abstract_VM_Version::_reserve_for_allocation_prefetch = 0;
 
 #ifndef HOTSPOT_RELEASE_VERSION
diff --git a/src/share/vm/runtime/vm_version.hpp b/src/share/vm/runtime/vm_version.hpp
index feabadf9b..5c40405f2 100644
--- a/src/share/vm/runtime/vm_version.hpp
+++ b/src/share/vm/runtime/vm_version.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -42,6 +42,7 @@ class Abstract_VM_Version: AllStatic {
   static bool         _supports_atomic_getadd4;
   static bool         _supports_atomic_getadd8;
   static unsigned int _logical_processors_per_package;
+  static unsigned int _L1_data_cache_line_size;
   static int          _vm_major_version;
   static int          _vm_minor_version;
   static int          _vm_build_number;
@@ -114,6 +115,10 @@ class Abstract_VM_Version: AllStatic {
     return _logical_processors_per_package;
   }
 
+  static unsigned int L1_data_cache_line_size() {
+    return _L1_data_cache_line_size;
+  }
+
   // Need a space at the end of TLAB for prefetch instructions
   // which may fault when accessing memory outside of heap.
   static int reserve_for_allocation_prefetch() {
-- 
GitLab