Commit b888cc0d authored by brutisso

8030177: G1: Enable TLAB resizing

Reviewed-by: tschatzl, stefank, jmasa
Parent commit: 583b4f86
......@@ -2996,7 +2996,17 @@ bool G1CollectedHeap::supports_tlab_allocation() const {
}
size_t G1CollectedHeap::tlab_capacity(Thread* ignored) const {
return HeapRegion::GrainBytes;
return (_g1_policy->young_list_target_length() - young_list()->survivor_length()) * HeapRegion::GrainBytes;
}
// Report the number of bytes currently used in eden as the TLAB-used
// figure; G1 satisfies TLAB allocations from the eden regions.
size_t G1CollectedHeap::tlab_used(Thread* ignored) const {
  const size_t eden_used = young_list()->eden_used_bytes();
  return eden_used;
}
// G1 TLABs must never contain humongous objects, so the largest legal
// TLAB is just below the humongous-object threshold, rounded down to a
// properly aligned object size.
size_t G1CollectedHeap::max_tlab_size() const {
  const size_t cap_words = _humongous_object_threshold_in_words - 1;
  return align_size_down(cap_words, MinObjAlignment);
}
size_t G1CollectedHeap::unsafe_max_tlab_alloc(Thread* ignored) const {
......@@ -3008,11 +3018,11 @@ size_t G1CollectedHeap::unsafe_max_tlab_alloc(Thread* ignored) const {
// humongous objects.
HeapRegion* hr = _mutator_alloc_region.get();
size_t max_tlab_size = _humongous_object_threshold_in_words * wordSize;
size_t max_tlab = max_tlab_size() * wordSize;
if (hr == NULL) {
return max_tlab_size;
return max_tlab;
} else {
return MIN2(MAX2(hr->free(), (size_t) MinTLABSize), max_tlab_size);
return MIN2(MAX2(hr->free(), (size_t) MinTLABSize), max_tlab);
}
}
......@@ -3649,6 +3659,7 @@ void G1CollectedHeap::gc_prologue(bool full /* Ignored */) {
// always_do_update_barrier = false;
assert(InlineCacheBuffer::is_empty(), "should have cleaned up ICBuffer");
// Fill TLAB's and such
accumulate_statistics_all_tlabs();
ensure_parsability(true);
if (G1SummarizeRSetStats && (G1SummarizeRSetStatsPeriod > 0) &&
......@@ -3673,6 +3684,8 @@ void G1CollectedHeap::gc_epilogue(bool full /* Ignored */) {
"derived pointer present"));
// always_do_update_barrier = true;
resize_all_tlabs();
// We have just completed a GC. Update the soft reference
// policy with the new heap occupancy
Universe::update_heap_info_at_gc();
......
......@@ -1470,9 +1470,11 @@ public:
// Section on thread-local allocation buffers (TLABs)
// See CollectedHeap for semantics.
virtual bool supports_tlab_allocation() const;
virtual size_t tlab_capacity(Thread* thr) const;
virtual size_t unsafe_max_tlab_alloc(Thread* thr) const;
bool supports_tlab_allocation() const;
size_t tlab_capacity(Thread* ignored) const;
size_t tlab_used(Thread* ignored) const;
size_t max_tlab_size() const;
size_t unsafe_max_tlab_alloc(Thread* ignored) const;
// Can a compiler initialize a new object without store barriers?
// This permission only extends from the creation of a new object
......@@ -1557,7 +1559,7 @@ public:
void set_region_short_lived_locked(HeapRegion* hr);
// add appropriate methods for any other surv rate groups
YoungList* young_list() { return _young_list; }
YoungList* young_list() const { return _young_list; }
// debugging
bool check_young_list_well_formed() {
......
......@@ -820,6 +820,8 @@ public:
// do that for any other surv rate groups
}
// Accessor for the current young list target length (in regions).
size_t young_list_target_length() const { return _young_list_target_length; }
bool is_young_list_full() {
uint young_list_length = _g1->young_list()->length();
uint young_list_target_length = _young_list_target_length;
......
......@@ -488,6 +488,10 @@ size_t ParallelScavengeHeap::tlab_capacity(Thread* thr) const {
return young_gen()->eden_space()->tlab_capacity(thr);
}
// TLAB usage for the parallel scavenge heap: forward the query to the
// young generation's eden space, which owns the TLAB accounting.
size_t ParallelScavengeHeap::tlab_used(Thread* thr) const {
  const size_t eden_tlab_used = young_gen()->eden_space()->tlab_used(thr);
  return eden_tlab_used;
}
size_t ParallelScavengeHeap::unsafe_max_tlab_alloc(Thread* thr) const {
return young_gen()->eden_space()->unsafe_max_tlab_alloc(thr);
}
......
......@@ -187,6 +187,7 @@ class ParallelScavengeHeap : public CollectedHeap {
bool supports_tlab_allocation() const { return true; }
size_t tlab_capacity(Thread* thr) const;
size_t tlab_used(Thread* thr) const;
size_t unsafe_max_tlab_alloc(Thread* thr) const;
// Can a compiler initialize a new object without store barriers?
......
......@@ -173,6 +173,26 @@ size_t MutableNUMASpace::tlab_capacity(Thread *thr) const {
return lgrp_spaces()->at(i)->space()->capacity_in_bytes();
}
// TLAB usage for a NUMA-aware space. Please see the comments for
// tlab_capacity(): a thread without a locality group gets an even share
// of the total; otherwise we report its own group's space usage.
size_t MutableNUMASpace::tlab_used(Thread *thr) const {
  guarantee(thr != NULL, "No thread");
  int lgrp_id = thr->lgrp_id();
  if (lgrp_id == -1) {
    // Thread not yet bound to a locality group: estimate with an even
    // split of the total used bytes across all groups.
    int group_count = lgrp_spaces()->length();
    if (group_count <= 0) {
      assert(false, "There should be at least one locality group");
      return 0;
    }
    return used_in_bytes() / group_count;
  }
  int idx = lgrp_spaces()->find(&lgrp_id, LGRPSpace::equals);
  if (idx == -1) {
    // No space for this group yet; nothing is used in it.
    return 0;
  }
  return lgrp_spaces()->at(idx)->space()->used_in_bytes();
}
size_t MutableNUMASpace::unsafe_max_tlab_alloc(Thread *thr) const {
// Please see the comments for tlab_capacity().
guarantee(thr != NULL, "No thread");
......
......@@ -217,6 +217,7 @@ class MutableNUMASpace : public MutableSpace {
using MutableSpace::capacity_in_words;
virtual size_t capacity_in_words(Thread* thr) const;
virtual size_t tlab_capacity(Thread* thr) const;
virtual size_t tlab_used(Thread* thr) const;
virtual size_t unsafe_max_tlab_alloc(Thread* thr) const;
// Allocation (return NULL if full)
......
......@@ -124,6 +124,7 @@ class MutableSpace: public ImmutableSpace {
virtual size_t used_in_words() const { return pointer_delta(top(), bottom()); }
virtual size_t free_in_words() const { return pointer_delta(end(), top()); }
virtual size_t tlab_capacity(Thread* thr) const { return capacity_in_bytes(); }
// Default TLAB-used accounting: the whole space's used bytes.
virtual size_t tlab_used(Thread* thr) const { return used_in_bytes(); }
virtual size_t unsafe_max_tlab_alloc(Thread* thr) const { return free_in_bytes(); }
// Allocation (return NULL if full)
......
......@@ -89,6 +89,10 @@ void ParGCAllocBuffer::flush_stats(PLABStats* stats) {
// scavenge; it clears the sensor accumulators.
void PLABStats::adjust_desired_plab_sz(uint no_of_gc_workers) {
assert(ResizePLAB, "Not set");
assert(is_object_aligned(max_size()) && min_size() <= max_size(),
"PLAB clipping computation may be incorrect");
if (_allocated == 0) {
assert(_unused == 0,
err_msg("Inconsistency in PLAB stats: "
......
......@@ -181,16 +181,7 @@ class PLABStats VALUE_OBJ_CLASS_SPEC {
_used(0),
_desired_plab_sz(desired_plab_sz_),
_filter(wt)
{
size_t min_sz = min_size();
size_t max_sz = max_size();
size_t aligned_min_sz = align_object_size(min_sz);
size_t aligned_max_sz = align_object_size(max_sz);
assert(min_sz <= aligned_min_sz && max_sz >= aligned_max_sz &&
min_sz <= max_sz,
"PLAB clipping computation in adjust_desired_plab_sz()"
" may be incorrect");
}
{ }
static const size_t min_size() {
return ParGCAllocBuffer::min_size();
......
......@@ -320,6 +320,21 @@ void CollectedHeap::flush_deferred_store_barrier(JavaThread* thread) {
assert(thread->deferred_card_mark().is_empty(), "invariant");
}
// Upper bound on any TLAB: it must be fillable with a single
// int[Integer.MAX_VALUE] object. Computing the natural expression
//   header_size + ((sizeof(jint) * max_jint) / HeapWordSize)
// would overflow the multiply, so the divide is done first. Dividing
// first loses a little, leaving the TLAB slightly smaller than the
// biggest array -- which is fine, since that array can still fill it.
size_t CollectedHeap::max_tlab_size() const {
  size_t payload_words = sizeof(jint) *
                         ((juint) max_jint / (size_t) HeapWordSize);
  size_t max_int_size = typeArrayOopDesc::header_size(T_INT) + payload_words;
  return align_size_down(max_int_size, MinObjAlignment);
}
// Helper for ReduceInitialCardMarks. For performance,
// compiled code may elide card-marks for initializing stores
// to a newly allocated object along the fast-path. We
......
......@@ -394,14 +394,16 @@ class CollectedHeap : public CHeapObj<mtInternal> {
// the following methods:
// Returns "true" iff the heap supports thread-local allocation buffers.
// The default is "no".
virtual bool supports_tlab_allocation() const {
return false;
}
virtual bool supports_tlab_allocation() const = 0;
// The amount of space available for thread-local allocation buffers.
virtual size_t tlab_capacity(Thread *thr) const {
guarantee(false, "thread-local allocation buffers not supported");
return 0;
}
virtual size_t tlab_capacity(Thread *thr) const = 0;
// The amount of used space for thread-local allocation buffers for the given thread.
virtual size_t tlab_used(Thread *thr) const = 0;
virtual size_t max_tlab_size() const;
// An estimate of the maximum allocation that could be performed
// for thread-local allocation buffers without triggering any
// collection or expansion activity.
......
......@@ -1084,6 +1084,10 @@ size_t DefNewGeneration::tlab_capacity() const {
return eden()->capacity();
}
// TLAB usage for DefNew: report eden's used bytes.
size_t DefNewGeneration::tlab_used() const {
  const size_t eden_used = eden()->used();
  return eden_used;
}
size_t DefNewGeneration::unsafe_max_tlab_alloc() const {
return unsafe_max_alloc_nogc();
}
......@@ -239,6 +239,7 @@ protected:
// Thread-local allocation buffers
bool supports_tlab_allocation() const { return true; }
size_t tlab_capacity() const;
size_t tlab_used() const;
size_t unsafe_max_tlab_alloc() const;
// Grow the generation by the specified number of bytes.
......
......@@ -932,6 +932,16 @@ size_t GenCollectedHeap::tlab_capacity(Thread* thr) const {
return result;
}
// Sum the TLAB usage of every generation that supports TLAB allocation.
size_t GenCollectedHeap::tlab_used(Thread* thr) const {
  size_t total = 0;
  for (int i = 0; i < _n_gens; i++) {
    if (!_gens[i]->supports_tlab_allocation()) {
      continue;
    }
    total += _gens[i]->tlab_used();
  }
  return total;
}
size_t GenCollectedHeap::unsafe_max_tlab_alloc(Thread* thr) const {
size_t result = 0;
for (int i = 0; i < _n_gens; i += 1) {
......
......@@ -248,6 +248,7 @@ public:
// Section on TLAB's.
virtual bool supports_tlab_allocation() const;
virtual size_t tlab_capacity(Thread* thr) const;
virtual size_t tlab_used(Thread* thr) const;
virtual size_t unsafe_max_tlab_alloc(Thread* thr) const;
virtual HeapWord* allocate_new_tlab(size_t size);
......
......@@ -299,6 +299,10 @@ class Generation: public CHeapObj<mtGC> {
guarantee(false, "Generation doesn't support thread local allocation buffers");
return 0;
}
// Default implementation: trips a guarantee. Generations that support
// TLAB allocation override this to report their TLAB-used bytes.
virtual size_t tlab_used() const {
guarantee(false, "Generation doesn't support thread local allocation buffers");
return 0;
}
virtual size_t unsafe_max_tlab_alloc() const {
guarantee(false, "Generation doesn't support thread local allocation buffers");
return 0;
......
......@@ -34,6 +34,7 @@
// Thread-Local Edens support
// static member initialization
size_t ThreadLocalAllocBuffer::_max_size = 0;
unsigned ThreadLocalAllocBuffer::_target_refills = 0;
GlobalTLABStats* ThreadLocalAllocBuffer::_global_stats = NULL;
......@@ -45,7 +46,7 @@ void ThreadLocalAllocBuffer::clear_before_allocation() {
void ThreadLocalAllocBuffer::accumulate_statistics_before_gc() {
global_stats()->initialize();
for(JavaThread *thread = Threads::first(); thread; thread = thread->next()) {
for (JavaThread *thread = Threads::first(); thread != NULL; thread = thread->next()) {
thread->tlab().accumulate_statistics();
thread->tlab().initialize_statistics();
}
......@@ -60,28 +61,32 @@ void ThreadLocalAllocBuffer::accumulate_statistics_before_gc() {
}
void ThreadLocalAllocBuffer::accumulate_statistics() {
size_t capacity = Universe::heap()->tlab_capacity(myThread()) / HeapWordSize;
size_t unused = Universe::heap()->unsafe_max_tlab_alloc(myThread()) / HeapWordSize;
size_t used = capacity - unused;
// Update allocation history if a reasonable amount of eden was allocated.
bool update_allocation_history = used > 0.5 * capacity;
Thread* thread = myThread();
size_t capacity = Universe::heap()->tlab_capacity(thread);
size_t used = Universe::heap()->tlab_used(thread);
_gc_waste += (unsigned)remaining();
size_t total_allocated = thread->allocated_bytes();
size_t allocated_since_last_gc = total_allocated - _allocated_before_last_gc;
_allocated_before_last_gc = total_allocated;
if (PrintTLAB && (_number_of_refills > 0 || Verbose)) {
print_stats("gc");
}
if (_number_of_refills > 0) {
// Update allocation history if a reasonable amount of eden was allocated.
bool update_allocation_history = used > 0.5 * capacity;
if (update_allocation_history) {
// Average the fraction of eden allocated in a tlab by this
// thread for use in the next resize operation.
// _gc_waste is not subtracted because it's included in
// "used".
size_t allocation = _number_of_refills * desired_size();
double alloc_frac = allocation / (double) used;
// The result can be larger than 1.0 due to direct to old allocations.
// These allocations should ideally not be counted but since it is not possible
// to filter them out here we just cap the fraction to be at most 1.0.
double alloc_frac = MIN2(1.0, (double) allocated_since_last_gc / used);
_allocation_fraction.sample(alloc_frac);
}
global_stats()->update_allocating_threads();
......@@ -126,33 +131,32 @@ void ThreadLocalAllocBuffer::make_parsable(bool retire) {
}
void ThreadLocalAllocBuffer::resize_all_tlabs() {
for(JavaThread *thread = Threads::first(); thread; thread = thread->next()) {
thread->tlab().resize();
if (ResizeTLAB) {
for (JavaThread *thread = Threads::first(); thread != NULL; thread = thread->next()) {
thread->tlab().resize();
}
}
}
void ThreadLocalAllocBuffer::resize() {
// Compute the next tlab size using expected allocation amount
assert(ResizeTLAB, "Should not call this otherwise");
size_t alloc = (size_t)(_allocation_fraction.average() *
(Universe::heap()->tlab_capacity(myThread()) / HeapWordSize));
size_t new_size = alloc / _target_refills;
if (ResizeTLAB) {
// Compute the next tlab size using expected allocation amount
size_t alloc = (size_t)(_allocation_fraction.average() *
(Universe::heap()->tlab_capacity(myThread()) / HeapWordSize));
size_t new_size = alloc / _target_refills;
new_size = MIN2(MAX2(new_size, min_size()), max_size());
new_size = MIN2(MAX2(new_size, min_size()), max_size());
size_t aligned_new_size = align_object_size(new_size);
size_t aligned_new_size = align_object_size(new_size);
if (PrintTLAB && Verbose) {
gclog_or_tty->print("TLAB new size: thread: " INTPTR_FORMAT " [id: %2d]"
" refills %d alloc: %8.6f desired_size: " SIZE_FORMAT " -> " SIZE_FORMAT "\n",
myThread(), myThread()->osthread()->thread_id(),
_target_refills, _allocation_fraction.average(), desired_size(), aligned_new_size);
}
set_desired_size(aligned_new_size);
set_refill_waste_limit(initial_refill_waste_limit());
if (PrintTLAB && Verbose) {
gclog_or_tty->print("TLAB new size: thread: " INTPTR_FORMAT " [id: %2d]"
" refills %d alloc: %8.6f desired_size: " SIZE_FORMAT " -> " SIZE_FORMAT "\n",
myThread(), myThread()->osthread()->thread_id(),
_target_refills, _allocation_fraction.average(), desired_size(), aligned_new_size);
}
set_desired_size(aligned_new_size);
set_refill_waste_limit(initial_refill_waste_limit());
}
void ThreadLocalAllocBuffer::initialize_statistics() {
......@@ -248,31 +252,13 @@ size_t ThreadLocalAllocBuffer::initial_desired_size() {
return init_sz;
}
// Largest permissible TLAB: one that a single int[Integer.MAX_VALUE]
// filler object can cover. The straightforward formula
//   header_size + ((sizeof(jint) * max_jint) / HeapWordSize)
// overflows on the multiply, so the divide is performed first. That
// shaves off a little, making the TLAB somewhat smaller than the
// biggest array -- fine, since such an array can still fill it.
const size_t ThreadLocalAllocBuffer::max_size() {
  size_t payload_words = sizeof(jint) *
                         ((juint) max_jint / (size_t) HeapWordSize);
  size_t unaligned_max_size = typeArrayOopDesc::header_size(T_INT) + payload_words;
  return align_size_down(unaligned_max_size, MinObjAlignment);
}
void ThreadLocalAllocBuffer::print_stats(const char* tag) {
Thread* thrd = myThread();
size_t waste = _gc_waste + _slow_refill_waste + _fast_refill_waste;
size_t alloc = _number_of_refills * _desired_size;
double waste_percent = alloc == 0 ? 0.0 :
100.0 * waste / alloc;
size_t tlab_used = Universe::heap()->tlab_capacity(thrd) -
Universe::heap()->unsafe_max_tlab_alloc(thrd);
size_t tlab_used = Universe::heap()->tlab_used(thrd);
gclog_or_tty->print("TLAB: %s thread: " INTPTR_FORMAT " [id: %2d]"
" desired_size: " SIZE_FORMAT "KB"
" slow allocs: %d refill waste: " SIZE_FORMAT "B"
......
......@@ -45,7 +45,9 @@ private:
HeapWord* _end; // allocation end (excluding alignment_reserve)
size_t _desired_size; // desired size (including alignment_reserve)
size_t _refill_waste_limit; // hold onto tlab if free() is larger than this
size_t _allocated_before_last_gc; // total bytes allocated up until the last gc
static size_t _max_size; // maximum size of any TLAB
static unsigned _target_refills; // expected number of refills between GCs
unsigned _number_of_refills;
......@@ -99,12 +101,13 @@ private:
static GlobalTLABStats* global_stats() { return _global_stats; }
public:
ThreadLocalAllocBuffer() : _allocation_fraction(TLABAllocationWeight) {
ThreadLocalAllocBuffer() : _allocation_fraction(TLABAllocationWeight), _allocated_before_last_gc(0) {
// do nothing. tlabs must be inited by initialize() calls
}
static const size_t min_size() { return align_object_size(MinTLABSize / HeapWordSize); }
static const size_t max_size();
static const size_t max_size() { assert(_max_size != 0, "max_size not set up"); return _max_size; }
static void set_max_size(size_t max_size) { _max_size = max_size; }
HeapWord* start() const { return _start; }
HeapWord* end() const { return _end; }
......
......@@ -816,6 +816,8 @@ jint Universe::initialize_heap() {
Universe::_collectedHeap = new GenCollectedHeap(gc_policy);
}
ThreadLocalAllocBuffer::set_max_size(Universe::heap()->max_tlab_size());
jint status = Universe::heap()->initialize();
if (status != JNI_OK) {
return status;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To comment, please register