6888898: CMS: ReduceInitialCardMarks unsafe in the presence of cms precleaning

6889757: G1: enable card mark elision for initializing writes from compiled code (ReduceInitialCardMarks) Summary: Defer the (compiler-elided) card-mark upon a slow-path allocation until after the store and before the next subsequent safepoint; G1 now answers yes to can_elide_tlab_write_barriers(). Reviewed-by: jcoomes, kvn, never

6888898: CMS: ReduceInitialCardMarks unsafe in the presence of cms precleaning
6889757: G1: enable card mark elision for initializing writes from compiled code (ReduceInitialCardMarks) Summary: Defer the (compiler-elided) card-mark upon a slow-path allocation until after the store and before the next subsequent safepoint; G1 now answers yes to can_elide_tlab_write_barriers(). Reviewed-by: jcoomes, kvn, never
349bfec0 · ysr · 77eca09d · 349bfec0 · 349bfec0 · 349bfec0
13 changed file
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
@@ -992,11 +992,39 @@ public:

  // Can a compiler initialize a new object without store barriers?
  // This permission only extends from the creation of a new object
-  // via a TLAB up to the first subsequent safepoint.
+  // via a TLAB up to the first subsequent safepoint. If such permission
+  // is granted for this heap type, the compiler promises to call
+  // defer_store_barrier() below on any slow path allocation of
+  // a new object for which such initializing store barriers will
+  // have been elided. G1, like CMS, allows this, but should be
+  // ready to provide a compensating write barrier as necessary
+  // if that storage came out of a non-young region. The efficiency
+  // of this implementation depends crucially on being able to
+  // answer very efficiently in constant time whether a piece of
+  // storage in the heap comes from a young region or not.
+  // See ReduceInitialCardMarks.
  virtual bool can_elide_tlab_store_barriers() const {
-    // Since G1's TLAB's may, on occasion, come from non-young regions
-    // as well. (Is there a flag controlling that? XXX)
-    return false;
+    return true;
+  }
+
+  bool is_in_young(oop obj) {
+    HeapRegion* hr = heap_region_containing(obj);
+    return hr != NULL && hr->is_young();
+  }
+
+  // We don't need barriers for initializing stores to objects
+  // in the young gen: for the SATB pre-barrier, there is no
+  // pre-value that needs to be remembered; for the remembered-set
+  // update logging post-barrier, we don't maintain remembered set
+  // information for young gen objects. Note that non-generational
+  // G1 does not have any "young" objects, should not elide
+  // the rs logging barrier and so should always answer false below.
+  // However, non-generational G1 (-XX:-G1Gen) appears to have
+  // bit-rotted so was not tested below.
+  virtual bool can_elide_initializing_store_barrier(oop new_obj) {
+    assert(G1Gen || !is_in_young(new_obj),
+           "Non-generational G1 should never return true below");
+    return is_in_young(new_obj);
  }

  // Can a compiler elide a store barrier when it writes

--- a/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.cpp
+++ b/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.cpp
@@ -314,41 +314,6 @@ bool ParallelScavengeHeap::is_in_reserved(const void* p) const {
  return false;
 }

-// Static method
-bool ParallelScavengeHeap::is_in_young(oop* p) {
-  ParallelScavengeHeap* heap = (ParallelScavengeHeap*)Universe::heap();
-  assert(heap->kind() == CollectedHeap::ParallelScavengeHeap,
-                                            "Must be ParallelScavengeHeap");
-
-  PSYoungGen* young_gen = heap->young_gen();
-
-  if (young_gen->is_in_reserved(p)) {
-    return true;
-  }
-
-  return false;
-}
-
-// Static method
-bool ParallelScavengeHeap::is_in_old_or_perm(oop* p) {
-  ParallelScavengeHeap* heap = (ParallelScavengeHeap*)Universe::heap();
-  assert(heap->kind() == CollectedHeap::ParallelScavengeHeap,
-                                            "Must be ParallelScavengeHeap");
-
-  PSOldGen* old_gen = heap->old_gen();
-  PSPermGen* perm_gen = heap->perm_gen();
-
-  if (old_gen->is_in_reserved(p)) {
-    return true;
-  }
-
-  if (perm_gen->is_in_reserved(p)) {
-    return true;
-  }
-
-  return false;
-}
-
 // There are two levels of allocation policy here.
 //
 // When an allocation request fails, the requesting thread must invoke a VM
@@ -764,6 +729,13 @@ void ParallelScavengeHeap::resize_all_tlabs() {
  CollectedHeap::resize_all_tlabs();
 }

+bool ParallelScavengeHeap::can_elide_initializing_store_barrier(oop new_obj) {
+  // We don't need barriers for stores to objects in the
+  // young gen and, a fortiori, for initializing stores to
+  // objects therein.
+  return is_in_young(new_obj);
+}
+
 // This method is used by System.gc() and JVMTI.
 void ParallelScavengeHeap::collect(GCCause::Cause cause) {
  assert(!Heap_lock->owned_by_self(),

--- a/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.hpp
+++ b/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.hpp
@@ -129,8 +129,8 @@ class ParallelScavengeHeap : public CollectedHeap {
    return perm_gen()->is_in(p);
  }

-  static bool is_in_young(oop *p);        // reserved part
-  static bool is_in_old_or_perm(oop *p);  // reserved part
+  inline bool is_in_young(oop p);        // reserved part
+  inline bool is_in_old_or_perm(oop p);  // reserved part

  // Memory allocation.   "gc_time_limit_was_exceeded" will
  // be set to true if the adaptive size policy determine that
@@ -191,6 +191,10 @@ class ParallelScavengeHeap : public CollectedHeap {
    return true;
  }

+  // Return true if we don't we need a store barrier for
+  // initializing stores to an object at this address.
+  virtual bool can_elide_initializing_store_barrier(oop new_obj);
+
  // Can a compiler elide a store barrier when it writes
  // a permanent oop into the heap?  Applies when the compiler
  // is storing x to the heap, where x->is_perm() is true.

--- a/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.inline.hpp
+++ b/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.inline.hpp
@@ -41,3 +41,11 @@ inline void ParallelScavengeHeap::invoke_full_gc(bool maximum_compaction)
    PSMarkSweep::invoke(maximum_compaction);
  }
 }
+
+inline bool ParallelScavengeHeap::is_in_young(oop p) {
+  return young_gen()->is_in_reserved(p);
+}
+
+inline bool ParallelScavengeHeap::is_in_old_or_perm(oop p) {
+  return old_gen()->is_in_reserved(p) || perm_gen()->is_in_reserved(p);
+}
--- a/src/share/vm/gc_interface/collectedHeap.cpp
+++ b/src/share/vm/gc_interface/collectedHeap.cpp
@@ -137,6 +137,89 @@ HeapWord* CollectedHeap::allocate_from_tlab_slow(Thread* thread, size_t size) {
  return obj;
 }

+void CollectedHeap::flush_deferred_store_barrier(JavaThread* thread) {
+  MemRegion deferred = thread->deferred_card_mark();
+  if (!deferred.is_empty()) {
+    {
+      // Verify that the storage points to a parsable object in heap
+      DEBUG_ONLY(oop old_obj = oop(deferred.start());)
+      assert(is_in(old_obj), "Not in allocated heap");
+      assert(!can_elide_initializing_store_barrier(old_obj),
+             "Else should have been filtered in defer_store_barrier()");
+      assert(!is_in_permanent(old_obj), "Sanity: not expected");
+      assert(old_obj->is_oop(true), "Not an oop");
+      assert(old_obj->is_parsable(), "Will not be concurrently parsable");
+      assert(deferred.word_size() == (size_t)(old_obj->size()),
+             "Mismatch: multiple objects?");
+    }
+    BarrierSet* bs = barrier_set();
+    assert(bs->has_write_region_opt(), "No write_region() on BarrierSet");
+    bs->write_region(deferred);
+    // "Clear" the deferred_card_mark field
+    thread->set_deferred_card_mark(MemRegion());
+  }
+  assert(thread->deferred_card_mark().is_empty(), "invariant");
+}
+
+// Helper for ReduceInitialCardMarks. For performance,
+// compiled code may elide card-marks for initializing stores
+// to a newly allocated object along the fast-path. We
+// compensate for such elided card-marks as follows:
+// (a) Generational, non-concurrent collectors, such as
+//     GenCollectedHeap(ParNew,DefNew,Tenured) and
+//     ParallelScavengeHeap(ParallelGC, ParallelOldGC)
+//     need the card-mark if and only if the region is
+//     in the old gen, and do not care if the card-mark
+//     succeeds or precedes the initializing stores themselves,
+//     so long as the card-mark is completed before the next
+//     scavenge. For all these cases, we can do a card mark
+//     at the point at which we do a slow path allocation
+//     in the old gen. For uniformity, however, we end
+//     up using the same scheme (see below) for all three
+//     cases (deferring the card-mark appropriately).
+// (b) GenCollectedHeap(ConcurrentMarkSweepGeneration) requires
+//     in addition that the card-mark for an old gen allocated
+//     object strictly follow any associated initializing stores.
+//     In these cases, the memRegion remembered below is
+//     used to card-mark the entire region either just before the next
+//     slow-path allocation by this thread or just before the next scavenge or
+//     CMS-associated safepoint, whichever of these events happens first.
+//     (The implicit assumption is that the object has been fully
+//     initialized by this point, a fact that we assert when doing the
+//     card-mark.)
+// (c) G1CollectedHeap(G1) uses two kinds of write barriers. When a
+//     G1 concurrent marking is in progress an SATB (pre-write-)barrier is
+//     is used to remember the pre-value of any store. Initializing
+//     stores will not need this barrier, so we need not worry about
+//     compensating for the missing pre-barrier here. Turning now
+//     to the post-barrier, we note that G1 needs a RS update barrier
+//     which simply enqueues a (sequence of) dirty cards which may
+//     optionally be refined by the concurrent update threads. Note
+//     that this barrier need only be applied to a non-young write,
+//     but, like in CMS, because of the presence of concurrent refinement
+//     (much like CMS' precleaning), must strictly follow the oop-store.
+//     Thus, using the same protocol for maintaining the intended
+//     invariants turns out, serendepitously, to be the same for all
+//     three collectors/heap types above.
+//
+// For each future collector, this should be reexamined with
+// that specific collector in mind.
+oop CollectedHeap::defer_store_barrier(JavaThread* thread, oop new_obj) {
+  // If a previous card-mark was deferred, flush it now.
+  flush_deferred_store_barrier(thread);
+  if (can_elide_initializing_store_barrier(new_obj)) {
+    // The deferred_card_mark region should be empty
+    // following the flush above.
+    assert(thread->deferred_card_mark().is_empty(), "Error");
+  } else {
+    // Remember info for the newly deferred store barrier
+    MemRegion deferred = MemRegion((HeapWord*)new_obj, new_obj->size());
+    assert(!deferred.is_empty(), "Error");
+    thread->set_deferred_card_mark(deferred);
+  }
+  return new_obj;
+}
+
 size_t CollectedHeap::filler_array_hdr_size() {
  return size_t(arrayOopDesc::header_size(T_INT));
 }
@@ -225,16 +308,6 @@ void CollectedHeap::fill_with_objects(HeapWord* start, size_t words)
  fill_with_object_impl(start, words);
 }

-oop CollectedHeap::new_store_barrier(oop new_obj) {
-  // %%% This needs refactoring.  (It was imported from the server compiler.)
-  guarantee(can_elide_tlab_store_barriers(), "store barrier elision not supported");
-  BarrierSet* bs = this->barrier_set();
-  assert(bs->has_write_region_opt(), "Barrier set does not have write_region");
-  int new_size = new_obj->size();
-  bs->write_region(MemRegion((HeapWord*)new_obj, new_size));
-  return new_obj;
-}
-
 HeapWord* CollectedHeap::allocate_new_tlab(size_t size) {
  guarantee(false, "thread-local allocation buffers not supported");
  return NULL;

--- a/src/share/vm/gc_interface/collectedHeap.hpp
+++ b/src/share/vm/gc_interface/collectedHeap.hpp
@@ -415,9 +415,14 @@ class CollectedHeap : public CHeapObj {
    guarantee(false, "thread-local allocation buffers not supported");
    return 0;
  }
+
  // Can a compiler initialize a new object without store barriers?
  // This permission only extends from the creation of a new object
-  // via a TLAB up to the first subsequent safepoint.
+  // via a TLAB up to the first subsequent safepoint. If such permission
+  // is granted for this heap type, the compiler promises to call
+  // defer_store_barrier() below on any slow path allocation of
+  // a new object for which such initializing store barriers will
+  // have been elided.
  virtual bool can_elide_tlab_store_barriers() const = 0;

  // If a compiler is eliding store barriers for TLAB-allocated objects,
@@ -425,8 +430,19 @@ class CollectedHeap : public CHeapObj {
  // an object allocated anywhere.  The compiler's runtime support
  // promises to call this function on such a slow-path-allocated
  // object before performing initializations that have elided
-  // store barriers.  Returns new_obj, or maybe a safer copy thereof.
-  virtual oop new_store_barrier(oop new_obj);
+  // store barriers. Returns new_obj, or maybe a safer copy thereof.
+  virtual oop defer_store_barrier(JavaThread* thread, oop new_obj);
+
+  // Answers whether an initializing store to a new object currently
+  // allocated at the given address doesn't need a (deferred) store
+  // barrier. Returns "true" if it doesn't need an initializing
+  // store barrier; answers "false" if it does.
+  virtual bool can_elide_initializing_store_barrier(oop new_obj) = 0;
+
+  // If the CollectedHeap was asked to defer a store barrier above,
+  // this informs it to flush such a deferred store barrier to the
+  // remembered set.
+  virtual void flush_deferred_store_barrier(JavaThread* thread);

  // Can a compiler elide a store barrier when it writes
  // a permanent oop into the heap?  Applies when the compiler

--- a/src/share/vm/memory/genCollectedHeap.hpp
+++ b/src/share/vm/memory/genCollectedHeap.hpp
@@ -260,6 +260,17 @@ public:
    return true;
  }

+  // We don't need barriers for stores to objects in the
+  // young gen and, a fortiori, for initializing stores to
+  // objects therein. This applies to {DefNew,ParNew}+{Tenured,CMS}
+  // only and may need to be re-examined in case other
+  // kinds of collectors are implemented in the future.
+  virtual bool can_elide_initializing_store_barrier(oop new_obj) {
+    assert(UseParNewGC || UseSerialGC || UseConcMarkSweepGC,
+           "Check can_elide_initializing_store_barrier() for this collector");
+    return is_in_youngest((void*)new_obj);
+  }
+
  // Can a compiler elide a store barrier when it writes
  // a permanent oop into the heap?  Applies when the compiler
  // is storing x to the heap, where x->is_perm() is true.

--- a/src/share/vm/opto/graphKit.cpp
+++ b/src/share/vm/opto/graphKit.cpp
@@ -3186,6 +3186,15 @@ void GraphKit::write_barrier_post(Node* oop_store,
      return;
  }

+  if (use_ReduceInitialCardMarks()
+      && obj == just_allocated_object(control())) {
+    // We can skip marks on a freshly-allocated object in Eden.
+    // Keep this code in sync with maybe_defer_card_mark() in runtime.cpp.
+    // That routine informs GC to take appropriate compensating steps
+    // so as to make this card-mark elision safe.
+    return;
+  }
+
  if (!use_precise) {
    // All card marks for a (non-array) instance are in one place:
    adr = obj;

--- a/src/share/vm/opto/library_call.cpp
+++ b/src/share/vm/opto/library_call.cpp
@@ -4160,13 +4160,13 @@ bool LibraryCallKit::inline_native_clone(bool is_virtual) {
          result_mem ->set_req(_objArray_path, reset_memory());
        }
      }
-      // We can dispense with card marks if we know the allocation
-      // comes out of eden (TLAB)...  In fact, ReduceInitialCardMarks
-      // causes the non-eden paths to simulate a fresh allocation,
-      // insofar that no further card marks are required to initialize
-      // the object.
-
      // Otherwise, there are no card marks to worry about.
+      // (We can dispense with card marks if we know the allocation
+      //  comes out of eden (TLAB)...  In fact, ReduceInitialCardMarks
+      //  causes the non-eden paths to take compensating steps to
+      //  simulate a fresh allocation, so that no further
+      //  card marks are required in compiled code to initialize
+      //  the object.)

      if (!stopped()) {
        copy_to_clone(obj, alloc_obj, obj_size, true, false);

--- a/src/share/vm/opto/runtime.cpp
+++ b/src/share/vm/opto/runtime.cpp
@@ -143,18 +143,20 @@ const char* OptoRuntime::stub_name(address entry) {
 // We failed the fast-path allocation.  Now we need to do a scavenge or GC
 // and try allocation again.

-void OptoRuntime::do_eager_card_mark(JavaThread* thread) {
+void OptoRuntime::maybe_defer_card_mark(JavaThread* thread) {
  // After any safepoint, just before going back to compiled code,
-  // we perform a card mark.  This lets the compiled code omit
-  // card marks for initialization of new objects.
-  // Keep this code consistent with GraphKit::store_barrier.
+  // we inform the GC that we will be doing initializing writes to
+  // this object in the future without emitting card-marks, so
+  // GC may take any compensating steps.
+  // NOTE: Keep this code consistent with GraphKit::store_barrier.

  oop new_obj = thread->vm_result();
  if (new_obj == NULL)  return;

  assert(Universe::heap()->can_elide_tlab_store_barriers(),
         "compiler must check this first");
-  new_obj = Universe::heap()->new_store_barrier(new_obj);
+  // GC may decide to give back a safer copy of new_obj.
+  new_obj = Universe::heap()->defer_store_barrier(thread, new_obj);
  thread->set_vm_result(new_obj);
 }

@@ -197,8 +199,8 @@ JRT_BLOCK_ENTRY(void, OptoRuntime::new_instance_C(klassOopDesc* klass, JavaThrea
  JRT_BLOCK_END;

  if (GraphKit::use_ReduceInitialCardMarks()) {
-    // do them now so we don't have to do them on the fast path
-    do_eager_card_mark(thread);
+    // inform GC that we won't do card marks for initializing writes.
+    maybe_defer_card_mark(thread);
  }
 JRT_END

@@ -236,8 +238,8 @@ JRT_BLOCK_ENTRY(void, OptoRuntime::new_array_C(klassOopDesc* array_type, int len
  JRT_BLOCK_END;

  if (GraphKit::use_ReduceInitialCardMarks()) {
-    // do them now so we don't have to do them on the fast path
-    do_eager_card_mark(thread);
+    // inform GC that we won't do card marks for initializing writes.
+    maybe_defer_card_mark(thread);
  }
 JRT_END


--- a/src/share/vm/opto/runtime.hpp
+++ b/src/share/vm/opto/runtime.hpp
@@ -133,8 +133,8 @@ class OptoRuntime : public AllStatic {
  // Allocate storage for a objArray or typeArray
  static void new_array_C(klassOopDesc* array_klass, int len, JavaThread *thread);

-  // Post-allocation step for implementing ReduceInitialCardMarks:
-  static void do_eager_card_mark(JavaThread* thread);
+  // Post-slow-path-allocation step for implementing ReduceInitialCardMarks:
+  static void maybe_defer_card_mark(JavaThread* thread);

  // Allocate storage for a multi-dimensional arrays
  // Note: needs to be fixed for arbitrary number of dimensions

--- a/src/share/vm/runtime/thread.cpp
+++ b/src/share/vm/runtime/thread.cpp
@@ -1213,6 +1213,7 @@ JavaThread::JavaThread(bool is_attaching) :
 {
  initialize();
  _is_attaching = is_attaching;
+  assert(_deferred_card_mark.is_empty(), "Default MemRegion ctor");
 }

 bool JavaThread::reguard_stack(address cur_sp) {
@@ -2318,6 +2319,10 @@ void JavaThread::gc_prologue() {


 void JavaThread::oops_do(OopClosure* f, CodeBlobClosure* cf) {
+  // Flush deferred store-barriers, if any, associated with
+  // initializing stores done by this JavaThread in the current epoch.
+  Universe::heap()->flush_deferred_store_barrier(this);
+
  // The ThreadProfiler oops_do is done from FlatProfiler::oops_do
  // since there may be more than one thread using each ThreadProfiler.


--- a/src/share/vm/runtime/thread.hpp
+++ b/src/share/vm/runtime/thread.hpp
@@ -684,8 +684,13 @@ class JavaThread: public Thread {
  methodOop     _callee_target;

  // Oop results of VM runtime calls
-  oop           _vm_result;                      // Used to pass back an oop result into Java code, GC-preserved
-  oop           _vm_result_2;                    // Used to pass back an oop result into Java code, GC-preserved
+  oop           _vm_result;    // Used to pass back an oop result into Java code, GC-preserved
+  oop           _vm_result_2;  // Used to pass back an oop result into Java code, GC-preserved
+
+  // See ReduceInitialCardMarks: this holds the precise space interval of
+  // the most recent slow path allocation for which compiled code has
+  // elided card-marks for performance along the fast-path.
+  MemRegion     _deferred_card_mark;

  MonitorChunk* _monitor_chunks;                 // Contains the off stack monitors
                                                 // allocated during deoptimization
@@ -1082,6 +1087,9 @@ class JavaThread: public Thread {
  oop  vm_result_2() const                       { return _vm_result_2; }
  void set_vm_result_2  (oop x)                  { _vm_result_2   = x; }

+  MemRegion deferred_card_mark() const           { return _deferred_card_mark; }
+  void set_deferred_card_mark(MemRegion mr)      { _deferred_card_mark = mr;   }
+
  // Exception handling for compiled methods
  oop      exception_oop() const                 { return _exception_oop; }
  int      exception_stack_size() const          { return _exception_stack_size; }