6896647: card marks can be deferred too long

Summary: Deferred card marks are now flushed during the gc prologue. Parallel[Scavege,OldGC] and SerialGC no longer defer card marks generated by COMPILER2 as a result of ReduceInitialCardMarks. For these cases, introduced a diagnostic option to defer the card marks, only for the purposes of testing and diagnostics. CMS and G1 continue to defer card marks. Potential performance concern related to single-threaded flushing of deferred card marks in the gc prologue will be addressed in the future. Reviewed-by: never, johnc

6896647: card marks can be deferred too long
Summary: Deferred card marks are now flushed during the gc prologue. Parallel[Scavege,OldGC] and SerialGC no longer defer card marks generated by COMPILER2 as a result of ReduceInitialCardMarks. For these cases, introduced a diagnostic option to defer the card marks, only for the purposes of testing and diagnostics. CMS and G1 continue to defer card marks. Potential performance concern related to single-threaded flushing of deferred card marks in the gc prologue will be addressed in the future. Reviewed-by: never, johnc
651edb8d · ysr · e0f00cdb · 651edb8d · 651edb8d · 651edb8d
14 changed file
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
@@ -1441,6 +1441,7 @@ G1CollectedHeap::G1CollectedHeap(G1CollectorPolicy* policy_) :
 }

 jint G1CollectedHeap::initialize() {
+  CollectedHeap::pre_initialize();
  os::enable_vtime();

  // Necessary to satisfy locking discipline assertions.

--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
@@ -1007,6 +1007,10 @@ public:
    return true;
  }

+  virtual bool card_mark_must_follow_store() const {
+    return true;
+  }
+
  bool is_in_young(oop obj) {
    HeapRegion* hr = heap_region_containing(obj);
    return hr != NULL && hr->is_young();

--- a/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.cpp
+++ b/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.cpp
@@ -51,6 +51,8 @@ static void trace_gen_sizes(const char* const str,
 }

 jint ParallelScavengeHeap::initialize() {
+  CollectedHeap::pre_initialize();
+
  // Cannot be initialized until after the flags are parsed
  GenerationSizer flag_parser;

@@ -717,10 +719,6 @@ HeapWord* ParallelScavengeHeap::allocate_new_tlab(size_t size) {
  return young_gen()->allocate(size, true);
 }

-void ParallelScavengeHeap::fill_all_tlabs(bool retire) {
-  CollectedHeap::fill_all_tlabs(retire);
-}
-
 void ParallelScavengeHeap::accumulate_statistics_all_tlabs() {
  CollectedHeap::accumulate_statistics_all_tlabs();
 }

--- a/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.hpp
+++ b/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.hpp
@@ -54,7 +54,6 @@ class ParallelScavengeHeap : public CollectedHeap {
 protected:
  static inline size_t total_invocations();
  HeapWord* allocate_new_tlab(size_t size);
-  void fill_all_tlabs(bool retire);

 public:
  ParallelScavengeHeap() : CollectedHeap() {
@@ -191,6 +190,10 @@ class ParallelScavengeHeap : public CollectedHeap {
    return true;
  }

+  virtual bool card_mark_must_follow_store() const {
+    return false;
+  }
+
  // Return true if we don't we need a store barrier for
  // initializing stores to an object at this address.
  virtual bool can_elide_initializing_store_barrier(oop new_obj);

--- a/src/share/vm/gc_interface/collectedHeap.cpp
+++ b/src/share/vm/gc_interface/collectedHeap.cpp
@@ -59,8 +59,18 @@ CollectedHeap::CollectedHeap()
                PerfDataManager::create_string_variable(SUN_GC, "lastCause",
                             80, GCCause::to_string(_gc_lastcause), CHECK);
  }
+  _defer_initial_card_mark = false; // strengthened by subclass in pre_initialize() below.
 }

+void CollectedHeap::pre_initialize() {
+  // Used for ReduceInitialCardMarks (when COMPILER2 is used);
+  // otherwise remains unused.
+#ifdef COMPLER2
+  _defer_initial_card_mark = ReduceInitialCardMarks && (DeferInitialCardMark || card_mark_must_follow_store());
+#else
+  assert(_defer_initial_card_mark == false, "Who would set it?");
+#endif
+}

 #ifndef PRODUCT
 void CollectedHeap::check_for_bad_heap_word_value(HeapWord* addr, size_t size) {
@@ -140,12 +150,13 @@ HeapWord* CollectedHeap::allocate_from_tlab_slow(Thread* thread, size_t size) {
 void CollectedHeap::flush_deferred_store_barrier(JavaThread* thread) {
  MemRegion deferred = thread->deferred_card_mark();
  if (!deferred.is_empty()) {
+    assert(_defer_initial_card_mark, "Otherwise should be empty");
    {
      // Verify that the storage points to a parsable object in heap
      DEBUG_ONLY(oop old_obj = oop(deferred.start());)
      assert(is_in(old_obj), "Not in allocated heap");
      assert(!can_elide_initializing_store_barrier(old_obj),
-             "Else should have been filtered in defer_store_barrier()");
+             "Else should have been filtered in new_store_pre_barrier()");
      assert(!is_in_permanent(old_obj), "Sanity: not expected");
      assert(old_obj->is_oop(true), "Not an oop");
      assert(old_obj->is_parsable(), "Will not be concurrently parsable");
@@ -174,9 +185,7 @@ void CollectedHeap::flush_deferred_store_barrier(JavaThread* thread) {
 //     so long as the card-mark is completed before the next
 //     scavenge. For all these cases, we can do a card mark
 //     at the point at which we do a slow path allocation
-//     in the old gen. For uniformity, however, we end
-//     up using the same scheme (see below) for all three
-//     cases (deferring the card-mark appropriately).
+//     in the old gen, i.e. in this call.
 // (b) GenCollectedHeap(ConcurrentMarkSweepGeneration) requires
 //     in addition that the card-mark for an old gen allocated
 //     object strictly follow any associated initializing stores.
@@ -199,12 +208,13 @@ void CollectedHeap::flush_deferred_store_barrier(JavaThread* thread) {
 //     but, like in CMS, because of the presence of concurrent refinement
 //     (much like CMS' precleaning), must strictly follow the oop-store.
 //     Thus, using the same protocol for maintaining the intended
-//     invariants turns out, serendepitously, to be the same for all
-//     three collectors/heap types above.
+//     invariants turns out, serendepitously, to be the same for both
+//     G1 and CMS.
 //
-// For each future collector, this should be reexamined with
-// that specific collector in mind.
-oop CollectedHeap::defer_store_barrier(JavaThread* thread, oop new_obj) {
+// For any future collector, this code should be reexamined with
+// that specific collector in mind, and the documentation above suitably
+// extended and updated.
+oop CollectedHeap::new_store_pre_barrier(JavaThread* thread, oop new_obj) {
  // If a previous card-mark was deferred, flush it now.
  flush_deferred_store_barrier(thread);
  if (can_elide_initializing_store_barrier(new_obj)) {
@@ -212,10 +222,17 @@ oop CollectedHeap::defer_store_barrier(JavaThread* thread, oop new_obj) {
    // following the flush above.
    assert(thread->deferred_card_mark().is_empty(), "Error");
  } else {
-    // Remember info for the newly deferred store barrier
-    MemRegion deferred = MemRegion((HeapWord*)new_obj, new_obj->size());
-    assert(!deferred.is_empty(), "Error");
-    thread->set_deferred_card_mark(deferred);
+    MemRegion mr((HeapWord*)new_obj, new_obj->size());
+    assert(!mr.is_empty(), "Error");
+    if (_defer_initial_card_mark) {
+      // Defer the card mark
+      thread->set_deferred_card_mark(mr);
+    } else {
+      // Do the card mark
+      BarrierSet* bs = barrier_set();
+      assert(bs->has_write_region_opt(), "No write_region() on BarrierSet");
+      bs->write_region(mr);
+    }
  }
  return new_obj;
 }
@@ -313,22 +330,6 @@ HeapWord* CollectedHeap::allocate_new_tlab(size_t size) {
  return NULL;
 }

-void CollectedHeap::fill_all_tlabs(bool retire) {
-  assert(UseTLAB, "should not reach here");
-  // See note in ensure_parsability() below.
-  assert(SafepointSynchronize::is_at_safepoint() ||
-         !is_init_completed(),
-         "should only fill tlabs at safepoint");
-  // The main thread starts allocating via a TLAB even before it
-  // has added itself to the threads list at vm boot-up.
-  assert(Threads::first() != NULL,
-         "Attempt to fill tlabs before main thread has been added"
-         " to threads list is doomed to failure!");
-  for(JavaThread *thread = Threads::first(); thread; thread = thread->next()) {
-     thread->tlab().make_parsable(retire);
-  }
-}
-
 void CollectedHeap::ensure_parsability(bool retire_tlabs) {
  // The second disjunct in the assertion below makes a concession
  // for the start-up verification done while the VM is being
@@ -343,8 +344,24 @@ void CollectedHeap::ensure_parsability(bool retire_tlabs) {
         "Should only be called at a safepoint or at start-up"
         " otherwise concurrent mutator activity may make heap "
         " unparsable again");
-  if (UseTLAB) {
-    fill_all_tlabs(retire_tlabs);
+  const bool use_tlab = UseTLAB;
+  const bool deferred = _defer_initial_card_mark;
+  // The main thread starts allocating via a TLAB even before it
+  // has added itself to the threads list at vm boot-up.
+  assert(!use_tlab || Threads::first() != NULL,
+         "Attempt to fill tlabs before main thread has been added"
+         " to threads list is doomed to failure!");
+  for (JavaThread *thread = Threads::first(); thread; thread = thread->next()) {
+     if (use_tlab) thread->tlab().make_parsable(retire_tlabs);
+#ifdef COMPILER2
+     // The deferred store barriers must all have been flushed to the
+     // card-table (or other remembered set structure) before GC starts
+     // processing the card-table (or other remembered set).
+     if (deferred) flush_deferred_store_barrier(thread);
+#else
+     assert(!deferred, "Should be false");
+     assert(thread->deferred_card_mark().is_empty(), "Should be empty");
+#endif
  }
 }


--- a/src/share/vm/gc_interface/collectedHeap.hpp
+++ b/src/share/vm/gc_interface/collectedHeap.hpp
@@ -51,6 +51,9 @@ class CollectedHeap : public CHeapObj {
  // Used for filler objects (static, but initialized in ctor).
  static size_t _filler_array_max_size;

+  // Used in support of ReduceInitialCardMarks; only consulted if COMPILER2 is being used
+  bool _defer_initial_card_mark;
+
 protected:
  MemRegion _reserved;
  BarrierSet* _barrier_set;
@@ -70,13 +73,16 @@ class CollectedHeap : public CHeapObj {
  // Constructor
  CollectedHeap();

+  // Do common initializations that must follow instance construction,
+  // for example, those needing virtual calls.
+  // This code could perhaps be moved into initialize() but would
+  // be slightly more awkward because we want the latter to be a
+  // pure virtual.
+  void pre_initialize();
+
  // Create a new tlab
  virtual HeapWord* allocate_new_tlab(size_t size);

-  // Fix up tlabs to make the heap well-formed again,
-  // optionally retiring the tlabs.
-  virtual void fill_all_tlabs(bool retire);
-
  // Accumulate statistics on all tlabs.
  virtual void accumulate_statistics_all_tlabs();

@@ -431,14 +437,25 @@ class CollectedHeap : public CHeapObj {
  // promises to call this function on such a slow-path-allocated
  // object before performing initializations that have elided
  // store barriers. Returns new_obj, or maybe a safer copy thereof.
-  virtual oop defer_store_barrier(JavaThread* thread, oop new_obj);
+  virtual oop new_store_pre_barrier(JavaThread* thread, oop new_obj);

  // Answers whether an initializing store to a new object currently
-  // allocated at the given address doesn't need a (deferred) store
+  // allocated at the given address doesn't need a store
  // barrier. Returns "true" if it doesn't need an initializing
  // store barrier; answers "false" if it does.
  virtual bool can_elide_initializing_store_barrier(oop new_obj) = 0;

+  // If a compiler is eliding store barriers for TLAB-allocated objects,
+  // we will be informed of a slow-path allocation by a call
+  // to new_store_pre_barrier() above. Such a call precedes the
+  // initialization of the object itself, and no post-store-barriers will
+  // be issued. Some heap types require that the barrier strictly follows
+  // the initializing stores. (This is currently implemented by deferring the
+  // barrier until the next slow-path allocation or gc-related safepoint.)
+  // This interface answers whether a particular heap type needs the card
+  // mark to be thus strictly sequenced after the stores.
+  virtual bool card_mark_must_follow_store() const = 0;
+
  // If the CollectedHeap was asked to defer a store barrier above,
  // this informs it to flush such a deferred store barrier to the
  // remembered set.

--- a/src/share/vm/memory/genCollectedHeap.cpp
+++ b/src/share/vm/memory/genCollectedHeap.cpp
@@ -51,6 +51,8 @@ GenCollectedHeap::GenCollectedHeap(GenCollectorPolicy *policy) :
 }

 jint GenCollectedHeap::initialize() {
+  CollectedHeap::pre_initialize();
+
  int i;
  _n_gens = gen_policy()->number_of_generations();

@@ -129,6 +131,7 @@ jint GenCollectedHeap::initialize() {

  _rem_set = collector_policy()->create_rem_set(_reserved, n_covered_regions);
  set_barrier_set(rem_set()->bs());
+
  _gch = this;

  for (i = 0; i < _n_gens; i++) {

--- a/src/share/vm/memory/genCollectedHeap.hpp
+++ b/src/share/vm/memory/genCollectedHeap.hpp
@@ -260,6 +260,10 @@ public:
    return true;
  }

+  virtual bool card_mark_must_follow_store() const {
+    return UseConcMarkSweepGC;
+  }
+
  // We don't need barriers for stores to objects in the
  // young gen and, a fortiori, for initializing stores to
  // objects therein. This applies to {DefNew,ParNew}+{Tenured,CMS}

--- a/src/share/vm/opto/graphKit.cpp
+++ b/src/share/vm/opto/graphKit.cpp
@@ -3259,9 +3259,10 @@ void GraphKit::write_barrier_post(Node* oop_store,
  if (use_ReduceInitialCardMarks()
      && obj == just_allocated_object(control())) {
    // We can skip marks on a freshly-allocated object in Eden.
-    // Keep this code in sync with maybe_defer_card_mark() in runtime.cpp.
-    // That routine informs GC to take appropriate compensating steps
-    // so as to make this card-mark elision safe.
+    // Keep this code in sync with new_store_pre_barrier() in runtime.cpp.
+    // That routine informs GC to take appropriate compensating steps,
+    // upon a slow-path allocation, so as to make this card-mark
+    // elision safe.
    return;
  }


--- a/src/share/vm/opto/runtime.cpp
+++ b/src/share/vm/opto/runtime.cpp
@@ -143,7 +143,7 @@ const char* OptoRuntime::stub_name(address entry) {
 // We failed the fast-path allocation.  Now we need to do a scavenge or GC
 // and try allocation again.

-void OptoRuntime::maybe_defer_card_mark(JavaThread* thread) {
+void OptoRuntime::new_store_pre_barrier(JavaThread* thread) {
  // After any safepoint, just before going back to compiled code,
  // we inform the GC that we will be doing initializing writes to
  // this object in the future without emitting card-marks, so
@@ -156,7 +156,7 @@ void OptoRuntime::maybe_defer_card_mark(JavaThread* thread) {
  assert(Universe::heap()->can_elide_tlab_store_barriers(),
         "compiler must check this first");
  // GC may decide to give back a safer copy of new_obj.
-  new_obj = Universe::heap()->defer_store_barrier(thread, new_obj);
+  new_obj = Universe::heap()->new_store_pre_barrier(thread, new_obj);
  thread->set_vm_result(new_obj);
 }

@@ -200,7 +200,7 @@ JRT_BLOCK_ENTRY(void, OptoRuntime::new_instance_C(klassOopDesc* klass, JavaThrea

  if (GraphKit::use_ReduceInitialCardMarks()) {
    // inform GC that we won't do card marks for initializing writes.
-    maybe_defer_card_mark(thread);
+    new_store_pre_barrier(thread);
  }
 JRT_END

@@ -239,7 +239,7 @@ JRT_BLOCK_ENTRY(void, OptoRuntime::new_array_C(klassOopDesc* array_type, int len

  if (GraphKit::use_ReduceInitialCardMarks()) {
    // inform GC that we won't do card marks for initializing writes.
-    maybe_defer_card_mark(thread);
+    new_store_pre_barrier(thread);
  }
 JRT_END


--- a/src/share/vm/opto/runtime.hpp
+++ b/src/share/vm/opto/runtime.hpp
@@ -133,8 +133,9 @@ class OptoRuntime : public AllStatic {
  // Allocate storage for a objArray or typeArray
  static void new_array_C(klassOopDesc* array_klass, int len, JavaThread *thread);

-  // Post-slow-path-allocation step for implementing ReduceInitialCardMarks:
-  static void maybe_defer_card_mark(JavaThread* thread);
+  // Post-slow-path-allocation, pre-initializing-stores step for
+  // implementing ReduceInitialCardMarks
+  static void new_store_pre_barrier(JavaThread* thread);

  // Allocate storage for a multi-dimensional arrays
  // Note: needs to be fixed for arbitrary number of dimensions

--- a/src/share/vm/runtime/globals.hpp
+++ b/src/share/vm/runtime/globals.hpp
@@ -2015,6 +2015,10 @@ class CommandLineFlags {
  diagnostic(bool, GCParallelVerificationEnabled, true,                     \
          "Enable parallel memory system verification")                     \
                                                                            \
+  diagnostic(bool, DeferInitialCardMark, false,                             \
+          "When +ReduceInitialCardMarks, explicitly defer any that "        \
+           "may arise from new_pre_store_barrier")                          \
+                                                                            \
  diagnostic(bool, VerifyRememberedSets, false,                             \
          "Verify GC remembered sets")                                      \
                                                                            \

--- a/src/share/vm/runtime/thread.cpp
+++ b/src/share/vm/runtime/thread.cpp
@@ -2357,9 +2357,8 @@ public:
 };

 void JavaThread::oops_do(OopClosure* f, CodeBlobClosure* cf) {
-  // Flush deferred store-barriers, if any, associated with
-  // initializing stores done by this JavaThread in the current epoch.
-  Universe::heap()->flush_deferred_store_barrier(this);
+  // Verify that the deferred card marks have been flushed.
+  assert(deferred_card_mark().is_empty(), "Should be empty during GC");

  // The ThreadProfiler oops_do is done from FlatProfiler::oops_do
  // since there may be more than one thread using each ThreadProfiler.

--- a/src/share/vm/runtime/vmStructs.cpp
+++ b/src/share/vm/runtime/vmStructs.cpp
@@ -309,6 +309,7 @@ static inline uint64_t cast_uint64_t(size_t x)
  nonstatic_field(CollectedHeap,               _reserved,                                     MemRegion)                             \
  nonstatic_field(SharedHeap,                  _perm_gen,                                     PermGen*)                              \
  nonstatic_field(CollectedHeap,               _barrier_set,                                  BarrierSet*)                           \
+  nonstatic_field(CollectedHeap,               _defer_initial_card_mark,                      bool)                                  \
  nonstatic_field(CollectedHeap,               _is_gc_active,                                 bool)                                  \
  nonstatic_field(CompactibleSpace,            _compaction_top,                               HeapWord*)                             \
  nonstatic_field(CompactibleSpace,            _first_dead,                                   HeapWord*)                             \