Commit febac7c6 authored by ysr

7037276: Unnecessary double traversal of dirty card windows

Summary: Short-circuited an unnecessary double traversal of dirty card windows when iterating younger refs. Also renamed some cardtable methods for more clarity.
Reviewed-by: jmasa, stefank, poonam
Parent dfbf6345
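The shape of the fix, as a minimal standalone sketch (stand-in types and a hypothetical find_next_dirty_range(); the real types appear in the diff below): previously the cards were walked twice, once by the card-table iterator to find each dirty range and again by the wrapper closure over that same range; now the wrapper receives the whole region and does everything in one pass.

// Sketch only -- stand-in types, not the HotSpot declarations.
struct Range { int start, end; };            // a run of card indices
struct Wrapper { void do_range(Range r); };  // stands in for ClearNoncleanCardWrapper

bool find_next_dirty_range(Range* mr, Range* dirty);  // hypothetical card scan

// Before: two traversals -- the iterator scans for each dirty run,
// then the wrapper re-scans the same cards to clear and process them.
void old_shape(Range mr, Wrapper* cl) {
  Range dirty;
  while (find_next_dirty_range(&mr, &dirty)) {  // traversal #1
    cl->do_range(dirty);                        // traversal #2, same cards
  }
}

// After: one traversal -- the wrapper gets the whole region and finds,
// clears, and processes contiguous dirty windows in a single pass.
void new_shape(Range mr, Wrapper* cl) {
  cl->do_range(mr);
}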
@@ -33,44 +33,43 @@
 #include "runtime/mutexLocker.hpp"
 #include "runtime/virtualspace.hpp"

-void CardTableModRefBS::par_non_clean_card_iterate_work(Space* sp, MemRegion mr,
-                                                        DirtyCardToOopClosure* dcto_cl,
-                                                        MemRegionClosure* cl,
-                                                        int n_threads) {
-  if (n_threads > 0) {
-    assert((n_threads == 1 && ParallelGCThreads == 0) ||
-           n_threads <= (int)ParallelGCThreads,
-           "# worker threads != # requested!");
-    // Make sure the LNC array is valid for the space.
-    jbyte** lowest_non_clean;
-    uintptr_t lowest_non_clean_base_chunk_index;
-    size_t lowest_non_clean_chunk_size;
-    get_LNC_array_for_space(sp, lowest_non_clean,
-                            lowest_non_clean_base_chunk_index,
-                            lowest_non_clean_chunk_size);
-
-    int n_strides = n_threads * StridesPerThread;
-    SequentialSubTasksDone* pst = sp->par_seq_tasks();
-    pst->set_n_threads(n_threads);
-    pst->set_n_tasks(n_strides);
-
-    int stride = 0;
-    while (!pst->is_task_claimed(/* reference */ stride)) {
-      process_stride(sp, mr, stride, n_strides, dcto_cl, cl,
-                     lowest_non_clean,
-                     lowest_non_clean_base_chunk_index,
-                     lowest_non_clean_chunk_size);
-    }
-    if (pst->all_tasks_completed()) {
-      // Clear lowest_non_clean array for next time.
-      intptr_t first_chunk_index = addr_to_chunk_index(mr.start());
-      uintptr_t last_chunk_index = addr_to_chunk_index(mr.last());
-      for (uintptr_t ch = first_chunk_index; ch <= last_chunk_index; ch++) {
-        intptr_t ind = ch - lowest_non_clean_base_chunk_index;
-        assert(0 <= ind && ind < (intptr_t)lowest_non_clean_chunk_size,
-               "Bounds error");
-        lowest_non_clean[ind] = NULL;
-      }
-    }
-  }
-}
+void CardTableModRefBS::non_clean_card_iterate_parallel_work(Space* sp, MemRegion mr,
+                                                             DirtyCardToOopClosure* dcto_cl,
+                                                             ClearNoncleanCardWrapper* cl,
+                                                             int n_threads) {
+  assert(n_threads > 0, "Error: expected n_threads > 0");
+  assert((n_threads == 1 && ParallelGCThreads == 0) ||
+         n_threads <= (int)ParallelGCThreads,
+         "# worker threads != # requested!");
+  // Make sure the LNC array is valid for the space.
+  jbyte** lowest_non_clean;
+  uintptr_t lowest_non_clean_base_chunk_index;
+  size_t lowest_non_clean_chunk_size;
+  get_LNC_array_for_space(sp, lowest_non_clean,
+                          lowest_non_clean_base_chunk_index,
+                          lowest_non_clean_chunk_size);
+
+  int n_strides = n_threads * StridesPerThread;
+  SequentialSubTasksDone* pst = sp->par_seq_tasks();
+  pst->set_n_threads(n_threads);
+  pst->set_n_tasks(n_strides);
+
+  int stride = 0;
+  while (!pst->is_task_claimed(/* reference */ stride)) {
+    process_stride(sp, mr, stride, n_strides, dcto_cl, cl,
+                   lowest_non_clean,
+                   lowest_non_clean_base_chunk_index,
+                   lowest_non_clean_chunk_size);
+  }
+  if (pst->all_tasks_completed()) {
+    // Clear lowest_non_clean array for next time.
+    intptr_t first_chunk_index = addr_to_chunk_index(mr.start());
+    uintptr_t last_chunk_index = addr_to_chunk_index(mr.last());
+    for (uintptr_t ch = first_chunk_index; ch <= last_chunk_index; ch++) {
+      intptr_t ind = ch - lowest_non_clean_base_chunk_index;
+      assert(0 <= ind && ind < (intptr_t)lowest_non_clean_chunk_size,
+             "Bounds error");
+      lowest_non_clean[ind] = NULL;
+    }
+  }
+}
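Background on the stride scheme above, as a sketch: stride s of n_strides covers card chunks s, s + n_strides, s + 2*n_strides, ..., and workers claim stride indices through the sub-task counter until none remain. A stand-alone model of the claiming loop, with std::atomic standing in for HotSpot's SequentialSubTasksDone (the names below are illustrative, not the real implementation):

#include <atomic>

// Stand-in for SequentialSubTasksDone: each stride index in
// [0, _n_tasks) is claimed by exactly one thread.
struct SubTasks {
  std::atomic<int> _next{0};
  int _n_tasks;
  // Mirrors the is_task_claimed(/* reference */ t) idiom: returns
  // false and sets t when the caller successfully claims task t.
  bool is_task_claimed(int& t) {
    t = _next.fetch_add(1);
    return t >= _n_tasks;
  }
};

// Each worker drains strides until the counter runs out; stride s
// then walks chunks s, s + n_strides, s + 2*n_strides, ...
void worker_loop(SubTasks* pst) {
  int stride = 0;
  while (!pst->is_task_claimed(stride)) {
    // process_stride(..., stride, n_strides, ...);
  }
}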
@@ -81,7 +80,7 @@ process_stride(Space* sp,
                MemRegion used,
                jint stride, int n_strides,
                DirtyCardToOopClosure* dcto_cl,
-               MemRegionClosure* cl,
+               ClearNoncleanCardWrapper* cl,
                jbyte** lowest_non_clean,
                uintptr_t lowest_non_clean_base_chunk_index,
                size_t lowest_non_clean_chunk_size) {
@@ -127,7 +126,11 @@ process_stride(Space* sp,
                      lowest_non_clean_base_chunk_index,
                      lowest_non_clean_chunk_size);
-    non_clean_card_iterate_work(chunk_mr, cl);
+    // We do not call the non_clean_card_iterate_serial() version because
+    // we want to clear the cards, and the ClearNoncleanCardWrapper closure
+    // itself does the work of finding contiguous dirty ranges of cards to
+    // process (and clear).
+    cl->do_MemRegion(chunk_mr);

     // Find the next chunk of the stride.
     chunk_card_start += CardsPerStrideChunk * n_strides;
......
@@ -456,31 +456,35 @@ bool CardTableModRefBS::mark_card_deferred(size_t card_index) {
 }

-void CardTableModRefBS::non_clean_card_iterate(Space* sp,
-                                               MemRegion mr,
-                                               DirtyCardToOopClosure* dcto_cl,
-                                               MemRegionClosure* cl) {
+void CardTableModRefBS::non_clean_card_iterate_possibly_parallel(Space* sp,
+                                                                 MemRegion mr,
+                                                                 DirtyCardToOopClosure* dcto_cl,
+                                                                 ClearNoncleanCardWrapper* cl) {
   if (!mr.is_empty()) {
     int n_threads = SharedHeap::heap()->n_par_threads();
     if (n_threads > 0) {
 #ifndef SERIALGC
-      par_non_clean_card_iterate_work(sp, mr, dcto_cl, cl, n_threads);
+      non_clean_card_iterate_parallel_work(sp, mr, dcto_cl, cl, n_threads);
 #else // SERIALGC
       fatal("Parallel gc not supported here.");
 #endif // SERIALGC
     } else {
-      non_clean_card_iterate_work(mr, cl);
+      // We do not call the non_clean_card_iterate_serial() version below because
+      // we want to clear the cards (which non_clean_card_iterate_serial() does not
+      // do for us), and the ClearNoncleanCardWrapper closure itself does the work
+      // of finding contiguous dirty ranges of cards to process (and clear).
+      cl->do_MemRegion(mr);
     }
   }
 }

 // NOTE: For this to work correctly, it is important that
 // we look for non-clean cards below (so as to catch those
 // marked precleaned), rather than look explicitly for dirty
 // cards (and miss those marked precleaned). In that sense,
 // the name precleaned is currently somewhat of a misnomer.
-void CardTableModRefBS::non_clean_card_iterate_work(MemRegion mr,
-                                                    MemRegionClosure* cl) {
+// The iterator itself is not MT-aware, but
+// MT-aware callers and closures can use this to
+// accomplish dirty card iteration in parallel. The
+// iterator itself does not clear the dirty cards, or
+// change their values in any manner.
+void CardTableModRefBS::non_clean_card_iterate_serial(MemRegion mr,
+                                                      MemRegionClosure* cl) {
   for (int i = 0; i < _cur_covered_regions; i++) {
     MemRegion mri = mr.intersection(_covered[i]);
     if (mri.word_size() > 0) {
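For reference, the serial iterator's overall shape is just region intersection plus a scan; a reduced stand-alone model (stand-in types and a hypothetical visit callback, not the HotSpot declarations):

// Reduced model: intersect the query region with each covered region
// and hand any non-empty overlap on for a card scan.
struct Region { long start, end; };  // [start, end) in heap words

void for_each_covered_overlap(Region mr, const Region* covered, int n,
                              void (*visit)(Region)) {
  for (int i = 0; i < n; i++) {
    Region mri = { mr.start > covered[i].start ? mr.start : covered[i].start,
                   mr.end   < covered[i].end   ? mr.end   : covered[i].end };
    if (mri.start < mri.end) {  // non-empty intersection
      visit(mri);               // scan this slice for non-clean cards
    }
  }
}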
@@ -661,7 +665,7 @@ public:

 void CardTableModRefBS::verify_clean_region(MemRegion mr) {
   GuaranteeNotModClosure blk(this);
-  non_clean_card_iterate_work(mr, &blk);
+  non_clean_card_iterate_serial(mr, &blk);
}
// To verify a MemRegion is entirely dirty this closure is passed to
......
@@ -44,6 +44,7 @@
 class Generation;
 class OopsInGenClosure;
 class DirtyCardToOopClosure;
+class ClearNoncleanCardWrapper;
class CardTableModRefBS: public ModRefBarrierSet {
// Some classes get to look at some private stuff.
@@ -165,22 +166,28 @@ class CardTableModRefBS: public ModRefBarrierSet {
   // Iterate over the portion of the card-table which covers the given
   // region mr in the given space and apply cl to any dirty sub-regions
-  // of mr. cl and dcto_cl must either be the same closure or cl must
-  // wrap dcto_cl. Both are required - neither may be NULL. Also, dcto_cl
-  // may be modified. Note that this function will operate in a parallel
-  // mode if worker threads are available.
-  void non_clean_card_iterate(Space* sp, MemRegion mr,
-                              DirtyCardToOopClosure* dcto_cl,
-                              MemRegionClosure* cl);
-
-  // Utility function used to implement the other versions below.
-  void non_clean_card_iterate_work(MemRegion mr, MemRegionClosure* cl);
-
-  void par_non_clean_card_iterate_work(Space* sp, MemRegion mr,
-                                       DirtyCardToOopClosure* dcto_cl,
-                                       MemRegionClosure* cl,
-                                       int n_threads);
+  // of mr. Dirty cards are _not_ cleared by the iterator method itself,
+  // but closures may arrange to do so on their own should they so wish.
+  void non_clean_card_iterate_serial(MemRegion mr, MemRegionClosure* cl);
+
+  // A variant of the above that will operate in a parallel mode if
+  // worker threads are available, and clear the dirty cards as it
+  // processes them.
+  // ClearNoncleanCardWrapper cl must wrap the DirtyCardToOopClosure dcto_cl,
+  // which may itself be modified by the method.
+  void non_clean_card_iterate_possibly_parallel(Space* sp, MemRegion mr,
+                                                DirtyCardToOopClosure* dcto_cl,
+                                                ClearNoncleanCardWrapper* cl);
+
+ private:
+  // Work method used to implement non_clean_card_iterate_possibly_parallel()
+  // above in the parallel case.
+  void non_clean_card_iterate_parallel_work(Space* sp, MemRegion mr,
+                                            DirtyCardToOopClosure* dcto_cl,
+                                            ClearNoncleanCardWrapper* cl,
+                                            int n_threads);
+
+ protected:

   // Dirty the bytes corresponding to "mr" (not all of which must be
   // covered.)
   void dirty_MemRegion(MemRegion mr);
@@ -237,7 +244,7 @@ class CardTableModRefBS: public ModRefBarrierSet {
                  MemRegion used,
                  jint stride, int n_strides,
                  DirtyCardToOopClosure* dcto_cl,
-                 MemRegionClosure* cl,
+                 ClearNoncleanCardWrapper* cl,
                  jbyte** lowest_non_clean,
                  uintptr_t lowest_non_clean_base_chunk_index,
                  size_t lowest_non_clean_chunk_size);
@@ -409,14 +416,14 @@ public:
   // marking, where a dirty card may cause scanning, and summarization
   // marking, of objects that extend onto subsequent cards.)
   void mod_card_iterate(MemRegionClosure* cl) {
-    non_clean_card_iterate_work(_whole_heap, cl);
+    non_clean_card_iterate_serial(_whole_heap, cl);
   }

   // Like the "mod_cards_iterate" above, except only invokes the closure
   // for cards within the MemRegion "mr" (which is required to be
   // card-aligned and sized.)
   void mod_card_iterate(MemRegion mr, MemRegionClosure* cl) {
-    non_clean_card_iterate_work(mr, cl);
+    non_clean_card_iterate_serial(mr, cl);
   }

   static uintx ct_max_alignment_constraint();
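Since mod_card_iterate() takes a bare MemRegionClosure, a caller only has to supply do_MemRegion(). A hypothetical example (not from the sources) that totals the heap words spanned by non-clean cards:

// Hypothetical closure: tallies how many heap words are covered by
// non-clean cards, via the serial iterator backing mod_card_iterate().
class CountNoncleanClosure: public MemRegionClosure {
 public:
  size_t _words;
  CountNoncleanClosure() : _words(0) {}
  void do_MemRegion(MemRegion mr) { _words += mr.word_size(); }
};

// Usage sketch:
//   CountNoncleanClosure blk;
//   ct_bs->mod_card_iterate(&blk);
//   // blk._words now holds the total word size of non-clean sub-regions.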
@@ -493,4 +500,5 @@ public:
void set_CTRS(CardTableRS* rs) { _rs = rs; }
};
#endif // SHARE_VM_MEMORY_CARDTABLEMODREFBS_HPP
@@ -105,107 +105,111 @@ void CardTableRS::younger_refs_iterate(Generation* g,
g->younger_refs_iterate(blk);
}
-class ClearNoncleanCardWrapper: public MemRegionClosure {
-  MemRegionClosure* _dirty_card_closure;
-  CardTableRS* _ct;
-  bool _is_par;
-private:
-  // Clears the given card, return true if the corresponding card should be
-  // processed.
-  bool clear_card(jbyte* entry) {
-    if (_is_par) {
-      while (true) {
-        // In the parallel case, we may have to do this several times.
-        jbyte entry_val = *entry;
-        assert(entry_val != CardTableRS::clean_card_val(),
-               "We shouldn't be looking at clean cards, and this should "
-               "be the only place they get cleaned.");
-        if (CardTableRS::card_is_dirty_wrt_gen_iter(entry_val)
-            || _ct->is_prev_youngergen_card_val(entry_val)) {
-          jbyte res =
-            Atomic::cmpxchg(CardTableRS::clean_card_val(), entry, entry_val);
-          if (res == entry_val) {
-            break;
-          } else {
-            assert(res == CardTableRS::cur_youngergen_and_prev_nonclean_card,
-                   "The CAS above should only fail if another thread did "
-                   "a GC write barrier.");
-          }
-        } else if (entry_val ==
-                   CardTableRS::cur_youngergen_and_prev_nonclean_card) {
-          // Parallelism shouldn't matter in this case. Only the thread
-          // assigned to scan the card should change this value.
-          *entry = _ct->cur_youngergen_card_val();
-          break;
-        } else {
-          assert(entry_val == _ct->cur_youngergen_card_val(),
-                 "Should be the only possibility.");
-          // In this case, the card was clean before, and become
-          // cur_youngergen only because of processing of a promoted object.
-          // We don't have to look at the card.
-          return false;
-        }
-      }
-      return true;
-    } else {
-      jbyte entry_val = *entry;
-      assert(entry_val != CardTableRS::clean_card_val(),
-             "We shouldn't be looking at clean cards, and this should "
-             "be the only place they get cleaned.");
-      assert(entry_val != CardTableRS::cur_youngergen_and_prev_nonclean_card,
-             "This should be possible in the sequential case.");
-      *entry = CardTableRS::clean_card_val();
-      return true;
-    }
-  }
+inline bool ClearNoncleanCardWrapper::clear_card(jbyte* entry) {
+  if (_is_par) {
+    return clear_card_parallel(entry);
+  } else {
+    return clear_card_serial(entry);
+  }
+}
+
+inline bool ClearNoncleanCardWrapper::clear_card_parallel(jbyte* entry) {
+  while (true) {
+    // In the parallel case, we may have to do this several times.
+    jbyte entry_val = *entry;
+    assert(entry_val != CardTableRS::clean_card_val(),
+           "We shouldn't be looking at clean cards, and this should "
+           "be the only place they get cleaned.");
+    if (CardTableRS::card_is_dirty_wrt_gen_iter(entry_val)
+        || _ct->is_prev_youngergen_card_val(entry_val)) {
+      jbyte res =
+        Atomic::cmpxchg(CardTableRS::clean_card_val(), entry, entry_val);
+      if (res == entry_val) {
+        break;
+      } else {
+        assert(res == CardTableRS::cur_youngergen_and_prev_nonclean_card,
+               "The CAS above should only fail if another thread did "
+               "a GC write barrier.");
+      }
+    } else if (entry_val ==
+               CardTableRS::cur_youngergen_and_prev_nonclean_card) {
+      // Parallelism shouldn't matter in this case. Only the thread
+      // assigned to scan the card should change this value.
+      *entry = _ct->cur_youngergen_card_val();
+      break;
+    } else {
+      assert(entry_val == _ct->cur_youngergen_card_val(),
+             "Should be the only possibility.");
+      // In this case, the card was clean before, and become
+      // cur_youngergen only because of processing of a promoted object.
+      // We don't have to look at the card.
+      return false;
+    }
+  }
+  return true;
+}
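The loop above is a claim-by-CAS idiom: a failed cmpxchg can only mean a mutator's write barrier raced with this GC thread, so the card value is re-read and re-decided rather than treated as an error. A reduced stand-alone model using std::atomic (two card values only, where the real code distinguishes several):

#include <atomic>

enum { CLEAN = 0, DIRTY = 1 };  // simplified card values

// Try to claim a non-clean card by swinging it to CLEAN. On CAS
// failure, compare_exchange_strong refreshes `v` with the value the
// racing thread wrote, and the loop decides again on the fresh value.
bool claim_card(std::atomic<signed char>& entry) {
  signed char v = entry.load();
  while (v != CLEAN) {
    if (entry.compare_exchange_strong(v, (signed char)CLEAN)) {
      return true;   // we cleared it; we own its processing
    }
    // v now holds the interfering thread's value; re-decide.
  }
  return false;      // already clean; nothing left to claim
}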
-public:
-  ClearNoncleanCardWrapper(MemRegionClosure* dirty_card_closure,
-                           CardTableRS* ct) :
+inline bool ClearNoncleanCardWrapper::clear_card_serial(jbyte* entry) {
+  jbyte entry_val = *entry;
+  assert(entry_val != CardTableRS::clean_card_val(),
+         "We shouldn't be looking at clean cards, and this should "
+         "be the only place they get cleaned.");
+  assert(entry_val != CardTableRS::cur_youngergen_and_prev_nonclean_card,
+         "This should be possible in the sequential case.");
+  *entry = CardTableRS::clean_card_val();
+  return true;
+}
+
+ClearNoncleanCardWrapper::ClearNoncleanCardWrapper(
+  MemRegionClosure* dirty_card_closure, CardTableRS* ct) :
     _dirty_card_closure(dirty_card_closure), _ct(ct) {
   _is_par = (SharedHeap::heap()->n_par_threads() > 0);
 }
-  void do_MemRegion(MemRegion mr) {
-    // We start at the high end of "mr", walking backwards
-    // while accumulating a contiguous dirty range of cards in
-    // [start_of_non_clean, end_of_non_clean) which we then
-    // process en masse.
-    HeapWord* end_of_non_clean = mr.end();
-    HeapWord* start_of_non_clean = end_of_non_clean;
-    jbyte* entry = _ct->byte_for(mr.last());
-    const jbyte* first_entry = _ct->byte_for(mr.start());
-    while (entry >= first_entry) {
-      HeapWord* cur = _ct->addr_for(entry);
-      if (!clear_card(entry)) {
-        // We hit a clean card; process any non-empty
-        // dirty range accumulated so far.
-        if (start_of_non_clean < end_of_non_clean) {
-          MemRegion mr2(start_of_non_clean, end_of_non_clean);
-          _dirty_card_closure->do_MemRegion(mr2);
-        }
-        // Reset the dirty window while continuing to
-        // look for the next dirty window to process.
-        end_of_non_clean = cur;
-        start_of_non_clean = end_of_non_clean;
-      }
-      // Open the left end of the window one card to the left.
-      start_of_non_clean = cur;
-      // Note that "entry" leads "start_of_non_clean" in
-      // its leftward excursion after this point
-      // in the loop and, when we hit the left end of "mr",
-      // will point off of the left end of the card-table
-      // for "mr".
-      entry--;
-    }
-    // If the first card of "mr" was dirty, we will have
-    // been left with a dirty window, co-initial with "mr",
-    // which we now process.
-    if (start_of_non_clean < end_of_non_clean) {
-      MemRegion mr2(start_of_non_clean, end_of_non_clean);
-      _dirty_card_closure->do_MemRegion(mr2);
-    }
-  }
-};
+void ClearNoncleanCardWrapper::do_MemRegion(MemRegion mr) {
+  assert(mr.word_size() > 0, "Error");
+  assert(_ct->is_aligned(mr.start()), "mr.start() should be card aligned");
+  // mr.end() may not necessarily be card aligned.
+  jbyte* cur_entry = _ct->byte_for(mr.last());
+  const jbyte* limit = _ct->byte_for(mr.start());
+  HeapWord* end_of_non_clean = mr.end();
+  HeapWord* start_of_non_clean = end_of_non_clean;
+  while (cur_entry >= limit) {
+    HeapWord* cur_hw = _ct->addr_for(cur_entry);
+    if ((*cur_entry != CardTableRS::clean_card_val()) && clear_card(cur_entry)) {
+      // Continue the dirty range by opening the
+      // dirty window one card to the left.
+      start_of_non_clean = cur_hw;
+    } else {
+      // We hit a "clean" card; process any non-empty
+      // "dirty" range accumulated so far.
+      if (start_of_non_clean < end_of_non_clean) {
+        const MemRegion mrd(start_of_non_clean, end_of_non_clean);
+        _dirty_card_closure->do_MemRegion(mrd);
+      }
+      // Reset the dirty window, while continuing to look
+      // for the next dirty card that will start a
+      // new dirty window.
+      end_of_non_clean = cur_hw;
+      start_of_non_clean = cur_hw;
+    }
+    // Note that "cur_entry" leads "start_of_non_clean" in
+    // its leftward excursion after this point
+    // in the loop and, when we hit the left end of "mr",
+    // will point off of the left end of the card-table
+    // for "mr".
+    cur_entry--;
+  }
+  // If the first card of "mr" was dirty, we will have
+  // been left with a dirty window, co-initial with "mr",
+  // which we now process.
+  if (start_of_non_clean < end_of_non_clean) {
+    const MemRegion mrd(start_of_non_clean, end_of_non_clean);
+    _dirty_card_closure->do_MemRegion(mrd);
+  }
+}
// clean (by dirty->clean before) ==> cur_younger_gen
// dirty ==> cur_youngergen_and_prev_nonclean_card
// precleaned ==> cur_youngergen_and_prev_nonclean_card
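To see the new single-pass window accumulation of do_MemRegion() concretely, here is a small stand-alone simulation over a plain array (1 = non-clean, 0 = clean, independent of the HotSpot types) that reports the same right-to-left sequence of contiguous dirty windows the wrapped closure would receive:

#include <cstdio>

// Stand-alone model of the backward dirty-window walk: cards[] stands
// in for the card-table slice covering a region of n cards. Windows
// are reported as [start, end) card indices, rightmost first.
static void walk_dirty_windows(int* cards, int n) {
  int end_of_non_clean = n;             // exclusive right end of window
  int start_of_non_clean = end_of_non_clean;
  for (int cur = n - 1; cur >= 0; cur--) {
    if (cards[cur] != 0) {              // non-clean: grow window leftward
      cards[cur] = 0;                   // "clear" the card, as the wrapper does
      start_of_non_clean = cur;
    } else {                            // clean: flush any accumulated window
      if (start_of_non_clean < end_of_non_clean) {
        printf("process cards [%d, %d)\n", start_of_non_clean, end_of_non_clean);
      }
      end_of_non_clean = cur;           // reset to an empty window
      start_of_non_clean = cur;
    }
  }
  if (start_of_non_clean < end_of_non_clean) {  // window co-initial with region
    printf("process cards [%d, %d)\n", start_of_non_clean, end_of_non_clean);
  }
}

int main() {
  int cards[] = {1, 1, 0, 1, 0, 0, 1, 1};
  walk_dirty_windows(cards, 8);  // prints [6, 8), then [3, 4), then [0, 2)
  return 0;
}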
@@ -246,8 +250,8 @@ void CardTableRS::younger_refs_in_space_iterate(Space* sp,
                                                  cl->gen_boundary());
   ClearNoncleanCardWrapper clear_cl(dcto_cl, this);

-  _ct_bs->non_clean_card_iterate(sp, sp->used_region_at_save_marks(),
-                                 dcto_cl, &clear_cl);
+  _ct_bs->non_clean_card_iterate_possibly_parallel(sp, sp->used_region_at_save_marks(),
+                                                   dcto_cl, &clear_cl);
 }
void CardTableRS::clear_into_younger(Generation* gen, bool clear_perm) {
......
/*
- * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -166,4 +166,21 @@
};
+class ClearNoncleanCardWrapper: public MemRegionClosure {
+  MemRegionClosure* _dirty_card_closure;
+  CardTableRS* _ct;
+  bool _is_par;
+private:
+  // Clears the given card, return true if the corresponding card should be
+  // processed.
+  inline bool clear_card(jbyte* entry);
+  // Work methods called by the clear_card()
+  inline bool clear_card_serial(jbyte* entry);
+  inline bool clear_card_parallel(jbyte* entry);
+
+public:
+  ClearNoncleanCardWrapper(MemRegionClosure* dirty_card_closure, CardTableRS* ct);
+  void do_MemRegion(MemRegion mr);
+};
#endif // SHARE_VM_MEMORY_CARDTABLERS_HPP