6819077: G1: first GC thread coming late into the GC.

Summary: The first worker thread is delayed when entering the GC because it clears the card count table that is used in identifying hot cards. Replace the card count table with a dynamically sized evicting hash table that includes an epoch based counter. Reviewed-by: iveresov, tonyp

6819077: G1: first GC thread coming late into the GC.
Summary: The first worker thread is delayed when entering the GC because it clears the card count table that is used in identifying hot cards. Replace the card count table with a dynamically sized evicting hash table that includes an epoch based counter. Reviewed-by: iveresov, tonyp
c046dd73 · johnc · 8763012d · c046dd73 · c046dd73 · c046dd73
9 changed files
--- a/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp
+++ b/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp
@@ -25,11 +25,21 @@
 #include "incls/_precompiled.incl"
 #include "incls/_concurrentG1Refine.cpp.incl"

+// Possible sizes for the card counts cache: odd primes that roughly double in size.
+// (See jvmtiTagMap.cpp).
+int ConcurrentG1Refine::_cc_cache_sizes[] = {
+        16381,    32771,    76831,    150001,   307261,
+       614563,  1228891,  2457733,   4915219,  9830479,
+     19660831, 39321619, 78643219, 157286461,       -1
+  };
+
 ConcurrentG1Refine::ConcurrentG1Refine() :
-  _card_counts(NULL), _cur_card_count_histo(NULL), _cum_card_count_histo(NULL),
+  _card_counts(NULL), _card_epochs(NULL),
+  _n_card_counts(0), _max_n_card_counts(0),
+  _cache_size_index(0), _expand_card_counts(false),
  _hot_cache(NULL),
  _def_use_cache(false), _use_cache(false),
-  _n_periods(0), _total_cards(0), _total_travs(0),
+  _n_periods(0),
  _threads(NULL), _n_threads(0)
 {
  if (G1ConcRefine) {
@@ -57,26 +67,39 @@ size_t ConcurrentG1Refine::thread_num() {
 }

 void ConcurrentG1Refine::init() {
-  G1CollectedHeap* g1h = G1CollectedHeap::heap();
-  if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) {
-    _n_card_counts =
-      (unsigned) (g1h->g1_reserved_obj_bytes() >> CardTableModRefBS::card_shift);
-    _card_counts = NEW_C_HEAP_ARRAY(unsigned char, _n_card_counts);
-    for (size_t i = 0; i < _n_card_counts; i++) _card_counts[i] = 0;
-    ModRefBarrierSet* bs = g1h->mr_bs();
-    guarantee(bs->is_a(BarrierSet::CardTableModRef), "Precondition");
-    CardTableModRefBS* ctbs = (CardTableModRefBS*)bs;
-    _ct_bot = ctbs->byte_for_const(g1h->reserved_region().start());
-    if (G1ConcRSCountTraversals) {
-      _cur_card_count_histo = NEW_C_HEAP_ARRAY(unsigned, 256);
-      _cum_card_count_histo = NEW_C_HEAP_ARRAY(unsigned, 256);
-      for (int i = 0; i < 256; i++) {
-        _cur_card_count_histo[i] = 0;
-        _cum_card_count_histo[i] = 0;
-      }
-    }
-  }
  if (G1ConcRSLogCacheSize > 0) {
+    _g1h = G1CollectedHeap::heap();
+    _max_n_card_counts =
+      (unsigned) (_g1h->g1_reserved_obj_bytes() >> CardTableModRefBS::card_shift);
+
+    size_t max_card_num = ((size_t)1 << (sizeof(unsigned)*BitsPerByte-1)) - 1;
+    guarantee(_max_n_card_counts < max_card_num, "card_num representation");
+
+    int desired = _max_n_card_counts / InitialCacheFraction;
+    for (_cache_size_index = 0;
+              _cc_cache_sizes[_cache_size_index] >= 0; _cache_size_index++) {
+      if (_cc_cache_sizes[_cache_size_index] >= desired) break;
+    }
+    _cache_size_index = MAX2(0, (_cache_size_index - 1));
+
+    int initial_size = _cc_cache_sizes[_cache_size_index];
+    if (initial_size < 0) initial_size = _max_n_card_counts;
+
+    // Make sure we don't go bigger than we will ever need
+    _n_card_counts = MIN2((unsigned) initial_size, _max_n_card_counts);
+
+    _card_counts = NEW_C_HEAP_ARRAY(CardCountCacheEntry, _n_card_counts);
+    _card_epochs = NEW_C_HEAP_ARRAY(CardEpochCacheEntry, _n_card_counts);
+
+    Copy::fill_to_bytes(&_card_counts[0],
+                        _n_card_counts * sizeof(CardCountCacheEntry));
+    Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry));
+
+    ModRefBarrierSet* bs = _g1h->mr_bs();
+    guarantee(bs->is_a(BarrierSet::CardTableModRef), "Precondition");
+    _ct_bs = (CardTableModRefBS*)bs;
+    _ct_bot = _ct_bs->byte_for_const(_g1h->reserved_region().start());
+
    _def_use_cache = true;
    _use_cache = true;
    _hot_cache_size = (1 << G1ConcRSLogCacheSize);
@@ -86,7 +109,7 @@ void ConcurrentG1Refine::init() {

    // For refining the cards in the hot cache in parallel
    int n_workers = (ParallelGCThreads > 0 ?
-                        g1h->workers()->total_workers() : 1);
+                        _g1h->workers()->total_workers() : 1);
    _hot_cache_par_chunk_size = MAX2(1, _hot_cache_size / n_workers);
    _hot_cache_par_claimed_idx = 0;
  }
@@ -101,15 +124,11 @@ void ConcurrentG1Refine::stop() {
 }

 ConcurrentG1Refine::~ConcurrentG1Refine() {
-  if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) {
-    assert(_card_counts != NULL, "Logic");
-    FREE_C_HEAP_ARRAY(unsigned char, _card_counts);
-    assert(_cur_card_count_histo != NULL, "Logic");
-    FREE_C_HEAP_ARRAY(unsigned, _cur_card_count_histo);
-    assert(_cum_card_count_histo != NULL, "Logic");
-    FREE_C_HEAP_ARRAY(unsigned, _cum_card_count_histo);
-  }
  if (G1ConcRSLogCacheSize > 0) {
+    assert(_card_counts != NULL, "Logic");
+    FREE_C_HEAP_ARRAY(CardCountCacheEntry, _card_counts);
+    assert(_card_epochs != NULL, "Logic");
+    FREE_C_HEAP_ARRAY(CardEpochCacheEntry, _card_epochs);
    assert(_hot_cache != NULL, "Logic");
    FREE_C_HEAP_ARRAY(jbyte*, _hot_cache);
  }
@@ -129,42 +148,165 @@ void ConcurrentG1Refine::threads_do(ThreadClosure *tc) {
  }
 }

+bool ConcurrentG1Refine::is_young_card(jbyte* card_ptr) {
+  HeapWord* start = _ct_bs->addr_for(card_ptr);
+  HeapRegion* r = _g1h->heap_region_containing(start);
+  if (r != NULL && r->is_young()) {
+    return true;
+  }
+  // This card is not associated with a heap region
+  // so can't be young.
+  return false;
+}
+
+jbyte* ConcurrentG1Refine::add_card_count(jbyte* card_ptr, int* count, bool* defer) {
+  unsigned new_card_num = ptr_2_card_num(card_ptr);
+  unsigned bucket = hash(new_card_num);
+  assert(0 <= bucket && bucket < _n_card_counts, "Bounds");
+
+  CardCountCacheEntry* count_ptr = &_card_counts[bucket];
+  CardEpochCacheEntry* epoch_ptr = &_card_epochs[bucket];
+
+  // We have to construct a new entry if we haven't updated the counts
+  // during the current period, or if the count was updated for a
+  // different card number.
+  unsigned int new_epoch = (unsigned int) _n_periods;
+  julong new_epoch_entry = make_epoch_entry(new_card_num, new_epoch);

-int ConcurrentG1Refine::add_card_count(jbyte* card_ptr) {
-  size_t card_num = (card_ptr - _ct_bot);
-  guarantee(0 <= card_num && card_num < _n_card_counts, "Bounds");
-  unsigned char cnt = _card_counts[card_num];
-  if (cnt < 255) _card_counts[card_num]++;
-  return cnt;
-  _total_travs++;
+  while (true) {
+    // Fetch the previous epoch value
+    julong prev_epoch_entry = epoch_ptr->_value;
+    julong cas_res;
+
+    if (extract_epoch(prev_epoch_entry) != new_epoch) {
+      // This entry has not yet been updated during this period.
+      // Note: we update the epoch value atomically to ensure
+      // that there is only one winner that updates the cached
+      // card_ptr value even though all the refine threads share
+      // the same epoch value.
+
+      cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry,
+                                         (volatile jlong*)&epoch_ptr->_value,
+                                         (jlong) prev_epoch_entry);
+
+      if (cas_res == prev_epoch_entry) {
+        // We have successfully won the race to update the
+        // epoch and card_num value. Make it look like the
+        // count and eviction count were previously cleared.
+        count_ptr->_count = 1;
+        count_ptr->_evict_count = 0;
+        *count = 0;
+        // We can defer the processing of card_ptr
+        *defer = true;
+        return card_ptr;
+      }
+      // We did not win the race to update the epoch field, so some other
+      // thread must have done it. The value that gets returned by CAS
+      // should be the new epoch value.
+      assert(extract_epoch(cas_res) == new_epoch, "unexpected epoch");
+      // We could 'continue' here or just re-read the previous epoch value
+      prev_epoch_entry = epoch_ptr->_value;
+    }
+
+    // The epoch entry for card_ptr has been updated during this period.
+    unsigned old_card_num = extract_card_num(prev_epoch_entry);
+
+    // The card count that will be returned to caller
+    *count = count_ptr->_count;
+
+    // Are we updating the count for the same card?
+    if (new_card_num == old_card_num) {
+      // Same card - just update the count. We could have more than one
+      // thread racing to update count for the current card. It should be
+      // OK not to use a CAS as the only penalty should be some missed
+      // increments of the count which delays identifying the card as "hot".
+
+      if (*count < max_jubyte) count_ptr->_count++;
+      // We can defer the processing of card_ptr
+      *defer = true;
+      return card_ptr;
+    }
+
+    // Different card - evict old card info
+    if (count_ptr->_evict_count < max_jubyte) count_ptr->_evict_count++;
+    if (count_ptr->_evict_count > G1CardCountCacheExpandThreshold) {
+      // Trigger a resize the next time we clear
+      _expand_card_counts = true;
+    }
+
+    cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry,
+                                       (volatile jlong*)&epoch_ptr->_value,
+                                       (jlong) prev_epoch_entry);
+
+    if (cas_res == prev_epoch_entry) {
+      // We successfully updated the card num value in the epoch entry
+      count_ptr->_count = 0; // initialize counter for new card num
+
+      // Even though the region containing the card at old_card_num was not
+      // in the young list when old_card_num was recorded in the epoch
+      // cache it could have been added to the free list and subsequently
+      // added to the young list in the intervening time. If the evicted
+      // card is in a young region just return the card_ptr and the evicted
+      // card will not be cleaned. See CR 6817995.
+
+      jbyte* old_card_ptr = card_num_2_ptr(old_card_num);
+      if (is_young_card(old_card_ptr)) {
+        *count = 0;
+        // We can defer the processing of card_ptr
+        *defer = true;
+        return card_ptr;
+      }
+
+      // We do not want to defer processing of card_ptr in this case
+      // (we need to refine old_card_ptr and card_ptr)
+      *defer = false;
+      return old_card_ptr;
+    }
+    // Someone else beat us - try again.
+  }
 }

-jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr) {
-  int count = add_card_count(card_ptr);
-  // Count previously unvisited cards.
-  if (count == 0) _total_cards++;
-  // We'll assume a traversal unless we store it in the cache.
+jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr, bool* defer) {
+  int count;
+  jbyte* cached_ptr = add_card_count(card_ptr, &count, defer);
+  assert(cached_ptr != NULL, "bad cached card ptr");
+  assert(!is_young_card(cached_ptr), "shouldn't get a card in young region");
+
+  // The card pointer we obtained from card count cache is not hot
+  // so do not store it in the cache; return it for immediate
+  // refining.
  if (count < G1ConcRSHotCardLimit) {
-    _total_travs++;
-    return card_ptr;
+    return cached_ptr;
  }
-  // Otherwise, it's hot.
+
+  // Otherwise, the pointer we got from the _card_counts is hot.
  jbyte* res = NULL;
  MutexLockerEx x(HotCardCache_lock, Mutex::_no_safepoint_check_flag);
  if (_n_hot == _hot_cache_size) {
-    _total_travs++;
    res = _hot_cache[_hot_cache_idx];
    _n_hot--;
  }
  // Now _n_hot < _hot_cache_size, and we can insert at _hot_cache_idx.
-  _hot_cache[_hot_cache_idx] = card_ptr;
+  _hot_cache[_hot_cache_idx] = cached_ptr;
  _hot_cache_idx++;
  if (_hot_cache_idx == _hot_cache_size) _hot_cache_idx = 0;
  _n_hot++;
+
+  if (res != NULL) {
+    // Even though the region containing res was not in the young list
+    // when it was recorded in the hot cache it could have been added
+    // to the free list and subsequently added to the young list in
+    // the intervening time. If res is in a young region, return NULL
+    // so that res is not cleaned. See CR 6817995.
+
+    if (is_young_card(res)) {
+      res = NULL;
+    }
+  }
+
  return res;
 }

-
 void ConcurrentG1Refine::clean_up_cache(int worker_i, G1RemSet* g1rs) {
  assert(!use_cache(), "cache should be disabled");
  int start_idx;
@@ -186,114 +328,52 @@ void ConcurrentG1Refine::clean_up_cache(int worker_i, G1RemSet* g1rs) {
  }
 }

-void ConcurrentG1Refine::clear_and_record_card_counts() {
-  if (G1ConcRSLogCacheSize == 0 && !G1ConcRSCountTraversals) return;
-  _n_periods++;
-  if (G1ConcRSCountTraversals) {
-    for (size_t i = 0; i < _n_card_counts; i++) {
-      unsigned char bucket = _card_counts[i];
-      _cur_card_count_histo[bucket]++;
-      _card_counts[i] = 0;
-    }
-    gclog_or_tty->print_cr("Card counts:");
-    for (int i = 0; i < 256; i++) {
-      if (_cur_card_count_histo[i] > 0) {
-        gclog_or_tty->print_cr("  %3d: %9d", i, _cur_card_count_histo[i]);
-        _cum_card_count_histo[i] += _cur_card_count_histo[i];
-        _cur_card_count_histo[i] = 0;
-      }
-    }
-  } else {
-    assert(G1ConcRSLogCacheSize > 0, "Logic");
-    Copy::fill_to_words((HeapWord*)(&_card_counts[0]),
-                        _n_card_counts / HeapWordSize);
-  }
-}
+void ConcurrentG1Refine::expand_card_count_cache() {
+  if (_n_card_counts < _max_n_card_counts) {
+    int new_idx = _cache_size_index+1;
+    int new_size = _cc_cache_sizes[new_idx];
+    if (new_size < 0) new_size = _max_n_card_counts;

-void
-ConcurrentG1Refine::
-print_card_count_histo_range(unsigned* histo, int from, int to,
-                             float& cum_card_pct,
-                             float& cum_travs_pct) {
-  unsigned cards = 0;
-  unsigned travs = 0;
-  guarantee(to <= 256, "Precondition");
-  for (int i = from; i < to-1; i++) {
-    cards += histo[i];
-    travs += histo[i] * i;
-  }
-  if (to == 256) {
-    unsigned histo_card_sum = 0;
-    unsigned histo_trav_sum = 0;
-    for (int i = 1; i < 255; i++) {
-      histo_trav_sum += histo[i] * i;
-    }
-    cards += histo[255];
-    // correct traversals for the last one.
-    unsigned travs_255 = (unsigned) (_total_travs - histo_trav_sum);
-    travs += travs_255;
-
-  } else {
-    cards += histo[to-1];
-    travs += histo[to-1] * (to-1);
-  }
-  float fperiods = (float)_n_periods;
-  float f_tot_cards = (float)_total_cards/fperiods;
-  float f_tot_travs = (float)_total_travs/fperiods;
-  if (cards > 0) {
-    float fcards = (float)cards/fperiods;
-    float ftravs = (float)travs/fperiods;
-    if (to == 256) {
-      gclog_or_tty->print(" %4d-       %10.2f%10.2f", from, fcards, ftravs);
-    } else {
-      gclog_or_tty->print(" %4d-%4d   %10.2f%10.2f", from, to-1, fcards, ftravs);
+    // Make sure we don't go bigger than we will ever need
+    new_size = MIN2((unsigned) new_size, _max_n_card_counts);
+
+    // Expand the card count and card epoch tables
+    if (new_size > (int)_n_card_counts) {
+      // We can just free and allocate a new array as we're
+      // not interested in preserving the contents
+      assert(_card_counts != NULL, "Logic!");
+      assert(_card_epochs != NULL, "Logic!");
+      FREE_C_HEAP_ARRAY(CardCountCacheEntry, _card_counts);
+      FREE_C_HEAP_ARRAY(CardEpochCacheEntry, _card_epochs);
+      _n_card_counts = new_size;
+      _card_counts = NEW_C_HEAP_ARRAY(CardCountCacheEntry, _n_card_counts);
+      _card_epochs = NEW_C_HEAP_ARRAY(CardEpochCacheEntry, _n_card_counts);
+      _cache_size_index = new_idx;
    }
-    float pct_cards = fcards*100.0/f_tot_cards;
-    cum_card_pct += pct_cards;
-    float pct_travs = ftravs*100.0/f_tot_travs;
-    cum_travs_pct += pct_travs;
-    gclog_or_tty->print_cr("%10.2f%10.2f%10.2f%10.2f",
-                  pct_cards, cum_card_pct,
-                  pct_travs, cum_travs_pct);
  }
 }

-void ConcurrentG1Refine::print_final_card_counts() {
-  if (!G1ConcRSCountTraversals) return;
-
-  gclog_or_tty->print_cr("Did %d total traversals of %d distinct cards.",
-                _total_travs, _total_cards);
-  float fperiods = (float)_n_periods;
-  gclog_or_tty->print_cr("  This is an average of %8.2f traversals, %8.2f cards, "
-                "per collection.", (float)_total_travs/fperiods,
-                (float)_total_cards/fperiods);
-  gclog_or_tty->print_cr("  This is an average of %8.2f traversals/distinct "
-                "dirty card.\n",
-                _total_cards > 0 ?
-                (float)_total_travs/(float)_total_cards : 0.0);
-
-
-  gclog_or_tty->print_cr("Histogram:\n\n%10s   %10s%10s%10s%10s%10s%10s",
-                "range", "# cards", "# travs", "% cards", "(cum)",
-                "% travs", "(cum)");
-  gclog_or_tty->print_cr("------------------------------------------------------------"
-                "-------------");
-  float cum_cards_pct = 0.0;
-  float cum_travs_pct = 0.0;
-  for (int i = 1; i < 10; i++) {
-    print_card_count_histo_range(_cum_card_count_histo, i, i+1,
-                                 cum_cards_pct, cum_travs_pct);
-  }
-  for (int i = 10; i < 100; i += 10) {
-    print_card_count_histo_range(_cum_card_count_histo, i, i+10,
-                                 cum_cards_pct, cum_travs_pct);
+void ConcurrentG1Refine::clear_and_record_card_counts() {
+  if (G1ConcRSLogCacheSize == 0) return;
+
+#ifndef PRODUCT
+  double start = os::elapsedTime();
+#endif
+
+  if (_expand_card_counts) {
+    expand_card_count_cache();
+    _expand_card_counts = false;
+    // Only need to clear the epochs.
+    Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry));
  }
-  print_card_count_histo_range(_cum_card_count_histo, 100, 150,
-                               cum_cards_pct, cum_travs_pct);
-  print_card_count_histo_range(_cum_card_count_histo, 150, 200,
-                               cum_cards_pct, cum_travs_pct);
-  print_card_count_histo_range(_cum_card_count_histo, 150, 255,
-                               cum_cards_pct, cum_travs_pct);
-  print_card_count_histo_range(_cum_card_count_histo, 255, 256,
-                               cum_cards_pct, cum_travs_pct);
+
+  int this_epoch = (int) _n_periods;
+  assert((this_epoch+1) <= max_jint, "to many periods");
+  // Update epoch
+  _n_periods++;
+
+#ifndef PRODUCT
+  double elapsed = os::elapsedTime() - start;
+  _g1h->g1_policy()->record_cc_clear_time(elapsed * 1000.0);
+#endif
 }
--- a/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp
+++ b/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp
@@ -29,18 +29,77 @@ class G1RemSet;
 class ConcurrentG1Refine: public CHeapObj {
  ConcurrentG1RefineThread** _threads;
  int _n_threads;
+
  // The cache for card refinement.
-  bool     _use_cache;
-  bool     _def_use_cache;
-  size_t _n_periods;
-  size_t _total_cards;
-  size_t _total_travs;
-
-  unsigned char* _card_counts;
-  unsigned       _n_card_counts;
-  const jbyte*   _ct_bot;
-  unsigned*      _cur_card_count_histo;
-  unsigned*      _cum_card_count_histo;
+  bool   _use_cache;
+  bool   _def_use_cache;
+
+  size_t _n_periods;    // Used as clearing epoch
+
+  // An evicting cache of the number of times each card
+  // is accessed. Reduces, but does not eliminate, the amount
+  // of duplicated processing of dirty cards.
+
+  enum SomePrivateConstants {
+    epoch_bits           = 32,
+    card_num_shift       = epoch_bits,
+    epoch_mask           = AllBits,
+    card_num_mask        = AllBits,
+
+    // The initial cache size is approximately this fraction
+    // of a maximal cache (i.e. the size needed for all cards
+    // in the heap)
+    InitialCacheFraction = 512
+  };
+
+  const static julong card_num_mask_in_place =
+                        (julong) card_num_mask << card_num_shift;
+
+  typedef struct {
+    julong _value;      // |  card_num   |  epoch   |
+  } CardEpochCacheEntry;
+
+  julong make_epoch_entry(unsigned int card_num, unsigned int epoch) {
+    assert(0 <= card_num && card_num < _max_n_card_counts, "Bounds");
+    assert(0 <= epoch && epoch <= _n_periods, "must be");
+
+    return ((julong) card_num << card_num_shift) | epoch;
+  }
+
+  unsigned int extract_epoch(julong v) {
+    return (v & epoch_mask);
+  }
+
+  unsigned int extract_card_num(julong v) {
+    return (v & card_num_mask_in_place) >> card_num_shift;
+  }
+
+  typedef struct {
+    unsigned char _count;
+    unsigned char _evict_count;
+  } CardCountCacheEntry;
+
+  CardCountCacheEntry* _card_counts;
+  CardEpochCacheEntry* _card_epochs;
+
+  // The current number of buckets in the card count cache
+  unsigned _n_card_counts;
+
+  // The max number of buckets required for the number of
+  // cards for the entire reserved heap
+  unsigned _max_n_card_counts;
+
+  // Possible sizes of the cache: odd primes that roughly double in size.
+  // (See jvmtiTagMap.cpp).
+  static int _cc_cache_sizes[];
+
+  // The index in _cc_cache_sizes corresponding to the size of
+  // _card_counts.
+  int _cache_size_index;
+
+  bool _expand_card_counts;
+
+  const jbyte* _ct_bot;

  jbyte**      _hot_cache;
  int          _hot_cache_size;
@@ -50,12 +109,37 @@ class ConcurrentG1Refine: public CHeapObj {
  int          _hot_cache_par_chunk_size;
  volatile int _hot_cache_par_claimed_idx;

+  // Needed to workaround 6817995
+  CardTableModRefBS* _ct_bs;
+  G1CollectedHeap*   _g1h;
+
+  // Expands the array that holds the card counts to the next size up
+  void expand_card_count_cache();
+
+  // hash a given key (index of card_ptr) with the specified size
+  static unsigned int hash(size_t key, int size) {
+    return (unsigned int) key % size;
+  }
+
+  // hash a given key (index of card_ptr)
+  unsigned int hash(size_t key) {
+    return hash(key, _n_card_counts);
+  }
+
+  unsigned ptr_2_card_num(jbyte* card_ptr) {
+    return (unsigned) (card_ptr - _ct_bot);
+  }
+
+  jbyte* card_num_2_ptr(unsigned card_num) {
+    return (jbyte*) (_ct_bot + card_num);
+  }
+
  // Returns the count of this card after incrementing it.
-  int add_card_count(jbyte* card_ptr);
+  jbyte* add_card_count(jbyte* card_ptr, int* count, bool* defer);
+
+  // Returns true if this card is in a young region
+  bool is_young_card(jbyte* card_ptr);

-  void print_card_count_histo_range(unsigned* histo, int from, int to,
-                                    float& cum_card_pct,
-                                    float& cum_travs_pct);
 public:
  ConcurrentG1Refine();
  ~ConcurrentG1Refine();
@@ -69,7 +153,7 @@ class ConcurrentG1Refine: public CHeapObj {
  // If this is the first entry for the slot, writes into the cache and
  // returns NULL.  If it causes an eviction, returns the evicted pointer.
  // Otherwise, its a cache hit, and returns NULL.
-  jbyte* cache_insert(jbyte* card_ptr);
+  jbyte* cache_insert(jbyte* card_ptr, bool* defer);

  // Process the cached entries.
  void clean_up_cache(int worker_i, G1RemSet* g1rs);
@@ -93,7 +177,6 @@ class ConcurrentG1Refine: public CHeapObj {
  }

  void clear_and_record_card_counts();
-  void print_final_card_counts();

  static size_t thread_num();
 };
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
@@ -2414,8 +2414,6 @@ void G1CollectedHeap::gc_threads_do(ThreadClosure* tc) const {
 }

 void G1CollectedHeap::print_tracing_info() const {
-  concurrent_g1_refine()->print_final_card_counts();
-
  // We'll overload this to mean "trace GC pause statistics."
  if (TraceGen0Time || TraceGen1Time) {
    // The "G1CollectorPolicy" is keeping track of these stats, so delegate

--- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
@@ -94,7 +94,14 @@ G1CollectorPolicy::G1CollectorPolicy() :
  _summary(new Summary()),
  _abandoned_summary(new AbandonedSummary()),

+#ifndef PRODUCT
  _cur_clear_ct_time_ms(0.0),
+  _min_clear_cc_time_ms(-1.0),
+  _max_clear_cc_time_ms(-1.0),
+  _cur_clear_cc_time_ms(0.0),
+  _cum_clear_cc_time_ms(0.0),
+  _num_cc_clears(0L),
+#endif

  _region_num_young(0),
  _region_num_tenured(0),
@@ -1648,6 +1655,15 @@ void G1CollectorPolicy::record_collection_pause_end(bool abandoned) {
        print_stats(1, "Object Copying", obj_copy_time);
      }
    }
+#ifndef PRODUCT
+    print_stats(1, "Cur Clear CC", _cur_clear_cc_time_ms);
+    print_stats(1, "Cum Clear CC", _cum_clear_cc_time_ms);
+    print_stats(1, "Min Clear CC", _min_clear_cc_time_ms);
+    print_stats(1, "Max Clear CC", _max_clear_cc_time_ms);
+    if (_num_cc_clears > 0) {
+      print_stats(1, "Avg Clear CC", _cum_clear_cc_time_ms / ((double)_num_cc_clears));
+    }
+#endif
    print_stats(1, "Other", other_time_ms);
    for (int i = 0; i < _aux_num; ++i) {
      if (_cur_aux_times_set[i]) {

--- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp
@@ -112,7 +112,6 @@ protected:
    return 8*M;
  }

-
  double _cur_collection_start_sec;
  size_t _cur_collection_pause_used_at_start_bytes;
  size_t _cur_collection_pause_used_regions_at_start;
@@ -122,6 +121,15 @@ protected:
  double _cur_clear_ct_time_ms;
  bool   _satb_drain_time_set;

+#ifndef PRODUCT
+  // Card Table Count Cache stats
+  double _min_clear_cc_time_ms;         // min
+  double _max_clear_cc_time_ms;         // max
+  double _cur_clear_cc_time_ms;         // clearing time during current pause
+  double _cum_clear_cc_time_ms;         // cumulative clearing time
+  jlong  _num_cc_clears;                // number of times the card count cache has been cleared
+#endif
+
  double _cur_CH_strong_roots_end_sec;
  double _cur_CH_strong_roots_dur_ms;
  double _cur_G1_strong_roots_end_sec;
@@ -931,6 +939,18 @@ public:
    _cur_aux_times_ms[i] += ms;
  }

+#ifndef PRODUCT
+  void record_cc_clear_time(double ms) {
+    if (_min_clear_cc_time_ms < 0.0 || ms <= _min_clear_cc_time_ms)
+      _min_clear_cc_time_ms = ms;
+    if (_max_clear_cc_time_ms < 0.0 || ms >= _max_clear_cc_time_ms)
+      _max_clear_cc_time_ms = ms;
+    _cur_clear_cc_time_ms = ms;
+    _cum_clear_cc_time_ms += ms;
+    _num_cc_clears++;
+  }
+#endif
+
  // Record the fact that "bytes" bytes allocated in a region.
  void record_before_bytes(size_t bytes);
  void record_after_bytes(size_t bytes);

--- a/src/share/vm/gc_implementation/g1/g1RemSet.cpp
+++ b/src/share/vm/gc_implementation/g1/g1RemSet.cpp
@@ -676,61 +676,12 @@ void HRInto_G1RemSet::scrub_par(BitMap* region_bm, BitMap* card_bm,

 static IntHistogram out_of_histo(50, 50);

-void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) {
-  // If the card is no longer dirty, nothing to do.
-  if (*card_ptr != CardTableModRefBS::dirty_card_val()) return;
-
+void HRInto_G1RemSet::concurrentRefineOneCard_impl(jbyte* card_ptr, int worker_i) {
  // Construct the region representing the card.
  HeapWord* start = _ct_bs->addr_for(card_ptr);
  // And find the region containing it.
  HeapRegion* r = _g1->heap_region_containing(start);
-  if (r == NULL) {
-    guarantee(_g1->is_in_permanent(start), "Or else where?");
-    return;  // Not in the G1 heap (might be in perm, for example.)
-  }
-  // Why do we have to check here whether a card is on a young region,
-  // given that we dirty young regions and, as a result, the
-  // post-barrier is supposed to filter them out and never to enqueue
-  // them? When we allocate a new region as the "allocation region" we
-  // actually dirty its cards after we release the lock, since card
-  // dirtying while holding the lock was a performance bottleneck. So,
-  // as a result, it is possible for other threads to actually
-  // allocate objects in the region (after the acquire the lock)
-  // before all the cards on the region are dirtied. This is unlikely,
-  // and it doesn't happen often, but it can happen. So, the extra
-  // check below filters out those cards.
-  if (r->is_young()) {
-    return;
-  }
-  // While we are processing RSet buffers during the collection, we
-  // actually don't want to scan any cards on the collection set,
-  // since we don't want to update remebered sets with entries that
-  // point into the collection set, given that live objects from the
-  // collection set are about to move and such entries will be stale
-  // very soon. This change also deals with a reliability issue which
-  // involves scanning a card in the collection set and coming across
-  // an array that was being chunked and looking malformed. Note,
-  // however, that if evacuation fails, we have to scan any objects
-  // that were not moved and create any missing entries.
-  if (r->in_collection_set()) {
-    return;
-  }
-
-  // Should we defer it?
-  if (_cg1r->use_cache()) {
-    card_ptr = _cg1r->cache_insert(card_ptr);
-    // If it was not an eviction, nothing to do.
-    if (card_ptr == NULL) return;
-
-    // OK, we have to reset the card start, region, etc.
-    start = _ct_bs->addr_for(card_ptr);
-    r = _g1->heap_region_containing(start);
-    if (r == NULL) {
-      guarantee(_g1->is_in_permanent(start), "Or else where?");
-      return;  // Not in the G1 heap (might be in perm, for example.)
-    }
-    guarantee(!r->is_young(), "It was evicted in the current minor cycle.");
-  }
+  assert(r != NULL, "unexpected null");

  HeapWord* end   = _ct_bs->addr_for(card_ptr + 1);
  MemRegion dirtyRegion(start, end);
@@ -774,6 +725,106 @@ void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) {
  }
 }

+// Refines one dirty card: filters out cards that need no work, optionally
+// passes the card through cache_insert, then calls concurrentRefineOneCard_impl.
+void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) {
+  // If the card is no longer dirty, nothing to do.
+  if (*card_ptr != CardTableModRefBS::dirty_card_val()) return;
+
+  // Find the start address of the card and the region containing it.
+  HeapWord* start = _ct_bs->addr_for(card_ptr);
+  HeapRegion* r = _g1->heap_region_containing(start);
+  if (r == NULL) {
+    guarantee(_g1->is_in_permanent(start), "Or else where?");
+    return;  // Not in the G1 heap (might be in perm, for example.)
+  }
+  // Why do we have to check here whether a card is on a young region,
+  // given that we dirty young regions and, as a result, the
+  // post-barrier is supposed to filter them out and never to enqueue
+  // them? When we allocate a new region as the "allocation region" we
+  // actually dirty its cards after we release the lock, since card
+  // dirtying while holding the lock was a performance bottleneck. So,
+  // as a result, it is possible for other threads to actually
+  // allocate objects in the region (after they acquire the lock)
+  // before all the cards on the region are dirtied. This is unlikely,
+  // and it doesn't happen often, but it can happen. So, the extra
+  // check below filters out those cards.
+  if (r->is_young()) {
+    return;
+  }
+  // While we are processing RSet buffers during the collection, we
+  // actually don't want to scan any cards on the collection set,
+  // since we don't want to update remembered sets with entries that
+  // point into the collection set, given that live objects from the
+  // collection set are about to move and such entries will be stale
+  // very soon. This change also deals with a reliability issue which
+  // involves scanning a card in the collection set and coming across
+  // an array that was being chunked and looking malformed. Note,
+  // however, that if evacuation fails, we have to scan any objects
+  // that were not moved and create any missing entries.
+  if (r->in_collection_set()) {
+    return;
+  }
+
+  // Should we defer processing the card?
+  //
+  // Previously the result from the cache_insert call would be
+  // either card_ptr (implying that card_ptr was currently "cold"),
+  // null (meaning we had inserted the card ptr into the "hot"
+  // cache, which had some headroom), or a "hot" card ptr
+  // extracted from the "hot" cache.
+  //
+  // Now that the _card_counts cache in the ConcurrentG1Refine
+  // instance is an evicting hash table, the result we get back
+  // could be from evicting the card ptr in an already occupied
+  // bucket (in which case we have replaced the card ptr in the
+  // bucket with card_ptr and "defer" is set to false). To avoid
+  // having a data structure (updates to which would need a lock)
+  // to hold these unprocessed dirty cards, we need to immediately
+  // process card_ptr. The actions needed to be taken on return
+  // from cache_insert are summarized in the following table:
+  //
+  // res      defer   action
+  // --------------------------------------------------------------
+  // null     false   card evicted from _card_counts & replaced with
+  //                  card_ptr; evicted ptr added to hot cache.
+  //                  No need to process res; immediately process card_ptr
+  //
+  // null     true    card not evicted from _card_counts; card_ptr added
+  //                  to hot cache.
+  //                  Nothing to do.
+  //
+  // non-null false   card evicted from _card_counts & replaced with
+  //                  card_ptr; evicted ptr is currently "cold" or
+  //                  caused an eviction from the hot cache.
+  //                  Immediately process res; process card_ptr.
+  //
+  // non-null true    card not evicted from _card_counts; card_ptr is
+  //                  currently cold, or caused an eviction from hot
+  //                  cache.
+  //                  Immediately process res; no need to process card_ptr.
+
+  bool defer = false;  // passed to cache_insert; stays false if the cache is unused
+  if (_cg1r->use_cache()) {
+    jbyte* res = _cg1r->cache_insert(card_ptr, &defer);
+    if (res != NULL && (res != card_ptr || defer)) {
+      start = _ct_bs->addr_for(res);
+      r = _g1->heap_region_containing(start);
+      if (r == NULL) {
+        assert(_g1->is_in_permanent(start), "Or else where?");
+      } else {
+        guarantee(!r->is_young(), "It was evicted in the current minor cycle.");
+        // Process card pointer we get back from the hot card cache
+        concurrentRefineOneCard_impl(res, worker_i);
+      }
+    }
+  }
+
+  if (!defer) {
+    concurrentRefineOneCard_impl(card_ptr, worker_i);
+  }
+}
+
 class HRRSStatsIter: public HeapRegionClosure {
  size_t _occupied;
  size_t _total_mem_sz;

--- a/src/share/vm/gc_implementation/g1/g1RemSet.hpp
+++ b/src/share/vm/gc_implementation/g1/g1RemSet.hpp
@@ -157,6 +157,10 @@ protected:
    }
  }

+  // The routine that performs the actual work of refining a dirty
+  // card.
+  void concurrentRefineOneCard_impl(jbyte* card_ptr, int worker_i);
+
 protected:
  template <class T> void write_ref_nv(HeapRegion* from, T* p);
  template <class T> void par_write_ref_nv(HeapRegion* from, T* p, int tid);

--- a/src/share/vm/gc_implementation/g1/g1_globals.hpp
+++ b/src/share/vm/gc_implementation/g1/g1_globals.hpp
@@ -187,10 +187,6 @@
  develop(intx, G1ConcRSLogCacheSize, 10,                                   \
          "Log base 2 of the length of conc RS hot-card cache.")            \
                                                                            \
-  develop(bool, G1ConcRSCountTraversals, false,                             \
-          "If true, gather data about the number of times CR traverses "    \
-          "cards ")                                                         \
-                                                                            \
  develop(intx, G1ConcRSHotCardLimit, 4,                                    \
          "The threshold that defines (>=) a hot card.")                    \
                                                                            \
@@ -264,6 +260,10 @@
                                                                            \
  product(uintx, G1ParallelRSetThreads, 0,                                  \
          "If non-0 is the number of parallel rem set update threads, "     \
-          "otherwise the value is determined ergonomically.")
+          "otherwise the value is determined ergonomically.")               \
+                                                                            \
+  develop(intx, G1CardCountCacheExpandThreshold, 16,                        \
+          "Expand the card count cache if the number of collisions for "    \
+          "a particular entry exceeds this value.")

 G1_FLAGS(DECLARE_DEVELOPER_FLAG, DECLARE_PD_DEVELOPER_FLAG, DECLARE_PRODUCT_FLAG, DECLARE_PD_PRODUCT_FLAG, DECLARE_DIAGNOSTIC_FLAG, DECLARE_EXPERIMENTAL_FLAG, DECLARE_NOTPRODUCT_FLAG, DECLARE_MANAGEABLE_FLAG, DECLARE_PRODUCT_RW_FLAG)
--- a/src/share/vm/gc_implementation/includeDB_gc_g1
+++ b/src/share/vm/gc_implementation/includeDB_gc_g1
@@ -45,11 +45,14 @@ concurrentG1Refine.cpp			concurrentG1Refine.hpp
 concurrentG1Refine.cpp			concurrentG1RefineThread.hpp
 concurrentG1Refine.cpp			copy.hpp
 concurrentG1Refine.cpp			g1CollectedHeap.inline.hpp
+concurrentG1Refine.cpp                  g1CollectorPolicy.hpp
 concurrentG1Refine.cpp			g1RemSet.hpp
 concurrentG1Refine.cpp			space.inline.hpp
+concurrentG1Refine.cpp                  heapRegionSeq.inline.hpp

 concurrentG1Refine.hpp			globalDefinitions.hpp
 concurrentG1Refine.hpp			allocation.hpp
+concurrentG1Refine.hpp                  cardTableModRefBS.hpp
 concurrentG1Refine.hpp			thread.hpp

 concurrentG1RefineThread.cpp		concurrentG1Refine.hpp