From 8763012d211ea6deb9450dd3cdaf85bbafa8e69a Mon Sep 17 00:00:00 2001
From: johnc <unknown>
Date: Mon, 3 Aug 2009 12:59:30 -0700
Subject: [PATCH] 6865703: G1: Parallelize hot card cache cleanup Summary: Have
 the GC worker threads clear the hot card cache in parallel by having each
 worker thread claim a chunk of the card cache and process the cards in that
 chunk. The size of the chunks that each thread will claim is determined at VM
 initialization from the size of the card cache and the number of worker
 threads. Reviewed-by: jmasa, tonyp

---
 .../g1/concurrentG1Refine.cpp                 | 32 +++++++++++++------
 .../g1/concurrentG1Refine.hpp                 | 27 ++++++++++------
 .../gc_implementation/g1/g1CollectedHeap.cpp  |  9 ++++--
 3 files changed, 46 insertions(+), 22 deletions(-)
diff --git a/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp b/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp
index 3473fac01..4b6aafbd2 100644
--- a/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp
+++ b/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp
@@ -57,8 +57,8 @@ size_t ConcurrentG1Refine::thread_num() {
 }
 
 void ConcurrentG1Refine::init() {
+  G1CollectedHeap* g1h = G1CollectedHeap::heap();
   if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) {
-    G1CollectedHeap* g1h = G1CollectedHeap::heap();
     _n_card_counts =
       (unsigned) (g1h->g1_reserved_obj_bytes() >> CardTableModRefBS::card_shift);
     _card_counts = NEW_C_HEAP_ARRAY(unsigned char, _n_card_counts);
@@ -83,6 +83,12 @@ void ConcurrentG1Refine::init() {
     _hot_cache = NEW_C_HEAP_ARRAY(jbyte*, _hot_cache_size);
     _n_hot = 0;
     _hot_cache_idx = 0;
+
+    // For refining the cards in the hot cache in parallel
+    int n_workers = (ParallelGCThreads > 0 ?
+                        g1h->workers()->total_workers() : 1);
+    _hot_cache_par_chunk_size = MAX2(1, _hot_cache_size / n_workers);
+    _hot_cache_par_claimed_idx = 0;
   }
 }
 
@@ -161,17 +167,23 @@ jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr) {
 
 void ConcurrentG1Refine::clean_up_cache(int worker_i, G1RemSet* g1rs) {
   assert(!use_cache(), "cache should be disabled");
-  int start_ind = _hot_cache_idx-1;
-  for (int i = 0; i < _n_hot; i++) {
-    int ind = start_ind - i;
-    if (ind < 0) ind = ind + _hot_cache_size;
-    jbyte* entry = _hot_cache[ind];
-    if (entry != NULL) {
-      g1rs->concurrentRefineOneCard(entry, worker_i);
+  int start_idx;
+
+  while ((start_idx = _hot_cache_par_claimed_idx) < _n_hot) { // one read of the shared claim index per attempt; reused below as the cmpxchg compare value
+    int end_idx = start_idx + _hot_cache_par_chunk_size;
+
+    if (start_idx ==
+        Atomic::cmpxchg(end_idx, &_hot_cache_par_claimed_idx, start_idx)) {
+      // Claim succeeded (cmpxchg returned start_idx): this worker now owns the chunk [start_idx..end_idx)
+      end_idx = MIN2(end_idx, _n_hot); // the claimed chunk may run past the _n_hot entries in use
+      for (int i = start_idx; i < end_idx; i++) {
+        jbyte* entry = _hot_cache[i];
+        if (entry != NULL) {
+          g1rs->concurrentRefineOneCard(entry, worker_i);
+        }
+      }
     }
   }
-  _n_hot = 0;
-  _hot_cache_idx = 0;
 }
 
 void ConcurrentG1Refine::clear_and_record_card_counts() {
diff --git a/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp b/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp
index 830e19ee6..279996448 100644
--- a/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp
+++ b/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp
@@ -36,15 +36,19 @@ class ConcurrentG1Refine: public CHeapObj {
   size_t _total_cards;
   size_t _total_travs;
 
-  unsigned char*  _card_counts;
-  unsigned _n_card_counts;
-  const jbyte* _ct_bot;
-  unsigned* _cur_card_count_histo;
-  unsigned* _cum_card_count_histo;
-  jbyte**  _hot_cache;
-  int      _hot_cache_size;
-  int      _n_hot;
-  int      _hot_cache_idx;
+  unsigned char* _card_counts;
+  unsigned       _n_card_counts;
+  const jbyte*   _ct_bot;
+  unsigned*      _cur_card_count_histo;
+  unsigned*      _cum_card_count_histo;
+
+  jbyte**      _hot_cache;
+  int          _hot_cache_size;
+  int          _n_hot;
+  int          _hot_cache_idx;
+
+  int          _hot_cache_par_chunk_size;   // entries claimed per step in clean_up_cache(); set in init()
+  volatile int _hot_cache_par_claimed_idx;  // next unclaimed hot cache index; advanced with Atomic::cmpxchg
 
   // Returns the count of this card after incrementing it.
   int add_card_count(jbyte* card_ptr);
@@ -70,6 +74,11 @@ class ConcurrentG1Refine: public CHeapObj {
   // Process the cached entries.
   void clean_up_cache(int worker_i, G1RemSet* g1rs);
 
+  // Reset the claim index read by clean_up_cache() so that chunk claiming restarts at hot cache entry 0
+  void clear_hot_cache_claimed_index() {
+    _hot_cache_par_claimed_idx = 0;
+  }
+
   // Discard entries in the hot cache.
   void clear_hot_cache() {
     _hot_cache_idx = 0; _n_hot = 0;
diff --git a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
index 992df850a..86872d6e0 100644
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
@@ -1637,6 +1637,9 @@ size_t G1CollectedHeap::capacity() const {
 
 void G1CollectedHeap::iterate_dirty_card_closure(bool concurrent,
                                                  int worker_i) {
+  // Clean cards in the hot card cache
+  concurrent_g1_refine()->clean_up_cache(worker_i, g1_rem_set());
+
   DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set();
   int n_completed_buffers = 0;
   while (dcqs.apply_closure_to_completed_buffer(worker_i, 0, true)) {
@@ -1645,9 +1648,6 @@ void G1CollectedHeap::iterate_dirty_card_closure(bool concurrent,
   g1_policy()->record_update_rs_processed_buffers(worker_i,
                                                   (double) n_completed_buffers);
   dcqs.clear_n_completed_buffers();
-  // Finish up the queue...
-  if (worker_i == 0) concurrent_g1_refine()->clean_up_cache(worker_i,
-                                                            g1_rem_set());
   assert(!dcqs.completed_buffers_exist_dirty(), "Completed buffers exist!");
 }
 
@@ -4111,6 +4111,8 @@ void G1CollectedHeap::evacuate_collection_set() {
 
   g1_rem_set()->prepare_for_oops_into_collection_set_do();
   concurrent_g1_refine()->set_use_cache(false);
+  concurrent_g1_refine()->clear_hot_cache_claimed_index();
+
   int n_workers = (ParallelGCThreads > 0 ? workers()->total_workers() : 1);
   set_par_threads(n_workers);
   G1ParTask g1_par_task(this, n_workers, _task_queues);
@@ -4143,6 +4145,7 @@ void G1CollectedHeap::evacuate_collection_set() {
   }
   g1_rem_set()->cleanup_after_oops_into_collection_set_do();
 
+  concurrent_g1_refine()->clear_hot_cache();
   concurrent_g1_refine()->set_use_cache(true);
 
   finalize_for_evac_failure();
-- 
GitLab