diff --git a/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp b/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp
index 3473fac01866d42a34e5bb981c590b01639636d3..4b6aafbd2fdc7103511855deb8c7b948ad904541 100644
--- a/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp
+++ b/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp
@@ -57,8 +57,8 @@ size_t ConcurrentG1Refine::thread_num() {
 }
 
 void ConcurrentG1Refine::init() {
+  G1CollectedHeap* g1h = G1CollectedHeap::heap();
   if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) {
-    G1CollectedHeap* g1h = G1CollectedHeap::heap();
     _n_card_counts =
       (unsigned) (g1h->g1_reserved_obj_bytes() >> CardTableModRefBS::card_shift);
     _card_counts = NEW_C_HEAP_ARRAY(unsigned char, _n_card_counts);
@@ -83,6 +83,12 @@ void ConcurrentG1Refine::init() {
     _hot_cache = NEW_C_HEAP_ARRAY(jbyte*, _hot_cache_size);
     _n_hot = 0;
     _hot_cache_idx = 0;
+
+    // For refining the cards in the hot cache in parallel
+    int n_workers = (ParallelGCThreads > 0 ?
+                        g1h->workers()->total_workers() : 1);
+    _hot_cache_par_chunk_size = MAX2(1, _hot_cache_size / n_workers);
+    _hot_cache_par_claimed_idx = 0;
   }
 }
 
@@ -161,17 +167,23 @@ jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr) {
 
 void ConcurrentG1Refine::clean_up_cache(int worker_i, G1RemSet* g1rs) {
   assert(!use_cache(), "cache should be disabled");
-  int start_ind = _hot_cache_idx-1;
-  for (int i = 0; i < _n_hot; i++) {
-    int ind = start_ind - i;
-    if (ind < 0) ind = ind + _hot_cache_size;
-    jbyte* entry = _hot_cache[ind];
-    if (entry != NULL) {
-      g1rs->concurrentRefineOneCard(entry, worker_i);
+  int start_idx;
+
+  while ((start_idx = _hot_cache_par_claimed_idx) < _n_hot) { // read the shared claim index once per iteration
+    int end_idx = start_idx + _hot_cache_par_chunk_size;
+
+    if (start_idx ==
+        Atomic::cmpxchg(end_idx, &_hot_cache_par_claimed_idx, start_idx)) {
+      // The current worker has successfully claimed the chunk [start_idx..end_idx)
+      end_idx = MIN2(end_idx, _n_hot);
+      for (int i = start_idx; i < end_idx; i++) {
+        jbyte* entry = _hot_cache[i];
+        if (entry != NULL) {
+          g1rs->concurrentRefineOneCard(entry, worker_i);
+        }
+      }
     }
   }
-  _n_hot = 0;
-  _hot_cache_idx = 0;
 }
 
 void ConcurrentG1Refine::clear_and_record_card_counts() {
diff --git a/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp b/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp
index 830e19ee6338aa63d26bb03cdb96954195f1362b..27999644807421b05159b5c14448fa08294527ff 100644
--- a/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp
+++ b/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp
@@ -36,15 +36,19 @@ class ConcurrentG1Refine: public CHeapObj {
   size_t _total_cards;
   size_t _total_travs;
 
-  unsigned char*  _card_counts;
-  unsigned _n_card_counts;
-  const jbyte* _ct_bot;
-  unsigned* _cur_card_count_histo;
-  unsigned* _cum_card_count_histo;
-  jbyte**  _hot_cache;
-  int      _hot_cache_size;
-  int      _n_hot;
-  int      _hot_cache_idx;
+  unsigned char* _card_counts;
+  unsigned       _n_card_counts;
+  const jbyte*   _ct_bot;
+  unsigned*      _cur_card_count_histo;
+  unsigned*      _cum_card_count_histo;
+
+  jbyte**      _hot_cache;
+  int          _hot_cache_size;
+  int          _n_hot;
+  int          _hot_cache_idx;
+
+  int          _hot_cache_par_chunk_size;
+  volatile int _hot_cache_par_claimed_idx;
 
   // Returns the count of this card after incrementing it.
   int add_card_count(jbyte* card_ptr);
@@ -70,6 +74,11 @@ class ConcurrentG1Refine: public CHeapObj {
   // Process the cached entries.
   void clean_up_cache(int worker_i, G1RemSet* g1rs);
 
+  // Reset the claim index so that chunks of the hot cache are claimed from the start
+  void clear_hot_cache_claimed_index() {
+    _hot_cache_par_claimed_idx = 0;
+  }
+
   // Discard entries in the hot cache.
   void clear_hot_cache() {
     _hot_cache_idx = 0; _n_hot = 0;
diff --git a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
index 992df850a4c32f0e51cb513a86b7ed0e43003302..86872d6e0a68e4ecc44b161fec302b89fb3285c2 100644
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
@@ -1637,6 +1637,9 @@ size_t G1CollectedHeap::capacity() const {
 
 void G1CollectedHeap::iterate_dirty_card_closure(bool concurrent,
                                                  int worker_i) {
+  // Refine the cards in the hot card cache; callers claim chunks of it in parallel
+  concurrent_g1_refine()->clean_up_cache(worker_i, g1_rem_set());
+
   DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set();
   int n_completed_buffers = 0;
   while (dcqs.apply_closure_to_completed_buffer(worker_i, 0, true)) {
@@ -1645,9 +1648,6 @@ void G1CollectedHeap::iterate_dirty_card_closure(bool concurrent,
   g1_policy()->record_update_rs_processed_buffers(worker_i,
                                                   (double) n_completed_buffers);
   dcqs.clear_n_completed_buffers();
-  // Finish up the queue...
-  if (worker_i == 0) concurrent_g1_refine()->clean_up_cache(worker_i,
-                                                            g1_rem_set());
   assert(!dcqs.completed_buffers_exist_dirty(), "Completed buffers exist!");
 }
 
@@ -4111,6 +4111,8 @@ void G1CollectedHeap::evacuate_collection_set() {
 
   g1_rem_set()->prepare_for_oops_into_collection_set_do();
   concurrent_g1_refine()->set_use_cache(false);
+  concurrent_g1_refine()->clear_hot_cache_claimed_index();
+
   int n_workers = (ParallelGCThreads > 0 ? workers()->total_workers() : 1);
   set_par_threads(n_workers);
   G1ParTask g1_par_task(this, n_workers, _task_queues);
@@ -4143,6 +4145,7 @@ void G1CollectedHeap::evacuate_collection_set() {
   }
   g1_rem_set()->cleanup_after_oops_into_collection_set_do();
 
+  concurrent_g1_refine()->clear_hot_cache();
   concurrent_g1_refine()->set_use_cache(true);
 
   finalize_for_evac_failure();