From 9c3adbcc3235dc4cd7267d4954716897d53dbde0 Mon Sep 17 00:00:00 2001
From: johnc <unknown>
Date: Thu, 22 Sep 2011 10:57:37 -0700
Subject: [PATCH] 6484982: G1: process references during evacuation pauses
 Summary: G1 now uses two reference processors - one is used by concurrent
 marking and the other is used by STW GCs (both full and incremental
 evacuation pauses). In an evacuation pause, the reference processor is
 embedded into the closures used to scan objects. Doing so causes causes
 reference objects to be 'discovered' by the reference processor. At the end
 of the evacuation pause, these discovered reference objects are processed -
 preserving (and copying) referent objects (and their reachable graphs) as
 appropriate. Reviewed-by: ysr, jwilhelm, brutisso, stefank, tonyp

---
 .../concurrentMarkSweepGeneration.cpp         |    9 +-
 .../gc_implementation/g1/concurrentMark.cpp   |   47 +-
 .../gc_implementation/g1/concurrentMark.hpp   |    4 +-
 .../gc_implementation/g1/g1CollectedHeap.cpp  | 1140 +++++++++++++----
 .../gc_implementation/g1/g1CollectedHeap.hpp  |  115 +-
 .../g1/g1CollectorPolicy.cpp                  |    8 +-
 .../g1/g1CollectorPolicy.hpp                  |   10 +
 .../vm/gc_implementation/g1/g1MarkSweep.cpp   |    8 +-
 .../vm/gc_implementation/g1/g1OopClosures.hpp |   88 +-
 .../vm/gc_implementation/g1/g1RemSet.cpp      |    6 +-
 .../vm/gc_implementation/g1/heapRegion.cpp    |   40 +-
 .../vm/gc_implementation/g1/heapRegion.hpp    |   31 -
 .../vm/gc_implementation/g1/satbQueue.cpp     |   16 +-
 .../parallelScavenge/psMarkSweep.cpp          |    3 +-
 .../parallelScavenge/psParallelCompact.cpp    |    3 +-
 .../parallelScavenge/psScavenge.cpp           |    3 +-
 src/share/vm/memory/genCollectedHeap.cpp      |    3 +-
 src/share/vm/memory/referenceProcessor.cpp    |  234 +---
 src/share/vm/memory/referenceProcessor.hpp    |  224 +++-
 src/share/vm/runtime/thread.cpp               |    8 +-
 20 files changed, 1408 insertions(+), 592 deletions(-)
diff --git a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp
index c5bd4ca29..54bbb24b0 100644
--- a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp
+++ b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp
@@ -2004,7 +2004,7 @@ void CMSCollector::do_compaction_work(bool clear_all_soft_refs) {
   ReferenceProcessorMTDiscoveryMutator rp_mut_discovery(ref_processor(), false);
 
   ref_processor()->set_enqueuing_is_done(false);
-  ref_processor()->enable_discovery();
+  ref_processor()->enable_discovery(false /*verify_disabled*/, false /*check_no_refs*/);
   ref_processor()->setup_policy(clear_all_soft_refs);
   // If an asynchronous collection finishes, the _modUnionTable is
   // all clear.  If we are assuming the collection from an asynchronous
@@ -3490,8 +3490,8 @@ void CMSCollector::checkpointRootsInitial(bool asynch) {
     MutexLockerEx x(bitMapLock(),
                     Mutex::_no_safepoint_check_flag);
     checkpointRootsInitialWork(asynch);
-    rp->verify_no_references_recorded();
-    rp->enable_discovery(); // enable ("weak") refs discovery
+    // enable ("weak") refs discovery
+    rp->enable_discovery(true /*verify_disabled*/, true /*check_no_refs*/);
     _collectorState = Marking;
   } else {
     // (Weak) Refs discovery: this is controlled from genCollectedHeap::do_collection
@@ -3503,7 +3503,8 @@ void CMSCollector::checkpointRootsInitial(bool asynch) {
            "ref discovery for this generation kind");
     // already have locks
     checkpointRootsInitialWork(asynch);
-    rp->enable_discovery(); // now enable ("weak") refs discovery
+    // now enable ("weak") refs discovery
+    rp->enable_discovery(true /*verify_disabled*/, false /*verify_no_refs*/);
     _collectorState = Marking;
   }
   SpecializationStats::print();
diff --git a/src/share/vm/gc_implementation/g1/concurrentMark.cpp b/src/share/vm/gc_implementation/g1/concurrentMark.cpp
index 61adccb10..d78db6e5d 100644
--- a/src/share/vm/gc_implementation/g1/concurrentMark.cpp
+++ b/src/share/vm/gc_implementation/g1/concurrentMark.cpp
@@ -818,10 +818,10 @@ void ConcurrentMark::checkpointRootsInitialPost() {
   NoteStartOfMarkHRClosure startcl;
   g1h->heap_region_iterate(&startcl);
 
-  // Start weak-reference discovery.
-  ReferenceProcessor* rp = g1h->ref_processor();
-  rp->verify_no_references_recorded();
-  rp->enable_discovery(); // enable ("weak") refs discovery
+  // Start Concurrent Marking weak-reference discovery.
+  ReferenceProcessor* rp = g1h->ref_processor_cm();
+  // enable ("weak") refs discovery
+  rp->enable_discovery(true /*verify_disabled*/, true /*verify_no_refs*/);
   rp->setup_policy(false); // snapshot the soft ref policy to be used in this cycle
 
   SATBMarkQueueSet& satb_mq_set = JavaThread::satb_mark_queue_set();
@@ -1133,6 +1133,7 @@ void ConcurrentMark::checkpointRootsFinal(bool clear_all_soft_refs) {
   // world is stopped at this checkpoint
   assert(SafepointSynchronize::is_at_safepoint(),
          "world should be stopped");
+
   G1CollectedHeap* g1h = G1CollectedHeap::heap();
 
   // If a full collection has happened, we shouldn't do this.
@@ -1837,6 +1838,10 @@ void ConcurrentMark::cleanup() {
   size_t cleaned_up_bytes = start_used_bytes - g1h->used();
   g1p->decrease_known_garbage_bytes(cleaned_up_bytes);
 
+  // Clean up will have freed any regions completely full of garbage.
+  // Update the soft reference policy with the new heap occupancy.
+  Universe::update_heap_info_at_gc();
+
   // We need to make this be a "collection" so any collection pause that
   // races with it goes around and waits for completeCleanup to finish.
   g1h->increment_total_collections();
@@ -2072,8 +2077,10 @@ class G1CMParDrainMarkingStackClosure: public VoidClosure {
   }
 };
 
-// Implementation of AbstractRefProcTaskExecutor for G1
-class G1RefProcTaskExecutor: public AbstractRefProcTaskExecutor {
+// Implementation of AbstractRefProcTaskExecutor for parallel
+// reference processing at the end of G1 concurrent marking
+
+class G1CMRefProcTaskExecutor: public AbstractRefProcTaskExecutor {
 private:
   G1CollectedHeap* _g1h;
   ConcurrentMark*  _cm;
@@ -2082,7 +2089,7 @@ private:
   int              _active_workers;
 
 public:
-  G1RefProcTaskExecutor(G1CollectedHeap* g1h,
+  G1CMRefProcTaskExecutor(G1CollectedHeap* g1h,
                         ConcurrentMark* cm,
                         CMBitMap* bitmap,
                         WorkGang* workers,
@@ -2096,7 +2103,7 @@ public:
   virtual void execute(EnqueueTask& task);
 };
 
-class G1RefProcTaskProxy: public AbstractGangTask {
+class G1CMRefProcTaskProxy: public AbstractGangTask {
   typedef AbstractRefProcTaskExecutor::ProcessTask ProcessTask;
   ProcessTask&     _proc_task;
   G1CollectedHeap* _g1h;
@@ -2104,7 +2111,7 @@ class G1RefProcTaskProxy: public AbstractGangTask {
   CMBitMap*        _bitmap;
 
 public:
-  G1RefProcTaskProxy(ProcessTask& proc_task,
+  G1CMRefProcTaskProxy(ProcessTask& proc_task,
                      G1CollectedHeap* g1h,
                      ConcurrentMark* cm,
                      CMBitMap* bitmap) :
@@ -2122,10 +2129,10 @@ public:
   }
 };
 
-void G1RefProcTaskExecutor::execute(ProcessTask& proc_task) {
+void G1CMRefProcTaskExecutor::execute(ProcessTask& proc_task) {
   assert(_workers != NULL, "Need parallel worker threads.");
 
-  G1RefProcTaskProxy proc_task_proxy(proc_task, _g1h, _cm, _bitmap);
+  G1CMRefProcTaskProxy proc_task_proxy(proc_task, _g1h, _cm, _bitmap);
 
   // We need to reset the phase for each task execution so that
   // the termination protocol of CMTask::do_marking_step works.
@@ -2135,12 +2142,12 @@ void G1RefProcTaskExecutor::execute(ProcessTask& proc_task) {
   _g1h->set_par_threads(0);
 }
 
-class G1RefEnqueueTaskProxy: public AbstractGangTask {
+class G1CMRefEnqueueTaskProxy: public AbstractGangTask {
   typedef AbstractRefProcTaskExecutor::EnqueueTask EnqueueTask;
   EnqueueTask& _enq_task;
 
 public:
-  G1RefEnqueueTaskProxy(EnqueueTask& enq_task) :
+  G1CMRefEnqueueTaskProxy(EnqueueTask& enq_task) :
     AbstractGangTask("Enqueue reference objects in parallel"),
     _enq_task(enq_task)
   { }
@@ -2150,10 +2157,10 @@ public:
   }
 };
 
-void G1RefProcTaskExecutor::execute(EnqueueTask& enq_task) {
+void G1CMRefProcTaskExecutor::execute(EnqueueTask& enq_task) {
   assert(_workers != NULL, "Need parallel worker threads.");
 
-  G1RefEnqueueTaskProxy enq_task_proxy(enq_task);
+  G1CMRefEnqueueTaskProxy enq_task_proxy(enq_task);
 
   _g1h->set_par_threads(_active_workers);
   _workers->run_task(&enq_task_proxy);
@@ -2178,7 +2185,7 @@ void ConcurrentMark::weakRefsWork(bool clear_all_soft_refs) {
     }
     TraceTime t("GC ref-proc", verbose, false, gclog_or_tty);
 
-    ReferenceProcessor* rp = g1h->ref_processor();
+    ReferenceProcessor* rp = g1h->ref_processor_cm();
 
     // See the comment in G1CollectedHeap::ref_processing_init()
     // about how reference processing currently works in G1.
@@ -2196,8 +2203,8 @@ void ConcurrentMark::weakRefsWork(bool clear_all_soft_refs) {
     int active_workers = g1h->workers() ? g1h->workers()->total_workers() : 1;
     active_workers = MAX2(MIN2(active_workers, (int)_max_task_num), 1);
 
-    G1RefProcTaskExecutor par_task_executor(g1h, this, nextMarkBitMap(),
-                                            g1h->workers(), active_workers);
+    G1CMRefProcTaskExecutor par_task_executor(g1h, this, nextMarkBitMap(),
+                                              g1h->workers(), active_workers);
 
     if (rp->processing_is_mt()) {
       // Set the degree of MT here.  If the discovery is done MT, there
@@ -2238,7 +2245,7 @@ void ConcurrentMark::weakRefsWork(bool clear_all_soft_refs) {
     }
 
     rp->verify_no_references_recorded();
-    assert(!rp->discovery_enabled(), "should have been disabled");
+    assert(!rp->discovery_enabled(), "Post condition");
   }
 
   // Now clean up stale oops in StringTable
@@ -3342,7 +3349,7 @@ G1CMOopClosure::G1CMOopClosure(G1CollectedHeap* g1h,
   assert(_ref_processor == NULL, "should be initialized to NULL");
 
   if (G1UseConcMarkReferenceProcessing) {
-    _ref_processor = g1h->ref_processor();
+    _ref_processor = g1h->ref_processor_cm();
     assert(_ref_processor != NULL, "should not be NULL");
   }
 }
diff --git a/src/share/vm/gc_implementation/g1/concurrentMark.hpp b/src/share/vm/gc_implementation/g1/concurrentMark.hpp
index 0aca6421d..c724594f4 100644
--- a/src/share/vm/gc_implementation/g1/concurrentMark.hpp
+++ b/src/share/vm/gc_implementation/g1/concurrentMark.hpp
@@ -366,8 +366,8 @@ class ConcurrentMark: public CHeapObj {
   friend class CMConcurrentMarkingTask;
   friend class G1ParNoteEndTask;
   friend class CalcLiveObjectsClosure;
-  friend class G1RefProcTaskProxy;
-  friend class G1RefProcTaskExecutor;
+  friend class G1CMRefProcTaskProxy;
+  friend class G1CMRefProcTaskExecutor;
   friend class G1CMParKeepAliveAndDrainClosure;
   friend class G1CMParDrainMarkingStackClosure;
 
diff --git a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
index 146c28781..66ca7442c 100644
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
@@ -42,6 +42,7 @@
 #include "memory/gcLocker.inline.hpp"
 #include "memory/genOopClosures.inline.hpp"
 #include "memory/generationSpec.hpp"
+#include "memory/referenceProcessor.hpp"
 #include "oops/oop.inline.hpp"
 #include "oops/oop.pcgc.inline.hpp"
 #include "runtime/aprofiler.hpp"
@@ -1244,15 +1245,11 @@ bool G1CollectedHeap::do_collection(bool explicit_gc,
 
     COMPILER2_PRESENT(DerivedPointerTable::clear());
 
-    // We want to discover references, but not process them yet.
-    // This mode is disabled in
-    // instanceRefKlass::process_discovered_references if the
-    // generation does some collection work, or
-    // instanceRefKlass::enqueue_discovered_references if the
-    // generation returns without doing any work.
-    ref_processor()->disable_discovery();
-    ref_processor()->abandon_partial_discovery();
-    ref_processor()->verify_no_references_recorded();
+    // Disable discovery and empty the discovered lists
+    // for the CM ref processor.
+    ref_processor_cm()->disable_discovery();
+    ref_processor_cm()->abandon_partial_discovery();
+    ref_processor_cm()->verify_no_references_recorded();
 
     // Abandon current iterations of concurrent marking and concurrent
     // refinement, if any are in progress.
@@ -1280,31 +1277,33 @@ bool G1CollectedHeap::do_collection(bool explicit_gc,
     empty_young_list();
     g1_policy()->set_full_young_gcs(true);
 
-    // See the comment in G1CollectedHeap::ref_processing_init() about
+    // See the comments in g1CollectedHeap.hpp and
+    // G1CollectedHeap::ref_processing_init() about
     // how reference processing currently works in G1.
 
-    // Temporarily make reference _discovery_ single threaded (non-MT).
-    ReferenceProcessorMTDiscoveryMutator rp_disc_ser(ref_processor(), false);
+    // Temporarily make discovery by the STW ref processor single threaded (non-MT).
+    ReferenceProcessorMTDiscoveryMutator stw_rp_disc_ser(ref_processor_stw(), false);
 
-    // Temporarily make refs discovery atomic
-    ReferenceProcessorAtomicMutator rp_disc_atomic(ref_processor(), true);
+    // Temporarily clear the STW ref processor's _is_alive_non_header field.
+    ReferenceProcessorIsAliveMutator stw_rp_is_alive_null(ref_processor_stw(), NULL);
 
-    // Temporarily clear _is_alive_non_header
-    ReferenceProcessorIsAliveMutator rp_is_alive_null(ref_processor(), NULL);
+    ref_processor_stw()->enable_discovery(true /*verify_disabled*/, true /*verify_no_refs*/);
+    ref_processor_stw()->setup_policy(do_clear_all_soft_refs);
 
-    ref_processor()->enable_discovery();
-    ref_processor()->setup_policy(do_clear_all_soft_refs);
     // Do collection work
     {
       HandleMark hm;  // Discard invalid handles created during gc
-      G1MarkSweep::invoke_at_safepoint(ref_processor(), do_clear_all_soft_refs);
+      G1MarkSweep::invoke_at_safepoint(ref_processor_stw(), do_clear_all_soft_refs);
     }
+
     assert(free_regions() == 0, "we should not have added any free regions");
     rebuild_region_lists();
 
     _summary_bytes_used = recalculate_used();
 
-    ref_processor()->enqueue_discovered_references();
+    // Enqueue any discovered reference objects that have
+    // not been removed from the discovered lists.
+    ref_processor_stw()->enqueue_discovered_references();
 
     COMPILER2_PRESENT(DerivedPointerTable::update_pointers());
 
@@ -1319,7 +1318,16 @@ bool G1CollectedHeap::do_collection(bool explicit_gc,
                        /* option      */ VerifyOption_G1UsePrevMarking);
 
     }
-    NOT_PRODUCT(ref_processor()->verify_no_references_recorded());
+
+    assert(!ref_processor_stw()->discovery_enabled(), "Postcondition");
+    ref_processor_stw()->verify_no_references_recorded();
+
+    // Note: since we've just done a full GC, concurrent
+    // marking is no longer active. Therefore we need not
+    // re-enable reference discovery for the CM ref processor.
+    // That will be done at the start of the next marking cycle.
+    assert(!ref_processor_cm()->discovery_enabled(), "Postcondition");
+    ref_processor_cm()->verify_no_references_recorded();
 
     reset_gc_time_stamp();
     // Since everything potentially moved, we will clear all remembered
@@ -1772,8 +1780,10 @@ G1CollectedHeap::G1CollectedHeap(G1CollectorPolicy* policy_) :
   _g1_policy(policy_),
   _dirty_card_queue_set(false),
   _into_cset_dirty_card_queue_set(false),
-  _is_alive_closure(this),
-  _ref_processor(NULL),
+  _is_alive_closure_cm(this),
+  _is_alive_closure_stw(this),
+  _ref_processor_cm(NULL),
+  _ref_processor_stw(NULL),
   _process_strong_tasks(new SubTasksDone(G1H_PS_NumElements)),
   _bot_shared(NULL),
   _objs_with_preserved_marks(NULL), _preserved_marks_of_objs(NULL),
@@ -2067,34 +2077,81 @@ jint G1CollectedHeap::initialize() {
 void G1CollectedHeap::ref_processing_init() {
   // Reference processing in G1 currently works as follows:
   //
-  // * There is only one reference processor instance that
-  //   'spans' the entire heap. It is created by the code
-  //   below.
-  // * Reference discovery is not enabled during an incremental
-  //   pause (see 6484982).
-  // * Discoverered refs are not enqueued nor are they processed
-  //   during an incremental pause (see 6484982).
-  // * Reference discovery is enabled at initial marking.
-  // * Reference discovery is disabled and the discovered
-  //   references processed etc during remarking.
-  // * Reference discovery is MT (see below).
-  // * Reference discovery requires a barrier (see below).
-  // * Reference processing is currently not MT (see 6608385).
-  // * A full GC enables (non-MT) reference discovery and
-  //   processes any discovered references.
+  // * There are two reference processor instances. One is
+  //   used to record and process discovered references
+  //   during concurrent marking; the other is used to
+  //   record and process references during STW pauses
+  //   (both full and incremental).
+  // * Both ref processors need to 'span' the entire heap as
+  //   the regions in the collection set may be dotted around.
+  //
+  // * For the concurrent marking ref processor:
+  //   * Reference discovery is enabled at initial marking.
+  //   * Reference discovery is disabled and the discovered
+  //     references processed etc during remarking.
+  //   * Reference discovery is MT (see below).
+  //   * Reference discovery requires a barrier (see below).
+  //   * Reference processing may or may not be MT
+  //     (depending on the value of ParallelRefProcEnabled
+  //     and ParallelGCThreads).
+  //   * A full GC disables reference discovery by the CM
+  //     ref processor and abandons any entries on it's
+  //     discovered lists.
+  //
+  // * For the STW processor:
+  //   * Non MT discovery is enabled at the start of a full GC.
+  //   * Processing and enqueueing during a full GC is non-MT.
+  //   * During a full GC, references are processed after marking.
+  //
+  //   * Discovery (may or may not be MT) is enabled at the start
+  //     of an incremental evacuation pause.
+  //   * References are processed near the end of a STW evacuation pause.
+  //   * For both types of GC:
+  //     * Discovery is atomic - i.e. not concurrent.
+  //     * Reference discovery will not need a barrier.
 
   SharedHeap::ref_processing_init();
   MemRegion mr = reserved_region();
-  _ref_processor =
+
+  // Concurrent Mark ref processor
+  _ref_processor_cm =
+    new ReferenceProcessor(mr,    // span
+                           ParallelRefProcEnabled && (ParallelGCThreads > 1),
+                                // mt processing
+                           (int) ParallelGCThreads,
+                                // degree of mt processing
+                           (ParallelGCThreads > 1) || (ConcGCThreads > 1),
+                                // mt discovery
+                           (int) MAX2(ParallelGCThreads, ConcGCThreads),
+                                // degree of mt discovery
+                           false,
+                                // Reference discovery is not atomic
+                           &_is_alive_closure_cm,
+                                // is alive closure
+                                // (for efficiency/performance)
+                           true);
+                                // Setting next fields of discovered
+                                // lists requires a barrier.
+
+  // STW ref processor
+  _ref_processor_stw =
     new ReferenceProcessor(mr,    // span
-                           ParallelRefProcEnabled && (ParallelGCThreads > 1),    // mt processing
-                           (int) ParallelGCThreads,   // degree of mt processing
-                           ParallelGCThreads > 1 || ConcGCThreads > 1,  // mt discovery
-                           (int) MAX2(ParallelGCThreads, ConcGCThreads), // degree of mt discovery
-                           false,                     // Reference discovery is not atomic
-                           &_is_alive_closure,        // is alive closure for efficiency
-                           true);                     // Setting next fields of discovered
-                                                      // lists requires a barrier.
+                           ParallelRefProcEnabled && (ParallelGCThreads > 1),
+                                // mt processing
+                           MAX2((int)ParallelGCThreads, 1),
+                                // degree of mt processing
+                           (ParallelGCThreads > 1),
+                                // mt discovery
+                           MAX2((int)ParallelGCThreads, 1),
+                                // degree of mt discovery
+                           true,
+                                // Reference discovery is atomic
+                           &_is_alive_closure_stw,
+                                // is alive closure
+                                // (for efficiency/performance)
+                           false);
+                                // Setting next fields of discovered
+                                // lists requires a barrier.
 }
 
 size_t G1CollectedHeap::capacity() const {
@@ -3117,6 +3174,10 @@ void G1CollectedHeap::gc_epilogue(bool full /* Ignored */) {
   COMPILER2_PRESENT(assert(DerivedPointerTable::is_empty(),
                         "derived pointer present"));
   // always_do_update_barrier = true;
+
+  // We have just completed a GC. Update the soft reference
+  // policy with the new heap occupancy
+  Universe::update_heap_info_at_gc();
 }
 
 HeapWord* G1CollectedHeap::do_collection_pause(size_t word_size,
@@ -3354,230 +3415,241 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) {
 
       COMPILER2_PRESENT(DerivedPointerTable::clear());
 
-      // Please see comment in G1CollectedHeap::ref_processing_init()
-      // to see how reference processing currently works in G1.
-      //
-      // We want to turn off ref discovery, if necessary, and turn it back on
-      // on again later if we do. XXX Dubious: why is discovery disabled?
-      bool was_enabled = ref_processor()->discovery_enabled();
-      if (was_enabled) ref_processor()->disable_discovery();
+      // Please see comment in g1CollectedHeap.hpp and
+      // G1CollectedHeap::ref_processing_init() to see how
+      // reference processing currently works in G1.
 
-      // Forget the current alloc region (we might even choose it to be part
-      // of the collection set!).
-      release_mutator_alloc_region();
+      // Enable discovery in the STW reference processor
+      ref_processor_stw()->enable_discovery(true /*verify_disabled*/,
+                                            true /*verify_no_refs*/);
 
-      // We should call this after we retire the mutator alloc
-      // region(s) so that all the ALLOC / RETIRE events are generated
-      // before the start GC event.
-      _hr_printer.start_gc(false /* full */, (size_t) total_collections());
-
-      // The elapsed time induced by the start time below deliberately elides
-      // the possible verification above.
-      double start_time_sec = os::elapsedTime();
-      size_t start_used_bytes = used();
+      {
+        // We want to temporarily turn off discovery by the
+        // CM ref processor, if necessary, and turn it back on
+        // on again later if we do. Using a scoped
+        // NoRefDiscovery object will do this.
+        NoRefDiscovery no_cm_discovery(ref_processor_cm());
+
+        // Forget the current alloc region (we might even choose it to be part
+        // of the collection set!).
+        release_mutator_alloc_region();
+
+        // We should call this after we retire the mutator alloc
+        // region(s) so that all the ALLOC / RETIRE events are generated
+        // before the start GC event.
+        _hr_printer.start_gc(false /* full */, (size_t) total_collections());
+
+        // The elapsed time induced by the start time below deliberately elides
+        // the possible verification above.
+        double start_time_sec = os::elapsedTime();
+        size_t start_used_bytes = used();
 
 #if YOUNG_LIST_VERBOSE
-      gclog_or_tty->print_cr("\nBefore recording pause start.\nYoung_list:");
-      _young_list->print();
-      g1_policy()->print_collection_set(g1_policy()->inc_cset_head(), gclog_or_tty);
+        gclog_or_tty->print_cr("\nBefore recording pause start.\nYoung_list:");
+        _young_list->print();
+        g1_policy()->print_collection_set(g1_policy()->inc_cset_head(), gclog_or_tty);
 #endif // YOUNG_LIST_VERBOSE
 
-      g1_policy()->record_collection_pause_start(start_time_sec,
-                                                 start_used_bytes);
+        g1_policy()->record_collection_pause_start(start_time_sec,
+                                                   start_used_bytes);
 
 #if YOUNG_LIST_VERBOSE
-      gclog_or_tty->print_cr("\nAfter recording pause start.\nYoung_list:");
-      _young_list->print();
+        gclog_or_tty->print_cr("\nAfter recording pause start.\nYoung_list:");
+        _young_list->print();
 #endif // YOUNG_LIST_VERBOSE
 
-      if (g1_policy()->during_initial_mark_pause()) {
-        concurrent_mark()->checkpointRootsInitialPre();
-      }
-      perm_gen()->save_marks();
-
-      // We must do this before any possible evacuation that should propagate
-      // marks.
-      if (mark_in_progress()) {
-        double start_time_sec = os::elapsedTime();
+        if (g1_policy()->during_initial_mark_pause()) {
+          concurrent_mark()->checkpointRootsInitialPre();
+        }
+        perm_gen()->save_marks();
 
-        _cm->drainAllSATBBuffers();
-        double finish_mark_ms = (os::elapsedTime() - start_time_sec) * 1000.0;
-        g1_policy()->record_satb_drain_time(finish_mark_ms);
-      }
-      // Record the number of elements currently on the mark stack, so we
-      // only iterate over these.  (Since evacuation may add to the mark
-      // stack, doing more exposes race conditions.)  If no mark is in
-      // progress, this will be zero.
-      _cm->set_oops_do_bound();
+        // We must do this before any possible evacuation that should propagate
+        // marks.
+        if (mark_in_progress()) {
+          double start_time_sec = os::elapsedTime();
 
-      if (mark_in_progress()) {
-        concurrent_mark()->newCSet();
-      }
+          _cm->drainAllSATBBuffers();
+          double finish_mark_ms = (os::elapsedTime() - start_time_sec) * 1000.0;
+          g1_policy()->record_satb_drain_time(finish_mark_ms);
+        }
+        // Record the number of elements currently on the mark stack, so we
+        // only iterate over these.  (Since evacuation may add to the mark
+        // stack, doing more exposes race conditions.)  If no mark is in
+        // progress, this will be zero.
+        _cm->set_oops_do_bound();
+
+        if (mark_in_progress()) {
+          concurrent_mark()->newCSet();
+        }
 
 #if YOUNG_LIST_VERBOSE
-      gclog_or_tty->print_cr("\nBefore choosing collection set.\nYoung_list:");
-      _young_list->print();
-      g1_policy()->print_collection_set(g1_policy()->inc_cset_head(), gclog_or_tty);
+        gclog_or_tty->print_cr("\nBefore choosing collection set.\nYoung_list:");
+        _young_list->print();
+        g1_policy()->print_collection_set(g1_policy()->inc_cset_head(), gclog_or_tty);
 #endif // YOUNG_LIST_VERBOSE
 
-      g1_policy()->choose_collection_set(target_pause_time_ms);
-
-      if (_hr_printer.is_active()) {
-        HeapRegion* hr = g1_policy()->collection_set();
-        while (hr != NULL) {
-          G1HRPrinter::RegionType type;
-          if (!hr->is_young()) {
-            type = G1HRPrinter::Old;
-          } else if (hr->is_survivor()) {
-            type = G1HRPrinter::Survivor;
-          } else {
-            type = G1HRPrinter::Eden;
+        g1_policy()->choose_collection_set(target_pause_time_ms);
+
+        if (_hr_printer.is_active()) {
+          HeapRegion* hr = g1_policy()->collection_set();
+          while (hr != NULL) {
+            G1HRPrinter::RegionType type;
+            if (!hr->is_young()) {
+              type = G1HRPrinter::Old;
+            } else if (hr->is_survivor()) {
+              type = G1HRPrinter::Survivor;
+            } else {
+              type = G1HRPrinter::Eden;
+            }
+            _hr_printer.cset(hr);
+            hr = hr->next_in_collection_set();
           }
-          _hr_printer.cset(hr);
-          hr = hr->next_in_collection_set();
         }
-      }
 
-      // We have chosen the complete collection set. If marking is
-      // active then, we clear the region fields of any of the
-      // concurrent marking tasks whose region fields point into
-      // the collection set as these values will become stale. This
-      // will cause the owning marking threads to claim a new region
-      // when marking restarts.
-      if (mark_in_progress()) {
-        concurrent_mark()->reset_active_task_region_fields_in_cset();
-      }
+        // We have chosen the complete collection set. If marking is
+        // active then, we clear the region fields of any of the
+        // concurrent marking tasks whose region fields point into
+        // the collection set as these values will become stale. This
+        // will cause the owning marking threads to claim a new region
+        // when marking restarts.
+        if (mark_in_progress()) {
+          concurrent_mark()->reset_active_task_region_fields_in_cset();
+        }
 
 #ifdef ASSERT
-      VerifyCSetClosure cl;
-      collection_set_iterate(&cl);
+        VerifyCSetClosure cl;
+        collection_set_iterate(&cl);
 #endif // ASSERT
 
-      setup_surviving_young_words();
+        setup_surviving_young_words();
 
-      // Initialize the GC alloc regions.
-      init_gc_alloc_regions();
+        // Initialize the GC alloc regions.
+        init_gc_alloc_regions();
 
-      // Actually do the work...
-      evacuate_collection_set();
+        // Actually do the work...
+        evacuate_collection_set();
 
-      free_collection_set(g1_policy()->collection_set());
-      g1_policy()->clear_collection_set();
+        free_collection_set(g1_policy()->collection_set());
+        g1_policy()->clear_collection_set();
 
-      cleanup_surviving_young_words();
+        cleanup_surviving_young_words();
 
-      // Start a new incremental collection set for the next pause.
-      g1_policy()->start_incremental_cset_building();
+        // Start a new incremental collection set for the next pause.
+        g1_policy()->start_incremental_cset_building();
 
-      // Clear the _cset_fast_test bitmap in anticipation of adding
-      // regions to the incremental collection set for the next
-      // evacuation pause.
-      clear_cset_fast_test();
+        // Clear the _cset_fast_test bitmap in anticipation of adding
+        // regions to the incremental collection set for the next
+        // evacuation pause.
+        clear_cset_fast_test();
 
-      _young_list->reset_sampled_info();
+        _young_list->reset_sampled_info();
 
-      // Don't check the whole heap at this point as the
-      // GC alloc regions from this pause have been tagged
-      // as survivors and moved on to the survivor list.
-      // Survivor regions will fail the !is_young() check.
-      assert(check_young_list_empty(false /* check_heap */),
-        "young list should be empty");
+        // Don't check the whole heap at this point as the
+        // GC alloc regions from this pause have been tagged
+        // as survivors and moved on to the survivor list.
+        // Survivor regions will fail the !is_young() check.
+        assert(check_young_list_empty(false /* check_heap */),
+          "young list should be empty");
 
 #if YOUNG_LIST_VERBOSE
-      gclog_or_tty->print_cr("Before recording survivors.\nYoung List:");
-      _young_list->print();
+        gclog_or_tty->print_cr("Before recording survivors.\nYoung List:");
+        _young_list->print();
 #endif // YOUNG_LIST_VERBOSE
 
-      g1_policy()->record_survivor_regions(_young_list->survivor_length(),
-        _young_list->first_survivor_region(),
-        _young_list->last_survivor_region());
+        g1_policy()->record_survivor_regions(_young_list->survivor_length(),
+                                            _young_list->first_survivor_region(),
+                                            _young_list->last_survivor_region());
 
-      _young_list->reset_auxilary_lists();
+        _young_list->reset_auxilary_lists();
 
-      if (evacuation_failed()) {
-        _summary_bytes_used = recalculate_used();
-      } else {
-        // The "used" of the the collection set have already been subtracted
-        // when they were freed.  Add in the bytes evacuated.
-        _summary_bytes_used += g1_policy()->bytes_copied_during_gc();
-      }
+        if (evacuation_failed()) {
+          _summary_bytes_used = recalculate_used();
+        } else {
+          // The "used" of the the collection set have already been subtracted
+          // when they were freed.  Add in the bytes evacuated.
+          _summary_bytes_used += g1_policy()->bytes_copied_during_gc();
+        }
 
-      if (g1_policy()->during_initial_mark_pause()) {
-        concurrent_mark()->checkpointRootsInitialPost();
-        set_marking_started();
-        // CAUTION: after the doConcurrentMark() call below,
-        // the concurrent marking thread(s) could be running
-        // concurrently with us. Make sure that anything after
-        // this point does not assume that we are the only GC thread
-        // running. Note: of course, the actual marking work will
-        // not start until the safepoint itself is released in
-        // ConcurrentGCThread::safepoint_desynchronize().
-        doConcurrentMark();
-      }
+        if (g1_policy()->during_initial_mark_pause()) {
+          concurrent_mark()->checkpointRootsInitialPost();
+          set_marking_started();
+          // CAUTION: after the doConcurrentMark() call below,
+          // the concurrent marking thread(s) could be running
+          // concurrently with us. Make sure that anything after
+          // this point does not assume that we are the only GC thread
+          // running. Note: of course, the actual marking work will
+          // not start until the safepoint itself is released in
+          // ConcurrentGCThread::safepoint_desynchronize().
+          doConcurrentMark();
+        }
 
-      allocate_dummy_regions();
+        allocate_dummy_regions();
 
 #if YOUNG_LIST_VERBOSE
-      gclog_or_tty->print_cr("\nEnd of the pause.\nYoung_list:");
-      _young_list->print();
-      g1_policy()->print_collection_set(g1_policy()->inc_cset_head(), gclog_or_tty);
+        gclog_or_tty->print_cr("\nEnd of the pause.\nYoung_list:");
+        _young_list->print();
+        g1_policy()->print_collection_set(g1_policy()->inc_cset_head(), gclog_or_tty);
 #endif // YOUNG_LIST_VERBOSE
 
-      init_mutator_alloc_region();
-
-      {
-        size_t expand_bytes = g1_policy()->expansion_amount();
-        if (expand_bytes > 0) {
-          size_t bytes_before = capacity();
-          if (!expand(expand_bytes)) {
-            // We failed to expand the heap so let's verify that
-            // committed/uncommitted amount match the backing store
-            assert(capacity() == _g1_storage.committed_size(), "committed size mismatch");
-            assert(max_capacity() == _g1_storage.reserved_size(), "reserved size mismatch");
+        init_mutator_alloc_region();
+
+        {
+          size_t expand_bytes = g1_policy()->expansion_amount();
+          if (expand_bytes > 0) {
+            size_t bytes_before = capacity();
+            if (!expand(expand_bytes)) {
+              // We failed to expand the heap so let's verify that
+              // committed/uncommitted amount match the backing store
+              assert(capacity() == _g1_storage.committed_size(), "committed size mismatch");
+              assert(max_capacity() == _g1_storage.reserved_size(), "reserved size mismatch");
+            }
           }
         }
-      }
 
-      double end_time_sec = os::elapsedTime();
-      double pause_time_ms = (end_time_sec - start_time_sec) * MILLIUNITS;
-      g1_policy()->record_pause_time_ms(pause_time_ms);
-      g1_policy()->record_collection_pause_end();
-
-      MemoryService::track_memory_usage();
-
-      // In prepare_for_verify() below we'll need to scan the deferred
-      // update buffers to bring the RSets up-to-date if
-      // G1HRRSFlushLogBuffersOnVerify has been set. While scanning
-      // the update buffers we'll probably need to scan cards on the
-      // regions we just allocated to (i.e., the GC alloc
-      // regions). However, during the last GC we called
-      // set_saved_mark() on all the GC alloc regions, so card
-      // scanning might skip the [saved_mark_word()...top()] area of
-      // those regions (i.e., the area we allocated objects into
-      // during the last GC). But it shouldn't. Given that
-      // saved_mark_word() is conditional on whether the GC time stamp
-      // on the region is current or not, by incrementing the GC time
-      // stamp here we invalidate all the GC time stamps on all the
-      // regions and saved_mark_word() will simply return top() for
-      // all the regions. This is a nicer way of ensuring this rather
-      // than iterating over the regions and fixing them. In fact, the
-      // GC time stamp increment here also ensures that
-      // saved_mark_word() will return top() between pauses, i.e.,
-      // during concurrent refinement. So we don't need the
-      // is_gc_active() check to decided which top to use when
-      // scanning cards (see CR 7039627).
-      increment_gc_time_stamp();
+        double end_time_sec = os::elapsedTime();
+        double pause_time_ms = (end_time_sec - start_time_sec) * MILLIUNITS;
+        g1_policy()->record_pause_time_ms(pause_time_ms);
+        g1_policy()->record_collection_pause_end();
+
+        MemoryService::track_memory_usage();
+
+        // In prepare_for_verify() below we'll need to scan the deferred
+        // update buffers to bring the RSets up-to-date if
+        // G1HRRSFlushLogBuffersOnVerify has been set. While scanning
+        // the update buffers we'll probably need to scan cards on the
+        // regions we just allocated to (i.e., the GC alloc
+        // regions). However, during the last GC we called
+        // set_saved_mark() on all the GC alloc regions, so card
+        // scanning might skip the [saved_mark_word()...top()] area of
+        // those regions (i.e., the area we allocated objects into
+        // during the last GC). But it shouldn't. Given that
+        // saved_mark_word() is conditional on whether the GC time stamp
+        // on the region is current or not, by incrementing the GC time
+        // stamp here we invalidate all the GC time stamps on all the
+        // regions and saved_mark_word() will simply return top() for
+        // all the regions. This is a nicer way of ensuring this rather
+        // than iterating over the regions and fixing them. In fact, the
+        // GC time stamp increment here also ensures that
+        // saved_mark_word() will return top() between pauses, i.e.,
+        // during concurrent refinement. So we don't need the
+        // is_gc_active() check to decided which top to use when
+        // scanning cards (see CR 7039627).
+        increment_gc_time_stamp();
+
+        if (VerifyAfterGC && total_collections() >= VerifyGCStartAt) {
+          HandleMark hm;  // Discard invalid handles created during verification
+          gclog_or_tty->print(" VerifyAfterGC:");
+          prepare_for_verify();
+          Universe::verify(/* allow dirty */ true,
+                           /* silent      */ false,
+                           /* option      */ VerifyOption_G1UsePrevMarking);
+        }
 
-      if (VerifyAfterGC && total_collections() >= VerifyGCStartAt) {
-        HandleMark hm;  // Discard invalid handles created during verification
-        gclog_or_tty->print(" VerifyAfterGC:");
-        prepare_for_verify();
-        Universe::verify(/* allow dirty */ true,
-                         /* silent      */ false,
-                         /* option      */ VerifyOption_G1UsePrevMarking);
-      }
+        assert(!ref_processor_stw()->discovery_enabled(), "Postcondition");
+        ref_processor_stw()->verify_no_references_recorded();
 
-      if (was_enabled) ref_processor()->enable_discovery();
+        // CM reference discovery will be re-enabled if necessary.
+      }
 
       {
         size_t expand_bytes = g1_policy()->expansion_amount();
@@ -3728,34 +3800,6 @@ void G1CollectedHeap::finalize_for_evac_failure() {
   _evac_failure_scan_stack = NULL;
 }
 
-// *** Sequential G1 Evacuation
-
-class G1IsAliveClosure: public BoolObjectClosure {
-  G1CollectedHeap* _g1;
-public:
-  G1IsAliveClosure(G1CollectedHeap* g1) : _g1(g1) {}
-  void do_object(oop p) { assert(false, "Do not call."); }
-  bool do_object_b(oop p) {
-    // It is reachable if it is outside the collection set, or is inside
-    // and forwarded.
-    return !_g1->obj_in_cs(p) || p->is_forwarded();
-  }
-};
-
-class G1KeepAliveClosure: public OopClosure {
-  G1CollectedHeap* _g1;
-public:
-  G1KeepAliveClosure(G1CollectedHeap* g1) : _g1(g1) {}
-  void do_oop(narrowOop* p) { guarantee(false, "Not needed"); }
-  void do_oop(      oop* p) {
-    oop obj = *p;
-    if (_g1->obj_in_cs(obj)) {
-      assert( obj->is_forwarded(), "invariant" );
-      *p = obj->forwardee();
-    }
-  }
-};
-
 class UpdateRSetDeferred : public OopsInHeapRegionClosure {
 private:
   G1CollectedHeap* _g1;
@@ -4186,12 +4230,17 @@ bool G1ParScanThreadState::verify_task(StarTask ref) const {
 #endif // ASSERT
 
 void G1ParScanThreadState::trim_queue() {
+  assert(_evac_cl != NULL, "not set");
+  assert(_evac_failure_cl != NULL, "not set");
+  assert(_partial_scan_cl != NULL, "not set");
+
   StarTask ref;
   do {
     // Drain the overflow stack first, so other threads can steal.
     while (refs()->pop_overflow(ref)) {
       deal_with_reference(ref);
     }
+
     while (refs()->pop_local(ref)) {
       deal_with_reference(ref);
     }
@@ -4529,35 +4578,42 @@ public:
     ResourceMark rm;
     HandleMark   hm;
 
+    ReferenceProcessor*             rp = _g1h->ref_processor_stw();
+
     G1ParScanThreadState            pss(_g1h, i);
-    G1ParScanHeapEvacClosure        scan_evac_cl(_g1h, &pss);
-    G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss);
-    G1ParScanPartialArrayClosure    partial_scan_cl(_g1h, &pss);
+    G1ParScanHeapEvacClosure        scan_evac_cl(_g1h, &pss, rp);
+    G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, rp);
+    G1ParScanPartialArrayClosure    partial_scan_cl(_g1h, &pss, rp);
 
     pss.set_evac_closure(&scan_evac_cl);
     pss.set_evac_failure_closure(&evac_failure_cl);
     pss.set_partial_scan_closure(&partial_scan_cl);
 
-    G1ParScanExtRootClosure         only_scan_root_cl(_g1h, &pss);
-    G1ParScanPermClosure            only_scan_perm_cl(_g1h, &pss);
-    G1ParScanHeapRSClosure          only_scan_heap_rs_cl(_g1h, &pss);
-    G1ParPushHeapRSClosure          push_heap_rs_cl(_g1h, &pss);
+    G1ParScanExtRootClosure        only_scan_root_cl(_g1h, &pss, rp);
+    G1ParScanPermClosure           only_scan_perm_cl(_g1h, &pss, rp);
 
-    G1ParScanAndMarkExtRootClosure  scan_mark_root_cl(_g1h, &pss);
-    G1ParScanAndMarkPermClosure     scan_mark_perm_cl(_g1h, &pss);
-    G1ParScanAndMarkHeapRSClosure   scan_mark_heap_rs_cl(_g1h, &pss);
+    G1ParScanAndMarkExtRootClosure scan_mark_root_cl(_g1h, &pss, rp);
+    G1ParScanAndMarkPermClosure    scan_mark_perm_cl(_g1h, &pss, rp);
 
-    OopsInHeapRegionClosure        *scan_root_cl;
-    OopsInHeapRegionClosure        *scan_perm_cl;
+    OopClosure*                    scan_root_cl = &only_scan_root_cl;
+    OopsInHeapRegionClosure*       scan_perm_cl = &only_scan_perm_cl;
 
     if (_g1h->g1_policy()->during_initial_mark_pause()) {
+      // We also need to mark copied objects.
       scan_root_cl = &scan_mark_root_cl;
       scan_perm_cl = &scan_mark_perm_cl;
-    } else {
-      scan_root_cl = &only_scan_root_cl;
-      scan_perm_cl = &only_scan_perm_cl;
     }
 
+    // The following closure is used to scan RSets looking for reference
+    // fields that point into the collection set. The actual field iteration
+    // is performed by a FilterIntoCSClosure, whose do_oop method calls the
+    // do_oop method of the following closure.
+    // Therefore we want to record the reference processor in the
+    // FilterIntoCSClosure. To do so we record the STW reference
+    // processor into the following closure and pass it to the
+    // FilterIntoCSClosure in HeapRegionDCTOC::walk_mem_region_with_cl.
+    G1ParPushHeapRSClosure          push_heap_rs_cl(_g1h, &pss, rp);
+
     pss.start_strong_roots();
     _g1h->g1_process_strong_roots(/* not collecting perm */ false,
                                   SharedHeap::SO_AllClasses,
@@ -4605,6 +4661,7 @@ g1_process_strong_roots(bool collecting_perm_gen,
                         OopsInHeapRegionClosure* scan_rs,
                         OopsInGenClosure* scan_perm,
                         int worker_i) {
+
   // First scan the strong roots, including the perm gen.
   double ext_roots_start = os::elapsedTime();
   double closure_app_time_sec = 0.0;
@@ -4623,12 +4680,13 @@ g1_process_strong_roots(bool collecting_perm_gen,
                        &eager_scan_code_roots,
                        &buf_scan_perm);
 
-  // Now the ref_processor roots.
+  // Now the CM ref_processor roots.
   if (!_process_strong_tasks->is_task_claimed(G1H_PS_refProcessor_oops_do)) {
-    // We need to treat the discovered reference lists as roots and
-    // keep entries (which are added by the marking threads) on them
-    // live until they can be processed at the end of marking.
-    ref_processor()->weak_oops_do(&buf_scan_non_heap_roots);
+    // We need to treat the discovered reference lists of the
+    // concurrent mark ref processor as roots and keep entries
+    // (which are added by the marking threads) on them live
+    // until they can be processed at the end of marking.
+    ref_processor_cm()->weak_oops_do(&buf_scan_non_heap_roots);
   }
 
   // Finish up any enqueued closure apps (attributed as object copy time).
@@ -4669,6 +4727,524 @@ G1CollectedHeap::g1_process_weak_roots(OopClosure* root_closure,
   SharedHeap::process_weak_roots(root_closure, &roots_in_blobs, non_root_closure);
 }
 
+// Weak Reference Processing support
+
+// An always "is_alive" closure that is used to preserve referents.
+// If the object is non-null then it's alive.  Used in the preservation
+// of referent objects that are pointed to by reference objects
+// discovered by the CM ref processor.
+class G1AlwaysAliveClosure: public BoolObjectClosure {
+  G1CollectedHeap* _g1;
+public:
+  G1AlwaysAliveClosure(G1CollectedHeap* g1) : _g1(g1) {}
+  void do_object(oop p) { assert(false, "Do not call."); }
+  bool do_object_b(oop p) {
+    if (p != NULL) {
+      return true;
+    }
+    return false;
+  }
+};
+
+bool G1STWIsAliveClosure::do_object_b(oop p) {
+  // An object is reachable if it is outside the collection set,
+  // or is inside and copied.
+  return !_g1->obj_in_cs(p) || p->is_forwarded();
+}
+
+// Non Copying Keep Alive closure
+class G1KeepAliveClosure: public OopClosure {
+  G1CollectedHeap* _g1;
+public:
+  G1KeepAliveClosure(G1CollectedHeap* g1) : _g1(g1) {}
+  void do_oop(narrowOop* p) { guarantee(false, "Not needed"); }
+  void do_oop(      oop* p) {
+    oop obj = *p;
+
+    if (_g1->obj_in_cs(obj)) {
+      assert( obj->is_forwarded(), "invariant" );
+      *p = obj->forwardee();
+    }
+  }
+};
+
+// Copying Keep Alive closure - can be called from both
+// serial and parallel code as long as different worker
+// threads utilize different G1ParScanThreadState instances
+// and different queues.
+
+class G1CopyingKeepAliveClosure: public OopClosure {
+  G1CollectedHeap*         _g1h;
+  OopClosure*              _copy_non_heap_obj_cl;
+  OopsInHeapRegionClosure* _copy_perm_obj_cl;
+  G1ParScanThreadState*    _par_scan_state;
+
+public:
+  G1CopyingKeepAliveClosure(G1CollectedHeap* g1h,
+                            OopClosure* non_heap_obj_cl,
+                            OopsInHeapRegionClosure* perm_obj_cl,
+                            G1ParScanThreadState* pss):
+    _g1h(g1h),
+    _copy_non_heap_obj_cl(non_heap_obj_cl),
+    _copy_perm_obj_cl(perm_obj_cl),
+    _par_scan_state(pss)
+  {}
+
+  virtual void do_oop(narrowOop* p) { do_oop_work(p); }
+  virtual void do_oop(      oop* p) { do_oop_work(p); }
+
+  template <class T> void do_oop_work(T* p) {
+    oop obj = oopDesc::load_decode_heap_oop(p);
+
+    if (_g1h->obj_in_cs(obj)) {
+      // If the referent object has been forwarded (either copied
+      // to a new location or to itself in the event of an
+      // evacuation failure) then we need to update the reference
+      // field and, if both reference and referent are in the G1
+      // heap, update the RSet for the referent.
+      //
+      // If the referent has not been forwarded then we have to keep
+      // it alive by policy. Therefore we have copy the referent.
+      //
+      // If the reference field is in the G1 heap then we can push
+      // on the PSS queue. When the queue is drained (after each
+      // phase of reference processing) the object and it's followers
+      // will be copied, the reference field set to point to the
+      // new location, and the RSet updated. Otherwise we need to
+      // use the the non-heap or perm closures directly to copy
+      // the refernt object and update the pointer, while avoiding
+      // updating the RSet.
+
+      if (_g1h->is_in_g1_reserved(p)) {
+        _par_scan_state->push_on_queue(p);
+      } else {
+        // The reference field is not in the G1 heap.
+        if (_g1h->perm_gen()->is_in(p)) {
+          _copy_perm_obj_cl->do_oop(p);
+        } else {
+          _copy_non_heap_obj_cl->do_oop(p);
+        }
+      }
+    }
+  }
+};
+
+// Serial drain queue closure. Called as the 'complete_gc'
+// closure for each discovered list in some of the
+// reference processing phases.
+
+class G1STWDrainQueueClosure: public VoidClosure {
+protected:
+  G1CollectedHeap* _g1h;
+  G1ParScanThreadState* _par_scan_state;
+
+  G1ParScanThreadState*   par_scan_state() { return _par_scan_state; }
+
+public:
+  G1STWDrainQueueClosure(G1CollectedHeap* g1h, G1ParScanThreadState* pss) :
+    _g1h(g1h),
+    _par_scan_state(pss)
+  { }
+
+  void do_void() {
+    G1ParScanThreadState* const pss = par_scan_state();
+    pss->trim_queue();
+  }
+};
+
+// Parallel Reference Processing closures
+
+// Implementation of AbstractRefProcTaskExecutor for parallel reference
+// processing during G1 evacuation pauses.
+
+class G1STWRefProcTaskExecutor: public AbstractRefProcTaskExecutor {
+private:
+  G1CollectedHeap*   _g1h;
+  RefToScanQueueSet* _queues;
+  WorkGang*          _workers;
+  int                _active_workers;
+
+public:
+  G1STWRefProcTaskExecutor(G1CollectedHeap* g1h,
+                        WorkGang* workers,
+                        RefToScanQueueSet *task_queues,
+                        int n_workers) :
+    _g1h(g1h),
+    _queues(task_queues),
+    _workers(workers),
+    _active_workers(n_workers)
+  {
+    assert(n_workers > 0, "shouldn't call this otherwise");
+  }
+
+  // Executes the given task using concurrent marking worker threads.
+  virtual void execute(ProcessTask& task);
+  virtual void execute(EnqueueTask& task);
+};
+
+// Gang task for possibly parallel reference processing
+
+class G1STWRefProcTaskProxy: public AbstractGangTask {
+  typedef AbstractRefProcTaskExecutor::ProcessTask ProcessTask;
+  ProcessTask&     _proc_task;
+  G1CollectedHeap* _g1h;
+  RefToScanQueueSet *_task_queues;
+  ParallelTaskTerminator* _terminator;
+
+public:
+  G1STWRefProcTaskProxy(ProcessTask& proc_task,
+                     G1CollectedHeap* g1h,
+                     RefToScanQueueSet *task_queues,
+                     ParallelTaskTerminator* terminator) :
+    AbstractGangTask("Process reference objects in parallel"),
+    _proc_task(proc_task),
+    _g1h(g1h),
+    _task_queues(task_queues),
+    _terminator(terminator)
+  {}
+
+  virtual void work(int i) {
+    // The reference processing task executed by a single worker.
+    ResourceMark rm;
+    HandleMark   hm;
+
+    G1STWIsAliveClosure is_alive(_g1h);
+
+    G1ParScanThreadState pss(_g1h, i);
+
+    G1ParScanHeapEvacClosure        scan_evac_cl(_g1h, &pss, NULL);
+    G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, NULL);
+    G1ParScanPartialArrayClosure    partial_scan_cl(_g1h, &pss, NULL);
+
+    pss.set_evac_closure(&scan_evac_cl);
+    pss.set_evac_failure_closure(&evac_failure_cl);
+    pss.set_partial_scan_closure(&partial_scan_cl);
+
+    G1ParScanExtRootClosure        only_copy_non_heap_cl(_g1h, &pss, NULL);
+    G1ParScanPermClosure           only_copy_perm_cl(_g1h, &pss, NULL);
+
+    G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, &pss, NULL);
+    G1ParScanAndMarkPermClosure    copy_mark_perm_cl(_g1h, &pss, NULL);
+
+    OopClosure*                    copy_non_heap_cl = &only_copy_non_heap_cl;
+    OopsInHeapRegionClosure*       copy_perm_cl = &only_copy_perm_cl;
+
+    if (_g1h->g1_policy()->during_initial_mark_pause()) {
+      // We also need to mark copied objects.
+      copy_non_heap_cl = &copy_mark_non_heap_cl;
+      copy_perm_cl = &copy_mark_perm_cl;
+    }
+
+    // Keep alive closure.
+    G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, copy_perm_cl, &pss);
+
+    // Complete GC closure
+    G1ParEvacuateFollowersClosure drain_queue(_g1h, &pss, _task_queues, _terminator);
+
+    // Call the reference processing task's work routine.
+    _proc_task.work(i, is_alive, keep_alive, drain_queue);
+
+    // Note we cannot assert that the refs array is empty here as not all
+    // of the processing tasks (specifically phase2 - pp2_work) execute
+    // the complete_gc closure (which ordinarily would drain the queue) so
+    // the queue may not be empty.
+  }
+};
+
+// Driver routine for parallel reference processing.
+// Creates an instance of the ref processing gang
+// task and has the worker threads execute it.
+void G1STWRefProcTaskExecutor::execute(ProcessTask& proc_task) {
+  assert(_workers != NULL, "Need parallel worker threads.");
+
+  ParallelTaskTerminator terminator(_active_workers, _queues);
+  G1STWRefProcTaskProxy proc_task_proxy(proc_task, _g1h, _queues, &terminator);
+
+  _g1h->set_par_threads(_active_workers);
+  _workers->run_task(&proc_task_proxy);
+  _g1h->set_par_threads(0);
+}
+
+// Gang task for parallel reference enqueueing.
+
+class G1STWRefEnqueueTaskProxy: public AbstractGangTask {
+  typedef AbstractRefProcTaskExecutor::EnqueueTask EnqueueTask;
+  EnqueueTask& _enq_task;
+
+public:
+  G1STWRefEnqueueTaskProxy(EnqueueTask& enq_task) :
+    AbstractGangTask("Enqueue reference objects in parallel"),
+    _enq_task(enq_task)
+  { }
+
+  virtual void work(int i) {
+    _enq_task.work(i);
+  }
+};
+
+// Driver routine for parallel reference enqueing.
+// Creates an instance of the ref enqueueing gang
+// task and has the worker threads execute it.
+
+void G1STWRefProcTaskExecutor::execute(EnqueueTask& enq_task) {
+  assert(_workers != NULL, "Need parallel worker threads.");
+
+  G1STWRefEnqueueTaskProxy enq_task_proxy(enq_task);
+
+  _g1h->set_par_threads(_active_workers);
+  _workers->run_task(&enq_task_proxy);
+  _g1h->set_par_threads(0);
+}
+
+// End of weak reference support closures
+
+// Abstract task used to preserve (i.e. copy) any referent objects
+// that are in the collection set and are pointed to by reference
+// objects discovered by the CM ref processor.
+
+class G1ParPreserveCMReferentsTask: public AbstractGangTask {
+protected:
+  G1CollectedHeap* _g1h;
+  RefToScanQueueSet      *_queues;
+  ParallelTaskTerminator _terminator;
+  int _n_workers;
+
+public:
+  G1ParPreserveCMReferentsTask(G1CollectedHeap* g1h,int workers, RefToScanQueueSet *task_queues) :
+    AbstractGangTask("ParPreserveCMReferents"),
+    _g1h(g1h),
+    _queues(task_queues),
+    _terminator(workers, _queues),
+    _n_workers(workers)
+  { }
+
+  void work(int i) {
+    ResourceMark rm;
+    HandleMark   hm;
+
+    G1ParScanThreadState            pss(_g1h, i);
+    G1ParScanHeapEvacClosure        scan_evac_cl(_g1h, &pss, NULL);
+    G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, NULL);
+    G1ParScanPartialArrayClosure    partial_scan_cl(_g1h, &pss, NULL);
+
+    pss.set_evac_closure(&scan_evac_cl);
+    pss.set_evac_failure_closure(&evac_failure_cl);
+    pss.set_partial_scan_closure(&partial_scan_cl);
+
+    assert(pss.refs()->is_empty(), "both queue and overflow should be empty");
+
+
+    G1ParScanExtRootClosure        only_copy_non_heap_cl(_g1h, &pss, NULL);
+    G1ParScanPermClosure           only_copy_perm_cl(_g1h, &pss, NULL);
+
+    G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, &pss, NULL);
+    G1ParScanAndMarkPermClosure    copy_mark_perm_cl(_g1h, &pss, NULL);
+
+    OopClosure*                    copy_non_heap_cl = &only_copy_non_heap_cl;
+    OopsInHeapRegionClosure*       copy_perm_cl = &only_copy_perm_cl;
+
+    if (_g1h->g1_policy()->during_initial_mark_pause()) {
+      // We also need to mark copied objects.
+      copy_non_heap_cl = &copy_mark_non_heap_cl;
+      copy_perm_cl = &copy_mark_perm_cl;
+    }
+
+    // Is alive closure
+    G1AlwaysAliveClosure always_alive(_g1h);
+
+    // Copying keep alive closure. Applied to referent objects that need
+    // to be copied.
+    G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, copy_perm_cl, &pss);
+
+    ReferenceProcessor* rp = _g1h->ref_processor_cm();
+
+    int limit = ReferenceProcessor::number_of_subclasses_of_ref() * rp->max_num_q();
+    int stride = MIN2(MAX2(_n_workers, 1), limit);
+
+    // limit is set using max_num_q() - which was set using ParallelGCThreads.
+    // So this must be true - but assert just in case someone decides to
+    // change the worker ids.
+    assert(0 <= i && i < limit, "sanity");
+    assert(!rp->discovery_is_atomic(), "check this code");
+
+    // Select discovered lists [i, i+stride, i+2*stride,...,limit)
+    for (int idx = i; idx < limit; idx += stride) {
+      DiscoveredList& ref_list = rp->discovered_soft_refs()[idx];
+
+      DiscoveredListIterator iter(ref_list, &keep_alive, &always_alive);
+      while (iter.has_next()) {
+        // Since discovery is not atomic for the CM ref processor, we
+        // can see some null referent objects.
+        iter.load_ptrs(DEBUG_ONLY(true));
+        oop ref = iter.obj();
+
+        // This will filter nulls.
+        if (iter.is_referent_alive()) {
+          iter.make_referent_alive();
+        }
+        iter.move_to_next();
+      }
+    }
+
+    // Drain the queue - which may cause stealing
+    G1ParEvacuateFollowersClosure drain_queue(_g1h, &pss, _queues, &_terminator);
+    drain_queue.do_void();
+    // Allocation buffers were retired at the end of G1ParEvacuateFollowersClosure
+    assert(pss.refs()->is_empty(), "should be");
+  }
+};
+
+// Weak Reference processing during an evacuation pause (part 1).
+void G1CollectedHeap::process_discovered_references() {
+  double ref_proc_start = os::elapsedTime();
+
+  ReferenceProcessor* rp = _ref_processor_stw;
+  assert(rp->discovery_enabled(), "should have been enabled");
+
+  // Any reference objects, in the collection set, that were 'discovered'
+  // by the CM ref processor should have already been copied (either by
+  // applying the external root copy closure to the discovered lists, or
+  // by following an RSet entry).
+  //
+  // But some of the referents, that are in the collection set, that these
+  // reference objects point to may not have been copied: the STW ref
+  // processor would have seen that the reference object had already
+  // been 'discovered' and would have skipped discovering the reference,
+  // but would not have treated the reference object as a regular oop.
+  // As a reult the copy closure would not have been applied to the
+  // referent object.
+  //
+  // We need to explicitly copy these referent objects - the references
+  // will be processed at the end of remarking.
+  //
+  // We also need to do this copying before we process the reference
+  // objects discovered by the STW ref processor in case one of these
+  // referents points to another object which is also referenced by an
+  // object discovered by the STW ref processor.
+
+  int n_workers = (G1CollectedHeap::use_parallel_gc_threads() ?
+                        workers()->total_workers() : 1);
+
+  set_par_threads(n_workers);
+  G1ParPreserveCMReferentsTask keep_cm_referents(this, n_workers, _task_queues);
+
+  if (G1CollectedHeap::use_parallel_gc_threads()) {
+    workers()->run_task(&keep_cm_referents);
+  } else {
+    keep_cm_referents.work(0);
+  }
+
+  set_par_threads(0);
+
+  // Closure to test whether a referent is alive.
+  G1STWIsAliveClosure is_alive(this);
+
+  // Even when parallel reference processing is enabled, the processing
+  // of JNI refs is serial and performed serially by the current thread
+  // rather than by a worker. The following PSS will be used for processing
+  // JNI refs.
+
+  // Use only a single queue for this PSS.
+  G1ParScanThreadState pss(this, 0);
+
+  // We do not embed a reference processor in the copying/scanning
+  // closures while we're actually processing the discovered
+  // reference objects.
+  G1ParScanHeapEvacClosure        scan_evac_cl(this, &pss, NULL);
+  G1ParScanHeapEvacFailureClosure evac_failure_cl(this, &pss, NULL);
+  G1ParScanPartialArrayClosure    partial_scan_cl(this, &pss, NULL);
+
+  pss.set_evac_closure(&scan_evac_cl);
+  pss.set_evac_failure_closure(&evac_failure_cl);
+  pss.set_partial_scan_closure(&partial_scan_cl);
+
+  assert(pss.refs()->is_empty(), "pre-condition");
+
+  G1ParScanExtRootClosure        only_copy_non_heap_cl(this, &pss, NULL);
+  G1ParScanPermClosure           only_copy_perm_cl(this, &pss, NULL);
+
+  G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(this, &pss, NULL);
+  G1ParScanAndMarkPermClosure    copy_mark_perm_cl(this, &pss, NULL);
+
+  OopClosure*                    copy_non_heap_cl = &only_copy_non_heap_cl;
+  OopsInHeapRegionClosure*       copy_perm_cl = &only_copy_perm_cl;
+
+  if (_g1h->g1_policy()->during_initial_mark_pause()) {
+    // We also need to mark copied objects.
+    copy_non_heap_cl = &copy_mark_non_heap_cl;
+    copy_perm_cl = &copy_mark_perm_cl;
+  }
+
+  // Keep alive closure.
+  G1CopyingKeepAliveClosure keep_alive(this, copy_non_heap_cl, copy_perm_cl, &pss);
+
+  // Serial Complete GC closure
+  G1STWDrainQueueClosure drain_queue(this, &pss);
+
+  // Setup the soft refs policy...
+  rp->setup_policy(false);
+
+  if (!rp->processing_is_mt()) {
+    // Serial reference processing...
+    rp->process_discovered_references(&is_alive,
+                                      &keep_alive,
+                                      &drain_queue,
+                                      NULL);
+  } else {
+    // Parallel reference processing
+    int active_workers = (ParallelGCThreads > 0 ? workers()->total_workers() : 1);
+    assert(rp->num_q() == active_workers, "sanity");
+    assert(active_workers <= rp->max_num_q(), "sanity");
+
+    G1STWRefProcTaskExecutor par_task_executor(this, workers(), _task_queues, active_workers);
+    rp->process_discovered_references(&is_alive, &keep_alive, &drain_queue, &par_task_executor);
+  }
+
+  // We have completed copying any necessary live referent objects
+  // (that were not copied during the actual pause) so we can
+  // retire any active alloc buffers
+  pss.retire_alloc_buffers();
+  assert(pss.refs()->is_empty(), "both queue and overflow should be empty");
+
+  double ref_proc_time = os::elapsedTime() - ref_proc_start;
+  g1_policy()->record_ref_proc_time(ref_proc_time * 1000.0);
+}
+
+// Weak Reference processing during an evacuation pause (part 2).
+void G1CollectedHeap::enqueue_discovered_references() {
+  double ref_enq_start = os::elapsedTime();
+
+  ReferenceProcessor* rp = _ref_processor_stw;
+  assert(!rp->discovery_enabled(), "should have been disabled as part of processing");
+
+  // Now enqueue any remaining on the discovered lists on to
+  // the pending list.
+  if (!rp->processing_is_mt()) {
+    // Serial reference processing...
+    rp->enqueue_discovered_references();
+  } else {
+    // Parallel reference enqueuing
+
+    int active_workers = (ParallelGCThreads > 0 ? workers()->total_workers() : 1);
+    assert(rp->num_q() == active_workers, "sanity");
+    assert(active_workers <= rp->max_num_q(), "sanity");
+
+    G1STWRefProcTaskExecutor par_task_executor(this, workers(), _task_queues, active_workers);
+    rp->enqueue_discovered_references(&par_task_executor);
+  }
+
+  rp->verify_no_references_recorded();
+  assert(!rp->discovery_enabled(), "should have been disabled");
+
+  // FIXME
+  // CM's reference processing also cleans up the string and symbol tables.
+  // Should we do that here also? We could, but it is a serial operation
+  // and could signicantly increase the pause time.
+
+  double ref_enq_time = os::elapsedTime() - ref_enq_start;
+  g1_policy()->record_ref_enq_time(ref_enq_time * 1000.0);
+}
+
 void G1CollectedHeap::evacuate_collection_set() {
   set_evacuation_failed(false);
 
@@ -4686,6 +5262,7 @@ void G1CollectedHeap::evacuate_collection_set() {
 
   assert(dirty_card_queue_set().completed_buffers_num() == 0, "Should be empty");
   double start_par = os::elapsedTime();
+
   if (G1CollectedHeap::use_parallel_gc_threads()) {
     // The individual threads will set their evac-failure closures.
     StrongRootsScope srs(this);
@@ -4700,15 +5277,23 @@ void G1CollectedHeap::evacuate_collection_set() {
   g1_policy()->record_par_time(par_time);
   set_par_threads(0);
 
+  // Process any discovered reference objects - we have
+  // to do this _before_ we retire the GC alloc regions
+  // as we may have to copy some 'reachable' referent
+  // objects (and their reachable sub-graphs) that were
+  // not copied during the pause.
+  process_discovered_references();
+
   // Weak root processing.
   // Note: when JSR 292 is enabled and code blobs can contain
   // non-perm oops then we will need to process the code blobs
   // here too.
   {
-    G1IsAliveClosure is_alive(this);
+    G1STWIsAliveClosure is_alive(this);
     G1KeepAliveClosure keep_alive(this);
     JNIHandles::weak_oops_do(&is_alive, &keep_alive);
   }
+
   release_gc_alloc_regions();
   g1_rem_set()->cleanup_after_oops_into_collection_set_do();
 
@@ -4730,6 +5315,15 @@ void G1CollectedHeap::evacuate_collection_set() {
     }
   }
 
+  // Enqueue any remaining references remaining on the STW
+  // reference processor's discovered lists. We need to do
+  // this after the card table is cleaned (and verified) as
+  // the act of enqueuing entries on to the pending list
+  // will log these updates (and dirty their associated
+  // cards). We need these updates logged to update any
+  // RSets.
+  enqueue_discovered_references();
+
   if (G1DeferredRSUpdate) {
     RedirtyLoggedCardTableEntryFastClosure redirty;
     dirty_card_queue_set().set_closure(&redirty);
@@ -4930,7 +5524,7 @@ void G1CollectedHeap::cleanUpCardTable() {
   }
 
   double elapsed = os::elapsedTime() - start;
-  g1_policy()->record_clear_ct_time( elapsed * 1000.0);
+  g1_policy()->record_clear_ct_time(elapsed * 1000.0);
 #ifndef PRODUCT
   if (G1VerifyCTCleanup || VerifyAfterGC) {
     G1VerifyCardTableCleanup cleanup_verifier(this, ct_bs);
diff --git a/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp b/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
index 6a361326f..9dcf1a825 100644
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
@@ -155,6 +155,19 @@ public:
     : G1AllocRegion("Mutator Alloc Region", false /* bot_updates */) { }
 };
 
+// The G1 STW is alive closure.
+// An instance is embedded into the G1CH and used as the
+// (optional) _is_alive_non_header closure in the STW
+// reference processor. It is also extensively used during
+// refence processing during STW evacuation pauses.
+class G1STWIsAliveClosure: public BoolObjectClosure {
+  G1CollectedHeap* _g1;
+public:
+  G1STWIsAliveClosure(G1CollectedHeap* g1) : _g1(g1) {}
+  void do_object(oop p) { assert(false, "Do not call."); }
+  bool do_object_b(oop p);
+};
+
 class SurvivorGCAllocRegion : public G1AllocRegion {
 protected:
   virtual HeapRegion* allocate_new_region(size_t word_size, bool force);
@@ -174,6 +187,7 @@ public:
 };
 
 class RefineCardTableEntryClosure;
+
 class G1CollectedHeap : public SharedHeap {
   friend class VM_G1CollectForAllocation;
   friend class VM_GenCollectForPermanentAllocation;
@@ -573,6 +587,14 @@ protected:
   // allocated block, or else "NULL".
   HeapWord* expand_and_allocate(size_t word_size);
 
+  // Process any reference objects discovered during
+  // an incremental evacuation pause.
+  void process_discovered_references();
+
+  // Enqueue any remaining discovered references
+  // after processing.
+  void enqueue_discovered_references();
+
 public:
 
   G1MonitoringSupport* g1mm() { return _g1mm; }
@@ -826,14 +848,83 @@ protected:
                                     bool should_mark_root);
   void handle_evacuation_failure_common(oop obj, markOop m);
 
-  // Instance of the concurrent mark is_alive closure for embedding
-  // into the reference processor as the is_alive_non_header. This
-  // prevents unnecessary additions to the discovered lists during
-  // concurrent discovery.
-  G1CMIsAliveClosure _is_alive_closure;
+  // ("Weak") Reference processing support.
+  //
+  // G1 has 2 instances of the referece processor class. One
+  // (_ref_processor_cm) handles reference object discovery
+  // and subsequent processing during concurrent marking cycles.
+  //
+  // The other (_ref_processor_stw) handles reference object
+  // discovery and processing during full GCs and incremental
+  // evacuation pauses.
+  //
+  // During an incremental pause, reference discovery will be
+  // temporarily disabled for _ref_processor_cm and will be
+  // enabled for _ref_processor_stw. At the end of the evacuation
+  // pause references discovered by _ref_processor_stw will be
+  // processed and discovery will be disabled. The previous
+  // setting for reference object discovery for _ref_processor_cm
+  // will be re-instated.
+  //
+  // At the start of marking:
+  //  * Discovery by the CM ref processor is verified to be inactive
+  //    and it's discovered lists are empty.
+  //  * Discovery by the CM ref processor is then enabled.
+  //
+  // At the end of marking:
+  //  * Any references on the CM ref processor's discovered
+  //    lists are processed (possibly MT).
+  //
+  // At the start of full GC we:
+  //  * Disable discovery by the CM ref processor and
+  //    empty CM ref processor's discovered lists
+  //    (without processing any entries).
+  //  * Verify that the STW ref processor is inactive and it's
+  //    discovered lists are empty.
+  //  * Temporarily set STW ref processor discovery as single threaded.
+  //  * Temporarily clear the STW ref processor's _is_alive_non_header
+  //    field.
+  //  * Finally enable discovery by the STW ref processor.
+  //
+  // The STW ref processor is used to record any discovered
+  // references during the full GC.
+  //
+  // At the end of a full GC we:
+  //  * Enqueue any reference objects discovered by the STW ref processor
+  //    that have non-live referents. This has the side-effect of
+  //    making the STW ref processor inactive by disabling discovery.
+  //  * Verify that the CM ref processor is still inactive
+  //    and no references have been placed on it's discovered
+  //    lists (also checked as a precondition during initial marking).
+
+  // The (stw) reference processor...
+  ReferenceProcessor* _ref_processor_stw;
+
+  // During reference object discovery, the _is_alive_non_header
+  // closure (if non-null) is applied to the referent object to
+  // determine whether the referent is live. If so then the
+  // reference object does not need to be 'discovered' and can
+  // be treated as a regular oop. This has the benefit of reducing
+  // the number of 'discovered' reference objects that need to
+  // be processed.
+  //
+  // Instance of the is_alive closure for embedding into the
+  // STW reference processor as the _is_alive_non_header field.
+  // Supplying a value for the _is_alive_non_header field is
+  // optional but doing so prevents unnecessary additions to
+  // the discovered lists during reference discovery.
+  G1STWIsAliveClosure _is_alive_closure_stw;
+
+  // The (concurrent marking) reference processor...
+  ReferenceProcessor* _ref_processor_cm;
 
-  // ("Weak") Reference processing support
-  ReferenceProcessor* _ref_processor;
+  // Instance of the concurrent mark is_alive closure for embedding
+  // into the Concurrent Marking reference processor as the
+  // _is_alive_non_header field. Supplying a value for the
+  // _is_alive_non_header field is optional but doing so prevents
+  // unnecessary additions to the discovered lists during reference
+  // discovery.
+  G1CMIsAliveClosure _is_alive_closure_cm;
 
   enum G1H_process_strong_roots_tasks {
     G1H_PS_mark_stack_oops_do,
@@ -874,6 +965,7 @@ public:
   // specified by the policy object.
   jint initialize();
 
+  // Initialize weak reference processing.
   virtual void ref_processing_init();
 
   void set_par_threads(int t) {
@@ -925,8 +1017,13 @@ public:
   // The shared block offset table array.
   G1BlockOffsetSharedArray* bot_shared() const { return _bot_shared; }
 
-  // Reference Processing accessor
-  ReferenceProcessor* ref_processor() { return _ref_processor; }
+  // Reference Processing accessors
+
+  // The STW reference processor....
+  ReferenceProcessor* ref_processor_stw() const { return _ref_processor_stw; }
+
+  // The Concurent Marking reference processor...
+  ReferenceProcessor* ref_processor_cm() const { return _ref_processor_cm; }
 
   virtual size_t capacity() const;
   virtual size_t used() const;
diff --git a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
index 92c47409a..24f96aef8 100644
--- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
@@ -152,8 +152,12 @@ G1CollectorPolicy::G1CollectorPolicy() :
 
   _summary(new Summary()),
 
-#ifndef PRODUCT
   _cur_clear_ct_time_ms(0.0),
+
+  _cur_ref_proc_time_ms(0.0),
+  _cur_ref_enq_time_ms(0.0),
+
+#ifndef PRODUCT
   _min_clear_cc_time_ms(-1.0),
   _max_clear_cc_time_ms(-1.0),
   _cur_clear_cc_time_ms(0.0),
@@ -1479,6 +1483,8 @@ void G1CollectorPolicy::record_collection_pause_end() {
 #endif
     print_stats(1, "Other", other_time_ms);
     print_stats(2, "Choose CSet", _recorded_young_cset_choice_time_ms);
+    print_stats(2, "Ref Proc", _cur_ref_proc_time_ms);
+    print_stats(2, "Ref Enq", _cur_ref_enq_time_ms);
 
     for (int i = 0; i < _aux_num; ++i) {
       if (_cur_aux_times_set[i]) {
diff --git a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp
index 9dc7349fa..2c315729e 100644
--- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp
@@ -119,6 +119,8 @@ protected:
   double _cur_satb_drain_time_ms;
   double _cur_clear_ct_time_ms;
   bool   _satb_drain_time_set;
+  double _cur_ref_proc_time_ms;
+  double _cur_ref_enq_time_ms;
 
 #ifndef PRODUCT
   // Card Table Count Cache stats
@@ -986,6 +988,14 @@ public:
     _cur_aux_times_ms[i] += ms;
   }
 
+  void record_ref_proc_time(double ms) {
+    _cur_ref_proc_time_ms = ms;
+  }
+
+  void record_ref_enq_time(double ms) {
+    _cur_ref_enq_time_ms = ms;
+  }
+
 #ifndef PRODUCT
   void record_cc_clear_time(double ms) {
     if (_min_clear_cc_time_ms < 0.0 || ms <= _min_clear_cc_time_ms)
diff --git a/src/share/vm/gc_implementation/g1/g1MarkSweep.cpp b/src/share/vm/gc_implementation/g1/g1MarkSweep.cpp
index 1ed4f993d..84bbadbb9 100644
--- a/src/share/vm/gc_implementation/g1/g1MarkSweep.cpp
+++ b/src/share/vm/gc_implementation/g1/g1MarkSweep.cpp
@@ -62,6 +62,8 @@ void G1MarkSweep::invoke_at_safepoint(ReferenceProcessor* rp,
   // hook up weak ref data so it can be used during Mark-Sweep
   assert(GenMarkSweep::ref_processor() == NULL, "no stomping");
   assert(rp != NULL, "should be non-NULL");
+  assert(rp == G1CollectedHeap::heap()->ref_processor_stw(), "Precondition");
+
   GenMarkSweep::_ref_processor = rp;
   rp->setup_policy(clear_all_softrefs);
 
@@ -139,6 +141,8 @@ void G1MarkSweep::mark_sweep_phase1(bool& marked_for_unloading,
 
   // Process reference objects found during marking
   ReferenceProcessor* rp = GenMarkSweep::ref_processor();
+  assert(rp == G1CollectedHeap::heap()->ref_processor_stw(), "Sanity");
+
   rp->setup_policy(clear_all_softrefs);
   rp->process_discovered_references(&GenMarkSweep::is_alive,
                                     &GenMarkSweep::keep_alive,
@@ -166,7 +170,6 @@ void G1MarkSweep::mark_sweep_phase1(bool& marked_for_unloading,
   GenMarkSweep::follow_mdo_weak_refs();
   assert(GenMarkSweep::_marking_stack.is_empty(), "just drained");
 
-
   // Visit interned string tables and delete unmarked oops
   StringTable::unlink(&GenMarkSweep::is_alive);
   // Clean up unreferenced symbols in symbol table.
@@ -346,7 +349,8 @@ void G1MarkSweep::mark_sweep_phase3() {
                            NULL,  // do not touch code cache here
                            &GenMarkSweep::adjust_pointer_closure);
 
-  g1h->ref_processor()->weak_oops_do(&GenMarkSweep::adjust_root_pointer_closure);
+  assert(GenMarkSweep::ref_processor() == g1h->ref_processor_stw(), "Sanity");
+  g1h->ref_processor_stw()->weak_oops_do(&GenMarkSweep::adjust_root_pointer_closure);
 
   // Now adjust pointers in remaining weak roots.  (All of which should
   // have been cleared if they pointed to non-surviving objects.)
diff --git a/src/share/vm/gc_implementation/g1/g1OopClosures.hpp b/src/share/vm/gc_implementation/g1/g1OopClosures.hpp
index 73d87a11c..84e67725f 100644
--- a/src/share/vm/gc_implementation/g1/g1OopClosures.hpp
+++ b/src/share/vm/gc_implementation/g1/g1OopClosures.hpp
@@ -34,6 +34,7 @@ class CMBitMap;
 class CMMarkStack;
 class G1ParScanThreadState;
 class CMTask;
+class ReferenceProcessor;
 
 // A class that scans oops in a given heap region (much as OopsInGenClosure
 // scans oops in a generation.)
@@ -59,8 +60,15 @@ public:
 
 class G1ParPushHeapRSClosure : public G1ParClosureSuper {
 public:
-  G1ParPushHeapRSClosure(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state) :
-    G1ParClosureSuper(g1, par_scan_state) { }
+  G1ParPushHeapRSClosure(G1CollectedHeap* g1,
+                         G1ParScanThreadState* par_scan_state,
+                         ReferenceProcessor* rp) :
+    G1ParClosureSuper(g1, par_scan_state)
+  {
+    assert(_ref_processor == NULL, "sanity");
+    _ref_processor = rp;
+  }
+
   template <class T> void do_oop_nv(T* p);
   virtual void do_oop(oop* p)          { do_oop_nv(p); }
   virtual void do_oop(narrowOop* p)    { do_oop_nv(p); }
@@ -68,8 +76,13 @@ public:
 
 class G1ParScanClosure : public G1ParClosureSuper {
 public:
-  G1ParScanClosure(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state) :
-    G1ParClosureSuper(g1, par_scan_state) { }
+  G1ParScanClosure(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state, ReferenceProcessor* rp) :
+    G1ParClosureSuper(g1, par_scan_state)
+  {
+    assert(_ref_processor == NULL, "sanity");
+    _ref_processor = rp;
+  }
+
   template <class T> void do_oop_nv(T* p);
   virtual void do_oop(oop* p)          { do_oop_nv(p); }
   virtual void do_oop(narrowOop* p)    { do_oop_nv(p); }
@@ -92,9 +105,18 @@ template <class T> inline oop clear_partial_array_mask(T* ref) {
 
 class G1ParScanPartialArrayClosure : public G1ParClosureSuper {
   G1ParScanClosure _scanner;
+
 public:
-  G1ParScanPartialArrayClosure(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state) :
-    G1ParClosureSuper(g1, par_scan_state), _scanner(g1, par_scan_state) { }
+  G1ParScanPartialArrayClosure(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state, ReferenceProcessor* rp) :
+    G1ParClosureSuper(g1, par_scan_state), _scanner(g1, par_scan_state, rp)
+  {
+    assert(_ref_processor == NULL, "sanity");
+  }
+
+  G1ParScanClosure* scanner() {
+    return &_scanner;
+  }
+
   template <class T> void do_oop_nv(T* p);
   virtual void do_oop(oop* p)       { do_oop_nv(p); }
   virtual void do_oop(narrowOop* p) { do_oop_nv(p); }
@@ -117,10 +139,20 @@ template<bool do_gen_barrier, G1Barrier barrier,
          bool do_mark_object>
 class G1ParCopyClosure : public G1ParCopyHelper {
   G1ParScanClosure _scanner;
+
   template <class T> void do_oop_work(T* p);
+
 public:
-  G1ParCopyClosure(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state) :
-    _scanner(g1, par_scan_state), G1ParCopyHelper(g1, par_scan_state, &_scanner) { }
+  G1ParCopyClosure(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state,
+                   ReferenceProcessor* rp) :
+    _scanner(g1, par_scan_state, rp),
+    G1ParCopyHelper(g1, par_scan_state, &_scanner)
+  {
+    assert(_ref_processor == NULL, "sanity");
+  }
+
+  G1ParScanClosure* scanner() { return &_scanner; }
+
   template <class T> void do_oop_nv(T* p) {
     do_oop_work(p);
   }
@@ -130,21 +162,25 @@ public:
 
 typedef G1ParCopyClosure<false, G1BarrierNone, false> G1ParScanExtRootClosure;
 typedef G1ParCopyClosure<true,  G1BarrierNone, false> G1ParScanPermClosure;
-typedef G1ParCopyClosure<false, G1BarrierRS,   false> G1ParScanHeapRSClosure;
+
 typedef G1ParCopyClosure<false, G1BarrierNone, true> G1ParScanAndMarkExtRootClosure;
 typedef G1ParCopyClosure<true,  G1BarrierNone, true> G1ParScanAndMarkPermClosure;
-typedef G1ParCopyClosure<false, G1BarrierRS,   true> G1ParScanAndMarkHeapRSClosure;
-
-// This is the only case when we set skip_cset_test. Basically, this
-// closure is (should?) only be called directly while we're draining
-// the overflow and task queues. In that case we know that the
-// reference in question points into the collection set, otherwise we
-// would not have pushed it on the queue. The following is defined in
-// g1_specialized_oop_closures.hpp.
-// typedef G1ParCopyClosure<false, G1BarrierEvac, false, true> G1ParScanHeapEvacClosure;
-// We need a separate closure to handle references during evacuation
-// failure processing, as we cannot asume that the reference already
-// points into the collection set (like G1ParScanHeapEvacClosure does).
+
+// The following closure types are no longer used but are retained
+// for historical reasons:
+// typedef G1ParCopyClosure<false, G1BarrierRS,   false> G1ParScanHeapRSClosure;
+// typedef G1ParCopyClosure<false, G1BarrierRS,   true> G1ParScanAndMarkHeapRSClosure;
+
+// The following closure type is defined in g1_specialized_oop_closures.hpp:
+//
+// typedef G1ParCopyClosure<false, G1BarrierEvac, false> G1ParScanHeapEvacClosure;
+
+// We use a separate closure to handle references during evacuation
+// failure processing.
+// We could have used another instance of G1ParScanHeapEvacClosure
+// (since that closure no longer assumes that the references it
+// handles point into the collection set).
+
 typedef G1ParCopyClosure<false, G1BarrierEvac, false> G1ParScanHeapEvacFailureClosure;
 
 class FilterIntoCSClosure: public OopClosure {
@@ -153,9 +189,15 @@ class FilterIntoCSClosure: public OopClosure {
   DirtyCardToOopClosure* _dcto_cl;
 public:
   FilterIntoCSClosure(  DirtyCardToOopClosure* dcto_cl,
-                        G1CollectedHeap* g1, OopClosure* oc) :
+                        G1CollectedHeap* g1,
+                        OopClosure* oc,
+                        ReferenceProcessor* rp) :
     _dcto_cl(dcto_cl), _g1(g1), _oc(oc)
-  {}
+  {
+    assert(_ref_processor == NULL, "sanity");
+    _ref_processor = rp;
+  }
+
   template <class T> void do_oop_nv(T* p);
   virtual void do_oop(oop* p)        { do_oop_nv(p); }
   virtual void do_oop(narrowOop* p)  { do_oop_nv(p); }
diff --git a/src/share/vm/gc_implementation/g1/g1RemSet.cpp b/src/share/vm/gc_implementation/g1/g1RemSet.cpp
index 568bb64f5..04a3bf5e5 100644
--- a/src/share/vm/gc_implementation/g1/g1RemSet.cpp
+++ b/src/share/vm/gc_implementation/g1/g1RemSet.cpp
@@ -234,6 +234,7 @@ void G1RemSet::scanRS(OopsInHeapRegionClosure* oc, int worker_i) {
   HeapRegion *startRegion = calculateStartRegion(worker_i);
 
   ScanRSClosure scanRScl(oc, worker_i);
+
   _g1->collection_set_iterate_from(startRegion, &scanRScl);
   scanRScl.set_try_claimed();
   _g1->collection_set_iterate_from(startRegion, &scanRScl);
@@ -283,6 +284,7 @@ void G1RemSet::updateRS(DirtyCardQueue* into_cset_dcq, int worker_i) {
   double start = os::elapsedTime();
   // Apply the given closure to all remaining log entries.
   RefineRecordRefsIntoCSCardTableEntryClosure into_cset_update_rs_cl(_g1, into_cset_dcq);
+
   _g1->iterate_dirty_card_closure(&into_cset_update_rs_cl, into_cset_dcq, false, worker_i);
 
   // Now there should be no dirty cards.
@@ -466,7 +468,7 @@ public:
     MemRegion scanRegion(start, end);
 
     UpdateRSetImmediate update_rs_cl(_g1->g1_rem_set());
-    FilterIntoCSClosure update_rs_cset_oop_cl(NULL, _g1, &update_rs_cl);
+    FilterIntoCSClosure update_rs_cset_oop_cl(NULL, _g1, &update_rs_cl, NULL /* rp */);
     FilterOutOfRegionClosure filter_then_update_rs_cset_oop_cl(r, &update_rs_cset_oop_cl);
 
     // We can pass false as the "filter_young" parameter here as:
@@ -642,7 +644,7 @@ bool G1RemSet::concurrentRefineOneCard_impl(jbyte* card_ptr, int worker_i,
   update_rs_oop_cl.set_from(r);
 
   TriggerClosure trigger_cl;
-  FilterIntoCSClosure into_cs_cl(NULL, _g1, &trigger_cl);
+  FilterIntoCSClosure into_cs_cl(NULL, _g1, &trigger_cl, NULL /* rp */);
   InvokeIfNotTriggeredClosure invoke_cl(&trigger_cl, &into_cs_cl);
   Mux2Closure mux(&invoke_cl, &update_rs_oop_cl);
 
diff --git a/src/share/vm/gc_implementation/g1/heapRegion.cpp b/src/share/vm/gc_implementation/g1/heapRegion.cpp
index f199cd85f..36636152e 100644
--- a/src/share/vm/gc_implementation/g1/heapRegion.cpp
+++ b/src/share/vm/gc_implementation/g1/heapRegion.cpp
@@ -45,7 +45,7 @@ HeapRegionDCTOC::HeapRegionDCTOC(G1CollectedHeap* g1,
                                  FilterKind fk) :
   ContiguousSpaceDCTOC(hr, cl, precision, NULL),
   _hr(hr), _fk(fk), _g1(g1)
-{}
+{ }
 
 FilterOutOfRegionClosure::FilterOutOfRegionClosure(HeapRegion* r,
                                                    OopClosure* oc) :
@@ -214,8 +214,37 @@ void HeapRegionDCTOC::walk_mem_region_with_cl(MemRegion mr,
   int oop_size;
 
   OopClosure* cl2 = cl;
-  FilterIntoCSClosure intoCSFilt(this, g1h, cl);
+
+  // If we are scanning the remembered sets looking for refs
+  // into the collection set during an evacuation pause then
+  // we will want to 'discover' reference objects that point
+  // to referents in the collection set.
+  //
+  // Unfortunately it is an instance of FilterIntoCSClosure
+  // that is iterated over the reference fields of oops in
+  // mr (and not the G1ParPushHeapRSClosure - which is the
+  // cl parameter).
+  // If we set the _ref_processor field in the FilterIntoCSClosure
+  // instance, all the reference objects that are walked
+  // (regardless of whether their referent object's are in
+  // the cset) will be 'discovered'.
+  //
+  // The G1STWIsAlive closure considers a referent object that
+  // is outside the cset as alive. The G1CopyingKeepAliveClosure
+  // skips referents that are not in the cset.
+  //
+  // Therefore reference objects in mr with a referent that is
+  // outside the cset should be OK.
+
+  ReferenceProcessor* rp = _cl->_ref_processor;
+  if (rp != NULL) {
+    assert(rp == _g1->ref_processor_stw(), "should be stw");
+    assert(_fk == IntoCSFilterKind, "should be looking for refs into CS");
+  }
+
+  FilterIntoCSClosure intoCSFilt(this, g1h, cl, rp);
   FilterOutOfRegionClosure outOfRegionFilt(_hr, cl);
+
   switch (_fk) {
   case IntoCSFilterKind:      cl2 = &intoCSFilt; break;
   case OutOfRegionFilterKind: cl2 = &outOfRegionFilt; break;
@@ -239,16 +268,19 @@ void HeapRegionDCTOC::walk_mem_region_with_cl(MemRegion mr,
     case NoFilterKind:
       bottom = walk_mem_region_loop(cl, g1h, _hr, bottom, top);
       break;
+
     case IntoCSFilterKind: {
-      FilterIntoCSClosure filt(this, g1h, cl);
+      FilterIntoCSClosure filt(this, g1h, cl, rp);
       bottom = walk_mem_region_loop(&filt, g1h, _hr, bottom, top);
       break;
     }
+
     case OutOfRegionFilterKind: {
       FilterOutOfRegionClosure filt(_hr, cl);
       bottom = walk_mem_region_loop(&filt, g1h, _hr, bottom, top);
       break;
     }
+
     default:
       ShouldNotReachHere();
     }
@@ -483,7 +515,7 @@ HeapRegion::
 HeapRegion(size_t hrs_index, G1BlockOffsetSharedArray* sharedOffsetArray,
            MemRegion mr, bool is_zeroed)
   : G1OffsetTableContigSpace(sharedOffsetArray, mr, is_zeroed),
-    _next_fk(HeapRegionDCTOC::NoFilterKind), _hrs_index(hrs_index),
+    _hrs_index(hrs_index),
     _humongous_type(NotHumongous), _humongous_start_region(NULL),
     _in_collection_set(false),
     _next_in_special_set(NULL), _orig_end(NULL),
diff --git a/src/share/vm/gc_implementation/g1/heapRegion.hpp b/src/share/vm/gc_implementation/g1/heapRegion.hpp
index 774987dd8..f2edf01b9 100644
--- a/src/share/vm/gc_implementation/g1/heapRegion.hpp
+++ b/src/share/vm/gc_implementation/g1/heapRegion.hpp
@@ -118,7 +118,6 @@ public:
                   FilterKind fk);
 };
 
-
 // The complicating factor is that BlockOffsetTable diverged
 // significantly, and we need functionality that is only in the G1 version.
 // So I copied that code, which led to an alternate G1 version of
@@ -223,10 +222,6 @@ class HeapRegion: public G1OffsetTableContigSpace {
     ContinuesHumongous
   };
 
-  // The next filter kind that should be used for a "new_dcto_cl" call with
-  // the "traditional" signature.
-  HeapRegionDCTOC::FilterKind _next_fk;
-
   // Requires that the region "mr" be dense with objects, and begin and end
   // with an object.
   void oops_in_mr_iterate(MemRegion mr, OopClosure* cl);
@@ -573,40 +568,14 @@ class HeapRegion: public G1OffsetTableContigSpace {
   // allocated in the current region before the last call to "save_mark".
   void oop_before_save_marks_iterate(OopClosure* cl);
 
-  // This call determines the "filter kind" argument that will be used for
-  // the next call to "new_dcto_cl" on this region with the "traditional"
-  // signature (i.e., the call below.)  The default, in the absence of a
-  // preceding call to this method, is "NoFilterKind", and a call to this
-  // method is necessary for each such call, or else it reverts to the
-  // default.
-  // (This is really ugly, but all other methods I could think of changed a
-  // lot of main-line code for G1.)
-  void set_next_filter_kind(HeapRegionDCTOC::FilterKind nfk) {
-    _next_fk = nfk;
-  }
-
   DirtyCardToOopClosure*
   new_dcto_closure(OopClosure* cl,
                    CardTableModRefBS::PrecisionStyle precision,
                    HeapRegionDCTOC::FilterKind fk);
 
-#if WHASSUP
-  DirtyCardToOopClosure*
-  new_dcto_closure(OopClosure* cl,
-                   CardTableModRefBS::PrecisionStyle precision,
-                   HeapWord* boundary) {
-    assert(boundary == NULL, "This arg doesn't make sense here.");
-    DirtyCardToOopClosure* res = new_dcto_closure(cl, precision, _next_fk);
-    _next_fk = HeapRegionDCTOC::NoFilterKind;
-    return res;
-  }
-#endif
-
-  //
   // Note the start or end of marking. This tells the heap region
   // that the collector is about to start or has finished (concurrently)
   // marking the heap.
-  //
 
   // Note the start of a marking phase. Record the
   // start of the unmarked area of the region here.
diff --git a/src/share/vm/gc_implementation/g1/satbQueue.cpp b/src/share/vm/gc_implementation/g1/satbQueue.cpp
index 5a7a7b694..ea0c19a89 100644
--- a/src/share/vm/gc_implementation/g1/satbQueue.cpp
+++ b/src/share/vm/gc_implementation/g1/satbQueue.cpp
@@ -29,6 +29,7 @@
 #include "memory/sharedHeap.hpp"
 #include "runtime/mutexLocker.hpp"
 #include "runtime/thread.hpp"
+#include "runtime/vmThread.hpp"
 
 // This method removes entries from an SATB buffer that will not be
 // useful to the concurrent marking threads. An entry is removed if it
@@ -252,9 +253,18 @@ void SATBMarkQueueSet::par_iterate_closure_all_threads(int worker) {
       t->satb_mark_queue().apply_closure(_par_closures[worker]);
     }
   }
-  // We'll have worker 0 do this one.
-  if (worker == 0) {
-    shared_satb_queue()->apply_closure(_par_closures[0]);
+
+  // We also need to claim the VMThread so that its parity is updated
+  // otherwise the next call to Thread::possibly_parallel_oops_do inside
+  // a StrongRootsScope might skip the VMThread because it has a stale
+  // parity that matches the parity set by the StrongRootsScope
+  //
+  // Whichever worker succeeds in claiming the VMThread gets to do
+  // the shared queue.
+
+  VMThread* vmt = VMThread::vm_thread();
+  if (vmt->claim_oops_do(true, parity)) {
+    shared_satb_queue()->apply_closure(_par_closures[worker]);
   }
 }
 
diff --git a/src/share/vm/gc_implementation/parallelScavenge/psMarkSweep.cpp b/src/share/vm/gc_implementation/parallelScavenge/psMarkSweep.cpp
index 3d7e0ba4c..f2965e674 100644
--- a/src/share/vm/gc_implementation/parallelScavenge/psMarkSweep.cpp
+++ b/src/share/vm/gc_implementation/parallelScavenge/psMarkSweep.cpp
@@ -198,10 +198,9 @@ void PSMarkSweep::invoke_no_policy(bool clear_all_softrefs) {
 
     allocate_stacks();
 
-    NOT_PRODUCT(ref_processor()->verify_no_references_recorded());
     COMPILER2_PRESENT(DerivedPointerTable::clear());
 
-    ref_processor()->enable_discovery();
+    ref_processor()->enable_discovery(true /*verify_disabled*/, true /*verify_no_refs*/);
     ref_processor()->setup_policy(clear_all_softrefs);
 
     mark_sweep_phase1(clear_all_softrefs);
diff --git a/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp b/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp
index 55cb34350..a62059c68 100644
--- a/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp
+++ b/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp
@@ -2069,10 +2069,9 @@ void PSParallelCompact::invoke_no_policy(bool maximum_heap_compaction) {
     CodeCache::gc_prologue();
     Threads::gc_prologue();
 
-    NOT_PRODUCT(ref_processor()->verify_no_references_recorded());
     COMPILER2_PRESENT(DerivedPointerTable::clear());
 
-    ref_processor()->enable_discovery();
+    ref_processor()->enable_discovery(true /*verify_disabled*/, true /*verify_no_refs*/);
     ref_processor()->setup_policy(maximum_heap_compaction);
 
     bool marked_for_unloading = false;
diff --git a/src/share/vm/gc_implementation/parallelScavenge/psScavenge.cpp b/src/share/vm/gc_implementation/parallelScavenge/psScavenge.cpp
index b2234676f..1094d1708 100644
--- a/src/share/vm/gc_implementation/parallelScavenge/psScavenge.cpp
+++ b/src/share/vm/gc_implementation/parallelScavenge/psScavenge.cpp
@@ -350,10 +350,9 @@ bool PSScavenge::invoke_no_policy() {
     }
     save_to_space_top_before_gc();
 
-    NOT_PRODUCT(reference_processor()->verify_no_references_recorded());
     COMPILER2_PRESENT(DerivedPointerTable::clear());
 
-    reference_processor()->enable_discovery();
+    reference_processor()->enable_discovery(true /*verify_disabled*/, true /*verify_no_refs*/);
     reference_processor()->setup_policy(false);
 
     // We track how much was promoted to the next generation for
diff --git a/src/share/vm/memory/genCollectedHeap.cpp b/src/share/vm/memory/genCollectedHeap.cpp
index f233cbc7b..d6a54e17e 100644
--- a/src/share/vm/memory/genCollectedHeap.cpp
+++ b/src/share/vm/memory/genCollectedHeap.cpp
@@ -599,8 +599,7 @@ void GenCollectedHeap::do_collection(bool  full,
           // atomic wrt other collectors in this configuration, we
           // are guaranteed to have empty discovered ref lists.
           if (rp->discovery_is_atomic()) {
-            rp->verify_no_references_recorded();
-            rp->enable_discovery();
+            rp->enable_discovery(true /*verify_disabled*/, true /*verify_no_refs*/);
             rp->setup_policy(do_clear_all_soft_refs);
           } else {
             // collect() below will enable discovery as appropriate
diff --git a/src/share/vm/memory/referenceProcessor.cpp b/src/share/vm/memory/referenceProcessor.cpp
index 9b593ef3f..27a1ff41b 100644
--- a/src/share/vm/memory/referenceProcessor.cpp
+++ b/src/share/vm/memory/referenceProcessor.cpp
@@ -35,42 +35,8 @@
 
 ReferencePolicy* ReferenceProcessor::_always_clear_soft_ref_policy = NULL;
 ReferencePolicy* ReferenceProcessor::_default_soft_ref_policy      = NULL;
-const int        subclasses_of_ref                = REF_PHANTOM - REF_OTHER;
 bool             ReferenceProcessor::_pending_list_uses_discovered_field = false;
 
-// List of discovered references.
-class DiscoveredList {
-public:
-  DiscoveredList() : _len(0), _compressed_head(0), _oop_head(NULL) { }
-  oop head() const     {
-     return UseCompressedOops ?  oopDesc::decode_heap_oop(_compressed_head) :
-                                _oop_head;
-  }
-  HeapWord* adr_head() {
-    return UseCompressedOops ? (HeapWord*)&_compressed_head :
-                               (HeapWord*)&_oop_head;
-  }
-  void   set_head(oop o) {
-    if (UseCompressedOops) {
-      // Must compress the head ptr.
-      _compressed_head = oopDesc::encode_heap_oop(o);
-    } else {
-      _oop_head = o;
-    }
-  }
-  bool   empty() const          { return head() == NULL; }
-  size_t length()               { return _len; }
-  void   set_length(size_t len) { _len = len;  }
-  void   inc_length(size_t inc) { _len += inc; assert(_len > 0, "Error"); }
-  void   dec_length(size_t dec) { _len -= dec; }
-private:
-  // Set value depending on UseCompressedOops. This could be a template class
-  // but then we have to fix all the instantiations and declarations that use this class.
-  oop       _oop_head;
-  narrowOop _compressed_head;
-  size_t _len;
-};
-
 void referenceProcessor_init() {
   ReferenceProcessor::init_statics();
 }
@@ -112,7 +78,8 @@ ReferenceProcessor::ReferenceProcessor(MemRegion span,
   _discovery_is_mt     = mt_discovery;
   _num_q               = MAX2(1, mt_processing_degree);
   _max_num_q           = MAX2(_num_q, mt_discovery_degree);
-  _discoveredSoftRefs  = NEW_C_HEAP_ARRAY(DiscoveredList, _max_num_q * subclasses_of_ref);
+  _discoveredSoftRefs  = NEW_C_HEAP_ARRAY(DiscoveredList,
+                                          _max_num_q * number_of_subclasses_of_ref());
   if (_discoveredSoftRefs == NULL) {
     vm_exit_during_initialization("Could not allocated RefProc Array");
   }
@@ -120,7 +87,7 @@ ReferenceProcessor::ReferenceProcessor(MemRegion span,
   _discoveredFinalRefs   = &_discoveredWeakRefs[_max_num_q];
   _discoveredPhantomRefs = &_discoveredFinalRefs[_max_num_q];
   // Initialized all entries to NULL
-  for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) {
+  for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
     _discoveredSoftRefs[i].set_head(NULL);
     _discoveredSoftRefs[i].set_length(0);
   }
@@ -134,19 +101,15 @@ ReferenceProcessor::ReferenceProcessor(MemRegion span,
 #ifndef PRODUCT
 void ReferenceProcessor::verify_no_references_recorded() {
   guarantee(!_discovering_refs, "Discovering refs?");
-  for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) {
-    guarantee(_discoveredSoftRefs[i].empty(),
+  for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
+    guarantee(_discoveredSoftRefs[i].is_empty(),
               "Found non-empty discovered list");
   }
 }
 #endif
 
 void ReferenceProcessor::weak_oops_do(OopClosure* f) {
-  // Should this instead be
-  // for (int i = 0; i < subclasses_of_ref; i++_ {
-  //   for (int j = 0; j < _num_q; j++) {
-  //     int index = i * _max_num_q + j;
-  for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) {
+  for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
     if (UseCompressedOops) {
       f->do_oop((narrowOop*)_discoveredSoftRefs[i].adr_head());
     } else {
@@ -404,7 +367,7 @@ public:
     // allocated and are indexed into.
     assert(_n_queues == (int) _ref_processor.max_num_q(), "Different number not expected");
     for (int j = 0;
-         j < subclasses_of_ref;
+         j < ReferenceProcessor::number_of_subclasses_of_ref();
          j++, index += _n_queues) {
       _ref_processor.enqueue_discovered_reflist(
         _refs_lists[index], _pending_list_addr);
@@ -424,7 +387,7 @@ void ReferenceProcessor::enqueue_discovered_reflists(HeapWord* pending_list_addr
     task_executor->execute(tsk);
   } else {
     // Serial code: call the parent class's implementation
-    for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) {
+    for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
       enqueue_discovered_reflist(_discoveredSoftRefs[i], pending_list_addr);
       _discoveredSoftRefs[i].set_head(NULL);
       _discoveredSoftRefs[i].set_length(0);
@@ -432,119 +395,7 @@ void ReferenceProcessor::enqueue_discovered_reflists(HeapWord* pending_list_addr
   }
 }
 
-// Iterator for the list of discovered references.
-class DiscoveredListIterator {
-public:
-  inline DiscoveredListIterator(DiscoveredList&    refs_list,
-                                OopClosure*        keep_alive,
-                                BoolObjectClosure* is_alive);
-
-  // End Of List.
-  inline bool has_next() const { return _ref != NULL; }
-
-  // Get oop to the Reference object.
-  inline oop obj() const { return _ref; }
-
-  // Get oop to the referent object.
-  inline oop referent() const { return _referent; }
-
-  // Returns true if referent is alive.
-  inline bool is_referent_alive() const;
-
-  // Loads data for the current reference.
-  // The "allow_null_referent" argument tells us to allow for the possibility
-  // of a NULL referent in the discovered Reference object. This typically
-  // happens in the case of concurrent collectors that may have done the
-  // discovery concurrently, or interleaved, with mutator execution.
-  inline void load_ptrs(DEBUG_ONLY(bool allow_null_referent));
-
-  // Move to the next discovered reference.
-  inline void next();
-
-  // Remove the current reference from the list
-  inline void remove();
-
-  // Make the Reference object active again.
-  inline void make_active() { java_lang_ref_Reference::set_next(_ref, NULL); }
-
-  // Make the referent alive.
-  inline void make_referent_alive() {
-    if (UseCompressedOops) {
-      _keep_alive->do_oop((narrowOop*)_referent_addr);
-    } else {
-      _keep_alive->do_oop((oop*)_referent_addr);
-    }
-  }
-
-  // Update the discovered field.
-  inline void update_discovered() {
-    // First _prev_next ref actually points into DiscoveredList (gross).
-    if (UseCompressedOops) {
-      if (!oopDesc::is_null(*(narrowOop*)_prev_next)) {
-        _keep_alive->do_oop((narrowOop*)_prev_next);
-      }
-    } else {
-      if (!oopDesc::is_null(*(oop*)_prev_next)) {
-        _keep_alive->do_oop((oop*)_prev_next);
-      }
-    }
-  }
-
-  // NULL out referent pointer.
-  inline void clear_referent() { oop_store_raw(_referent_addr, NULL); }
-
-  // Statistics
-  NOT_PRODUCT(
-  inline size_t processed() const { return _processed; }
-  inline size_t removed() const   { return _removed; }
-  )
-
-  inline void move_to_next();
-
-private:
-  DiscoveredList&    _refs_list;
-  HeapWord*          _prev_next;
-  oop                _prev;
-  oop                _ref;
-  HeapWord*          _discovered_addr;
-  oop                _next;
-  HeapWord*          _referent_addr;
-  oop                _referent;
-  OopClosure*        _keep_alive;
-  BoolObjectClosure* _is_alive;
-  DEBUG_ONLY(
-  oop                _first_seen; // cyclic linked list check
-  )
-  NOT_PRODUCT(
-  size_t             _processed;
-  size_t             _removed;
-  )
-};
-
-inline DiscoveredListIterator::DiscoveredListIterator(DiscoveredList&    refs_list,
-                                                      OopClosure*        keep_alive,
-                                                      BoolObjectClosure* is_alive)
-  : _refs_list(refs_list),
-    _prev_next(refs_list.adr_head()),
-    _prev(NULL),
-    _ref(refs_list.head()),
-#ifdef ASSERT
-    _first_seen(refs_list.head()),
-#endif
-#ifndef PRODUCT
-    _processed(0),
-    _removed(0),
-#endif
-    _next(NULL),
-    _keep_alive(keep_alive),
-    _is_alive(is_alive)
-{ }
-
-inline bool DiscoveredListIterator::is_referent_alive() const {
-  return _is_alive->do_object_b(_referent);
-}
-
-inline void DiscoveredListIterator::load_ptrs(DEBUG_ONLY(bool allow_null_referent)) {
+void DiscoveredListIterator::load_ptrs(DEBUG_ONLY(bool allow_null_referent)) {
   _discovered_addr = java_lang_ref_Reference::discovered_addr(_ref);
   oop discovered = java_lang_ref_Reference::discovered(_ref);
   assert(_discovered_addr && discovered->is_oop_or_null(),
@@ -560,13 +411,7 @@ inline void DiscoveredListIterator::load_ptrs(DEBUG_ONLY(bool allow_null_referen
          "bad referent");
 }
 
-inline void DiscoveredListIterator::next() {
-  _prev_next = _discovered_addr;
-  _prev = _ref;
-  move_to_next();
-}
-
-inline void DiscoveredListIterator::remove() {
+void DiscoveredListIterator::remove() {
   assert(_ref->is_oop(), "Dropping a bad reference");
   oop_store_raw(_discovered_addr, NULL);
 
@@ -592,15 +437,29 @@ inline void DiscoveredListIterator::remove() {
   _refs_list.dec_length(1);
 }
 
-inline void DiscoveredListIterator::move_to_next() {
-  if (_ref == _next) {
-    // End of the list.
-    _ref = NULL;
+// Make the Reference object active again.
+void DiscoveredListIterator::make_active() {
+  // For G1 we don't want to use set_next - it
+  // will dirty the card for the next field of
+  // the reference object and will fail
+  // CT verification.
+  if (UseG1GC) {
+    BarrierSet* bs = oopDesc::bs();
+    HeapWord* next_addr = java_lang_ref_Reference::next_addr(_ref);
+
+    if (UseCompressedOops) {
+      bs->write_ref_field_pre((narrowOop*)next_addr, NULL);
+    } else {
+      bs->write_ref_field_pre((oop*)next_addr, NULL);
+    }
+    java_lang_ref_Reference::set_next_raw(_ref, NULL);
   } else {
-    _ref = _next;
+    java_lang_ref_Reference::set_next(_ref, NULL);
   }
-  assert(_ref != _first_seen, "cyclic ref_list found");
-  NOT_PRODUCT(_processed++);
+}
+
+void DiscoveredListIterator::clear_referent() {
+  oop_store_raw(_referent_addr, NULL);
 }
 
 // NOTE: process_phase*() are largely similar, and at a high level
@@ -786,10 +645,9 @@ ReferenceProcessor::abandon_partial_discovered_list(DiscoveredList& refs_list) {
 
 void ReferenceProcessor::abandon_partial_discovery() {
   // loop over the lists
-  for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) {
+  for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
     if (TraceReferenceGC && PrintGCDetails && ((i % _max_num_q) == 0)) {
-      gclog_or_tty->print_cr("\nAbandoning %s discovered list",
-                             list_name(i));
+      gclog_or_tty->print_cr("\nAbandoning %s discovered list", list_name(i));
     }
     abandon_partial_discovered_list(_discoveredSoftRefs[i]);
   }
@@ -858,6 +716,14 @@ private:
   bool _clear_referent;
 };
 
+void ReferenceProcessor::set_discovered(oop ref, oop value) {
+  if (_discovered_list_needs_barrier) {
+    java_lang_ref_Reference::set_discovered(ref, value);
+  } else {
+    java_lang_ref_Reference::set_discovered_raw(ref, value);
+  }
+}
+
 // Balances reference queues.
 // Move entries from all queues[0, 1, ..., _max_num_q-1] to
 // queues[0, 1, ..., _num_q-1] because only the first _num_q
@@ -915,9 +781,9 @@ void ReferenceProcessor::balance_queues(DiscoveredList ref_lists[])
         // Add the chain to the to list.
         if (ref_lists[to_idx].head() == NULL) {
           // to list is empty. Make a loop at the end.
-          java_lang_ref_Reference::set_discovered(move_tail, move_tail);
+          set_discovered(move_tail, move_tail);
         } else {
-          java_lang_ref_Reference::set_discovered(move_tail, ref_lists[to_idx].head());
+          set_discovered(move_tail, ref_lists[to_idx].head());
         }
         ref_lists[to_idx].set_head(move_head);
         ref_lists[to_idx].inc_length(refs_to_move);
@@ -1038,11 +904,7 @@ ReferenceProcessor::process_discovered_reflist(
 
 void ReferenceProcessor::clean_up_discovered_references() {
   // loop over the lists
-  // Should this instead be
-  // for (int i = 0; i < subclasses_of_ref; i++_ {
-  //   for (int j = 0; j < _num_q; j++) {
-  //     int index = i * _max_num_q + j;
-  for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) {
+  for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
     if (TraceReferenceGC && PrintGCDetails && ((i % _max_num_q) == 0)) {
       gclog_or_tty->print_cr(
         "\nScrubbing %s discovered list of Null referents",
@@ -1260,6 +1122,8 @@ bool ReferenceProcessor::discover_reference(oop obj, ReferenceType rt) {
     }
   }
 
+  ResourceMark rm;      // Needed for tracing.
+
   HeapWord* const discovered_addr = java_lang_ref_Reference::discovered_addr(obj);
   const oop  discovered = java_lang_ref_Reference::discovered(obj);
   assert(discovered->is_oop_or_null(), "bad discovered field");
@@ -1472,7 +1336,9 @@ ReferenceProcessor::preclean_discovered_reflist(DiscoveredList&    refs_list,
 }
 
 const char* ReferenceProcessor::list_name(int i) {
-   assert(i >= 0 && i <= _max_num_q * subclasses_of_ref, "Out of bounds index");
+   assert(i >= 0 && i <= _max_num_q * number_of_subclasses_of_ref(),
+          "Out of bounds index");
+
    int j = i / _max_num_q;
    switch (j) {
      case 0: return "SoftRef";
@@ -1493,7 +1359,7 @@ void ReferenceProcessor::verify_ok_to_handle_reflists() {
 #ifndef PRODUCT
 void ReferenceProcessor::clear_discovered_references() {
   guarantee(!_discovering_refs, "Discovering refs?");
-  for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) {
+  for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
     clear_discovered_references(_discoveredSoftRefs[i]);
   }
 }
diff --git a/src/share/vm/memory/referenceProcessor.hpp b/src/share/vm/memory/referenceProcessor.hpp
index 8178f60ea..63ed31ddd 100644
--- a/src/share/vm/memory/referenceProcessor.hpp
+++ b/src/share/vm/memory/referenceProcessor.hpp
@@ -48,18 +48,175 @@
 // forward references
 class ReferencePolicy;
 class AbstractRefProcTaskExecutor;
-class DiscoveredList;
+
+// List of discovered references.
+class DiscoveredList {
+public:
+  DiscoveredList() : _len(0), _compressed_head(0), _oop_head(NULL) { }
+  oop head() const     {
+     return UseCompressedOops ?  oopDesc::decode_heap_oop(_compressed_head) :
+                                _oop_head;
+  }
+  HeapWord* adr_head() {
+    return UseCompressedOops ? (HeapWord*)&_compressed_head :
+                               (HeapWord*)&_oop_head;
+  }
+  void set_head(oop o) {
+    if (UseCompressedOops) {
+      // Must compress the head ptr.
+      _compressed_head = oopDesc::encode_heap_oop(o);
+    } else {
+      _oop_head = o;
+    }
+  }
+  bool   is_empty() const       { return head() == NULL; }
+  size_t length()               { return _len; }
+  void   set_length(size_t len) { _len = len;  }
+  void   inc_length(size_t inc) { _len += inc; assert(_len > 0, "Error"); }
+  void   dec_length(size_t dec) { _len -= dec; }
+private:
+  // Set value depending on UseCompressedOops. This could be a template class
+  // but then we have to fix all the instantiations and declarations that use this class.
+  oop       _oop_head;
+  narrowOop _compressed_head;
+  size_t _len;
+};
+
+// Iterator for the list of discovered references.
+class DiscoveredListIterator {
+private:
+  DiscoveredList&    _refs_list;
+  HeapWord*          _prev_next;
+  oop                _prev;
+  oop                _ref;
+  HeapWord*          _discovered_addr;
+  oop                _next;
+  HeapWord*          _referent_addr;
+  oop                _referent;
+  OopClosure*        _keep_alive;
+  BoolObjectClosure* _is_alive;
+
+  DEBUG_ONLY(
+  oop                _first_seen; // cyclic linked list check
+  )
+
+  NOT_PRODUCT(
+  size_t             _processed;
+  size_t             _removed;
+  )
+
+public:
+  inline DiscoveredListIterator(DiscoveredList&    refs_list,
+                                OopClosure*        keep_alive,
+                                BoolObjectClosure* is_alive):
+    _refs_list(refs_list),
+    _prev_next(refs_list.adr_head()),
+    _prev(NULL),
+    _ref(refs_list.head()),
+#ifdef ASSERT
+    _first_seen(refs_list.head()),
+#endif
+#ifndef PRODUCT
+    _processed(0),
+    _removed(0),
+#endif
+    _next(NULL),
+    _keep_alive(keep_alive),
+    _is_alive(is_alive)
+{ }
+
+  // End Of List.
+  inline bool has_next() const { return _ref != NULL; }
+
+  // Get oop to the Reference object.
+  inline oop obj() const { return _ref; }
+
+  // Get oop to the referent object.
+  inline oop referent() const { return _referent; }
+
+  // Returns true if referent is alive.
+  inline bool is_referent_alive() const {
+    return _is_alive->do_object_b(_referent);
+  }
+
+  // Loads data for the current reference.
+  // The "allow_null_referent" argument tells us to allow for the possibility
+  // of a NULL referent in the discovered Reference object. This typically
+  // happens in the case of concurrent collectors that may have done the
+  // discovery concurrently, or interleaved, with mutator execution.
+  void load_ptrs(DEBUG_ONLY(bool allow_null_referent));
+
+  // Move to the next discovered reference.
+  inline void next() {
+    _prev_next = _discovered_addr;
+    _prev = _ref;
+    move_to_next();
+  }
+
+  // Remove the current reference from the list
+  void remove();
+
+  // Make the Reference object active again.
+  void make_active();
+
+  // Make the referent alive.
+  inline void make_referent_alive() {
+    if (UseCompressedOops) {
+      _keep_alive->do_oop((narrowOop*)_referent_addr);
+    } else {
+      _keep_alive->do_oop((oop*)_referent_addr);
+    }
+  }
+
+  // Update the discovered field.
+  inline void update_discovered() {
+    // First _prev_next ref actually points into DiscoveredList (gross).
+    if (UseCompressedOops) {
+      if (!oopDesc::is_null(*(narrowOop*)_prev_next)) {
+        _keep_alive->do_oop((narrowOop*)_prev_next);
+      }
+    } else {
+      if (!oopDesc::is_null(*(oop*)_prev_next)) {
+        _keep_alive->do_oop((oop*)_prev_next);
+      }
+    }
+  }
+
+  // NULL out referent pointer.
+  void clear_referent();
+
+  // Statistics
+  NOT_PRODUCT(
+  inline size_t processed() const { return _processed; }
+  inline size_t removed() const   { return _removed; }
+  )
+
+  inline void move_to_next() {
+    if (_ref == _next) {
+      // End of the list.
+      _ref = NULL;
+    } else {
+      _ref = _next;
+    }
+    assert(_ref != _first_seen, "cyclic ref_list found");
+    NOT_PRODUCT(_processed++);
+  }
+
+};
 
 class ReferenceProcessor : public CHeapObj {
  protected:
   // Compatibility with pre-4965777 JDK's
   static bool _pending_list_uses_discovered_field;
-  MemRegion   _span; // (right-open) interval of heap
-                     // subject to wkref discovery
-  bool        _discovering_refs;      // true when discovery enabled
-  bool        _discovery_is_atomic;   // if discovery is atomic wrt
-                                      // other collectors in configuration
-  bool        _discovery_is_mt;       // true if reference discovery is MT.
+
+  MemRegion   _span;                    // (right-open) interval of heap
+                                        // subject to wkref discovery
+
+  bool        _discovering_refs;        // true when discovery enabled
+  bool        _discovery_is_atomic;     // if discovery is atomic wrt
+                                        // other collectors in configuration
+  bool        _discovery_is_mt;         // true if reference discovery is MT.
+
   // If true, setting "next" field of a discovered refs list requires
   // write barrier(s).  (Must be true if used in a collector in which
   // elements of a discovered list may be moved during discovery: for
@@ -67,18 +224,19 @@ class ReferenceProcessor : public CHeapObj {
   // long-term concurrent marking phase that does weak reference
   // discovery.)
   bool        _discovered_list_needs_barrier;
-  BarrierSet* _bs;                    // Cached copy of BarrierSet.
-  bool        _enqueuing_is_done;     // true if all weak references enqueued
-  bool        _processing_is_mt;      // true during phases when
-                                      // reference processing is MT.
-  int         _next_id;               // round-robin mod _num_q counter in
-                                      // support of work distribution
-
-  // For collectors that do not keep GC marking information
+
+  BarrierSet* _bs;                      // Cached copy of BarrierSet.
+  bool        _enqueuing_is_done;       // true if all weak references enqueued
+  bool        _processing_is_mt;        // true during phases when
+                                        // reference processing is MT.
+  int         _next_id;                 // round-robin mod _num_q counter in
+                                        // support of work distribution
+
+  // For collectors that do not keep GC liveness information
   // in the object header, this field holds a closure that
   // helps the reference processor determine the reachability
-  // of an oop (the field is currently initialized to NULL for
-  // all collectors but the CMS collector).
+  // of an oop. It is currently initialized to NULL for all
+  // collectors except for CMS and G1.
   BoolObjectClosure* _is_alive_non_header;
 
   // Soft ref clearing policies
@@ -102,10 +260,13 @@ class ReferenceProcessor : public CHeapObj {
   DiscoveredList* _discoveredPhantomRefs;
 
  public:
-  int num_q()                            { return _num_q; }
-  int max_num_q()                        { return _max_num_q; }
-  void set_active_mt_degree(int v)       { _num_q = v; }
-  DiscoveredList* discovered_soft_refs() { return _discoveredSoftRefs; }
+  static int number_of_subclasses_of_ref() { return (REF_PHANTOM - REF_OTHER); }
+
+  int num_q()                              { return _num_q; }
+  int max_num_q()                          { return _max_num_q; }
+  void set_active_mt_degree(int v)         { _num_q = v; }
+  DiscoveredList* discovered_soft_refs()   { return _discoveredSoftRefs; }
+
   ReferencePolicy* setup_policy(bool always_clear) {
     _current_soft_ref_policy = always_clear ?
       _always_clear_soft_ref_policy : _default_soft_ref_policy;
@@ -205,6 +366,11 @@ class ReferenceProcessor : public CHeapObj {
   void enqueue_discovered_reflists(HeapWord* pending_list_addr, AbstractRefProcTaskExecutor* task_executor);
 
  protected:
+  // Set the 'discovered' field of the given reference to
+  // the given value - emitting barriers depending upon
+  // the value of _discovered_list_needs_barrier.
+  void set_discovered(oop ref, oop value);
+
   // "Preclean" the given discovered reference list
   // by removing references with strongly reachable referents.
   // Currently used in support of CMS only.
@@ -290,7 +456,19 @@ class ReferenceProcessor : public CHeapObj {
   void      set_span(MemRegion span) { _span = span; }
 
   // start and stop weak ref discovery
-  void enable_discovery()   { _discovering_refs = true;  }
+  void enable_discovery(bool verify_disabled, bool check_no_refs) {
+#ifdef ASSERT
+    // Verify that we're not currently discovering refs
+    assert(!verify_disabled || !_discovering_refs, "nested call?");
+
+    if (check_no_refs) {
+      // Verify that the discovered lists are empty
+      verify_no_references_recorded();
+    }
+#endif // ASSERT
+    _discovering_refs = true;
+  }
+
   void disable_discovery()  { _discovering_refs = false; }
   bool discovery_enabled()  { return _discovering_refs;  }
 
@@ -365,7 +543,7 @@ class NoRefDiscovery: StackObj {
 
   ~NoRefDiscovery() {
     if (_was_discovering_refs) {
-      _rp->enable_discovery();
+      _rp->enable_discovery(true /*verify_disabled*/, false /*check_no_refs*/);
     }
   }
 };
diff --git a/src/share/vm/runtime/thread.cpp b/src/share/vm/runtime/thread.cpp
index 5b5138e83..8b7059c83 100644
--- a/src/share/vm/runtime/thread.cpp
+++ b/src/share/vm/runtime/thread.cpp
@@ -749,8 +749,9 @@ bool Thread::claim_oops_do_par_case(int strong_roots_parity) {
   jint thread_parity = _oops_do_parity;
   if (thread_parity != strong_roots_parity) {
     jint res = Atomic::cmpxchg(strong_roots_parity, &_oops_do_parity, thread_parity);
-    if (res == thread_parity) return true;
-    else {
+    if (res == thread_parity) {
+      return true;
+    } else {
       guarantee(res == strong_roots_parity, "Or else what?");
       assert(SharedHeap::heap()->n_par_threads() > 0,
              "Should only fail when parallel.");
@@ -3905,8 +3906,9 @@ void Threads::possibly_parallel_oops_do(OopClosure* f, CodeBlobClosure* cf) {
     }
   }
   VMThread* vmt = VMThread::vm_thread();
-  if (vmt->claim_oops_do(is_par, cp))
+  if (vmt->claim_oops_do(is_par, cp)) {
     vmt->oops_do(f, cf);
+  }
 }
 
 #ifndef SERIALGC
-- 
GitLab