diff --git a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp index c5bd4ca295e38d57432bdc1bbae83153b8b88cd6..54bbb24b0a48a72f560830b36b2f19833bd0a25a 100644 --- a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp +++ b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp @@ -2004,7 +2004,7 @@ void CMSCollector::do_compaction_work(bool clear_all_soft_refs) { ReferenceProcessorMTDiscoveryMutator rp_mut_discovery(ref_processor(), false); ref_processor()->set_enqueuing_is_done(false); - ref_processor()->enable_discovery(); + ref_processor()->enable_discovery(false /*verify_disabled*/, false /*check_no_refs*/); ref_processor()->setup_policy(clear_all_soft_refs); // If an asynchronous collection finishes, the _modUnionTable is // all clear. If we are assuming the collection from an asynchronous @@ -3490,8 +3490,8 @@ void CMSCollector::checkpointRootsInitial(bool asynch) { MutexLockerEx x(bitMapLock(), Mutex::_no_safepoint_check_flag); checkpointRootsInitialWork(asynch); - rp->verify_no_references_recorded(); - rp->enable_discovery(); // enable ("weak") refs discovery + // enable ("weak") refs discovery + rp->enable_discovery(true /*verify_disabled*/, true /*check_no_refs*/); _collectorState = Marking; } else { // (Weak) Refs discovery: this is controlled from genCollectedHeap::do_collection @@ -3503,7 +3503,8 @@ void CMSCollector::checkpointRootsInitial(bool asynch) { "ref discovery for this generation kind"); // already have locks checkpointRootsInitialWork(asynch); - rp->enable_discovery(); // now enable ("weak") refs discovery + // now enable ("weak") refs discovery + rp->enable_discovery(true /*verify_disabled*/, false /*verify_no_refs*/); _collectorState = Marking; } SpecializationStats::print(); diff --git a/src/share/vm/gc_implementation/g1/concurrentMark.cpp b/src/share/vm/gc_implementation/g1/concurrentMark.cpp index 61adccb100546daf485c25b57507f106a5f33cfa..d78db6e5d2d5c723c46da5c78a7390d283c902f1 100644 --- a/src/share/vm/gc_implementation/g1/concurrentMark.cpp +++ b/src/share/vm/gc_implementation/g1/concurrentMark.cpp @@ -818,10 +818,10 @@ void ConcurrentMark::checkpointRootsInitialPost() { NoteStartOfMarkHRClosure startcl; g1h->heap_region_iterate(&startcl); - // Start weak-reference discovery. - ReferenceProcessor* rp = g1h->ref_processor(); - rp->verify_no_references_recorded(); - rp->enable_discovery(); // enable ("weak") refs discovery + // Start Concurrent Marking weak-reference discovery. + ReferenceProcessor* rp = g1h->ref_processor_cm(); + // enable ("weak") refs discovery + rp->enable_discovery(true /*verify_disabled*/, true /*verify_no_refs*/); rp->setup_policy(false); // snapshot the soft ref policy to be used in this cycle SATBMarkQueueSet& satb_mq_set = JavaThread::satb_mark_queue_set(); @@ -1133,6 +1133,7 @@ void ConcurrentMark::checkpointRootsFinal(bool clear_all_soft_refs) { // world is stopped at this checkpoint assert(SafepointSynchronize::is_at_safepoint(), "world should be stopped"); + G1CollectedHeap* g1h = G1CollectedHeap::heap(); // If a full collection has happened, we shouldn't do this. @@ -1837,6 +1838,10 @@ void ConcurrentMark::cleanup() { size_t cleaned_up_bytes = start_used_bytes - g1h->used(); g1p->decrease_known_garbage_bytes(cleaned_up_bytes); + // Clean up will have freed any regions completely full of garbage. 
+ // Update the soft reference policy with the new heap occupancy. + Universe::update_heap_info_at_gc(); + // We need to make this be a "collection" so any collection pause that // races with it goes around and waits for completeCleanup to finish. g1h->increment_total_collections(); @@ -2072,8 +2077,10 @@ class G1CMParDrainMarkingStackClosure: public VoidClosure { } }; -// Implementation of AbstractRefProcTaskExecutor for G1 -class G1RefProcTaskExecutor: public AbstractRefProcTaskExecutor { +// Implementation of AbstractRefProcTaskExecutor for parallel +// reference processing at the end of G1 concurrent marking + +class G1CMRefProcTaskExecutor: public AbstractRefProcTaskExecutor { private: G1CollectedHeap* _g1h; ConcurrentMark* _cm; @@ -2082,7 +2089,7 @@ private: int _active_workers; public: - G1RefProcTaskExecutor(G1CollectedHeap* g1h, + G1CMRefProcTaskExecutor(G1CollectedHeap* g1h, ConcurrentMark* cm, CMBitMap* bitmap, WorkGang* workers, @@ -2096,7 +2103,7 @@ public: virtual void execute(EnqueueTask& task); }; -class G1RefProcTaskProxy: public AbstractGangTask { +class G1CMRefProcTaskProxy: public AbstractGangTask { typedef AbstractRefProcTaskExecutor::ProcessTask ProcessTask; ProcessTask& _proc_task; G1CollectedHeap* _g1h; @@ -2104,7 +2111,7 @@ class G1RefProcTaskProxy: public AbstractGangTask { CMBitMap* _bitmap; public: - G1RefProcTaskProxy(ProcessTask& proc_task, + G1CMRefProcTaskProxy(ProcessTask& proc_task, G1CollectedHeap* g1h, ConcurrentMark* cm, CMBitMap* bitmap) : @@ -2122,10 +2129,10 @@ public: } }; -void G1RefProcTaskExecutor::execute(ProcessTask& proc_task) { +void G1CMRefProcTaskExecutor::execute(ProcessTask& proc_task) { assert(_workers != NULL, "Need parallel worker threads."); - G1RefProcTaskProxy proc_task_proxy(proc_task, _g1h, _cm, _bitmap); + G1CMRefProcTaskProxy proc_task_proxy(proc_task, _g1h, _cm, _bitmap); // We need to reset the phase for each task execution so that // the termination protocol of CMTask::do_marking_step works. @@ -2135,12 +2142,12 @@ void G1RefProcTaskExecutor::execute(ProcessTask& proc_task) { _g1h->set_par_threads(0); } -class G1RefEnqueueTaskProxy: public AbstractGangTask { +class G1CMRefEnqueueTaskProxy: public AbstractGangTask { typedef AbstractRefProcTaskExecutor::EnqueueTask EnqueueTask; EnqueueTask& _enq_task; public: - G1RefEnqueueTaskProxy(EnqueueTask& enq_task) : + G1CMRefEnqueueTaskProxy(EnqueueTask& enq_task) : AbstractGangTask("Enqueue reference objects in parallel"), _enq_task(enq_task) { } @@ -2150,10 +2157,10 @@ public: } }; -void G1RefProcTaskExecutor::execute(EnqueueTask& enq_task) { +void G1CMRefProcTaskExecutor::execute(EnqueueTask& enq_task) { assert(_workers != NULL, "Need parallel worker threads."); - G1RefEnqueueTaskProxy enq_task_proxy(enq_task); + G1CMRefEnqueueTaskProxy enq_task_proxy(enq_task); _g1h->set_par_threads(_active_workers); _workers->run_task(&enq_task_proxy); @@ -2178,7 +2185,7 @@ void ConcurrentMark::weakRefsWork(bool clear_all_soft_refs) { } TraceTime t("GC ref-proc", verbose, false, gclog_or_tty); - ReferenceProcessor* rp = g1h->ref_processor(); + ReferenceProcessor* rp = g1h->ref_processor_cm(); // See the comment in G1CollectedHeap::ref_processing_init() // about how reference processing currently works in G1. @@ -2196,8 +2203,8 @@ void ConcurrentMark::weakRefsWork(bool clear_all_soft_refs) { int active_workers = g1h->workers() ? 
g1h->workers()->total_workers() : 1; active_workers = MAX2(MIN2(active_workers, (int)_max_task_num), 1); - G1RefProcTaskExecutor par_task_executor(g1h, this, nextMarkBitMap(), - g1h->workers(), active_workers); + G1CMRefProcTaskExecutor par_task_executor(g1h, this, nextMarkBitMap(), + g1h->workers(), active_workers); if (rp->processing_is_mt()) { // Set the degree of MT here. If the discovery is done MT, there @@ -2238,7 +2245,7 @@ void ConcurrentMark::weakRefsWork(bool clear_all_soft_refs) { } rp->verify_no_references_recorded(); - assert(!rp->discovery_enabled(), "should have been disabled"); + assert(!rp->discovery_enabled(), "Post condition"); } // Now clean up stale oops in StringTable @@ -3342,7 +3349,7 @@ G1CMOopClosure::G1CMOopClosure(G1CollectedHeap* g1h, assert(_ref_processor == NULL, "should be initialized to NULL"); if (G1UseConcMarkReferenceProcessing) { - _ref_processor = g1h->ref_processor(); + _ref_processor = g1h->ref_processor_cm(); assert(_ref_processor != NULL, "should not be NULL"); } } diff --git a/src/share/vm/gc_implementation/g1/concurrentMark.hpp b/src/share/vm/gc_implementation/g1/concurrentMark.hpp index 0aca6421d5d6e72e784fb773321f76599ea8ae07..c724594f4af4bde0fb8b1a14522e6465cbdc68ae 100644 --- a/src/share/vm/gc_implementation/g1/concurrentMark.hpp +++ b/src/share/vm/gc_implementation/g1/concurrentMark.hpp @@ -366,8 +366,8 @@ class ConcurrentMark: public CHeapObj { friend class CMConcurrentMarkingTask; friend class G1ParNoteEndTask; friend class CalcLiveObjectsClosure; - friend class G1RefProcTaskProxy; - friend class G1RefProcTaskExecutor; + friend class G1CMRefProcTaskProxy; + friend class G1CMRefProcTaskExecutor; friend class G1CMParKeepAliveAndDrainClosure; friend class G1CMParDrainMarkingStackClosure; diff --git a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp index 146c28781f2aceb0f4a4b3e76ef9c01bbe172fe6..66ca7442c5f7d075cd74c1c19abf386486a7ff62 100644 --- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp +++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp @@ -42,6 +42,7 @@ #include "memory/gcLocker.inline.hpp" #include "memory/genOopClosures.inline.hpp" #include "memory/generationSpec.hpp" +#include "memory/referenceProcessor.hpp" #include "oops/oop.inline.hpp" #include "oops/oop.pcgc.inline.hpp" #include "runtime/aprofiler.hpp" @@ -1244,15 +1245,11 @@ bool G1CollectedHeap::do_collection(bool explicit_gc, COMPILER2_PRESENT(DerivedPointerTable::clear()); - // We want to discover references, but not process them yet. - // This mode is disabled in - // instanceRefKlass::process_discovered_references if the - // generation does some collection work, or - // instanceRefKlass::enqueue_discovered_references if the - // generation returns without doing any work. - ref_processor()->disable_discovery(); - ref_processor()->abandon_partial_discovery(); - ref_processor()->verify_no_references_recorded(); + // Disable discovery and empty the discovered lists + // for the CM ref processor. + ref_processor_cm()->disable_discovery(); + ref_processor_cm()->abandon_partial_discovery(); + ref_processor_cm()->verify_no_references_recorded(); // Abandon current iterations of concurrent marking and concurrent // refinement, if any are in progress. 
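A minimal sketch of the discovery life-cycle the hunks above depend on. It uses a hypothetical MockRefProcessor (not the real HotSpot ReferenceProcessor) to illustrate the new enable_discovery(verify_disabled, check_no_refs) contract and the disable / abandon / verify sequence applied to the CM ref processor at the start of a full GC; all names here are stand-ins.

// Minimal sketch, illustrative only: MockRefProcessor is a stand-in, not the
// real HotSpot ReferenceProcessor.
#include <cassert>
#include <vector>

class MockRefProcessor {
  bool _discovery_enabled;
  std::vector<void*> _discovered;   // stands in for the discovered lists

public:
  MockRefProcessor() : _discovery_enabled(false) {}

  // Mirrors the new two-flag enable_discovery(verify_disabled, check_no_refs):
  // the caller states which preconditions it expects to hold.
  void enable_discovery(bool verify_disabled, bool check_no_refs) {
    if (verify_disabled) assert(!_discovery_enabled && "discovery should be off");
    if (check_no_refs)   assert(_discovered.empty() && "no refs should be recorded");
    _discovery_enabled = true;
  }
  void disable_discovery()             { _discovery_enabled = false; }
  void abandon_partial_discovery()     { _discovered.clear(); }
  void verify_no_references_recorded() { assert(_discovered.empty()); }
  bool discovery_enabled() const       { return _discovery_enabled; }
};

// At the start of a full GC the CM ref processor is quiesced, as in the
// do_collection() hunk above: discovery off, discovered lists dropped,
// and the lists verified empty.
void full_gc_prologue(MockRefProcessor& cm_rp) {
  cm_rp.disable_discovery();
  cm_rp.abandon_partial_discovery();
  cm_rp.verify_no_references_recorded();
}

int main() {
  MockRefProcessor cm_rp;
  cm_rp.enable_discovery(true /*verify_disabled*/, true /*check_no_refs*/);
  full_gc_prologue(cm_rp);
  assert(!cm_rp.discovery_enabled());
  return 0;
}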
@@ -1280,31 +1277,33 @@ bool G1CollectedHeap::do_collection(bool explicit_gc, empty_young_list(); g1_policy()->set_full_young_gcs(true); - // See the comment in G1CollectedHeap::ref_processing_init() about + // See the comments in g1CollectedHeap.hpp and + // G1CollectedHeap::ref_processing_init() about // how reference processing currently works in G1. - // Temporarily make reference _discovery_ single threaded (non-MT). - ReferenceProcessorMTDiscoveryMutator rp_disc_ser(ref_processor(), false); + // Temporarily make discovery by the STW ref processor single threaded (non-MT). + ReferenceProcessorMTDiscoveryMutator stw_rp_disc_ser(ref_processor_stw(), false); - // Temporarily make refs discovery atomic - ReferenceProcessorAtomicMutator rp_disc_atomic(ref_processor(), true); + // Temporarily clear the STW ref processor's _is_alive_non_header field. + ReferenceProcessorIsAliveMutator stw_rp_is_alive_null(ref_processor_stw(), NULL); - // Temporarily clear _is_alive_non_header - ReferenceProcessorIsAliveMutator rp_is_alive_null(ref_processor(), NULL); + ref_processor_stw()->enable_discovery(true /*verify_disabled*/, true /*verify_no_refs*/); + ref_processor_stw()->setup_policy(do_clear_all_soft_refs); - ref_processor()->enable_discovery(); - ref_processor()->setup_policy(do_clear_all_soft_refs); // Do collection work { HandleMark hm; // Discard invalid handles created during gc - G1MarkSweep::invoke_at_safepoint(ref_processor(), do_clear_all_soft_refs); + G1MarkSweep::invoke_at_safepoint(ref_processor_stw(), do_clear_all_soft_refs); } + assert(free_regions() == 0, "we should not have added any free regions"); rebuild_region_lists(); _summary_bytes_used = recalculate_used(); - ref_processor()->enqueue_discovered_references(); + // Enqueue any discovered reference objects that have + // not been removed from the discovered lists. + ref_processor_stw()->enqueue_discovered_references(); COMPILER2_PRESENT(DerivedPointerTable::update_pointers()); @@ -1319,7 +1318,16 @@ bool G1CollectedHeap::do_collection(bool explicit_gc, /* option */ VerifyOption_G1UsePrevMarking); } - NOT_PRODUCT(ref_processor()->verify_no_references_recorded()); + + assert(!ref_processor_stw()->discovery_enabled(), "Postcondition"); + ref_processor_stw()->verify_no_references_recorded(); + + // Note: since we've just done a full GC, concurrent + // marking is no longer active. Therefore we need not + // re-enable reference discovery for the CM ref processor. + // That will be done at the start of the next marking cycle. + assert(!ref_processor_cm()->discovery_enabled(), "Postcondition"); + ref_processor_cm()->verify_no_references_recorded(); reset_gc_time_stamp(); // Since everything potentially moved, we will clear all remembered @@ -1772,8 +1780,10 @@ G1CollectedHeap::G1CollectedHeap(G1CollectorPolicy* policy_) : _g1_policy(policy_), _dirty_card_queue_set(false), _into_cset_dirty_card_queue_set(false), - _is_alive_closure(this), - _ref_processor(NULL), + _is_alive_closure_cm(this), + _is_alive_closure_stw(this), + _ref_processor_cm(NULL), + _ref_processor_stw(NULL), _process_strong_tasks(new SubTasksDone(G1H_PS_NumElements)), _bot_shared(NULL), _objs_with_preserved_marks(NULL), _preserved_marks_of_objs(NULL), @@ -2067,34 +2077,81 @@ jint G1CollectedHeap::initialize() { void G1CollectedHeap::ref_processing_init() { // Reference processing in G1 currently works as follows: // - // * There is only one reference processor instance that - // 'spans' the entire heap. It is created by the code - // below. 
- // * Reference discovery is not enabled during an incremental - // pause (see 6484982). - // * Discoverered refs are not enqueued nor are they processed - // during an incremental pause (see 6484982). - // * Reference discovery is enabled at initial marking. - // * Reference discovery is disabled and the discovered - // references processed etc during remarking. - // * Reference discovery is MT (see below). - // * Reference discovery requires a barrier (see below). - // * Reference processing is currently not MT (see 6608385). - // * A full GC enables (non-MT) reference discovery and - // processes any discovered references. + // * There are two reference processor instances. One is + // used to record and process discovered references + // during concurrent marking; the other is used to + // record and process references during STW pauses + // (both full and incremental). + // * Both ref processors need to 'span' the entire heap as + // the regions in the collection set may be dotted around. + // + // * For the concurrent marking ref processor: + // * Reference discovery is enabled at initial marking. + // * Reference discovery is disabled and the discovered + // references processed etc during remarking. + // * Reference discovery is MT (see below). + // * Reference discovery requires a barrier (see below). + // * Reference processing may or may not be MT + // (depending on the value of ParallelRefProcEnabled + // and ParallelGCThreads). + // * A full GC disables reference discovery by the CM + // ref processor and abandons any entries on it's + // discovered lists. + // + // * For the STW processor: + // * Non MT discovery is enabled at the start of a full GC. + // * Processing and enqueueing during a full GC is non-MT. + // * During a full GC, references are processed after marking. + // + // * Discovery (may or may not be MT) is enabled at the start + // of an incremental evacuation pause. + // * References are processed near the end of a STW evacuation pause. + // * For both types of GC: + // * Discovery is atomic - i.e. not concurrent. + // * Reference discovery will not need a barrier. SharedHeap::ref_processing_init(); MemRegion mr = reserved_region(); - _ref_processor = + + // Concurrent Mark ref processor + _ref_processor_cm = + new ReferenceProcessor(mr, // span + ParallelRefProcEnabled && (ParallelGCThreads > 1), + // mt processing + (int) ParallelGCThreads, + // degree of mt processing + (ParallelGCThreads > 1) || (ConcGCThreads > 1), + // mt discovery + (int) MAX2(ParallelGCThreads, ConcGCThreads), + // degree of mt discovery + false, + // Reference discovery is not atomic + &_is_alive_closure_cm, + // is alive closure + // (for efficiency/performance) + true); + // Setting next fields of discovered + // lists requires a barrier. + + // STW ref processor + _ref_processor_stw = new ReferenceProcessor(mr, // span - ParallelRefProcEnabled && (ParallelGCThreads > 1), // mt processing - (int) ParallelGCThreads, // degree of mt processing - ParallelGCThreads > 1 || ConcGCThreads > 1, // mt discovery - (int) MAX2(ParallelGCThreads, ConcGCThreads), // degree of mt discovery - false, // Reference discovery is not atomic - &_is_alive_closure, // is alive closure for efficiency - true); // Setting next fields of discovered - // lists requires a barrier. 
+ ParallelRefProcEnabled && (ParallelGCThreads > 1), + // mt processing + MAX2((int)ParallelGCThreads, 1), + // degree of mt processing + (ParallelGCThreads > 1), + // mt discovery + MAX2((int)ParallelGCThreads, 1), + // degree of mt discovery + true, + // Reference discovery is atomic + &_is_alive_closure_stw, + // is alive closure + // (for efficiency/performance) + false); + // Setting next fields of discovered + // lists requires a barrier. } size_t G1CollectedHeap::capacity() const { @@ -3117,6 +3174,10 @@ void G1CollectedHeap::gc_epilogue(bool full /* Ignored */) { COMPILER2_PRESENT(assert(DerivedPointerTable::is_empty(), "derived pointer present")); // always_do_update_barrier = true; + + // We have just completed a GC. Update the soft reference + // policy with the new heap occupancy + Universe::update_heap_info_at_gc(); } HeapWord* G1CollectedHeap::do_collection_pause(size_t word_size, @@ -3354,230 +3415,241 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) { COMPILER2_PRESENT(DerivedPointerTable::clear()); - // Please see comment in G1CollectedHeap::ref_processing_init() - // to see how reference processing currently works in G1. - // - // We want to turn off ref discovery, if necessary, and turn it back on - // on again later if we do. XXX Dubious: why is discovery disabled? - bool was_enabled = ref_processor()->discovery_enabled(); - if (was_enabled) ref_processor()->disable_discovery(); + // Please see comment in g1CollectedHeap.hpp and + // G1CollectedHeap::ref_processing_init() to see how + // reference processing currently works in G1. - // Forget the current alloc region (we might even choose it to be part - // of the collection set!). - release_mutator_alloc_region(); + // Enable discovery in the STW reference processor + ref_processor_stw()->enable_discovery(true /*verify_disabled*/, + true /*verify_no_refs*/); - // We should call this after we retire the mutator alloc - // region(s) so that all the ALLOC / RETIRE events are generated - // before the start GC event. - _hr_printer.start_gc(false /* full */, (size_t) total_collections()); - - // The elapsed time induced by the start time below deliberately elides - // the possible verification above. - double start_time_sec = os::elapsedTime(); - size_t start_used_bytes = used(); + { + // We want to temporarily turn off discovery by the + // CM ref processor, if necessary, and turn it back on + // on again later if we do. Using a scoped + // NoRefDiscovery object will do this. + NoRefDiscovery no_cm_discovery(ref_processor_cm()); + + // Forget the current alloc region (we might even choose it to be part + // of the collection set!). + release_mutator_alloc_region(); + + // We should call this after we retire the mutator alloc + // region(s) so that all the ALLOC / RETIRE events are generated + // before the start GC event. + _hr_printer.start_gc(false /* full */, (size_t) total_collections()); + + // The elapsed time induced by the start time below deliberately elides + // the possible verification above. 
+ double start_time_sec = os::elapsedTime(); + size_t start_used_bytes = used(); #if YOUNG_LIST_VERBOSE - gclog_or_tty->print_cr("\nBefore recording pause start.\nYoung_list:"); - _young_list->print(); - g1_policy()->print_collection_set(g1_policy()->inc_cset_head(), gclog_or_tty); + gclog_or_tty->print_cr("\nBefore recording pause start.\nYoung_list:"); + _young_list->print(); + g1_policy()->print_collection_set(g1_policy()->inc_cset_head(), gclog_or_tty); #endif // YOUNG_LIST_VERBOSE - g1_policy()->record_collection_pause_start(start_time_sec, - start_used_bytes); + g1_policy()->record_collection_pause_start(start_time_sec, + start_used_bytes); #if YOUNG_LIST_VERBOSE - gclog_or_tty->print_cr("\nAfter recording pause start.\nYoung_list:"); - _young_list->print(); + gclog_or_tty->print_cr("\nAfter recording pause start.\nYoung_list:"); + _young_list->print(); #endif // YOUNG_LIST_VERBOSE - if (g1_policy()->during_initial_mark_pause()) { - concurrent_mark()->checkpointRootsInitialPre(); - } - perm_gen()->save_marks(); - - // We must do this before any possible evacuation that should propagate - // marks. - if (mark_in_progress()) { - double start_time_sec = os::elapsedTime(); + if (g1_policy()->during_initial_mark_pause()) { + concurrent_mark()->checkpointRootsInitialPre(); + } + perm_gen()->save_marks(); - _cm->drainAllSATBBuffers(); - double finish_mark_ms = (os::elapsedTime() - start_time_sec) * 1000.0; - g1_policy()->record_satb_drain_time(finish_mark_ms); - } - // Record the number of elements currently on the mark stack, so we - // only iterate over these. (Since evacuation may add to the mark - // stack, doing more exposes race conditions.) If no mark is in - // progress, this will be zero. - _cm->set_oops_do_bound(); + // We must do this before any possible evacuation that should propagate + // marks. + if (mark_in_progress()) { + double start_time_sec = os::elapsedTime(); - if (mark_in_progress()) { - concurrent_mark()->newCSet(); - } + _cm->drainAllSATBBuffers(); + double finish_mark_ms = (os::elapsedTime() - start_time_sec) * 1000.0; + g1_policy()->record_satb_drain_time(finish_mark_ms); + } + // Record the number of elements currently on the mark stack, so we + // only iterate over these. (Since evacuation may add to the mark + // stack, doing more exposes race conditions.) If no mark is in + // progress, this will be zero. 
+ _cm->set_oops_do_bound(); + + if (mark_in_progress()) { + concurrent_mark()->newCSet(); + } #if YOUNG_LIST_VERBOSE - gclog_or_tty->print_cr("\nBefore choosing collection set.\nYoung_list:"); - _young_list->print(); - g1_policy()->print_collection_set(g1_policy()->inc_cset_head(), gclog_or_tty); + gclog_or_tty->print_cr("\nBefore choosing collection set.\nYoung_list:"); + _young_list->print(); + g1_policy()->print_collection_set(g1_policy()->inc_cset_head(), gclog_or_tty); #endif // YOUNG_LIST_VERBOSE - g1_policy()->choose_collection_set(target_pause_time_ms); - - if (_hr_printer.is_active()) { - HeapRegion* hr = g1_policy()->collection_set(); - while (hr != NULL) { - G1HRPrinter::RegionType type; - if (!hr->is_young()) { - type = G1HRPrinter::Old; - } else if (hr->is_survivor()) { - type = G1HRPrinter::Survivor; - } else { - type = G1HRPrinter::Eden; + g1_policy()->choose_collection_set(target_pause_time_ms); + + if (_hr_printer.is_active()) { + HeapRegion* hr = g1_policy()->collection_set(); + while (hr != NULL) { + G1HRPrinter::RegionType type; + if (!hr->is_young()) { + type = G1HRPrinter::Old; + } else if (hr->is_survivor()) { + type = G1HRPrinter::Survivor; + } else { + type = G1HRPrinter::Eden; + } + _hr_printer.cset(hr); + hr = hr->next_in_collection_set(); } - _hr_printer.cset(hr); - hr = hr->next_in_collection_set(); } - } - // We have chosen the complete collection set. If marking is - // active then, we clear the region fields of any of the - // concurrent marking tasks whose region fields point into - // the collection set as these values will become stale. This - // will cause the owning marking threads to claim a new region - // when marking restarts. - if (mark_in_progress()) { - concurrent_mark()->reset_active_task_region_fields_in_cset(); - } + // We have chosen the complete collection set. If marking is + // active then, we clear the region fields of any of the + // concurrent marking tasks whose region fields point into + // the collection set as these values will become stale. This + // will cause the owning marking threads to claim a new region + // when marking restarts. + if (mark_in_progress()) { + concurrent_mark()->reset_active_task_region_fields_in_cset(); + } #ifdef ASSERT - VerifyCSetClosure cl; - collection_set_iterate(&cl); + VerifyCSetClosure cl; + collection_set_iterate(&cl); #endif // ASSERT - setup_surviving_young_words(); + setup_surviving_young_words(); - // Initialize the GC alloc regions. - init_gc_alloc_regions(); + // Initialize the GC alloc regions. + init_gc_alloc_regions(); - // Actually do the work... - evacuate_collection_set(); + // Actually do the work... + evacuate_collection_set(); - free_collection_set(g1_policy()->collection_set()); - g1_policy()->clear_collection_set(); + free_collection_set(g1_policy()->collection_set()); + g1_policy()->clear_collection_set(); - cleanup_surviving_young_words(); + cleanup_surviving_young_words(); - // Start a new incremental collection set for the next pause. - g1_policy()->start_incremental_cset_building(); + // Start a new incremental collection set for the next pause. + g1_policy()->start_incremental_cset_building(); - // Clear the _cset_fast_test bitmap in anticipation of adding - // regions to the incremental collection set for the next - // evacuation pause. - clear_cset_fast_test(); + // Clear the _cset_fast_test bitmap in anticipation of adding + // regions to the incremental collection set for the next + // evacuation pause. 
+ clear_cset_fast_test(); - _young_list->reset_sampled_info(); + _young_list->reset_sampled_info(); - // Don't check the whole heap at this point as the - // GC alloc regions from this pause have been tagged - // as survivors and moved on to the survivor list. - // Survivor regions will fail the !is_young() check. - assert(check_young_list_empty(false /* check_heap */), - "young list should be empty"); + // Don't check the whole heap at this point as the + // GC alloc regions from this pause have been tagged + // as survivors and moved on to the survivor list. + // Survivor regions will fail the !is_young() check. + assert(check_young_list_empty(false /* check_heap */), + "young list should be empty"); #if YOUNG_LIST_VERBOSE - gclog_or_tty->print_cr("Before recording survivors.\nYoung List:"); - _young_list->print(); + gclog_or_tty->print_cr("Before recording survivors.\nYoung List:"); + _young_list->print(); #endif // YOUNG_LIST_VERBOSE - g1_policy()->record_survivor_regions(_young_list->survivor_length(), - _young_list->first_survivor_region(), - _young_list->last_survivor_region()); + g1_policy()->record_survivor_regions(_young_list->survivor_length(), + _young_list->first_survivor_region(), + _young_list->last_survivor_region()); - _young_list->reset_auxilary_lists(); + _young_list->reset_auxilary_lists(); - if (evacuation_failed()) { - _summary_bytes_used = recalculate_used(); - } else { - // The "used" of the the collection set have already been subtracted - // when they were freed. Add in the bytes evacuated. - _summary_bytes_used += g1_policy()->bytes_copied_during_gc(); - } + if (evacuation_failed()) { + _summary_bytes_used = recalculate_used(); + } else { + // The "used" of the the collection set have already been subtracted + // when they were freed. Add in the bytes evacuated. + _summary_bytes_used += g1_policy()->bytes_copied_during_gc(); + } - if (g1_policy()->during_initial_mark_pause()) { - concurrent_mark()->checkpointRootsInitialPost(); - set_marking_started(); - // CAUTION: after the doConcurrentMark() call below, - // the concurrent marking thread(s) could be running - // concurrently with us. Make sure that anything after - // this point does not assume that we are the only GC thread - // running. Note: of course, the actual marking work will - // not start until the safepoint itself is released in - // ConcurrentGCThread::safepoint_desynchronize(). - doConcurrentMark(); - } + if (g1_policy()->during_initial_mark_pause()) { + concurrent_mark()->checkpointRootsInitialPost(); + set_marking_started(); + // CAUTION: after the doConcurrentMark() call below, + // the concurrent marking thread(s) could be running + // concurrently with us. Make sure that anything after + // this point does not assume that we are the only GC thread + // running. Note: of course, the actual marking work will + // not start until the safepoint itself is released in + // ConcurrentGCThread::safepoint_desynchronize(). 
+ doConcurrentMark(); + } - allocate_dummy_regions(); + allocate_dummy_regions(); #if YOUNG_LIST_VERBOSE - gclog_or_tty->print_cr("\nEnd of the pause.\nYoung_list:"); - _young_list->print(); - g1_policy()->print_collection_set(g1_policy()->inc_cset_head(), gclog_or_tty); + gclog_or_tty->print_cr("\nEnd of the pause.\nYoung_list:"); + _young_list->print(); + g1_policy()->print_collection_set(g1_policy()->inc_cset_head(), gclog_or_tty); #endif // YOUNG_LIST_VERBOSE - init_mutator_alloc_region(); - - { - size_t expand_bytes = g1_policy()->expansion_amount(); - if (expand_bytes > 0) { - size_t bytes_before = capacity(); - if (!expand(expand_bytes)) { - // We failed to expand the heap so let's verify that - // committed/uncommitted amount match the backing store - assert(capacity() == _g1_storage.committed_size(), "committed size mismatch"); - assert(max_capacity() == _g1_storage.reserved_size(), "reserved size mismatch"); + init_mutator_alloc_region(); + + { + size_t expand_bytes = g1_policy()->expansion_amount(); + if (expand_bytes > 0) { + size_t bytes_before = capacity(); + if (!expand(expand_bytes)) { + // We failed to expand the heap so let's verify that + // committed/uncommitted amount match the backing store + assert(capacity() == _g1_storage.committed_size(), "committed size mismatch"); + assert(max_capacity() == _g1_storage.reserved_size(), "reserved size mismatch"); + } } } - } - double end_time_sec = os::elapsedTime(); - double pause_time_ms = (end_time_sec - start_time_sec) * MILLIUNITS; - g1_policy()->record_pause_time_ms(pause_time_ms); - g1_policy()->record_collection_pause_end(); - - MemoryService::track_memory_usage(); - - // In prepare_for_verify() below we'll need to scan the deferred - // update buffers to bring the RSets up-to-date if - // G1HRRSFlushLogBuffersOnVerify has been set. While scanning - // the update buffers we'll probably need to scan cards on the - // regions we just allocated to (i.e., the GC alloc - // regions). However, during the last GC we called - // set_saved_mark() on all the GC alloc regions, so card - // scanning might skip the [saved_mark_word()...top()] area of - // those regions (i.e., the area we allocated objects into - // during the last GC). But it shouldn't. Given that - // saved_mark_word() is conditional on whether the GC time stamp - // on the region is current or not, by incrementing the GC time - // stamp here we invalidate all the GC time stamps on all the - // regions and saved_mark_word() will simply return top() for - // all the regions. This is a nicer way of ensuring this rather - // than iterating over the regions and fixing them. In fact, the - // GC time stamp increment here also ensures that - // saved_mark_word() will return top() between pauses, i.e., - // during concurrent refinement. So we don't need the - // is_gc_active() check to decided which top to use when - // scanning cards (see CR 7039627). - increment_gc_time_stamp(); + double end_time_sec = os::elapsedTime(); + double pause_time_ms = (end_time_sec - start_time_sec) * MILLIUNITS; + g1_policy()->record_pause_time_ms(pause_time_ms); + g1_policy()->record_collection_pause_end(); + + MemoryService::track_memory_usage(); + + // In prepare_for_verify() below we'll need to scan the deferred + // update buffers to bring the RSets up-to-date if + // G1HRRSFlushLogBuffersOnVerify has been set. While scanning + // the update buffers we'll probably need to scan cards on the + // regions we just allocated to (i.e., the GC alloc + // regions). 
However, during the last GC we called + // set_saved_mark() on all the GC alloc regions, so card + // scanning might skip the [saved_mark_word()...top()] area of + // those regions (i.e., the area we allocated objects into + // during the last GC). But it shouldn't. Given that + // saved_mark_word() is conditional on whether the GC time stamp + // on the region is current or not, by incrementing the GC time + // stamp here we invalidate all the GC time stamps on all the + // regions and saved_mark_word() will simply return top() for + // all the regions. This is a nicer way of ensuring this rather + // than iterating over the regions and fixing them. In fact, the + // GC time stamp increment here also ensures that + // saved_mark_word() will return top() between pauses, i.e., + // during concurrent refinement. So we don't need the + // is_gc_active() check to decided which top to use when + // scanning cards (see CR 7039627). + increment_gc_time_stamp(); + + if (VerifyAfterGC && total_collections() >= VerifyGCStartAt) { + HandleMark hm; // Discard invalid handles created during verification + gclog_or_tty->print(" VerifyAfterGC:"); + prepare_for_verify(); + Universe::verify(/* allow dirty */ true, + /* silent */ false, + /* option */ VerifyOption_G1UsePrevMarking); + } - if (VerifyAfterGC && total_collections() >= VerifyGCStartAt) { - HandleMark hm; // Discard invalid handles created during verification - gclog_or_tty->print(" VerifyAfterGC:"); - prepare_for_verify(); - Universe::verify(/* allow dirty */ true, - /* silent */ false, - /* option */ VerifyOption_G1UsePrevMarking); - } + assert(!ref_processor_stw()->discovery_enabled(), "Postcondition"); + ref_processor_stw()->verify_no_references_recorded(); - if (was_enabled) ref_processor()->enable_discovery(); + // CM reference discovery will be re-enabled if necessary. + } { size_t expand_bytes = g1_policy()->expansion_amount(); @@ -3728,34 +3800,6 @@ void G1CollectedHeap::finalize_for_evac_failure() { _evac_failure_scan_stack = NULL; } -// *** Sequential G1 Evacuation - -class G1IsAliveClosure: public BoolObjectClosure { - G1CollectedHeap* _g1; -public: - G1IsAliveClosure(G1CollectedHeap* g1) : _g1(g1) {} - void do_object(oop p) { assert(false, "Do not call."); } - bool do_object_b(oop p) { - // It is reachable if it is outside the collection set, or is inside - // and forwarded. - return !_g1->obj_in_cs(p) || p->is_forwarded(); - } -}; - -class G1KeepAliveClosure: public OopClosure { - G1CollectedHeap* _g1; -public: - G1KeepAliveClosure(G1CollectedHeap* g1) : _g1(g1) {} - void do_oop(narrowOop* p) { guarantee(false, "Not needed"); } - void do_oop( oop* p) { - oop obj = *p; - if (_g1->obj_in_cs(obj)) { - assert( obj->is_forwarded(), "invariant" ); - *p = obj->forwardee(); - } - } -}; - class UpdateRSetDeferred : public OopsInHeapRegionClosure { private: G1CollectedHeap* _g1; @@ -4186,12 +4230,17 @@ bool G1ParScanThreadState::verify_task(StarTask ref) const { #endif // ASSERT void G1ParScanThreadState::trim_queue() { + assert(_evac_cl != NULL, "not set"); + assert(_evac_failure_cl != NULL, "not set"); + assert(_partial_scan_cl != NULL, "not set"); + StarTask ref; do { // Drain the overflow stack first, so other threads can steal. 
while (refs()->pop_overflow(ref)) { deal_with_reference(ref); } + while (refs()->pop_local(ref)) { deal_with_reference(ref); } @@ -4529,35 +4578,42 @@ public: ResourceMark rm; HandleMark hm; + ReferenceProcessor* rp = _g1h->ref_processor_stw(); + G1ParScanThreadState pss(_g1h, i); - G1ParScanHeapEvacClosure scan_evac_cl(_g1h, &pss); - G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss); - G1ParScanPartialArrayClosure partial_scan_cl(_g1h, &pss); + G1ParScanHeapEvacClosure scan_evac_cl(_g1h, &pss, rp); + G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, rp); + G1ParScanPartialArrayClosure partial_scan_cl(_g1h, &pss, rp); pss.set_evac_closure(&scan_evac_cl); pss.set_evac_failure_closure(&evac_failure_cl); pss.set_partial_scan_closure(&partial_scan_cl); - G1ParScanExtRootClosure only_scan_root_cl(_g1h, &pss); - G1ParScanPermClosure only_scan_perm_cl(_g1h, &pss); - G1ParScanHeapRSClosure only_scan_heap_rs_cl(_g1h, &pss); - G1ParPushHeapRSClosure push_heap_rs_cl(_g1h, &pss); + G1ParScanExtRootClosure only_scan_root_cl(_g1h, &pss, rp); + G1ParScanPermClosure only_scan_perm_cl(_g1h, &pss, rp); - G1ParScanAndMarkExtRootClosure scan_mark_root_cl(_g1h, &pss); - G1ParScanAndMarkPermClosure scan_mark_perm_cl(_g1h, &pss); - G1ParScanAndMarkHeapRSClosure scan_mark_heap_rs_cl(_g1h, &pss); + G1ParScanAndMarkExtRootClosure scan_mark_root_cl(_g1h, &pss, rp); + G1ParScanAndMarkPermClosure scan_mark_perm_cl(_g1h, &pss, rp); - OopsInHeapRegionClosure *scan_root_cl; - OopsInHeapRegionClosure *scan_perm_cl; + OopClosure* scan_root_cl = &only_scan_root_cl; + OopsInHeapRegionClosure* scan_perm_cl = &only_scan_perm_cl; if (_g1h->g1_policy()->during_initial_mark_pause()) { + // We also need to mark copied objects. scan_root_cl = &scan_mark_root_cl; scan_perm_cl = &scan_mark_perm_cl; - } else { - scan_root_cl = &only_scan_root_cl; - scan_perm_cl = &only_scan_perm_cl; } + // The following closure is used to scan RSets looking for reference + // fields that point into the collection set. The actual field iteration + // is performed by a FilterIntoCSClosure, whose do_oop method calls the + // do_oop method of the following closure. + // Therefore we want to record the reference processor in the + // FilterIntoCSClosure. To do so we record the STW reference + // processor into the following closure and pass it to the + // FilterIntoCSClosure in HeapRegionDCTOC::walk_mem_region_with_cl. + G1ParPushHeapRSClosure push_heap_rs_cl(_g1h, &pss, rp); + pss.start_strong_roots(); _g1h->g1_process_strong_roots(/* not collecting perm */ false, SharedHeap::SO_AllClasses, @@ -4605,6 +4661,7 @@ g1_process_strong_roots(bool collecting_perm_gen, OopsInHeapRegionClosure* scan_rs, OopsInGenClosure* scan_perm, int worker_i) { + // First scan the strong roots, including the perm gen. double ext_roots_start = os::elapsedTime(); double closure_app_time_sec = 0.0; @@ -4623,12 +4680,13 @@ g1_process_strong_roots(bool collecting_perm_gen, &eager_scan_code_roots, &buf_scan_perm); - // Now the ref_processor roots. + // Now the CM ref_processor roots. if (!_process_strong_tasks->is_task_claimed(G1H_PS_refProcessor_oops_do)) { - // We need to treat the discovered reference lists as roots and - // keep entries (which are added by the marking threads) on them - // live until they can be processed at the end of marking. 
- ref_processor()->weak_oops_do(&buf_scan_non_heap_roots); + // We need to treat the discovered reference lists of the + // concurrent mark ref processor as roots and keep entries + // (which are added by the marking threads) on them live + // until they can be processed at the end of marking. + ref_processor_cm()->weak_oops_do(&buf_scan_non_heap_roots); } // Finish up any enqueued closure apps (attributed as object copy time). @@ -4669,6 +4727,524 @@ G1CollectedHeap::g1_process_weak_roots(OopClosure* root_closure, SharedHeap::process_weak_roots(root_closure, &roots_in_blobs, non_root_closure); } +// Weak Reference Processing support + +// An always "is_alive" closure that is used to preserve referents. +// If the object is non-null then it's alive. Used in the preservation +// of referent objects that are pointed to by reference objects +// discovered by the CM ref processor. +class G1AlwaysAliveClosure: public BoolObjectClosure { + G1CollectedHeap* _g1; +public: + G1AlwaysAliveClosure(G1CollectedHeap* g1) : _g1(g1) {} + void do_object(oop p) { assert(false, "Do not call."); } + bool do_object_b(oop p) { + if (p != NULL) { + return true; + } + return false; + } +}; + +bool G1STWIsAliveClosure::do_object_b(oop p) { + // An object is reachable if it is outside the collection set, + // or is inside and copied. + return !_g1->obj_in_cs(p) || p->is_forwarded(); +} + +// Non Copying Keep Alive closure +class G1KeepAliveClosure: public OopClosure { + G1CollectedHeap* _g1; +public: + G1KeepAliveClosure(G1CollectedHeap* g1) : _g1(g1) {} + void do_oop(narrowOop* p) { guarantee(false, "Not needed"); } + void do_oop( oop* p) { + oop obj = *p; + + if (_g1->obj_in_cs(obj)) { + assert( obj->is_forwarded(), "invariant" ); + *p = obj->forwardee(); + } + } +}; + +// Copying Keep Alive closure - can be called from both +// serial and parallel code as long as different worker +// threads utilize different G1ParScanThreadState instances +// and different queues. + +class G1CopyingKeepAliveClosure: public OopClosure { + G1CollectedHeap* _g1h; + OopClosure* _copy_non_heap_obj_cl; + OopsInHeapRegionClosure* _copy_perm_obj_cl; + G1ParScanThreadState* _par_scan_state; + +public: + G1CopyingKeepAliveClosure(G1CollectedHeap* g1h, + OopClosure* non_heap_obj_cl, + OopsInHeapRegionClosure* perm_obj_cl, + G1ParScanThreadState* pss): + _g1h(g1h), + _copy_non_heap_obj_cl(non_heap_obj_cl), + _copy_perm_obj_cl(perm_obj_cl), + _par_scan_state(pss) + {} + + virtual void do_oop(narrowOop* p) { do_oop_work(p); } + virtual void do_oop( oop* p) { do_oop_work(p); } + + template void do_oop_work(T* p) { + oop obj = oopDesc::load_decode_heap_oop(p); + + if (_g1h->obj_in_cs(obj)) { + // If the referent object has been forwarded (either copied + // to a new location or to itself in the event of an + // evacuation failure) then we need to update the reference + // field and, if both reference and referent are in the G1 + // heap, update the RSet for the referent. + // + // If the referent has not been forwarded then we have to keep + // it alive by policy. Therefore we have copy the referent. + // + // If the reference field is in the G1 heap then we can push + // on the PSS queue. When the queue is drained (after each + // phase of reference processing) the object and it's followers + // will be copied, the reference field set to point to the + // new location, and the RSet updated. 
Otherwise we need to + // use the the non-heap or perm closures directly to copy + // the refernt object and update the pointer, while avoiding + // updating the RSet. + + if (_g1h->is_in_g1_reserved(p)) { + _par_scan_state->push_on_queue(p); + } else { + // The reference field is not in the G1 heap. + if (_g1h->perm_gen()->is_in(p)) { + _copy_perm_obj_cl->do_oop(p); + } else { + _copy_non_heap_obj_cl->do_oop(p); + } + } + } + } +}; + +// Serial drain queue closure. Called as the 'complete_gc' +// closure for each discovered list in some of the +// reference processing phases. + +class G1STWDrainQueueClosure: public VoidClosure { +protected: + G1CollectedHeap* _g1h; + G1ParScanThreadState* _par_scan_state; + + G1ParScanThreadState* par_scan_state() { return _par_scan_state; } + +public: + G1STWDrainQueueClosure(G1CollectedHeap* g1h, G1ParScanThreadState* pss) : + _g1h(g1h), + _par_scan_state(pss) + { } + + void do_void() { + G1ParScanThreadState* const pss = par_scan_state(); + pss->trim_queue(); + } +}; + +// Parallel Reference Processing closures + +// Implementation of AbstractRefProcTaskExecutor for parallel reference +// processing during G1 evacuation pauses. + +class G1STWRefProcTaskExecutor: public AbstractRefProcTaskExecutor { +private: + G1CollectedHeap* _g1h; + RefToScanQueueSet* _queues; + WorkGang* _workers; + int _active_workers; + +public: + G1STWRefProcTaskExecutor(G1CollectedHeap* g1h, + WorkGang* workers, + RefToScanQueueSet *task_queues, + int n_workers) : + _g1h(g1h), + _queues(task_queues), + _workers(workers), + _active_workers(n_workers) + { + assert(n_workers > 0, "shouldn't call this otherwise"); + } + + // Executes the given task using concurrent marking worker threads. + virtual void execute(ProcessTask& task); + virtual void execute(EnqueueTask& task); +}; + +// Gang task for possibly parallel reference processing + +class G1STWRefProcTaskProxy: public AbstractGangTask { + typedef AbstractRefProcTaskExecutor::ProcessTask ProcessTask; + ProcessTask& _proc_task; + G1CollectedHeap* _g1h; + RefToScanQueueSet *_task_queues; + ParallelTaskTerminator* _terminator; + +public: + G1STWRefProcTaskProxy(ProcessTask& proc_task, + G1CollectedHeap* g1h, + RefToScanQueueSet *task_queues, + ParallelTaskTerminator* terminator) : + AbstractGangTask("Process reference objects in parallel"), + _proc_task(proc_task), + _g1h(g1h), + _task_queues(task_queues), + _terminator(terminator) + {} + + virtual void work(int i) { + // The reference processing task executed by a single worker. + ResourceMark rm; + HandleMark hm; + + G1STWIsAliveClosure is_alive(_g1h); + + G1ParScanThreadState pss(_g1h, i); + + G1ParScanHeapEvacClosure scan_evac_cl(_g1h, &pss, NULL); + G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, NULL); + G1ParScanPartialArrayClosure partial_scan_cl(_g1h, &pss, NULL); + + pss.set_evac_closure(&scan_evac_cl); + pss.set_evac_failure_closure(&evac_failure_cl); + pss.set_partial_scan_closure(&partial_scan_cl); + + G1ParScanExtRootClosure only_copy_non_heap_cl(_g1h, &pss, NULL); + G1ParScanPermClosure only_copy_perm_cl(_g1h, &pss, NULL); + + G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, &pss, NULL); + G1ParScanAndMarkPermClosure copy_mark_perm_cl(_g1h, &pss, NULL); + + OopClosure* copy_non_heap_cl = &only_copy_non_heap_cl; + OopsInHeapRegionClosure* copy_perm_cl = &only_copy_perm_cl; + + if (_g1h->g1_policy()->during_initial_mark_pause()) { + // We also need to mark copied objects. 
+ copy_non_heap_cl = ©_mark_non_heap_cl; + copy_perm_cl = ©_mark_perm_cl; + } + + // Keep alive closure. + G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, copy_perm_cl, &pss); + + // Complete GC closure + G1ParEvacuateFollowersClosure drain_queue(_g1h, &pss, _task_queues, _terminator); + + // Call the reference processing task's work routine. + _proc_task.work(i, is_alive, keep_alive, drain_queue); + + // Note we cannot assert that the refs array is empty here as not all + // of the processing tasks (specifically phase2 - pp2_work) execute + // the complete_gc closure (which ordinarily would drain the queue) so + // the queue may not be empty. + } +}; + +// Driver routine for parallel reference processing. +// Creates an instance of the ref processing gang +// task and has the worker threads execute it. +void G1STWRefProcTaskExecutor::execute(ProcessTask& proc_task) { + assert(_workers != NULL, "Need parallel worker threads."); + + ParallelTaskTerminator terminator(_active_workers, _queues); + G1STWRefProcTaskProxy proc_task_proxy(proc_task, _g1h, _queues, &terminator); + + _g1h->set_par_threads(_active_workers); + _workers->run_task(&proc_task_proxy); + _g1h->set_par_threads(0); +} + +// Gang task for parallel reference enqueueing. + +class G1STWRefEnqueueTaskProxy: public AbstractGangTask { + typedef AbstractRefProcTaskExecutor::EnqueueTask EnqueueTask; + EnqueueTask& _enq_task; + +public: + G1STWRefEnqueueTaskProxy(EnqueueTask& enq_task) : + AbstractGangTask("Enqueue reference objects in parallel"), + _enq_task(enq_task) + { } + + virtual void work(int i) { + _enq_task.work(i); + } +}; + +// Driver routine for parallel reference enqueing. +// Creates an instance of the ref enqueueing gang +// task and has the worker threads execute it. + +void G1STWRefProcTaskExecutor::execute(EnqueueTask& enq_task) { + assert(_workers != NULL, "Need parallel worker threads."); + + G1STWRefEnqueueTaskProxy enq_task_proxy(enq_task); + + _g1h->set_par_threads(_active_workers); + _workers->run_task(&enq_task_proxy); + _g1h->set_par_threads(0); +} + +// End of weak reference support closures + +// Abstract task used to preserve (i.e. copy) any referent objects +// that are in the collection set and are pointed to by reference +// objects discovered by the CM ref processor. 
+ +class G1ParPreserveCMReferentsTask: public AbstractGangTask { +protected: + G1CollectedHeap* _g1h; + RefToScanQueueSet *_queues; + ParallelTaskTerminator _terminator; + int _n_workers; + +public: + G1ParPreserveCMReferentsTask(G1CollectedHeap* g1h,int workers, RefToScanQueueSet *task_queues) : + AbstractGangTask("ParPreserveCMReferents"), + _g1h(g1h), + _queues(task_queues), + _terminator(workers, _queues), + _n_workers(workers) + { } + + void work(int i) { + ResourceMark rm; + HandleMark hm; + + G1ParScanThreadState pss(_g1h, i); + G1ParScanHeapEvacClosure scan_evac_cl(_g1h, &pss, NULL); + G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, NULL); + G1ParScanPartialArrayClosure partial_scan_cl(_g1h, &pss, NULL); + + pss.set_evac_closure(&scan_evac_cl); + pss.set_evac_failure_closure(&evac_failure_cl); + pss.set_partial_scan_closure(&partial_scan_cl); + + assert(pss.refs()->is_empty(), "both queue and overflow should be empty"); + + + G1ParScanExtRootClosure only_copy_non_heap_cl(_g1h, &pss, NULL); + G1ParScanPermClosure only_copy_perm_cl(_g1h, &pss, NULL); + + G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, &pss, NULL); + G1ParScanAndMarkPermClosure copy_mark_perm_cl(_g1h, &pss, NULL); + + OopClosure* copy_non_heap_cl = &only_copy_non_heap_cl; + OopsInHeapRegionClosure* copy_perm_cl = &only_copy_perm_cl; + + if (_g1h->g1_policy()->during_initial_mark_pause()) { + // We also need to mark copied objects. + copy_non_heap_cl = ©_mark_non_heap_cl; + copy_perm_cl = ©_mark_perm_cl; + } + + // Is alive closure + G1AlwaysAliveClosure always_alive(_g1h); + + // Copying keep alive closure. Applied to referent objects that need + // to be copied. + G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, copy_perm_cl, &pss); + + ReferenceProcessor* rp = _g1h->ref_processor_cm(); + + int limit = ReferenceProcessor::number_of_subclasses_of_ref() * rp->max_num_q(); + int stride = MIN2(MAX2(_n_workers, 1), limit); + + // limit is set using max_num_q() - which was set using ParallelGCThreads. + // So this must be true - but assert just in case someone decides to + // change the worker ids. + assert(0 <= i && i < limit, "sanity"); + assert(!rp->discovery_is_atomic(), "check this code"); + + // Select discovered lists [i, i+stride, i+2*stride,...,limit) + for (int idx = i; idx < limit; idx += stride) { + DiscoveredList& ref_list = rp->discovered_soft_refs()[idx]; + + DiscoveredListIterator iter(ref_list, &keep_alive, &always_alive); + while (iter.has_next()) { + // Since discovery is not atomic for the CM ref processor, we + // can see some null referent objects. + iter.load_ptrs(DEBUG_ONLY(true)); + oop ref = iter.obj(); + + // This will filter nulls. + if (iter.is_referent_alive()) { + iter.make_referent_alive(); + } + iter.move_to_next(); + } + } + + // Drain the queue - which may cause stealing + G1ParEvacuateFollowersClosure drain_queue(_g1h, &pss, _queues, &_terminator); + drain_queue.do_void(); + // Allocation buffers were retired at the end of G1ParEvacuateFollowersClosure + assert(pss.refs()->is_empty(), "should be"); + } +}; + +// Weak Reference processing during an evacuation pause (part 1). 
+void G1CollectedHeap::process_discovered_references() { + double ref_proc_start = os::elapsedTime(); + + ReferenceProcessor* rp = _ref_processor_stw; + assert(rp->discovery_enabled(), "should have been enabled"); + + // Any reference objects, in the collection set, that were 'discovered' + // by the CM ref processor should have already been copied (either by + // applying the external root copy closure to the discovered lists, or + // by following an RSet entry). + // + // But some of the referents, that are in the collection set, that these + // reference objects point to may not have been copied: the STW ref + // processor would have seen that the reference object had already + // been 'discovered' and would have skipped discovering the reference, + // but would not have treated the reference object as a regular oop. + // As a reult the copy closure would not have been applied to the + // referent object. + // + // We need to explicitly copy these referent objects - the references + // will be processed at the end of remarking. + // + // We also need to do this copying before we process the reference + // objects discovered by the STW ref processor in case one of these + // referents points to another object which is also referenced by an + // object discovered by the STW ref processor. + + int n_workers = (G1CollectedHeap::use_parallel_gc_threads() ? + workers()->total_workers() : 1); + + set_par_threads(n_workers); + G1ParPreserveCMReferentsTask keep_cm_referents(this, n_workers, _task_queues); + + if (G1CollectedHeap::use_parallel_gc_threads()) { + workers()->run_task(&keep_cm_referents); + } else { + keep_cm_referents.work(0); + } + + set_par_threads(0); + + // Closure to test whether a referent is alive. + G1STWIsAliveClosure is_alive(this); + + // Even when parallel reference processing is enabled, the processing + // of JNI refs is serial and performed serially by the current thread + // rather than by a worker. The following PSS will be used for processing + // JNI refs. + + // Use only a single queue for this PSS. + G1ParScanThreadState pss(this, 0); + + // We do not embed a reference processor in the copying/scanning + // closures while we're actually processing the discovered + // reference objects. + G1ParScanHeapEvacClosure scan_evac_cl(this, &pss, NULL); + G1ParScanHeapEvacFailureClosure evac_failure_cl(this, &pss, NULL); + G1ParScanPartialArrayClosure partial_scan_cl(this, &pss, NULL); + + pss.set_evac_closure(&scan_evac_cl); + pss.set_evac_failure_closure(&evac_failure_cl); + pss.set_partial_scan_closure(&partial_scan_cl); + + assert(pss.refs()->is_empty(), "pre-condition"); + + G1ParScanExtRootClosure only_copy_non_heap_cl(this, &pss, NULL); + G1ParScanPermClosure only_copy_perm_cl(this, &pss, NULL); + + G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(this, &pss, NULL); + G1ParScanAndMarkPermClosure copy_mark_perm_cl(this, &pss, NULL); + + OopClosure* copy_non_heap_cl = &only_copy_non_heap_cl; + OopsInHeapRegionClosure* copy_perm_cl = &only_copy_perm_cl; + + if (_g1h->g1_policy()->during_initial_mark_pause()) { + // We also need to mark copied objects. + copy_non_heap_cl = ©_mark_non_heap_cl; + copy_perm_cl = ©_mark_perm_cl; + } + + // Keep alive closure. + G1CopyingKeepAliveClosure keep_alive(this, copy_non_heap_cl, copy_perm_cl, &pss); + + // Serial Complete GC closure + G1STWDrainQueueClosure drain_queue(this, &pss); + + // Setup the soft refs policy... + rp->setup_policy(false); + + if (!rp->processing_is_mt()) { + // Serial reference processing... 
+ rp->process_discovered_references(&is_alive, + &keep_alive, + &drain_queue, + NULL); + } else { + // Parallel reference processing + int active_workers = (ParallelGCThreads > 0 ? workers()->total_workers() : 1); + assert(rp->num_q() == active_workers, "sanity"); + assert(active_workers <= rp->max_num_q(), "sanity"); + + G1STWRefProcTaskExecutor par_task_executor(this, workers(), _task_queues, active_workers); + rp->process_discovered_references(&is_alive, &keep_alive, &drain_queue, &par_task_executor); + } + + // We have completed copying any necessary live referent objects + // (that were not copied during the actual pause) so we can + // retire any active alloc buffers + pss.retire_alloc_buffers(); + assert(pss.refs()->is_empty(), "both queue and overflow should be empty"); + + double ref_proc_time = os::elapsedTime() - ref_proc_start; + g1_policy()->record_ref_proc_time(ref_proc_time * 1000.0); +} + +// Weak Reference processing during an evacuation pause (part 2). +void G1CollectedHeap::enqueue_discovered_references() { + double ref_enq_start = os::elapsedTime(); + + ReferenceProcessor* rp = _ref_processor_stw; + assert(!rp->discovery_enabled(), "should have been disabled as part of processing"); + + // Now enqueue any remaining on the discovered lists on to + // the pending list. + if (!rp->processing_is_mt()) { + // Serial reference processing... + rp->enqueue_discovered_references(); + } else { + // Parallel reference enqueuing + + int active_workers = (ParallelGCThreads > 0 ? workers()->total_workers() : 1); + assert(rp->num_q() == active_workers, "sanity"); + assert(active_workers <= rp->max_num_q(), "sanity"); + + G1STWRefProcTaskExecutor par_task_executor(this, workers(), _task_queues, active_workers); + rp->enqueue_discovered_references(&par_task_executor); + } + + rp->verify_no_references_recorded(); + assert(!rp->discovery_enabled(), "should have been disabled"); + + // FIXME + // CM's reference processing also cleans up the string and symbol tables. + // Should we do that here also? We could, but it is a serial operation + // and could signicantly increase the pause time. + + double ref_enq_time = os::elapsedTime() - ref_enq_start; + g1_policy()->record_ref_enq_time(ref_enq_time * 1000.0); +} + void G1CollectedHeap::evacuate_collection_set() { set_evacuation_failed(false); @@ -4686,6 +5262,7 @@ void G1CollectedHeap::evacuate_collection_set() { assert(dirty_card_queue_set().completed_buffers_num() == 0, "Should be empty"); double start_par = os::elapsedTime(); + if (G1CollectedHeap::use_parallel_gc_threads()) { // The individual threads will set their evac-failure closures. StrongRootsScope srs(this); @@ -4700,15 +5277,23 @@ void G1CollectedHeap::evacuate_collection_set() { g1_policy()->record_par_time(par_time); set_par_threads(0); + // Process any discovered reference objects - we have + // to do this _before_ we retire the GC alloc regions + // as we may have to copy some 'reachable' referent + // objects (and their reachable sub-graphs) that were + // not copied during the pause. + process_discovered_references(); + // Weak root processing. // Note: when JSR 292 is enabled and code blobs can contain // non-perm oops then we will need to process the code blobs // here too. 
 {
- G1IsAliveClosure is_alive(this);
+ G1STWIsAliveClosure is_alive(this);
 G1KeepAliveClosure keep_alive(this);
 JNIHandles::weak_oops_do(&is_alive, &keep_alive);
 }
+
 release_gc_alloc_regions();
 g1_rem_set()->cleanup_after_oops_into_collection_set_do();
@@ -4730,6 +5315,15 @@ void G1CollectedHeap::evacuate_collection_set() {
 }
 }
+ // Enqueue any references remaining on the STW
+ // reference processor's discovered lists. We need to do
+ // this after the card table is cleaned (and verified) as
+ // the act of enqueuing entries on to the pending list
+ // will log these updates (and dirty their associated
+ // cards). We need these updates logged to update any
+ // RSets.
+ enqueue_discovered_references();
+
 if (G1DeferredRSUpdate) {
 RedirtyLoggedCardTableEntryFastClosure redirty;
 dirty_card_queue_set().set_closure(&redirty);
@@ -4930,7 +5524,7 @@ void G1CollectedHeap::cleanUpCardTable() {
 }
 double elapsed = os::elapsedTime() - start;
- g1_policy()->record_clear_ct_time( elapsed * 1000.0);
+ g1_policy()->record_clear_ct_time(elapsed * 1000.0);
 #ifndef PRODUCT
 if (G1VerifyCTCleanup || VerifyAfterGC) {
 G1VerifyCardTableCleanup cleanup_verifier(this, ct_bs);
diff --git a/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp b/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
index 6a361326f6dea4f1f9066511a96a68f45ded1410..9dcf1a825569421dea04e06438c29e162ae7979e 100644
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
@@ -155,6 +155,19 @@ public:
 : G1AllocRegion("Mutator Alloc Region", false /* bot_updates */) { }
 };
+// The G1 STW is alive closure.
+// An instance is embedded into the G1CH and used as the
+// (optional) _is_alive_non_header closure in the STW
+// reference processor. It is also extensively used during
+// reference processing during STW evacuation pauses.
+class G1STWIsAliveClosure: public BoolObjectClosure {
+ G1CollectedHeap* _g1;
+public:
+ G1STWIsAliveClosure(G1CollectedHeap* g1) : _g1(g1) {}
+ void do_object(oop p) { assert(false, "Do not call."); }
+ bool do_object_b(oop p);
+};
+
 class SurvivorGCAllocRegion : public G1AllocRegion {
 protected:
 virtual HeapRegion* allocate_new_region(size_t word_size, bool force);
@@ -174,6 +187,7 @@ public:
 };
 class RefineCardTableEntryClosure;
+
 class G1CollectedHeap : public SharedHeap {
 friend class VM_G1CollectForAllocation;
 friend class VM_GenCollectForPermanentAllocation;
@@ -573,6 +587,14 @@ protected:
 // allocated block, or else "NULL".
 HeapWord* expand_and_allocate(size_t word_size);
+ // Process any reference objects discovered during
+ // an incremental evacuation pause.
+ void process_discovered_references();
+
+ // Enqueue any remaining discovered references
+ // after processing.
+ void enqueue_discovered_references();
+
 public:
 G1MonitoringSupport* g1mm() { return _g1mm; }
@@ -826,14 +848,83 @@ protected:
 bool should_mark_root);
 void handle_evacuation_failure_common(oop obj, markOop m);
- // Instance of the concurrent mark is_alive closure for embedding
- // into the reference processor as the is_alive_non_header. This
- // prevents unnecessary additions to the discovered lists during
- // concurrent discovery.
- G1CMIsAliveClosure _is_alive_closure;
+ // ("Weak") Reference processing support.
+ //
+ // G1 has 2 instances of the reference processor class. One
+ // (_ref_processor_cm) handles reference object discovery
+ // and subsequent processing during concurrent marking cycles.
+ //
+ // The other (_ref_processor_stw) handles reference object
+ // discovery and processing during full GCs and incremental
+ // evacuation pauses.
+ //
+ // During an incremental pause, reference discovery will be
+ // temporarily disabled for _ref_processor_cm and will be
+ // enabled for _ref_processor_stw. At the end of the evacuation
+ // pause, references discovered by _ref_processor_stw will be
+ // processed and discovery will be disabled. The previous
+ // setting for reference object discovery for _ref_processor_cm
+ // will be reinstated.
+ //
+ // At the start of marking:
+ // * Discovery by the CM ref processor is verified to be inactive
+ // and its discovered lists are empty.
+ // * Discovery by the CM ref processor is then enabled.
+ //
+ // At the end of marking:
+ // * Any references on the CM ref processor's discovered
+ // lists are processed (possibly MT).
+ //
+ // At the start of full GC we:
+ // * Disable discovery by the CM ref processor and
+ // empty CM ref processor's discovered lists
+ // (without processing any entries).
+ // * Verify that the STW ref processor is inactive and its
+ // discovered lists are empty.
+ // * Temporarily set STW ref processor discovery as single threaded.
+ // * Temporarily clear the STW ref processor's _is_alive_non_header
+ // field.
+ // * Finally enable discovery by the STW ref processor.
+ //
+ // The STW ref processor is used to record any discovered
+ // references during the full GC.
+ //
+ // At the end of a full GC we:
+ // * Enqueue any reference objects discovered by the STW ref processor
+ // that have non-live referents. This has the side-effect of
+ // making the STW ref processor inactive by disabling discovery.
+ // * Verify that the CM ref processor is still inactive
+ // and no references have been placed on its discovered
+ // lists (also checked as a precondition during initial marking).
+
+ // The (stw) reference processor...
+ ReferenceProcessor* _ref_processor_stw;
+
+ // During reference object discovery, the _is_alive_non_header
+ // closure (if non-null) is applied to the referent object to
+ // determine whether the referent is live. If so then the
+ // reference object does not need to be 'discovered' and can
+ // be treated as a regular oop. This has the benefit of reducing
+ // the number of 'discovered' reference objects that need to
+ // be processed.
+ //
+ // Instance of the is_alive closure for embedding into the
+ // STW reference processor as the _is_alive_non_header field.
+ // Supplying a value for the _is_alive_non_header field is
+ // optional but doing so prevents unnecessary additions to
+ // the discovered lists during reference discovery.
+ G1STWIsAliveClosure _is_alive_closure_stw;
+
+ // The (concurrent marking) reference processor...
+ ReferenceProcessor* _ref_processor_cm;
- // ("Weak") Reference processing support
- ReferenceProcessor* _ref_processor;
+ // Instance of the concurrent mark is_alive closure for embedding
+ // into the Concurrent Marking reference processor as the
+ // _is_alive_non_header field. Supplying a value for the
+ // _is_alive_non_header field is optional but doing so prevents
+ // unnecessary additions to the discovered lists during reference
+ // discovery.
+ G1CMIsAliveClosure _is_alive_closure_cm;
 enum G1H_process_strong_roots_tasks {
 G1H_PS_mark_stack_oops_do,
@@ -874,6 +965,7 @@ public:
 // specified by the policy object.
 jint initialize();
+ // Initialize weak reference processing.
 virtual void ref_processing_init();
 void set_par_threads(int t) {
@@ -925,8 +1017,13 @@ public:
 // The shared block offset table array.
 G1BlockOffsetSharedArray* bot_shared() const { return _bot_shared; }
- // Reference Processing accessor
- ReferenceProcessor* ref_processor() { return _ref_processor; }
+ // Reference Processing accessors
+
+ // The STW reference processor...
+ ReferenceProcessor* ref_processor_stw() const { return _ref_processor_stw; }
+
+ // The Concurrent Marking reference processor...
+ ReferenceProcessor* ref_processor_cm() const { return _ref_processor_cm; }
 virtual size_t capacity() const;
 virtual size_t used() const;
diff --git a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
index 92c47409a285f7688b4b7e160e3a070097178753..24f96aef85c5e95dd1ca2f3853c5f6d2b6ec4d55 100644
--- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
@@ -152,8 +152,12 @@ G1CollectorPolicy::G1CollectorPolicy() :
 _summary(new Summary()),
-#ifndef PRODUCT
 _cur_clear_ct_time_ms(0.0),
+
+ _cur_ref_proc_time_ms(0.0),
+ _cur_ref_enq_time_ms(0.0),
+
+#ifndef PRODUCT
 _min_clear_cc_time_ms(-1.0),
 _max_clear_cc_time_ms(-1.0),
 _cur_clear_cc_time_ms(0.0),
@@ -1479,6 +1483,8 @@ void G1CollectorPolicy::record_collection_pause_end() {
 #endif
 print_stats(1, "Other", other_time_ms);
 print_stats(2, "Choose CSet", _recorded_young_cset_choice_time_ms);
+ print_stats(2, "Ref Proc", _cur_ref_proc_time_ms);
+ print_stats(2, "Ref Enq", _cur_ref_enq_time_ms);
 for (int i = 0; i < _aux_num; ++i) {
 if (_cur_aux_times_set[i]) {
diff --git a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp
index 9dc7349fa47019594e4b8c660e1493fd3bf82764..2c315729e3406d678034dfea849e23407dd8130f 100644
--- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp
@@ -119,6 +119,8 @@ protected:
 double _cur_satb_drain_time_ms;
 double _cur_clear_ct_time_ms;
 bool _satb_drain_time_set;
+ double _cur_ref_proc_time_ms;
+ double _cur_ref_enq_time_ms;
 #ifndef PRODUCT
 // Card Table Count Cache stats
@@ -986,6 +988,14 @@ public:
 _cur_aux_times_ms[i] += ms;
 }
+ void record_ref_proc_time(double ms) {
+ _cur_ref_proc_time_ms = ms;
+ }
+
+ void record_ref_enq_time(double ms) {
+ _cur_ref_enq_time_ms = ms;
+ }
+
 #ifndef PRODUCT
 void record_cc_clear_time(double ms) {
 if (_min_clear_cc_time_ms < 0.0 || ms <= _min_clear_cc_time_ms)
diff --git a/src/share/vm/gc_implementation/g1/g1MarkSweep.cpp b/src/share/vm/gc_implementation/g1/g1MarkSweep.cpp
index 1ed4f993d1271726bd95ce4168ac80125750dc9c..84bbadbb951b0361a110c33cf2c26ff79e201367 100644
--- a/src/share/vm/gc_implementation/g1/g1MarkSweep.cpp
+++ b/src/share/vm/gc_implementation/g1/g1MarkSweep.cpp
@@ -62,6 +62,8 @@ void G1MarkSweep::invoke_at_safepoint(ReferenceProcessor* rp,
 // hook up weak ref data so it can be used during Mark-Sweep
 assert(GenMarkSweep::ref_processor() == NULL, "no stomping");
 assert(rp != NULL, "should be non-NULL");
+ assert(rp == G1CollectedHeap::heap()->ref_processor_stw(), "Precondition");
+
 GenMarkSweep::_ref_processor = rp;
 rp->setup_policy(clear_all_softrefs);
@@ -139,6 +141,8 @@ void G1MarkSweep::mark_sweep_phase1(bool& marked_for_unloading,
 // Process reference objects found during marking
 ReferenceProcessor* rp = GenMarkSweep::ref_processor();
+ assert(rp == G1CollectedHeap::heap()->ref_processor_stw(), "Sanity");
+
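// --------------------------------------------------------------------------
// Illustrative aside - not part of this changeset and not HotSpot code: a
// tiny standalone sketch of the protocol described in the g1CollectedHeap.hpp
// comment block above. One processor (cm) stays enabled for the whole marking
// cycle, while the other (stw) is enabled only around a pause or full GC and
// is re-verified each time it is enabled. All names below are invented.
// --------------------------------------------------------------------------
#include <cassert>
#include <cstddef>

struct ToyRefProcessor {
  bool discovering = false;
  size_t discovered = 0;

  void enable_discovery(bool verify_disabled, bool check_no_refs) {
    if (verify_disabled) assert(!discovering && "nested enable?");
    if (check_no_refs)   assert(discovered == 0 && "lists should be empty");
    discovering = true;
  }
  // Processing empties the lists and leaves discovery disabled.
  void process() { discovered = 0; discovering = false; }
};

int main() {
  ToyRefProcessor cm;   // concurrent-marking processor
  ToyRefProcessor stw;  // pause / full-GC processor

  // Initial mark: CM discovery is switched on with both checks.
  cm.enable_discovery(true, true);

  // An evacuation pause during marking: STW discovery is enabled just for the
  // pause and processed at its end; CM keeps what it has discovered so far.
  stw.enable_discovery(true, true);
  cm.discovered = 5;    // CM discoveries made concurrently
  stw.discovered = 2;   // STW discoveries made during the pause
  stw.process();
  assert(!stw.discovering && cm.discovering && cm.discovered == 5);

  // Remark: CM references are processed, which ends CM discovery.
  cm.process();
  assert(!cm.discovering && cm.discovered == 0);
  return 0;
}
// --------------------------------------------------------------------------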
rp->setup_policy(clear_all_softrefs); rp->process_discovered_references(&GenMarkSweep::is_alive, &GenMarkSweep::keep_alive, @@ -166,7 +170,6 @@ void G1MarkSweep::mark_sweep_phase1(bool& marked_for_unloading, GenMarkSweep::follow_mdo_weak_refs(); assert(GenMarkSweep::_marking_stack.is_empty(), "just drained"); - // Visit interned string tables and delete unmarked oops StringTable::unlink(&GenMarkSweep::is_alive); // Clean up unreferenced symbols in symbol table. @@ -346,7 +349,8 @@ void G1MarkSweep::mark_sweep_phase3() { NULL, // do not touch code cache here &GenMarkSweep::adjust_pointer_closure); - g1h->ref_processor()->weak_oops_do(&GenMarkSweep::adjust_root_pointer_closure); + assert(GenMarkSweep::ref_processor() == g1h->ref_processor_stw(), "Sanity"); + g1h->ref_processor_stw()->weak_oops_do(&GenMarkSweep::adjust_root_pointer_closure); // Now adjust pointers in remaining weak roots. (All of which should // have been cleared if they pointed to non-surviving objects.) diff --git a/src/share/vm/gc_implementation/g1/g1OopClosures.hpp b/src/share/vm/gc_implementation/g1/g1OopClosures.hpp index 73d87a11c668dc4e9eea191f97f9b7b6b43aee1b..84e67725f90f57d3002c840e9d361619cae14edd 100644 --- a/src/share/vm/gc_implementation/g1/g1OopClosures.hpp +++ b/src/share/vm/gc_implementation/g1/g1OopClosures.hpp @@ -34,6 +34,7 @@ class CMBitMap; class CMMarkStack; class G1ParScanThreadState; class CMTask; +class ReferenceProcessor; // A class that scans oops in a given heap region (much as OopsInGenClosure // scans oops in a generation.) @@ -59,8 +60,15 @@ public: class G1ParPushHeapRSClosure : public G1ParClosureSuper { public: - G1ParPushHeapRSClosure(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state) : - G1ParClosureSuper(g1, par_scan_state) { } + G1ParPushHeapRSClosure(G1CollectedHeap* g1, + G1ParScanThreadState* par_scan_state, + ReferenceProcessor* rp) : + G1ParClosureSuper(g1, par_scan_state) + { + assert(_ref_processor == NULL, "sanity"); + _ref_processor = rp; + } + template void do_oop_nv(T* p); virtual void do_oop(oop* p) { do_oop_nv(p); } virtual void do_oop(narrowOop* p) { do_oop_nv(p); } @@ -68,8 +76,13 @@ public: class G1ParScanClosure : public G1ParClosureSuper { public: - G1ParScanClosure(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state) : - G1ParClosureSuper(g1, par_scan_state) { } + G1ParScanClosure(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state, ReferenceProcessor* rp) : + G1ParClosureSuper(g1, par_scan_state) + { + assert(_ref_processor == NULL, "sanity"); + _ref_processor = rp; + } + template void do_oop_nv(T* p); virtual void do_oop(oop* p) { do_oop_nv(p); } virtual void do_oop(narrowOop* p) { do_oop_nv(p); } @@ -92,9 +105,18 @@ template inline oop clear_partial_array_mask(T* ref) { class G1ParScanPartialArrayClosure : public G1ParClosureSuper { G1ParScanClosure _scanner; + public: - G1ParScanPartialArrayClosure(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state) : - G1ParClosureSuper(g1, par_scan_state), _scanner(g1, par_scan_state) { } + G1ParScanPartialArrayClosure(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state, ReferenceProcessor* rp) : + G1ParClosureSuper(g1, par_scan_state), _scanner(g1, par_scan_state, rp) + { + assert(_ref_processor == NULL, "sanity"); + } + + G1ParScanClosure* scanner() { + return &_scanner; + } + template void do_oop_nv(T* p); virtual void do_oop(oop* p) { do_oop_nv(p); } virtual void do_oop(narrowOop* p) { do_oop_nv(p); } @@ -117,10 +139,20 @@ template class G1ParCopyClosure : public G1ParCopyHelper { 
G1ParScanClosure _scanner; + template void do_oop_work(T* p); + public: - G1ParCopyClosure(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state) : - _scanner(g1, par_scan_state), G1ParCopyHelper(g1, par_scan_state, &_scanner) { } + G1ParCopyClosure(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state, + ReferenceProcessor* rp) : + _scanner(g1, par_scan_state, rp), + G1ParCopyHelper(g1, par_scan_state, &_scanner) + { + assert(_ref_processor == NULL, "sanity"); + } + + G1ParScanClosure* scanner() { return &_scanner; } + template void do_oop_nv(T* p) { do_oop_work(p); } @@ -130,21 +162,25 @@ public: typedef G1ParCopyClosure G1ParScanExtRootClosure; typedef G1ParCopyClosure G1ParScanPermClosure; -typedef G1ParCopyClosure G1ParScanHeapRSClosure; + typedef G1ParCopyClosure G1ParScanAndMarkExtRootClosure; typedef G1ParCopyClosure G1ParScanAndMarkPermClosure; -typedef G1ParCopyClosure G1ParScanAndMarkHeapRSClosure; - -// This is the only case when we set skip_cset_test. Basically, this -// closure is (should?) only be called directly while we're draining -// the overflow and task queues. In that case we know that the -// reference in question points into the collection set, otherwise we -// would not have pushed it on the queue. The following is defined in -// g1_specialized_oop_closures.hpp. -// typedef G1ParCopyClosure G1ParScanHeapEvacClosure; -// We need a separate closure to handle references during evacuation -// failure processing, as we cannot asume that the reference already -// points into the collection set (like G1ParScanHeapEvacClosure does). + +// The following closure types are no longer used but are retained +// for historical reasons: +// typedef G1ParCopyClosure G1ParScanHeapRSClosure; +// typedef G1ParCopyClosure G1ParScanAndMarkHeapRSClosure; + +// The following closure type is defined in g1_specialized_oop_closures.hpp: +// +// typedef G1ParCopyClosure G1ParScanHeapEvacClosure; + +// We use a separate closure to handle references during evacuation +// failure processing. +// We could have used another instance of G1ParScanHeapEvacClosure +// (since that closure no longer assumes that the references it +// handles point into the collection set). 
+ typedef G1ParCopyClosure G1ParScanHeapEvacFailureClosure; class FilterIntoCSClosure: public OopClosure { @@ -153,9 +189,15 @@ class FilterIntoCSClosure: public OopClosure { DirtyCardToOopClosure* _dcto_cl; public: FilterIntoCSClosure( DirtyCardToOopClosure* dcto_cl, - G1CollectedHeap* g1, OopClosure* oc) : + G1CollectedHeap* g1, + OopClosure* oc, + ReferenceProcessor* rp) : _dcto_cl(dcto_cl), _g1(g1), _oc(oc) - {} + { + assert(_ref_processor == NULL, "sanity"); + _ref_processor = rp; + } + template void do_oop_nv(T* p); virtual void do_oop(oop* p) { do_oop_nv(p); } virtual void do_oop(narrowOop* p) { do_oop_nv(p); } diff --git a/src/share/vm/gc_implementation/g1/g1RemSet.cpp b/src/share/vm/gc_implementation/g1/g1RemSet.cpp index 568bb64f5050c14564f60302ec9fe61c891d0e26..04a3bf5e5fa96a2b1e737d8fc9add656f9b4e837 100644 --- a/src/share/vm/gc_implementation/g1/g1RemSet.cpp +++ b/src/share/vm/gc_implementation/g1/g1RemSet.cpp @@ -234,6 +234,7 @@ void G1RemSet::scanRS(OopsInHeapRegionClosure* oc, int worker_i) { HeapRegion *startRegion = calculateStartRegion(worker_i); ScanRSClosure scanRScl(oc, worker_i); + _g1->collection_set_iterate_from(startRegion, &scanRScl); scanRScl.set_try_claimed(); _g1->collection_set_iterate_from(startRegion, &scanRScl); @@ -283,6 +284,7 @@ void G1RemSet::updateRS(DirtyCardQueue* into_cset_dcq, int worker_i) { double start = os::elapsedTime(); // Apply the given closure to all remaining log entries. RefineRecordRefsIntoCSCardTableEntryClosure into_cset_update_rs_cl(_g1, into_cset_dcq); + _g1->iterate_dirty_card_closure(&into_cset_update_rs_cl, into_cset_dcq, false, worker_i); // Now there should be no dirty cards. @@ -466,7 +468,7 @@ public: MemRegion scanRegion(start, end); UpdateRSetImmediate update_rs_cl(_g1->g1_rem_set()); - FilterIntoCSClosure update_rs_cset_oop_cl(NULL, _g1, &update_rs_cl); + FilterIntoCSClosure update_rs_cset_oop_cl(NULL, _g1, &update_rs_cl, NULL /* rp */); FilterOutOfRegionClosure filter_then_update_rs_cset_oop_cl(r, &update_rs_cset_oop_cl); // We can pass false as the "filter_young" parameter here as: @@ -642,7 +644,7 @@ bool G1RemSet::concurrentRefineOneCard_impl(jbyte* card_ptr, int worker_i, update_rs_oop_cl.set_from(r); TriggerClosure trigger_cl; - FilterIntoCSClosure into_cs_cl(NULL, _g1, &trigger_cl); + FilterIntoCSClosure into_cs_cl(NULL, _g1, &trigger_cl, NULL /* rp */); InvokeIfNotTriggeredClosure invoke_cl(&trigger_cl, &into_cs_cl); Mux2Closure mux(&invoke_cl, &update_rs_oop_cl); diff --git a/src/share/vm/gc_implementation/g1/heapRegion.cpp b/src/share/vm/gc_implementation/g1/heapRegion.cpp index f199cd85f38e6bd26cff5c3b8688df8d59c95d2d..36636152e399d7cfd7b2b0047361d583cbddaa0f 100644 --- a/src/share/vm/gc_implementation/g1/heapRegion.cpp +++ b/src/share/vm/gc_implementation/g1/heapRegion.cpp @@ -45,7 +45,7 @@ HeapRegionDCTOC::HeapRegionDCTOC(G1CollectedHeap* g1, FilterKind fk) : ContiguousSpaceDCTOC(hr, cl, precision, NULL), _hr(hr), _fk(fk), _g1(g1) -{} +{ } FilterOutOfRegionClosure::FilterOutOfRegionClosure(HeapRegion* r, OopClosure* oc) : @@ -214,8 +214,37 @@ void HeapRegionDCTOC::walk_mem_region_with_cl(MemRegion mr, int oop_size; OopClosure* cl2 = cl; - FilterIntoCSClosure intoCSFilt(this, g1h, cl); + + // If we are scanning the remembered sets looking for refs + // into the collection set during an evacuation pause then + // we will want to 'discover' reference objects that point + // to referents in the collection set. 
+ //
+ // Unfortunately it is an instance of FilterIntoCSClosure
+ // that is applied to the reference fields of oops in
+ // mr (and not the G1ParPushHeapRSClosure - which is the
+ // cl parameter).
+ // If we set the _ref_processor field in the FilterIntoCSClosure
+ // instance, all the reference objects that are walked
+ // (regardless of whether their referent objects are in
+ // the cset) will be 'discovered'.
+ //
+ // The G1STWIsAlive closure considers a referent object that
+ // is outside the cset as alive. The G1CopyingKeepAliveClosure
+ // skips referents that are not in the cset.
+ //
+ // Therefore reference objects in mr with a referent that is
+ // outside the cset should be OK.
+
+ ReferenceProcessor* rp = _cl->_ref_processor;
+ if (rp != NULL) {
+ assert(rp == _g1->ref_processor_stw(), "should be stw");
+ assert(_fk == IntoCSFilterKind, "should be looking for refs into CS");
+ }
+
+ FilterIntoCSClosure intoCSFilt(this, g1h, cl, rp);
 FilterOutOfRegionClosure outOfRegionFilt(_hr, cl);
+
 switch (_fk) {
 case IntoCSFilterKind: cl2 = &intoCSFilt; break;
 case OutOfRegionFilterKind: cl2 = &outOfRegionFilt; break;
@@ -239,16 +268,19 @@ void HeapRegionDCTOC::walk_mem_region_with_cl(MemRegion mr,
 case NoFilterKind:
 bottom = walk_mem_region_loop(cl, g1h, _hr, bottom, top);
 break;
+
 case IntoCSFilterKind: {
- FilterIntoCSClosure filt(this, g1h, cl);
+ FilterIntoCSClosure filt(this, g1h, cl, rp);
 bottom = walk_mem_region_loop(&filt, g1h, _hr, bottom, top);
 break;
 }
+
 case OutOfRegionFilterKind: {
 FilterOutOfRegionClosure filt(_hr, cl);
 bottom = walk_mem_region_loop(&filt, g1h, _hr, bottom, top);
 break;
 }
+
 default:
 ShouldNotReachHere();
 }
@@ -483,7 +515,7 @@ HeapRegion::
 HeapRegion(size_t hrs_index, G1BlockOffsetSharedArray* sharedOffsetArray,
 MemRegion mr, bool is_zeroed)
 : G1OffsetTableContigSpace(sharedOffsetArray, mr, is_zeroed),
- _next_fk(HeapRegionDCTOC::NoFilterKind), _hrs_index(hrs_index),
+ _hrs_index(hrs_index),
 _humongous_type(NotHumongous), _humongous_start_region(NULL),
 _in_collection_set(false),
 _next_in_special_set(NULL), _orig_end(NULL),
diff --git a/src/share/vm/gc_implementation/g1/heapRegion.hpp b/src/share/vm/gc_implementation/g1/heapRegion.hpp
index 774987dd8afc9339a206debd12eaa8c5970e86ea..f2edf01b9f65ba80efb388b5d426b99d76f03758 100644
--- a/src/share/vm/gc_implementation/g1/heapRegion.hpp
+++ b/src/share/vm/gc_implementation/g1/heapRegion.hpp
@@ -118,7 +118,6 @@ public:
 FilterKind fk);
 };
-
 // The complicating factor is that BlockOffsetTable diverged
 // significantly, and we need functionality that is only in the G1 version.
 // So I copied that code, which led to an alternate G1 version of
@@ -223,10 +222,6 @@ class HeapRegion: public G1OffsetTableContigSpace {
 ContinuesHumongous
 };
- // The next filter kind that should be used for a "new_dcto_cl" call with
- // the "traditional" signature.
- HeapRegionDCTOC::FilterKind _next_fk;
-
 // Requires that the region "mr" be dense with objects, and begin and end
 // with an object.
 void oops_in_mr_iterate(MemRegion mr, OopClosure* cl);
@@ -573,40 +568,14 @@ class HeapRegion: public G1OffsetTableContigSpace {
 // allocated in the current region before the last call to "save_mark".
 void oop_before_save_marks_iterate(OopClosure* cl);
- // This call determines the "filter kind" argument that will be used for
- // the next call to "new_dcto_cl" on this region with the "traditional"
- // signature (i.e., the call below.)
The default, in the absence of a - // preceding call to this method, is "NoFilterKind", and a call to this - // method is necessary for each such call, or else it reverts to the - // default. - // (This is really ugly, but all other methods I could think of changed a - // lot of main-line code for G1.) - void set_next_filter_kind(HeapRegionDCTOC::FilterKind nfk) { - _next_fk = nfk; - } - DirtyCardToOopClosure* new_dcto_closure(OopClosure* cl, CardTableModRefBS::PrecisionStyle precision, HeapRegionDCTOC::FilterKind fk); -#if WHASSUP - DirtyCardToOopClosure* - new_dcto_closure(OopClosure* cl, - CardTableModRefBS::PrecisionStyle precision, - HeapWord* boundary) { - assert(boundary == NULL, "This arg doesn't make sense here."); - DirtyCardToOopClosure* res = new_dcto_closure(cl, precision, _next_fk); - _next_fk = HeapRegionDCTOC::NoFilterKind; - return res; - } -#endif - - // // Note the start or end of marking. This tells the heap region // that the collector is about to start or has finished (concurrently) // marking the heap. - // // Note the start of a marking phase. Record the // start of the unmarked area of the region here. diff --git a/src/share/vm/gc_implementation/g1/satbQueue.cpp b/src/share/vm/gc_implementation/g1/satbQueue.cpp index 5a7a7b694e886b1935e7601bd574a13bef092b28..ea0c19a89a8f57e8539c3c3713bad3de86e24dd4 100644 --- a/src/share/vm/gc_implementation/g1/satbQueue.cpp +++ b/src/share/vm/gc_implementation/g1/satbQueue.cpp @@ -29,6 +29,7 @@ #include "memory/sharedHeap.hpp" #include "runtime/mutexLocker.hpp" #include "runtime/thread.hpp" +#include "runtime/vmThread.hpp" // This method removes entries from an SATB buffer that will not be // useful to the concurrent marking threads. An entry is removed if it @@ -252,9 +253,18 @@ void SATBMarkQueueSet::par_iterate_closure_all_threads(int worker) { t->satb_mark_queue().apply_closure(_par_closures[worker]); } } - // We'll have worker 0 do this one. - if (worker == 0) { - shared_satb_queue()->apply_closure(_par_closures[0]); + + // We also need to claim the VMThread so that its parity is updated + // otherwise the next call to Thread::possibly_parallel_oops_do inside + // a StrongRootsScope might skip the VMThread because it has a stale + // parity that matches the parity set by the StrongRootsScope + // + // Whichever worker succeeds in claiming the VMThread gets to do + // the shared queue. 
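// --------------------------------------------------------------------------
// Illustrative aside - not part of this changeset and not HotSpot code: a
// standalone sketch of the claim-by-parity idea described in the comment
// above. Each GC round uses a new parity value, and whichever worker first
// installs that value with a CAS "claims" the VMThread's share of the work -
// here, the shared SATB queue. All names below are invented.
// --------------------------------------------------------------------------
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

static std::atomic<int> vm_thread_parity{0};

// Returns true for exactly one caller per new parity value.
static bool claim_vm_thread(int parity) {
  int seen = vm_thread_parity.load();
  while (seen != parity) {
    if (vm_thread_parity.compare_exchange_weak(seen, parity)) {
      return true;  // we replaced the stale parity, so we own the shared work
    }
    // On failure 'seen' now holds the current value; the loop re-checks it.
  }
  return false;     // somebody else already claimed this round
}

int main() {
  const int round_parity = 1;
  std::atomic<int> claims{0};

  std::vector<std::thread> workers;
  for (int w = 0; w < 4; ++w) {
    workers.emplace_back([&claims, round_parity] {
      if (claim_vm_thread(round_parity)) {
        ++claims;   // "apply the closure to the shared SATB queue"
      }
    });
  }
  for (std::thread& t : workers) t.join();

  std::printf("shared queue processed by %d worker(s)\n", claims.load());  // always 1
  return 0;
}
// --------------------------------------------------------------------------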
+ + VMThread* vmt = VMThread::vm_thread(); + if (vmt->claim_oops_do(true, parity)) { + shared_satb_queue()->apply_closure(_par_closures[worker]); } } diff --git a/src/share/vm/gc_implementation/parallelScavenge/psMarkSweep.cpp b/src/share/vm/gc_implementation/parallelScavenge/psMarkSweep.cpp index 3d7e0ba4c2dfda4e6dddc88b0fd50723101353f3..f2965e67406b5d3ff8a2ff326f30972144b9af80 100644 --- a/src/share/vm/gc_implementation/parallelScavenge/psMarkSweep.cpp +++ b/src/share/vm/gc_implementation/parallelScavenge/psMarkSweep.cpp @@ -198,10 +198,9 @@ void PSMarkSweep::invoke_no_policy(bool clear_all_softrefs) { allocate_stacks(); - NOT_PRODUCT(ref_processor()->verify_no_references_recorded()); COMPILER2_PRESENT(DerivedPointerTable::clear()); - ref_processor()->enable_discovery(); + ref_processor()->enable_discovery(true /*verify_disabled*/, true /*verify_no_refs*/); ref_processor()->setup_policy(clear_all_softrefs); mark_sweep_phase1(clear_all_softrefs); diff --git a/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp b/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp index 55cb34350b2be4d963697a58a2a26a3a48d8c3a7..a62059c68e8ce81963d671070fede9fb7d81423c 100644 --- a/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp +++ b/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp @@ -2069,10 +2069,9 @@ void PSParallelCompact::invoke_no_policy(bool maximum_heap_compaction) { CodeCache::gc_prologue(); Threads::gc_prologue(); - NOT_PRODUCT(ref_processor()->verify_no_references_recorded()); COMPILER2_PRESENT(DerivedPointerTable::clear()); - ref_processor()->enable_discovery(); + ref_processor()->enable_discovery(true /*verify_disabled*/, true /*verify_no_refs*/); ref_processor()->setup_policy(maximum_heap_compaction); bool marked_for_unloading = false; diff --git a/src/share/vm/gc_implementation/parallelScavenge/psScavenge.cpp b/src/share/vm/gc_implementation/parallelScavenge/psScavenge.cpp index b2234676f1e435d55894a4a1df600da2f412c6f7..1094d1708e8141f1b3cd0088f0c756c3d8be0cec 100644 --- a/src/share/vm/gc_implementation/parallelScavenge/psScavenge.cpp +++ b/src/share/vm/gc_implementation/parallelScavenge/psScavenge.cpp @@ -350,10 +350,9 @@ bool PSScavenge::invoke_no_policy() { } save_to_space_top_before_gc(); - NOT_PRODUCT(reference_processor()->verify_no_references_recorded()); COMPILER2_PRESENT(DerivedPointerTable::clear()); - reference_processor()->enable_discovery(); + reference_processor()->enable_discovery(true /*verify_disabled*/, true /*verify_no_refs*/); reference_processor()->setup_policy(false); // We track how much was promoted to the next generation for diff --git a/src/share/vm/memory/genCollectedHeap.cpp b/src/share/vm/memory/genCollectedHeap.cpp index f233cbc7b59c31091d0db9aef3b0bd072803fa86..d6a54e17e56fc6c553acb2100aeaf53ca9989ca3 100644 --- a/src/share/vm/memory/genCollectedHeap.cpp +++ b/src/share/vm/memory/genCollectedHeap.cpp @@ -599,8 +599,7 @@ void GenCollectedHeap::do_collection(bool full, // atomic wrt other collectors in this configuration, we // are guaranteed to have empty discovered ref lists. 
if (rp->discovery_is_atomic()) { - rp->verify_no_references_recorded(); - rp->enable_discovery(); + rp->enable_discovery(true /*verify_disabled*/, true /*verify_no_refs*/); rp->setup_policy(do_clear_all_soft_refs); } else { // collect() below will enable discovery as appropriate diff --git a/src/share/vm/memory/referenceProcessor.cpp b/src/share/vm/memory/referenceProcessor.cpp index 9b593ef3f75db3dd60ba190417a71a982fcf7f96..27a1ff41b12c05ea134f629de45ac5d59ff3bbb3 100644 --- a/src/share/vm/memory/referenceProcessor.cpp +++ b/src/share/vm/memory/referenceProcessor.cpp @@ -35,42 +35,8 @@ ReferencePolicy* ReferenceProcessor::_always_clear_soft_ref_policy = NULL; ReferencePolicy* ReferenceProcessor::_default_soft_ref_policy = NULL; -const int subclasses_of_ref = REF_PHANTOM - REF_OTHER; bool ReferenceProcessor::_pending_list_uses_discovered_field = false; -// List of discovered references. -class DiscoveredList { -public: - DiscoveredList() : _len(0), _compressed_head(0), _oop_head(NULL) { } - oop head() const { - return UseCompressedOops ? oopDesc::decode_heap_oop(_compressed_head) : - _oop_head; - } - HeapWord* adr_head() { - return UseCompressedOops ? (HeapWord*)&_compressed_head : - (HeapWord*)&_oop_head; - } - void set_head(oop o) { - if (UseCompressedOops) { - // Must compress the head ptr. - _compressed_head = oopDesc::encode_heap_oop(o); - } else { - _oop_head = o; - } - } - bool empty() const { return head() == NULL; } - size_t length() { return _len; } - void set_length(size_t len) { _len = len; } - void inc_length(size_t inc) { _len += inc; assert(_len > 0, "Error"); } - void dec_length(size_t dec) { _len -= dec; } -private: - // Set value depending on UseCompressedOops. This could be a template class - // but then we have to fix all the instantiations and declarations that use this class. 
- oop _oop_head; - narrowOop _compressed_head; - size_t _len; -}; - void referenceProcessor_init() { ReferenceProcessor::init_statics(); } @@ -112,7 +78,8 @@ ReferenceProcessor::ReferenceProcessor(MemRegion span, _discovery_is_mt = mt_discovery; _num_q = MAX2(1, mt_processing_degree); _max_num_q = MAX2(_num_q, mt_discovery_degree); - _discoveredSoftRefs = NEW_C_HEAP_ARRAY(DiscoveredList, _max_num_q * subclasses_of_ref); + _discoveredSoftRefs = NEW_C_HEAP_ARRAY(DiscoveredList, + _max_num_q * number_of_subclasses_of_ref()); if (_discoveredSoftRefs == NULL) { vm_exit_during_initialization("Could not allocated RefProc Array"); } @@ -120,7 +87,7 @@ ReferenceProcessor::ReferenceProcessor(MemRegion span, _discoveredFinalRefs = &_discoveredWeakRefs[_max_num_q]; _discoveredPhantomRefs = &_discoveredFinalRefs[_max_num_q]; // Initialized all entries to NULL - for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) { + for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) { _discoveredSoftRefs[i].set_head(NULL); _discoveredSoftRefs[i].set_length(0); } @@ -134,19 +101,15 @@ ReferenceProcessor::ReferenceProcessor(MemRegion span, #ifndef PRODUCT void ReferenceProcessor::verify_no_references_recorded() { guarantee(!_discovering_refs, "Discovering refs?"); - for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) { - guarantee(_discoveredSoftRefs[i].empty(), + for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) { + guarantee(_discoveredSoftRefs[i].is_empty(), "Found non-empty discovered list"); } } #endif void ReferenceProcessor::weak_oops_do(OopClosure* f) { - // Should this instead be - // for (int i = 0; i < subclasses_of_ref; i++_ { - // for (int j = 0; j < _num_q; j++) { - // int index = i * _max_num_q + j; - for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) { + for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) { if (UseCompressedOops) { f->do_oop((narrowOop*)_discoveredSoftRefs[i].adr_head()); } else { @@ -404,7 +367,7 @@ public: // allocated and are indexed into. assert(_n_queues == (int) _ref_processor.max_num_q(), "Different number not expected"); for (int j = 0; - j < subclasses_of_ref; + j < ReferenceProcessor::number_of_subclasses_of_ref(); j++, index += _n_queues) { _ref_processor.enqueue_discovered_reflist( _refs_lists[index], _pending_list_addr); @@ -424,7 +387,7 @@ void ReferenceProcessor::enqueue_discovered_reflists(HeapWord* pending_list_addr task_executor->execute(tsk); } else { // Serial code: call the parent class's implementation - for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) { + for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) { enqueue_discovered_reflist(_discoveredSoftRefs[i], pending_list_addr); _discoveredSoftRefs[i].set_head(NULL); _discoveredSoftRefs[i].set_length(0); @@ -432,119 +395,7 @@ void ReferenceProcessor::enqueue_discovered_reflists(HeapWord* pending_list_addr } } -// Iterator for the list of discovered references. -class DiscoveredListIterator { -public: - inline DiscoveredListIterator(DiscoveredList& refs_list, - OopClosure* keep_alive, - BoolObjectClosure* is_alive); - - // End Of List. - inline bool has_next() const { return _ref != NULL; } - - // Get oop to the Reference object. - inline oop obj() const { return _ref; } - - // Get oop to the referent object. - inline oop referent() const { return _referent; } - - // Returns true if referent is alive. - inline bool is_referent_alive() const; - - // Loads data for the current reference. 
- // The "allow_null_referent" argument tells us to allow for the possibility - // of a NULL referent in the discovered Reference object. This typically - // happens in the case of concurrent collectors that may have done the - // discovery concurrently, or interleaved, with mutator execution. - inline void load_ptrs(DEBUG_ONLY(bool allow_null_referent)); - - // Move to the next discovered reference. - inline void next(); - - // Remove the current reference from the list - inline void remove(); - - // Make the Reference object active again. - inline void make_active() { java_lang_ref_Reference::set_next(_ref, NULL); } - - // Make the referent alive. - inline void make_referent_alive() { - if (UseCompressedOops) { - _keep_alive->do_oop((narrowOop*)_referent_addr); - } else { - _keep_alive->do_oop((oop*)_referent_addr); - } - } - - // Update the discovered field. - inline void update_discovered() { - // First _prev_next ref actually points into DiscoveredList (gross). - if (UseCompressedOops) { - if (!oopDesc::is_null(*(narrowOop*)_prev_next)) { - _keep_alive->do_oop((narrowOop*)_prev_next); - } - } else { - if (!oopDesc::is_null(*(oop*)_prev_next)) { - _keep_alive->do_oop((oop*)_prev_next); - } - } - } - - // NULL out referent pointer. - inline void clear_referent() { oop_store_raw(_referent_addr, NULL); } - - // Statistics - NOT_PRODUCT( - inline size_t processed() const { return _processed; } - inline size_t removed() const { return _removed; } - ) - - inline void move_to_next(); - -private: - DiscoveredList& _refs_list; - HeapWord* _prev_next; - oop _prev; - oop _ref; - HeapWord* _discovered_addr; - oop _next; - HeapWord* _referent_addr; - oop _referent; - OopClosure* _keep_alive; - BoolObjectClosure* _is_alive; - DEBUG_ONLY( - oop _first_seen; // cyclic linked list check - ) - NOT_PRODUCT( - size_t _processed; - size_t _removed; - ) -}; - -inline DiscoveredListIterator::DiscoveredListIterator(DiscoveredList& refs_list, - OopClosure* keep_alive, - BoolObjectClosure* is_alive) - : _refs_list(refs_list), - _prev_next(refs_list.adr_head()), - _prev(NULL), - _ref(refs_list.head()), -#ifdef ASSERT - _first_seen(refs_list.head()), -#endif -#ifndef PRODUCT - _processed(0), - _removed(0), -#endif - _next(NULL), - _keep_alive(keep_alive), - _is_alive(is_alive) -{ } - -inline bool DiscoveredListIterator::is_referent_alive() const { - return _is_alive->do_object_b(_referent); -} - -inline void DiscoveredListIterator::load_ptrs(DEBUG_ONLY(bool allow_null_referent)) { +void DiscoveredListIterator::load_ptrs(DEBUG_ONLY(bool allow_null_referent)) { _discovered_addr = java_lang_ref_Reference::discovered_addr(_ref); oop discovered = java_lang_ref_Reference::discovered(_ref); assert(_discovered_addr && discovered->is_oop_or_null(), @@ -560,13 +411,7 @@ inline void DiscoveredListIterator::load_ptrs(DEBUG_ONLY(bool allow_null_referen "bad referent"); } -inline void DiscoveredListIterator::next() { - _prev_next = _discovered_addr; - _prev = _ref; - move_to_next(); -} - -inline void DiscoveredListIterator::remove() { +void DiscoveredListIterator::remove() { assert(_ref->is_oop(), "Dropping a bad reference"); oop_store_raw(_discovered_addr, NULL); @@ -592,15 +437,29 @@ inline void DiscoveredListIterator::remove() { _refs_list.dec_length(1); } -inline void DiscoveredListIterator::move_to_next() { - if (_ref == _next) { - // End of the list. - _ref = NULL; +// Make the Reference object active again. 
+void DiscoveredListIterator::make_active() { + // For G1 we don't want to use set_next - it + // will dirty the card for the next field of + // the reference object and will fail + // CT verification. + if (UseG1GC) { + BarrierSet* bs = oopDesc::bs(); + HeapWord* next_addr = java_lang_ref_Reference::next_addr(_ref); + + if (UseCompressedOops) { + bs->write_ref_field_pre((narrowOop*)next_addr, NULL); + } else { + bs->write_ref_field_pre((oop*)next_addr, NULL); + } + java_lang_ref_Reference::set_next_raw(_ref, NULL); } else { - _ref = _next; + java_lang_ref_Reference::set_next(_ref, NULL); } - assert(_ref != _first_seen, "cyclic ref_list found"); - NOT_PRODUCT(_processed++); +} + +void DiscoveredListIterator::clear_referent() { + oop_store_raw(_referent_addr, NULL); } // NOTE: process_phase*() are largely similar, and at a high level @@ -786,10 +645,9 @@ ReferenceProcessor::abandon_partial_discovered_list(DiscoveredList& refs_list) { void ReferenceProcessor::abandon_partial_discovery() { // loop over the lists - for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) { + for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) { if (TraceReferenceGC && PrintGCDetails && ((i % _max_num_q) == 0)) { - gclog_or_tty->print_cr("\nAbandoning %s discovered list", - list_name(i)); + gclog_or_tty->print_cr("\nAbandoning %s discovered list", list_name(i)); } abandon_partial_discovered_list(_discoveredSoftRefs[i]); } @@ -858,6 +716,14 @@ private: bool _clear_referent; }; +void ReferenceProcessor::set_discovered(oop ref, oop value) { + if (_discovered_list_needs_barrier) { + java_lang_ref_Reference::set_discovered(ref, value); + } else { + java_lang_ref_Reference::set_discovered_raw(ref, value); + } +} + // Balances reference queues. // Move entries from all queues[0, 1, ..., _max_num_q-1] to // queues[0, 1, ..., _num_q-1] because only the first _num_q @@ -915,9 +781,9 @@ void ReferenceProcessor::balance_queues(DiscoveredList ref_lists[]) // Add the chain to the to list. if (ref_lists[to_idx].head() == NULL) { // to list is empty. Make a loop at the end. - java_lang_ref_Reference::set_discovered(move_tail, move_tail); + set_discovered(move_tail, move_tail); } else { - java_lang_ref_Reference::set_discovered(move_tail, ref_lists[to_idx].head()); + set_discovered(move_tail, ref_lists[to_idx].head()); } ref_lists[to_idx].set_head(move_head); ref_lists[to_idx].inc_length(refs_to_move); @@ -1038,11 +904,7 @@ ReferenceProcessor::process_discovered_reflist( void ReferenceProcessor::clean_up_discovered_references() { // loop over the lists - // Should this instead be - // for (int i = 0; i < subclasses_of_ref; i++_ { - // for (int j = 0; j < _num_q; j++) { - // int index = i * _max_num_q + j; - for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) { + for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) { if (TraceReferenceGC && PrintGCDetails && ((i % _max_num_q) == 0)) { gclog_or_tty->print_cr( "\nScrubbing %s discovered list of Null referents", @@ -1260,6 +1122,8 @@ bool ReferenceProcessor::discover_reference(oop obj, ReferenceType rt) { } } + ResourceMark rm; // Needed for tracing. 
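// --------------------------------------------------------------------------
// Illustrative aside - not part of this changeset and not HotSpot code: a
// minimal standalone sketch of the conditional-store pattern used by
// set_discovered() and make_active() above - collectors whose discovered
// lists need barriers route the store through a barriered setter, the rest
// use a raw store. The barrier's side effect is modelled here as nothing
// more than a recorded flag. All names below are invented.
// --------------------------------------------------------------------------
#include <cassert>

struct ToyReference {
  int discovered = 0;
  bool barrier_applied = false;

  void set_discovered_with_barrier(int v) {
    barrier_applied = true;   // stand-in for the real write-barrier work
    discovered = v;
  }
  void set_discovered_raw(int v) { discovered = v; }
};

static void set_discovered(ToyReference* ref, int value, bool needs_barrier) {
  if (needs_barrier) {
    ref->set_discovered_with_barrier(value);
  } else {
    ref->set_discovered_raw(value);
  }
}

int main() {
  ToyReference a, b;
  set_discovered(&a, 42, /* needs_barrier */ false);
  set_discovered(&b, 42, /* needs_barrier */ true);
  assert(!a.barrier_applied && b.barrier_applied);
  return 0;
}
// --------------------------------------------------------------------------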
+ HeapWord* const discovered_addr = java_lang_ref_Reference::discovered_addr(obj); const oop discovered = java_lang_ref_Reference::discovered(obj); assert(discovered->is_oop_or_null(), "bad discovered field"); @@ -1472,7 +1336,9 @@ ReferenceProcessor::preclean_discovered_reflist(DiscoveredList& refs_list, } const char* ReferenceProcessor::list_name(int i) { - assert(i >= 0 && i <= _max_num_q * subclasses_of_ref, "Out of bounds index"); + assert(i >= 0 && i <= _max_num_q * number_of_subclasses_of_ref(), + "Out of bounds index"); + int j = i / _max_num_q; switch (j) { case 0: return "SoftRef"; @@ -1493,7 +1359,7 @@ void ReferenceProcessor::verify_ok_to_handle_reflists() { #ifndef PRODUCT void ReferenceProcessor::clear_discovered_references() { guarantee(!_discovering_refs, "Discovering refs?"); - for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) { + for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) { clear_discovered_references(_discoveredSoftRefs[i]); } } diff --git a/src/share/vm/memory/referenceProcessor.hpp b/src/share/vm/memory/referenceProcessor.hpp index 8178f60ea7b3ad887dd6e86c6c488b927360d082..63ed31ddd2a134908f3e3f068e36b4b2246803fb 100644 --- a/src/share/vm/memory/referenceProcessor.hpp +++ b/src/share/vm/memory/referenceProcessor.hpp @@ -48,18 +48,175 @@ // forward references class ReferencePolicy; class AbstractRefProcTaskExecutor; -class DiscoveredList; + +// List of discovered references. +class DiscoveredList { +public: + DiscoveredList() : _len(0), _compressed_head(0), _oop_head(NULL) { } + oop head() const { + return UseCompressedOops ? oopDesc::decode_heap_oop(_compressed_head) : + _oop_head; + } + HeapWord* adr_head() { + return UseCompressedOops ? (HeapWord*)&_compressed_head : + (HeapWord*)&_oop_head; + } + void set_head(oop o) { + if (UseCompressedOops) { + // Must compress the head ptr. + _compressed_head = oopDesc::encode_heap_oop(o); + } else { + _oop_head = o; + } + } + bool is_empty() const { return head() == NULL; } + size_t length() { return _len; } + void set_length(size_t len) { _len = len; } + void inc_length(size_t inc) { _len += inc; assert(_len > 0, "Error"); } + void dec_length(size_t dec) { _len -= dec; } +private: + // Set value depending on UseCompressedOops. This could be a template class + // but then we have to fix all the instantiations and declarations that use this class. + oop _oop_head; + narrowOop _compressed_head; + size_t _len; +}; + +// Iterator for the list of discovered references. +class DiscoveredListIterator { +private: + DiscoveredList& _refs_list; + HeapWord* _prev_next; + oop _prev; + oop _ref; + HeapWord* _discovered_addr; + oop _next; + HeapWord* _referent_addr; + oop _referent; + OopClosure* _keep_alive; + BoolObjectClosure* _is_alive; + + DEBUG_ONLY( + oop _first_seen; // cyclic linked list check + ) + + NOT_PRODUCT( + size_t _processed; + size_t _removed; + ) + +public: + inline DiscoveredListIterator(DiscoveredList& refs_list, + OopClosure* keep_alive, + BoolObjectClosure* is_alive): + _refs_list(refs_list), + _prev_next(refs_list.adr_head()), + _prev(NULL), + _ref(refs_list.head()), +#ifdef ASSERT + _first_seen(refs_list.head()), +#endif +#ifndef PRODUCT + _processed(0), + _removed(0), +#endif + _next(NULL), + _keep_alive(keep_alive), + _is_alive(is_alive) +{ } + + // End Of List. + inline bool has_next() const { return _ref != NULL; } + + // Get oop to the Reference object. + inline oop obj() const { return _ref; } + + // Get oop to the referent object. 
+ inline oop referent() const { return _referent; } + + // Returns true if referent is alive. + inline bool is_referent_alive() const { + return _is_alive->do_object_b(_referent); + } + + // Loads data for the current reference. + // The "allow_null_referent" argument tells us to allow for the possibility + // of a NULL referent in the discovered Reference object. This typically + // happens in the case of concurrent collectors that may have done the + // discovery concurrently, or interleaved, with mutator execution. + void load_ptrs(DEBUG_ONLY(bool allow_null_referent)); + + // Move to the next discovered reference. + inline void next() { + _prev_next = _discovered_addr; + _prev = _ref; + move_to_next(); + } + + // Remove the current reference from the list + void remove(); + + // Make the Reference object active again. + void make_active(); + + // Make the referent alive. + inline void make_referent_alive() { + if (UseCompressedOops) { + _keep_alive->do_oop((narrowOop*)_referent_addr); + } else { + _keep_alive->do_oop((oop*)_referent_addr); + } + } + + // Update the discovered field. + inline void update_discovered() { + // First _prev_next ref actually points into DiscoveredList (gross). + if (UseCompressedOops) { + if (!oopDesc::is_null(*(narrowOop*)_prev_next)) { + _keep_alive->do_oop((narrowOop*)_prev_next); + } + } else { + if (!oopDesc::is_null(*(oop*)_prev_next)) { + _keep_alive->do_oop((oop*)_prev_next); + } + } + } + + // NULL out referent pointer. + void clear_referent(); + + // Statistics + NOT_PRODUCT( + inline size_t processed() const { return _processed; } + inline size_t removed() const { return _removed; } + ) + + inline void move_to_next() { + if (_ref == _next) { + // End of the list. + _ref = NULL; + } else { + _ref = _next; + } + assert(_ref != _first_seen, "cyclic ref_list found"); + NOT_PRODUCT(_processed++); + } + +}; class ReferenceProcessor : public CHeapObj { protected: // Compatibility with pre-4965777 JDK's static bool _pending_list_uses_discovered_field; - MemRegion _span; // (right-open) interval of heap - // subject to wkref discovery - bool _discovering_refs; // true when discovery enabled - bool _discovery_is_atomic; // if discovery is atomic wrt - // other collectors in configuration - bool _discovery_is_mt; // true if reference discovery is MT. + + MemRegion _span; // (right-open) interval of heap + // subject to wkref discovery + + bool _discovering_refs; // true when discovery enabled + bool _discovery_is_atomic; // if discovery is atomic wrt + // other collectors in configuration + bool _discovery_is_mt; // true if reference discovery is MT. + // If true, setting "next" field of a discovered refs list requires // write barrier(s). (Must be true if used in a collector in which // elements of a discovered list may be moved during discovery: for @@ -67,18 +224,19 @@ class ReferenceProcessor : public CHeapObj { // long-term concurrent marking phase that does weak reference // discovery.) bool _discovered_list_needs_barrier; - BarrierSet* _bs; // Cached copy of BarrierSet. - bool _enqueuing_is_done; // true if all weak references enqueued - bool _processing_is_mt; // true during phases when - // reference processing is MT. - int _next_id; // round-robin mod _num_q counter in - // support of work distribution - - // For collectors that do not keep GC marking information + + BarrierSet* _bs; // Cached copy of BarrierSet. 
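// --------------------------------------------------------------------------
// Illustrative aside - not part of this changeset and not HotSpot code: a
// standalone sketch of the iteration idiom suggested by the
// DiscoveredListIterator interface above - walk the discovered list, splice
// out references whose referents turn out to be alive (remove()), and keep
// the rest for later enqueueing (next()). All names below are invented.
// --------------------------------------------------------------------------
#include <cstdio>

struct ToyRef { int referent; ToyRef* next; };

struct ToyListIterator {
  ToyRef** prev_next;  // address of the link that points at the current ref
  ToyRef*  ref;        // current reference object

  explicit ToyListIterator(ToyRef** head) : prev_next(head), ref(*head) {}
  bool has_next() const { return ref != nullptr; }
  // Unlink the current ref and advance; the predecessor link is preserved.
  void remove() { *prev_next = ref->next; ref = *prev_next; }
  // Keep the current ref on the list and advance past it.
  void next() { prev_next = &ref->next; ref = ref->next; }
};

static bool referent_is_alive(const ToyRef* r) { return r->referent % 2 == 0; }

int main() {
  ToyRef c = {4, nullptr};
  ToyRef b = {3, &c};
  ToyRef a = {2, &b};
  ToyRef* head = &a;

  for (ToyListIterator it(&head); it.has_next(); ) {
    if (referent_is_alive(it.ref)) {
      it.remove();   // live referent: drop the ref from the discovered list
    } else {
      it.next();     // dead referent: keep it for the pending list
    }
  }

  for (ToyRef* r = head; r != nullptr; r = r->next) {
    std::printf("%d ", r->referent);  // prints "3", the only dead referent
  }
  std::printf("\n");
  return 0;
}
// --------------------------------------------------------------------------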
+ bool _enqueuing_is_done; // true if all weak references enqueued + bool _processing_is_mt; // true during phases when + // reference processing is MT. + int _next_id; // round-robin mod _num_q counter in + // support of work distribution + + // For collectors that do not keep GC liveness information // in the object header, this field holds a closure that // helps the reference processor determine the reachability - // of an oop (the field is currently initialized to NULL for - // all collectors but the CMS collector). + // of an oop. It is currently initialized to NULL for all + // collectors except for CMS and G1. BoolObjectClosure* _is_alive_non_header; // Soft ref clearing policies @@ -102,10 +260,13 @@ class ReferenceProcessor : public CHeapObj { DiscoveredList* _discoveredPhantomRefs; public: - int num_q() { return _num_q; } - int max_num_q() { return _max_num_q; } - void set_active_mt_degree(int v) { _num_q = v; } - DiscoveredList* discovered_soft_refs() { return _discoveredSoftRefs; } + static int number_of_subclasses_of_ref() { return (REF_PHANTOM - REF_OTHER); } + + int num_q() { return _num_q; } + int max_num_q() { return _max_num_q; } + void set_active_mt_degree(int v) { _num_q = v; } + DiscoveredList* discovered_soft_refs() { return _discoveredSoftRefs; } + ReferencePolicy* setup_policy(bool always_clear) { _current_soft_ref_policy = always_clear ? _always_clear_soft_ref_policy : _default_soft_ref_policy; @@ -205,6 +366,11 @@ class ReferenceProcessor : public CHeapObj { void enqueue_discovered_reflists(HeapWord* pending_list_addr, AbstractRefProcTaskExecutor* task_executor); protected: + // Set the 'discovered' field of the given reference to + // the given value - emitting barriers depending upon + // the value of _discovered_list_needs_barrier. + void set_discovered(oop ref, oop value); + // "Preclean" the given discovered reference list // by removing references with strongly reachable referents. // Currently used in support of CMS only. 
@@ -290,7 +456,19 @@ class ReferenceProcessor : public CHeapObj { void set_span(MemRegion span) { _span = span; } // start and stop weak ref discovery - void enable_discovery() { _discovering_refs = true; } + void enable_discovery(bool verify_disabled, bool check_no_refs) { +#ifdef ASSERT + // Verify that we're not currently discovering refs + assert(!verify_disabled || !_discovering_refs, "nested call?"); + + if (check_no_refs) { + // Verify that the discovered lists are empty + verify_no_references_recorded(); + } +#endif // ASSERT + _discovering_refs = true; + } + void disable_discovery() { _discovering_refs = false; } bool discovery_enabled() { return _discovering_refs; } @@ -365,7 +543,7 @@ class NoRefDiscovery: StackObj { ~NoRefDiscovery() { if (_was_discovering_refs) { - _rp->enable_discovery(); + _rp->enable_discovery(true /*verify_disabled*/, false /*check_no_refs*/); } } }; diff --git a/src/share/vm/runtime/thread.cpp b/src/share/vm/runtime/thread.cpp index 5b5138e835031693667d9a74f8596a6949c1dd07..8b7059c8389faff36e887342703e15fe4dd3632a 100644 --- a/src/share/vm/runtime/thread.cpp +++ b/src/share/vm/runtime/thread.cpp @@ -749,8 +749,9 @@ bool Thread::claim_oops_do_par_case(int strong_roots_parity) { jint thread_parity = _oops_do_parity; if (thread_parity != strong_roots_parity) { jint res = Atomic::cmpxchg(strong_roots_parity, &_oops_do_parity, thread_parity); - if (res == thread_parity) return true; - else { + if (res == thread_parity) { + return true; + } else { guarantee(res == strong_roots_parity, "Or else what?"); assert(SharedHeap::heap()->n_par_threads() > 0, "Should only fail when parallel."); @@ -3905,8 +3906,9 @@ void Threads::possibly_parallel_oops_do(OopClosure* f, CodeBlobClosure* cf) { } } VMThread* vmt = VMThread::vm_thread(); - if (vmt->claim_oops_do(is_par, cp)) + if (vmt->claim_oops_do(is_par, cp)) { vmt->oops_do(f, cf); + } } #ifndef SERIALGC
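// --------------------------------------------------------------------------
// Illustrative aside - not part of this changeset and not HotSpot code: a
// standalone sketch of the RAII pattern used by NoRefDiscovery above -
// discovery is switched off for a scope and, on exit, re-enabled with only
// the verify_disabled check (the lists may legitimately be non-empty, so
// check_no_refs stays false). All names below are invented.
// --------------------------------------------------------------------------
#include <cassert>

struct ToyRefProcessor {
  bool discovering = false;

  void enable_discovery(bool verify_disabled, bool check_no_refs) {
    if (verify_disabled) assert(!discovering && "nested enable?");
    (void)check_no_refs;  // list-emptiness check elided in this sketch
    discovering = true;
  }
  void disable_discovery() { discovering = false; }
};

// Scoped "no discovery" region, mirroring the shape of NoRefDiscovery.
class ToyNoRefDiscovery {
  ToyRefProcessor* _rp;
  bool _was_discovering;
public:
  explicit ToyNoRefDiscovery(ToyRefProcessor* rp)
      : _rp(rp), _was_discovering(rp->discovering) {
    _rp->disable_discovery();
  }
  ~ToyNoRefDiscovery() {
    if (_was_discovering) {
      _rp->enable_discovery(true /* verify_disabled */, false /* check_no_refs */);
    }
  }
};

int main() {
  ToyRefProcessor rp;
  rp.enable_discovery(true, true);
  {
    ToyNoRefDiscovery no_discovery(&rp);
    assert(!rp.discovering);       // discovery off inside the scope
  }
  assert(rp.discovering);          // restored on scope exit
  return 0;
}
// --------------------------------------------------------------------------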