Commit f31cc255, authored by tonyp

Merge

...@@ -133,14 +133,12 @@ void ConcurrentG1RefineThread::queueBasedRefinement() { ...@@ -133,14 +133,12 @@ void ConcurrentG1RefineThread::queueBasedRefinement() {
_co_tracker.update(false); _co_tracker.update(false);
if (G1SmoothConcRefine) { if (G1SmoothConcRefine) {
start_vtime_sec = os::elapsedVTime();
prev_buffer_num = curr_buffer_num; prev_buffer_num = curr_buffer_num;
_sts.leave(); _sts.leave();
os::sleep(Thread::current(), (jlong) _interval_ms, false); os::sleep(Thread::current(), (jlong) _interval_ms, false);
_sts.join(); _sts.join();
start_vtime_sec = os::elapsedVTime();
} }
n_logs++; n_logs++;
} }
// Make sure we harvest the PYA, if any. // Make sure we harvest the PYA, if any.
......
...@@ -420,6 +420,10 @@ ConcurrentMark::ConcurrentMark(ReservedSpace rs, ...@@ -420,6 +420,10 @@ ConcurrentMark::ConcurrentMark(ReservedSpace rs,
_has_overflown(false), _has_overflown(false),
_concurrent(false), _concurrent(false),
_has_aborted(false),
_restart_for_overflow(false),
_concurrent_marking_in_progress(false),
_should_gray_objects(false),
// _verbose_level set below // _verbose_level set below
......
...@@ -107,7 +107,7 @@ void ConcurrentMarkThread::run() { ...@@ -107,7 +107,7 @@ void ConcurrentMarkThread::run() {
if (PrintGC) { if (PrintGC) {
gclog_or_tty->date_stamp(PrintGCDateStamps); gclog_or_tty->date_stamp(PrintGCDateStamps);
gclog_or_tty->stamp(PrintGCTimeStamps); gclog_or_tty->stamp(PrintGCTimeStamps);
tty->print_cr("[GC concurrent-mark-start]"); gclog_or_tty->print_cr("[GC concurrent-mark-start]");
} }
if (!g1_policy->in_young_gc_mode()) { if (!g1_policy->in_young_gc_mode()) {
...@@ -320,8 +320,6 @@ void ConcurrentMarkThread::sleepBeforeNextCycle() { ...@@ -320,8 +320,6 @@ void ConcurrentMarkThread::sleepBeforeNextCycle() {
set_in_progress(); set_in_progress();
clear_started(); clear_started();
if (TraceConcurrentMark) gclog_or_tty->print_cr("CM-starting"); if (TraceConcurrentMark) gclog_or_tty->print_cr("CM-starting");
return;
} }
// Note: this method, although exported by the ConcurrentMarkSweepThread, // Note: this method, although exported by the ConcurrentMarkSweepThread,
......
...@@ -78,8 +78,8 @@ size_t DirtyCardQueueSet::num_par_ids() { ...@@ -78,8 +78,8 @@ size_t DirtyCardQueueSet::num_par_ids() {
void DirtyCardQueueSet::initialize(Monitor* cbl_mon, Mutex* fl_lock, void DirtyCardQueueSet::initialize(Monitor* cbl_mon, Mutex* fl_lock,
int max_completed_queue, int max_completed_queue,
Mutex* lock) { Mutex* lock, PtrQueueSet* fl_owner) {
PtrQueueSet::initialize(cbl_mon, fl_lock, max_completed_queue); PtrQueueSet::initialize(cbl_mon, fl_lock, max_completed_queue, fl_owner);
set_buffer_size(DCQBarrierQueueBufferSize); set_buffer_size(DCQBarrierQueueBufferSize);
set_process_completed_threshold(DCQBarrierProcessCompletedThreshold); set_process_completed_threshold(DCQBarrierProcessCompletedThreshold);
......
...@@ -88,7 +88,7 @@ public: ...@@ -88,7 +88,7 @@ public:
void initialize(Monitor* cbl_mon, Mutex* fl_lock, void initialize(Monitor* cbl_mon, Mutex* fl_lock,
int max_completed_queue = 0, int max_completed_queue = 0,
Mutex* lock = NULL); Mutex* lock = NULL, PtrQueueSet* fl_owner = NULL);
// The number of parallel ids that can be claimed to allow collector or // The number of parallel ids that can be claimed to allow collector or
// mutator threads to do card-processing work. // mutator threads to do card-processing work.
......
...@@ -136,6 +136,14 @@ public: ...@@ -136,6 +136,14 @@ public:
int calls() { return _calls; } int calls() { return _calls; }
}; };
class RedirtyLoggedCardTableEntryFastClosure : public CardTableEntryClosure {
public:
bool do_card_ptr(jbyte* card_ptr, int worker_i) {
*card_ptr = CardTableModRefBS::dirty_card_val();
return true;
}
};
YoungList::YoungList(G1CollectedHeap* g1h) YoungList::YoungList(G1CollectedHeap* g1h)
: _g1h(g1h), _head(NULL), : _g1h(g1h), _head(NULL),
_scan_only_head(NULL), _scan_only_tail(NULL), _curr_scan_only(NULL), _scan_only_head(NULL), _scan_only_tail(NULL), _curr_scan_only(NULL),
...@@ -961,7 +969,8 @@ void G1CollectedHeap::do_collection(bool full, bool clear_all_soft_refs, ...@@ -961,7 +969,8 @@ void G1CollectedHeap::do_collection(bool full, bool clear_all_soft_refs,
// dirtied, so this should abandon those logs, and set "do_traversal" // dirtied, so this should abandon those logs, and set "do_traversal"
// to true. // to true.
concurrent_g1_refine()->set_pya_restart(); concurrent_g1_refine()->set_pya_restart();
assert(!G1DeferredRSUpdate
|| (G1DeferredRSUpdate && (dirty_card_queue_set().completed_buffers_num() == 0)), "Should not be any");
assert(regions_accounted_for(), "Region leakage!"); assert(regions_accounted_for(), "Region leakage!");
} }
...@@ -1466,6 +1475,13 @@ jint G1CollectedHeap::initialize() { ...@@ -1466,6 +1475,13 @@ jint G1CollectedHeap::initialize() {
G1DirtyCardQueueMax, G1DirtyCardQueueMax,
Shared_DirtyCardQ_lock); Shared_DirtyCardQ_lock);
} }
if (G1DeferredRSUpdate) {
dirty_card_queue_set().initialize(DirtyCardQ_CBL_mon,
DirtyCardQ_FL_lock,
0,
Shared_DirtyCardQ_lock,
&JavaThread::dirty_card_queue_set());
}
// In case we're keeping closure specialization stats, initialize those // In case we're keeping closure specialization stats, initialize those
// counts and that mechanism. // counts and that mechanism.
SpecializationStats::clear(); SpecializationStats::clear();
...@@ -2316,7 +2332,6 @@ class VerifyMarkedObjsClosure: public ObjectClosure { ...@@ -2316,7 +2332,6 @@ class VerifyMarkedObjsClosure: public ObjectClosure {
void void
G1CollectedHeap::checkConcurrentMark() { G1CollectedHeap::checkConcurrentMark() {
VerifyMarkedObjsClosure verifycl(this); VerifyMarkedObjsClosure verifycl(this);
doConcurrentMark();
// MutexLockerEx x(getMarkBitMapLock(), // MutexLockerEx x(getMarkBitMapLock(),
// Mutex::_no_safepoint_check_flag); // Mutex::_no_safepoint_check_flag);
object_iterate(&verifycl); object_iterate(&verifycl);
...@@ -2493,7 +2508,7 @@ G1CollectedHeap::do_collection_pause_at_safepoint(HeapRegion* popular_region) { ...@@ -2493,7 +2508,7 @@ G1CollectedHeap::do_collection_pause_at_safepoint(HeapRegion* popular_region) {
guarantee(_in_cset_fast_test == NULL, "invariant"); guarantee(_in_cset_fast_test == NULL, "invariant");
guarantee(_in_cset_fast_test_base == NULL, "invariant"); guarantee(_in_cset_fast_test_base == NULL, "invariant");
_in_cset_fast_test_length = n_regions(); _in_cset_fast_test_length = max_regions();
_in_cset_fast_test_base = _in_cset_fast_test_base =
NEW_C_HEAP_ARRAY(bool, _in_cset_fast_test_length); NEW_C_HEAP_ARRAY(bool, _in_cset_fast_test_length);
memset(_in_cset_fast_test_base, false, memset(_in_cset_fast_test_base, false,
...@@ -2918,27 +2933,51 @@ public: ...@@ -2918,27 +2933,51 @@ public:
} }
}; };
-class RecreateRSetEntriesClosure: public OopClosure {
+class UpdateRSetImmediate : public OopsInHeapRegionClosure {
 private:
   G1CollectedHeap* _g1;
   G1RemSet* _g1_rem_set;
-  HeapRegion* _from;
 public:
-  RecreateRSetEntriesClosure(G1CollectedHeap* g1, HeapRegion* from) :
-    _g1(g1), _g1_rem_set(g1->g1_rem_set()), _from(from)
-  {}
+  UpdateRSetImmediate(G1CollectedHeap* g1) :
+    _g1(g1), _g1_rem_set(g1->g1_rem_set()) {}
+  void do_oop(narrowOop* p) {
+    guarantee(false, "NYI");
+  }
+  void do_oop(oop* p) {
+    assert(_from->is_in_reserved(p), "paranoia");
+    if (*p != NULL && !_from->is_survivor()) {
+      _g1_rem_set->par_write_ref(_from, p, 0);
+    }
+  }
+};
+
+class UpdateRSetDeferred : public OopsInHeapRegionClosure {
+private:
+  G1CollectedHeap* _g1;
+  DirtyCardQueue *_dcq;
+  CardTableModRefBS* _ct_bs;
+public:
+  UpdateRSetDeferred(G1CollectedHeap* g1, DirtyCardQueue* dcq) :
+    _g1(g1), _ct_bs((CardTableModRefBS*)_g1->barrier_set()), _dcq(dcq) {}
   void do_oop(narrowOop* p) {
     guarantee(false, "NYI");
   }
   void do_oop(oop* p) {
     assert(_from->is_in_reserved(p), "paranoia");
-    if (*p != NULL) {
-      _g1_rem_set->write_ref(_from, p);
+    if (!_from->is_in_reserved(*p) && !_from->is_survivor()) {
+      size_t card_index = _ct_bs->index_for(p);
+      if (_ct_bs->mark_card_deferred(card_index)) {
+        _dcq->enqueue((jbyte*)_ct_bs->byte_for_index(card_index));
+      }
     }
   }
 };
class RemoveSelfPointerClosure: public ObjectClosure { class RemoveSelfPointerClosure: public ObjectClosure {
private: private:
G1CollectedHeap* _g1; G1CollectedHeap* _g1;
...@@ -2946,11 +2985,11 @@ private: ...@@ -2946,11 +2985,11 @@ private:
HeapRegion* _hr; HeapRegion* _hr;
size_t _prev_marked_bytes; size_t _prev_marked_bytes;
size_t _next_marked_bytes; size_t _next_marked_bytes;
OopsInHeapRegionClosure *_cl;
public: public:
-  RemoveSelfPointerClosure(G1CollectedHeap* g1, HeapRegion* hr) :
-    _g1(g1), _cm(_g1->concurrent_mark()), _hr(hr),
-    _prev_marked_bytes(0), _next_marked_bytes(0)
-  {}
+  RemoveSelfPointerClosure(G1CollectedHeap* g1, OopsInHeapRegionClosure* cl) :
+    _g1(g1), _cm(_g1->concurrent_mark()), _prev_marked_bytes(0),
+    _next_marked_bytes(0), _cl(cl) {}
size_t prev_marked_bytes() { return _prev_marked_bytes; } size_t prev_marked_bytes() { return _prev_marked_bytes; }
size_t next_marked_bytes() { return _next_marked_bytes; } size_t next_marked_bytes() { return _next_marked_bytes; }
...@@ -2988,8 +3027,7 @@ public: ...@@ -2988,8 +3027,7 @@ public:
// that, if evacuation fails, we might have remembered set // that, if evacuation fails, we might have remembered set
// entries missing given that we skipped cards on the // entries missing given that we skipped cards on the
// collection set. So, we'll recreate such entries now. // collection set. So, we'll recreate such entries now.
RecreateRSetEntriesClosure cl(_g1, _hr); obj->oop_iterate(_cl);
obj->oop_iterate(&cl);
assert(_cm->isPrevMarked(obj), "Should be marked!"); assert(_cm->isPrevMarked(obj), "Should be marked!");
} else { } else {
// The object has been either evacuated or is dead. Fill it with a // The object has been either evacuated or is dead. Fill it with a
...@@ -3002,14 +3040,23 @@ public: ...@@ -3002,14 +3040,23 @@ public:
}; };
void G1CollectedHeap::remove_self_forwarding_pointers() { void G1CollectedHeap::remove_self_forwarding_pointers() {
UpdateRSetImmediate immediate_update(_g1h);
DirtyCardQueue dcq(&_g1h->dirty_card_queue_set());
UpdateRSetDeferred deferred_update(_g1h, &dcq);
OopsInHeapRegionClosure *cl;
if (G1DeferredRSUpdate) {
cl = &deferred_update;
} else {
cl = &immediate_update;
}
HeapRegion* cur = g1_policy()->collection_set(); HeapRegion* cur = g1_policy()->collection_set();
while (cur != NULL) { while (cur != NULL) {
assert(g1_policy()->assertMarkedBytesDataOK(), "Should be!"); assert(g1_policy()->assertMarkedBytesDataOK(), "Should be!");
RemoveSelfPointerClosure rspc(_g1h, cl);
if (cur->evacuation_failed()) { if (cur->evacuation_failed()) {
RemoveSelfPointerClosure rspc(_g1h, cur);
assert(cur->in_collection_set(), "bad CS"); assert(cur->in_collection_set(), "bad CS");
cl->set_region(cur);
cur->object_iterate(&rspc); cur->object_iterate(&rspc);
// A number of manipulations to make the TAMS be the current top, // A number of manipulations to make the TAMS be the current top,
...@@ -3518,6 +3565,9 @@ class G1ParScanThreadState : public StackObj { ...@@ -3518,6 +3565,9 @@ class G1ParScanThreadState : public StackObj {
protected: protected:
G1CollectedHeap* _g1h; G1CollectedHeap* _g1h;
RefToScanQueue* _refs; RefToScanQueue* _refs;
DirtyCardQueue _dcq;
CardTableModRefBS* _ct_bs;
G1RemSet* _g1_rem;
typedef GrowableArray<oop*> OverflowQueue; typedef GrowableArray<oop*> OverflowQueue;
OverflowQueue* _overflowed_refs; OverflowQueue* _overflowed_refs;
...@@ -3559,10 +3609,32 @@ protected: ...@@ -3559,10 +3609,32 @@ protected:
void add_to_undo_waste(size_t waste) { _undo_waste += waste; } void add_to_undo_waste(size_t waste) { _undo_waste += waste; }
DirtyCardQueue& dirty_card_queue() { return _dcq; }
CardTableModRefBS* ctbs() { return _ct_bs; }
void immediate_rs_update(HeapRegion* from, oop* p, int tid) {
_g1_rem->par_write_ref(from, p, tid);
}
void deferred_rs_update(HeapRegion* from, oop* p, int tid) {
// If the new value of the field points to the same region or
// is the to-space, we don't need to include it in the Rset updates.
if (!from->is_in_reserved(*p) && !from->is_survivor()) {
size_t card_index = ctbs()->index_for(p);
// If the card hasn't been added to the buffer, do it.
if (ctbs()->mark_card_deferred(card_index)) {
dirty_card_queue().enqueue((jbyte*)ctbs()->byte_for_index(card_index));
}
}
}
public: public:
G1ParScanThreadState(G1CollectedHeap* g1h, int queue_num) G1ParScanThreadState(G1CollectedHeap* g1h, int queue_num)
: _g1h(g1h), : _g1h(g1h),
_refs(g1h->task_queue(queue_num)), _refs(g1h->task_queue(queue_num)),
_dcq(&g1h->dirty_card_queue_set()),
_ct_bs((CardTableModRefBS*)_g1h->barrier_set()),
_g1_rem(g1h->g1_rem_set()),
_hash_seed(17), _queue_num(queue_num), _hash_seed(17), _queue_num(queue_num),
_term_attempts(0), _term_attempts(0),
_age_table(false), _age_table(false),
...@@ -3640,6 +3712,14 @@ public: ...@@ -3640,6 +3712,14 @@ public:
int refs_to_scan() { return refs()->size(); } int refs_to_scan() { return refs()->size(); }
int overflowed_refs_to_scan() { return overflowed_refs()->length(); } int overflowed_refs_to_scan() { return overflowed_refs()->length(); }
void update_rs(HeapRegion* from, oop* p, int tid) {
if (G1DeferredRSUpdate) {
deferred_rs_update(from, p, tid);
} else {
immediate_rs_update(from, p, tid);
}
}
HeapWord* allocate_slow(GCAllocPurpose purpose, size_t word_sz) { HeapWord* allocate_slow(GCAllocPurpose purpose, size_t word_sz) {
HeapWord* obj = NULL; HeapWord* obj = NULL;
...@@ -3808,7 +3888,6 @@ public: ...@@ -3808,7 +3888,6 @@ public:
} }
}; };
G1ParClosureSuper::G1ParClosureSuper(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state) : G1ParClosureSuper::G1ParClosureSuper(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state) :
_g1(g1), _g1_rem(_g1->g1_rem_set()), _cm(_g1->concurrent_mark()), _g1(g1), _g1_rem(_g1->g1_rem_set()), _cm(_g1->concurrent_mark()),
_par_scan_state(par_scan_state) { } _par_scan_state(par_scan_state) { }
...@@ -3834,7 +3913,7 @@ void G1ParScanClosure::do_oop_nv(oop* p) { ...@@ -3834,7 +3913,7 @@ void G1ParScanClosure::do_oop_nv(oop* p) {
assert(obj == *p, "the value of *p should not have changed"); assert(obj == *p, "the value of *p should not have changed");
_par_scan_state->push_on_queue(p); _par_scan_state->push_on_queue(p);
} else { } else {
_g1_rem->par_write_ref(_from, p, _par_scan_state->queue_num()); _par_scan_state->update_rs(_from, p, _par_scan_state->queue_num());
} }
} }
} }
...@@ -3972,13 +4051,13 @@ void G1ParCopyClosure<do_gen_barrier, barrier, ...@@ -3972,13 +4051,13 @@ void G1ParCopyClosure<do_gen_barrier, barrier,
} }
// When scanning the RS, we only care about objs in CS. // When scanning the RS, we only care about objs in CS.
if (barrier == G1BarrierRS) { if (barrier == G1BarrierRS) {
_g1_rem->par_write_ref(_from, p, _par_scan_state->queue_num()); _par_scan_state->update_rs(_from, p, _par_scan_state->queue_num());
} }
} }
// When scanning moved objs, must look at all oops. // When scanning moved objs, must look at all oops.
if (barrier == G1BarrierEvac && obj != NULL) { if (barrier == G1BarrierEvac && obj != NULL) {
_g1_rem->par_write_ref(_from, p, _par_scan_state->queue_num()); _par_scan_state->update_rs(_from, p, _par_scan_state->queue_num());
} }
if (do_gen_barrier && obj != NULL) { if (do_gen_barrier && obj != NULL) {
...@@ -4127,6 +4206,7 @@ public: ...@@ -4127,6 +4206,7 @@ public:
G1ParScanExtRootClosure only_scan_root_cl(_g1h, &pss); G1ParScanExtRootClosure only_scan_root_cl(_g1h, &pss);
G1ParScanPermClosure only_scan_perm_cl(_g1h, &pss); G1ParScanPermClosure only_scan_perm_cl(_g1h, &pss);
G1ParScanHeapRSClosure only_scan_heap_rs_cl(_g1h, &pss); G1ParScanHeapRSClosure only_scan_heap_rs_cl(_g1h, &pss);
G1ParScanAndMarkExtRootClosure scan_mark_root_cl(_g1h, &pss); G1ParScanAndMarkExtRootClosure scan_mark_root_cl(_g1h, &pss);
G1ParScanAndMarkPermClosure scan_mark_perm_cl(_g1h, &pss); G1ParScanAndMarkPermClosure scan_mark_perm_cl(_g1h, &pss);
G1ParScanAndMarkHeapRSClosure scan_mark_heap_rs_cl(_g1h, &pss); G1ParScanAndMarkHeapRSClosure scan_mark_heap_rs_cl(_g1h, &pss);
...@@ -4382,7 +4462,6 @@ void G1CollectedHeap::evacuate_collection_set() { ...@@ -4382,7 +4462,6 @@ void G1CollectedHeap::evacuate_collection_set() {
g1_rem_set()->prepare_for_oops_into_collection_set_do(); g1_rem_set()->prepare_for_oops_into_collection_set_do();
concurrent_g1_refine()->set_use_cache(false); concurrent_g1_refine()->set_use_cache(false);
int n_workers = (ParallelGCThreads > 0 ? workers()->total_workers() : 1); int n_workers = (ParallelGCThreads > 0 ? workers()->total_workers() : 1);
set_par_threads(n_workers); set_par_threads(n_workers);
G1ParTask g1_par_task(this, n_workers, _task_queues); G1ParTask g1_par_task(this, n_workers, _task_queues);
...@@ -4390,8 +4469,9 @@ void G1CollectedHeap::evacuate_collection_set() { ...@@ -4390,8 +4469,9 @@ void G1CollectedHeap::evacuate_collection_set() {
change_strong_roots_parity(); // In preparation for parallel strong roots. change_strong_roots_parity(); // In preparation for parallel strong roots.
rem_set()->prepare_for_younger_refs_iterate(true); rem_set()->prepare_for_younger_refs_iterate(true);
double start_par = os::elapsedTime();
assert(dirty_card_queue_set().completed_buffers_num() == 0, "Should be empty");
double start_par = os::elapsedTime();
if (ParallelGCThreads > 0) { if (ParallelGCThreads > 0) {
// The individual threads will set their evac-failure closures. // The individual threads will set their evac-failure closures.
workers()->run_task(&g1_par_task); workers()->run_task(&g1_par_task);
...@@ -4411,8 +4491,8 @@ void G1CollectedHeap::evacuate_collection_set() { ...@@ -4411,8 +4491,8 @@ void G1CollectedHeap::evacuate_collection_set() {
G1KeepAliveClosure keep_alive(this); G1KeepAliveClosure keep_alive(this);
JNIHandles::weak_oops_do(&is_alive, &keep_alive); JNIHandles::weak_oops_do(&is_alive, &keep_alive);
} }
g1_rem_set()->cleanup_after_oops_into_collection_set_do(); g1_rem_set()->cleanup_after_oops_into_collection_set_do();
concurrent_g1_refine()->set_use_cache(true); concurrent_g1_refine()->set_use_cache(true);
finalize_for_evac_failure(); finalize_for_evac_failure();
...@@ -4423,7 +4503,6 @@ void G1CollectedHeap::evacuate_collection_set() { ...@@ -4423,7 +4503,6 @@ void G1CollectedHeap::evacuate_collection_set() {
if (evacuation_failed()) { if (evacuation_failed()) {
remove_self_forwarding_pointers(); remove_self_forwarding_pointers();
if (PrintGCDetails) { if (PrintGCDetails) {
gclog_or_tty->print(" (evacuation failed)"); gclog_or_tty->print(" (evacuation failed)");
} else if (PrintGC) { } else if (PrintGC) {
...@@ -4431,6 +4510,14 @@ void G1CollectedHeap::evacuate_collection_set() { ...@@ -4431,6 +4510,14 @@ void G1CollectedHeap::evacuate_collection_set() {
} }
} }
if (G1DeferredRSUpdate) {
RedirtyLoggedCardTableEntryFastClosure redirty;
dirty_card_queue_set().set_closure(&redirty);
dirty_card_queue_set().apply_closure_to_all_completed_buffers();
JavaThread::dirty_card_queue_set().merge_bufferlists(&dirty_card_queue_set());
assert(dirty_card_queue_set().completed_buffers_num() == 0, "All should be consumed");
}
COMPILER2_PRESENT(DerivedPointerTable::update_pointers()); COMPILER2_PRESENT(DerivedPointerTable::update_pointers());
} }
......
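A quick illustration of the deferred remembered-set path the hunks above add: under G1DeferredRSUpdate each GC worker, instead of updating a remembered set immediately, marks the card covering the updated field as deferred and enqueues it on a GC-private DirtyCardQueueSet (which shares its buffer free list with the mutator set); after the pause the logged cards are re-dirtied and the buffer list is merged into JavaThread::dirty_card_queue_set() for the concurrent refinement threads. The sketch below is a minimal stand-alone model of that flow, not HotSpot code: ToyCardTable and ToyDirtyCardQueueSet are invented stand-ins for CardTableModRefBS and DirtyCardQueueSet.

// --- illustrative sketch, not part of this patch ---------------------------
#include <cassert>
#include <cstdint>
#include <vector>

enum CardVal : int8_t { clean = -1, dirty = 0, claimed = 2, deferred = 4 };

struct ToyCardTable {
  std::vector<int8_t> bytes;
  explicit ToyCardTable(size_t n) : bytes(n, clean) {}
  // Returns true only the first time a card is marked deferred, so the card
  // is enqueued exactly once (mirrors mark_card_deferred() above).
  bool mark_deferred(size_t i) {
    if (bytes[i] & deferred) return false;
    bytes[i] |= deferred;          // a claimed bit, if present, is preserved
    return true;
  }
};

struct ToyDirtyCardQueueSet { std::vector<size_t> completed; };

// Worker-local step: defer the RSet update by logging the card.
void deferred_rs_update(ToyCardTable& ct, ToyDirtyCardQueueSet& gc_dcqs,
                        size_t card_index) {
  if (ct.mark_deferred(card_index))
    gc_dcqs.completed.push_back(card_index);
}

// Post-pause step: re-dirty the logged cards and hand the buffers to the
// mutator queue set, as evacuate_collection_set() does above.
void redirty_and_merge(ToyCardTable& ct, ToyDirtyCardQueueSet& gc_dcqs,
                       ToyDirtyCardQueueSet& mutator_dcqs) {
  for (size_t i : gc_dcqs.completed) ct.bytes[i] = dirty;
  mutator_dcqs.completed.insert(mutator_dcqs.completed.end(),
                                gc_dcqs.completed.begin(),
                                gc_dcqs.completed.end());
  gc_dcqs.completed.clear();       // "All should be consumed"
}

int main() {
  ToyCardTable ct(64);
  ToyDirtyCardQueueSet gc_q, mutator_q;
  deferred_rs_update(ct, gc_q, 5);
  deferred_rs_update(ct, gc_q, 5);   // second hit on the same card is filtered
  redirty_and_merge(ct, gc_q, mutator_q);
  assert(mutator_q.completed.size() == 1 && ct.bytes[5] == dirty);
  return 0;
}
// --- end of sketch ---------------------------------------------------------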
...@@ -457,6 +457,10 @@ protected: ...@@ -457,6 +457,10 @@ protected:
// And it's mod ref barrier set, used to track updates for the above. // And it's mod ref barrier set, used to track updates for the above.
ModRefBarrierSet* _mr_bs; ModRefBarrierSet* _mr_bs;
// A set of cards that cover the objects for which the Rsets should be updated
// concurrently after the collection.
DirtyCardQueueSet _dirty_card_queue_set;
// The Heap Region Rem Set Iterator. // The Heap Region Rem Set Iterator.
HeapRegionRemSetIterator** _rem_set_iterator; HeapRegionRemSetIterator** _rem_set_iterator;
...@@ -666,6 +670,9 @@ public: ...@@ -666,6 +670,9 @@ public:
RefToScanQueue *task_queue(int i); RefToScanQueue *task_queue(int i);
// A set of cards where updates happened during the GC
DirtyCardQueueSet& dirty_card_queue_set() { return _dirty_card_queue_set; }
// Create a G1CollectedHeap with the specified policy. // Create a G1CollectedHeap with the specified policy.
// Must call the initialize method afterwards. // Must call the initialize method afterwards.
// May not return if something goes wrong. // May not return if something goes wrong.
......
...@@ -177,11 +177,19 @@ HRInto_G1RemSet::HRInto_G1RemSet(G1CollectedHeap* g1, CardTableModRefBS* ct_bs) ...@@ -177,11 +177,19 @@ HRInto_G1RemSet::HRInto_G1RemSet(G1CollectedHeap* g1, CardTableModRefBS* ct_bs)
_cards_scanned(NULL), _total_cards_scanned(0) _cards_scanned(NULL), _total_cards_scanned(0)
{ {
_seq_task = new SubTasksDone(NumSeqTasks); _seq_task = new SubTasksDone(NumSeqTasks);
_new_refs = NEW_C_HEAP_ARRAY(GrowableArray<oop*>*, ParallelGCThreads); guarantee(n_workers() > 0, "There should be some workers");
_new_refs = NEW_C_HEAP_ARRAY(GrowableArray<oop*>*, n_workers());
for (uint i = 0; i < n_workers(); i++) {
_new_refs[i] = new (ResourceObj::C_HEAP) GrowableArray<oop*>(8192,true);
}
} }
HRInto_G1RemSet::~HRInto_G1RemSet() { HRInto_G1RemSet::~HRInto_G1RemSet() {
delete _seq_task; delete _seq_task;
for (uint i = 0; i < n_workers(); i++) {
delete _new_refs[i];
}
FREE_C_HEAP_ARRAY(GrowableArray<oop*>*, _new_refs);
} }
void CountNonCleanMemRegionClosure::do_MemRegion(MemRegion mr) { void CountNonCleanMemRegionClosure::do_MemRegion(MemRegion mr) {
...@@ -281,8 +289,9 @@ public: ...@@ -281,8 +289,9 @@ public:
if (!_ct_bs->is_card_claimed(card_index) && if (!_ct_bs->is_card_claimed(card_index) &&
!_ct_bs->is_card_dirty(card_index)) { !_ct_bs->is_card_dirty(card_index)) {
assert(_ct_bs->is_card_clean(card_index) || assert(_ct_bs->is_card_clean(card_index) ||
_ct_bs->is_card_claimed(card_index), _ct_bs->is_card_claimed(card_index) ||
"Card is either dirty, clean, or claimed"); _ct_bs->is_card_deferred(card_index),
"Card is either clean, claimed or deferred");
if (_ct_bs->claim_card(card_index)) if (_ct_bs->claim_card(card_index))
scanCard(card_index, card_region); scanCard(card_index, card_region);
} }
...@@ -338,14 +347,12 @@ void HRInto_G1RemSet::scanRS(OopsInHeapRegionClosure* oc, int worker_i) { ...@@ -338,14 +347,12 @@ void HRInto_G1RemSet::scanRS(OopsInHeapRegionClosure* oc, int worker_i) {
_g1p->record_scan_rs_start_time(worker_i, rs_time_start * 1000.0); _g1p->record_scan_rs_start_time(worker_i, rs_time_start * 1000.0);
_g1p->record_scan_rs_time(worker_i, scan_rs_time_sec * 1000.0); _g1p->record_scan_rs_time(worker_i, scan_rs_time_sec * 1000.0);
-  if (ParallelGCThreads > 0) {
-    // In this case, we called scanNewRefsRS and recorded the corresponding
-    // time.
-    double scan_new_refs_time_ms = _g1p->get_scan_new_refs_time(worker_i);
-    if (scan_new_refs_time_ms > 0.0) {
-      closure_app_time_ms += scan_new_refs_time_ms;
-    }
+  double scan_new_refs_time_ms = _g1p->get_scan_new_refs_time(worker_i);
+  if (scan_new_refs_time_ms > 0.0) {
+    closure_app_time_ms += scan_new_refs_time_ms;
   }
_g1p->record_obj_copy_time(worker_i, closure_app_time_ms); _g1p->record_obj_copy_time(worker_i, closure_app_time_ms);
} }
...@@ -469,8 +476,8 @@ HRInto_G1RemSet::scanNewRefsRS(OopsInHeapRegionClosure* oc, ...@@ -469,8 +476,8 @@ HRInto_G1RemSet::scanNewRefsRS(OopsInHeapRegionClosure* oc,
double scan_new_refs_start_sec = os::elapsedTime(); double scan_new_refs_start_sec = os::elapsedTime();
G1CollectedHeap* g1h = G1CollectedHeap::heap(); G1CollectedHeap* g1h = G1CollectedHeap::heap();
CardTableModRefBS* ct_bs = (CardTableModRefBS*) (g1h->barrier_set()); CardTableModRefBS* ct_bs = (CardTableModRefBS*) (g1h->barrier_set());
while (_new_refs[worker_i]->is_nonempty()) { for (int i = 0; i < _new_refs[worker_i]->length(); i++) {
oop* p = _new_refs[worker_i]->pop(); oop* p = _new_refs[worker_i]->at(i);
oop obj = *p; oop obj = *p;
// *p was in the collection set when p was pushed on "_new_refs", but // *p was in the collection set when p was pushed on "_new_refs", but
// another thread may have processed this location from an RS, so it // another thread may have processed this location from an RS, so it
...@@ -480,10 +487,6 @@ HRInto_G1RemSet::scanNewRefsRS(OopsInHeapRegionClosure* oc, ...@@ -480,10 +487,6 @@ HRInto_G1RemSet::scanNewRefsRS(OopsInHeapRegionClosure* oc,
HeapRegion* r = g1h->heap_region_containing(p); HeapRegion* r = g1h->heap_region_containing(p);
DEBUG_ONLY(HeapRegion* to = g1h->heap_region_containing(obj)); DEBUG_ONLY(HeapRegion* to = g1h->heap_region_containing(obj));
assert(ParallelGCThreads > 1
|| to->rem_set()->contains_reference(p),
"Invariant: pushed after being added."
"(Not reliable in parallel code.)");
oc->set_region(r); oc->set_region(r);
// If "p" has already been processed concurrently, this is // If "p" has already been processed concurrently, this is
// idempotent. // idempotent.
...@@ -538,8 +541,8 @@ HRInto_G1RemSet::oops_into_collection_set_do(OopsInHeapRegionClosure* oc, ...@@ -538,8 +541,8 @@ HRInto_G1RemSet::oops_into_collection_set_do(OopsInHeapRegionClosure* oc,
} }
} else { } else {
assert(worker_i == 0, "invariant"); assert(worker_i == 0, "invariant");
updateRS(0); updateRS(0);
scanNewRefsRS(oc, 0);
scanRS(oc, 0); scanRS(oc, 0);
} }
} }
...@@ -559,11 +562,7 @@ prepare_for_oops_into_collection_set_do() { ...@@ -559,11 +562,7 @@ prepare_for_oops_into_collection_set_do() {
assert(!_par_traversal_in_progress, "Invariant between iterations."); assert(!_par_traversal_in_progress, "Invariant between iterations.");
if (ParallelGCThreads > 0) { if (ParallelGCThreads > 0) {
set_par_traversal(true); set_par_traversal(true);
int n_workers = _g1->workers()->total_workers(); _seq_task->set_par_threads((int)n_workers());
_seq_task->set_par_threads(n_workers);
for (uint i = 0; i < ParallelGCThreads; i++)
_new_refs[i] = new (ResourceObj::C_HEAP) GrowableArray<oop*>(8192,true);
if (cg1r->do_traversal()) { if (cg1r->do_traversal()) {
updateRS(0); updateRS(0);
// Have to do this again after updaters // Have to do this again after updaters
...@@ -587,6 +586,53 @@ class cleanUpIteratorsClosure : public HeapRegionClosure { ...@@ -587,6 +586,53 @@ class cleanUpIteratorsClosure : public HeapRegionClosure {
} }
}; };
class UpdateRSetOopsIntoCSImmediate : public OopClosure {
G1CollectedHeap* _g1;
public:
UpdateRSetOopsIntoCSImmediate(G1CollectedHeap* g1) : _g1(g1) { }
virtual void do_oop(narrowOop* p) {
guarantee(false, "NYI");
}
virtual void do_oop(oop* p) {
HeapRegion* to = _g1->heap_region_containing(*p);
if (to->in_collection_set()) {
if (to->rem_set()->add_reference(p, 0)) {
_g1->schedule_popular_region_evac(to);
}
}
}
};
class UpdateRSetOopsIntoCSDeferred : public OopClosure {
G1CollectedHeap* _g1;
CardTableModRefBS* _ct_bs;
DirtyCardQueue* _dcq;
public:
UpdateRSetOopsIntoCSDeferred(G1CollectedHeap* g1, DirtyCardQueue* dcq) :
_g1(g1), _ct_bs((CardTableModRefBS*)_g1->barrier_set()), _dcq(dcq) { }
virtual void do_oop(narrowOop* p) {
guarantee(false, "NYI");
}
virtual void do_oop(oop* p) {
oop obj = *p;
if (_g1->obj_in_cs(obj)) {
size_t card_index = _ct_bs->index_for(p);
if (_ct_bs->mark_card_deferred(card_index)) {
_dcq->enqueue((jbyte*)_ct_bs->byte_for_index(card_index));
}
}
}
};
void HRInto_G1RemSet::new_refs_iterate(OopClosure* cl) {
for (size_t i = 0; i < n_workers(); i++) {
for (int j = 0; j < _new_refs[i]->length(); j++) {
oop* p = _new_refs[i]->at(j);
cl->do_oop(p);
}
}
}
void HRInto_G1RemSet::cleanup_after_oops_into_collection_set_do() { void HRInto_G1RemSet::cleanup_after_oops_into_collection_set_do() {
guarantee( _cards_scanned != NULL, "invariant" ); guarantee( _cards_scanned != NULL, "invariant" );
_total_cards_scanned = 0; _total_cards_scanned = 0;
...@@ -609,11 +655,25 @@ void HRInto_G1RemSet::cleanup_after_oops_into_collection_set_do() { ...@@ -609,11 +655,25 @@ void HRInto_G1RemSet::cleanup_after_oops_into_collection_set_do() {
if (cg1r->do_traversal()) { if (cg1r->do_traversal()) {
cg1r->cg1rThread()->set_do_traversal(false); cg1r->cg1rThread()->set_do_traversal(false);
} }
for (uint i = 0; i < ParallelGCThreads; i++) {
delete _new_refs[i];
}
set_par_traversal(false); set_par_traversal(false);
} }
if (_g1->evacuation_failed()) {
// Restore remembered sets for the regions pointing into
// the collection set.
if (G1DeferredRSUpdate) {
DirtyCardQueue dcq(&_g1->dirty_card_queue_set());
UpdateRSetOopsIntoCSDeferred deferred_update(_g1, &dcq);
new_refs_iterate(&deferred_update);
} else {
UpdateRSetOopsIntoCSImmediate immediate_update(_g1);
new_refs_iterate(&immediate_update);
}
}
for (uint i = 0; i < n_workers(); i++) {
_new_refs[i]->clear();
}
assert(!_par_traversal_in_progress, "Invariant between iterations."); assert(!_par_traversal_in_progress, "Invariant between iterations.");
} }
...@@ -683,7 +743,8 @@ public: ...@@ -683,7 +743,8 @@ public:
bool doHeapRegion(HeapRegion* r) { bool doHeapRegion(HeapRegion* r) {
if (!r->in_collection_set() && if (!r->in_collection_set() &&
!r->continuesHumongous() && !r->continuesHumongous() &&
!r->is_young()) { !r->is_young() &&
!r->is_survivor()) {
_update_rs_oop_cl.set_from(r); _update_rs_oop_cl.set_from(r);
UpdateRSObjectClosure update_rs_obj_cl(&_update_rs_oop_cl); UpdateRSObjectClosure update_rs_obj_cl(&_update_rs_oop_cl);
...@@ -820,7 +881,7 @@ void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) { ...@@ -820,7 +881,7 @@ void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) {
// before all the cards on the region are dirtied. This is unlikely, // before all the cards on the region are dirtied. This is unlikely,
// and it doesn't happen often, but it can happen. So, the extra // and it doesn't happen often, but it can happen. So, the extra
// check below filters out those cards. // check below filters out those cards.
if (r->is_young()) { if (r->is_young() || r->is_survivor()) {
return; return;
} }
// While we are processing RSet buffers during the collection, we // While we are processing RSet buffers during the collection, we
......
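For orientation, the g1RemSet.cpp changes above make the per-worker _new_refs arrays a fixed, constructor-time allocation sized by n_workers(); during cleanup_after_oops_into_collection_set_do() they are replayed through new_refs_iterate() only when evacuation failed, and are then cleared for the next pause. A minimal sketch of that record / replay-on-failure / always-clear pattern, assuming std::vector as a stand-in for GrowableArray<oop*> and ToyNewRefs as an invented container:

// --- illustrative sketch, not part of this patch ---------------------------
#include <functional>
#include <vector>

struct ToyNewRefs {
  std::vector<std::vector<int*> > per_worker;        // one buffer per GC worker
  explicit ToyNewRefs(unsigned n_workers) : per_worker(n_workers) {}

  // Called during the pause, like _new_refs[tid]->push(p).
  void record(unsigned worker, int* p) { per_worker[worker].push_back(p); }

  // Equivalent of new_refs_iterate(): apply a closure to every recorded ref.
  void iterate(const std::function<void(int*)>& cl) {
    for (auto& refs : per_worker)
      for (int* p : refs) cl(p);
  }

  void cleanup(bool evacuation_failed) {
    if (evacuation_failed)
      iterate([](int* p) { (void)p; /* restore the RSet entry for *p here */ });
    for (auto& refs : per_worker) refs.clear();       // always reset for next pause
  }
};

int main() {
  int slot = 0;
  ToyNewRefs new_refs(2);
  new_refs.record(0, &slot);     // recorded during the evacuation pause
  new_refs.cleanup(true);        // replayed only because evacuation "failed"
  return 0;
}
// --- end of sketch ---------------------------------------------------------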
...@@ -155,6 +155,7 @@ protected: ...@@ -155,6 +155,7 @@ protected:
bool _par_traversal_in_progress; bool _par_traversal_in_progress;
void set_par_traversal(bool b); void set_par_traversal(bool b);
GrowableArray<oop*>** _new_refs; GrowableArray<oop*>** _new_refs;
void new_refs_iterate(OopClosure* cl);
public: public:
// This is called to reset dual hash tables after the gc pause // This is called to reset dual hash tables after the gc pause
......
...@@ -31,24 +31,7 @@ inline size_t G1RemSet::n_workers() { ...@@ -31,24 +31,7 @@ inline size_t G1RemSet::n_workers() {
} }
inline void HRInto_G1RemSet::write_ref_nv(HeapRegion* from, oop* p) { inline void HRInto_G1RemSet::write_ref_nv(HeapRegion* from, oop* p) {
oop obj = *p; par_write_ref(from, p, 0);
assert(from != NULL && from->is_in_reserved(p),
"p is not in a from");
HeapRegion* to = _g1->heap_region_containing(obj);
if (from != to && to != NULL) {
if (!to->popular() && !from->is_survivor()) {
#if G1_REM_SET_LOGGING
gclog_or_tty->print_cr("Adding " PTR_FORMAT " (" PTR_FORMAT ") to RS"
" for region [" PTR_FORMAT ", " PTR_FORMAT ")",
p, obj,
to->bottom(), to->end());
#endif
assert(to->rem_set() != NULL, "Need per-region 'into' remsets.");
if (to->rem_set()->add_reference(p)) {
_g1->schedule_popular_region_evac(to);
}
}
}
} }
inline void HRInto_G1RemSet::write_ref(HeapRegion* from, oop* p) { inline void HRInto_G1RemSet::write_ref(HeapRegion* from, oop* p) {
...@@ -82,7 +65,22 @@ inline void HRInto_G1RemSet::par_write_ref(HeapRegion* from, oop* p, int tid) { ...@@ -82,7 +65,22 @@ inline void HRInto_G1RemSet::par_write_ref(HeapRegion* from, oop* p, int tid) {
HeapRegion* to = _g1->heap_region_containing(obj); HeapRegion* to = _g1->heap_region_containing(obj);
// The test below could be optimized by applying a bit op to to and from. // The test below could be optimized by applying a bit op to to and from.
if (to != NULL && from != NULL && from != to) { if (to != NULL && from != NULL && from != to) {
if (!to->popular() && !from->is_survivor()) { bool update_delayed = false;
// There is a tricky infinite loop if we keep pushing
// self forwarding pointers onto our _new_refs list.
// The _par_traversal_in_progress flag is true during the collection pause,
// false during the evacuation failure handing.
if (_par_traversal_in_progress &&
to->in_collection_set() && !self_forwarded(obj)) {
_new_refs[tid]->push(p);
// Deferred updates to the Cset are either discarded (in the normal case),
// or processed (if an evacuation failure occurs) at the end
// of the collection.
// See HRInto_G1RemSet::cleanup_after_oops_into_collection_set_do().
update_delayed = true;
}
if (!to->popular() && !update_delayed) {
#if G1_REM_SET_LOGGING #if G1_REM_SET_LOGGING
gclog_or_tty->print_cr("Adding " PTR_FORMAT " (" PTR_FORMAT ") to RS" gclog_or_tty->print_cr("Adding " PTR_FORMAT " (" PTR_FORMAT ") to RS"
" for region [" PTR_FORMAT ", " PTR_FORMAT ")", " for region [" PTR_FORMAT ", " PTR_FORMAT ")",
...@@ -94,11 +92,5 @@ inline void HRInto_G1RemSet::par_write_ref(HeapRegion* from, oop* p, int tid) { ...@@ -94,11 +92,5 @@ inline void HRInto_G1RemSet::par_write_ref(HeapRegion* from, oop* p, int tid) {
_g1->schedule_popular_region_evac(to); _g1->schedule_popular_region_evac(to);
} }
} }
// There is a tricky infinite loop if we keep pushing
// self forwarding pointers onto our _new_refs list.
if (_par_traversal_in_progress &&
to->in_collection_set() && !self_forwarded(obj)) {
_new_refs[tid]->push(p);
}
} }
} }
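The rewritten par_write_ref() above orders its checks so that, while the parallel traversal of a pause is in progress, a reference whose target is in the collection set and is not a self-forwarded object is pushed onto _new_refs and its remembered-set update is delayed; everything else, except references into popular regions, is added to the target region's remembered set right away. A condensed restatement of that decision as a stand-alone function, with the G1 region tests reduced to booleans (the names below are illustrative, not HotSpot APIs):

// --- illustrative sketch, not part of this patch ---------------------------
#include <cassert>

enum class RefAction { ignore, record_new_ref_and_delay, add_to_rset_now };

RefAction classify(bool same_region, bool pause_in_progress,
                   bool target_in_cset, bool self_forwarded,
                   bool target_popular) {
  if (same_region) return RefAction::ignore;            // intra-region: nothing to do
  if (pause_in_progress && target_in_cset && !self_forwarded)
    return RefAction::record_new_ref_and_delay;         // push onto _new_refs[tid]
  if (!target_popular) return RefAction::add_to_rset_now;
  return RefAction::ignore;                             // popular regions are skipped
}

int main() {
  // Cross-region ref into the collection set during the pause: deferred.
  assert(classify(false, true, true, false, false) ==
         RefAction::record_new_ref_and_delay);
  // Self-forwarded target: updated immediately rather than re-pushed forever.
  assert(classify(false, true, true, true, false) == RefAction::add_to_rset_now);
  return 0;
}
// --- end of sketch ---------------------------------------------------------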
...@@ -172,6 +172,9 @@ ...@@ -172,6 +172,9 @@
develop(bool, G1RSBarrierUseQueue, true, \ develop(bool, G1RSBarrierUseQueue, true, \
"If true, use queueing RS barrier") \ "If true, use queueing RS barrier") \
\ \
develop(bool, G1DeferredRSUpdate, true, \
"If true, use deferred RS updates") \
\
develop(bool, G1RSLogCheckCardTable, false, \ develop(bool, G1RSLogCheckCardTable, false, \
"If true, verify that no dirty cards remain after RS log " \ "If true, verify that no dirty cards remain after RS log " \
"processing.") \ "processing.") \
......
...@@ -91,15 +91,17 @@ PtrQueueSet::PtrQueueSet(bool notify_when_complete) : ...@@ -91,15 +91,17 @@ PtrQueueSet::PtrQueueSet(bool notify_when_complete) :
_n_completed_buffers(0), _n_completed_buffers(0),
_process_completed_threshold(0), _process_completed(false), _process_completed_threshold(0), _process_completed(false),
_buf_free_list(NULL), _buf_free_list_sz(0) _buf_free_list(NULL), _buf_free_list_sz(0)
-{}
+{
+  _fl_owner = this;
+}
void** PtrQueueSet::allocate_buffer() { void** PtrQueueSet::allocate_buffer() {
assert(_sz > 0, "Didn't set a buffer size."); assert(_sz > 0, "Didn't set a buffer size.");
MutexLockerEx x(_fl_lock, Mutex::_no_safepoint_check_flag); MutexLockerEx x(_fl_owner->_fl_lock, Mutex::_no_safepoint_check_flag);
if (_buf_free_list != NULL) { if (_fl_owner->_buf_free_list != NULL) {
void** res = _buf_free_list; void** res = _fl_owner->_buf_free_list;
_buf_free_list = (void**)_buf_free_list[0]; _fl_owner->_buf_free_list = (void**)_fl_owner->_buf_free_list[0];
_buf_free_list_sz--; _fl_owner->_buf_free_list_sz--;
// Just override the next pointer with NULL, just in case we scan this part // Just override the next pointer with NULL, just in case we scan this part
// of the buffer. // of the buffer.
res[0] = NULL; res[0] = NULL;
...@@ -111,10 +113,10 @@ void** PtrQueueSet::allocate_buffer() { ...@@ -111,10 +113,10 @@ void** PtrQueueSet::allocate_buffer() {
void PtrQueueSet::deallocate_buffer(void** buf) { void PtrQueueSet::deallocate_buffer(void** buf) {
assert(_sz > 0, "Didn't set a buffer size."); assert(_sz > 0, "Didn't set a buffer size.");
MutexLockerEx x(_fl_lock, Mutex::_no_safepoint_check_flag); MutexLockerEx x(_fl_owner->_fl_lock, Mutex::_no_safepoint_check_flag);
buf[0] = (void*)_buf_free_list; buf[0] = (void*)_fl_owner->_buf_free_list;
_buf_free_list = buf; _fl_owner->_buf_free_list = buf;
_buf_free_list_sz++; _fl_owner->_buf_free_list_sz++;
} }
void PtrQueueSet::reduce_free_list() { void PtrQueueSet::reduce_free_list() {
...@@ -207,3 +209,58 @@ void PtrQueueSet::set_buffer_size(size_t sz) { ...@@ -207,3 +209,58 @@ void PtrQueueSet::set_buffer_size(size_t sz) {
void PtrQueueSet::set_process_completed_threshold(size_t sz) { void PtrQueueSet::set_process_completed_threshold(size_t sz) {
_process_completed_threshold = sz; _process_completed_threshold = sz;
} }
// Merge lists of buffers. Notify waiting threads if the length of the list
// exceeds threshold. The source queue is emptied as a result. The queues
// must share the monitor.
void PtrQueueSet::merge_bufferlists(PtrQueueSet *src) {
assert(_cbl_mon == src->_cbl_mon, "Should share the same lock");
MutexLockerEx x(_cbl_mon, Mutex::_no_safepoint_check_flag);
if (_completed_buffers_tail == NULL) {
assert(_completed_buffers_head == NULL, "Well-formedness");
_completed_buffers_head = src->_completed_buffers_head;
_completed_buffers_tail = src->_completed_buffers_tail;
} else {
assert(_completed_buffers_head != NULL, "Well formedness");
if (src->_completed_buffers_head != NULL) {
_completed_buffers_tail->next = src->_completed_buffers_head;
_completed_buffers_tail = src->_completed_buffers_tail;
}
}
_n_completed_buffers += src->_n_completed_buffers;
src->_n_completed_buffers = 0;
src->_completed_buffers_head = NULL;
src->_completed_buffers_tail = NULL;
assert(_completed_buffers_head == NULL && _completed_buffers_tail == NULL ||
_completed_buffers_head != NULL && _completed_buffers_tail != NULL,
"Sanity");
if (!_process_completed &&
_n_completed_buffers >= _process_completed_threshold) {
_process_completed = true;
if (_notify_when_complete)
_cbl_mon->notify_all();
}
}
// Merge free lists of the two queues. The free list of the source
// queue is emptied as a result. The queues must share the same
// mutex that guards free lists.
void PtrQueueSet::merge_freelists(PtrQueueSet* src) {
assert(_fl_lock == src->_fl_lock, "Should share the same lock");
MutexLockerEx x(_fl_lock, Mutex::_no_safepoint_check_flag);
if (_buf_free_list != NULL) {
void **p = _buf_free_list;
while (*p != NULL) {
p = (void**)*p;
}
*p = src->_buf_free_list;
} else {
_buf_free_list = src->_buf_free_list;
}
_buf_free_list_sz += src->_buf_free_list_sz;
src->_buf_free_list = NULL;
src->_buf_free_list_sz = 0;
}
...@@ -155,6 +155,9 @@ protected: ...@@ -155,6 +155,9 @@ protected:
Mutex* _fl_lock; Mutex* _fl_lock;
void** _buf_free_list; void** _buf_free_list;
size_t _buf_free_list_sz; size_t _buf_free_list_sz;
// Queue set can share a freelist. The _fl_owner variable
// specifies the owner. It is set to "this" by default.
PtrQueueSet* _fl_owner;
// The size of all buffers in the set. // The size of all buffers in the set.
size_t _sz; size_t _sz;
...@@ -188,10 +191,13 @@ public: ...@@ -188,10 +191,13 @@ public:
// Because of init-order concerns, we can't pass these as constructor // Because of init-order concerns, we can't pass these as constructor
// arguments. // arguments.
void initialize(Monitor* cbl_mon, Mutex* fl_lock, void initialize(Monitor* cbl_mon, Mutex* fl_lock,
int max_completed_queue = 0) { int max_completed_queue = 0,
PtrQueueSet *fl_owner = NULL) {
_max_completed_queue = max_completed_queue; _max_completed_queue = max_completed_queue;
assert(cbl_mon != NULL && fl_lock != NULL, "Init order issue?"); assert(cbl_mon != NULL && fl_lock != NULL, "Init order issue?");
_cbl_mon = cbl_mon; _fl_lock = fl_lock; _cbl_mon = cbl_mon;
_fl_lock = fl_lock;
_fl_owner = (fl_owner != NULL) ? fl_owner : this;
} }
// Return an empty oop array of size _sz (required to be non-zero). // Return an empty oop array of size _sz (required to be non-zero).
...@@ -228,4 +234,7 @@ public: ...@@ -228,4 +234,7 @@ public:
void reduce_free_list(); void reduce_free_list();
size_t completed_buffers_num() { return _n_completed_buffers; } size_t completed_buffers_num() { return _n_completed_buffers; }
void merge_bufferlists(PtrQueueSet* src);
void merge_freelists(PtrQueueSet* src);
}; };
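The _fl_owner field and the two merge routines above are what let the GC-private dirty-card queue set borrow the mutator set's buffer free list and, after the pause, splice its completed-buffer list back into it. The stand-alone sketch below reproduces just the list splice of merge_bufferlists(); locking under _cbl_mon, the completed-buffer threshold and the notification are omitted, and BufferNode/ToyQueueSet are invented stand-ins:

// --- illustrative sketch, not part of this patch ---------------------------
#include <cassert>
#include <cstddef>

struct BufferNode { BufferNode* next; };

struct ToyQueueSet {
  BufferNode* head = nullptr;
  BufferNode* tail = nullptr;
  size_t      n    = 0;

  // Append src's whole list to this one and leave src empty, the same shape
  // as PtrQueueSet::merge_bufferlists() above.
  void merge_from(ToyQueueSet& src) {
    if (tail == nullptr) {             // this list empty: adopt src wholesale
      head = src.head;
      tail = src.tail;
    } else if (src.head != nullptr) {  // both non-empty: link src behind tail
      tail->next = src.head;
      tail = src.tail;
    }
    n += src.n;
    src.head = src.tail = nullptr;
    src.n = 0;
  }
};

int main() {
  BufferNode a{nullptr}, b{nullptr}, c{nullptr};
  ToyQueueSet dst, src;
  dst.head = dst.tail = &a; dst.n = 1;
  src.head = &b; b.next = &c; src.tail = &c; src.n = 2;
  dst.merge_from(src);
  assert(dst.n == 3 && dst.tail == &c && src.head == nullptr);
  return 0;
}
// --- end of sketch ---------------------------------------------------------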
...@@ -504,6 +504,7 @@ void SparsePRT::cleanup() { ...@@ -504,6 +504,7 @@ void SparsePRT::cleanup() {
// Make sure that the current and next tables agree. (Another mechanism // Make sure that the current and next tables agree. (Another mechanism
// takes care of deleting now-unused tables.) // takes care of deleting now-unused tables.)
_cur = _next; _cur = _next;
set_expanded(false);
} }
void SparsePRT::expand() { void SparsePRT::expand() {
......
...@@ -274,7 +274,7 @@ public: ...@@ -274,7 +274,7 @@ public:
// Clean up all tables on the expanded list. Called single threaded. // Clean up all tables on the expanded list. Called single threaded.
static void cleanup_all(); static void cleanup_all();
RSHashTable* next() const { return _next; } RSHashTable* cur() const { return _cur; }
void init_iterator(SparsePRTIter* sprt_iter); void init_iterator(SparsePRTIter* sprt_iter);
...@@ -300,7 +300,7 @@ public: ...@@ -300,7 +300,7 @@ public:
{} {}
void init(const SparsePRT* sprt) { void init(const SparsePRT* sprt) {
RSHashTableIter::init(sprt->next()); RSHashTableIter::init(sprt->cur());
} }
bool has_next(size_t& card_index) { bool has_next(size_t& card_index) {
return RSHashTableIter::has_next(card_index); return RSHashTableIter::has_next(card_index);
......
...@@ -125,6 +125,8 @@ void PSMarkSweep::invoke_no_policy(bool clear_all_softrefs) { ...@@ -125,6 +125,8 @@ void PSMarkSweep::invoke_no_policy(bool clear_all_softrefs) {
perm_gen->verify_object_start_array(); perm_gen->verify_object_start_array();
} }
heap->pre_full_gc_dump();
// Filled in below to track the state of the young gen after the collection. // Filled in below to track the state of the young gen after the collection.
bool eden_empty; bool eden_empty;
bool survivors_empty; bool survivors_empty;
...@@ -363,6 +365,8 @@ void PSMarkSweep::invoke_no_policy(bool clear_all_softrefs) { ...@@ -363,6 +365,8 @@ void PSMarkSweep::invoke_no_policy(bool clear_all_softrefs) {
Universe::print_heap_after_gc(); Universe::print_heap_after_gc();
} }
heap->post_full_gc_dump();
#ifdef TRACESPINNING #ifdef TRACESPINNING
ParallelTaskTerminator::print_termination_counts(); ParallelTaskTerminator::print_termination_counts();
#endif #endif
......
...@@ -1982,6 +1982,8 @@ void PSParallelCompact::invoke_no_policy(bool maximum_heap_compaction) { ...@@ -1982,6 +1982,8 @@ void PSParallelCompact::invoke_no_policy(bool maximum_heap_compaction) {
heap->record_gen_tops_before_GC(); heap->record_gen_tops_before_GC();
} }
heap->pre_full_gc_dump();
_print_phases = PrintGCDetails && PrintParallelOldGCPhaseTimes; _print_phases = PrintGCDetails && PrintParallelOldGCPhaseTimes;
// Make sure data structures are sane, make the heap parsable, and do other // Make sure data structures are sane, make the heap parsable, and do other
...@@ -2204,6 +2206,8 @@ void PSParallelCompact::invoke_no_policy(bool maximum_heap_compaction) { ...@@ -2204,6 +2206,8 @@ void PSParallelCompact::invoke_no_policy(bool maximum_heap_compaction) {
gc_task_manager()->print_task_time_stamps(); gc_task_manager()->print_task_time_stamps();
} }
heap->post_full_gc_dump();
#ifdef TRACESPINNING #ifdef TRACESPINNING
ParallelTaskTerminator::print_termination_counts(); ParallelTaskTerminator::print_termination_counts();
#endif #endif
......
...@@ -121,7 +121,7 @@ void VM_GC_HeapInspection::doit() { ...@@ -121,7 +121,7 @@ void VM_GC_HeapInspection::doit() {
// make the heap parsable (no need to retire TLABs) // make the heap parsable (no need to retire TLABs)
ch->ensure_parsability(false); ch->ensure_parsability(false);
} }
HeapInspection::heap_inspection(_out); HeapInspection::heap_inspection(_out, _need_prologue /* need_prologue */);
} }
......
...@@ -112,13 +112,16 @@ class VM_GC_HeapInspection: public VM_GC_Operation { ...@@ -112,13 +112,16 @@ class VM_GC_HeapInspection: public VM_GC_Operation {
private: private:
outputStream* _out; outputStream* _out;
bool _full_gc; bool _full_gc;
bool _need_prologue;
public: public:
VM_GC_HeapInspection(outputStream* out, bool request_full_gc) : VM_GC_HeapInspection(outputStream* out, bool request_full_gc,
bool need_prologue) :
VM_GC_Operation(0 /* total collections, dummy, ignored */, VM_GC_Operation(0 /* total collections, dummy, ignored */,
0 /* total full collections, dummy, ignored */, 0 /* total full collections, dummy, ignored */,
request_full_gc) { request_full_gc) {
_out = out; _out = out;
_full_gc = request_full_gc; _full_gc = request_full_gc;
_need_prologue = need_prologue;
} }
~VM_GC_HeapInspection() {} ~VM_GC_HeapInspection() {}
......
...@@ -294,3 +294,29 @@ void CollectedHeap::resize_all_tlabs() { ...@@ -294,3 +294,29 @@ void CollectedHeap::resize_all_tlabs() {
ThreadLocalAllocBuffer::resize_all_tlabs(); ThreadLocalAllocBuffer::resize_all_tlabs();
} }
} }
void CollectedHeap::pre_full_gc_dump() {
if (HeapDumpBeforeFullGC) {
TraceTime tt("Heap Dump: ", PrintGCDetails, false, gclog_or_tty);
// We are doing a "major" collection and a heap dump before
// major collection has been requested.
HeapDumper::dump_heap();
}
if (PrintClassHistogramBeforeFullGC) {
TraceTime tt("Class Histogram: ", PrintGCDetails, true, gclog_or_tty);
VM_GC_HeapInspection inspector(gclog_or_tty, false /* ! full gc */, false /* ! prologue */);
inspector.doit();
}
}
void CollectedHeap::post_full_gc_dump() {
if (HeapDumpAfterFullGC) {
TraceTime tt("Heap Dump", PrintGCDetails, false, gclog_or_tty);
HeapDumper::dump_heap();
}
if (PrintClassHistogramAfterFullGC) {
TraceTime tt("Class Histogram", PrintGCDetails, true, gclog_or_tty);
VM_GC_HeapInspection inspector(gclog_or_tty, false /* ! full gc */, false /* ! prologue */);
inspector.doit();
}
}
...@@ -514,6 +514,10 @@ class CollectedHeap : public CHeapObj { ...@@ -514,6 +514,10 @@ class CollectedHeap : public CHeapObj {
// Perform any cleanup actions necessary before allowing a verification. // Perform any cleanup actions necessary before allowing a verification.
virtual void prepare_for_verify() = 0; virtual void prepare_for_verify() = 0;
// Generate any dumps preceding or following a full gc
void pre_full_gc_dump();
void post_full_gc_dump();
virtual void print() const = 0; virtual void print() const = 0;
virtual void print_on(outputStream* st) const = 0; virtual void print_on(outputStream* st) const = 0;
......
...@@ -26,10 +26,12 @@ ...@@ -26,10 +26,12 @@
collectedHeap.cpp collectedHeap.hpp collectedHeap.cpp collectedHeap.hpp
collectedHeap.cpp collectedHeap.inline.hpp collectedHeap.cpp collectedHeap.inline.hpp
collectedHeap.cpp heapDumper.hpp
collectedHeap.cpp init.hpp collectedHeap.cpp init.hpp
collectedHeap.cpp oop.inline.hpp collectedHeap.cpp oop.inline.hpp
collectedHeap.cpp systemDictionary.hpp collectedHeap.cpp systemDictionary.hpp
collectedHeap.cpp thread_<os_family>.inline.hpp collectedHeap.cpp thread_<os_family>.inline.hpp
collectedHeap.cpp vmGCOperations.hpp
collectedHeap.hpp allocation.hpp collectedHeap.hpp allocation.hpp
collectedHeap.hpp barrierSet.hpp collectedHeap.hpp barrierSet.hpp
......
...@@ -356,18 +356,62 @@ void CardTableModRefBS::write_ref_field_work(void* field, oop newVal) { ...@@ -356,18 +356,62 @@ void CardTableModRefBS::write_ref_field_work(void* field, oop newVal) {
inline_write_ref_field(field, newVal); inline_write_ref_field(field, newVal);
} }
/*
Claimed and deferred bits are used together in G1 during the evacuation
pause. These bits can have the following state transitions:
  1. The claimed bit can be set on top of any other card state, except
     that the "dirty -> dirty and claimed" transition is checked for in
     G1 code and is not used.
  2. The deferred bit can be set only if the previous state of the card
     was either clean or claimed. mark_card_deferred() is wait-free.
     We do not care whether the operation succeeds: if it does not, the
     only consequence is a duplicate entry in the update buffer, caused by
     the "cache miss", so it is not worth spinning.
*/
 bool CardTableModRefBS::claim_card(size_t card_index) {
   jbyte val = _byte_map[card_index];
-  if (val != claimed_card_val()) {
-    jbyte res = Atomic::cmpxchg((jbyte) claimed_card_val(), &_byte_map[card_index], val);
-    if (res == val)
+  assert(val != dirty_card_val(), "Shouldn't claim a dirty card");
+  while (val == clean_card_val() ||
+         (val & (clean_card_mask_val() | claimed_card_val())) != claimed_card_val()) {
+    jbyte new_val = val;
+    if (val == clean_card_val()) {
+      new_val = (jbyte)claimed_card_val();
+    } else {
+      new_val = val | (jbyte)claimed_card_val();
+    }
+    jbyte res = Atomic::cmpxchg(new_val, &_byte_map[card_index], val);
+    if (res == val) {
       return true;
-    else return false;
+    }
+    val = res;
   }
   return false;
 }
bool CardTableModRefBS::mark_card_deferred(size_t card_index) {
jbyte val = _byte_map[card_index];
// It's already processed
if ((val & (clean_card_mask_val() | deferred_card_val())) == deferred_card_val()) {
return false;
}
// Cached bit can be installed either on a clean card or on a claimed card.
jbyte new_val = val;
if (val == clean_card_val()) {
new_val = (jbyte)deferred_card_val();
} else {
if (val & claimed_card_val()) {
new_val = val | (jbyte)deferred_card_val();
}
}
if (new_val != val) {
Atomic::cmpxchg(new_val, &_byte_map[card_index], val);
}
return true;
}
void CardTableModRefBS::non_clean_card_iterate(Space* sp, void CardTableModRefBS::non_clean_card_iterate(Space* sp,
MemRegion mr, MemRegion mr,
DirtyCardToOopClosure* dcto_cl, DirtyCardToOopClosure* dcto_cl,
......
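The new card values rely on a little bit arithmetic: clean_card is -1 (all bits set), clean_card_mask is clean_card - 31 (only the top three bits), and the claimed/deferred bits occupy separate low bits, so a test like val & (clean_card_mask_val() | claimed_card_val()) == claimed_card_val() recognises a claimed card even after the deferred bit has been OR-ed in, while still rejecting a clean card. The snippet below simply evaluates those expressions with the constants from the hunks above; it is a verification aid, not HotSpot code:

// --- illustrative sketch, not part of this patch ---------------------------
#include <cassert>
#include <cstdint>

int main() {
  const int8_t clean_card      = -1;              // 0xff
  const int8_t clean_card_mask = clean_card - 31; // 0xe0: the top three bits
  const int8_t dirty_card      = 0;
  const int8_t claimed_card    = 2;
  const int8_t deferred_card   = 4;

  auto is_claimed = [&](int8_t v) {
    return (v & (clean_card_mask | claimed_card)) == claimed_card;
  };
  auto is_deferred = [&](int8_t v) {
    return (v & (clean_card_mask | deferred_card)) == deferred_card;
  };

  assert(!is_claimed(clean_card));                   // clean is not claimed
  assert(is_claimed(claimed_card));                  // claimed
  assert(is_claimed(claimed_card | deferred_card));  // claimed even with deferred set
  assert(is_deferred(deferred_card));
  assert(!is_deferred(clean_card) && !is_deferred(dirty_card));
  return 0;
}
// --- end of sketch ---------------------------------------------------------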
...@@ -52,11 +52,15 @@ class CardTableModRefBS: public ModRefBarrierSet { ...@@ -52,11 +52,15 @@ class CardTableModRefBS: public ModRefBarrierSet {
enum CardValues { enum CardValues {
clean_card = -1, clean_card = -1,
// The mask contains zeros in places for all other values.
clean_card_mask = clean_card - 31,
dirty_card = 0, dirty_card = 0,
precleaned_card = 1, precleaned_card = 1,
claimed_card = 3, claimed_card = 2,
last_card = 4, deferred_card = 4,
CT_MR_BS_last_reserved = 10 last_card = 8,
CT_MR_BS_last_reserved = 16
}; };
// dirty and precleaned are equivalent wrt younger_refs_iter. // dirty and precleaned are equivalent wrt younger_refs_iter.
...@@ -254,9 +258,11 @@ public: ...@@ -254,9 +258,11 @@ public:
}; };
static int clean_card_val() { return clean_card; } static int clean_card_val() { return clean_card; }
static int clean_card_mask_val() { return clean_card_mask; }
static int dirty_card_val() { return dirty_card; } static int dirty_card_val() { return dirty_card; }
static int claimed_card_val() { return claimed_card; } static int claimed_card_val() { return claimed_card; }
static int precleaned_card_val() { return precleaned_card; } static int precleaned_card_val() { return precleaned_card; }
static int deferred_card_val() { return deferred_card; }
// For RTTI simulation. // For RTTI simulation.
bool is_a(BarrierSet::Name bsn) { bool is_a(BarrierSet::Name bsn) {
...@@ -329,7 +335,8 @@ public: ...@@ -329,7 +335,8 @@ public:
} }
bool is_card_claimed(size_t card_index) { bool is_card_claimed(size_t card_index) {
return _byte_map[card_index] == claimed_card_val(); jbyte val = _byte_map[card_index];
return (val & (clean_card_mask_val() | claimed_card_val())) == claimed_card_val();
} }
bool claim_card(size_t card_index); bool claim_card(size_t card_index);
...@@ -338,6 +345,13 @@ public: ...@@ -338,6 +345,13 @@ public:
return _byte_map[card_index] == clean_card_val(); return _byte_map[card_index] == clean_card_val();
} }
bool is_card_deferred(size_t card_index) {
jbyte val = _byte_map[card_index];
return (val & (clean_card_mask_val() | deferred_card_val())) == deferred_card_val();
}
bool mark_card_deferred(size_t card_index);
// Card marking array base (adjusted for heap low boundary) // Card marking array base (adjusted for heap low boundary)
// This would be the 0th element of _byte_map, if the heap started at 0x0. // This would be the 0th element of _byte_map, if the heap started at 0x0.
// But since the heap starts at some higher address, this points to somewhere // But since the heap starts at some higher address, this points to somewhere
...@@ -434,6 +448,10 @@ public: ...@@ -434,6 +448,10 @@ public:
return byte_for(p) - _byte_map; return byte_for(p) - _byte_map;
} }
const jbyte* byte_for_index(const size_t card_index) const {
return _byte_map + card_index;
}
void verify(); void verify();
void verify_guard(); void verify_guard();
......
...@@ -456,6 +456,9 @@ void GenCollectedHeap::do_collection(bool full, ...@@ -456,6 +456,9 @@ void GenCollectedHeap::do_collection(bool full,
int max_level_collected = starting_level; int max_level_collected = starting_level;
for (int i = starting_level; i <= max_level; i++) { for (int i = starting_level; i <= max_level; i++) {
if (_gens[i]->should_collect(full, size, is_tlab)) { if (_gens[i]->should_collect(full, size, is_tlab)) {
if (i == n_gens() - 1) { // a major collection is to happen
pre_full_gc_dump(); // do any pre full gc dumps
}
// Timer for individual generations. Last argument is false: no CR // Timer for individual generations. Last argument is false: no CR
TraceTime t1(_gens[i]->short_name(), PrintGCDetails, false, gclog_or_tty); TraceTime t1(_gens[i]->short_name(), PrintGCDetails, false, gclog_or_tty);
TraceCollectorStats tcs(_gens[i]->counters()); TraceCollectorStats tcs(_gens[i]->counters());
...@@ -573,6 +576,10 @@ void GenCollectedHeap::do_collection(bool full, ...@@ -573,6 +576,10 @@ void GenCollectedHeap::do_collection(bool full,
// a whole heap collection. // a whole heap collection.
complete = complete || (max_level_collected == n_gens() - 1); complete = complete || (max_level_collected == n_gens() - 1);
if (complete) { // We did a "major" collection
post_full_gc_dump(); // do any post full gc dumps
}
if (PrintGCDetails) { if (PrintGCDetails) {
print_heap_change(gch_prev_used); print_heap_change(gch_prev_used);
......
...@@ -233,7 +233,7 @@ class RecordInstanceClosure : public ObjectClosure { ...@@ -233,7 +233,7 @@ class RecordInstanceClosure : public ObjectClosure {
size_t missed_count() { return _missed_count; } size_t missed_count() { return _missed_count; }
}; };
void HeapInspection::heap_inspection(outputStream* st) { void HeapInspection::heap_inspection(outputStream* st, bool need_prologue) {
ResourceMark rm; ResourceMark rm;
HeapWord* ref; HeapWord* ref;
...@@ -244,7 +244,9 @@ void HeapInspection::heap_inspection(outputStream* st) { ...@@ -244,7 +244,9 @@ void HeapInspection::heap_inspection(outputStream* st) {
case CollectedHeap::GenCollectedHeap: { case CollectedHeap::GenCollectedHeap: {
is_shared_heap = true; is_shared_heap = true;
SharedHeap* sh = (SharedHeap*)heap; SharedHeap* sh = (SharedHeap*)heap;
-        sh->gc_prologue(false /* !full */); // get any necessary locks, etc.
+        if (need_prologue) {
+          sh->gc_prologue(false /* !full */); // get any necessary locks, etc.
+        }
ref = sh->perm_gen()->used_region().start(); ref = sh->perm_gen()->used_region().start();
break; break;
} }
...@@ -290,7 +292,7 @@ void HeapInspection::heap_inspection(outputStream* st) { ...@@ -290,7 +292,7 @@ void HeapInspection::heap_inspection(outputStream* st) {
} }
st->flush(); st->flush();
if (is_shared_heap) { if (need_prologue && is_shared_heap) {
SharedHeap* sh = (SharedHeap*)heap; SharedHeap* sh = (SharedHeap*)heap;
sh->gc_epilogue(false /* !full */); // release all acquired locks, etc. sh->gc_epilogue(false /* !full */); // release all acquired locks, etc.
} }
......
...@@ -127,6 +127,6 @@ class KlassInfoHisto : public StackObj { ...@@ -127,6 +127,6 @@ class KlassInfoHisto : public StackObj {
class HeapInspection : public AllStatic { class HeapInspection : public AllStatic {
public: public:
static void heap_inspection(outputStream* st) KERNEL_RETURN; static void heap_inspection(outputStream* st, bool need_prologue) KERNEL_RETURN;
static void find_instances_at_safepoint(klassOop k, GrowableArray<oop>* result) KERNEL_RETURN; static void find_instances_at_safepoint(klassOop k, GrowableArray<oop>* result) KERNEL_RETURN;
}; };
@@ -662,6 +662,12 @@ class CommandLineFlags {
   product(ccstrlist, OnOutOfMemoryError, "", \
          "Run user-defined commands on first java.lang.OutOfMemoryError") \
          \
+  manageable(bool, HeapDumpBeforeFullGC, false, \
+         "Dump heap to file before any major stop-world GC") \
+         \
+  manageable(bool, HeapDumpAfterFullGC, false, \
+         "Dump heap to file after any major stop-world GC") \
+         \
   manageable(bool, HeapDumpOnOutOfMemoryError, false, \
          "Dump heap to file when java.lang.OutOfMemoryError is thrown") \
          \
@@ -1971,6 +1977,12 @@ class CommandLineFlags {
   product(bool, PrintHeapAtSIGBREAK, true, \
          "Print heap layout in response to SIGBREAK") \
          \
+  manageable(bool, PrintClassHistogramBeforeFullGC, false, \
+         "Print a class histogram before any major stop-world GC") \
+         \
+  manageable(bool, PrintClassHistogramAfterFullGC, false, \
+         "Print a class histogram after any major stop-world GC") \
+         \
   manageable(bool, PrintClassHistogram, false, \
          "Print a histogram of class instances") \
          \
......
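All four new switches are declared manageable rather than product, so in addition to being set on the command line (e.g. -XX:+HeapDumpBeforeFullGC, typically together with -XX:HeapDumpPath=<dir>), they can normally be flipped on an already-running VM through the diagnostic interface (jinfo -flag or the HotSpotDiagnosticMXBean), which is the main point of making one-shot GC diagnostics like these manageable.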
@@ -207,7 +207,8 @@ static void signal_thread_entry(JavaThread* thread, TRAPS) {
         VMThread::execute(&op1);
         Universe::print_heap_at_SIGBREAK();
         if (PrintClassHistogram) {
-          VM_GC_HeapInspection op1(gclog_or_tty, true /* force full GC before heap inspection */);
+          VM_GC_HeapInspection op1(gclog_or_tty, true /* force full GC before heap inspection */,
+                                   true /* need_prologue */);
           VMThread::execute(&op1);
         }
         if (JvmtiExport::should_post_data_dump()) {
......
@@ -194,7 +194,7 @@ static jint heap_inspection(AttachOperation* op, outputStream* out) {
     }
     live_objects_only = strcmp(arg0, "-live") == 0;
   }
-  VM_GC_HeapInspection heapop(out, live_objects_only /* request gc */);
+  VM_GC_HeapInspection heapop(out, live_objects_only /* request full gc */, true /* need_prologue */);
  VMThread::execute(&heapop);
  return JNI_OK;
 }
......
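For context: this attach-listener operation is the one behind jmap -histo / -histo:live, so histograms requested from outside the VM keep taking the shared-heap prologue and epilogue (need_prologue = true), while histogram requests issued from inside a GC pause (the new Before/AfterFullGC flags) are presumably the callers that pass need_prologue = false.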
@@ -347,7 +347,6 @@ enum {
   INITIAL_CLASS_COUNT = 200
 };
-
 // Supports I/O operations on a dump file
 class DumpWriter : public StackObj {
@@ -1303,7 +1302,9 @@ void HeapObjectDumper::do_object(oop o) {
 // The VM operation that performs the heap dump
 class VM_HeapDumper : public VM_GC_Operation {
  private:
-  DumpWriter* _writer;
+  static VM_HeapDumper* _global_dumper;
+  static DumpWriter*    _global_writer;
+  DumpWriter*           _local_writer;
   bool _gc_before_heap_dump;
   bool _is_segmented_dump;
   jlong _dump_start;
@@ -1311,8 +1312,20 @@ class VM_HeapDumper : public VM_GC_Operation {
   ThreadStackTrace** _stack_traces;
   int _num_threads;
-  // accessors
-  DumpWriter* writer() const { return _writer; }
+  // accessors and setters
+  static VM_HeapDumper* dumper() { assert(_global_dumper != NULL, "Error"); return _global_dumper; }
+  static DumpWriter* writer()    { assert(_global_writer != NULL, "Error"); return _global_writer; }
+  void set_global_dumper() {
+    assert(_global_dumper == NULL, "Error");
+    _global_dumper = this;
+  }
+  void set_global_writer() {
+    assert(_global_writer == NULL, "Error");
+    _global_writer = _local_writer;
+  }
+  void clear_global_dumper() { _global_dumper = NULL; }
+  void clear_global_writer() { _global_writer = NULL; }
   bool is_segmented_dump() const { return _is_segmented_dump; }
   void set_segmented_dump()      { _is_segmented_dump = true; }
   jlong dump_start() const       { return _dump_start; }
@@ -1357,7 +1370,7 @@ class VM_HeapDumper : public VM_GC_Operation {
     VM_GC_Operation(0 /* total collections, dummy, ignored */,
                     0 /* total full collections, dummy, ignored */,
                     gc_before_heap_dump) {
-    _writer = writer;
+    _local_writer = writer;
     _gc_before_heap_dump = gc_before_heap_dump;
     _is_segmented_dump = false;
     _dump_start = (jlong)-1;
@@ -1381,6 +1394,9 @@ class VM_HeapDumper : public VM_GC_Operation {
   void doit();
 };
+VM_HeapDumper* VM_HeapDumper::_global_dumper = NULL;
+DumpWriter*    VM_HeapDumper::_global_writer = NULL;
 bool VM_HeapDumper::skip_operation() const {
   return false;
 }
@@ -1479,31 +1495,28 @@ void HeapObjectDumper::mark_end_of_record() {
 void VM_HeapDumper::do_load_class(klassOop k) {
   static u4 class_serial_num = 0;
-  VM_HeapDumper* dumper = ((VM_HeapDumper*)VMThread::vm_operation());
-  DumpWriter* writer = dumper->writer();
   // len of HPROF_LOAD_CLASS record
   u4 remaining = 2*oopSize + 2*sizeof(u4);
   // write a HPROF_LOAD_CLASS for the class and each array class
   do {
-    DumperSupport::write_header(writer, HPROF_LOAD_CLASS, remaining);
+    DumperSupport::write_header(writer(), HPROF_LOAD_CLASS, remaining);
     // class serial number is just a number
-    writer->write_u4(++class_serial_num);
+    writer()->write_u4(++class_serial_num);
     // class ID
     Klass* klass = Klass::cast(k);
-    writer->write_classID(klass);
+    writer()->write_classID(klass);
     // add the klassOop and class serial number pair
-    dumper->add_class_serial_number(klass, class_serial_num);
-    writer->write_u4(STACK_TRACE_ID);
+    dumper()->add_class_serial_number(klass, class_serial_num);
+    writer()->write_u4(STACK_TRACE_ID);
     // class name ID
     symbolOop name = klass->name();
-    writer->write_objectID(name);
+    writer()->write_objectID(name);
     // write a LOAD_CLASS record for the array type (if it exists)
     k = klass->array_klass_or_null();
@@ -1512,17 +1525,13 @@ void VM_HeapDumper::do_load_class(klassOop k) {
 // writes a HPROF_GC_CLASS_DUMP record for the given class
 void VM_HeapDumper::do_class_dump(klassOop k) {
-  VM_HeapDumper* dumper = ((VM_HeapDumper*)VMThread::vm_operation());
-  DumpWriter* writer = dumper->writer();
-  DumperSupport::dump_class_and_array_classes(writer, k);
+  DumperSupport::dump_class_and_array_classes(writer(), k);
 }
 // writes a HPROF_GC_CLASS_DUMP records for a given basic type
 // array (and each multi-dimensional array too)
 void VM_HeapDumper::do_basic_type_array_class_dump(klassOop k) {
-  VM_HeapDumper* dumper = ((VM_HeapDumper*)VMThread::vm_operation());
-  DumpWriter* writer = dumper->writer();
-  DumperSupport::dump_basic_type_array_class(writer, k);
+  DumperSupport::dump_basic_type_array_class(writer(), k);
 }
 // Walk the stack of the given thread.
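Why the switch from VMThread::vm_operation() to the static _global_dumper/_global_writer pair: do_load_class, do_class_dump and do_basic_type_array_class_dump are static callbacks handed to the class iterators, so they have no this pointer, and once a dump can be started by calling dumper.doit() directly on the VM thread (see the HeapDumper::dump() hunk below) the current VM operation is no longer guaranteed to be the heap dumper, so recovering it via VMThread::vm_operation() would break. A hedged sketch of how these callbacks are presumably driven from VM_HeapDumper::doit() (the iteration calls themselves are not part of this excerpt):

  // HPROF_LOAD_CLASS and HPROF_GC_CLASS_DUMP records for all loaded classes
  SystemDictionary::classes_do(&do_load_class);
  SystemDictionary::classes_do(&do_class_dump);
  // plus one class dump per basic-type array class
  Universe::basic_type_classes_do(&do_basic_type_array_class_dump);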
@@ -1658,6 +1667,11 @@ void VM_HeapDumper::doit() {
     ch->ensure_parsability(false);
   }
+  // At this point we should be the only dumper active, so
+  // the following should be safe.
+  set_global_dumper();
+  set_global_writer();
   // Write the file header - use 1.0.2 for large heaps, otherwise 1.0.1
   size_t used = ch->used();
   const char* header;
@@ -1667,6 +1681,7 @@ void VM_HeapDumper::doit() {
   } else {
     header = "JAVA PROFILE 1.0.1";
   }
+  // header is few bytes long - no chance to overflow int
   writer()->write_raw((void*)header, (int)strlen(header));
   writer()->write_u1(0); // terminator
@@ -1723,6 +1738,10 @@ void VM_HeapDumper::doit() {
   // fixes up the length of the dump record. In the case of a segmented
   // heap then the HPROF_HEAP_DUMP_END record is also written.
   end_of_dump();
+  // Now we clear the global variables, so that a future dumper might run.
+  clear_global_dumper();
+  clear_global_writer();
 }
 void VM_HeapDumper::dump_stack_traces() {
@@ -1790,7 +1809,12 @@ int HeapDumper::dump(const char* path) {
   // generate the dump
   VM_HeapDumper dumper(&writer, _gc_before_heap_dump);
-  VMThread::execute(&dumper);
+  if (Thread::current()->is_VM_thread()) {
+    assert(SafepointSynchronize::is_at_safepoint(), "Expected to be called at a safepoint");
+    dumper.doit();
+  } else {
+    VMThread::execute(&dumper);
+  }
   // close dump file and record any error that the writer may have encountered
   writer.close();
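The added branch is what makes the new Before/AfterFullGC dumps possible: when dump() is reached from inside a collection, the caller is already the VM thread sitting at a safepoint, so the VM_HeapDumper operation cannot simply be handed to VMThread::execute() again; instead its doit() is invoked in place, with the assert documenting the safepoint expectation. Callers on ordinary Java threads (OnOutOfMemoryError dumps, jmap, the attach listener) still go through the normal VM-operation queue.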
@@ -1845,49 +1869,68 @@ void HeapDumper::set_error(char* error) {
   }
 }
-// Called by error reporting
+// Called by error reporting by a single Java thread outside of a JVM safepoint,
+// or by heap dumping by the VM thread during a (GC) safepoint. Thus, these various
+// callers are strictly serialized and guaranteed not to interfere below. For more
+// general use, however, this method will need modification to prevent
+// inteference when updating the static variables base_path and dump_file_seq below.
 void HeapDumper::dump_heap() {
-  static char path[JVM_MAXPATHLEN];
+  static char base_path[JVM_MAXPATHLEN] = {'\0'};
+  static uint dump_file_seq = 0;
+  char my_path[JVM_MAXPATHLEN] = {'\0'};
   // The dump file defaults to java_pid<pid>.hprof in the current working
   // directory. HeapDumpPath=<file> can be used to specify an alternative
   // dump file name or a directory where dump file is created.
-  bool use_default_filename = true;
-  if (HeapDumpPath == NULL || HeapDumpPath[0] == '\0') {
-    path[0] = '\0'; // HeapDumpPath=<file> not specified
-  } else {
-    assert(strlen(HeapDumpPath) < sizeof(path), "HeapDumpPath too long");
-    strcpy(path, HeapDumpPath);
-    // check if the path is a directory (must exist)
-    DIR* dir = os::opendir(path);
-    if (dir == NULL) {
-      use_default_filename = false;
+  if (dump_file_seq == 0) { // first time in, we initialize base_path
+    bool use_default_filename = true;
+    if (HeapDumpPath == NULL || HeapDumpPath[0] == '\0') {
+      // HeapDumpPath=<file> not specified
     } else {
-      // HeapDumpPath specified a directory. We append a file separator
-      // (if needed).
-      os::closedir(dir);
-      size_t fs_len = strlen(os::file_separator());
-      if (strlen(path) >= fs_len) {
-        char* end = path;
-        end += (strlen(path) - fs_len);
-        if (strcmp(end, os::file_separator()) != 0) {
-          assert(strlen(path) + strlen(os::file_separator()) < sizeof(path),
-            "HeapDumpPath too long");
-          strcat(path, os::file_separator());
+      assert(strlen(HeapDumpPath) < sizeof(base_path), "HeapDumpPath too long");
+      strcpy(base_path, HeapDumpPath);
+      // check if the path is a directory (must exist)
+      DIR* dir = os::opendir(base_path);
+      if (dir == NULL) {
+        use_default_filename = false;
+      } else {
+        // HeapDumpPath specified a directory. We append a file separator
+        // (if needed).
+        os::closedir(dir);
+        size_t fs_len = strlen(os::file_separator());
+        if (strlen(base_path) >= fs_len) {
+          char* end = base_path;
+          end += (strlen(base_path) - fs_len);
+          if (strcmp(end, os::file_separator()) != 0) {
+            assert(strlen(base_path) + strlen(os::file_separator()) < sizeof(base_path),
+              "HeapDumpPath too long");
+            strcat(base_path, os::file_separator());
+          }
         }
       }
     }
+    // If HeapDumpPath wasn't a file name then we append the default name
+    if (use_default_filename) {
+      char fn[32];
+      sprintf(fn, "java_pid%d", os::current_process_id());
+      assert(strlen(base_path) + strlen(fn) < sizeof(base_path), "HeapDumpPath too long");
+      strcat(base_path, fn);
+    }
+    assert(strlen(base_path) < sizeof(my_path), "Buffer too small");
+    strcpy(my_path, base_path);
+  } else {
+    // Append a sequence number id for dumps following the first
+    char fn[33];
+    sprintf(fn, ".%d", dump_file_seq);
+    assert(strlen(base_path) + strlen(fn) < sizeof(my_path), "HeapDumpPath too long");
+    strcpy(my_path, base_path);
+    strcat(my_path, fn);
   }
-  // If HeapDumpPath wasn't a file name then we append the default name
-  if (use_default_filename) {
-    char fn[32];
-    sprintf(fn, "java_pid%d.hprof", os::current_process_id());
-    assert(strlen(path) + strlen(fn) < sizeof(path), "HeapDumpPath too long");
-    strcat(path, fn);
-  }
+  dump_file_seq++;   // increment seq number for next time we dump
+  assert(strlen(".hprof") + strlen(my_path) < sizeof(my_path), "HeapDumpPath too long");
+  strcat(my_path, ".hprof");
   HeapDumper dumper(false /* no GC before heap dump */,
                     true  /* send to tty */);
-  dumper.dump(path);
+  dumper.dump(my_path);
 }
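Worked example of the naming scheme above: with no HeapDumpPath set and a VM whose pid is, say, 1234, the first dump of the run is written to java_pid1234.hprof in the current directory; because dump_file_seq is incremented on every call, a second full-GC dump in the same run becomes java_pid1234.1.hprof, a third java_pid1234.2.hprof, and so on, so repeated HeapDumpBeforeFullGC/HeapDumpAfterFullGC dumps never overwrite each other.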
@@ -53,7 +53,7 @@ class HeapDumper : public StackObj {
  public:
   HeapDumper(bool gc_before_heap_dump) :
     _gc_before_heap_dump(gc_before_heap_dump), _error(NULL), _print_to_tty(false) { }
   HeapDumper(bool gc_before_heap_dump, bool print_to_tty) :
     _gc_before_heap_dump(gc_before_heap_dump), _error(NULL), _print_to_tty(print_to_tty) { }
......