Commit c290220e authored by: ysr

6631166: CMS: better heuristics when combatting fragmentation

Summary: Autonomic per-worker free block cache sizing, tunable coalition policies, fixes to per-size block statistics, retuned gain and bandwidth of some feedback loop filters to allow quicker reactivity to abrupt changes in ambient demand, and other heuristics to reduce fragmentation of the CMS old gen. Also tightened some assertions, including those related to locking.
Reviewed-by: jmasa
Parent: 7f92f7a2
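The core of the change is an autonomic sizing loop for the per-worker free block caches (the CFLS_LAB structures): instead of claiming a fixed CMSParPromoteBlocksToClaim blocks per refill, each block size now tracks recent demand in an exponentially weighted average and sizes the next refill from that. The following is an illustrative sketch of that feedback loop, not the HotSpot code itself; the class name RefillTarget, the seed value, the weight, and the clamp bounds are all hypothetical.

    // Sketch only: a per-size refill target driven by an exponentially
    // weighted moving average of observed demand, the idea behind
    // CFLS_LAB::_blocks_to_claim in this change.
    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    class RefillTarget {
      double _average;  // smoothed blocks-per-worker-per-scavenge for one size
      double _weight;   // fraction of each new sample folded into the average
    public:
      RefillTarget(double initial, double weight)
        : _average(initial), _weight(weight) {}

      // Fold in the demand observed during the last scavenge.
      void sample(double observed) {
        _average = _weight * observed + (1.0 - _weight) * _average;
      }

      // Blocks a worker should claim on its next refill, clamped to bounds.
      size_t blocks_to_claim(size_t min_blks, size_t max_blks) const {
        size_t want = (size_t)_average;
        return std::min(max_blks, std::max(min_blks, want));
      }
    };

    int main() {
      RefillTarget t(50.0, 0.5);        // hypothetical seed and weight
      size_t demand[] = {40, 45, 400};  // an abrupt spike in the last scavenge
      for (size_t d : demand) {
        t.sample((double)d);
        std::printf("claim %zu blocks next time\n", t.blocks_to_claim(16, 1024));
      }
      return 0;
    }

Because the average folds in only a fraction of each new sample, a one-scavenge spike moves the refill size part of the way toward the new demand rather than all at once; the separate "reactivity" boost added later in this diff handles the case where even that lag is too slow.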
...@@ -62,12 +62,13 @@ TreeList* TreeList::as_TreeList(TreeChunk* tc) { ...@@ -62,12 +62,13 @@ TreeList* TreeList::as_TreeList(TreeChunk* tc) {
tl->link_head(tc); tl->link_head(tc);
tl->link_tail(tc); tl->link_tail(tc);
tl->set_count(1); tl->set_count(1);
tl->init_statistics(); tl->init_statistics(true /* split_birth */);
tl->setParent(NULL); tl->setParent(NULL);
tl->setLeft(NULL); tl->setLeft(NULL);
tl->setRight(NULL); tl->setRight(NULL);
return tl; return tl;
} }
TreeList* TreeList::as_TreeList(HeapWord* addr, size_t size) { TreeList* TreeList::as_TreeList(HeapWord* addr, size_t size) {
TreeChunk* tc = (TreeChunk*) addr; TreeChunk* tc = (TreeChunk*) addr;
assert(size >= sizeof(TreeChunk), "Chunk is too small for a TreeChunk"); assert(size >= sizeof(TreeChunk), "Chunk is too small for a TreeChunk");
...@@ -267,6 +268,31 @@ TreeChunk* TreeList::first_available() { ...@@ -267,6 +268,31 @@ TreeChunk* TreeList::first_available() {
return retTC; return retTC;
} }
// Returns the block with the largest heap address amongst
// those in the list for this size; potentially slow and expensive,
// use with caution!
TreeChunk* TreeList::largest_address() {
guarantee(head() != NULL, "The head of the list cannot be NULL");
FreeChunk* fc = head()->next();
TreeChunk* retTC;
if (fc == NULL) {
retTC = head_as_TreeChunk();
} else {
// walk down the list and return the one with the highest
// heap address among chunks of this size.
FreeChunk* last = fc;
while (fc->next() != NULL) {
if ((HeapWord*)last < (HeapWord*)fc) {
last = fc;
}
fc = fc->next();
}
retTC = TreeChunk::as_TreeChunk(last);
}
assert(retTC->list() == this, "Wrong type of chunk.");
return retTC;
}
BinaryTreeDictionary::BinaryTreeDictionary(MemRegion mr, bool splay): BinaryTreeDictionary::BinaryTreeDictionary(MemRegion mr, bool splay):
_splay(splay) _splay(splay)
{ {
...@@ -379,7 +405,7 @@ BinaryTreeDictionary::getChunkFromTree(size_t size, Dither dither, bool splay) ...@@ -379,7 +405,7 @@ BinaryTreeDictionary::getChunkFromTree(size_t size, Dither dither, bool splay)
break; break;
} }
// The evm code reset the hint of the candidate as // The evm code reset the hint of the candidate as
// at an interrim point. Why? Seems like this leaves // at an interim point. Why? Seems like this leaves
// the hint pointing to a list that didn't work. // the hint pointing to a list that didn't work.
// curTL->set_hint(hintTL->size()); // curTL->set_hint(hintTL->size());
} }
...@@ -436,7 +462,7 @@ FreeChunk* BinaryTreeDictionary::findLargestDict() const { ...@@ -436,7 +462,7 @@ FreeChunk* BinaryTreeDictionary::findLargestDict() const {
TreeList *curTL = root(); TreeList *curTL = root();
if (curTL != NULL) { if (curTL != NULL) {
while(curTL->right() != NULL) curTL = curTL->right(); while(curTL->right() != NULL) curTL = curTL->right();
return curTL->first_available(); return curTL->largest_address();
} else { } else {
return NULL; return NULL;
} }
...@@ -664,7 +690,7 @@ void BinaryTreeDictionary::insertChunkInTree(FreeChunk* fc) { ...@@ -664,7 +690,7 @@ void BinaryTreeDictionary::insertChunkInTree(FreeChunk* fc) {
} }
} }
TreeChunk* tc = TreeChunk::as_TreeChunk(fc); TreeChunk* tc = TreeChunk::as_TreeChunk(fc);
// This chunk is being returned to the binary try. It's embedded // This chunk is being returned to the binary tree. Its embedded
// TreeList should be unused at this point. // TreeList should be unused at this point.
tc->initialize(); tc->initialize();
if (curTL != NULL) { // exact match if (curTL != NULL) { // exact match
...@@ -807,6 +833,8 @@ void BinaryTreeDictionary::dictCensusUpdate(size_t size, bool split, bool birth) ...@@ -807,6 +833,8 @@ void BinaryTreeDictionary::dictCensusUpdate(size_t size, bool split, bool birth)
} }
bool BinaryTreeDictionary::coalDictOverPopulated(size_t size) { bool BinaryTreeDictionary::coalDictOverPopulated(size_t size) {
if (FLSAlwaysCoalesceLarge) return true;
TreeList* list_of_size = findList(size); TreeList* list_of_size = findList(size);
// None of requested size implies overpopulated. // None of requested size implies overpopulated.
return list_of_size == NULL || list_of_size->coalDesired() <= 0 || return list_of_size == NULL || list_of_size->coalDesired() <= 0 ||
...@@ -854,17 +882,20 @@ class BeginSweepClosure : public AscendTreeCensusClosure { ...@@ -854,17 +882,20 @@ class BeginSweepClosure : public AscendTreeCensusClosure {
double _percentage; double _percentage;
float _inter_sweep_current; float _inter_sweep_current;
float _inter_sweep_estimate; float _inter_sweep_estimate;
float _intra_sweep_estimate;
public: public:
BeginSweepClosure(double p, float inter_sweep_current, BeginSweepClosure(double p, float inter_sweep_current,
float inter_sweep_estimate) : float inter_sweep_estimate,
float intra_sweep_estimate) :
_percentage(p), _percentage(p),
_inter_sweep_current(inter_sweep_current), _inter_sweep_current(inter_sweep_current),
_inter_sweep_estimate(inter_sweep_estimate) { } _inter_sweep_estimate(inter_sweep_estimate),
_intra_sweep_estimate(intra_sweep_estimate) { }
void do_list(FreeList* fl) { void do_list(FreeList* fl) {
double coalSurplusPercent = _percentage; double coalSurplusPercent = _percentage;
fl->compute_desired(_inter_sweep_current, _inter_sweep_estimate); fl->compute_desired(_inter_sweep_current, _inter_sweep_estimate, _intra_sweep_estimate);
fl->set_coalDesired((ssize_t)((double)fl->desired() * coalSurplusPercent)); fl->set_coalDesired((ssize_t)((double)fl->desired() * coalSurplusPercent));
fl->set_beforeSweep(fl->count()); fl->set_beforeSweep(fl->count());
fl->set_bfrSurp(fl->surplus()); fl->set_bfrSurp(fl->surplus());
...@@ -939,9 +970,10 @@ FreeChunk* BinaryTreeDictionary::find_chunk_ends_at(HeapWord* target) const { ...@@ -939,9 +970,10 @@ FreeChunk* BinaryTreeDictionary::find_chunk_ends_at(HeapWord* target) const {
} }
void BinaryTreeDictionary::beginSweepDictCensus(double coalSurplusPercent, void BinaryTreeDictionary::beginSweepDictCensus(double coalSurplusPercent,
float inter_sweep_current, float inter_sweep_estimate) { float inter_sweep_current, float inter_sweep_estimate, float intra_sweep_estimate) {
BeginSweepClosure bsc(coalSurplusPercent, inter_sweep_current, BeginSweepClosure bsc(coalSurplusPercent, inter_sweep_current,
inter_sweep_estimate); inter_sweep_estimate,
intra_sweep_estimate);
bsc.do_tree(root()); bsc.do_tree(root());
} }
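With the new intra_sweep_estimate argument, the per-size demand projection can cover allocation that happens while the sweep itself runs, not only the gap between sweeps. A minimal sketch of such a projection follows; it is a simplification, and the names SizeClassStats, demand_rate, and project_desired are hypothetical rather than the actual FreeList/AllocationStats code.

    // Sketch (assumptions above): project how many free blocks of one size
    // will be wanted before the next sweep finishes, using a smoothed demand
    // rate over both the inter-sweep gap and the expected sweep duration.
    #include <cstddef>
    #include <cstdio>

    struct SizeClassStats {
      double demand_rate;  // blocks consumed per unit time (smoothed)

      size_t project_desired(float inter_sweep_estimate,
                             float intra_sweep_estimate) const {
        // Cover the whole window: the time between sweeps plus the time the
        // next sweep itself is expected to take.
        double window = (double)inter_sweep_estimate + (double)intra_sweep_estimate;
        double desired = demand_rate * window;
        return desired > 0.0 ? (size_t)desired : 0;
      }
    };

    int main() {
      SizeClassStats s{12.5};  // hypothetical demand of 12.5 blocks per second
      std::printf("desired = %zu blocks\n", s.project_desired(4.0f, 1.0f));
      return 0;
    }

The coalescing and splitting surpluses computed from this desired count then use the CMSSmall*/CMSLarge* surplus percentages introduced elsewhere in this change.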
...@@ -1077,13 +1109,13 @@ void BinaryTreeDictionary::reportStatistics() const { ...@@ -1077,13 +1109,13 @@ void BinaryTreeDictionary::reportStatistics() const {
// Print census information - counts, births, deaths, etc. // Print census information - counts, births, deaths, etc.
// for each list in the tree. Also print some summary // for each list in the tree. Also print some summary
// information. // information.
class printTreeCensusClosure : public AscendTreeCensusClosure { class PrintTreeCensusClosure : public AscendTreeCensusClosure {
int _print_line; int _print_line;
size_t _totalFree; size_t _totalFree;
FreeList _total; FreeList _total;
public: public:
printTreeCensusClosure() { PrintTreeCensusClosure() {
_print_line = 0; _print_line = 0;
_totalFree = 0; _totalFree = 0;
} }
...@@ -1113,7 +1145,7 @@ void BinaryTreeDictionary::printDictCensus(void) const { ...@@ -1113,7 +1145,7 @@ void BinaryTreeDictionary::printDictCensus(void) const {
gclog_or_tty->print("\nBinaryTree\n"); gclog_or_tty->print("\nBinaryTree\n");
FreeList::print_labels_on(gclog_or_tty, "size"); FreeList::print_labels_on(gclog_or_tty, "size");
printTreeCensusClosure ptc; PrintTreeCensusClosure ptc;
ptc.do_tree(root()); ptc.do_tree(root());
FreeList* total = ptc.total(); FreeList* total = ptc.total();
...@@ -1130,6 +1162,38 @@ void BinaryTreeDictionary::printDictCensus(void) const { ...@@ -1130,6 +1162,38 @@ void BinaryTreeDictionary::printDictCensus(void) const {
/(total->desired() != 0 ? (double)total->desired() : 1.0)); /(total->desired() != 0 ? (double)total->desired() : 1.0));
} }
class PrintFreeListsClosure : public AscendTreeCensusClosure {
outputStream* _st;
int _print_line;
public:
PrintFreeListsClosure(outputStream* st) {
_st = st;
_print_line = 0;
}
void do_list(FreeList* fl) {
if (++_print_line >= 40) {
FreeList::print_labels_on(_st, "size");
_print_line = 0;
}
fl->print_on(gclog_or_tty);
size_t sz = fl->size();
for (FreeChunk* fc = fl->head(); fc != NULL;
fc = fc->next()) {
_st->print_cr("\t[" PTR_FORMAT "," PTR_FORMAT ") %s",
fc, (HeapWord*)fc + sz,
fc->cantCoalesce() ? "\t CC" : "");
}
}
};
void BinaryTreeDictionary::print_free_lists(outputStream* st) const {
FreeList::print_labels_on(st, "size");
PrintFreeListsClosure pflc(st);
pflc.do_tree(root());
}
// Verify the following tree invariants: // Verify the following tree invariants:
// . _root has no parent // . _root has no parent
// . parent and child point to each other // . parent and child point to each other
......
...@@ -42,9 +42,6 @@ class TreeList: public FreeList { ...@@ -42,9 +42,6 @@ class TreeList: public FreeList {
friend class AscendTreeCensusClosure; friend class AscendTreeCensusClosure;
friend class DescendTreeCensusClosure; friend class DescendTreeCensusClosure;
friend class DescendTreeSearchClosure; friend class DescendTreeSearchClosure;
TreeList* _parent;
TreeList* _left;
TreeList* _right;
protected: protected:
TreeList* parent() const { return _parent; } TreeList* parent() const { return _parent; }
...@@ -82,6 +79,11 @@ class TreeList: public FreeList { ...@@ -82,6 +79,11 @@ class TreeList: public FreeList {
// to a TreeChunk. // to a TreeChunk.
TreeChunk* first_available(); TreeChunk* first_available();
// Returns the block with the largest heap address amongst
// those in the list for this size; potentially slow and expensive,
// use with caution!
TreeChunk* largest_address();
// removeChunkReplaceIfNeeded() removes the given "tc" from the TreeList. // removeChunkReplaceIfNeeded() removes the given "tc" from the TreeList.
// If "tc" is the first chunk in the list, it is also the // If "tc" is the first chunk in the list, it is also the
// TreeList that is the node in the tree. removeChunkReplaceIfNeeded() // TreeList that is the node in the tree. removeChunkReplaceIfNeeded()
...@@ -254,8 +256,9 @@ class BinaryTreeDictionary: public FreeBlockDictionary { ...@@ -254,8 +256,9 @@ class BinaryTreeDictionary: public FreeBlockDictionary {
// Methods called at the beginning of a sweep to prepare the // Methods called at the beginning of a sweep to prepare the
// statistics for the sweep. // statistics for the sweep.
void beginSweepDictCensus(double coalSurplusPercent, void beginSweepDictCensus(double coalSurplusPercent,
float sweep_current, float inter_sweep_current,
float sweep_estimate); float inter_sweep_estimate,
float intra_sweep_estimate);
// Methods called after the end of a sweep to modify the // Methods called after the end of a sweep to modify the
// statistics for the sweep. // statistics for the sweep.
void endSweepDictCensus(double splitSurplusPercent); void endSweepDictCensus(double splitSurplusPercent);
...@@ -269,6 +272,7 @@ class BinaryTreeDictionary: public FreeBlockDictionary { ...@@ -269,6 +272,7 @@ class BinaryTreeDictionary: public FreeBlockDictionary {
// Print the statistcis for all the lists in the tree. Also may // Print the statistcis for all the lists in the tree. Also may
// print out summaries. // print out summaries.
void printDictCensus(void) const; void printDictCensus(void) const;
void print_free_lists(outputStream* st) const;
// For debugging. Returns the sum of the _returnedBytes for // For debugging. Returns the sum of the _returnedBytes for
// all lists in the tree. // all lists in the tree.
......
...@@ -32,7 +32,9 @@ ...@@ -32,7 +32,9 @@
// threads. The second argument is in support of an extra locking // threads. The second argument is in support of an extra locking
// check for CFL spaces' free list locks. // check for CFL spaces' free list locks.
#ifndef PRODUCT #ifndef PRODUCT
void CMSLockVerifier::assert_locked(const Mutex* lock, const Mutex* p_lock) { void CMSLockVerifier::assert_locked(const Mutex* lock,
const Mutex* p_lock1,
const Mutex* p_lock2) {
if (!Universe::is_fully_initialized()) { if (!Universe::is_fully_initialized()) {
return; return;
} }
...@@ -40,7 +42,7 @@ void CMSLockVerifier::assert_locked(const Mutex* lock, const Mutex* p_lock) { ...@@ -40,7 +42,7 @@ void CMSLockVerifier::assert_locked(const Mutex* lock, const Mutex* p_lock) {
Thread* myThread = Thread::current(); Thread* myThread = Thread::current();
if (lock == NULL) { // a "lock-free" structure, e.g. MUT, protected by CMS token if (lock == NULL) { // a "lock-free" structure, e.g. MUT, protected by CMS token
assert(p_lock == NULL, "Unexpected state"); assert(p_lock1 == NULL && p_lock2 == NULL, "Unexpected caller error");
if (myThread->is_ConcurrentGC_thread()) { if (myThread->is_ConcurrentGC_thread()) {
// This test might have to change in the future, if there can be // This test might have to change in the future, if there can be
// multiple peer CMS threads. But for now, if we're testing the CMS // multiple peer CMS threads. But for now, if we're testing the CMS
@@ -60,36 +62,39 @@ void CMSLockVerifier::assert_locked(const Mutex* lock, const Mutex* p_lock) {
     return;
   }
-  if (ParallelGCThreads == 0) {
+  if (myThread->is_VM_thread()
+      || myThread->is_ConcurrentGC_thread()
+      || myThread->is_Java_thread()) {
+    // Make sure that we are holding the associated lock.
     assert_lock_strong(lock);
-  } else {
-    if (myThread->is_VM_thread()
-        || myThread->is_ConcurrentGC_thread()
-        || myThread->is_Java_thread()) {
-      // Make sure that we are holding the associated lock.
-      assert_lock_strong(lock);
-      // The checking of p_lock is a spl case for CFLS' free list
-      // locks: we make sure that none of the parallel GC work gang
-      // threads are holding "sub-locks" of freeListLock(). We check only
-      // the parDictionaryAllocLock because the others are too numerous.
-      // This spl case code is somewhat ugly and any improvements
-      // are welcome XXX FIX ME!!
-      if (p_lock != NULL) {
-        assert(!p_lock->is_locked() || p_lock->owned_by_self(),
-               "Possible race between this and parallel GC threads");
-      }
-    } else if (myThread->is_GC_task_thread()) {
-      // Make sure that the VM or CMS thread holds lock on our behalf
-      // XXX If there were a concept of a gang_master for a (set of)
-      // gang_workers, we could have used the identity of that thread
-      // for checking ownership here; for now we just disjunct.
-      assert(lock->owner() == VMThread::vm_thread() ||
-             lock->owner() == ConcurrentMarkSweepThread::cmst(),
-             "Should be locked by VM thread or CMS thread on my behalf");
-    } else {
-      // Make sure we didn't miss some obscure corner case
-      ShouldNotReachHere();
-    }
+    // The checking of p_lock is a spl case for CFLS' free list
+    // locks: we make sure that none of the parallel GC work gang
+    // threads are holding "sub-locks" of freeListLock(). We check only
+    // the parDictionaryAllocLock because the others are too numerous.
+    // This spl case code is somewhat ugly and any improvements
+    // are welcome.
+    assert(p_lock1 == NULL || !p_lock1->is_locked() || p_lock1->owned_by_self(),
+           "Possible race between this and parallel GC threads");
+    assert(p_lock2 == NULL || !p_lock2->is_locked() || p_lock2->owned_by_self(),
+           "Possible race between this and parallel GC threads");
+  } else if (myThread->is_GC_task_thread()) {
+    // Make sure that the VM or CMS thread holds lock on our behalf
+    // XXX If there were a concept of a gang_master for a (set of)
+    // gang_workers, we could have used the identity of that thread
+    // for checking ownership here; for now we just disjunct.
+    assert(lock->owner() == VMThread::vm_thread() ||
+           lock->owner() == ConcurrentMarkSweepThread::cmst(),
+           "Should be locked by VM thread or CMS thread on my behalf");
+    if (p_lock1 != NULL) {
+      assert_lock_strong(p_lock1);
+    }
+    if (p_lock2 != NULL) {
+      assert_lock_strong(p_lock2);
+    }
+  } else {
+    // Make sure we didn't miss some other thread type calling into here;
+    // perhaps as a result of future VM evolution.
+    ShouldNotReachHere();
   }
 }
 #endif
...@@ -29,8 +29,11 @@ ...@@ -29,8 +29,11 @@
// the parallel threads. // the parallel threads.
class CMSLockVerifier: AllStatic { class CMSLockVerifier: AllStatic {
public: public:
static void assert_locked(const Mutex* lock, const Mutex* p_lock) static void assert_locked(const Mutex* lock, const Mutex* p_lock1, const Mutex* p_lock2)
PRODUCT_RETURN; PRODUCT_RETURN;
static void assert_locked(const Mutex* lock, const Mutex* p_lock) {
assert_locked(lock, p_lock, NULL);
}
static void assert_locked(const Mutex* lock) { static void assert_locked(const Mutex* lock) {
assert_locked(lock, NULL); assert_locked(lock, NULL);
} }
......
...@@ -62,18 +62,15 @@ CompactibleFreeListSpace::CompactibleFreeListSpace(BlockOffsetSharedArray* bs, ...@@ -62,18 +62,15 @@ CompactibleFreeListSpace::CompactibleFreeListSpace(BlockOffsetSharedArray* bs,
// implementation, namely, the simple binary tree (splaying // implementation, namely, the simple binary tree (splaying
// temporarily disabled). // temporarily disabled).
switch (dictionaryChoice) { switch (dictionaryChoice) {
case FreeBlockDictionary::dictionaryBinaryTree:
_dictionary = new BinaryTreeDictionary(mr);
break;
case FreeBlockDictionary::dictionarySplayTree: case FreeBlockDictionary::dictionarySplayTree:
case FreeBlockDictionary::dictionarySkipList: case FreeBlockDictionary::dictionarySkipList:
default: default:
warning("dictionaryChoice: selected option not understood; using" warning("dictionaryChoice: selected option not understood; using"
" default BinaryTreeDictionary implementation instead."); " default BinaryTreeDictionary implementation instead.");
case FreeBlockDictionary::dictionaryBinaryTree:
_dictionary = new BinaryTreeDictionary(mr); _dictionary = new BinaryTreeDictionary(mr);
break; break;
} }
splitBirth(mr.word_size());
assert(_dictionary != NULL, "CMS dictionary initialization"); assert(_dictionary != NULL, "CMS dictionary initialization");
// The indexed free lists are initially all empty and are lazily // The indexed free lists are initially all empty and are lazily
// filled in on demand. Initialize the array elements to NULL. // filled in on demand. Initialize the array elements to NULL.
...@@ -388,6 +385,105 @@ size_t CompactibleFreeListSpace::max_alloc_in_words() const { ...@@ -388,6 +385,105 @@ size_t CompactibleFreeListSpace::max_alloc_in_words() const {
return res; return res;
} }
void CompactibleFreeListSpace::print_indexed_free_lists(outputStream* st)
const {
reportIndexedFreeListStatistics();
gclog_or_tty->print_cr("Layout of Indexed Freelists");
gclog_or_tty->print_cr("---------------------------");
FreeList::print_labels_on(st, "size");
for (size_t i = IndexSetStart; i < IndexSetSize; i += IndexSetStride) {
_indexedFreeList[i].print_on(gclog_or_tty);
for (FreeChunk* fc = _indexedFreeList[i].head(); fc != NULL;
fc = fc->next()) {
gclog_or_tty->print_cr("\t[" PTR_FORMAT "," PTR_FORMAT ") %s",
fc, (HeapWord*)fc + i,
fc->cantCoalesce() ? "\t CC" : "");
}
}
}
void CompactibleFreeListSpace::print_promo_info_blocks(outputStream* st)
const {
_promoInfo.print_on(st);
}
void CompactibleFreeListSpace::print_dictionary_free_lists(outputStream* st)
const {
_dictionary->reportStatistics();
st->print_cr("Layout of Freelists in Tree");
st->print_cr("---------------------------");
_dictionary->print_free_lists(st);
}
class BlkPrintingClosure: public BlkClosure {
const CMSCollector* _collector;
const CompactibleFreeListSpace* _sp;
const CMSBitMap* _live_bit_map;
const bool _post_remark;
outputStream* _st;
public:
BlkPrintingClosure(const CMSCollector* collector,
const CompactibleFreeListSpace* sp,
const CMSBitMap* live_bit_map,
outputStream* st):
_collector(collector),
_sp(sp),
_live_bit_map(live_bit_map),
_post_remark(collector->abstract_state() > CMSCollector::FinalMarking),
_st(st) { }
size_t do_blk(HeapWord* addr);
};
size_t BlkPrintingClosure::do_blk(HeapWord* addr) {
size_t sz = _sp->block_size_no_stall(addr, _collector);
assert(sz != 0, "Should always be able to compute a size");
if (_sp->block_is_obj(addr)) {
const bool dead = _post_remark && !_live_bit_map->isMarked(addr);
_st->print_cr(PTR_FORMAT ": %s object of size " SIZE_FORMAT "%s",
addr,
dead ? "dead" : "live",
sz,
(!dead && CMSPrintObjectsInDump) ? ":" : ".");
if (CMSPrintObjectsInDump && !dead) {
oop(addr)->print_on(_st);
_st->print_cr("--------------------------------------");
}
} else { // free block
_st->print_cr(PTR_FORMAT ": free block of size " SIZE_FORMAT "%s",
addr, sz, CMSPrintChunksInDump ? ":" : ".");
if (CMSPrintChunksInDump) {
((FreeChunk*)addr)->print_on(_st);
_st->print_cr("--------------------------------------");
}
}
return sz;
}
void CompactibleFreeListSpace::dump_at_safepoint_with_locks(CMSCollector* c,
outputStream* st) {
st->print_cr("\n=========================");
st->print_cr("Block layout in CMS Heap:");
st->print_cr("=========================");
BlkPrintingClosure bpcl(c, this, c->markBitMap(), st);
blk_iterate(&bpcl);
st->print_cr("\n=======================================");
st->print_cr("Order & Layout of Promotion Info Blocks");
st->print_cr("=======================================");
print_promo_info_blocks(st);
st->print_cr("\n===========================");
st->print_cr("Order of Indexed Free Lists");
st->print_cr("=========================");
print_indexed_free_lists(st);
st->print_cr("\n=================================");
st->print_cr("Order of Free Lists in Dictionary");
st->print_cr("=================================");
print_dictionary_free_lists(st);
}
void CompactibleFreeListSpace::reportFreeListStatistics() const { void CompactibleFreeListSpace::reportFreeListStatistics() const {
assert_lock_strong(&_freelistLock); assert_lock_strong(&_freelistLock);
assert(PrintFLSStatistics != 0, "Reporting error"); assert(PrintFLSStatistics != 0, "Reporting error");
...@@ -449,37 +545,37 @@ void CompactibleFreeListSpace::set_end(HeapWord* value) { ...@@ -449,37 +545,37 @@ void CompactibleFreeListSpace::set_end(HeapWord* value) {
if (prevEnd != NULL) { if (prevEnd != NULL) {
// Resize the underlying block offset table. // Resize the underlying block offset table.
_bt.resize(pointer_delta(value, bottom())); _bt.resize(pointer_delta(value, bottom()));
if (value <= prevEnd) { if (value <= prevEnd) {
assert(value >= unallocated_block(), "New end is below unallocated block"); assert(value >= unallocated_block(), "New end is below unallocated block");
} else {
// Now, take this new chunk and add it to the free blocks.
// Note that the BOT has not yet been updated for this block.
size_t newFcSize = pointer_delta(value, prevEnd);
// XXX This is REALLY UGLY and should be fixed up. XXX
if (!_adaptive_freelists && _smallLinearAllocBlock._ptr == NULL) {
// Mark the boundary of the new block in BOT
_bt.mark_block(prevEnd, value);
// put it all in the linAB
if (ParallelGCThreads == 0) {
_smallLinearAllocBlock._ptr = prevEnd;
_smallLinearAllocBlock._word_size = newFcSize;
repairLinearAllocBlock(&_smallLinearAllocBlock);
} else { // ParallelGCThreads > 0
MutexLockerEx x(parDictionaryAllocLock(),
Mutex::_no_safepoint_check_flag);
_smallLinearAllocBlock._ptr = prevEnd;
_smallLinearAllocBlock._word_size = newFcSize;
repairLinearAllocBlock(&_smallLinearAllocBlock);
}
// Births of chunks put into a LinAB are not recorded. Births
// of chunks as they are allocated out of a LinAB are.
} else { } else {
// Add the block to the free lists, if possible coalescing it // Now, take this new chunk and add it to the free blocks.
// with the last free block, and update the BOT and census data. // Note that the BOT has not yet been updated for this block.
addChunkToFreeListsAtEndRecordingStats(prevEnd, newFcSize); size_t newFcSize = pointer_delta(value, prevEnd);
// XXX This is REALLY UGLY and should be fixed up. XXX
if (!_adaptive_freelists && _smallLinearAllocBlock._ptr == NULL) {
// Mark the boundary of the new block in BOT
_bt.mark_block(prevEnd, value);
// put it all in the linAB
if (ParallelGCThreads == 0) {
_smallLinearAllocBlock._ptr = prevEnd;
_smallLinearAllocBlock._word_size = newFcSize;
repairLinearAllocBlock(&_smallLinearAllocBlock);
} else { // ParallelGCThreads > 0
MutexLockerEx x(parDictionaryAllocLock(),
Mutex::_no_safepoint_check_flag);
_smallLinearAllocBlock._ptr = prevEnd;
_smallLinearAllocBlock._word_size = newFcSize;
repairLinearAllocBlock(&_smallLinearAllocBlock);
}
// Births of chunks put into a LinAB are not recorded. Births
// of chunks as they are allocated out of a LinAB are.
} else {
// Add the block to the free lists, if possible coalescing it
// with the last free block, and update the BOT and census data.
addChunkToFreeListsAtEndRecordingStats(prevEnd, newFcSize);
}
} }
} }
}
} }
class FreeListSpace_DCTOC : public Filtering_DCTOC { class FreeListSpace_DCTOC : public Filtering_DCTOC {
...@@ -732,7 +828,7 @@ void CompactibleFreeListSpace::safe_object_iterate(ObjectClosure* blk) { ...@@ -732,7 +828,7 @@ void CompactibleFreeListSpace::safe_object_iterate(ObjectClosure* blk) {
void CompactibleFreeListSpace::object_iterate_mem(MemRegion mr, void CompactibleFreeListSpace::object_iterate_mem(MemRegion mr,
UpwardsObjectClosure* cl) { UpwardsObjectClosure* cl) {
assert_locked(); assert_locked(freelistLock());
NOT_PRODUCT(verify_objects_initialized()); NOT_PRODUCT(verify_objects_initialized());
Space::object_iterate_mem(mr, cl); Space::object_iterate_mem(mr, cl);
} }
...@@ -1212,12 +1308,15 @@ bool CompactibleFreeListSpace::verifyChunkInFreeLists(FreeChunk* fc) const { ...@@ -1212,12 +1308,15 @@ bool CompactibleFreeListSpace::verifyChunkInFreeLists(FreeChunk* fc) const {
void CompactibleFreeListSpace::assert_locked() const { void CompactibleFreeListSpace::assert_locked() const {
CMSLockVerifier::assert_locked(freelistLock(), parDictionaryAllocLock()); CMSLockVerifier::assert_locked(freelistLock(), parDictionaryAllocLock());
} }
void CompactibleFreeListSpace::assert_locked(const Mutex* lock) const {
CMSLockVerifier::assert_locked(lock);
}
#endif #endif
FreeChunk* CompactibleFreeListSpace::allocateScratch(size_t size) { FreeChunk* CompactibleFreeListSpace::allocateScratch(size_t size) {
// In the parallel case, the main thread holds the free list lock // In the parallel case, the main thread holds the free list lock
// on behalf the parallel threads. // on behalf the parallel threads.
assert_locked();
FreeChunk* fc; FreeChunk* fc;
{ {
// If GC is parallel, this might be called by several threads. // If GC is parallel, this might be called by several threads.
...@@ -1298,17 +1397,18 @@ CompactibleFreeListSpace::getChunkFromLinearAllocBlock(LinearAllocBlock *blk, ...@@ -1298,17 +1397,18 @@ CompactibleFreeListSpace::getChunkFromLinearAllocBlock(LinearAllocBlock *blk,
res = blk->_ptr; res = blk->_ptr;
_bt.allocated(res, blk->_word_size); _bt.allocated(res, blk->_word_size);
} else if (size + MinChunkSize <= blk->_refillSize) { } else if (size + MinChunkSize <= blk->_refillSize) {
size_t sz = blk->_word_size;
// Update _unallocated_block if the size is such that chunk would be // Update _unallocated_block if the size is such that chunk would be
// returned to the indexed free list. All other chunks in the indexed // returned to the indexed free list. All other chunks in the indexed
// free lists are allocated from the dictionary so that _unallocated_block // free lists are allocated from the dictionary so that _unallocated_block
// has already been adjusted for them. Do it here so that the cost // has already been adjusted for them. Do it here so that the cost
// for all chunks added back to the indexed free lists. // for all chunks added back to the indexed free lists.
if (blk->_word_size < SmallForDictionary) { if (sz < SmallForDictionary) {
_bt.allocated(blk->_ptr, blk->_word_size); _bt.allocated(blk->_ptr, sz);
} }
// Return the chunk that isn't big enough, and then refill below. // Return the chunk that isn't big enough, and then refill below.
addChunkToFreeLists(blk->_ptr, blk->_word_size); addChunkToFreeLists(blk->_ptr, sz);
_bt.verify_single_block(blk->_ptr, (blk->_ptr + blk->_word_size)); splitBirth(sz);
// Don't keep statistics on adding back chunk from a LinAB. // Don't keep statistics on adding back chunk from a LinAB.
} else { } else {
// A refilled block would not satisfy the request. // A refilled block would not satisfy the request.
...@@ -1376,11 +1476,13 @@ CompactibleFreeListSpace::getChunkFromIndexedFreeList(size_t size) { ...@@ -1376,11 +1476,13 @@ CompactibleFreeListSpace::getChunkFromIndexedFreeList(size_t size) {
res = getChunkFromIndexedFreeListHelper(size); res = getChunkFromIndexedFreeListHelper(size);
} }
_bt.verify_not_unallocated((HeapWord*) res, size); _bt.verify_not_unallocated((HeapWord*) res, size);
assert(res == NULL || res->size() == size, "Incorrect block size");
return res; return res;
} }
FreeChunk* FreeChunk*
CompactibleFreeListSpace::getChunkFromIndexedFreeListHelper(size_t size) { CompactibleFreeListSpace::getChunkFromIndexedFreeListHelper(size_t size,
bool replenish) {
assert_locked(); assert_locked();
FreeChunk* fc = NULL; FreeChunk* fc = NULL;
if (size < SmallForDictionary) { if (size < SmallForDictionary) {
...@@ -1398,54 +1500,66 @@ CompactibleFreeListSpace::getChunkFromIndexedFreeListHelper(size_t size) { ...@@ -1398,54 +1500,66 @@ CompactibleFreeListSpace::getChunkFromIndexedFreeListHelper(size_t size) {
// and replenishing indexed lists from the small linAB. // and replenishing indexed lists from the small linAB.
// //
FreeChunk* newFc = NULL; FreeChunk* newFc = NULL;
size_t replenish_size = CMSIndexedFreeListReplenish * size; const size_t replenish_size = CMSIndexedFreeListReplenish * size;
if (replenish_size < SmallForDictionary) { if (replenish_size < SmallForDictionary) {
// Do not replenish from an underpopulated size. // Do not replenish from an underpopulated size.
if (_indexedFreeList[replenish_size].surplus() > 0 && if (_indexedFreeList[replenish_size].surplus() > 0 &&
_indexedFreeList[replenish_size].head() != NULL) { _indexedFreeList[replenish_size].head() != NULL) {
newFc = newFc = _indexedFreeList[replenish_size].getChunkAtHead();
_indexedFreeList[replenish_size].getChunkAtHead(); } else if (bestFitFirst()) {
} else {
newFc = bestFitSmall(replenish_size); newFc = bestFitSmall(replenish_size);
} }
} }
if (newFc != NULL) { if (newFc == NULL && replenish_size > size) {
splitDeath(replenish_size);
} else if (replenish_size > size) {
assert(CMSIndexedFreeListReplenish > 1, "ctl pt invariant"); assert(CMSIndexedFreeListReplenish > 1, "ctl pt invariant");
newFc = newFc = getChunkFromIndexedFreeListHelper(replenish_size, false);
getChunkFromIndexedFreeListHelper(replenish_size);
} }
// Note: The stats update re split-death of block obtained above
// will be recorded below precisely when we know we are going to
// be actually splitting it into more than one pieces below.
if (newFc != NULL) { if (newFc != NULL) {
assert(newFc->size() == replenish_size, "Got wrong size"); if (replenish || CMSReplenishIntermediate) {
size_t i; // Replenish this list and return one block to caller.
FreeChunk *curFc, *nextFc; size_t i;
// carve up and link blocks 0, ..., CMSIndexedFreeListReplenish - 2 FreeChunk *curFc, *nextFc;
// The last chunk is not added to the lists but is returned as the size_t num_blk = newFc->size() / size;
// free chunk. assert(num_blk >= 1, "Smaller than requested?");
for (curFc = newFc, nextFc = (FreeChunk*)((HeapWord*)curFc + size), assert(newFc->size() % size == 0, "Should be integral multiple of request");
i = 0; if (num_blk > 1) {
i < (CMSIndexedFreeListReplenish - 1); // we are sure we will be splitting the block just obtained
curFc = nextFc, nextFc = (FreeChunk*)((HeapWord*)nextFc + size), // into multiple pieces; record the split-death of the original
i++) { splitDeath(replenish_size);
}
// carve up and link blocks 0, ..., num_blk - 2
// The last chunk is not added to the lists but is returned as the
// free chunk.
for (curFc = newFc, nextFc = (FreeChunk*)((HeapWord*)curFc + size),
i = 0;
i < (num_blk - 1);
curFc = nextFc, nextFc = (FreeChunk*)((HeapWord*)nextFc + size),
i++) {
curFc->setSize(size);
// Don't record this as a return in order to try and
// determine the "returns" from a GC.
_bt.verify_not_unallocated((HeapWord*) fc, size);
_indexedFreeList[size].returnChunkAtTail(curFc, false);
_bt.mark_block((HeapWord*)curFc, size);
splitBirth(size);
// Don't record the initial population of the indexed list
// as a split birth.
}
// check that the arithmetic was OK above
assert((HeapWord*)nextFc == (HeapWord*)newFc + num_blk*size,
"inconsistency in carving newFc");
curFc->setSize(size); curFc->setSize(size);
// Don't record this as a return in order to try and
// determine the "returns" from a GC.
_bt.verify_not_unallocated((HeapWord*) fc, size);
_indexedFreeList[size].returnChunkAtTail(curFc, false);
_bt.mark_block((HeapWord*)curFc, size); _bt.mark_block((HeapWord*)curFc, size);
splitBirth(size); splitBirth(size);
// Don't record the initial population of the indexed list fc = curFc;
// as a split birth. } else {
// Return entire block to caller
fc = newFc;
} }
// check that the arithmetic was OK above
assert((HeapWord*)nextFc == (HeapWord*)newFc + replenish_size,
"inconsistency in carving newFc");
curFc->setSize(size);
_bt.mark_block((HeapWord*)curFc, size);
splitBirth(size);
return curFc;
} }
} }
} else { } else {
...@@ -1453,7 +1567,7 @@ CompactibleFreeListSpace::getChunkFromIndexedFreeListHelper(size_t size) { ...@@ -1453,7 +1567,7 @@ CompactibleFreeListSpace::getChunkFromIndexedFreeListHelper(size_t size) {
// replenish the indexed free list. // replenish the indexed free list.
fc = getChunkFromDictionaryExact(size); fc = getChunkFromDictionaryExact(size);
} }
assert(fc == NULL || fc->isFree(), "Should be returning a free chunk"); // assert(fc == NULL || fc->isFree(), "Should be returning a free chunk");
return fc; return fc;
} }
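To make the replenish path above concrete, here is a worked example with made-up numbers: a request for an 8-word chunk finds _indexedFreeList[8] empty, and with CMSIndexedFreeListReplenish at 4 the helper looks for a 32-word block, say from the surplus of the 32-word indexed list. Because replenishment is enabled for the top-level call (or CMSReplenishIntermediate is set), the 32-word block is carved into four 8-word chunks: the 32-word size records one split death, each carved chunk records an 8-word split birth, three of the chunks are appended to _indexedFreeList[8], and the fourth is handed back to the caller. When replenishment is not enabled for an intermediate recursive call and the block obtained is larger than requested, the helper may instead hand back the entire larger block and let its caller do the carving.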
...@@ -1512,6 +1626,11 @@ CompactibleFreeListSpace::returnChunkToDictionary(FreeChunk* chunk) { ...@@ -1512,6 +1626,11 @@ CompactibleFreeListSpace::returnChunkToDictionary(FreeChunk* chunk) {
// adjust _unallocated_block downward, as necessary // adjust _unallocated_block downward, as necessary
_bt.freed((HeapWord*)chunk, size); _bt.freed((HeapWord*)chunk, size);
_dictionary->returnChunk(chunk); _dictionary->returnChunk(chunk);
#ifndef PRODUCT
if (CMSCollector::abstract_state() != CMSCollector::Sweeping) {
TreeChunk::as_TreeChunk(chunk)->list()->verify_stats();
}
#endif // PRODUCT
} }
void void
...@@ -1525,6 +1644,11 @@ CompactibleFreeListSpace::returnChunkToFreeList(FreeChunk* fc) { ...@@ -1525,6 +1644,11 @@ CompactibleFreeListSpace::returnChunkToFreeList(FreeChunk* fc) {
} else { } else {
_indexedFreeList[size].returnChunkAtHead(fc); _indexedFreeList[size].returnChunkAtHead(fc);
} }
#ifndef PRODUCT
if (CMSCollector::abstract_state() != CMSCollector::Sweeping) {
_indexedFreeList[size].verify_stats();
}
#endif // PRODUCT
} }
// Add chunk to end of last block -- if it's the largest // Add chunk to end of last block -- if it's the largest
...@@ -1537,7 +1661,6 @@ CompactibleFreeListSpace::addChunkToFreeListsAtEndRecordingStats( ...@@ -1537,7 +1661,6 @@ CompactibleFreeListSpace::addChunkToFreeListsAtEndRecordingStats(
HeapWord* chunk, size_t size) { HeapWord* chunk, size_t size) {
// check that the chunk does lie in this space! // check that the chunk does lie in this space!
assert(chunk != NULL && is_in_reserved(chunk), "Not in this space!"); assert(chunk != NULL && is_in_reserved(chunk), "Not in this space!");
assert_locked();
// One of the parallel gc task threads may be here // One of the parallel gc task threads may be here
// whilst others are allocating. // whilst others are allocating.
Mutex* lock = NULL; Mutex* lock = NULL;
...@@ -1991,24 +2114,26 @@ double CompactibleFreeListSpace::flsFrag() const { ...@@ -1991,24 +2114,26 @@ double CompactibleFreeListSpace::flsFrag() const {
return frag; return frag;
} }
#define CoalSurplusPercent 1.05
#define SplitSurplusPercent 1.10
void CompactibleFreeListSpace::beginSweepFLCensus( void CompactibleFreeListSpace::beginSweepFLCensus(
float inter_sweep_current, float inter_sweep_current,
float inter_sweep_estimate) { float inter_sweep_estimate,
float intra_sweep_estimate) {
assert_locked(); assert_locked();
size_t i; size_t i;
for (i = IndexSetStart; i < IndexSetSize; i += IndexSetStride) { for (i = IndexSetStart; i < IndexSetSize; i += IndexSetStride) {
FreeList* fl = &_indexedFreeList[i]; FreeList* fl = &_indexedFreeList[i];
fl->compute_desired(inter_sweep_current, inter_sweep_estimate); if (PrintFLSStatistics > 1) {
fl->set_coalDesired((ssize_t)((double)fl->desired() * CoalSurplusPercent)); gclog_or_tty->print("size[%d] : ", i);
}
fl->compute_desired(inter_sweep_current, inter_sweep_estimate, intra_sweep_estimate);
fl->set_coalDesired((ssize_t)((double)fl->desired() * CMSSmallCoalSurplusPercent));
fl->set_beforeSweep(fl->count()); fl->set_beforeSweep(fl->count());
fl->set_bfrSurp(fl->surplus()); fl->set_bfrSurp(fl->surplus());
} }
_dictionary->beginSweepDictCensus(CoalSurplusPercent, _dictionary->beginSweepDictCensus(CMSLargeCoalSurplusPercent,
inter_sweep_current, inter_sweep_current,
inter_sweep_estimate); inter_sweep_estimate,
intra_sweep_estimate);
} }
void CompactibleFreeListSpace::setFLSurplus() { void CompactibleFreeListSpace::setFLSurplus() {
...@@ -2017,7 +2142,7 @@ void CompactibleFreeListSpace::setFLSurplus() { ...@@ -2017,7 +2142,7 @@ void CompactibleFreeListSpace::setFLSurplus() {
for (i = IndexSetStart; i < IndexSetSize; i += IndexSetStride) { for (i = IndexSetStart; i < IndexSetSize; i += IndexSetStride) {
FreeList *fl = &_indexedFreeList[i]; FreeList *fl = &_indexedFreeList[i];
fl->set_surplus(fl->count() - fl->set_surplus(fl->count() -
(ssize_t)((double)fl->desired() * SplitSurplusPercent)); (ssize_t)((double)fl->desired() * CMSSmallSplitSurplusPercent));
} }
} }
...@@ -2048,6 +2173,11 @@ void CompactibleFreeListSpace::clearFLCensus() { ...@@ -2048,6 +2173,11 @@ void CompactibleFreeListSpace::clearFLCensus() {
} }
void CompactibleFreeListSpace::endSweepFLCensus(size_t sweep_count) { void CompactibleFreeListSpace::endSweepFLCensus(size_t sweep_count) {
if (PrintFLSStatistics > 0) {
HeapWord* largestAddr = (HeapWord*) dictionary()->findLargestDict();
gclog_or_tty->print_cr("CMS: Large block " PTR_FORMAT,
largestAddr);
}
setFLSurplus(); setFLSurplus();
setFLHints(); setFLHints();
if (PrintGC && PrintFLSCensus > 0) { if (PrintGC && PrintFLSCensus > 0) {
...@@ -2055,7 +2185,7 @@ void CompactibleFreeListSpace::endSweepFLCensus(size_t sweep_count) { ...@@ -2055,7 +2185,7 @@ void CompactibleFreeListSpace::endSweepFLCensus(size_t sweep_count) {
} }
clearFLCensus(); clearFLCensus();
assert_locked(); assert_locked();
_dictionary->endSweepDictCensus(SplitSurplusPercent); _dictionary->endSweepDictCensus(CMSLargeSplitSurplusPercent);
} }
bool CompactibleFreeListSpace::coalOverPopulated(size_t size) { bool CompactibleFreeListSpace::coalOverPopulated(size_t size) {
...@@ -2312,13 +2442,18 @@ void CompactibleFreeListSpace::verifyIndexedFreeLists() const { ...@@ -2312,13 +2442,18 @@ void CompactibleFreeListSpace::verifyIndexedFreeLists() const {
} }
void CompactibleFreeListSpace::verifyIndexedFreeList(size_t size) const { void CompactibleFreeListSpace::verifyIndexedFreeList(size_t size) const {
FreeChunk* fc = _indexedFreeList[size].head(); FreeChunk* fc = _indexedFreeList[size].head();
FreeChunk* tail = _indexedFreeList[size].tail();
size_t num = _indexedFreeList[size].count();
size_t n = 0;
guarantee((size % 2 == 0) || fc == NULL, "Odd slots should be empty"); guarantee((size % 2 == 0) || fc == NULL, "Odd slots should be empty");
for (; fc != NULL; fc = fc->next()) { for (; fc != NULL; fc = fc->next(), n++) {
guarantee(fc->size() == size, "Size inconsistency"); guarantee(fc->size() == size, "Size inconsistency");
guarantee(fc->isFree(), "!free?"); guarantee(fc->isFree(), "!free?");
guarantee(fc->next() == NULL || fc->next()->prev() == fc, "Broken list"); guarantee(fc->next() == NULL || fc->next()->prev() == fc, "Broken list");
guarantee((fc->next() == NULL) == (fc == tail), "Incorrect tail");
} }
guarantee(n == num, "Incorrect count");
} }
#ifndef PRODUCT #ifndef PRODUCT
...@@ -2516,11 +2651,41 @@ void PromotionInfo::startTrackingPromotions() { ...@@ -2516,11 +2651,41 @@ void PromotionInfo::startTrackingPromotions() {
_tracking = true; _tracking = true;
} }
void PromotionInfo::stopTrackingPromotions() { #define CMSPrintPromoBlockInfo 1
void PromotionInfo::stopTrackingPromotions(uint worker_id) {
assert(_spoolHead == _spoolTail && _firstIndex == _nextIndex, assert(_spoolHead == _spoolTail && _firstIndex == _nextIndex,
"spooling inconsistency?"); "spooling inconsistency?");
_firstIndex = _nextIndex = 1; _firstIndex = _nextIndex = 1;
_tracking = false; _tracking = false;
if (CMSPrintPromoBlockInfo > 1) {
print_statistics(worker_id);
}
}
void PromotionInfo::print_statistics(uint worker_id) const {
assert(_spoolHead == _spoolTail && _firstIndex == _nextIndex,
"Else will undercount");
assert(CMSPrintPromoBlockInfo > 0, "Else unnecessary call");
// Count the number of blocks and slots in the free pool
size_t slots = 0;
size_t blocks = 0;
for (SpoolBlock* cur_spool = _spareSpool;
cur_spool != NULL;
cur_spool = cur_spool->nextSpoolBlock) {
// the first entry is just a self-pointer; indices 1 through
// bufferSize - 1 are occupied (thus, bufferSize - 1 slots).
guarantee((void*)cur_spool->displacedHdr == (void*)&cur_spool->displacedHdr,
"first entry of displacedHdr should be self-referential");
slots += cur_spool->bufferSize - 1;
blocks++;
}
if (_spoolHead != NULL) {
slots += _spoolHead->bufferSize - 1;
blocks++;
}
gclog_or_tty->print_cr(" [worker %d] promo_blocks = %d, promo_slots = %d ",
worker_id, blocks, slots);
} }
// When _spoolTail is not NULL, then the slot <_spoolTail, _nextIndex> // When _spoolTail is not NULL, then the slot <_spoolTail, _nextIndex>
...@@ -2584,15 +2749,84 @@ void PromotionInfo::verify() const { ...@@ -2584,15 +2749,84 @@ void PromotionInfo::verify() const {
guarantee(numDisplacedHdrs == numObjsWithDisplacedHdrs, "Displaced hdr count"); guarantee(numDisplacedHdrs == numObjsWithDisplacedHdrs, "Displaced hdr count");
} }
void PromotionInfo::print_on(outputStream* st) const {
SpoolBlock* curSpool = NULL;
size_t i = 0;
st->print_cr("start & end indices: [" SIZE_FORMAT ", " SIZE_FORMAT ")",
_firstIndex, _nextIndex);
for (curSpool = _spoolHead; curSpool != _spoolTail && curSpool != NULL;
curSpool = curSpool->nextSpoolBlock) {
curSpool->print_on(st);
st->print_cr(" active ");
i++;
}
for (curSpool = _spoolTail; curSpool != NULL;
curSpool = curSpool->nextSpoolBlock) {
curSpool->print_on(st);
st->print_cr(" inactive ");
i++;
}
for (curSpool = _spareSpool; curSpool != NULL;
curSpool = curSpool->nextSpoolBlock) {
curSpool->print_on(st);
st->print_cr(" free ");
i++;
}
st->print_cr(SIZE_FORMAT " header spooling blocks", i);
}
void SpoolBlock::print_on(outputStream* st) const {
st->print("[" PTR_FORMAT "," PTR_FORMAT "), " SIZE_FORMAT " HeapWords -> " PTR_FORMAT,
this, (HeapWord*)displacedHdr + bufferSize,
bufferSize, nextSpoolBlock);
}
///////////////////////////////////////////////////////////////////////////
// CFLS_LAB
///////////////////////////////////////////////////////////////////////////
#define VECTOR_257(x) \
/* 1 2 3 4 5 6 7 8 9 1x 11 12 13 14 15 16 17 18 19 2x 21 22 23 24 25 26 27 28 29 3x 31 32 */ \
{ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, \
x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, \
x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, \
x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, \
x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, \
x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, \
x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, \
x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, \
x }
// Initialize with default setting of CMSParPromoteBlocksToClaim, _not_
// OldPLABSize, whose static default is different; if overridden at the
// command-line, this will get reinitialized via a call to
// modify_initialization() below.
AdaptiveWeightedAverage CFLS_LAB::_blocks_to_claim[] =
VECTOR_257(AdaptiveWeightedAverage(OldPLABWeight, (float)CMSParPromoteBlocksToClaim));
size_t CFLS_LAB::_global_num_blocks[] = VECTOR_257(0);
int CFLS_LAB::_global_num_workers[] = VECTOR_257(0);
CFLS_LAB::CFLS_LAB(CompactibleFreeListSpace* cfls) : CFLS_LAB::CFLS_LAB(CompactibleFreeListSpace* cfls) :
_cfls(cfls) _cfls(cfls)
{ {
_blocks_to_claim = CMSParPromoteBlocksToClaim; assert(CompactibleFreeListSpace::IndexSetSize == 257, "Modify VECTOR_257() macro above");
for (size_t i = CompactibleFreeListSpace::IndexSetStart; for (size_t i = CompactibleFreeListSpace::IndexSetStart;
i < CompactibleFreeListSpace::IndexSetSize; i < CompactibleFreeListSpace::IndexSetSize;
i += CompactibleFreeListSpace::IndexSetStride) { i += CompactibleFreeListSpace::IndexSetStride) {
_indexedFreeList[i].set_size(i); _indexedFreeList[i].set_size(i);
_num_blocks[i] = 0;
}
}
static bool _CFLS_LAB_modified = false;
void CFLS_LAB::modify_initialization(size_t n, unsigned wt) {
assert(!_CFLS_LAB_modified, "Call only once");
_CFLS_LAB_modified = true;
for (size_t i = CompactibleFreeListSpace::IndexSetStart;
i < CompactibleFreeListSpace::IndexSetSize;
i += CompactibleFreeListSpace::IndexSetStride) {
_blocks_to_claim[i].modify(n, wt, true /* force */);
} }
} }
...@@ -2607,11 +2841,9 @@ HeapWord* CFLS_LAB::alloc(size_t word_sz) { ...@@ -2607,11 +2841,9 @@ HeapWord* CFLS_LAB::alloc(size_t word_sz) {
if (res == NULL) return NULL; if (res == NULL) return NULL;
} else { } else {
FreeList* fl = &_indexedFreeList[word_sz]; FreeList* fl = &_indexedFreeList[word_sz];
bool filled = false; //TRAP
if (fl->count() == 0) { if (fl->count() == 0) {
bool filled = true; //TRAP
// Attempt to refill this local free list. // Attempt to refill this local free list.
_cfls->par_get_chunk_of_blocks(word_sz, _blocks_to_claim, fl); get_from_global_pool(word_sz, fl);
// If it didn't work, give up. // If it didn't work, give up.
if (fl->count() == 0) return NULL; if (fl->count() == 0) return NULL;
} }
...@@ -2626,80 +2858,190 @@ HeapWord* CFLS_LAB::alloc(size_t word_sz) { ...@@ -2626,80 +2858,190 @@ HeapWord* CFLS_LAB::alloc(size_t word_sz) {
return (HeapWord*)res; return (HeapWord*)res;
} }
void CFLS_LAB::retire() { // Get a chunk of blocks of the right size and update related
for (size_t i = CompactibleFreeListSpace::IndexSetStart; // book-keeping stats
void CFLS_LAB::get_from_global_pool(size_t word_sz, FreeList* fl) {
// Get the #blocks we want to claim
size_t n_blks = (size_t)_blocks_to_claim[word_sz].average();
assert(n_blks > 0, "Error");
assert(ResizePLAB || n_blks == OldPLABSize, "Error");
// In some cases, when the application has a phase change,
// there may be a sudden and sharp shift in the object survival
// profile, and updating the counts at the end of a scavenge
// may not be quick enough, giving rise to large scavenge pauses
// during these phase changes. It is beneficial to detect such
// changes on-the-fly during a scavenge and avoid such a phase-change
// pothole. The following code is a heuristic attempt to do that.
// It is protected by a product flag until we have gained
// enough experience with this heuristic and fine-tuned its behaviour.
// WARNING: This might increase fragmentation if we overreact to
// small spikes, so some kind of historical smoothing based on
// previous experience with the greater reactivity might be useful.
// Lacking sufficient experience, CMSOldPLABResizeQuicker is disabled by
// default.
if (ResizeOldPLAB && CMSOldPLABResizeQuicker) {
size_t multiple = _num_blocks[word_sz]/(CMSOldPLABToleranceFactor*CMSOldPLABNumRefills*n_blks);
n_blks += CMSOldPLABReactivityFactor*multiple*n_blks;
n_blks = MIN2(n_blks, CMSOldPLABMax);
}
assert(n_blks > 0, "Error");
_cfls->par_get_chunk_of_blocks(word_sz, n_blks, fl);
// Update stats table entry for this block size
_num_blocks[word_sz] += fl->count();
}
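As a purely illustrative calculation of the phase-change boost above (all values hypothetical): suppose the smoothed average yields n_blks = 10 for some size, and this worker's _num_blocks count for that size has already reached 480 during the current scavenge. With CMSOldPLABNumRefills = 4 and CMSOldPLABToleranceFactor = 4, multiple = 480 / (4 * 4 * 10) = 3; with CMSOldPLABReactivityFactor = 2 the next refill claims 10 + 2 * 3 * 10 = 70 blocks, subject to the CMSOldPLABMax clamp. The regular end-of-scavenge smoothing then decides how much of that spike carries over into later scavenges.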
void CFLS_LAB::compute_desired_plab_size() {
for (size_t i = CompactibleFreeListSpace::IndexSetStart;
i < CompactibleFreeListSpace::IndexSetSize; i < CompactibleFreeListSpace::IndexSetSize;
i += CompactibleFreeListSpace::IndexSetStride) { i += CompactibleFreeListSpace::IndexSetStride) {
if (_indexedFreeList[i].count() > 0) { assert((_global_num_workers[i] == 0) == (_global_num_blocks[i] == 0),
MutexLockerEx x(_cfls->_indexedFreeListParLocks[i], "Counter inconsistency");
Mutex::_no_safepoint_check_flag); if (_global_num_workers[i] > 0) {
_cfls->_indexedFreeList[i].prepend(&_indexedFreeList[i]); // Need to smooth wrt historical average
// Reset this list. if (ResizeOldPLAB) {
_indexedFreeList[i] = FreeList(); _blocks_to_claim[i].sample(
_indexedFreeList[i].set_size(i); MAX2((size_t)CMSOldPLABMin,
MIN2((size_t)CMSOldPLABMax,
_global_num_blocks[i]/(_global_num_workers[i]*CMSOldPLABNumRefills))));
}
// Reset counters for next round
_global_num_workers[i] = 0;
_global_num_blocks[i] = 0;
if (PrintOldPLAB) {
gclog_or_tty->print_cr("[%d]: %d", i, (size_t)_blocks_to_claim[i].average());
}
} }
} }
} }
void void CFLS_LAB::retire(int tid) {
CompactibleFreeListSpace:: // We run this single threaded with the world stopped;
par_get_chunk_of_blocks(size_t word_sz, size_t n, FreeList* fl) { // so no need for locks and such.
#define CFLS_LAB_PARALLEL_ACCESS 0
NOT_PRODUCT(Thread* t = Thread::current();)
assert(Thread::current()->is_VM_thread(), "Error");
assert(CompactibleFreeListSpace::IndexSetStart == CompactibleFreeListSpace::IndexSetStride,
"Will access to uninitialized slot below");
#if CFLS_LAB_PARALLEL_ACCESS
for (size_t i = CompactibleFreeListSpace::IndexSetSize - 1;
i > 0;
i -= CompactibleFreeListSpace::IndexSetStride) {
#else // CFLS_LAB_PARALLEL_ACCESS
for (size_t i = CompactibleFreeListSpace::IndexSetStart;
i < CompactibleFreeListSpace::IndexSetSize;
i += CompactibleFreeListSpace::IndexSetStride) {
#endif // !CFLS_LAB_PARALLEL_ACCESS
assert(_num_blocks[i] >= (size_t)_indexedFreeList[i].count(),
"Can't retire more than what we obtained");
if (_num_blocks[i] > 0) {
size_t num_retire = _indexedFreeList[i].count();
assert(_num_blocks[i] > num_retire, "Should have used at least one");
{
#if CFLS_LAB_PARALLEL_ACCESS
MutexLockerEx x(_cfls->_indexedFreeListParLocks[i],
Mutex::_no_safepoint_check_flag);
#endif // CFLS_LAB_PARALLEL_ACCESS
// Update globals stats for num_blocks used
_global_num_blocks[i] += (_num_blocks[i] - num_retire);
_global_num_workers[i]++;
assert(_global_num_workers[i] <= (ssize_t)ParallelGCThreads, "Too big");
if (num_retire > 0) {
_cfls->_indexedFreeList[i].prepend(&_indexedFreeList[i]);
// Reset this list.
_indexedFreeList[i] = FreeList();
_indexedFreeList[i].set_size(i);
}
}
if (PrintOldPLAB) {
gclog_or_tty->print_cr("%d[%d]: %d/%d/%d",
tid, i, num_retire, _num_blocks[i], (size_t)_blocks_to_claim[i].average());
}
// Reset stats for next round
_num_blocks[i] = 0;
}
}
}
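A short worked example of how these per-worker counts feed the next sizing decision (numbers hypothetical): if four workers together obtain 1,640 blocks of a given size during a scavenge and retire 40 of them unused, _global_num_blocks for that size ends up at 1,600 and _global_num_workers at 4. At the next compute_desired_plab_size(), with CMSOldPLABNumRefills = 4, the sampled value is 1600 / (4 * 4) = 100 blocks per refill, clamped to [CMSOldPLABMin, CMSOldPLABMax] and folded into the _blocks_to_claim weighted average for that size.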
void CompactibleFreeListSpace:: par_get_chunk_of_blocks(size_t word_sz, size_t n, FreeList* fl) {
assert(fl->count() == 0, "Precondition."); assert(fl->count() == 0, "Precondition.");
assert(word_sz < CompactibleFreeListSpace::IndexSetSize, assert(word_sz < CompactibleFreeListSpace::IndexSetSize,
"Precondition"); "Precondition");
// We'll try all multiples of word_sz in the indexed set (starting with // We'll try all multiples of word_sz in the indexed set, starting with
// word_sz itself), then try getting a big chunk and splitting it. // word_sz itself and, if CMSSplitIndexedFreeListBlocks, try larger multiples,
int k = 1; // then try getting a big chunk and splitting it.
size_t cur_sz = k * word_sz; {
bool found = false; bool found;
while (cur_sz < CompactibleFreeListSpace::IndexSetSize && k == 1) { int k;
FreeList* gfl = &_indexedFreeList[cur_sz]; size_t cur_sz;
FreeList fl_for_cur_sz; // Empty. for (k = 1, cur_sz = k * word_sz, found = false;
fl_for_cur_sz.set_size(cur_sz); (cur_sz < CompactibleFreeListSpace::IndexSetSize) &&
{ (CMSSplitIndexedFreeListBlocks || k <= 1);
MutexLockerEx x(_indexedFreeListParLocks[cur_sz], k++, cur_sz = k * word_sz) {
Mutex::_no_safepoint_check_flag); FreeList* gfl = &_indexedFreeList[cur_sz];
if (gfl->count() != 0) { FreeList fl_for_cur_sz; // Empty.
size_t nn = MAX2(n/k, (size_t)1); fl_for_cur_sz.set_size(cur_sz);
gfl->getFirstNChunksFromList(nn, &fl_for_cur_sz); {
found = true; MutexLockerEx x(_indexedFreeListParLocks[cur_sz],
Mutex::_no_safepoint_check_flag);
if (gfl->count() != 0) {
// nn is the number of chunks of size cur_sz that
// we'd need to split k-ways each, in order to create
// "n" chunks of size word_sz each.
const size_t nn = MAX2(n/k, (size_t)1);
gfl->getFirstNChunksFromList(nn, &fl_for_cur_sz);
found = true;
if (k > 1) {
// Update split death stats for the cur_sz-size blocks list:
// we increment the split death count by the number of blocks
// we just took from the cur_sz-size blocks list and which
// we will be splitting below.
ssize_t deaths = _indexedFreeList[cur_sz].splitDeaths() +
fl_for_cur_sz.count();
_indexedFreeList[cur_sz].set_splitDeaths(deaths);
}
}
} }
} // Now transfer fl_for_cur_sz to fl. Common case, we hope, is k = 1.
// Now transfer fl_for_cur_sz to fl. Common case, we hope, is k = 1. if (found) {
if (found) { if (k == 1) {
if (k == 1) { fl->prepend(&fl_for_cur_sz);
fl->prepend(&fl_for_cur_sz); } else {
} else { // Divide each block on fl_for_cur_sz up k ways.
// Divide each block on fl_for_cur_sz up k ways. FreeChunk* fc;
FreeChunk* fc; while ((fc = fl_for_cur_sz.getChunkAtHead()) != NULL) {
while ((fc = fl_for_cur_sz.getChunkAtHead()) != NULL) { // Must do this in reverse order, so that anybody attempting to
// Must do this in reverse order, so that anybody attempting to // access the main chunk sees it as a single free block until we
// access the main chunk sees it as a single free block until we // change it.
// change it. size_t fc_size = fc->size();
size_t fc_size = fc->size(); for (int i = k-1; i >= 0; i--) {
for (int i = k-1; i >= 0; i--) { FreeChunk* ffc = (FreeChunk*)((HeapWord*)fc + i * word_sz);
FreeChunk* ffc = (FreeChunk*)((HeapWord*)fc + i * word_sz); ffc->setSize(word_sz);
ffc->setSize(word_sz); ffc->linkNext(NULL);
ffc->linkNext(NULL); ffc->linkPrev(NULL); // Mark as a free block for other (parallel) GC threads.
ffc->linkPrev(NULL); // Mark as a free block for other (parallel) GC threads. // Above must occur before BOT is updated below.
// Above must occur before BOT is updated below. // splitting from the right, fc_size == (k - i + 1) * wordsize
// splitting from the right, fc_size == (k - i + 1) * wordsize _bt.mark_block((HeapWord*)ffc, word_sz);
_bt.mark_block((HeapWord*)ffc, word_sz); fc_size -= word_sz;
fc_size -= word_sz; _bt.verify_not_unallocated((HeapWord*)ffc, ffc->size());
_bt.verify_not_unallocated((HeapWord*)ffc, ffc->size()); _bt.verify_single_block((HeapWord*)fc, fc_size);
_bt.verify_single_block((HeapWord*)fc, fc_size); _bt.verify_single_block((HeapWord*)ffc, ffc->size());
_bt.verify_single_block((HeapWord*)ffc, ffc->size()); // Push this on "fl".
// Push this on "fl". fl->returnChunkAtHead(ffc);
fl->returnChunkAtHead(ffc); }
// TRAP
assert(fl->tail()->next() == NULL, "List invariant.");
} }
// TRAP
assert(fl->tail()->next() == NULL, "List invariant.");
} }
// Update birth stats for this block size.
size_t num = fl->count();
MutexLockerEx x(_indexedFreeListParLocks[word_sz],
Mutex::_no_safepoint_check_flag);
ssize_t births = _indexedFreeList[word_sz].splitBirths() + num;
_indexedFreeList[word_sz].set_splitBirths(births);
return;
} }
return;
} }
k++; cur_sz = k * word_sz;
} }
// Otherwise, we'll split a block from the dictionary. // Otherwise, we'll split a block from the dictionary.
FreeChunk* fc = NULL; FreeChunk* fc = NULL;
...@@ -2723,17 +3065,20 @@ par_get_chunk_of_blocks(size_t word_sz, size_t n, FreeList* fl) { ...@@ -2723,17 +3065,20 @@ par_get_chunk_of_blocks(size_t word_sz, size_t n, FreeList* fl) {
} }
} }
if (fc == NULL) return; if (fc == NULL) return;
assert((ssize_t)n >= 1, "Control point invariant");
// Otherwise, split up that block. // Otherwise, split up that block.
size_t nn = fc->size() / word_sz; const size_t nn = fc->size() / word_sz;
n = MIN2(nn, n); n = MIN2(nn, n);
assert((ssize_t)n >= 1, "Control point invariant");
rem = fc->size() - n * word_sz; rem = fc->size() - n * word_sz;
// If there is a remainder, and it's too small, allocate one fewer. // If there is a remainder, and it's too small, allocate one fewer.
if (rem > 0 && rem < MinChunkSize) { if (rem > 0 && rem < MinChunkSize) {
n--; rem += word_sz; n--; rem += word_sz;
} }
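As a quick worked example of the remainder adjustment (numbers illustrative, not from the source): with a 1000-word dictionary block and word_sz of 16, n is first clamped to 62 and rem becomes 1000 - 62*16 = 8 words; if 8 words is below MinChunkSize, one block is given back, leaving n = 61 and rem = 24 words, which is large enough to stand on its own as a free chunk.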
assert((ssize_t)n >= 1, "Control point invariant");
// First return the remainder, if any. // First return the remainder, if any.
// Note that we hold the lock until we decide if we're going to give // Note that we hold the lock until we decide if we're going to give
// back the remainder to the dictionary, since a contending allocator // back the remainder to the dictionary, since a concurrent allocation
// may otherwise see the heap as empty. (We're willing to take that // may otherwise see the heap as empty. (We're willing to take that
// hit if the block is a small block.) // hit if the block is a small block.)
if (rem > 0) { if (rem > 0) {
...@@ -2743,18 +3088,16 @@ par_get_chunk_of_blocks(size_t word_sz, size_t n, FreeList* fl) { ...@@ -2743,18 +3088,16 @@ par_get_chunk_of_blocks(size_t word_sz, size_t n, FreeList* fl) {
rem_fc->linkNext(NULL); rem_fc->linkNext(NULL);
rem_fc->linkPrev(NULL); // Mark as a free block for other (parallel) GC threads. rem_fc->linkPrev(NULL); // Mark as a free block for other (parallel) GC threads.
// Above must occur before BOT is updated below. // Above must occur before BOT is updated below.
assert((ssize_t)n > 0 && prefix_size > 0 && rem_fc > fc, "Error");
_bt.split_block((HeapWord*)fc, fc->size(), prefix_size); _bt.split_block((HeapWord*)fc, fc->size(), prefix_size);
if (rem >= IndexSetSize) { if (rem >= IndexSetSize) {
returnChunkToDictionary(rem_fc); returnChunkToDictionary(rem_fc);
dictionary()->dictCensusUpdate(fc->size(), dictionary()->dictCensusUpdate(rem, true /*split*/, true /*birth*/);
true /*split*/,
true /*birth*/);
rem_fc = NULL; rem_fc = NULL;
} }
// Otherwise, return it to the small list below. // Otherwise, return it to the small list below.
} }
} }
//
if (rem_fc != NULL) { if (rem_fc != NULL) {
MutexLockerEx x(_indexedFreeListParLocks[rem], MutexLockerEx x(_indexedFreeListParLocks[rem],
Mutex::_no_safepoint_check_flag); Mutex::_no_safepoint_check_flag);
...@@ -2762,7 +3105,7 @@ par_get_chunk_of_blocks(size_t word_sz, size_t n, FreeList* fl) { ...@@ -2762,7 +3105,7 @@ par_get_chunk_of_blocks(size_t word_sz, size_t n, FreeList* fl) {
_indexedFreeList[rem].returnChunkAtHead(rem_fc); _indexedFreeList[rem].returnChunkAtHead(rem_fc);
smallSplitBirth(rem); smallSplitBirth(rem);
} }
assert((ssize_t)n > 0 && fc != NULL, "Consistency");
// Now do the splitting up. // Now do the splitting up.
// Must do this in reverse order, so that anybody attempting to // Must do this in reverse order, so that anybody attempting to
// access the main chunk sees it as a single free block until we // access the main chunk sees it as a single free block until we
...@@ -2792,13 +3135,15 @@ par_get_chunk_of_blocks(size_t word_sz, size_t n, FreeList* fl) { ...@@ -2792,13 +3135,15 @@ par_get_chunk_of_blocks(size_t word_sz, size_t n, FreeList* fl) {
_bt.verify_single_block((HeapWord*)fc, fc->size()); _bt.verify_single_block((HeapWord*)fc, fc->size());
fl->returnChunkAtHead(fc); fl->returnChunkAtHead(fc);
assert((ssize_t)n > 0 && (ssize_t)n == fl->count(), "Incorrect number of blocks");
{ {
// Update the stats for this block size.
MutexLockerEx x(_indexedFreeListParLocks[word_sz], MutexLockerEx x(_indexedFreeListParLocks[word_sz],
Mutex::_no_safepoint_check_flag); Mutex::_no_safepoint_check_flag);
ssize_t new_births = _indexedFreeList[word_sz].splitBirths() + n; const ssize_t births = _indexedFreeList[word_sz].splitBirths() + n;
_indexedFreeList[word_sz].set_splitBirths(new_births); _indexedFreeList[word_sz].set_splitBirths(births);
ssize_t new_surplus = _indexedFreeList[word_sz].surplus() + n; // ssize_t new_surplus = _indexedFreeList[word_sz].surplus() + n;
_indexedFreeList[word_sz].set_surplus(new_surplus); // _indexedFreeList[word_sz].set_surplus(new_surplus);
} }
// TRAP // TRAP
......
...@@ -25,8 +25,6 @@ ...@@ -25,8 +25,6 @@
// Classes in support of keeping track of promotions into a non-Contiguous // Classes in support of keeping track of promotions into a non-Contiguous
// space, in this case a CompactibleFreeListSpace. // space, in this case a CompactibleFreeListSpace.
#define CFLS_LAB_REFILL_STATS 0
// Forward declarations // Forward declarations
class CompactibleFreeListSpace; class CompactibleFreeListSpace;
class BlkClosure; class BlkClosure;
...@@ -89,6 +87,9 @@ class SpoolBlock: public FreeChunk { ...@@ -89,6 +87,9 @@ class SpoolBlock: public FreeChunk {
displacedHdr = (markOop*)&displacedHdr; displacedHdr = (markOop*)&displacedHdr;
nextSpoolBlock = NULL; nextSpoolBlock = NULL;
} }
void print_on(outputStream* st) const;
void print() const { print_on(gclog_or_tty); }
}; };
class PromotionInfo VALUE_OBJ_CLASS_SPEC { class PromotionInfo VALUE_OBJ_CLASS_SPEC {
...@@ -121,7 +122,7 @@ class PromotionInfo VALUE_OBJ_CLASS_SPEC { ...@@ -121,7 +122,7 @@ class PromotionInfo VALUE_OBJ_CLASS_SPEC {
return _promoHead == NULL; return _promoHead == NULL;
} }
void startTrackingPromotions(); void startTrackingPromotions();
void stopTrackingPromotions(); void stopTrackingPromotions(uint worker_id = 0);
bool tracking() const { return _tracking; } bool tracking() const { return _tracking; }
void track(PromotedObject* trackOop); // keep track of a promoted oop void track(PromotedObject* trackOop); // keep track of a promoted oop
// The following variant must be used when trackOop is not fully // The following variant must be used when trackOop is not fully
...@@ -161,6 +162,9 @@ class PromotionInfo VALUE_OBJ_CLASS_SPEC { ...@@ -161,6 +162,9 @@ class PromotionInfo VALUE_OBJ_CLASS_SPEC {
_nextIndex = 0; _nextIndex = 0;
} }
void print_on(outputStream* st) const;
void print_statistics(uint worker_id) const;
}; };
class LinearAllocBlock VALUE_OBJ_CLASS_SPEC { class LinearAllocBlock VALUE_OBJ_CLASS_SPEC {
...@@ -243,6 +247,7 @@ class CompactibleFreeListSpace: public CompactibleSpace { ...@@ -243,6 +247,7 @@ class CompactibleFreeListSpace: public CompactibleSpace {
mutable Mutex _freelistLock; mutable Mutex _freelistLock;
// locking verifier convenience function // locking verifier convenience function
void assert_locked() const PRODUCT_RETURN; void assert_locked() const PRODUCT_RETURN;
void assert_locked(const Mutex* lock) const PRODUCT_RETURN;
// Linear allocation blocks // Linear allocation blocks
LinearAllocBlock _smallLinearAllocBlock; LinearAllocBlock _smallLinearAllocBlock;
...@@ -281,13 +286,6 @@ class CompactibleFreeListSpace: public CompactibleSpace { ...@@ -281,13 +286,6 @@ class CompactibleFreeListSpace: public CompactibleSpace {
// Locks protecting the exact lists during par promotion allocation. // Locks protecting the exact lists during par promotion allocation.
Mutex* _indexedFreeListParLocks[IndexSetSize]; Mutex* _indexedFreeListParLocks[IndexSetSize];
#if CFLS_LAB_REFILL_STATS
// Some statistics.
jint _par_get_chunk_from_small;
jint _par_get_chunk_from_large;
#endif
// Attempt to obtain up to "n" blocks of the size "word_sz" (which is // Attempt to obtain up to "n" blocks of the size "word_sz" (which is
// required to be smaller than "IndexSetSize".) If successful, // required to be smaller than "IndexSetSize".) If successful,
// adds them to "fl", which is required to be an empty free list. // adds them to "fl", which is required to be an empty free list.
...@@ -320,7 +318,7 @@ class CompactibleFreeListSpace: public CompactibleSpace { ...@@ -320,7 +318,7 @@ class CompactibleFreeListSpace: public CompactibleSpace {
// Helper function for getChunkFromIndexedFreeList. // Helper function for getChunkFromIndexedFreeList.
// Replenish the indexed free list for this "size". Do not take from an // Replenish the indexed free list for this "size". Do not take from an
// underpopulated size. // underpopulated size.
FreeChunk* getChunkFromIndexedFreeListHelper(size_t size); FreeChunk* getChunkFromIndexedFreeListHelper(size_t size, bool replenish = true);
// Get a chunk from the indexed free list. If the indexed free list // Get a chunk from the indexed free list. If the indexed free list
// does not have a free chunk, try to replenish the indexed free list // does not have a free chunk, try to replenish the indexed free list
...@@ -430,10 +428,6 @@ class CompactibleFreeListSpace: public CompactibleSpace { ...@@ -430,10 +428,6 @@ class CompactibleFreeListSpace: public CompactibleSpace {
void initialize_sequential_subtasks_for_marking(int n_threads, void initialize_sequential_subtasks_for_marking(int n_threads,
HeapWord* low = NULL); HeapWord* low = NULL);
#if CFLS_LAB_REFILL_STATS
void print_par_alloc_stats();
#endif
// Space enquiries // Space enquiries
size_t used() const; size_t used() const;
size_t free() const; size_t free() const;
...@@ -617,6 +611,12 @@ class CompactibleFreeListSpace: public CompactibleSpace { ...@@ -617,6 +611,12 @@ class CompactibleFreeListSpace: public CompactibleSpace {
  // Do some basic checks on the free lists.                                                                  // Do some basic checks on the free lists.
void checkFreeListConsistency() const PRODUCT_RETURN; void checkFreeListConsistency() const PRODUCT_RETURN;
// Printing support
void dump_at_safepoint_with_locks(CMSCollector* c, outputStream* st);
void print_indexed_free_lists(outputStream* st) const;
void print_dictionary_free_lists(outputStream* st) const;
void print_promo_info_blocks(outputStream* st) const;
NOT_PRODUCT ( NOT_PRODUCT (
void initializeIndexedFreeListArrayReturnedBytes(); void initializeIndexedFreeListArrayReturnedBytes();
size_t sumIndexedFreeListArrayReturnedBytes(); size_t sumIndexedFreeListArrayReturnedBytes();
...@@ -638,8 +638,9 @@ class CompactibleFreeListSpace: public CompactibleSpace { ...@@ -638,8 +638,9 @@ class CompactibleFreeListSpace: public CompactibleSpace {
// Statistics functions // Statistics functions
// Initialize census for lists before the sweep. // Initialize census for lists before the sweep.
void beginSweepFLCensus(float sweep_current, void beginSweepFLCensus(float inter_sweep_current,
float sweep_estimate); float inter_sweep_estimate,
float intra_sweep_estimate);
// Set the surplus for each of the free lists. // Set the surplus for each of the free lists.
void setFLSurplus(); void setFLSurplus();
// Set the hint for each of the free lists. // Set the hint for each of the free lists.
...@@ -730,16 +731,17 @@ class CFLS_LAB : public CHeapObj { ...@@ -730,16 +731,17 @@ class CFLS_LAB : public CHeapObj {
FreeList _indexedFreeList[CompactibleFreeListSpace::IndexSetSize]; FreeList _indexedFreeList[CompactibleFreeListSpace::IndexSetSize];
// Initialized from a command-line arg. // Initialized from a command-line arg.
size_t _blocks_to_claim;
#if CFLS_LAB_REFILL_STATS // Allocation statistics in support of dynamic adjustment of
// Some statistics. // #blocks to claim per get_from_global_pool() call below.
int _refills; static AdaptiveWeightedAverage
int _blocksTaken; _blocks_to_claim [CompactibleFreeListSpace::IndexSetSize];
static int _tot_refills; static size_t _global_num_blocks [CompactibleFreeListSpace::IndexSetSize];
static int _tot_blocksTaken; static int _global_num_workers[CompactibleFreeListSpace::IndexSetSize];
static int _next_threshold; size_t _num_blocks [CompactibleFreeListSpace::IndexSetSize];
#endif
// Internal work method
void get_from_global_pool(size_t word_sz, FreeList* fl);
public: public:
CFLS_LAB(CompactibleFreeListSpace* cfls); CFLS_LAB(CompactibleFreeListSpace* cfls);
...@@ -748,7 +750,12 @@ public: ...@@ -748,7 +750,12 @@ public:
HeapWord* alloc(size_t word_sz); HeapWord* alloc(size_t word_sz);
// Return any unused portions of the buffer to the global pool. // Return any unused portions of the buffer to the global pool.
void retire(); void retire(int tid);
// Dynamic OldPLABSize sizing
static void compute_desired_plab_size();
// When the settings are modified from default static initialization
static void modify_initialization(size_t n, unsigned wt);
}; };
size_t PromotionInfo::refillSize() const { size_t PromotionInfo::refillSize() const {
......
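The fields introduced above support per-size, cross-worker sizing of the local block caches: each worker tallies the blocks it consumed per size (_num_blocks), the static arrays accumulate those tallies across workers, and compute_desired_plab_size() turns the per-worker average into the next epoch's claim size through a weighted average. A rough standalone sketch of that feedback loop (ordinary C++, not the HotSpot implementation; the array length and smoothing weight are placeholders):

    #include <cstddef>

    enum { kNumSizes = 257 };            // placeholder for IndexSetSize

    struct BlocksToClaimEstimator {
      float  avg[kNumSizes];             // smoothed per-worker demand, per block size
      size_t epoch_blocks[kNumSizes];    // blocks consumed this epoch, summed over workers
      int    epoch_workers[kNumSizes];   // number of workers that touched this size

      // Called with one worker's per-size counts when its buffers are retired.
      void flush_worker(const size_t worker_blocks[kNumSizes]) {
        for (int i = 1; i < kNumSizes; i++) {
          if (worker_blocks[i] > 0) {
            epoch_blocks[i]  += worker_blocks[i];
            epoch_workers[i] += 1;
          }
        }
      }

      // Called once per young collection: fold this epoch's per-worker average
      // into the smoothed estimate, then reset the accumulators.
      void compute_desired(unsigned weight /* 0..100, share given to history */) {
        for (int i = 1; i < kNumSizes; i++) {
          if (epoch_workers[i] > 0) {
            float per_worker = (float)epoch_blocks[i] / (float)epoch_workers[i];
            avg[i] = (weight * avg[i] + (100 - weight) * per_worker) / 100.0f;
          }
          epoch_blocks[i]  = 0;
          epoch_workers[i] = 0;
        }
      }
    };

A worker's next refill for size i would then claim roughly avg[i] blocks from the global pool instead of a fixed, command-line-supplied count.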
...@@ -253,7 +253,6 @@ void ConcurrentMarkSweepGeneration::init_initiating_occupancy(intx io, intx tr) ...@@ -253,7 +253,6 @@ void ConcurrentMarkSweepGeneration::init_initiating_occupancy(intx io, intx tr)
} }
} }
void ConcurrentMarkSweepGeneration::ref_processor_init() { void ConcurrentMarkSweepGeneration::ref_processor_init() {
assert(collector() != NULL, "no collector"); assert(collector() != NULL, "no collector");
collector()->ref_processor_init(); collector()->ref_processor_init();
...@@ -341,6 +340,14 @@ CMSStats::CMSStats(ConcurrentMarkSweepGeneration* cms_gen, unsigned int alpha): ...@@ -341,6 +340,14 @@ CMSStats::CMSStats(ConcurrentMarkSweepGeneration* cms_gen, unsigned int alpha):
_icms_duty_cycle = CMSIncrementalDutyCycle; _icms_duty_cycle = CMSIncrementalDutyCycle;
} }
double CMSStats::cms_free_adjustment_factor(size_t free) const {
// TBD: CR 6909490
return 1.0;
}
void CMSStats::adjust_cms_free_adjustment_factor(bool fail, size_t free) {
}
// If promotion failure handling is on use // If promotion failure handling is on use
// the padded average size of the promotion for each // the padded average size of the promotion for each
// young generation collection. // young generation collection.
...@@ -361,7 +368,11 @@ double CMSStats::time_until_cms_gen_full() const { ...@@ -361,7 +368,11 @@ double CMSStats::time_until_cms_gen_full() const {
// Adjust by the safety factor. // Adjust by the safety factor.
double cms_free_dbl = (double)cms_free; double cms_free_dbl = (double)cms_free;
cms_free_dbl = cms_free_dbl * (100.0 - CMSIncrementalSafetyFactor) / 100.0; double cms_adjustment = (100.0 - CMSIncrementalSafetyFactor)/100.0;
// Apply a further correction factor which tries to adjust
  // for recent occurrence of concurrent mode failures.
cms_adjustment = cms_adjustment * cms_free_adjustment_factor(cms_free);
cms_free_dbl = cms_free_dbl * cms_adjustment;
if (PrintGCDetails && Verbose) { if (PrintGCDetails && Verbose) {
gclog_or_tty->print_cr("CMSStats::time_until_cms_gen_full: cms_free " gclog_or_tty->print_cr("CMSStats::time_until_cms_gen_full: cms_free "
...@@ -395,6 +406,8 @@ double CMSStats::time_until_cms_start() const { ...@@ -395,6 +406,8 @@ double CMSStats::time_until_cms_start() const {
// late. // late.
double work = cms_duration() + gc0_period(); double work = cms_duration() + gc0_period();
double deadline = time_until_cms_gen_full(); double deadline = time_until_cms_gen_full();
// If a concurrent mode failure occurred recently, we want to be
// more conservative and halve our expected time_until_cms_gen_full()
if (work > deadline) { if (work > deadline) {
if (Verbose && PrintGCDetails) { if (Verbose && PrintGCDetails) {
gclog_or_tty->print( gclog_or_tty->print(
...@@ -556,7 +569,8 @@ CMSCollector::CMSCollector(ConcurrentMarkSweepGeneration* cmsGen, ...@@ -556,7 +569,8 @@ CMSCollector::CMSCollector(ConcurrentMarkSweepGeneration* cmsGen,
_should_unload_classes(false), _should_unload_classes(false),
_concurrent_cycles_since_last_unload(0), _concurrent_cycles_since_last_unload(0),
_roots_scanning_options(0), _roots_scanning_options(0),
_sweep_estimate(CMS_SweepWeight, CMS_SweepPadding) _inter_sweep_estimate(CMS_SweepWeight, CMS_SweepPadding),
_intra_sweep_estimate(CMS_SweepWeight, CMS_SweepPadding)
{ {
if (ExplicitGCInvokesConcurrentAndUnloadsClasses) { if (ExplicitGCInvokesConcurrentAndUnloadsClasses) {
ExplicitGCInvokesConcurrent = true; ExplicitGCInvokesConcurrent = true;
...@@ -773,7 +787,7 @@ CMSCollector::CMSCollector(ConcurrentMarkSweepGeneration* cmsGen, ...@@ -773,7 +787,7 @@ CMSCollector::CMSCollector(ConcurrentMarkSweepGeneration* cmsGen,
NOT_PRODUCT(_overflow_counter = CMSMarkStackOverflowInterval;) NOT_PRODUCT(_overflow_counter = CMSMarkStackOverflowInterval;)
_gc_counters = new CollectorCounters("CMS", 1); _gc_counters = new CollectorCounters("CMS", 1);
_completed_initialization = true; _completed_initialization = true;
_sweep_timer.start(); // start of time _inter_sweep_timer.start(); // start of time
} }
const char* ConcurrentMarkSweepGeneration::name() const { const char* ConcurrentMarkSweepGeneration::name() const {
...@@ -900,6 +914,14 @@ bool ConcurrentMarkSweepGeneration::promotion_attempt_is_safe( ...@@ -900,6 +914,14 @@ bool ConcurrentMarkSweepGeneration::promotion_attempt_is_safe(
return result; return result;
} }
// At a promotion failure dump information on block layout in heap
// (cms old generation).
void ConcurrentMarkSweepGeneration::promotion_failure_occurred() {
if (CMSDumpAtPromotionFailure) {
cmsSpace()->dump_at_safepoint_with_locks(collector(), gclog_or_tty);
}
}
CompactibleSpace* CompactibleSpace*
ConcurrentMarkSweepGeneration::first_compaction_space() const { ConcurrentMarkSweepGeneration::first_compaction_space() const {
return _cmsSpace; return _cmsSpace;
...@@ -1368,12 +1390,7 @@ void ...@@ -1368,12 +1390,7 @@ void
ConcurrentMarkSweepGeneration:: ConcurrentMarkSweepGeneration::
par_promote_alloc_done(int thread_num) { par_promote_alloc_done(int thread_num) {
CMSParGCThreadState* ps = _par_gc_thread_states[thread_num]; CMSParGCThreadState* ps = _par_gc_thread_states[thread_num];
ps->lab.retire(); ps->lab.retire(thread_num);
#if CFLS_LAB_REFILL_STATS
if (thread_num == 0) {
_cmsSpace->print_par_alloc_stats();
}
#endif
} }
void void
...@@ -1974,11 +1991,14 @@ void CMSCollector::do_compaction_work(bool clear_all_soft_refs) { ...@@ -1974,11 +1991,14 @@ void CMSCollector::do_compaction_work(bool clear_all_soft_refs) {
// We must adjust the allocation statistics being maintained // We must adjust the allocation statistics being maintained
// in the free list space. We do so by reading and clearing // in the free list space. We do so by reading and clearing
// the sweep timer and updating the block flux rate estimates below. // the sweep timer and updating the block flux rate estimates below.
assert(_sweep_timer.is_active(), "We should never see the timer inactive"); assert(!_intra_sweep_timer.is_active(), "_intra_sweep_timer should be inactive");
_sweep_timer.stop(); if (_inter_sweep_timer.is_active()) {
// Note that we do not use this sample to update the _sweep_estimate. _inter_sweep_timer.stop();
_cmsGen->cmsSpace()->beginSweepFLCensus((float)(_sweep_timer.seconds()), // Note that we do not use this sample to update the _inter_sweep_estimate.
_sweep_estimate.padded_average()); _cmsGen->cmsSpace()->beginSweepFLCensus((float)(_inter_sweep_timer.seconds()),
_inter_sweep_estimate.padded_average(),
_intra_sweep_estimate.padded_average());
}
GenMarkSweep::invoke_at_safepoint(_cmsGen->level(), GenMarkSweep::invoke_at_safepoint(_cmsGen->level(),
ref_processor(), clear_all_soft_refs); ref_processor(), clear_all_soft_refs);
...@@ -2015,10 +2035,10 @@ void CMSCollector::do_compaction_work(bool clear_all_soft_refs) { ...@@ -2015,10 +2035,10 @@ void CMSCollector::do_compaction_work(bool clear_all_soft_refs) {
} }
// Adjust the per-size allocation stats for the next epoch. // Adjust the per-size allocation stats for the next epoch.
_cmsGen->cmsSpace()->endSweepFLCensus(sweepCount() /* fake */); _cmsGen->cmsSpace()->endSweepFLCensus(sweep_count() /* fake */);
// Restart the "sweep timer" for next epoch. // Restart the "inter sweep timer" for the next epoch.
_sweep_timer.reset(); _inter_sweep_timer.reset();
_sweep_timer.start(); _inter_sweep_timer.start();
// Sample collection pause time and reset for collection interval. // Sample collection pause time and reset for collection interval.
if (UseAdaptiveSizePolicy) { if (UseAdaptiveSizePolicy) {
...@@ -2676,7 +2696,7 @@ void ConcurrentMarkSweepGeneration::gc_epilogue(bool full) { ...@@ -2676,7 +2696,7 @@ void ConcurrentMarkSweepGeneration::gc_epilogue(bool full) {
// Also reset promotion tracking in par gc thread states. // Also reset promotion tracking in par gc thread states.
if (ParallelGCThreads > 0) { if (ParallelGCThreads > 0) {
for (uint i = 0; i < ParallelGCThreads; i++) { for (uint i = 0; i < ParallelGCThreads; i++) {
_par_gc_thread_states[i]->promo.stopTrackingPromotions(); _par_gc_thread_states[i]->promo.stopTrackingPromotions(i);
} }
} }
} }
...@@ -2771,7 +2791,7 @@ class VerifyMarkedClosure: public BitMapClosure { ...@@ -2771,7 +2791,7 @@ class VerifyMarkedClosure: public BitMapClosure {
bool do_bit(size_t offset) { bool do_bit(size_t offset) {
HeapWord* addr = _marks->offsetToHeapWord(offset); HeapWord* addr = _marks->offsetToHeapWord(offset);
if (!_marks->isMarked(addr)) { if (!_marks->isMarked(addr)) {
oop(addr)->print(); oop(addr)->print_on(gclog_or_tty);
gclog_or_tty->print_cr(" ("INTPTR_FORMAT" should have been marked)", addr); gclog_or_tty->print_cr(" ("INTPTR_FORMAT" should have been marked)", addr);
_failed = true; _failed = true;
} }
...@@ -2820,7 +2840,7 @@ bool CMSCollector::verify_after_remark() { ...@@ -2820,7 +2840,7 @@ bool CMSCollector::verify_after_remark() {
// Clear any marks from a previous round // Clear any marks from a previous round
verification_mark_bm()->clear_all(); verification_mark_bm()->clear_all();
assert(verification_mark_stack()->isEmpty(), "markStack should be empty"); assert(verification_mark_stack()->isEmpty(), "markStack should be empty");
assert(overflow_list_is_empty(), "overflow list should be empty"); verify_work_stacks_empty();
GenCollectedHeap* gch = GenCollectedHeap::heap(); GenCollectedHeap* gch = GenCollectedHeap::heap();
gch->ensure_parsability(false); // fill TLABs, but no need to retire them gch->ensure_parsability(false); // fill TLABs, but no need to retire them
...@@ -2893,8 +2913,8 @@ void CMSCollector::verify_after_remark_work_1() { ...@@ -2893,8 +2913,8 @@ void CMSCollector::verify_after_remark_work_1() {
verification_mark_bm()->iterate(&vcl); verification_mark_bm()->iterate(&vcl);
if (vcl.failed()) { if (vcl.failed()) {
gclog_or_tty->print("Verification failed"); gclog_or_tty->print("Verification failed");
Universe::heap()->print(); Universe::heap()->print_on(gclog_or_tty);
fatal(" ... aborting"); fatal("CMS: failed marking verification after remark");
} }
} }
...@@ -3314,7 +3334,7 @@ bool ConcurrentMarkSweepGeneration::grow_by(size_t bytes) { ...@@ -3314,7 +3334,7 @@ bool ConcurrentMarkSweepGeneration::grow_by(size_t bytes) {
Universe::heap()->barrier_set()->resize_covered_region(mr); Universe::heap()->barrier_set()->resize_covered_region(mr);
// Hmmmm... why doesn't CFLS::set_end verify locking? // Hmmmm... why doesn't CFLS::set_end verify locking?
// This is quite ugly; FIX ME XXX // This is quite ugly; FIX ME XXX
_cmsSpace->assert_locked(); _cmsSpace->assert_locked(freelistLock());
_cmsSpace->set_end((HeapWord*)_virtual_space.high()); _cmsSpace->set_end((HeapWord*)_virtual_space.high());
// update the space and generation capacity counters // update the space and generation capacity counters
...@@ -5868,9 +5888,9 @@ void CMSCollector::sweep(bool asynch) { ...@@ -5868,9 +5888,9 @@ void CMSCollector::sweep(bool asynch) {
check_correct_thread_executing(); check_correct_thread_executing();
verify_work_stacks_empty(); verify_work_stacks_empty();
verify_overflow_empty(); verify_overflow_empty();
incrementSweepCount(); increment_sweep_count();
_sweep_timer.stop(); _inter_sweep_timer.stop();
_sweep_estimate.sample(_sweep_timer.seconds()); _inter_sweep_estimate.sample(_inter_sweep_timer.seconds());
size_policy()->avg_cms_free_at_sweep()->sample(_cmsGen->free()); size_policy()->avg_cms_free_at_sweep()->sample(_cmsGen->free());
// PermGen verification support: If perm gen sweeping is disabled in // PermGen verification support: If perm gen sweeping is disabled in
...@@ -5893,6 +5913,9 @@ void CMSCollector::sweep(bool asynch) { ...@@ -5893,6 +5913,9 @@ void CMSCollector::sweep(bool asynch) {
} }
} }
assert(!_intra_sweep_timer.is_active(), "Should not be active");
_intra_sweep_timer.reset();
_intra_sweep_timer.start();
if (asynch) { if (asynch) {
TraceCPUTime tcpu(PrintGCDetails, true, gclog_or_tty); TraceCPUTime tcpu(PrintGCDetails, true, gclog_or_tty);
CMSPhaseAccounting pa(this, "sweep", !PrintGCDetails); CMSPhaseAccounting pa(this, "sweep", !PrintGCDetails);
...@@ -5937,8 +5960,11 @@ void CMSCollector::sweep(bool asynch) { ...@@ -5937,8 +5960,11 @@ void CMSCollector::sweep(bool asynch) {
verify_work_stacks_empty(); verify_work_stacks_empty();
verify_overflow_empty(); verify_overflow_empty();
_sweep_timer.reset(); _intra_sweep_timer.stop();
_sweep_timer.start(); _intra_sweep_estimate.sample(_intra_sweep_timer.seconds());
_inter_sweep_timer.reset();
_inter_sweep_timer.start();
update_time_of_last_gc(os::javaTimeMillis()); update_time_of_last_gc(os::javaTimeMillis());
...@@ -5981,11 +6007,11 @@ void CMSCollector::sweep(bool asynch) { ...@@ -5981,11 +6007,11 @@ void CMSCollector::sweep(bool asynch) {
// FIX ME!!! Looks like this belongs in CFLSpace, with // FIX ME!!! Looks like this belongs in CFLSpace, with
// CMSGen merely delegating to it. // CMSGen merely delegating to it.
void ConcurrentMarkSweepGeneration::setNearLargestChunk() { void ConcurrentMarkSweepGeneration::setNearLargestChunk() {
double nearLargestPercent = 0.999; double nearLargestPercent = FLSLargestBlockCoalesceProximity;
HeapWord* minAddr = _cmsSpace->bottom(); HeapWord* minAddr = _cmsSpace->bottom();
HeapWord* largestAddr = HeapWord* largestAddr =
(HeapWord*) _cmsSpace->dictionary()->findLargestDict(); (HeapWord*) _cmsSpace->dictionary()->findLargestDict();
if (largestAddr == 0) { if (largestAddr == NULL) {
// The dictionary appears to be empty. In this case // The dictionary appears to be empty. In this case
// try to coalesce at the end of the heap. // try to coalesce at the end of the heap.
largestAddr = _cmsSpace->end(); largestAddr = _cmsSpace->end();
...@@ -5993,6 +6019,13 @@ void ConcurrentMarkSweepGeneration::setNearLargestChunk() { ...@@ -5993,6 +6019,13 @@ void ConcurrentMarkSweepGeneration::setNearLargestChunk() {
size_t largestOffset = pointer_delta(largestAddr, minAddr); size_t largestOffset = pointer_delta(largestAddr, minAddr);
size_t nearLargestOffset = size_t nearLargestOffset =
(size_t)((double)largestOffset * nearLargestPercent) - MinChunkSize; (size_t)((double)largestOffset * nearLargestPercent) - MinChunkSize;
if (PrintFLSStatistics != 0) {
gclog_or_tty->print_cr(
"CMS: Large Block: " PTR_FORMAT ";"
" Proximity: " PTR_FORMAT " -> " PTR_FORMAT,
largestAddr,
_cmsSpace->nearLargestChunk(), minAddr + nearLargestOffset);
}
_cmsSpace->set_nearLargestChunk(minAddr + nearLargestOffset); _cmsSpace->set_nearLargestChunk(minAddr + nearLargestOffset);
} }
...@@ -6072,9 +6105,11 @@ void CMSCollector::sweepWork(ConcurrentMarkSweepGeneration* gen, ...@@ -6072,9 +6105,11 @@ void CMSCollector::sweepWork(ConcurrentMarkSweepGeneration* gen,
assert_lock_strong(gen->freelistLock()); assert_lock_strong(gen->freelistLock());
assert_lock_strong(bitMapLock()); assert_lock_strong(bitMapLock());
assert(!_sweep_timer.is_active(), "Was switched off in an outer context"); assert(!_inter_sweep_timer.is_active(), "Was switched off in an outer context");
gen->cmsSpace()->beginSweepFLCensus((float)(_sweep_timer.seconds()), assert(_intra_sweep_timer.is_active(), "Was switched on in an outer context");
_sweep_estimate.padded_average()); gen->cmsSpace()->beginSweepFLCensus((float)(_inter_sweep_timer.seconds()),
_inter_sweep_estimate.padded_average(),
_intra_sweep_estimate.padded_average());
gen->setNearLargestChunk(); gen->setNearLargestChunk();
{ {
...@@ -6087,7 +6122,7 @@ void CMSCollector::sweepWork(ConcurrentMarkSweepGeneration* gen, ...@@ -6087,7 +6122,7 @@ void CMSCollector::sweepWork(ConcurrentMarkSweepGeneration* gen,
// end-of-sweep-census below will be off by a little bit. // end-of-sweep-census below will be off by a little bit.
} }
gen->cmsSpace()->sweep_completed(); gen->cmsSpace()->sweep_completed();
gen->cmsSpace()->endSweepFLCensus(sweepCount()); gen->cmsSpace()->endSweepFLCensus(sweep_count());
if (should_unload_classes()) { // unloaded classes this cycle, if (should_unload_classes()) { // unloaded classes this cycle,
_concurrent_cycles_since_last_unload = 0; // ... reset count _concurrent_cycles_since_last_unload = 0; // ... reset count
} else { // did not unload classes, } else { // did not unload classes,
......
...@@ -355,6 +355,11 @@ class CMSStats VALUE_OBJ_CLASS_SPEC { ...@@ -355,6 +355,11 @@ class CMSStats VALUE_OBJ_CLASS_SPEC {
unsigned int new_duty_cycle); unsigned int new_duty_cycle);
unsigned int icms_update_duty_cycle_impl(); unsigned int icms_update_duty_cycle_impl();
// In support of adjusting of cms trigger ratios based on history
// of concurrent mode failure.
double cms_free_adjustment_factor(size_t free) const;
void adjust_cms_free_adjustment_factor(bool fail, size_t free);
public: public:
CMSStats(ConcurrentMarkSweepGeneration* cms_gen, CMSStats(ConcurrentMarkSweepGeneration* cms_gen,
unsigned int alpha = CMSExpAvgFactor); unsigned int alpha = CMSExpAvgFactor);
...@@ -570,8 +575,11 @@ class CMSCollector: public CHeapObj { ...@@ -570,8 +575,11 @@ class CMSCollector: public CHeapObj {
// appropriately. // appropriately.
void check_gc_time_limit(); void check_gc_time_limit();
// XXX Move these to CMSStats ??? FIX ME !!! // XXX Move these to CMSStats ??? FIX ME !!!
elapsedTimer _sweep_timer; elapsedTimer _inter_sweep_timer; // time between sweeps
AdaptivePaddedAverage _sweep_estimate; elapsedTimer _intra_sweep_timer; // time _in_ sweeps
// padded decaying average estimates of the above
AdaptivePaddedAverage _inter_sweep_estimate;
AdaptivePaddedAverage _intra_sweep_estimate;
protected: protected:
ConcurrentMarkSweepGeneration* _cmsGen; // old gen (CMS) ConcurrentMarkSweepGeneration* _cmsGen; // old gen (CMS)
...@@ -625,6 +633,7 @@ class CMSCollector: public CHeapObj { ...@@ -625,6 +633,7 @@ class CMSCollector: public CHeapObj {
// . _collectorState <= Idling == post-sweep && pre-mark // . _collectorState <= Idling == post-sweep && pre-mark
// . _collectorState in (Idling, Sweeping) == {initial,final}marking || // . _collectorState in (Idling, Sweeping) == {initial,final}marking ||
// precleaning || abortablePrecleanb // precleaning || abortablePrecleanb
public:
enum CollectorState { enum CollectorState {
Resizing = 0, Resizing = 0,
Resetting = 1, Resetting = 1,
...@@ -636,6 +645,7 @@ class CMSCollector: public CHeapObj { ...@@ -636,6 +645,7 @@ class CMSCollector: public CHeapObj {
FinalMarking = 7, FinalMarking = 7,
Sweeping = 8 Sweeping = 8
}; };
protected:
static CollectorState _collectorState; static CollectorState _collectorState;
// State related to prologue/epilogue invocation for my generations // State related to prologue/epilogue invocation for my generations
...@@ -655,7 +665,7 @@ class CMSCollector: public CHeapObj { ...@@ -655,7 +665,7 @@ class CMSCollector: public CHeapObj {
int _numYields; int _numYields;
size_t _numDirtyCards; size_t _numDirtyCards;
uint _sweepCount; size_t _sweep_count;
// number of full gc's since the last concurrent gc. // number of full gc's since the last concurrent gc.
uint _full_gcs_since_conc_gc; uint _full_gcs_since_conc_gc;
...@@ -905,7 +915,7 @@ class CMSCollector: public CHeapObj { ...@@ -905,7 +915,7 @@ class CMSCollector: public CHeapObj {
// Check that the currently executing thread is the expected // Check that the currently executing thread is the expected
// one (foreground collector or background collector). // one (foreground collector or background collector).
void check_correct_thread_executing() PRODUCT_RETURN; static void check_correct_thread_executing() PRODUCT_RETURN;
// XXXPERM void print_statistics() PRODUCT_RETURN; // XXXPERM void print_statistics() PRODUCT_RETURN;
bool is_cms_reachable(HeapWord* addr); bool is_cms_reachable(HeapWord* addr);
...@@ -930,8 +940,8 @@ class CMSCollector: public CHeapObj { ...@@ -930,8 +940,8 @@ class CMSCollector: public CHeapObj {
static void set_foregroundGCShouldWait(bool v) { _foregroundGCShouldWait = v; } static void set_foregroundGCShouldWait(bool v) { _foregroundGCShouldWait = v; }
static bool foregroundGCIsActive() { return _foregroundGCIsActive; } static bool foregroundGCIsActive() { return _foregroundGCIsActive; }
static void set_foregroundGCIsActive(bool v) { _foregroundGCIsActive = v; } static void set_foregroundGCIsActive(bool v) { _foregroundGCIsActive = v; }
uint sweepCount() const { return _sweepCount; } size_t sweep_count() const { return _sweep_count; }
void incrementSweepCount() { _sweepCount++; } void increment_sweep_count() { _sweep_count++; }
// Timers/stats for gc scheduling and incremental mode pacing. // Timers/stats for gc scheduling and incremental mode pacing.
CMSStats& stats() { return _stats; } CMSStats& stats() { return _stats; }
...@@ -1165,6 +1175,11 @@ class ConcurrentMarkSweepGeneration: public CardGeneration { ...@@ -1165,6 +1175,11 @@ class ConcurrentMarkSweepGeneration: public CardGeneration {
virtual bool promotion_attempt_is_safe(size_t promotion_in_bytes, virtual bool promotion_attempt_is_safe(size_t promotion_in_bytes,
bool younger_handles_promotion_failure) const; bool younger_handles_promotion_failure) const;
// Inform this (non-young) generation that a promotion failure was
// encountered during a collection of a younger generation that
// promotes into this generation.
virtual void promotion_failure_occurred();
bool should_collect(bool full, size_t size, bool tlab); bool should_collect(bool full, size_t size, bool tlab);
virtual bool should_concurrent_collect() const; virtual bool should_concurrent_collect() const;
virtual bool is_too_full() const; virtual bool is_too_full() const;
......
...@@ -55,7 +55,8 @@ class FreeBlockDictionary: public CHeapObj { ...@@ -55,7 +55,8 @@ class FreeBlockDictionary: public CHeapObj {
virtual void dictCensusUpdate(size_t size, bool split, bool birth) = 0; virtual void dictCensusUpdate(size_t size, bool split, bool birth) = 0;
virtual bool coalDictOverPopulated(size_t size) = 0; virtual bool coalDictOverPopulated(size_t size) = 0;
virtual void beginSweepDictCensus(double coalSurplusPercent, virtual void beginSweepDictCensus(double coalSurplusPercent,
float sweep_current, float sweep_ewstimate) = 0; float inter_sweep_current, float inter_sweep_estimate,
float intra__sweep_current) = 0;
virtual void endSweepDictCensus(double splitSurplusPercent) = 0; virtual void endSweepDictCensus(double splitSurplusPercent) = 0;
virtual FreeChunk* findLargestDict() const = 0; virtual FreeChunk* findLargestDict() const = 0;
// verify that the given chunk is in the dictionary. // verify that the given chunk is in the dictionary.
...@@ -79,6 +80,7 @@ class FreeBlockDictionary: public CHeapObj { ...@@ -79,6 +80,7 @@ class FreeBlockDictionary: public CHeapObj {
} }
virtual void printDictCensus() const = 0; virtual void printDictCensus() const = 0;
virtual void print_free_lists(outputStream* st) const = 0;
virtual void verify() const = 0; virtual void verify() const = 0;
......
...@@ -67,3 +67,8 @@ void FreeChunk::verifyList() const { ...@@ -67,3 +67,8 @@ void FreeChunk::verifyList() const {
} }
} }
#endif #endif
void FreeChunk::print_on(outputStream* st) {
st->print_cr("Next: " PTR_FORMAT " Prev: " PTR_FORMAT " %s",
next(), prev(), cantCoalesce() ? "[can't coalesce]" : "");
}
...@@ -129,6 +129,8 @@ class FreeChunk VALUE_OBJ_CLASS_SPEC { ...@@ -129,6 +129,8 @@ class FreeChunk VALUE_OBJ_CLASS_SPEC {
void verifyList() const PRODUCT_RETURN; void verifyList() const PRODUCT_RETURN;
void mangleAllocated(size_t size) PRODUCT_RETURN; void mangleAllocated(size_t size) PRODUCT_RETURN;
void mangleFreed(size_t size) PRODUCT_RETURN; void mangleFreed(size_t size) PRODUCT_RETURN;
void print_on(outputStream* st);
}; };
// Alignment helpers etc. // Alignment helpers etc.
......
...@@ -81,8 +81,8 @@ void FreeList::reset(size_t hint) { ...@@ -81,8 +81,8 @@ void FreeList::reset(size_t hint) {
set_hint(hint); set_hint(hint);
} }
void FreeList::init_statistics() { void FreeList::init_statistics(bool split_birth) {
_allocation_stats.initialize(); _allocation_stats.initialize(split_birth);
} }
FreeChunk* FreeList::getChunkAtHead() { FreeChunk* FreeList::getChunkAtHead() {
...@@ -292,14 +292,31 @@ bool FreeList::verifyChunkInFreeLists(FreeChunk* fc) const { ...@@ -292,14 +292,31 @@ bool FreeList::verifyChunkInFreeLists(FreeChunk* fc) const {
} }
#ifndef PRODUCT #ifndef PRODUCT
void FreeList::verify_stats() const {
// The +1 of the LH comparand is to allow some "looseness" in
// checking: we usually call this interface when adding a block
// and we'll subsequently update the stats; we cannot update the
// stats beforehand because in the case of the large-block BT
// dictionary for example, this might be the first block and
// in that case there would be no place that we could record
// the stats (which are kept in the block itself).
assert(_allocation_stats.prevSweep() + _allocation_stats.splitBirths() + 1 // Total Stock + 1
>= _allocation_stats.splitDeaths() + (ssize_t)count(), "Conservation Principle");
}
void FreeList::assert_proper_lock_protection_work() const { void FreeList::assert_proper_lock_protection_work() const {
#ifdef ASSERT assert(_protecting_lock != NULL, "Don't call this directly");
if (_protecting_lock != NULL && assert(ParallelGCThreads > 0, "Don't call this directly");
SharedHeap::heap()->n_par_threads() > 0) { Thread* thr = Thread::current();
// Should become an assert. if (thr->is_VM_thread() || thr->is_ConcurrentGC_thread()) {
guarantee(_protecting_lock->owned_by_self(), "FreeList RACE DETECTED"); // assert that we are holding the freelist lock
} else if (thr->is_GC_task_thread()) {
assert(_protecting_lock->owned_by_self(), "FreeList RACE DETECTED");
} else if (thr->is_Java_thread()) {
assert(!SafepointSynchronize::is_at_safepoint(), "Should not be executing");
} else {
ShouldNotReachHere(); // unaccounted thread type?
} }
#endif
} }
#endif #endif
......
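The new verify_stats() check is simple bookkeeping conservation: the stock recorded at the last census, plus blocks created by splitting since then (and one block of slack for the in-flight update described in the comment), must cover both the blocks still on the list and those consumed by splitting. Restated as a tiny predicate (plain C++, the names are stand-ins for the AllocationStats fields):

    bool stock_is_conserved(long prev_sweep_count, long split_births,
                            long split_deaths, long count_now) {
      // prev_sweep_count + split_births + 1 : everything that ever existed at
      //                                       this size since the census, plus slack
      // split_deaths + count_now            : everything accounted for right now
      return prev_sweep_count + split_births + 1 >= split_deaths + count_now;
    }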
...@@ -35,18 +35,26 @@ class CompactibleFreeListSpace; ...@@ -35,18 +35,26 @@ class CompactibleFreeListSpace;
// for that implementation. // for that implementation.
class Mutex; class Mutex;
class TreeList;
class FreeList VALUE_OBJ_CLASS_SPEC { class FreeList VALUE_OBJ_CLASS_SPEC {
friend class CompactibleFreeListSpace; friend class CompactibleFreeListSpace;
friend class VMStructs; friend class VMStructs;
friend class printTreeCensusClosure; friend class PrintTreeCensusClosure;
FreeChunk* _head; // List of free chunks
protected:
TreeList* _parent;
TreeList* _left;
TreeList* _right;
private:
FreeChunk* _head; // Head of list of free chunks
FreeChunk* _tail; // Tail of list of free chunks FreeChunk* _tail; // Tail of list of free chunks
size_t _size; // Size in Heap words of each chunks size_t _size; // Size in Heap words of each chunk
ssize_t _count; // Number of entries in list ssize_t _count; // Number of entries in list
size_t _hint; // next larger size list with a positive surplus size_t _hint; // next larger size list with a positive surplus
AllocationStats _allocation_stats; // statistics for smart allocation AllocationStats _allocation_stats; // allocation-related statistics
#ifdef ASSERT #ifdef ASSERT
Mutex* _protecting_lock; Mutex* _protecting_lock;
...@@ -63,9 +71,12 @@ class FreeList VALUE_OBJ_CLASS_SPEC { ...@@ -63,9 +71,12 @@ class FreeList VALUE_OBJ_CLASS_SPEC {
// Initialize the allocation statistics. // Initialize the allocation statistics.
protected: protected:
void init_statistics(); void init_statistics(bool split_birth = false);
void set_count(ssize_t v) { _count = v;} void set_count(ssize_t v) { _count = v;}
void increment_count() { _count++; } void increment_count() {
_count++;
}
void decrement_count() { void decrement_count() {
_count--; _count--;
assert(_count >= 0, "Count should not be negative"); assert(_count >= 0, "Count should not be negative");
...@@ -167,11 +178,13 @@ class FreeList VALUE_OBJ_CLASS_SPEC { ...@@ -167,11 +178,13 @@ class FreeList VALUE_OBJ_CLASS_SPEC {
_allocation_stats.set_desired(v); _allocation_stats.set_desired(v);
} }
void compute_desired(float inter_sweep_current, void compute_desired(float inter_sweep_current,
float inter_sweep_estimate) { float inter_sweep_estimate,
float intra_sweep_estimate) {
assert_proper_lock_protection(); assert_proper_lock_protection();
_allocation_stats.compute_desired(_count, _allocation_stats.compute_desired(_count,
inter_sweep_current, inter_sweep_current,
inter_sweep_estimate); inter_sweep_estimate,
intra_sweep_estimate);
} }
ssize_t coalDesired() const { ssize_t coalDesired() const {
return _allocation_stats.coalDesired(); return _allocation_stats.coalDesired();
...@@ -306,6 +319,9 @@ class FreeList VALUE_OBJ_CLASS_SPEC { ...@@ -306,6 +319,9 @@ class FreeList VALUE_OBJ_CLASS_SPEC {
// found. Return NULL if "fc" is not found. // found. Return NULL if "fc" is not found.
bool verifyChunkInFreeLists(FreeChunk* fc) const; bool verifyChunkInFreeLists(FreeChunk* fc) const;
// Stats verification
void verify_stats() const PRODUCT_RETURN;
// Printing support // Printing support
static void print_labels_on(outputStream* st, const char* c); static void print_labels_on(outputStream* st, const char* c);
void print_on(outputStream* st, const char* c = NULL) const; void print_on(outputStream* st, const char* c = NULL) const;
......
...@@ -221,6 +221,7 @@ freeList.cpp freeList.hpp ...@@ -221,6 +221,7 @@ freeList.cpp freeList.hpp
freeList.cpp globals.hpp freeList.cpp globals.hpp
freeList.cpp mutex.hpp freeList.cpp mutex.hpp
freeList.cpp sharedHeap.hpp freeList.cpp sharedHeap.hpp
freeList.cpp vmThread.hpp
freeList.hpp allocationStats.hpp freeList.hpp allocationStats.hpp
......
...@@ -71,6 +71,7 @@ gcUtil.cpp gcUtil.hpp ...@@ -71,6 +71,7 @@ gcUtil.cpp gcUtil.hpp
gcUtil.hpp allocation.hpp gcUtil.hpp allocation.hpp
gcUtil.hpp debug.hpp gcUtil.hpp debug.hpp
gcUtil.hpp globalDefinitions.hpp gcUtil.hpp globalDefinitions.hpp
gcUtil.hpp ostream.hpp
gcUtil.hpp timer.hpp gcUtil.hpp timer.hpp
generationCounters.cpp generationCounters.hpp generationCounters.cpp generationCounters.hpp
......
...@@ -50,6 +50,7 @@ ParScanThreadState::ParScanThreadState(Space* to_space_, ...@@ -50,6 +50,7 @@ ParScanThreadState::ParScanThreadState(Space* to_space_,
work_queue_set_, &term_), work_queue_set_, &term_),
_is_alive_closure(gen_), _scan_weak_ref_closure(gen_, this), _is_alive_closure(gen_), _scan_weak_ref_closure(gen_, this),
_keep_alive_closure(&_scan_weak_ref_closure), _keep_alive_closure(&_scan_weak_ref_closure),
_promotion_failure_size(0),
_pushes(0), _pops(0), _steals(0), _steal_attempts(0), _term_attempts(0), _pushes(0), _pops(0), _steals(0), _steal_attempts(0), _term_attempts(0),
_strong_roots_time(0.0), _term_time(0.0) _strong_roots_time(0.0), _term_time(0.0)
{ {
...@@ -249,6 +250,16 @@ void ParScanThreadState::undo_alloc_in_to_space(HeapWord* obj, ...@@ -249,6 +250,16 @@ void ParScanThreadState::undo_alloc_in_to_space(HeapWord* obj,
} }
} }
void ParScanThreadState::print_and_clear_promotion_failure_size() {
if (_promotion_failure_size != 0) {
if (PrintPromotionFailure) {
gclog_or_tty->print(" (%d: promotion failure size = " SIZE_FORMAT ") ",
_thread_num, _promotion_failure_size);
}
_promotion_failure_size = 0;
}
}
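With PrintPromotionFailure enabled, each worker that recorded a failure emits one entry built from the format string above, for example " (3: promotion failure size = 2048) ", where 3 is the worker id and 2048 the size of the first object that worker failed to promote (a word count); the numbers here are purely illustrative.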
class ParScanThreadStateSet: private ResourceArray { class ParScanThreadStateSet: private ResourceArray {
public: public:
// Initializes states for the specified number of threads; // Initializes states for the specified number of threads;
...@@ -260,11 +271,11 @@ public: ...@@ -260,11 +271,11 @@ public:
GrowableArray<oop>** overflow_stacks_, GrowableArray<oop>** overflow_stacks_,
size_t desired_plab_sz, size_t desired_plab_sz,
ParallelTaskTerminator& term); ParallelTaskTerminator& term);
inline ParScanThreadState& thread_sate(int i); inline ParScanThreadState& thread_state(int i);
int pushes() { return _pushes; } int pushes() { return _pushes; }
int pops() { return _pops; } int pops() { return _pops; }
int steals() { return _steals; } int steals() { return _steals; }
void reset(); void reset(bool promotion_failed);
void flush(); void flush();
private: private:
ParallelTaskTerminator& _term; ParallelTaskTerminator& _term;
...@@ -295,22 +306,31 @@ ParScanThreadStateSet::ParScanThreadStateSet( ...@@ -295,22 +306,31 @@ ParScanThreadStateSet::ParScanThreadStateSet(
} }
} }
inline ParScanThreadState& ParScanThreadStateSet::thread_sate(int i) inline ParScanThreadState& ParScanThreadStateSet::thread_state(int i)
{ {
assert(i >= 0 && i < length(), "sanity check!"); assert(i >= 0 && i < length(), "sanity check!");
return ((ParScanThreadState*)_data)[i]; return ((ParScanThreadState*)_data)[i];
} }
void ParScanThreadStateSet::reset() void ParScanThreadStateSet::reset(bool promotion_failed)
{ {
_term.reset_for_reuse(); _term.reset_for_reuse();
if (promotion_failed) {
for (int i = 0; i < length(); ++i) {
thread_state(i).print_and_clear_promotion_failure_size();
}
}
} }
void ParScanThreadStateSet::flush() void ParScanThreadStateSet::flush()
{ {
// Work in this loop should be kept as lightweight as
// possible since this might otherwise become a bottleneck
// to scaling. Should we add heavy-weight work into this
// loop, consider parallelizing the loop into the worker threads.
for (int i = 0; i < length(); ++i) { for (int i = 0; i < length(); ++i) {
ParScanThreadState& par_scan_state = thread_sate(i); ParScanThreadState& par_scan_state = thread_state(i);
// Flush stats related to To-space PLAB activity and // Flush stats related to To-space PLAB activity and
// retire the last buffer. // retire the last buffer.
...@@ -362,6 +382,14 @@ void ParScanThreadStateSet::flush() ...@@ -362,6 +382,14 @@ void ParScanThreadStateSet::flush()
} }
} }
} }
if (UseConcMarkSweepGC && ParallelGCThreads > 0) {
// We need to call this even when ResizeOldPLAB is disabled
// so as to avoid breaking some asserts. While we may be able
// to avoid this by reorganizing the code a bit, I am loathe
// to do that unless we find cases where ergo leads to bad
// performance.
CFLS_LAB::compute_desired_plab_size();
}
} }
ParScanClosure::ParScanClosure(ParNewGeneration* g, ParScanClosure::ParScanClosure(ParNewGeneration* g,
...@@ -475,7 +503,7 @@ void ParNewGenTask::work(int i) { ...@@ -475,7 +503,7 @@ void ParNewGenTask::work(int i) {
Generation* old_gen = gch->next_gen(_gen); Generation* old_gen = gch->next_gen(_gen);
ParScanThreadState& par_scan_state = _state_set->thread_sate(i); ParScanThreadState& par_scan_state = _state_set->thread_state(i);
par_scan_state.set_young_old_boundary(_young_old_boundary); par_scan_state.set_young_old_boundary(_young_old_boundary);
par_scan_state.start_strong_roots(); par_scan_state.start_strong_roots();
...@@ -659,7 +687,7 @@ void ParNewRefProcTaskProxy::work(int i) ...@@ -659,7 +687,7 @@ void ParNewRefProcTaskProxy::work(int i)
{ {
ResourceMark rm; ResourceMark rm;
HandleMark hm; HandleMark hm;
ParScanThreadState& par_scan_state = _state_set.thread_sate(i); ParScanThreadState& par_scan_state = _state_set.thread_state(i);
par_scan_state.set_young_old_boundary(_young_old_boundary); par_scan_state.set_young_old_boundary(_young_old_boundary);
_task.work(i, par_scan_state.is_alive_closure(), _task.work(i, par_scan_state.is_alive_closure(),
par_scan_state.keep_alive_closure(), par_scan_state.keep_alive_closure(),
...@@ -693,7 +721,7 @@ void ParNewRefProcTaskExecutor::execute(ProcessTask& task) ...@@ -693,7 +721,7 @@ void ParNewRefProcTaskExecutor::execute(ProcessTask& task)
ParNewRefProcTaskProxy rp_task(task, _generation, *_generation.next_gen(), ParNewRefProcTaskProxy rp_task(task, _generation, *_generation.next_gen(),
_generation.reserved().end(), _state_set); _generation.reserved().end(), _state_set);
workers->run_task(&rp_task); workers->run_task(&rp_task);
_state_set.reset(); _state_set.reset(_generation.promotion_failed());
} }
void ParNewRefProcTaskExecutor::execute(EnqueueTask& task) void ParNewRefProcTaskExecutor::execute(EnqueueTask& task)
...@@ -813,7 +841,7 @@ void ParNewGeneration::collect(bool full, ...@@ -813,7 +841,7 @@ void ParNewGeneration::collect(bool full,
GenCollectedHeap::StrongRootsScope srs(gch); GenCollectedHeap::StrongRootsScope srs(gch);
tsk.work(0); tsk.work(0);
} }
thread_state_set.reset(); thread_state_set.reset(promotion_failed());
if (PAR_STATS_ENABLED && ParallelGCVerbose) { if (PAR_STATS_ENABLED && ParallelGCVerbose) {
gclog_or_tty->print("Thread totals:\n" gclog_or_tty->print("Thread totals:\n"
...@@ -882,6 +910,8 @@ void ParNewGeneration::collect(bool full, ...@@ -882,6 +910,8 @@ void ParNewGeneration::collect(bool full,
swap_spaces(); // Make life simpler for CMS || rescan; see 6483690. swap_spaces(); // Make life simpler for CMS || rescan; see 6483690.
from()->set_next_compaction_space(to()); from()->set_next_compaction_space(to());
gch->set_incremental_collection_will_fail(); gch->set_incremental_collection_will_fail();
// Inform the next generation that a promotion failure occurred.
_next_gen->promotion_failure_occurred();
// Reset the PromotionFailureALot counters. // Reset the PromotionFailureALot counters.
NOT_PRODUCT(Universe::heap()->reset_promotion_should_fail();) NOT_PRODUCT(Universe::heap()->reset_promotion_should_fail();)
...@@ -1029,6 +1059,8 @@ oop ParNewGeneration::copy_to_survivor_space_avoiding_promotion_undo( ...@@ -1029,6 +1059,8 @@ oop ParNewGeneration::copy_to_survivor_space_avoiding_promotion_undo(
new_obj = old; new_obj = old;
preserve_mark_if_necessary(old, m); preserve_mark_if_necessary(old, m);
// Log the size of the maiden promotion failure
par_scan_state->log_promotion_failure(sz);
} }
old->forward_to(new_obj); old->forward_to(new_obj);
...@@ -1150,6 +1182,8 @@ oop ParNewGeneration::copy_to_survivor_space_with_undo( ...@@ -1150,6 +1182,8 @@ oop ParNewGeneration::copy_to_survivor_space_with_undo(
failed_to_promote = true; failed_to_promote = true;
preserve_mark_if_necessary(old, m); preserve_mark_if_necessary(old, m);
// Log the size of the maiden promotion failure
par_scan_state->log_promotion_failure(sz);
} }
} else { } else {
// Is in to-space; do copying ourselves. // Is in to-space; do copying ourselves.
......
...@@ -97,6 +97,9 @@ class ParScanThreadState { ...@@ -97,6 +97,9 @@ class ParScanThreadState {
int _pushes, _pops, _steals, _steal_attempts, _term_attempts; int _pushes, _pops, _steals, _steal_attempts, _term_attempts;
int _overflow_pushes, _overflow_refills, _overflow_refill_objs; int _overflow_pushes, _overflow_refills, _overflow_refill_objs;
// Stats for promotion failure
size_t _promotion_failure_size;
// Timing numbers. // Timing numbers.
double _start; double _start;
double _start_strong_roots; double _start_strong_roots;
...@@ -169,6 +172,15 @@ class ParScanThreadState { ...@@ -169,6 +172,15 @@ class ParScanThreadState {
// Undo the most recent allocation ("obj", of "word_sz"). // Undo the most recent allocation ("obj", of "word_sz").
void undo_alloc_in_to_space(HeapWord* obj, size_t word_sz); void undo_alloc_in_to_space(HeapWord* obj, size_t word_sz);
// Promotion failure stats
  size_t promotion_failure_size() { return _promotion_failure_size; }
void log_promotion_failure(size_t sz) {
if (_promotion_failure_size == 0) {
_promotion_failure_size = sz;
}
}
void print_and_clear_promotion_failure_size();
int pushes() { return _pushes; } int pushes() { return _pushes; }
int pops() { return _pops; } int pops() { return _pops; }
int steals() { return _steals; } int steals() { return _steals; }
......
...@@ -31,7 +31,7 @@ class AllocationStats VALUE_OBJ_CLASS_SPEC { ...@@ -31,7 +31,7 @@ class AllocationStats VALUE_OBJ_CLASS_SPEC {
// beginning of this sweep: // beginning of this sweep:
// Count(end_last_sweep) - Count(start_this_sweep) // Count(end_last_sweep) - Count(start_this_sweep)
// + splitBirths(between) - splitDeaths(between) // + splitBirths(between) - splitDeaths(between)
// The above number divided by the time since the start [END???] of the // The above number divided by the time since the end of the
// previous sweep gives us a time rate of demand for blocks // previous sweep gives us a time rate of demand for blocks
// of this size. We compute a padded average of this rate as // of this size. We compute a padded average of this rate as
// our current estimate for the time rate of demand for blocks // our current estimate for the time rate of demand for blocks
...@@ -41,7 +41,7 @@ class AllocationStats VALUE_OBJ_CLASS_SPEC { ...@@ -41,7 +41,7 @@ class AllocationStats VALUE_OBJ_CLASS_SPEC {
// estimates. // estimates.
AdaptivePaddedAverage _demand_rate_estimate; AdaptivePaddedAverage _demand_rate_estimate;
  ssize_t _desired; // Estimate computed as described above ssize_t _desired; // Demand estimate computed as described above
ssize_t _coalDesired; // desired +/- small-percent for tuning coalescing ssize_t _coalDesired; // desired +/- small-percent for tuning coalescing
ssize_t _surplus; // count - (desired +/- small-percent), ssize_t _surplus; // count - (desired +/- small-percent),
...@@ -53,9 +53,9 @@ class AllocationStats VALUE_OBJ_CLASS_SPEC { ...@@ -53,9 +53,9 @@ class AllocationStats VALUE_OBJ_CLASS_SPEC {
ssize_t _coalDeaths; // loss from coalescing ssize_t _coalDeaths; // loss from coalescing
ssize_t _splitBirths; // additional chunks from splitting ssize_t _splitBirths; // additional chunks from splitting
ssize_t _splitDeaths; // loss from splitting ssize_t _splitDeaths; // loss from splitting
size_t _returnedBytes; // number of bytes returned to list. size_t _returnedBytes; // number of bytes returned to list.
public: public:
void initialize() { void initialize(bool split_birth = false) {
AdaptivePaddedAverage* dummy = AdaptivePaddedAverage* dummy =
new (&_demand_rate_estimate) AdaptivePaddedAverage(CMS_FLSWeight, new (&_demand_rate_estimate) AdaptivePaddedAverage(CMS_FLSWeight,
CMS_FLSPadding); CMS_FLSPadding);
...@@ -67,7 +67,7 @@ class AllocationStats VALUE_OBJ_CLASS_SPEC { ...@@ -67,7 +67,7 @@ class AllocationStats VALUE_OBJ_CLASS_SPEC {
_beforeSweep = 0; _beforeSweep = 0;
_coalBirths = 0; _coalBirths = 0;
_coalDeaths = 0; _coalDeaths = 0;
_splitBirths = 0; _splitBirths = split_birth? 1 : 0;
_splitDeaths = 0; _splitDeaths = 0;
_returnedBytes = 0; _returnedBytes = 0;
} }
...@@ -75,10 +75,12 @@ class AllocationStats VALUE_OBJ_CLASS_SPEC { ...@@ -75,10 +75,12 @@ class AllocationStats VALUE_OBJ_CLASS_SPEC {
AllocationStats() { AllocationStats() {
initialize(); initialize();
} }
// The rate estimate is in blocks per second. // The rate estimate is in blocks per second.
void compute_desired(size_t count, void compute_desired(size_t count,
float inter_sweep_current, float inter_sweep_current,
float inter_sweep_estimate) { float inter_sweep_estimate,
float intra_sweep_estimate) {
// If the latest inter-sweep time is below our granularity // If the latest inter-sweep time is below our granularity
// of measurement, we may call in here with // of measurement, we may call in here with
// inter_sweep_current == 0. However, even for suitably small // inter_sweep_current == 0. However, even for suitably small
...@@ -88,12 +90,31 @@ class AllocationStats VALUE_OBJ_CLASS_SPEC { ...@@ -88,12 +90,31 @@ class AllocationStats VALUE_OBJ_CLASS_SPEC {
// vulnerable to noisy glitches. In such cases, we // vulnerable to noisy glitches. In such cases, we
// ignore the current sample and use currently available // ignore the current sample and use currently available
// historical estimates. // historical estimates.
// XXX NEEDS TO BE FIXED
// assert(prevSweep() + splitBirths() >= splitDeaths() + (ssize_t)count, "Conservation Principle");
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
// "Total Stock" "Not used at this block size"
if (inter_sweep_current > _threshold) { if (inter_sweep_current > _threshold) {
ssize_t demand = prevSweep() - count + splitBirths() - splitDeaths(); ssize_t demand = prevSweep() - (ssize_t)count + splitBirths() - splitDeaths();
// XXX NEEDS TO BE FIXED
// assert(demand >= 0, "Demand should be non-negative");
// Defensive: adjust for imprecision in event counting
if (demand < 0) {
demand = 0;
}
float old_rate = _demand_rate_estimate.padded_average();
float rate = ((float)demand)/inter_sweep_current; float rate = ((float)demand)/inter_sweep_current;
_demand_rate_estimate.sample(rate); _demand_rate_estimate.sample(rate);
_desired = (ssize_t)(_demand_rate_estimate.padded_average() float new_rate = _demand_rate_estimate.padded_average();
*inter_sweep_estimate); ssize_t old_desired = _desired;
_desired = (ssize_t)(new_rate * (inter_sweep_estimate
+ CMSExtrapolateSweep
? intra_sweep_estimate
: 0.0));
if (PrintFLSStatistics > 1) {
gclog_or_tty->print_cr("demand: %d, old_rate: %f, current_rate: %f, new_rate: %f, old_desired: %d, new_desired: %d",
demand, old_rate, rate, new_rate, old_desired, _desired);
}
} }
} }
......
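The loop above estimates, per block size, how many free blocks should be kept on hand: the demand observed over the last inter-sweep interval (prevSweep - count + splitBirths - splitDeaths, the quantity the commented-out "Conservation Principle" assertion tries to bound) is turned into a blocks-per-second rate, folded into the padded exponential average, and multiplied by the expected length of the next interval. Note that in the expression as written the addition binds tighter than the conditional operator, so it evaluates as (inter_sweep_estimate + CMSExtrapolateSweep) ? intra_sweep_estimate : 0.0; judging by the CMSExtrapolateSweep description later in this change ("cushion for block demand during sweep"), the intent appears to be to add the intra-sweep time only when that flag is set. The standalone sketch below writes that intended form with explicit parentheses; the names are illustrative stand-ins, not the HotSpot declarations.

    // Standalone sketch (assumed names, not the HotSpot declarations) of the
    // intended per-block-size inventory computation, with the conditional
    // parenthesized so the sweep-time cushion is only added when extrapolation
    // is requested. In the real code the raw rate is first folded into the
    // padded exponential average; here it is used directly for brevity.
    #include <cstddef>

    long desired_block_count(long prev_sweep,             // inventory at end of last sweep
                             std::size_t count,           // inventory now
                             long split_births,           // gained from splitting
                             long split_deaths,           // lost to splitting
                             double inter_sweep_seconds,  // time since last sweep
                             double inter_sweep_estimate, // expected time to next sweep
                             double intra_sweep_estimate, // expected sweep duration
                             bool extrapolate_sweep) {    // models CMSExtrapolateSweep
      long demand = prev_sweep - (long)count + split_births - split_deaths;
      if (demand < 0) demand = 0;                         // defensive, as in the patch
      double rate = inter_sweep_seconds > 0.0 ? demand / inter_sweep_seconds : 0.0;
      double horizon = inter_sweep_estimate +
                       (extrapolate_sweep ? intra_sweep_estimate : 0.0);
      return (long)(rate * horizon);                      // blocks to keep on hand
    }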
...@@ -52,11 +52,35 @@ void AdaptiveWeightedAverage::sample(float new_sample) { ...@@ -52,11 +52,35 @@ void AdaptiveWeightedAverage::sample(float new_sample) {
_last_sample = new_sample; _last_sample = new_sample;
} }
void AdaptiveWeightedAverage::print() const {
print_on(tty);
}
void AdaptiveWeightedAverage::print_on(outputStream* st) const {
guarantee(false, "NYI");
}
void AdaptivePaddedAverage::print() const {
print_on(tty);
}
void AdaptivePaddedAverage::print_on(outputStream* st) const {
guarantee(false, "NYI");
}
void AdaptivePaddedNoZeroDevAverage::print() const {
print_on(tty);
}
void AdaptivePaddedNoZeroDevAverage::print_on(outputStream* st) const {
guarantee(false, "NYI");
}
void AdaptivePaddedAverage::sample(float new_sample) { void AdaptivePaddedAverage::sample(float new_sample) {
// Compute our parent classes sample information // Compute new adaptive weighted average based on new sample.
AdaptiveWeightedAverage::sample(new_sample); AdaptiveWeightedAverage::sample(new_sample);
// Now compute the deviation and the new padded sample // Now update the deviation and the padded average.
float new_avg = average(); float new_avg = average();
float new_dev = compute_adaptive_average(fabsd(new_sample - new_avg), float new_dev = compute_adaptive_average(fabsd(new_sample - new_avg),
deviation()); deviation());
......
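The sample() routines above implement exponentially decaying averages: each new sample is blended with the running average according to a weight given as a percentage, and the padded variant additionally decays the absolute deviation so padded_average() can report the average plus padding times deviation. Read that way, raising a weight (as the flag changes later in this patch do for the free-list and sweep statistics) makes the filter track abrupt demand changes faster, and lowering the padding shrinks the safety margin added on top of the mean. A minimal self-contained sketch under those assumptions (illustrative names, not the HotSpot class):

    #include <cmath>

    // Minimal sketch of a weight/padding filter: 'weight' is the percentage
    // (0-100) given to the newest sample, 'padding' the number of deviations
    // added as head room when reporting the padded average.
    class PaddedAverageSketch {
      float    _avg;     // exponentially decaying average
      float    _dev;     // exponentially decaying |sample - average|
      unsigned _weight;
      unsigned _pad;
     public:
      PaddedAverageSketch(unsigned weight, unsigned padding)
        : _avg(0.0f), _dev(0.0f), _weight(weight), _pad(padding) {}

      static float exp_avg(float avg, float sample, unsigned weight) {
        return (100.0f - weight) * avg / 100.0f + weight * sample / 100.0f;
      }

      void sample(float s) {
        _avg = exp_avg(_avg, s, _weight);                   // average step
        _dev = exp_avg(_dev, std::fabs(s - _avg), _weight); // deviation step
      }

      float average() const        { return _avg; }
      float padded_average() const { return _avg + _pad * _dev; }
    };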
...@@ -54,8 +54,8 @@ class AdaptiveWeightedAverage : public CHeapObj { ...@@ -54,8 +54,8 @@ class AdaptiveWeightedAverage : public CHeapObj {
public: public:
// Input weight must be between 0 and 100 // Input weight must be between 0 and 100
AdaptiveWeightedAverage(unsigned weight) : AdaptiveWeightedAverage(unsigned weight, float avg = 0.0) :
_average(0.0), _sample_count(0), _weight(weight), _last_sample(0.0) { _average(avg), _sample_count(0), _weight(weight), _last_sample(0.0) {
} }
void clear() { void clear() {
...@@ -64,6 +64,13 @@ class AdaptiveWeightedAverage : public CHeapObj { ...@@ -64,6 +64,13 @@ class AdaptiveWeightedAverage : public CHeapObj {
_last_sample = 0; _last_sample = 0;
} }
// Useful for modifying static structures after startup.
void modify(size_t avg, unsigned wt, bool force = false) {
assert(force, "Are you sure you want to call this?");
_average = (float)avg;
_weight = wt;
}
// Accessors // Accessors
float average() const { return _average; } float average() const { return _average; }
unsigned weight() const { return _weight; } unsigned weight() const { return _weight; }
...@@ -83,6 +90,10 @@ class AdaptiveWeightedAverage : public CHeapObj { ...@@ -83,6 +90,10 @@ class AdaptiveWeightedAverage : public CHeapObj {
// Convert to float and back to avoid integer overflow. // Convert to float and back to avoid integer overflow.
return (size_t)exp_avg((float)avg, (float)sample, weight); return (size_t)exp_avg((float)avg, (float)sample, weight);
} }
// Printing
void print_on(outputStream* st) const;
void print() const;
}; };
...@@ -129,6 +140,10 @@ class AdaptivePaddedAverage : public AdaptiveWeightedAverage { ...@@ -129,6 +140,10 @@ class AdaptivePaddedAverage : public AdaptiveWeightedAverage {
// Override // Override
void sample(float new_sample); void sample(float new_sample);
// Printing
void print_on(outputStream* st) const;
void print() const;
}; };
// A weighted average that includes a deviation from the average, // A weighted average that includes a deviation from the average,
...@@ -146,7 +161,12 @@ public: ...@@ -146,7 +161,12 @@ public:
AdaptivePaddedAverage(weight, padding) {} AdaptivePaddedAverage(weight, padding) {}
// Override // Override
void sample(float new_sample); void sample(float new_sample);
// Printing
void print_on(outputStream* st) const;
void print() const;
}; };
// Use a least squares fit to a set of data to generate a linear // Use a least squares fit to a set of data to generate a linear
// equation. // equation.
// y = intercept + slope * x // y = intercept + slope * x
......
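The new avg constructor argument and the modify() accessor exist so that filters built during static initialization can be re-seeded once the command line has been parsed; the arguments.cpp change below calls CFLS_LAB::modify_initialization(OldPLABSize, OldPLABWeight) for exactly that purpose. A hedged sketch of the pattern, with a toy class standing in for AdaptiveWeightedAverage:

    #include <cassert>
    #include <cstddef>

    // Toy stand-in for the filter: modify() deliberately asserts on 'force' so
    // callers must acknowledge that mutating a statically built object is
    // intentional, mirroring the accessor added above.
    struct WeightedAverageSketch {
      float    average;
      unsigned weight;   // percentage (0-100) given to the newest sample

      void modify(std::size_t avg, unsigned wt, bool force = false) {
        assert(force && "Are you sure you want to call this?");
        average = (float)avg;
        weight  = wt;
      }
    };

    // Built during static initialization, i.e. before flags are known.
    static WeightedAverageSketch old_plab_filter = { 0.0f, 50 };

    // Called after argument parsing; models the CFLS_LAB::modify_initialization idea.
    void modify_initialization(std::size_t old_plab_size, unsigned old_plab_weight) {
      old_plab_filter.modify(old_plab_size, old_plab_weight, true /* force */);
    }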
...@@ -21,6 +21,8 @@ ...@@ -21,6 +21,8 @@
// have any questions. // have any questions.
// //
arguments.cpp compactibleFreeListSpace.hpp
assembler_<arch>.cpp g1SATBCardTableModRefBS.hpp assembler_<arch>.cpp g1SATBCardTableModRefBS.hpp
assembler_<arch>.cpp g1CollectedHeap.inline.hpp assembler_<arch>.cpp g1CollectedHeap.inline.hpp
assembler_<arch>.cpp heapRegion.hpp assembler_<arch>.cpp heapRegion.hpp
......
...@@ -609,7 +609,7 @@ void DefNewGeneration::collect(bool full, ...@@ -609,7 +609,7 @@ void DefNewGeneration::collect(bool full,
remove_forwarding_pointers(); remove_forwarding_pointers();
if (PrintGCDetails) { if (PrintGCDetails) {
gclog_or_tty->print(" (promotion failed)"); gclog_or_tty->print(" (promotion failed) ");
} }
// Add to-space to the list of space to compact // Add to-space to the list of space to compact
// when a promotion failure has occurred. In that // when a promotion failure has occurred. In that
...@@ -620,6 +620,9 @@ void DefNewGeneration::collect(bool full, ...@@ -620,6 +620,9 @@ void DefNewGeneration::collect(bool full,
from()->set_next_compaction_space(to()); from()->set_next_compaction_space(to());
gch->set_incremental_collection_will_fail(); gch->set_incremental_collection_will_fail();
// Inform the next generation that a promotion failure occurred.
_next_gen->promotion_failure_occurred();
// Reset the PromotionFailureALot counters. // Reset the PromotionFailureALot counters.
NOT_PRODUCT(Universe::heap()->reset_promotion_should_fail();) NOT_PRODUCT(Universe::heap()->reset_promotion_should_fail();)
} }
...@@ -679,6 +682,11 @@ void DefNewGeneration::preserve_mark_if_necessary(oop obj, markOop m) { ...@@ -679,6 +682,11 @@ void DefNewGeneration::preserve_mark_if_necessary(oop obj, markOop m) {
void DefNewGeneration::handle_promotion_failure(oop old) { void DefNewGeneration::handle_promotion_failure(oop old) {
preserve_mark_if_necessary(old, old->mark()); preserve_mark_if_necessary(old, old->mark());
if (!_promotion_failed && PrintPromotionFailure) {
gclog_or_tty->print(" (promotion failure size = " SIZE_FORMAT ") ",
old->size());
}
// forward to self // forward to self
old->forward_to(old); old->forward_to(old);
_promotion_failed = true; _promotion_failed = true;
......

...@@ -181,6 +181,12 @@ class Generation: public CHeapObj { ...@@ -181,6 +181,12 @@ class Generation: public CHeapObj {
virtual bool promotion_attempt_is_safe(size_t promotion_in_bytes, virtual bool promotion_attempt_is_safe(size_t promotion_in_bytes,
bool younger_handles_promotion_failure) const; bool younger_handles_promotion_failure) const;
// For a non-young generation, this interface can be used to inform a
// generation that a promotion attempt into that generation failed.
// Typically used to enable diagnostic output for post-mortem analysis,
// but other uses of the interface are not ruled out.
virtual void promotion_failure_occurred() { /* does nothing */ }
// Return an estimate of the maximum allocation that could be performed // Return an estimate of the maximum allocation that could be performed
// in the generation without triggering any collection or expansion // in the generation without triggering any collection or expansion
// activity. It is "unsafe" because no locks are taken; the result // activity. It is "unsafe" because no locks are taken; the result
......
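The hook above lets the young generation report a failed promotion to whatever generation sits above it without knowing its concrete type; DefNewGeneration::collect() now makes the call (see the change earlier in this patch), and a collector such as CMS can override it to emit diagnostics, for instance gated by the CMSDumpAtPromotionFailure flag introduced later in this change. A self-contained sketch of the pattern (the override body is an assumption, not the actual CMS implementation):

    #include <cstdio>

    // The base-class default is a no-op, exactly like the virtual added above.
    struct GenerationSketch {
      virtual ~GenerationSketch() {}
      virtual void promotion_failure_occurred() { /* does nothing */ }
    };

    // A concrete old generation that wants post-mortem output overrides it.
    struct DumpingOldGenSketch : GenerationSketch {
      bool dump_at_promotion_failure;   // stands in for CMSDumpAtPromotionFailure
      explicit DumpingOldGenSketch(bool dump) : dump_at_promotion_failure(dump) {}

      virtual void promotion_failure_occurred() {
        if (dump_at_promotion_failure) {
          std::printf("promotion failure: dumping old-gen free-list state\n");
        }
      }
    };

    // Young-generation side, mirroring the call added in DefNewGeneration::collect().
    void notify_promotion_failure(GenerationSketch* next_gen) {
      next_gen->promotion_failure_occurred();
    }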
...@@ -948,6 +948,7 @@ static void no_shared_spaces() { ...@@ -948,6 +948,7 @@ static void no_shared_spaces() {
} }
} }
#ifndef KERNEL
// If the user has chosen ParallelGCThreads > 0, we set UseParNewGC // If the user has chosen ParallelGCThreads > 0, we set UseParNewGC
// if it's not explicitly set or unset. If the user has chosen // if it's not explicitly set or unset. If the user has chosen
// UseParNewGC and not explicitly set ParallelGCThreads we // UseParNewGC and not explicitly set ParallelGCThreads we
...@@ -1177,8 +1178,7 @@ void Arguments::set_cms_and_parnew_gc_flags() { ...@@ -1177,8 +1178,7 @@ void Arguments::set_cms_and_parnew_gc_flags() {
// the value (either from the command line or ergonomics) of // the value (either from the command line or ergonomics) of
// OldPLABSize. Following OldPLABSize is an ergonomics decision. // OldPLABSize. Following OldPLABSize is an ergonomics decision.
FLAG_SET_ERGO(uintx, CMSParPromoteBlocksToClaim, OldPLABSize); FLAG_SET_ERGO(uintx, CMSParPromoteBlocksToClaim, OldPLABSize);
} } else {
else {
// OldPLABSize and CMSParPromoteBlocksToClaim are both set. // OldPLABSize and CMSParPromoteBlocksToClaim are both set.
// CMSParPromoteBlocksToClaim is a collector-specific flag, so // CMSParPromoteBlocksToClaim is a collector-specific flag, so
// we'll let it take precedence. // we'll let it take precedence.
...@@ -1188,7 +1188,23 @@ void Arguments::set_cms_and_parnew_gc_flags() { ...@@ -1188,7 +1188,23 @@ void Arguments::set_cms_and_parnew_gc_flags() {
" CMSParPromoteBlocksToClaim will take precedence.\n"); " CMSParPromoteBlocksToClaim will take precedence.\n");
} }
} }
if (!FLAG_IS_DEFAULT(ResizeOldPLAB) && !ResizeOldPLAB) {
// OldPLAB sizing manually turned off: Use a larger default setting,
// unless it was manually specified. This is because a too-low value
// will slow down scavenges.
if (FLAG_IS_DEFAULT(CMSParPromoteBlocksToClaim)) {
FLAG_SET_ERGO(uintx, CMSParPromoteBlocksToClaim, 50); // default value before 6631166
}
}
// Overwrite OldPLABSize which is the variable we will internally use everywhere.
FLAG_SET_ERGO(uintx, OldPLABSize, CMSParPromoteBlocksToClaim);
// If either of the static initialization defaults have changed, note this
// modification.
if (!FLAG_IS_DEFAULT(CMSParPromoteBlocksToClaim) || !FLAG_IS_DEFAULT(OldPLABWeight)) {
CFLS_LAB::modify_initialization(OldPLABSize, OldPLABWeight);
}
} }
#endif // KERNEL
inline uintx max_heap_for_compressed_oops() { inline uintx max_heap_for_compressed_oops() {
LP64_ONLY(return oopDesc::OopEncodingHeapMax - MaxPermSize - os::vm_page_size()); LP64_ONLY(return oopDesc::OopEncodingHeapMax - MaxPermSize - os::vm_page_size());
...@@ -2370,22 +2386,25 @@ SOLARIS_ONLY( ...@@ -2370,22 +2386,25 @@ SOLARIS_ONLY(
"ExtendedDTraceProbes flag is only applicable on Solaris\n"); "ExtendedDTraceProbes flag is only applicable on Solaris\n");
return JNI_EINVAL; return JNI_EINVAL;
#endif // ndef SOLARIS #endif // ndef SOLARIS
} else
#ifdef ASSERT #ifdef ASSERT
if (match_option(option, "-XX:+FullGCALot", &tail)) { } else if (match_option(option, "-XX:+FullGCALot", &tail)) {
FLAG_SET_CMDLINE(bool, FullGCALot, true); FLAG_SET_CMDLINE(bool, FullGCALot, true);
// disable scavenge before parallel mark-compact // disable scavenge before parallel mark-compact
FLAG_SET_CMDLINE(bool, ScavengeBeforeFullGC, false); FLAG_SET_CMDLINE(bool, ScavengeBeforeFullGC, false);
} else
#endif #endif
if (match_option(option, "-XX:ParCMSPromoteBlocksToClaim=", &tail)) { } else if (match_option(option, "-XX:CMSParPromoteBlocksToClaim=", &tail)) {
julong cms_blocks_to_claim = (julong)atol(tail); julong cms_blocks_to_claim = (julong)atol(tail);
FLAG_SET_CMDLINE(uintx, CMSParPromoteBlocksToClaim, cms_blocks_to_claim); FLAG_SET_CMDLINE(uintx, CMSParPromoteBlocksToClaim, cms_blocks_to_claim);
jio_fprintf(defaultStream::error_stream(), jio_fprintf(defaultStream::error_stream(),
"Please use -XX:CMSParPromoteBlocksToClaim in place of " "Please use -XX:OldPLABSize in place of "
"-XX:CMSParPromoteBlocksToClaim in the future\n");
} else if (match_option(option, "-XX:ParCMSPromoteBlocksToClaim=", &tail)) {
julong cms_blocks_to_claim = (julong)atol(tail);
FLAG_SET_CMDLINE(uintx, CMSParPromoteBlocksToClaim, cms_blocks_to_claim);
jio_fprintf(defaultStream::error_stream(),
"Please use -XX:OldPLABSize in place of "
"-XX:ParCMSPromoteBlocksToClaim in the future\n"); "-XX:ParCMSPromoteBlocksToClaim in the future\n");
} else } else if (match_option(option, "-XX:ParallelGCOldGenAllocBufferSize=", &tail)) {
if (match_option(option, "-XX:ParallelGCOldGenAllocBufferSize=", &tail)) {
julong old_plab_size = 0; julong old_plab_size = 0;
ArgsRange errcode = parse_memory_size(tail, &old_plab_size, 1); ArgsRange errcode = parse_memory_size(tail, &old_plab_size, 1);
if (errcode != arg_in_range) { if (errcode != arg_in_range) {
...@@ -2398,8 +2417,7 @@ SOLARIS_ONLY( ...@@ -2398,8 +2417,7 @@ SOLARIS_ONLY(
jio_fprintf(defaultStream::error_stream(), jio_fprintf(defaultStream::error_stream(),
"Please use -XX:OldPLABSize in place of " "Please use -XX:OldPLABSize in place of "
"-XX:ParallelGCOldGenAllocBufferSize in the future\n"); "-XX:ParallelGCOldGenAllocBufferSize in the future\n");
} else } else if (match_option(option, "-XX:ParallelGCToSpaceAllocBufferSize=", &tail)) {
if (match_option(option, "-XX:ParallelGCToSpaceAllocBufferSize=", &tail)) {
julong young_plab_size = 0; julong young_plab_size = 0;
ArgsRange errcode = parse_memory_size(tail, &young_plab_size, 1); ArgsRange errcode = parse_memory_size(tail, &young_plab_size, 1);
if (errcode != arg_in_range) { if (errcode != arg_in_range) {
...@@ -2412,8 +2430,7 @@ SOLARIS_ONLY( ...@@ -2412,8 +2430,7 @@ SOLARIS_ONLY(
jio_fprintf(defaultStream::error_stream(), jio_fprintf(defaultStream::error_stream(),
"Please use -XX:YoungPLABSize in place of " "Please use -XX:YoungPLABSize in place of "
"-XX:ParallelGCToSpaceAllocBufferSize in the future\n"); "-XX:ParallelGCToSpaceAllocBufferSize in the future\n");
} else } else if (match_option(option, "-XX:", &tail)) { // -XX:xxxx
if (match_option(option, "-XX:", &tail)) { // -XX:xxxx
// Skip -XX:Flags= since that case has already been handled // Skip -XX:Flags= since that case has already been handled
if (strncmp(tail, "Flags=", strlen("Flags=")) != 0) { if (strncmp(tail, "Flags=", strlen("Flags=")) != 0) {
if (!process_argument(tail, args->ignoreUnrecognized, origin)) { if (!process_argument(tail, args->ignoreUnrecognized, origin)) {
...@@ -2727,6 +2744,7 @@ jint Arguments::parse(const JavaVMInitArgs* args) { ...@@ -2727,6 +2744,7 @@ jint Arguments::parse(const JavaVMInitArgs* args) {
return JNI_EINVAL; return JNI_EINVAL;
} }
#ifndef KERNEL
if (UseConcMarkSweepGC) { if (UseConcMarkSweepGC) {
// Set flags for CMS and ParNew. Check UseConcMarkSweep first // Set flags for CMS and ParNew. Check UseConcMarkSweep first
// to ensure that when both UseConcMarkSweepGC and UseParNewGC // to ensure that when both UseConcMarkSweepGC and UseParNewGC
...@@ -2744,6 +2762,7 @@ jint Arguments::parse(const JavaVMInitArgs* args) { ...@@ -2744,6 +2762,7 @@ jint Arguments::parse(const JavaVMInitArgs* args) {
set_g1_gc_flags(); set_g1_gc_flags();
} }
} }
#endif // KERNEL
#ifdef SERIALGC #ifdef SERIALGC
assert(verify_serial_gc_flags(), "SerialGC unset"); assert(verify_serial_gc_flags(), "SerialGC unset");
......
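The net effect of the argument changes above: both spellings of the old flag (-XX:CMSParPromoteBlocksToClaim and the deprecated -XX:ParCMSPromoteBlocksToClaim) now funnel into OldPLABSize, which becomes the single variable used internally; if old-PLAB resizing has been manually disabled, the pre-change default of 50 blocks is restored so scavenges are not slowed by a too-small static value; and any non-default size or weight is pushed into the CMS LAB statistics via CFLS_LAB::modify_initialization. A condensed sketch of that decision flow, with plain variables standing in for the FLAG_IS_DEFAULT / FLAG_SET_ERGO machinery (illustrative only, not the HotSpot code):

    // Plain-variable sketch of the OldPLABSize ergonomics shown above.
    struct PlabFlagsSketch {
      unsigned long cms_par_promote_blocks_to_claim;  // -XX:CMSParPromoteBlocksToClaim
      bool          cms_par_promote_is_default;
      unsigned long old_plab_size;                    // -XX:OldPLABSize
      bool          resize_old_plab;                  // -XX:+/-ResizeOldPLAB
      bool          resize_old_plab_is_default;
    };

    void resolve_old_plab_size(PlabFlagsSketch& f) {
      if (!f.resize_old_plab_is_default && !f.resize_old_plab) {
        // Resizing manually turned off: fall back to the larger pre-6631166
        // default unless the user chose a value explicitly.
        if (f.cms_par_promote_is_default) {
          f.cms_par_promote_blocks_to_claim = 50;
        }
      }
      // OldPLABSize is the variable used internally from here on.
      f.old_plab_size = f.cms_par_promote_blocks_to_claim;
    }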
...@@ -1355,10 +1355,46 @@ class CommandLineFlags { ...@@ -1355,10 +1355,46 @@ class CommandLineFlags {
product(uintx, ParGCDesiredObjsFromOverflowList, 20, \ product(uintx, ParGCDesiredObjsFromOverflowList, 20, \
"The desired number of objects to claim from the overflow list") \ "The desired number of objects to claim from the overflow list") \
\ \
product(uintx, CMSParPromoteBlocksToClaim, 50, \ product(uintx, CMSParPromoteBlocksToClaim, 16, \
"Number of blocks to attempt to claim when refilling CMS LAB for "\ "Number of blocks to attempt to claim when refilling CMS LAB for "\
"parallel GC.") \ "parallel GC.") \
\ \
product(uintx, OldPLABWeight, 50, \
"Percentage (0-100) used to weight the current sample when" \
"computing exponentially decaying average for resizing CMSParPromoteBlocksToClaim.") \
\
product(bool, ResizeOldPLAB, true, \
"Dynamically resize (old gen) promotion labs") \
\
product(bool, PrintOldPLAB, false, \
"Print (old gen) promotion labs sizing decisions") \
\
product(uintx, CMSOldPLABMin, 16, \
"Min size of CMS gen promotion lab caches per worker per blksize")\
\
product(uintx, CMSOldPLABMax, 1024, \
"Max size of CMS gen promotion lab caches per worker per blksize")\
\
product(uintx, CMSOldPLABNumRefills, 4, \
"Nominal number of refills of CMS gen promotion lab cache" \
" per worker per block size") \
\
product(bool, CMSOldPLABResizeQuicker, false, \
"Whether to react on-the-fly during a scavenge to a sudden" \
" change in block demand rate") \
\
product(uintx, CMSOldPLABToleranceFactor, 4, \
"The tolerance of the phase-change detector for on-the-fly" \
" PLAB resizing during a scavenge") \
\
product(uintx, CMSOldPLABReactivityFactor, 2, \
"The gain in the feedback loop for on-the-fly PLAB resizing" \
" during a scavenge") \
\
product(uintx, CMSOldPLABReactivityCeiling, 10, \
"The clamping of the gain in the feedback loop for on-the-fly" \
" PLAB resizing during a scavenge") \
\
product(bool, AlwaysPreTouch, false, \ product(bool, AlwaysPreTouch, false, \
"It forces all freshly committed pages to be pre-touched.") \ "It forces all freshly committed pages to be pre-touched.") \
\ \
...@@ -1400,27 +1436,54 @@ class CommandLineFlags { ...@@ -1400,27 +1436,54 @@ class CommandLineFlags {
"Percentage (0-100) by which the CMS incremental mode duty cycle" \ "Percentage (0-100) by which the CMS incremental mode duty cycle" \
" is shifted to the right within the period between young GCs") \ " is shifted to the right within the period between young GCs") \
\ \
product(uintx, CMSExpAvgFactor, 25, \ product(uintx, CMSExpAvgFactor, 50, \
"Percentage (0-100) used to weight the current sample when " \ "Percentage (0-100) used to weight the current sample when" \
"computing exponential averages for CMS statistics") \ "computing exponential averages for CMS statistics.") \
\ \
product(uintx, CMS_FLSWeight, 50, \ product(uintx, CMS_FLSWeight, 75, \
"Percentage (0-100) used to weight the current sample when " \ "Percentage (0-100) used to weight the current sample when" \
"computing exponentially decating averages for CMS FLS statistics") \ "computing exponentially decating averages for CMS FLS statistics.") \
\ \
product(uintx, CMS_FLSPadding, 2, \ product(uintx, CMS_FLSPadding, 1, \
"The multiple of deviation from mean to use for buffering " \ "The multiple of deviation from mean to use for buffering" \
"against volatility in free list demand.") \ "against volatility in free list demand.") \
\ \
product(uintx, FLSCoalescePolicy, 2, \ product(uintx, FLSCoalescePolicy, 2, \
"CMS: Aggression level for coalescing, increasing from 0 to 4") \ "CMS: Aggression level for coalescing, increasing from 0 to 4") \
\ \
product(uintx, CMS_SweepWeight, 50, \ product(bool, FLSAlwaysCoalesceLarge, false, \
"CMS: Larger free blocks are always available for coalescing") \
\
product(double, FLSLargestBlockCoalesceProximity, 0.99, \
"CMS: the smaller the percentage the greater the coalition force")\
\
product(double, CMSSmallCoalSurplusPercent, 1.05, \
"CMS: the factor by which to inflate estimated demand of small" \
" block sizes to prevent coalescing with an adjoining block") \
\
product(double, CMSLargeCoalSurplusPercent, 0.95, \
"CMS: the factor by which to inflate estimated demand of large" \
" block sizes to prevent coalescing with an adjoining block") \
\
product(double, CMSSmallSplitSurplusPercent, 1.10, \
"CMS: the factor by which to inflate estimated demand of small" \
" block sizes to prevent splitting to supply demand for smaller" \
" blocks") \
\
product(double, CMSLargeSplitSurplusPercent, 1.00, \
"CMS: the factor by which to inflate estimated demand of large" \
" block sizes to prevent splitting to supply demand for smaller" \
" blocks") \
\
product(bool, CMSExtrapolateSweep, false, \
"CMS: cushion for block demand during sweep") \
\
product(uintx, CMS_SweepWeight, 75, \
"Percentage (0-100) used to weight the current sample when " \ "Percentage (0-100) used to weight the current sample when " \
"computing exponentially decaying average for inter-sweep " \ "computing exponentially decaying average for inter-sweep " \
"duration") \ "duration") \
\ \
product(uintx, CMS_SweepPadding, 2, \ product(uintx, CMS_SweepPadding, 1, \
"The multiple of deviation from mean to use for buffering " \ "The multiple of deviation from mean to use for buffering " \
"against volatility in inter-sweep duration.") \ "against volatility in inter-sweep duration.") \
\ \
...@@ -1459,6 +1522,13 @@ class CommandLineFlags { ...@@ -1459,6 +1522,13 @@ class CommandLineFlags {
product(uintx, CMSIndexedFreeListReplenish, 4, \ product(uintx, CMSIndexedFreeListReplenish, 4, \
"Replenish and indexed free list with this number of chunks") \ "Replenish and indexed free list with this number of chunks") \
\ \
product(bool, CMSReplenishIntermediate, true, \
"Replenish all intermediate free-list caches") \
\
product(bool, CMSSplitIndexedFreeListBlocks, true, \
"When satisfying batched demand, splot blocks from the " \
"IndexedFreeList whose size is a multiple of requested size") \
\
product(bool, CMSLoopWarn, false, \ product(bool, CMSLoopWarn, false, \
"Warn in case of excessive CMS looping") \ "Warn in case of excessive CMS looping") \
\ \
...@@ -1593,6 +1663,18 @@ class CommandLineFlags { ...@@ -1593,6 +1663,18 @@ class CommandLineFlags {
"Bitmap operations should process at most this many bits" \ "Bitmap operations should process at most this many bits" \
"between yields") \ "between yields") \
\ \
product(bool, CMSDumpAtPromotionFailure, false, \
"Dump useful information about the state of the CMS old " \
" generation upon a promotion failure.") \
\
product(bool, CMSPrintChunksInDump, false, \
"In a dump enabled by CMSDumpAtPromotionFailure, include " \
" more detailed information about the free chunks.") \
\
product(bool, CMSPrintObjectsInDump, false, \
"In a dump enabled by CMSDumpAtPromotionFailure, include " \
" more detailed information about the allocated objects.") \
\
diagnostic(bool, FLSVerifyAllHeapReferences, false, \ diagnostic(bool, FLSVerifyAllHeapReferences, false, \
"Verify that all refs across the FLS boundary " \ "Verify that all refs across the FLS boundary " \
" are to valid objects") \ " are to valid objects") \
...@@ -1677,6 +1759,10 @@ class CommandLineFlags { ...@@ -1677,6 +1759,10 @@ class CommandLineFlags {
"The youngest generation collection does not require " \ "The youngest generation collection does not require " \
"a guarantee of full promotion of all live objects.") \ "a guarantee of full promotion of all live objects.") \
\ \
product(bool, PrintPromotionFailure, false, \
"Print additional diagnostic information following " \
" promotion failure") \
\
notproduct(bool, PromotionFailureALot, false, \ notproduct(bool, PromotionFailureALot, false, \
"Use promotion failure handling on every youngest generation " \ "Use promotion failure handling on every youngest generation " \
"collection") \ "collection") \
......
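Read together, the new CMSOldPLAB* knobs bound the per-worker free-block caches that back parallel promotion: the demand predicted for a block size is spread over CMSOldPLABNumRefills refills per scavenge and clamped to the [CMSOldPLABMin, CMSOldPLABMax] window, while the ResizeQuicker / ToleranceFactor / ReactivityFactor / ReactivityCeiling flags only influence the optional on-the-fly adjustment during a scavenge. A hedged arithmetic sketch of the steady-state clamp follows (the exact HotSpot sizing formula may differ; this only shows how the three bounding flags interact):

    #include <algorithm>
    #include <cstddef>

    // Illustrative clamp of a per-worker, per-block-size cache target.
    std::size_t desired_plab_blocks(std::size_t predicted_demand, // blocks needed before next sweep
                                    std::size_t num_workers,
                                    std::size_t num_refills,      // CMSOldPLABNumRefills
                                    std::size_t min_blocks,       // CMSOldPLABMin
                                    std::size_t max_blocks) {     // CMSOldPLABMax
      // Spread the predicted demand across workers and refills...
      std::size_t per_refill = predicted_demand / (num_workers * num_refills + 1);
      // ...then keep the cache within the configured window.
      return std::min(max_blocks, std::max(min_blocks, per_refill));
    }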
...@@ -128,7 +128,7 @@ void ClassLoadingService::notify_class_unloaded(instanceKlass* k) { ...@@ -128,7 +128,7 @@ void ClassLoadingService::notify_class_unloaded(instanceKlass* k) {
if (TraceClassUnloading) { if (TraceClassUnloading) {
ResourceMark rm; ResourceMark rm;
tty->print_cr("[Unloading class %s]", k->external_name()); gclog_or_tty->print_cr("[Unloading class %s]", k->external_name());
} }
} }
......