Merge

1bda0c9e · ysr · 01eef6f8 · 72f55a45 · 1bda0c9e · 1bda0c9e
15 changed files
--- a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp
+++ b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp
@@ -8508,7 +8508,7 @@ bool CMSCollector::take_from_overflow_list(size_t num, CMSMarkStack* stack) {
  size_t i = num;
  oop  cur = _overflow_list;
  const markOop proto = markOopDesc::prototype();
-  NOT_PRODUCT(size_t n = 0;)
+  NOT_PRODUCT(ssize_t n = 0;)
  for (oop next; i > 0 && cur != NULL; cur = next, i--) {
    next = oop(cur->mark());
    cur->set_mark(proto);   // until proven otherwise
@@ -8525,45 +8525,131 @@ bool CMSCollector::take_from_overflow_list(size_t num, CMSMarkStack* stack) {
  return !stack->isEmpty();
 }

-// Multi-threaded; use CAS to break off a prefix
+#define BUSY  (oop(0x1aff1aff))
+// (MT-safe) Get a prefix of at most "num" from the list.
+// The overflow list is chained through the mark word of
+// each object in the list. We fetch the entire list,
+// break off a prefix of the right size and return the
+// remainder. If other threads try to take objects from
+// the overflow list at that time, they will wait for
+// some time to see if data becomes available. If (and
+// only if) another thread places one or more object(s)
+// on the global list before we have returned the suffix
+// to the global list, we will walk down our local list
+// to find its end and append the global list to
+// our suffix before returning it. This suffix walk can
+// prove to be expensive (quadratic in the amount of traffic)
+// when there are many objects in the overflow list and
+// there is much producer-consumer contention on the list.
+// *NOTE*: The overflow list manipulation code here and
+// in ParNewGeneration:: are very similar in shape,
+// except that in the ParNew case we use the old (from/eden)
+// copy of the object to thread the list via its klass word.
+// Because of the common code, if you make any changes in
+// the code below, please check the ParNew version to see if
+// similar changes might be needed.
+// CR 6797058 has been filed to consolidate the common code.
 bool CMSCollector::par_take_from_overflow_list(size_t num,
                                               OopTaskQueue* work_q) {
-  assert(work_q->size() == 0, "That's the current policy");
+  assert(work_q->size() == 0, "First empty local work queue");
  assert(num < work_q->max_elems(), "Can't bite more than we can chew");
  if (_overflow_list == NULL) {
    return false;
  }
  // Grab the entire list; we'll put back a suffix
-  oop prefix = (oop)Atomic::xchg_ptr(NULL, &_overflow_list);
-  if (prefix == NULL) {  // someone grabbed it before we did ...
-    // ... we could spin for a short while, but for now we don't
-    return false;
+  oop prefix = (oop)Atomic::xchg_ptr(BUSY, &_overflow_list);
+  Thread* tid = Thread::current();
+  size_t CMSOverflowSpinCount = (size_t)ParallelGCThreads;
+  size_t sleep_time_millis = MAX2((size_t)1, num/100);
+  // If the list is busy, we spin for a short while,
+  // sleeping between attempts to get the list.
+  for (size_t spin = 0; prefix == BUSY && spin < CMSOverflowSpinCount; spin++) {
+    os::sleep(tid, sleep_time_millis, false);
+    if (_overflow_list == NULL) {
+      // Nothing left to take
+      return false;
+    } else if (_overflow_list != BUSY) {
+      // Try and grab the prefix
+      prefix = (oop)Atomic::xchg_ptr(BUSY, &_overflow_list);
+    }
+  }
+  // If the list was found to be empty, or we spun long
+  // enough, we give up and return empty-handed. If we leave
+  // the list in the BUSY state below, it must be the case that
+  // some other thread holds the overflow list and will set it
+  // to a non-BUSY state in the future.
+  if (prefix == NULL || prefix == BUSY) {
+     // Nothing to take or waited long enough
+     if (prefix == NULL) {
+       // Write back the NULL in case we overwrote it with BUSY above
+       // and it is still the same value.
+       (void) Atomic::cmpxchg_ptr(NULL, &_overflow_list, BUSY);
+     }
+     return false;
  }
+  assert(prefix != NULL && prefix != BUSY, "Error");
  size_t i = num;
  oop cur = prefix;
+  // Walk down the first "num" objects, unless we reach the end.
  for (; i > 1 && cur->mark() != NULL; cur = oop(cur->mark()), i--);
-  if (cur->mark() != NULL) {
+  if (cur->mark() == NULL) {
+    // We have "num" or fewer elements in the list, so there
+    // is nothing to return to the global list.
+    // Write back the NULL in lieu of the BUSY we wrote
+    // above, if it is still the same value.
+    if (_overflow_list == BUSY) {
+      (void) Atomic::cmpxchg_ptr(NULL, &_overflow_list, BUSY);
+    }
+  } else {
+    // Chop off the suffix and return it to the global list.
+    assert(cur->mark() != BUSY, "Error");
    oop suffix_head = cur->mark(); // suffix will be put back on global list
    cur->set_mark(NULL);           // break off suffix
-    // Find tail of suffix so we can prepend suffix to global list
-    for (cur = suffix_head; cur->mark() != NULL; cur = (oop)(cur->mark()));
-    oop suffix_tail = cur;
-    assert(suffix_tail != NULL && suffix_tail->mark() == NULL,
-           "Tautology");
+    // It's possible that the list is still in the empty(busy) state
+    // we left it in a short while ago; in that case we may be
+    // able to place back the suffix without incurring the cost
+    // of a walk down the list.
    oop observed_overflow_list = _overflow_list;
-    do {
-      cur = observed_overflow_list;
-      suffix_tail->set_mark(markOop(cur));
+    oop cur_overflow_list = observed_overflow_list;
+    bool attached = false;
+    while (observed_overflow_list == BUSY || observed_overflow_list == NULL) {
      observed_overflow_list =
-        (oop) Atomic::cmpxchg_ptr(suffix_head, &_overflow_list, cur);
-    } while (cur != observed_overflow_list);
+        (oop) Atomic::cmpxchg_ptr(suffix_head, &_overflow_list, cur_overflow_list);
+      if (cur_overflow_list == observed_overflow_list) {
+        attached = true;
+        break;
+      } else cur_overflow_list = observed_overflow_list;
+    }
+    if (!attached) {
+      // Too bad, someone else sneaked in (at least) an element; we'll need
+      // to do a splice. Find tail of suffix so we can prepend suffix to global
+      // list.
+      for (cur = suffix_head; cur->mark() != NULL; cur = (oop)(cur->mark()));
+      oop suffix_tail = cur;
+      assert(suffix_tail != NULL && suffix_tail->mark() == NULL,
+             "Tautology");
+      observed_overflow_list = _overflow_list;
+      do {
+        cur_overflow_list = observed_overflow_list;
+        if (cur_overflow_list != BUSY) {
+          // Do the splice ...
+          suffix_tail->set_mark(markOop(cur_overflow_list));
+        } else { // cur_overflow_list == BUSY
+          suffix_tail->set_mark(NULL);
+        }
+        // ... and try to place spliced list back on overflow_list ...
+        observed_overflow_list =
+          (oop) Atomic::cmpxchg_ptr(suffix_head, &_overflow_list, cur_overflow_list);
+      } while (cur_overflow_list != observed_overflow_list);
+      // ... until we have succeeded in doing so.
+    }
  }

  // Push the prefix elements on work_q
  assert(prefix != NULL, "control point invariant");
  const markOop proto = markOopDesc::prototype();
  oop next;
-  NOT_PRODUCT(size_t n = 0;)
+  NOT_PRODUCT(ssize_t n = 0;)
  for (cur = prefix; cur != NULL; cur = next) {
    next = oop(cur->mark());
    cur->set_mark(proto);   // until proven otherwise
@@ -8597,11 +8683,16 @@ void CMSCollector::par_push_on_overflow_list(oop p) {
  oop cur_overflow_list;
  do {
    cur_overflow_list = observed_overflow_list;
-    p->set_mark(markOop(cur_overflow_list));
+    if (cur_overflow_list != BUSY) {
+      p->set_mark(markOop(cur_overflow_list));
+    } else {
+      p->set_mark(NULL);
+    }
    observed_overflow_list =
      (oop) Atomic::cmpxchg_ptr(p, &_overflow_list, cur_overflow_list);
  } while (cur_overflow_list != observed_overflow_list);
 }
+#undef BUSY

 // Single threaded
 // General Note on GrowableArray: pushes may silently fail
@@ -8610,7 +8701,7 @@ void CMSCollector::par_push_on_overflow_list(oop p) {
 // a lot of code in the JVM. The prudent thing for GrowableArray
 // to do (for now) is to exit with an error. However, that may
 // be too draconian in some cases because the caller may be
-// able to recover without much harm. For suych cases, we
+// able to recover without much harm. For such cases, we
 // should probably introduce a "soft_push" method which returns
 // an indication of success or failure with the assumption that
 // the caller may be able to recover from a failure; code in
@@ -8618,8 +8709,6 @@ void CMSCollector::par_push_on_overflow_list(oop p) {
 // failures where possible, thus, incrementally hardening the VM
 // in such low resource situations.
 void CMSCollector::preserve_mark_work(oop p, markOop m) {
-  int PreserveMarkStackSize = 128;
-
  if (_preserved_oop_stack == NULL) {
    assert(_preserved_mark_stack == NULL,
           "bijection with preserved_oop_stack");

--- a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp
+++ b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp
@@ -595,7 +595,7 @@ class CMSCollector: public CHeapObj {
  size_t        _ser_kac_preclean_ovflw;
  size_t        _ser_kac_ovflw;
  size_t        _par_kac_ovflw;
-  NOT_PRODUCT(size_t _num_par_pushes;)
+  NOT_PRODUCT(ssize_t _num_par_pushes;)

  // ("Weak") Reference processing support
  ReferenceProcessor*            _ref_processor;

--- a/src/share/vm/gc_implementation/includeDB_gc_parNew
+++ b/src/share/vm/gc_implementation/includeDB_gc_parNew
@@ -79,6 +79,7 @@ parNewGeneration.cpp                    resourceArea.hpp
 parNewGeneration.cpp                    sharedHeap.hpp
 parNewGeneration.cpp                    space.hpp
 parNewGeneration.cpp                    spaceDecorator.hpp
+parNewGeneration.cpp                    thread.hpp
 parNewGeneration.cpp                    workgroup.hpp

 parNewGeneration.hpp                    defNewGeneration.hpp

--- a/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp
+++ b/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp
@@ -404,6 +404,8 @@ void ParEvacuateFollowersClosure::do_void() {
    if (terminator()->offer_termination()) break;
    par_scan_state()->end_term_time();
  }
+  assert(par_gen()->_overflow_list == NULL && par_gen()->_num_par_pushes == 0,
+         "Broken overflow list?");
  // Finish the last termination pause.
  par_scan_state()->end_term_time();
 }
@@ -456,6 +458,8 @@ ParNewGeneration(ReservedSpace rs, size_t initial_byte_size, int level)
  _is_alive_closure(this),
  _plab_stats(YoungPLABSize, PLABWeight)
 {
+  NOT_PRODUCT(_overflow_counter = ParGCWorkQueueOverflowInterval;)
+  NOT_PRODUCT(_num_par_pushes = 0;)
  _task_queues = new ObjToScanQueueSet(ParallelGCThreads);
  guarantee(_task_queues != NULL, "task_queues allocation failure.");

@@ -993,12 +997,19 @@ oop ParNewGeneration::copy_to_survivor_space_avoiding_promotion_undo(
             "push forwarded object");
    }
    // Push it on one of the queues of to-be-scanned objects.
-    if (!par_scan_state->work_queue()->push(obj_to_push)) {
+    bool simulate_overflow = false;
+    NOT_PRODUCT(
+      if (ParGCWorkQueueOverflowALot && should_simulate_overflow()) {
+        // simulate a stack overflow
+        simulate_overflow = true;
+      }
+    )
+    if (simulate_overflow || !par_scan_state->work_queue()->push(obj_to_push)) {
      // Add stats for overflow pushes.
      if (Verbose && PrintGCDetails) {
        gclog_or_tty->print("queue overflow!\n");
      }
-      push_on_overflow_list(old);
+      push_on_overflow_list(old, par_scan_state);
      par_scan_state->note_overflow_push();
    }
    par_scan_state->note_push();
@@ -1110,9 +1121,16 @@ oop ParNewGeneration::copy_to_survivor_space_with_undo(
             "push forwarded object");
    }
    // Push it on one of the queues of to-be-scanned objects.
-    if (!par_scan_state->work_queue()->push(obj_to_push)) {
+    bool simulate_overflow = false;
+    NOT_PRODUCT(
+      if (ParGCWorkQueueOverflowALot && should_simulate_overflow()) {
+        // simulate a stack overflow
+        simulate_overflow = true;
+      }
+    )
+    if (simulate_overflow || !par_scan_state->work_queue()->push(obj_to_push)) {
      // Add stats for overflow pushes.
-      push_on_overflow_list(old);
+      push_on_overflow_list(old, par_scan_state);
      par_scan_state->note_overflow_push();
    }
    par_scan_state->note_push();
@@ -1135,89 +1153,190 @@ oop ParNewGeneration::copy_to_survivor_space_with_undo(
  return forward_ptr;
 }

-void ParNewGeneration::push_on_overflow_list(oop from_space_obj) {
-  oop cur_overflow_list = _overflow_list;
+#ifndef PRODUCT
+// It's OK to call this multi-threaded;  the worst thing
+// that can happen is that we'll get a bunch of closely
+// spaced simulated overflows, but that's OK, in fact
+// probably good as it would exercise the overflow code
+// under contention.
+bool ParNewGeneration::should_simulate_overflow() {
+  if (_overflow_counter-- <= 0) { // just being defensive
+    _overflow_counter = ParGCWorkQueueOverflowInterval;
+    return true;
+  } else {
+    return false;
+  }
+}
+#endif
+
+#define BUSY (oop(0x1aff1aff))
+void ParNewGeneration::push_on_overflow_list(oop from_space_obj, ParScanThreadState* par_scan_state) {
  // if the object has been forwarded to itself, then we cannot
  // use the klass pointer for the linked list.  Instead we have
  // to allocate an oopDesc in the C-Heap and use that for the linked list.
+  // XXX This is horribly inefficient when a promotion failure occurs
+  // and should be fixed. XXX FIX ME !!!
+#ifndef PRODUCT
+  Atomic::inc_ptr(&_num_par_pushes);
+  assert(_num_par_pushes > 0, "Tautology");
+#endif
  if (from_space_obj->forwardee() == from_space_obj) {
    oopDesc* listhead = NEW_C_HEAP_ARRAY(oopDesc, 1);
    listhead->forward_to(from_space_obj);
    from_space_obj = listhead;
  }
-  while (true) {
-    from_space_obj->set_klass_to_list_ptr(cur_overflow_list);
-    oop observed_overflow_list =
-      (oop)Atomic::cmpxchg_ptr(from_space_obj, &_overflow_list, cur_overflow_list);
-    if (observed_overflow_list == cur_overflow_list) break;
-    // Otherwise...
+  oop observed_overflow_list = _overflow_list;
+  oop cur_overflow_list;
+  do {
    cur_overflow_list = observed_overflow_list;
-  }
+    if (cur_overflow_list != BUSY) {
+      from_space_obj->set_klass_to_list_ptr(cur_overflow_list);
+    } else {
+      from_space_obj->set_klass_to_list_ptr(NULL);
+    }
+    observed_overflow_list =
+      (oop)Atomic::cmpxchg_ptr(from_space_obj, &_overflow_list, cur_overflow_list);
+  } while (cur_overflow_list != observed_overflow_list);
 }

+// *NOTE*: The overflow list manipulation code here and
+// in CMSCollector:: are very similar in shape,
+// except that in the CMS case we thread the objects
+// directly into the list via their mark word, and do
+// not need to deal with special cases below related
+// to chunking of object arrays and promotion failure
+// handling.
+// CR 6797058 has been filed to attempt consolidation of
+// the common code.
+// Because of the common code, if you make any changes in
+// the code below, please check the CMS version to see if
+// similar changes might be needed.
+// See CMSCollector::par_take_from_overflow_list() for
+// more extensive documentation comments.
 bool
 ParNewGeneration::take_from_overflow_list(ParScanThreadState* par_scan_state) {
  ObjToScanQueue* work_q = par_scan_state->work_queue();
+  assert(work_q->size() == 0, "Should first empty local work queue");
  // How many to take?
-  int objsFromOverflow = MIN2(work_q->max_elems()/4,
-                              (juint)ParGCDesiredObjsFromOverflowList);
+  size_t objsFromOverflow = MIN2((size_t)work_q->max_elems()/4,
+                                 (size_t)ParGCDesiredObjsFromOverflowList);

  if (_overflow_list == NULL) return false;

  // Otherwise, there was something there; try claiming the list.
-  oop prefix = (oop)Atomic::xchg_ptr(NULL, &_overflow_list);
-
-  if (prefix == NULL) {
-    return false;
-  }
+  oop prefix = (oop)Atomic::xchg_ptr(BUSY, &_overflow_list);
  // Trim off a prefix of at most objsFromOverflow items
-  int i = 1;
+  Thread* tid = Thread::current();
+  size_t spin_count = (size_t)ParallelGCThreads;
+  size_t sleep_time_millis = MAX2((size_t)1, objsFromOverflow/100);
+  for (size_t spin = 0; prefix == BUSY && spin < spin_count; spin++) {
+    // someone grabbed it before we did ...
+    // ... we spin for a short while...
+    os::sleep(tid, sleep_time_millis, false);
+    if (_overflow_list == NULL) {
+      // nothing left to take
+      return false;
+    } else if (_overflow_list != BUSY) {
+     // try and grab the prefix
+     prefix = (oop)Atomic::xchg_ptr(BUSY, &_overflow_list);
+    }
+  }
+  if (prefix == NULL || prefix == BUSY) {
+     // Nothing to take or waited long enough
+     if (prefix == NULL) {
+       // Write back the NULL in case we overwrote it with BUSY above
+       // and it is still the same value.
+       (void) Atomic::cmpxchg_ptr(NULL, &_overflow_list, BUSY);
+     }
+     return false;
+  }
+  assert(prefix != NULL && prefix != BUSY, "Error");
+  size_t i = 1;
  oop cur = prefix;
  while (i < objsFromOverflow && cur->klass_or_null() != NULL) {
    i++; cur = oop(cur->klass());
  }

  // Reattach remaining (suffix) to overflow list
-  if (cur->klass_or_null() != NULL) {
-    oop suffix = oop(cur->klass());
-    cur->set_klass_to_list_ptr(NULL);
-
-    // Find last item of suffix list
-    oop last = suffix;
-    while (last->klass_or_null() != NULL) {
-      last = oop(last->klass());
+  if (cur->klass_or_null() == NULL) {
+    // Write back the NULL in lieu of the BUSY we wrote
+    // above, if it is still the same value.
+    if (_overflow_list == BUSY) {
+      (void) Atomic::cmpxchg_ptr(NULL, &_overflow_list, BUSY);
    }
-    // Atomically prepend suffix to current overflow list
-    oop cur_overflow_list = _overflow_list;
-    while (true) {
-      last->set_klass_to_list_ptr(cur_overflow_list);
-      oop observed_overflow_list =
-        (oop)Atomic::cmpxchg_ptr(suffix, &_overflow_list, cur_overflow_list);
-      if (observed_overflow_list == cur_overflow_list) break;
-      // Otherwise...
-      cur_overflow_list = observed_overflow_list;
+  } else {
+    assert(cur->klass_or_null() != BUSY, "Error");
+    oop suffix = oop(cur->klass());       // suffix will be put back on global list
+    cur->set_klass_to_list_ptr(NULL);     // break off suffix
+    // It's possible that the list is still in the empty(busy) state
+    // we left it in a short while ago; in that case we may be
+    // able to place back the suffix.
+    oop observed_overflow_list = _overflow_list;
+    oop cur_overflow_list = observed_overflow_list;
+    bool attached = false;
+    while (observed_overflow_list == BUSY || observed_overflow_list == NULL) {
+      observed_overflow_list =
+        (oop) Atomic::cmpxchg_ptr(suffix, &_overflow_list, cur_overflow_list);
+      if (cur_overflow_list == observed_overflow_list) {
+        attached = true;
+        break;
+      } else cur_overflow_list = observed_overflow_list;
+    }
+    if (!attached) {
+      // Too bad, someone else got in in between; we'll need to do a splice.
+      // Find the last item of suffix list
+      oop last = suffix;
+      while (last->klass_or_null() != NULL) {
+        last = oop(last->klass());
+      }
+      // Atomically prepend suffix to current overflow list
+      observed_overflow_list = _overflow_list;
+      do {
+        cur_overflow_list = observed_overflow_list;
+        if (cur_overflow_list != BUSY) {
+          // Do the splice ...
+          last->set_klass_to_list_ptr(cur_overflow_list);
+        } else { // cur_overflow_list == BUSY
+          last->set_klass_to_list_ptr(NULL);
+        }
+        observed_overflow_list =
+          (oop)Atomic::cmpxchg_ptr(suffix, &_overflow_list, cur_overflow_list);
+      } while (cur_overflow_list != observed_overflow_list);
    }
  }

  // Push objects on prefix list onto this thread's work queue
-  assert(cur != NULL, "program logic");
+  assert(prefix != NULL && prefix != BUSY, "program logic");
  cur = prefix;
-  int n = 0;
+  ssize_t n = 0;
  while (cur != NULL) {
    oop obj_to_push = cur->forwardee();
    oop next        = oop(cur->klass_or_null());
    cur->set_klass(obj_to_push->klass());
-    if (par_scan_state->should_be_partially_scanned(obj_to_push, cur)) {
-      obj_to_push = cur;
+    // This may be an array object that is self-forwarded. In that case, the list pointer
+    // space, cur, is not in the Java heap, but rather in the C-heap and should be freed.
+    if (!is_in_reserved(cur)) {
+      // This can become a scaling bottleneck when there is work queue overflow coincident
+      // with promotion failure.
+      oopDesc* f = cur;
+      FREE_C_HEAP_ARRAY(oopDesc, f);
+    } else if (par_scan_state->should_be_partially_scanned(obj_to_push, cur)) {
      assert(arrayOop(cur)->length() == 0, "entire array remaining to be scanned");
+      obj_to_push = cur;
    }
-    work_q->push(obj_to_push);
+    bool ok = work_q->push(obj_to_push);
+    assert(ok, "Should have succeeded");
    cur = next;
    n++;
  }
  par_scan_state->note_overflow_refill(n);
+#ifndef PRODUCT
+  assert(_num_par_pushes >= n, "Too many pops?");
+  Atomic::add_ptr(-(intptr_t)n, &_num_par_pushes);
+#endif
  return true;
 }
+#undef BUSY

 void ParNewGeneration::ref_processor_init()
 {

--- a/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp
+++ b/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp
@@ -278,6 +278,7 @@ class ParNewGeneration: public DefNewGeneration {
  friend class ParNewRefProcTask;
  friend class ParNewRefProcTaskExecutor;
  friend class ParScanThreadStateSet;
+  friend class ParEvacuateFollowersClosure;

 private:
  // XXX use a global constant instead of 64!
@@ -296,6 +297,7 @@ class ParNewGeneration: public DefNewGeneration {
  // klass-pointers (klass information already copied to the forwarded
  // image.)  Manipulated with CAS.
  oop _overflow_list;
+  NOT_PRODUCT(ssize_t _num_par_pushes;)

  // If true, older generation does not support promotion undo, so avoid.
  static bool _avoid_promotion_undo;
@@ -372,8 +374,12 @@ class ParNewGeneration: public DefNewGeneration {
  oop copy_to_survivor_space_with_undo(ParScanThreadState* par_scan_state,
                             oop obj, size_t obj_sz, markOop m);

+  // in support of testing overflow code
+  NOT_PRODUCT(int _overflow_counter;)
+  NOT_PRODUCT(bool should_simulate_overflow();)
+
  // Push the given (from-space) object on the global overflow list.
-  void push_on_overflow_list(oop from_space_obj);
+  void push_on_overflow_list(oop from_space_obj, ParScanThreadState* par_scan_state);

  // If the global overflow list is non-empty, move some tasks from it
  // onto "work_q" (which must be empty).  No more than 1/4 of the

--- a/src/share/vm/gc_implementation/parallelScavenge/psOldGen.cpp
+++ b/src/share/vm/gc_implementation/parallelScavenge/psOldGen.cpp
@@ -116,7 +116,7 @@ void PSOldGen::initialize_work(const char* perf_data_name, int level) {
  // ObjectSpace stuff
  //

-  _object_space = new MutableSpace();
+  _object_space = new MutableSpace(virtual_space()->alignment());

  if (_object_space == NULL)
    vm_exit_during_initialization("Could not allocate an old gen space");
@@ -385,10 +385,10 @@ void PSOldGen::post_resize() {
  start_array()->set_covered_region(new_memregion);
  Universe::heap()->barrier_set()->resize_covered_region(new_memregion);

-  HeapWord* const virtual_space_high = (HeapWord*) virtual_space()->high();
-
  // ALWAYS do this last!!
-  object_space()->set_end(virtual_space_high);
+  object_space()->initialize(new_memregion,
+                             SpaceDecorator::DontClear,
+                             SpaceDecorator::DontMangle);

  assert(new_word_size == heap_word_size(object_space()->capacity_in_bytes()),
    "Sanity");

--- a/src/share/vm/gc_implementation/parallelScavenge/psVirtualspace.cpp
+++ b/src/share/vm/gc_implementation/parallelScavenge/psVirtualspace.cpp
@@ -78,7 +78,7 @@ void PSVirtualSpace::release() {
  _special = false;
 }

-bool PSVirtualSpace::expand_by(size_t bytes, bool pre_touch) {
+bool PSVirtualSpace::expand_by(size_t bytes) {
  assert(is_aligned(bytes), "arg not aligned");
  DEBUG_ONLY(PSVirtualSpaceVerifier this_verifier(this));

@@ -92,15 +92,6 @@ bool PSVirtualSpace::expand_by(size_t bytes, bool pre_touch) {
    _committed_high_addr += bytes;
  }

-  if (pre_touch || AlwaysPreTouch) {
-    for (char* curr = base_addr;
-         curr < _committed_high_addr;
-         curr += os::vm_page_size()) {
-      char tmp = *curr;
-      *curr = 0;
-    }
-  }
-
  return result;
 }

@@ -255,7 +246,7 @@ PSVirtualSpaceHighToLow::PSVirtualSpaceHighToLow(ReservedSpace rs) {
  DEBUG_ONLY(verify());
 }

-bool PSVirtualSpaceHighToLow::expand_by(size_t bytes, bool pre_touch) {
+bool PSVirtualSpaceHighToLow::expand_by(size_t bytes) {
  assert(is_aligned(bytes), "arg not aligned");
  DEBUG_ONLY(PSVirtualSpaceVerifier this_verifier(this));

@@ -269,15 +260,6 @@ bool PSVirtualSpaceHighToLow::expand_by(size_t bytes, bool pre_touch) {
    _committed_low_addr -= bytes;
  }

-  if (pre_touch || AlwaysPreTouch) {
-    for (char* curr = base_addr;
-         curr < _committed_high_addr;
-         curr += os::vm_page_size()) {
-      char tmp = *curr;
-      *curr = 0;
-    }
-  }
-
  return result;
 }


--- a/src/share/vm/gc_implementation/parallelScavenge/psVirtualspace.hpp
+++ b/src/share/vm/gc_implementation/parallelScavenge/psVirtualspace.hpp
@@ -80,7 +80,7 @@ class PSVirtualSpace : public CHeapObj {
  inline  void   set_reserved(char* low_addr, char* high_addr, bool special);
  inline  void   set_reserved(ReservedSpace rs);
  inline  void   set_committed(char* low_addr, char* high_addr);
-  virtual bool   expand_by(size_t bytes, bool pre_touch = false);
+  virtual bool   expand_by(size_t bytes);
  virtual bool   shrink_by(size_t bytes);
  virtual size_t expand_into(PSVirtualSpace* space, size_t bytes);
  void           release();
@@ -127,7 +127,7 @@ class PSVirtualSpaceHighToLow : public PSVirtualSpace {
  PSVirtualSpaceHighToLow(ReservedSpace rs, size_t alignment);
  PSVirtualSpaceHighToLow(ReservedSpace rs);

-  virtual bool   expand_by(size_t bytes, bool pre_touch = false);
+  virtual bool   expand_by(size_t bytes);
  virtual bool   shrink_by(size_t bytes);
  virtual size_t expand_into(PSVirtualSpace* space, size_t bytes);


--- a/src/share/vm/gc_implementation/parallelScavenge/psYoungGen.cpp
+++ b/src/share/vm/gc_implementation/parallelScavenge/psYoungGen.cpp
@@ -64,12 +64,12 @@ void PSYoungGen::initialize_work() {
  }

  if (UseNUMA) {
-    _eden_space = new MutableNUMASpace();
+    _eden_space = new MutableNUMASpace(virtual_space()->alignment());
  } else {
-    _eden_space = new MutableSpace();
+    _eden_space = new MutableSpace(virtual_space()->alignment());
  }
-  _from_space = new MutableSpace();
-  _to_space   = new MutableSpace();
+  _from_space = new MutableSpace(virtual_space()->alignment());
+  _to_space   = new MutableSpace(virtual_space()->alignment());

  if (_eden_space == NULL || _from_space == NULL || _to_space == NULL) {
    vm_exit_during_initialization("Could not allocate a young gen space");

--- a/src/share/vm/gc_implementation/shared/mutableNUMASpace.cpp
+++ b/src/share/vm/gc_implementation/shared/mutableNUMASpace.cpp
@@ -27,7 +27,7 @@
 # include "incls/_mutableNUMASpace.cpp.incl"


-MutableNUMASpace::MutableNUMASpace() {
+MutableNUMASpace::MutableNUMASpace(size_t alignment) : MutableSpace(alignment) {
  _lgrp_spaces = new (ResourceObj::C_HEAP) GrowableArray<LGRPSpace*>(0, true);
  _page_size = os::vm_page_size();
  _adaptation_cycles = 0;
@@ -221,7 +221,7 @@ bool MutableNUMASpace::update_layout(bool force) {
        }
      }
      if (!found) {
-        lgrp_spaces()->append(new LGRPSpace(lgrp_ids[i]));
+        lgrp_spaces()->append(new LGRPSpace(lgrp_ids[i], alignment()));
      }
    }

@@ -443,10 +443,10 @@ void MutableNUMASpace::select_tails(MemRegion new_region, MemRegion intersection
  // Is there bottom?
  if (new_region.start() < intersection.start()) { // Yes
    // Try to coalesce small pages into a large one.
-    if (UseLargePages && page_size() >= os::large_page_size()) {
-      HeapWord* p = (HeapWord*)round_to((intptr_t) intersection.start(), os::large_page_size());
+    if (UseLargePages && page_size() >= alignment()) {
+      HeapWord* p = (HeapWord*)round_to((intptr_t) intersection.start(), alignment());
      if (new_region.contains(p)
-          && pointer_delta(p, new_region.start(), sizeof(char)) >= os::large_page_size()) {
+          && pointer_delta(p, new_region.start(), sizeof(char)) >= alignment()) {
        if (intersection.contains(p)) {
          intersection = MemRegion(p, intersection.end());
        } else {
@@ -462,10 +462,10 @@ void MutableNUMASpace::select_tails(MemRegion new_region, MemRegion intersection
  // Is there top?
  if (intersection.end() < new_region.end()) { // Yes
    // Try to coalesce small pages into a large one.
-    if (UseLargePages && page_size() >= os::large_page_size()) {
-      HeapWord* p = (HeapWord*)round_down((intptr_t) intersection.end(), os::large_page_size());
+    if (UseLargePages && page_size() >= alignment()) {
+      HeapWord* p = (HeapWord*)round_down((intptr_t) intersection.end(), alignment());
      if (new_region.contains(p)
-          && pointer_delta(new_region.end(), p, sizeof(char)) >= os::large_page_size()) {
+          && pointer_delta(new_region.end(), p, sizeof(char)) >= alignment()) {
        if (intersection.contains(p)) {
          intersection = MemRegion(intersection.start(), p);
        } else {
@@ -504,12 +504,12 @@ void MutableNUMASpace::merge_regions(MemRegion new_region, MemRegion* intersecti
            // That's the only case we have to make an additional bias_region() call.
            HeapWord* start = invalid_region->start();
            HeapWord* end = invalid_region->end();
-            if (UseLargePages && page_size() >= os::large_page_size()) {
-              HeapWord *p = (HeapWord*)round_down((intptr_t) start, os::large_page_size());
+            if (UseLargePages && page_size() >= alignment()) {
+              HeapWord *p = (HeapWord*)round_down((intptr_t) start, alignment());
              if (new_region.contains(p)) {
                start = p;
              }
-              p = (HeapWord*)round_to((intptr_t) end, os::large_page_size());
+              p = (HeapWord*)round_to((intptr_t) end, alignment());
              if (new_region.contains(end)) {
                end = p;
              }
@@ -526,7 +526,8 @@ void MutableNUMASpace::merge_regions(MemRegion new_region, MemRegion* intersecti

 void MutableNUMASpace::initialize(MemRegion mr,
                                  bool clear_space,
-                                  bool mangle_space) {
+                                  bool mangle_space,
+                                  bool setup_pages) {
  assert(clear_space, "Reallocation will destory data!");
  assert(lgrp_spaces()->length() > 0, "There should be at least one space");

@@ -538,7 +539,7 @@ void MutableNUMASpace::initialize(MemRegion mr,

  // Compute chunk sizes
  size_t prev_page_size = page_size();
-  set_page_size(UseLargePages ? os::large_page_size() : os::vm_page_size());
+  set_page_size(UseLargePages ? alignment() : os::vm_page_size());
  HeapWord* rounded_bottom = (HeapWord*)round_to((intptr_t) bottom(), page_size());
  HeapWord* rounded_end = (HeapWord*)round_down((intptr_t) end(), page_size());
  size_t base_space_size_pages = pointer_delta(rounded_end, rounded_bottom, sizeof(char)) / page_size();
@@ -666,7 +667,7 @@ void MutableNUMASpace::initialize(MemRegion mr,
    }

    // Clear space (set top = bottom) but never mangle.
-    s->initialize(new_region, SpaceDecorator::Clear, SpaceDecorator::DontMangle);
+    s->initialize(new_region, SpaceDecorator::Clear, SpaceDecorator::DontMangle, MutableSpace::DontSetupPages);

    set_adaptation_cycles(samples_count());
  }

--- a/src/share/vm/gc_implementation/shared/mutableNUMASpace.hpp
+++ b/src/share/vm/gc_implementation/shared/mutableNUMASpace.hpp
@@ -82,8 +82,8 @@ class MutableNUMASpace : public MutableSpace {
    char* last_page_scanned()            { return _last_page_scanned; }
    void set_last_page_scanned(char* p)  { _last_page_scanned = p;    }
   public:
-    LGRPSpace(int l) : _lgrp_id(l), _last_page_scanned(NULL), _allocation_failed(false) {
-      _space = new MutableSpace();
+    LGRPSpace(int l, size_t alignment) : _lgrp_id(l), _last_page_scanned(NULL), _allocation_failed(false) {  // l is stored as _lgrp_id
+      _space = new MutableSpace(alignment);  // the owned space is created with the alignment passed in by the caller
      _alloc_rate = new AdaptiveWeightedAverage(NUMAChunkResizeWeight);
    }
    ~LGRPSpace() {
@@ -183,10 +183,10 @@ class MutableNUMASpace : public MutableSpace {

 public:
  GrowableArray<LGRPSpace*>* lgrp_spaces() const     { return _lgrp_spaces;       }
-  MutableNUMASpace();
+  MutableNUMASpace(size_t alignment);
  virtual ~MutableNUMASpace();
  // Space initialization.
-  virtual void initialize(MemRegion mr, bool clear_space, bool mangle_space);
+  virtual void initialize(MemRegion mr, bool clear_space, bool mangle_space, bool setup_pages = SetupPages);
  // Update space layout if necessary. Do all adaptive resizing job.
  virtual void update();
  // Update allocation rate averages.

--- a/src/share/vm/gc_implementation/shared/mutableSpace.cpp
+++ b/src/share/vm/gc_implementation/shared/mutableSpace.cpp
@@ -25,7 +25,10 @@
 # include "incls/_precompiled.incl"
 # include "incls/_mutableSpace.cpp.incl"

-MutableSpace::MutableSpace(): ImmutableSpace(), _top(NULL) {
+MutableSpace::MutableSpace(size_t alignment): ImmutableSpace(), _top(NULL), _alignment(alignment) {
+  // alignment is a size_t, so a ">= 0" test is always true; only the page-multiple check is meaningful.
+  assert(MutableSpace::alignment() % os::vm_page_size() == 0,
+         "Space should be aligned");
  _mangler = new MutableSpaceMangler(this);
 }

@@ -33,16 +36,88 @@ MutableSpace::~MutableSpace() {
  delete _mangler;
 }

+void MutableSpace::numa_setup_pages(MemRegion mr, bool clear_space) {  // Applies os::numa_make_global to the page-aligned interior of mr; with clear_space the range is passed to os::free_memory first.
+  if (!mr.is_empty()) {
+    size_t page_size = UseLargePages ? alignment() : os::vm_page_size();  // granularity: the space alignment with large pages, else the VM page size
+    HeapWord *start = (HeapWord*)round_to((intptr_t) mr.start(), page_size);  // presumably rounds up (cf. round_down below), so the range only shrinks inwards
+    HeapWord *end =  (HeapWord*)round_down((intptr_t) mr.end(), page_size);
+    if (end > start) {  // nothing is done when the aligned range is empty
+      size_t size = pointer_delta(end, start, sizeof(char));  // distance measured in sizeof(char) units, i.e. bytes
+      if (clear_space) {
+        // Prefer page reallocation to migration.
+        os::free_memory((char*)start, size);
+      }
+      os::numa_make_global((char*)start, size);
+    }
+  }
+}
+
+void MutableSpace::pretouch_pages(MemRegion mr) {  // Touches one byte every os::vm_page_size() bytes from mr.start() up to (not including) mr.end().
+  for (volatile char *p = (char*)mr.start(); p < (char*)mr.end(); p += os::vm_page_size()) {  // the stride is always the base VM page size, independent of UseLargePages
+    char t = *p; *p = t;  // read a byte and store the same value back through a volatile pointer; contents are unchanged
+  }
+}
+
 void MutableSpace::initialize(MemRegion mr,
                              bool clear_space,
-                              bool mangle_space) {
-  HeapWord* bottom = mr.start();
-  HeapWord* end    = mr.end();
+                              bool mangle_space,
+                              bool setup_pages) {

-  assert(Universe::on_page_boundary(bottom) && Universe::on_page_boundary(end),
+  assert(Universe::on_page_boundary(mr.start()) && Universe::on_page_boundary(mr.end()),
         "invalid space boundaries");
-  set_bottom(bottom);
-  set_end(end);
+
+  if (setup_pages && (UseNUMA || AlwaysPreTouch)) {
+    // The space may move left and right or expand/shrink.
+    // We'd like to enforce the desired page placement.
+    MemRegion head, tail;  // parts of mr just before / just after the region that was set up last time
+    if (last_setup_region().is_empty()) {
+      // If it's the first initialization don't limit the amount of work.
+      head = mr;
+      tail = MemRegion(mr.end(), mr.end());
+    } else {
+      // Part of mr already covered by the previous setup; if none, an empty region at mr.end() makes all of mr the head.
+      MemRegion intersection = last_setup_region().intersection(mr);
+      if (intersection.is_empty()) {
+        intersection = MemRegion(mr.end(), mr.end());
+      }
+      // All the sizes below are in words.
+      size_t head_size = 0, tail_size = 0;
+      if (mr.start() <= intersection.start()) {
+        head_size = pointer_delta(intersection.start(), mr.start());
+      }
+      if (intersection.end() <= mr.end()) {
+        tail_size = pointer_delta(mr.end(), intersection.end());
+      }
+      // Limit the amount of page manipulation if necessary; skipped when nothing changed, which would otherwise divide 0 by 0.
+      if (NUMASpaceResizeRate > 0 && !AlwaysPreTouch && head_size + tail_size > 0) {
+        const size_t change_size = head_size + tail_size;
+        const float setup_rate_words = NUMASpaceResizeRate >> LogBytesPerWord;
+        head_size = MIN2((size_t)(setup_rate_words * head_size / change_size),
+                         head_size);
+        tail_size = MIN2((size_t)(setup_rate_words * tail_size / change_size),
+                         tail_size);
+      }
+      head = MemRegion(intersection.start() - head_size, intersection.start());
+      tail = MemRegion(intersection.end(), intersection.end() + tail_size);
+    }
+    assert(mr.contains(head) && mr.contains(tail), "Sanity");
+
+    if (UseNUMA) {
+      numa_setup_pages(head, clear_space);
+      numa_setup_pages(tail, clear_space);
+    }
+
+    if (AlwaysPreTouch) {
+      pretouch_pages(head);
+      pretouch_pages(tail);
+    }
+
+    // Remember where we stopped so that we can continue later.
+    set_last_setup_region(MemRegion(head.start(), tail.end()));
+  }
+
+  set_bottom(mr.start());
+  set_end(mr.end());

  if (clear_space) {
    clear(mangle_space);

--- a/src/share/vm/gc_implementation/shared/mutableSpace.hpp
+++ b/src/share/vm/gc_implementation/shared/mutableSpace.hpp
@@ -25,7 +25,10 @@
 // A MutableSpace is a subtype of ImmutableSpace that supports the
 // concept of allocation. This includes the concepts that a space may
 // be only partially full, and the querry methods that go with such
-// an assumption.
+// an assumption. MutableSpace is also responsible for minimizing the
+// page allocation time by having the memory pretouched (with
+// AlwaysPreTouch) and for optimizing page placement on NUMA systems
+// by making the underlying region interleaved (with UseNUMA).
 //
 // Invariant: (ImmutableSpace +) bottom() <= top() <= end()
 // top() is inclusive and end() is exclusive.
@@ -37,15 +40,23 @@ class MutableSpace: public ImmutableSpace {

  // Helper for mangling unused space in debug builds
  MutableSpaceMangler* _mangler;
-
+  // The last region whose pages have been set up to be interleaved.
+  MemRegion _last_setup_region;
+  size_t _alignment;
 protected:
  HeapWord* _top;

  MutableSpaceMangler* mangler() { return _mangler; }

+  void numa_setup_pages(MemRegion mr, bool clear_space);
+  void pretouch_pages(MemRegion mr);
+
+  void set_last_setup_region(MemRegion mr) { _last_setup_region = mr;   }
+  MemRegion last_setup_region() const      { return _last_setup_region; }
+
 public:
  virtual ~MutableSpace();
-  MutableSpace();
+  MutableSpace(size_t alignment);  // alignment must be a multiple of os::vm_page_size() (asserted in the definition)

  // Accessors
  HeapWord* top() const                    { return _top;    }
@@ -57,13 +68,20 @@ class MutableSpace: public ImmutableSpace {
  virtual void set_bottom(HeapWord* value) { _bottom = value; }
  virtual void set_end(HeapWord* value)    { _end = value; }

+  size_t alignment() const                 { return _alignment; }
+
  // Returns a subregion containing all objects in this space.
  MemRegion used_region() { return MemRegion(bottom(), top()); }

+  static const bool SetupPages = true;
+  static const bool DontSetupPages = false;
+
  // Initialization
  virtual void initialize(MemRegion mr,
                          bool clear_space,
-                          bool mangle_space);
+                          bool mangle_space,
+                          bool setup_pages = SetupPages);
+
  virtual void clear(bool mangle_space);
  // Does the usual initialization but optionally resets top to bottom.
 #if 0  // MANGLE_SPACE

--- a/src/share/vm/memory/referenceProcessor.cpp
+++ b/src/share/vm/memory/referenceProcessor.cpp
@@ -721,12 +721,6 @@ ReferenceProcessor::process_phase3(DiscoveredList&    refs_list,
                             iter.obj(), iter.obj()->blueprint()->internal_name());
    }
    assert(iter.obj()->is_oop(UseConcMarkSweepGC), "Adding a bad reference");
-    // If discovery is concurrent, we may have objects with null referents,
-    // being those that were concurrently cleared after they were discovered
-    // (and not subsequently precleaned).
-    assert(   (discovery_is_atomic() && iter.referent()->is_oop())
-           || (!discovery_is_atomic() && iter.referent()->is_oop_or_null(UseConcMarkSweepGC)),
-           "Adding a bad referent");
    iter.next();
  }
  // Remember to keep sentinel pointer around

--- a/src/share/vm/runtime/globals.hpp
+++ b/src/share/vm/runtime/globals.hpp
@@ -1307,7 +1307,14 @@ class CommandLineFlags {
  product(intx, ParGCArrayScanChunk, 50,                                    \
          "Scan a subset and push remainder, if array is bigger than this") \
                                                                            \
-  product(intx, ParGCDesiredObjsFromOverflowList, 20,                       \
+  notproduct(bool, ParGCWorkQueueOverflowALot, false,                       \
+          "Whether we should simulate work queue overflow in ParNew")       \
+                                                                            \
+  notproduct(uintx, ParGCWorkQueueOverflowInterval, 1000,                   \
+          "An `interval' counter that determines how frequently"            \
+          " we simulate overflow; a smaller number increases frequency")    \
+                                                                            \
+  product(uintx, ParGCDesiredObjsFromOverflowList, 20,                      \
          "The desired number of objects to claim from the overflow list")  \
                                                                            \
  product(uintx, CMSParPromoteBlocksToClaim, 50,                            \
@@ -1429,8 +1436,8 @@ class CommandLineFlags {
          "Whether we should simulate frequent marking stack / work queue"  \
          " overflow")                                                      \
                                                                            \
-  notproduct(intx, CMSMarkStackOverflowInterval, 1000,                      \
-          "A per-thread `interval' counter that determines how frequently"  \
+  notproduct(uintx, CMSMarkStackOverflowInterval, 1000,                     \
+          "An `interval' counter that determines how frequently"            \
          " we simulate overflow; a smaller number increases frequency")    \
                                                                            \
  product(uintx, CMSMaxAbortablePrecleanLoops, 0,                           \
@@ -1648,7 +1655,7 @@ class CommandLineFlags {
  develop(uintx, WorkStealingYieldsBeforeSleep, 1000,                       \
          "Number of yields before a sleep is done during workstealing")    \
                                                                            \
-  product(uintx, PreserveMarkStackSize, 40,                                 \
+  product(uintx, PreserveMarkStackSize, 1024,                               \
           "Size for stack used in promotion failure handling")             \
                                                                            \
  product_pd(bool, UseTLAB, "Use thread-local object allocation")           \