Merge

c5347e0b · jcoomes · c21ca64b · 96586688 · c5347e0b · c5347e0b
15 changed file
--- a/src/cpu/sparc/vm/assembler_sparc.cpp
+++ b/src/cpu/sparc/vm/assembler_sparc.cpp
@@ -2161,29 +2161,6 @@ void MacroAssembler::br_notnull( Register s1, bool a, Predict p, Label& L ) {
 #endif
 }

-void MacroAssembler::br_on_reg_cond( RCondition rc, bool a, Predict p,
-                                     Register s1, address d,
-                                     relocInfo::relocType rt ) {
-  assert_not_delayed();
-  if (VM_Version::v9_instructions_work()) {
-    bpr(rc, a, p, s1, d, rt);
-  } else {
-    tst(s1);
-    br(reg_cond_to_cc_cond(rc), a, p, d, rt);
-  }
-}
-
-void MacroAssembler::br_on_reg_cond( RCondition rc, bool a, Predict p,
-                                     Register s1, Label& L ) {
-  assert_not_delayed();
-  if (VM_Version::v9_instructions_work()) {
-    bpr(rc, a, p, s1, L);
-  } else {
-    tst(s1);
-    br(reg_cond_to_cc_cond(rc), a, p, L);
-  }
-}
-
 // Compare registers and branch with nop in delay slot or cbcond without delay slot.

 // Compare integer (32 bit) values (icc only).
@@ -4340,22 +4317,29 @@ static void generate_satb_log_enqueue(bool with_frame) {
  } else {
    pre_val = O0;
  }
+
  int satb_q_index_byte_offset =
    in_bytes(JavaThread::satb_mark_queue_offset() +
             PtrQueue::byte_offset_of_index());
+
  int satb_q_buf_byte_offset =
    in_bytes(JavaThread::satb_mark_queue_offset() +
             PtrQueue::byte_offset_of_buf());
+
  assert(in_bytes(PtrQueue::byte_width_of_index()) == sizeof(intptr_t) &&
         in_bytes(PtrQueue::byte_width_of_buf()) == sizeof(intptr_t),
         "check sizes in assembly below");

  __ bind(restart);
+
+  // Load the index into the SATB buffer. PtrQueue::_index is a size_t
+  // so ld_ptr is appropriate.
  __ ld_ptr(G2_thread, satb_q_index_byte_offset, L0);

-  __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn, L0, refill);
-  // If the branch is taken, no harm in executing this in the delay slot.
-  __ delayed()->ld_ptr(G2_thread, satb_q_buf_byte_offset, L1);
+  // index == 0?
+  __ cmp_and_brx_short(L0, G0, Assembler::equal, Assembler::pn, refill);
+
+  __ ld_ptr(G2_thread, satb_q_buf_byte_offset, L1);
  __ sub(L0, oopSize, L0);

  __ st_ptr(pre_val, L1, L0);  // [_buf + index] := I0
@@ -4466,9 +4450,8 @@ void MacroAssembler::g1_write_barrier_pre(Register obj,
         tmp);
  }

-  // Check on whether to annul.
-  br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, tmp, filtered);
-  delayed()->nop();
+  // Is marking active?
+  cmp_and_br_short(tmp, G0, Assembler::equal, Assembler::pt, filtered);

  // Do we need to load the previous value?
  if (obj != noreg) {
@@ -4490,9 +4473,7 @@ void MacroAssembler::g1_write_barrier_pre(Register obj,
  assert(pre_val != noreg, "must have a real register");

  // Is the previous value null?
-  // Check on whether to annul.
-  br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, pre_val, filtered);
-  delayed()->nop();
+  cmp_and_brx_short(pre_val, G0, Assembler::equal, Assembler::pt, filtered);

  // OK, it's not filtered, so we'll need to call enqueue.  In the normal
  // case, pre_val will be a scratch G-reg, but there are some cases in
@@ -4519,39 +4500,6 @@ void MacroAssembler::g1_write_barrier_pre(Register obj,
  bind(filtered);
 }

-static jint num_ct_writes = 0;
-static jint num_ct_writes_filtered_in_hr = 0;
-static jint num_ct_writes_filtered_null = 0;
-static G1CollectedHeap* g1 = NULL;
-
-static Thread* count_ct_writes(void* filter_val, void* new_val) {
-  Atomic::inc(&num_ct_writes);
-  if (filter_val == NULL) {
-    Atomic::inc(&num_ct_writes_filtered_in_hr);
-  } else if (new_val == NULL) {
-    Atomic::inc(&num_ct_writes_filtered_null);
-  } else {
-    if (g1 == NULL) {
-      g1 = G1CollectedHeap::heap();
-    }
-  }
-  if ((num_ct_writes % 1000000) == 0) {
-    jint num_ct_writes_filtered =
-      num_ct_writes_filtered_in_hr +
-      num_ct_writes_filtered_null;
-
-    tty->print_cr("%d potential CT writes: %5.2f%% filtered\n"
-                  "   (%5.2f%% intra-HR, %5.2f%% null).",
-                  num_ct_writes,
-                  100.0*(float)num_ct_writes_filtered/(float)num_ct_writes,
-                  100.0*(float)num_ct_writes_filtered_in_hr/
-                  (float)num_ct_writes,
-                  100.0*(float)num_ct_writes_filtered_null/
-                  (float)num_ct_writes);
-  }
-  return Thread::current();
-}
-
 static address dirty_card_log_enqueue = 0;
 static u_char* dirty_card_log_enqueue_end = 0;

@@ -4574,11 +4522,8 @@ static void generate_dirty_card_log_enqueue(jbyte* byte_map_base) {
  __ set(addrlit, O1); // O1 := <card table base>
  __ ldub(O0, O1, O2); // O2 := [O0 + O1]

-  __ br_on_reg_cond(Assembler::rc_nz, /*annul*/false, Assembler::pt,
-                      O2, not_already_dirty);
-  // Get O1 + O2 into a reg by itself -- useful in the take-the-branch
-  // case, harmless if not.
-  __ delayed()->add(O0, O1, O3);
+  assert(CardTableModRefBS::dirty_card_val() == 0, "otherwise check this code");
+  __ cmp_and_br_short(O2, G0, Assembler::notEqual, Assembler::pt, not_already_dirty);

  // We didn't take the branch, so we're already dirty: return.
  // Use return-from-leaf
@@ -4587,8 +4532,13 @@ static void generate_dirty_card_log_enqueue(jbyte* byte_map_base) {

  // Not dirty.
  __ bind(not_already_dirty);
+
+  // Get O0 + O1 into a reg by itself
+  __ add(O0, O1, O3);
+
  // First, dirty it.
  __ stb(G0, O3, G0);  // [cardPtr] := 0  (i.e., dirty).
+
  int dirty_card_q_index_byte_offset =
    in_bytes(JavaThread::dirty_card_queue_offset() +
             PtrQueue::byte_offset_of_index());
@@ -4596,12 +4546,15 @@ static void generate_dirty_card_log_enqueue(jbyte* byte_map_base) {
    in_bytes(JavaThread::dirty_card_queue_offset() +
             PtrQueue::byte_offset_of_buf());
  __ bind(restart);
+
+  // Load the index into the update buffer. PtrQueue::_index is
+  // a size_t so ld_ptr is appropriate here.
  __ ld_ptr(G2_thread, dirty_card_q_index_byte_offset, L0);

-  __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn,
-                      L0, refill);
-  // If the branch is taken, no harm in executing this in the delay slot.
-  __ delayed()->ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, L1);
+  // index == 0?
+  __ cmp_and_brx_short(L0, G0, Assembler::equal, Assembler::pn, refill);
+
+  __ ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, L1);
  __ sub(L0, oopSize, L0);

  __ st_ptr(O3, L1, L0);  // [_buf + index] := I0
@@ -4664,6 +4617,7 @@ void MacroAssembler::g1_write_barrier_post(Register store_addr, Register new_val
  G1SATBCardTableModRefBS* bs = (G1SATBCardTableModRefBS*) Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::G1SATBCT ||
         bs->kind() == BarrierSet::G1SATBCTLogging, "wrong barrier");
+
  if (G1RSBarrierRegionFilter) {
    xor3(store_addr, new_val, tmp);
 #ifdef _LP64
@@ -4672,33 +4626,8 @@ void MacroAssembler::g1_write_barrier_post(Register store_addr, Register new_val
    srl(tmp, HeapRegion::LogOfHRGrainBytes, tmp);
 #endif

-    if (G1PrintCTFilterStats) {
-      guarantee(tmp->is_global(), "Or stats won't work...");
-      // This is a sleazy hack: I'm temporarily hijacking G2, which I
-      // promise to restore.
-      mov(new_val, G2);
-      save_frame(0);
-      mov(tmp, O0);
-      mov(G2, O1);
-      // Save G-regs that target may use.
-      mov(G1, L1);
-      mov(G2, L2);
-      mov(G3, L3);
-      mov(G4, L4);
-      mov(G5, L5);
-      call(CAST_FROM_FN_PTR(address, &count_ct_writes));
-      delayed()->nop();
-      mov(O0, G2);
-      // Restore G-regs that target may have used.
-      mov(L1, G1);
-      mov(L3, G3);
-      mov(L4, G4);
-      mov(L5, G5);
-      restore(G0, G0, G0);
-    }
-    // XXX Should I predict this taken or not?  Does it mattern?
-    br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, tmp, filtered);
-    delayed()->nop();
+    // XXX Should I predict this taken or not?  Does it matter?
+    cmp_and_brx_short(tmp, G0, Assembler::equal, Assembler::pt, filtered);
  }

  // If the "store_addr" register is an "in" or "local" register, move it to
@@ -4723,7 +4652,6 @@ void MacroAssembler::g1_write_barrier_post(Register store_addr, Register new_val
  restore();

  bind(filtered);
-
 }

 #endif  // SERIALGC

--- a/src/cpu/sparc/vm/assembler_sparc.hpp
+++ b/src/cpu/sparc/vm/assembler_sparc.hpp
@@ -1940,12 +1940,6 @@ class MacroAssembler: public Assembler {
  void br_null   ( Register s1, bool a, Predict p, Label& L );
  void br_notnull( Register s1, bool a, Predict p, Label& L );

-  // These versions will do the most efficient thing on v8 and v9.  Perhaps
-  // this is what the routine above was meant to do, but it didn't (and
-  // didn't cover both target address kinds.)
-  void br_on_reg_cond( RCondition c, bool a, Predict p, Register s1, address d, relocInfo::relocType rt = relocInfo::none );
-  void br_on_reg_cond( RCondition c, bool a, Predict p, Register s1, Label& L);
-
  //
  // Compare registers and branch with nop in delay slot or cbcond without delay slot.
  //

--- a/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp
+++ b/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp
@@ -421,8 +421,7 @@ void G1PreBarrierStub::emit_code(LIR_Assembler* ce) {
  }

  if (__ is_in_wdisp16_range(_continuation)) {
-    __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
-                      pre_val_reg, _continuation);
+    __ br_null(pre_val_reg, /*annul*/false, Assembler::pt, _continuation);
  } else {
    __ cmp(pre_val_reg, G0);
    __ brx(Assembler::equal, false, Assembler::pn, _continuation);
@@ -458,8 +457,7 @@ void G1UnsafeGetObjSATBBarrierStub::emit_code(LIR_Assembler* ce) {
    // The original src operand was not a constant.
    // Generate src == null?
    if (__ is_in_wdisp16_range(_continuation)) {
-      __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
-                        src_reg, _continuation);
+      __ br_null(src_reg, /*annul*/false, Assembler::pt, _continuation);
    } else {
      __ cmp(src_reg, G0);
      __ brx(Assembler::equal, false, Assembler::pt, _continuation);
@@ -476,13 +474,9 @@ void G1UnsafeGetObjSATBBarrierStub::emit_code(LIR_Assembler* ce) {
  Address ref_type_adr(tmp_reg, instanceKlass::reference_type_offset_in_bytes() + sizeof(oopDesc));
  __ ld(ref_type_adr, tmp_reg);

-  if (__ is_in_wdisp16_range(_continuation)) {
-    __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
-                      tmp_reg, _continuation);
-  } else {
-    __ cmp(tmp_reg, G0);
-    __ brx(Assembler::equal, false, Assembler::pt, _continuation);
-  }
+  // _reference_type field is of type ReferenceType (enum)
+  assert(REF_NONE == 0, "check this code");
+  __ cmp_zero_and_br(Assembler::equal, tmp_reg, _continuation, /*annul*/false, Assembler::pt);
  __ delayed()->nop();

  // Is marking active?
@@ -498,13 +492,8 @@ void G1UnsafeGetObjSATBBarrierStub::emit_code(LIR_Assembler* ce) {
    assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
    __ ldsb(in_progress, tmp_reg);
  }
-  if (__ is_in_wdisp16_range(_continuation)) {
-    __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
-                      tmp_reg, _continuation);
-  } else {
-    __ cmp(tmp_reg, G0);
-    __ brx(Assembler::equal, false, Assembler::pt, _continuation);
-  }
+
+  __ cmp_zero_and_br(Assembler::equal, tmp_reg, _continuation, /*annul*/false, Assembler::pt);
  __ delayed()->nop();

  // val == null?
@@ -512,8 +501,7 @@ void G1UnsafeGetObjSATBBarrierStub::emit_code(LIR_Assembler* ce) {
  Register val_reg = val()->as_register();

  if (__ is_in_wdisp16_range(_continuation)) {
-    __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
-                      val_reg, _continuation);
+    __ br_null(val_reg, /*annul*/false, Assembler::pt, _continuation);
  } else {
    __ cmp(val_reg, G0);
    __ brx(Assembler::equal, false, Assembler::pt, _continuation);
@@ -542,9 +530,9 @@ void G1PostBarrierStub::emit_code(LIR_Assembler* ce) {
  assert(new_val()->is_register(), "Precondition.");
  Register addr_reg = addr()->as_pointer_register();
  Register new_val_reg = new_val()->as_register();
+
  if (__ is_in_wdisp16_range(_continuation)) {
-    __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
-                      new_val_reg, _continuation);
+    __ br_null(new_val_reg, /*annul*/false, Assembler::pt, _continuation);
  } else {
    __ cmp(new_val_reg, G0);
    __ brx(Assembler::equal, false, Assembler::pn, _continuation);

--- a/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp
+++ b/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp
@@ -834,14 +834,16 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
        int satb_q_buf_byte_offset =
          in_bytes(JavaThread::satb_mark_queue_offset() +
                   PtrQueue::byte_offset_of_buf());
+
        __ bind(restart);
+        // Load the index into the SATB buffer. PtrQueue::_index is a
+        // size_t so ld_ptr is appropriate
        __ ld_ptr(G2_thread, satb_q_index_byte_offset, tmp);

-        __ br_on_reg_cond(Assembler::rc_z, /*annul*/false,
-                          Assembler::pn, tmp, refill);
+        // index == 0?
+        __ cmp_and_brx_short(tmp, G0, Assembler::equal, Assembler::pn, refill);

-        // If the branch is taken, no harm in executing this in the delay slot.
-        __ delayed()->ld_ptr(G2_thread, satb_q_buf_byte_offset, tmp2);
+        __ ld_ptr(G2_thread, satb_q_buf_byte_offset, tmp2);
        __ sub(tmp, oopSize, tmp);

        __ st_ptr(pre_val, tmp2, tmp);  // [_buf + index] := <address_of_card>
@@ -901,11 +903,8 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
        __ set(rs, cardtable);         // cardtable := <card table base>
        __ ldub(addr, cardtable, tmp); // tmp := [addr + cardtable]

-        __ br_on_reg_cond(Assembler::rc_nz, /*annul*/false, Assembler::pt,
-                          tmp, not_already_dirty);
-        // Get cardtable + tmp into a reg by itself -- useful in the take-the-branch
-        // case, harmless if not.
-        __ delayed()->add(addr, cardtable, tmp2);
+        assert(CardTableModRefBS::dirty_card_val() == 0, "otherwise check this code");
+        __ cmp_and_br_short(tmp, G0, Assembler::notEqual, Assembler::pt, not_already_dirty);

        // We didn't take the branch, so we're already dirty: return.
        // Use return-from-leaf
@@ -914,6 +913,10 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {

        // Not dirty.
        __ bind(not_already_dirty);
+
+        // Get cardtable + tmp into a reg by itself
+        __ add(addr, cardtable, tmp2);
+
        // First, dirty it.
        __ stb(G0, tmp2, 0);  // [cardPtr] := 0  (i.e., dirty).

@@ -929,13 +932,17 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
        int dirty_card_q_buf_byte_offset =
          in_bytes(JavaThread::dirty_card_queue_offset() +
                   PtrQueue::byte_offset_of_buf());
+
        __ bind(restart);
+
+        // Get the index into the update buffer. PtrQueue::_index is
+        // a size_t so ld_ptr is appropriate here.
        __ ld_ptr(G2_thread, dirty_card_q_index_byte_offset, tmp3);

-        __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn,
-                          tmp3, refill);
-        // If the branch is taken, no harm in executing this in the delay slot.
-        __ delayed()->ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, tmp4);
+        // index == 0?
+        __ cmp_and_brx_short(tmp3, G0, Assembler::equal,  Assembler::pn, refill);
+
+        __ ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, tmp4);
        __ sub(tmp3, oopSize, tmp3);

        __ st_ptr(tmp2, tmp4, tmp3);  // [_buf + index] := <address_of_card>

--- a/src/os/linux/vm/os_linux.cpp
+++ b/src/os/linux/vm/os_linux.cpp
@@ -125,10 +125,6 @@
 # include <inttypes.h>
 # include <sys/ioctl.h>

-#ifdef AMD64
-#include <asm/vsyscall.h>
-#endif
-
 #define MAX_PATH    (2 * K)

 // for timer info max values which include all bits
@@ -2502,7 +2498,13 @@ bool os::commit_memory(char* addr, size_t size, bool exec) {
  int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
  uintptr_t res = (uintptr_t) ::mmap(addr, size, prot,
                                   MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0);
-  return res != (uintptr_t) MAP_FAILED;
+  if (res != (uintptr_t) MAP_FAILED) {
+    if (UseNUMAInterleaving) {
+      numa_make_global(addr, size);
+    }
+    return true;
+  }
+  return false;
 }

 // Define MAP_HUGETLB here so we can build HotSpot on old systems.
@@ -2523,7 +2525,13 @@ bool os::commit_memory(char* addr, size_t size, size_t alignment_hint,
      (uintptr_t) ::mmap(addr, size, prot,
                         MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS|MAP_HUGETLB,
                         -1, 0);
-    return res != (uintptr_t) MAP_FAILED;
+    if (res != (uintptr_t) MAP_FAILED) {
+      if (UseNUMAInterleaving) {
+        numa_make_global(addr, size);
+      }
+      return true;
+    }
+    return false;
  }

  return commit_memory(addr, size, exec);
@@ -2588,8 +2596,17 @@ int os::Linux::sched_getcpu_syscall(void) {
  int retval = -1;

 #if defined(IA32)
+# ifndef SYS_getcpu
+# define SYS_getcpu 318
+# endif
  retval = syscall(SYS_getcpu, &cpu, NULL, NULL);
 #elif defined(AMD64)
+// Unfortunately we have to bring all these macros here from vsyscall.h
+// to be able to compile on old linuxes.
+# define __NR_vgetcpu 2
+# define VSYSCALL_START (-10UL << 20)
+# define VSYSCALL_SIZE 1024
+# define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr))
  typedef long (*vgetcpu_t)(unsigned int *cpu, unsigned int *node, unsigned long *tcache);
  vgetcpu_t vgetcpu = (vgetcpu_t)VSYSCALL_ADDR(__NR_vgetcpu);
  retval = vgetcpu(&cpu, NULL, NULL);
@@ -3115,6 +3132,10 @@ char* os::reserve_memory_special(size_t bytes, char* req_addr, bool exec) {
     return NULL;
  }

+  if ((addr != NULL) && UseNUMAInterleaving) {
+    numa_make_global(addr, bytes);
+  }
+
  return addr;
 }


--- a/src/os/solaris/vm/os_solaris.cpp
+++ b/src/os/solaris/vm/os_solaris.cpp
@@ -2777,8 +2777,14 @@ int os::vm_allocation_granularity() {
 bool os::commit_memory(char* addr, size_t bytes, bool exec) {
  int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
  size_t size = bytes;
-  return
-     NULL != Solaris::mmap_chunk(addr, size, MAP_PRIVATE|MAP_FIXED, prot);
+  char *res = Solaris::mmap_chunk(addr, size, MAP_PRIVATE|MAP_FIXED, prot);
+  if (res != NULL) {
+    if (UseNUMAInterleaving) {
+      numa_make_global(addr, bytes);
+    }
+    return true;
+  }
+  return false;
 }

 bool os::commit_memory(char* addr, size_t bytes, size_t alignment_hint,
@@ -3389,12 +3395,11 @@ bool os::Solaris::set_mpss_range(caddr_t start, size_t bytes, size_t align) {
  return true;
 }

-char* os::reserve_memory_special(size_t bytes, char* addr, bool exec) {
+char* os::reserve_memory_special(size_t size, char* addr, bool exec) {
  // "exec" is passed in but not used.  Creating the shared image for
  // the code cache doesn't have an SHM_X executable permission to check.
  assert(UseLargePages && UseISM, "only for ISM large pages");

-  size_t size = bytes;
  char* retAddr = NULL;
  int shmid;
  key_t ismKey;
@@ -3436,7 +3441,9 @@ char* os::reserve_memory_special(size_t bytes, char* addr, bool exec) {
    }
    return NULL;
  }
-
+  if ((retAddr != NULL) && UseNUMAInterleaving) {
+    numa_make_global(retAddr, size);
+  }
  return retAddr;
 }


--- a/src/os/windows/vm/os_windows.cpp
+++ b/src/os/windows/vm/os_windows.cpp
--- a/src/os/windows/vm/os_windows.hpp
+++ b/src/os/windows/vm/os_windows.hpp
@@ -173,13 +173,25 @@ public:
  static BOOL GetNativeSystemInfoAvailable();
  static void GetNativeSystemInfo(LPSYSTEM_INFO);

+  // NUMA calls
+  static BOOL NumaCallsAvailable();
+  static LPVOID VirtualAllocExNuma(HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD);
+  static BOOL GetNumaHighestNodeNumber(PULONG);
+  static BOOL GetNumaNodeProcessorMask(UCHAR, PULONGLONG);
+
 private:
  // GetLargePageMinimum available on Windows Vista/Windows Server 2003
  // and later
+  // NUMA calls available Windows Vista/WS2008 and later
+
  static SIZE_T (WINAPI *_GetLargePageMinimum)(void);
+  static LPVOID (WINAPI *_VirtualAllocExNuma) (HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD);
+  static BOOL (WINAPI *_GetNumaHighestNodeNumber) (PULONG);
+  static BOOL (WINAPI *_GetNumaNodeProcessorMask) (UCHAR, PULONGLONG);
  static BOOL initialized;

  static void initialize();
+  static void initializeCommon();

 #ifdef JDK6_OR_EARLIER
 private:

--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
@@ -4069,6 +4069,23 @@ bool GCLabBitMapClosure::do_bit(size_t offset) {
 }
 #endif // PRODUCT

+G1ParGCAllocBuffer::G1ParGCAllocBuffer(size_t gclab_word_size) :
+  ParGCAllocBuffer(gclab_word_size),
+  _should_mark_objects(false),
+  _bitmap(G1CollectedHeap::heap()->reserved_region().start(), gclab_word_size),
+  _retired(false)
+{
+  //_should_mark_objects is set to true when G1ParCopyHelper needs to
+  // mark the forwarded location of an evacuated object.
+  // We set _should_mark_objects to true if marking is active, i.e. when we
+  // need to propagate a mark, or during an initial mark pause, i.e. when we
+  // need to mark objects immediately reachable by the roots.
+  if (G1CollectedHeap::heap()->mark_in_progress() ||
+      G1CollectedHeap::heap()->g1_policy()->during_initial_mark_pause()) {
+    _should_mark_objects = true;
+  }
+}
+
 G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, int queue_num)
  : _g1h(g1h),
    _refs(g1h->task_queue(queue_num)),
@@ -4184,12 +4201,14 @@ void G1ParScanThreadState::trim_queue() {

 G1ParClosureSuper::G1ParClosureSuper(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state) :
  _g1(g1), _g1_rem(_g1->g1_rem_set()), _cm(_g1->concurrent_mark()),
-  _par_scan_state(par_scan_state) { }
+  _par_scan_state(par_scan_state),
+  _during_initial_mark(_g1->g1_policy()->during_initial_mark_pause()),
+  _mark_in_progress(_g1->mark_in_progress()) { }

-template <class T> void G1ParCopyHelper::mark_forwardee(T* p) {
-  // This is called _after_ do_oop_work has been called, hence after
-  // the object has been relocated to its new location and *p points
-  // to its new location.
+template <class T> void G1ParCopyHelper::mark_object(T* p) {
+  // This is called from do_oop_work for objects that are not
+  // in the collection set. Objects in the collection set
+  // are marked after they have been evacuated.

  T heap_oop = oopDesc::load_heap_oop(p);
  if (!oopDesc::is_null(heap_oop)) {
@@ -4201,7 +4220,7 @@ template <class T> void G1ParCopyHelper::mark_forwardee(T* p) {
  }
 }

-oop G1ParCopyHelper::copy_to_survivor_space(oop old) {
+oop G1ParCopyHelper::copy_to_survivor_space(oop old, bool should_mark_copy) {
  size_t    word_sz = old->size();
  HeapRegion* from_region = _g1->heap_region_containing_raw(old);
  // +1 to make the -1 indexes valid...
@@ -4257,8 +4276,8 @@ oop G1ParCopyHelper::copy_to_survivor_space(oop old) {
      obj->set_mark(m);
    }

-    // preserve "next" mark bit
-    if (_g1->mark_in_progress() && !_g1->is_obj_ill(old)) {
+    // Mark the evacuated object or propagate "next" mark bit
+    if (should_mark_copy) {
      if (!use_local_bitmaps ||
          !_par_scan_state->alloc_buffer(alloc_purpose)->mark(obj_ptr)) {
        // if we couldn't mark it on the local bitmap (this happens when
@@ -4266,11 +4285,12 @@ oop G1ParCopyHelper::copy_to_survivor_space(oop old) {
        // the bullet and do the standard parallel mark
        _cm->markAndGrayObjectIfNecessary(obj);
      }
-#if 1
+
      if (_g1->isMarkedNext(old)) {
+        // Unmark the object's old location so that marking
+        // doesn't think the old object is alive.
        _cm->nextMarkBitMap()->parClear((HeapWord*)old);
      }
-#endif
    }

    size_t* surv_young_words = _par_scan_state->surviving_young_words();
@@ -4293,26 +4313,62 @@ oop G1ParCopyHelper::copy_to_survivor_space(oop old) {
  return obj;
 }

-template <bool do_gen_barrier, G1Barrier barrier, bool do_mark_forwardee>
+template <bool do_gen_barrier, G1Barrier barrier, bool do_mark_object>
 template <class T>
-void G1ParCopyClosure <do_gen_barrier, barrier, do_mark_forwardee>
+void G1ParCopyClosure<do_gen_barrier, barrier, do_mark_object>
 ::do_oop_work(T* p) {
  oop obj = oopDesc::load_decode_heap_oop(p);
  assert(barrier != G1BarrierRS || obj != NULL,
         "Precondition: G1BarrierRS implies obj is nonNull");

+  // Marking:
+  // If the object is in the collection set, then the thread
+  // that copies the object should mark, or propagate the
+  // mark to, the evacuated object.
+  // If the object is not in the collection set then we
+  // should call the mark_object() method depending on the
+  // value of the template parameter do_mark_object (which will
+  // be true for root scanning closures during an initial mark
+  // pause).
+  // The mark_object() method first checks whether the object
+  // is marked and, if not, attempts to mark the object.
+
  // here the null check is implicit in the cset_fast_test() test
  if (_g1->in_cset_fast_test(obj)) {
    if (obj->is_forwarded()) {
      oopDesc::encode_store_heap_oop(p, obj->forwardee());
+      // If we are a root scanning closure during an initial
+      // mark pause (i.e. do_mark_object will be true) then
+      // we also need to handle marking of roots in the
+      // event of an evacuation failure. In the event of an
+      // evacuation failure, the object is forwarded to itself
+      // and not copied so let's mark it here.
+      if (do_mark_object && obj->forwardee() == obj) {
+        mark_object(p);
+      }
    } else {
-      oop copy_oop = copy_to_survivor_space(obj);
+      // We need to mark the copied object if we're a root scanning
+      // closure during an initial mark pause (i.e. do_mark_object
+      // will be true), or the object is already marked and we need
+      // to propagate the mark to the evacuated copy.
+      bool should_mark_copy = do_mark_object ||
+                              _during_initial_mark ||
+                              (_mark_in_progress && !_g1->is_obj_ill(obj));
+
+      oop copy_oop = copy_to_survivor_space(obj, should_mark_copy);
      oopDesc::encode_store_heap_oop(p, copy_oop);
    }
    // When scanning the RS, we only care about objs in CS.
    if (barrier == G1BarrierRS) {
      _par_scan_state->update_rs(_from, p, _par_scan_state->queue_num());
    }
+  } else {
+    // The object is not in collection set. If we're a root scanning
+    // closure during an initial mark pause (i.e. do_mark_object will
+    // be true) then attempt to mark the object.
+    if (do_mark_object) {
+      mark_object(p);
+    }
  }

  if (barrier == G1BarrierEvac && obj != NULL) {

--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
@@ -1715,26 +1715,22 @@ public:
 class G1ParGCAllocBuffer: public ParGCAllocBuffer {
 private:
  bool        _retired;
-  bool        _during_marking;
+  bool        _should_mark_objects;
  GCLabBitMap _bitmap;

 public:
-  G1ParGCAllocBuffer(size_t gclab_word_size) :
-    ParGCAllocBuffer(gclab_word_size),
-    _during_marking(G1CollectedHeap::heap()->mark_in_progress()),
-    _bitmap(G1CollectedHeap::heap()->reserved_region().start(), gclab_word_size),
-    _retired(false)
-  { }
+  G1ParGCAllocBuffer(size_t gclab_word_size);

  inline bool mark(HeapWord* addr) {
    guarantee(use_local_bitmaps, "invariant");
-    assert(_during_marking, "invariant");
+    assert(_should_mark_objects, "invariant");
    return _bitmap.mark(addr);
  }

  inline void set_buf(HeapWord* buf) {
-    if (use_local_bitmaps && _during_marking)
+    if (use_local_bitmaps && _should_mark_objects) {
      _bitmap.set_buffer(buf);
+    }
    ParGCAllocBuffer::set_buf(buf);
    _retired = false;
  }
@@ -1742,7 +1738,7 @@ public:
  inline void retire(bool end_of_gc, bool retain) {
    if (_retired)
      return;
-    if (use_local_bitmaps && _during_marking) {
+    if (use_local_bitmaps && _should_mark_objects) {
      _bitmap.retire();
    }
    ParGCAllocBuffer::retire(end_of_gc, retain);

--- a/src/share/vm/gc_implementation/g1/g1OopClosures.hpp
+++ b/src/share/vm/gc_implementation/g1/g1OopClosures.hpp
@@ -50,6 +50,8 @@ protected:
  G1RemSet* _g1_rem;
  ConcurrentMark* _cm;
  G1ParScanThreadState* _par_scan_state;
+  bool _during_initial_mark;
+  bool _mark_in_progress;
 public:
  G1ParClosureSuper(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state);
  bool apply_to_weak_ref_discovered_field() { return true; }
@@ -102,8 +104,8 @@ public:
 class G1ParCopyHelper : public G1ParClosureSuper {
  G1ParScanClosure *_scanner;
 protected:
-  template <class T> void mark_forwardee(T* p);
-  oop copy_to_survivor_space(oop obj);
+  template <class T> void mark_object(T* p);
+  oop copy_to_survivor_space(oop obj, bool should_mark_copy);
 public:
  G1ParCopyHelper(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state,
                  G1ParScanClosure *scanner) :
@@ -111,7 +113,7 @@ public:
 };

 template<bool do_gen_barrier, G1Barrier barrier,
-         bool do_mark_forwardee>
+         bool do_mark_object>
 class G1ParCopyClosure : public G1ParCopyHelper {
  G1ParScanClosure _scanner;
  template <class T> void do_oop_work(T* p);
@@ -120,8 +122,6 @@ public:
    _scanner(g1, par_scan_state), G1ParCopyHelper(g1, par_scan_state, &_scanner) { }
  template <class T> void do_oop_nv(T* p) {
    do_oop_work(p);
-    if (do_mark_forwardee)
-      mark_forwardee(p);
  }
  virtual void do_oop(oop* p)       { do_oop_nv(p); }
  virtual void do_oop(narrowOop* p) { do_oop_nv(p); }

--- a/src/share/vm/gc_implementation/g1/g1_globals.hpp
+++ b/src/share/vm/gc_implementation/g1/g1_globals.hpp
@@ -124,9 +124,6 @@
  develop(bool, G1RSBarrierNullFilter, true,                                \
          "If true, generate null-pointer filtering code in RS barrier")    \
                                                                            \
-  develop(bool, G1PrintCTFilterStats, false,                                \
-          "If true, print stats on RS filtering effectiveness")             \
-                                                                            \
  develop(bool, G1DeferredRSUpdate, true,                                   \
          "If true, use deferred RS updates")                               \
                                                                            \

--- a/src/share/vm/gc_implementation/g1/g1_specialized_oop_closures.hpp
+++ b/src/share/vm/gc_implementation/g1/g1_specialized_oop_closures.hpp
@@ -36,7 +36,7 @@ enum G1Barrier {
 };

 template<bool do_gen_barrier, G1Barrier barrier,
-         bool do_mark_forwardee>
+         bool do_mark_object>
 class G1ParCopyClosure;
 class G1ParScanClosure;
 class G1ParPushHeapRSClosure;

--- a/src/share/vm/runtime/arguments.cpp
+++ b/src/share/vm/runtime/arguments.cpp
@@ -1423,6 +1423,9 @@ void Arguments::set_parallel_gc_flags() {
    if (FLAG_IS_DEFAULT(MinHeapDeltaBytes)) {
      FLAG_SET_DEFAULT(MinHeapDeltaBytes, 64*M);
    }
+    // For those collectors or operating systems (eg, Windows) that do
+    // not support full UseNUMA, we will map to UseNUMAInterleaving for now
+    UseNUMAInterleaving = true;
  }
 }


--- a/src/share/vm/runtime/globals.hpp
+++ b/src/share/vm/runtime/globals.hpp
@@ -475,6 +475,12 @@ class CommandLineFlags {
  product(bool, UseNUMA, false,                                             \
          "Use NUMA if available")                                          \
                                                                            \
+  product(bool, UseNUMAInterleaving, false,                                 \
+          "Interleave memory across NUMA nodes if available")               \
+                                                                            \
+  product(uintx, NUMAInterleaveGranularity, 2*M,                            \
+          "Granularity to use for NUMA interleaving on Windows OS")         \
+                                                                            \
  product(bool, ForceNUMA, false,                                           \
          "Force NUMA optimizations on single-node/UMA systems")            \
                                                                            \