提交 f9dfe7e1 编写于 作者: K kvn

6532536: Optimize arraycopy stubs for Intel cpus

Summary: Use SSE2 movdqu in arraycopy stubs on the newest Intel CPUs
Reviewed-by: rasbold
上级 79e27e98
...@@ -1575,6 +1575,35 @@ void Assembler::movdqa(Address dst, XMMRegister src) { ...@@ -1575,6 +1575,35 @@ void Assembler::movdqa(Address dst, XMMRegister src) {
emit_operand(src, dst); emit_operand(src, dst);
} }
// MOVDQU xmm, m128 (F3 0F 6F /r): unaligned 128-bit load from memory.
void Assembler::movdqu(XMMRegister dst, Address src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));  // movdqu is an SSE2 instruction
InstructionMark im(this);
emit_byte(0xF3);   // mandatory F3 prefix selects MOVDQU (vs MOVDQA's 66)
prefix(src, dst);  // REX prefix (if any) must follow F3 and precede the opcode
emit_byte(0x0F);
emit_byte(0x6F);   // 0F 6F = load form (register destination)
emit_operand(dst, src);
}
// MOVDQU xmm, xmm (F3 0F 6F /r): 128-bit register-to-register move.
void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));  // movdqu is an SSE2 instruction
emit_byte(0xF3);  // mandatory F3 prefix selects MOVDQU
// NOTE(review): prefixq_and_encode emits a REX.W prefix, which MOVDQU
// ignores; prefix_and_encode would give the same semantics with a shorter
// encoding on 64-bit — confirm whether REX.W is intentional here.
int encode = prefixq_and_encode(dst->encoding(), src->encoding());
emit_byte(0x0F);
emit_byte(0x6F);
emit_byte(0xC0 | encode);  // ModRM: mod=11 (register-direct), reg=dst, rm=src
}
// MOVDQU m128, xmm (F3 0F 7F /r): unaligned 128-bit store to memory.
void Assembler::movdqu(Address dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));  // movdqu is an SSE2 instruction
InstructionMark im(this);
emit_byte(0xF3);   // mandatory F3 prefix selects MOVDQU (vs MOVDQA's 66)
prefix(dst, src);  // REX prefix (if any) must follow F3 and precede the opcode
emit_byte(0x0F);
emit_byte(0x7F);   // 0F 7F = store form (memory destination)
emit_operand(src, dst);
}
// Uses zero extension on 64bit // Uses zero extension on 64bit
void Assembler::movl(Register dst, int32_t imm32) { void Assembler::movl(Register dst, int32_t imm32) {
......
...@@ -1055,6 +1055,11 @@ private: ...@@ -1055,6 +1055,11 @@ private:
void movdqa(XMMRegister dst, Address src); void movdqa(XMMRegister dst, Address src);
void movdqa(XMMRegister dst, XMMRegister src); void movdqa(XMMRegister dst, XMMRegister src);
// Move Unaligned Double Quadword (MOVDQU, SSE2): 128-bit load/store/move
// with no 16-byte alignment requirement on the memory operand.
void movdqu(Address dst, XMMRegister src);
void movdqu(XMMRegister dst, Address src);
void movdqu(XMMRegister dst, XMMRegister src);
void movl(Register dst, int32_t imm32); void movl(Register dst, int32_t imm32);
void movl(Address dst, int32_t imm32); void movl(Address dst, int32_t imm32);
void movl(Register dst, Register src); void movl(Register dst, Register src);
......
...@@ -791,6 +791,69 @@ class StubGenerator: public StubCodeGenerator { ...@@ -791,6 +791,69 @@ class StubGenerator: public StubCodeGenerator {
} }
} }
// Copy 64 bytes chunks
//
// Inputs:
// from - source array address
// to_from - destination array address - from
// qword_count - 8-bytes element count (positive; the loop subtracts
//               toward zero, so a negative count would copy nothing)
//
// Emits a forward copy of qword_count*8 bytes: a main loop moving 64 bytes
// per iteration through XMM registers, then an 8-bytes-at-a-time tail loop.
// Destination addresses are formed as from + to_from, so only 'from' is
// advanced during the copy.
void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
assert( UseSSE >= 2, "supported cpu only" );
Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
// Copy 64-byte chunks
__ jmpb(L_copy_64_bytes);  // enter at the count check, not the loop body
__ align(16);              // align the hot loop head
__ BIND(L_copy_64_bytes_loop);
if(UseUnalignedLoadStores) {
// Four unaligned 16-byte SSE2 loads/stores per iteration.
__ movdqu(xmm0, Address(from, 0));
__ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
__ movdqu(xmm1, Address(from, 16));
__ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
__ movdqu(xmm2, Address(from, 32));
__ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
__ movdqu(xmm3, Address(from, 48));
__ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
} else {
// Eight 8-byte moves through XMM registers (no 16-byte unaligned access).
__ movq(xmm0, Address(from, 0));
__ movq(Address(from, to_from, Address::times_1, 0), xmm0);
__ movq(xmm1, Address(from, 8));
__ movq(Address(from, to_from, Address::times_1, 8), xmm1);
__ movq(xmm2, Address(from, 16));
__ movq(Address(from, to_from, Address::times_1, 16), xmm2);
__ movq(xmm3, Address(from, 24));
__ movq(Address(from, to_from, Address::times_1, 24), xmm3);
__ movq(xmm4, Address(from, 32));
__ movq(Address(from, to_from, Address::times_1, 32), xmm4);
__ movq(xmm5, Address(from, 40));
__ movq(Address(from, to_from, Address::times_1, 40), xmm5);
__ movq(xmm6, Address(from, 48));
__ movq(Address(from, to_from, Address::times_1, 48), xmm6);
__ movq(xmm7, Address(from, 56));
__ movq(Address(from, to_from, Address::times_1, 56), xmm7);
}
__ addl(from, 64);
__ BIND(L_copy_64_bytes);
__ subl(qword_count, 8);  // another full 8-qword chunk available?
__ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
__ addl(qword_count, 8);  // undo; qword_count = leftover qwords (0..7)
__ jccb(Assembler::zero, L_exit);
//
// length is too short, just copy qwords
//
__ BIND(L_copy_8_bytes);
__ movq(xmm0, Address(from, 0));
__ movq(Address(from, to_from, Address::times_1), xmm0);
__ addl(from, 8);
__ decrement(qword_count);
__ jcc(Assembler::greater, L_copy_8_bytes);
__ BIND(L_exit);
}
// Copy 64 bytes chunks // Copy 64 bytes chunks
// //
// Inputs: // Inputs:
...@@ -799,6 +862,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -799,6 +862,7 @@ class StubGenerator: public StubCodeGenerator {
// qword_count - 8-bytes element count, negative // qword_count - 8-bytes element count, negative
// //
void mmx_copy_forward(Register from, Register to_from, Register qword_count) { void mmx_copy_forward(Register from, Register to_from, Register qword_count) {
assert( VM_Version::supports_mmx(), "supported cpu only" );
Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
// Copy 64-byte chunks // Copy 64-byte chunks
__ jmpb(L_copy_64_bytes); __ jmpb(L_copy_64_bytes);
...@@ -876,7 +940,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -876,7 +940,7 @@ class StubGenerator: public StubCodeGenerator {
__ subptr(to, from); // to --> to_from __ subptr(to, from); // to --> to_from
__ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
__ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
if (!aligned && (t == T_BYTE || t == T_SHORT)) { if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
// align source address at 4 bytes address boundary // align source address at 4 bytes address boundary
if (t == T_BYTE) { if (t == T_BYTE) {
// One byte misalignment happens only for byte arrays // One byte misalignment happens only for byte arrays
...@@ -906,20 +970,26 @@ class StubGenerator: public StubCodeGenerator { ...@@ -906,20 +970,26 @@ class StubGenerator: public StubCodeGenerator {
__ mov(count, rax); // restore 'count' __ mov(count, rax); // restore 'count'
__ jmpb(L_copy_2_bytes); // all dwords were copied __ jmpb(L_copy_2_bytes); // all dwords were copied
} else { } else {
// align to 8 bytes, we know we are 4 byte aligned to start if (!UseUnalignedLoadStores) {
__ testptr(from, 4); // align to 8 bytes, we know we are 4 byte aligned to start
__ jccb(Assembler::zero, L_copy_64_bytes); __ testptr(from, 4);
__ movl(rax, Address(from, 0)); __ jccb(Assembler::zero, L_copy_64_bytes);
__ movl(Address(from, to_from, Address::times_1, 0), rax); __ movl(rax, Address(from, 0));
__ addptr(from, 4); __ movl(Address(from, to_from, Address::times_1, 0), rax);
__ subl(count, 1<<shift); __ addptr(from, 4);
__ subl(count, 1<<shift);
}
__ BIND(L_copy_64_bytes); __ BIND(L_copy_64_bytes);
__ mov(rax, count); __ mov(rax, count);
__ shrl(rax, shift+1); // 8 bytes chunk count __ shrl(rax, shift+1); // 8 bytes chunk count
// //
// Copy 8-byte chunks through MMX registers, 8 per iteration of the loop // Copy 8-byte chunks through MMX registers, 8 per iteration of the loop
// //
mmx_copy_forward(from, to_from, rax); if (UseXMMForArrayCopy) {
xmm_copy_forward(from, to_from, rax);
} else {
mmx_copy_forward(from, to_from, rax);
}
} }
// copy tailing dword // copy tailing dword
__ BIND(L_copy_4_bytes); __ BIND(L_copy_4_bytes);
...@@ -1069,13 +1139,20 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1069,13 +1139,20 @@ class StubGenerator: public StubCodeGenerator {
__ align(16); __ align(16);
// Move 8 bytes // Move 8 bytes
__ BIND(L_copy_8_bytes_loop); __ BIND(L_copy_8_bytes_loop);
__ movq(mmx0, Address(from, count, sf, 0)); if (UseXMMForArrayCopy) {
__ movq(Address(to, count, sf, 0), mmx0); __ movq(xmm0, Address(from, count, sf, 0));
__ movq(Address(to, count, sf, 0), xmm0);
} else {
__ movq(mmx0, Address(from, count, sf, 0));
__ movq(Address(to, count, sf, 0), mmx0);
}
__ BIND(L_copy_8_bytes); __ BIND(L_copy_8_bytes);
__ subl(count, 2<<shift); __ subl(count, 2<<shift);
__ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop); __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
__ addl(count, 2<<shift); __ addl(count, 2<<shift);
__ emms(); if (!UseXMMForArrayCopy) {
__ emms();
}
} }
__ BIND(L_copy_4_bytes); __ BIND(L_copy_4_bytes);
// copy prefix qword // copy prefix qword
...@@ -1143,7 +1220,11 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1143,7 +1220,11 @@ class StubGenerator: public StubCodeGenerator {
__ subptr(to, from); // to --> to_from __ subptr(to, from); // to --> to_from
if (VM_Version::supports_mmx()) { if (VM_Version::supports_mmx()) {
mmx_copy_forward(from, to_from, count); if (UseXMMForArrayCopy) {
xmm_copy_forward(from, to_from, count);
} else {
mmx_copy_forward(from, to_from, count);
}
} else { } else {
__ jmpb(L_copy_8_bytes); __ jmpb(L_copy_8_bytes);
__ align(16); __ align(16);
...@@ -1196,8 +1277,13 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1196,8 +1277,13 @@ class StubGenerator: public StubCodeGenerator {
__ align(16); __ align(16);
__ BIND(L_copy_8_bytes_loop); __ BIND(L_copy_8_bytes_loop);
if (VM_Version::supports_mmx()) { if (VM_Version::supports_mmx()) {
__ movq(mmx0, Address(from, count, Address::times_8)); if (UseXMMForArrayCopy) {
__ movq(Address(to, count, Address::times_8), mmx0); __ movq(xmm0, Address(from, count, Address::times_8));
__ movq(Address(to, count, Address::times_8), xmm0);
} else {
__ movq(mmx0, Address(from, count, Address::times_8));
__ movq(Address(to, count, Address::times_8), mmx0);
}
} else { } else {
__ fild_d(Address(from, count, Address::times_8)); __ fild_d(Address(from, count, Address::times_8));
__ fistp_d(Address(to, count, Address::times_8)); __ fistp_d(Address(to, count, Address::times_8));
...@@ -1206,7 +1292,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1206,7 +1292,7 @@ class StubGenerator: public StubCodeGenerator {
__ decrement(count); __ decrement(count);
__ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop); __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
if (VM_Version::supports_mmx()) { if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) {
__ emms(); __ emms();
} }
inc_copy_counter_np(T_LONG); inc_copy_counter_np(T_LONG);
......
...@@ -1251,6 +1251,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1251,6 +1251,7 @@ class StubGenerator: public StubCodeGenerator {
} }
} }
// Copy big chunks forward // Copy big chunks forward
// //
// Inputs: // Inputs:
...@@ -1268,14 +1269,22 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1268,14 +1269,22 @@ class StubGenerator: public StubCodeGenerator {
Label L_loop; Label L_loop;
__ align(16); __ align(16);
__ BIND(L_loop); __ BIND(L_loop);
__ movq(to, Address(end_from, qword_count, Address::times_8, -24)); if(UseUnalignedLoadStores) {
__ movq(Address(end_to, qword_count, Address::times_8, -24), to); __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
__ movq(to, Address(end_from, qword_count, Address::times_8, -16)); __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
__ movq(Address(end_to, qword_count, Address::times_8, -16), to); __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
__ movq(to, Address(end_from, qword_count, Address::times_8, - 8)); __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
__ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
__ movq(to, Address(end_from, qword_count, Address::times_8, - 0)); } else {
__ movq(Address(end_to, qword_count, Address::times_8, - 0), to); __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
__ movq(Address(end_to, qword_count, Address::times_8, -24), to);
__ movq(to, Address(end_from, qword_count, Address::times_8, -16));
__ movq(Address(end_to, qword_count, Address::times_8, -16), to);
__ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
__ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
__ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
__ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
}
__ BIND(L_copy_32_bytes); __ BIND(L_copy_32_bytes);
__ addptr(qword_count, 4); __ addptr(qword_count, 4);
__ jcc(Assembler::lessEqual, L_loop); __ jcc(Assembler::lessEqual, L_loop);
...@@ -1301,14 +1310,22 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1301,14 +1310,22 @@ class StubGenerator: public StubCodeGenerator {
Label L_loop; Label L_loop;
__ align(16); __ align(16);
__ BIND(L_loop); __ BIND(L_loop);
__ movq(to, Address(from, qword_count, Address::times_8, 24)); if(UseUnalignedLoadStores) {
__ movq(Address(dest, qword_count, Address::times_8, 24), to); __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
__ movq(to, Address(from, qword_count, Address::times_8, 16)); __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
__ movq(Address(dest, qword_count, Address::times_8, 16), to); __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
__ movq(to, Address(from, qword_count, Address::times_8, 8)); __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
__ movq(Address(dest, qword_count, Address::times_8, 8), to);
__ movq(to, Address(from, qword_count, Address::times_8, 0)); } else {
__ movq(Address(dest, qword_count, Address::times_8, 0), to); __ movq(to, Address(from, qword_count, Address::times_8, 24));
__ movq(Address(dest, qword_count, Address::times_8, 24), to);
__ movq(to, Address(from, qword_count, Address::times_8, 16));
__ movq(Address(dest, qword_count, Address::times_8, 16), to);
__ movq(to, Address(from, qword_count, Address::times_8, 8));
__ movq(Address(dest, qword_count, Address::times_8, 8), to);
__ movq(to, Address(from, qword_count, Address::times_8, 0));
__ movq(Address(dest, qword_count, Address::times_8, 0), to);
}
__ BIND(L_copy_32_bytes); __ BIND(L_copy_32_bytes);
__ subptr(qword_count, 4); __ subptr(qword_count, 4);
__ jcc(Assembler::greaterEqual, L_loop); __ jcc(Assembler::greaterEqual, L_loop);
......
...@@ -242,9 +242,11 @@ void VM_Version::get_processor_features() { ...@@ -242,9 +242,11 @@ void VM_Version::get_processor_features() {
_supports_cx8 = supports_cmpxchg8(); _supports_cx8 = supports_cmpxchg8();
// if the OS doesn't support SSE, we can't use this feature even if the HW does // if the OS doesn't support SSE, we can't use this feature even if the HW does
if( !os::supports_sse()) if( !os::supports_sse())
_cpuFeatures &= ~(CPU_SSE|CPU_SSE2|CPU_SSE3|CPU_SSSE3|CPU_SSE4|CPU_SSE4A); _cpuFeatures &= ~(CPU_SSE|CPU_SSE2|CPU_SSE3|CPU_SSSE3|CPU_SSE4A|CPU_SSE4_1|CPU_SSE4_2);
if (UseSSE < 4) if (UseSSE < 4) {
_cpuFeatures &= ~CPU_SSE4; _cpuFeatures &= ~CPU_SSE4_1;
_cpuFeatures &= ~CPU_SSE4_2;
}
if (UseSSE < 3) { if (UseSSE < 3) {
_cpuFeatures &= ~CPU_SSE3; _cpuFeatures &= ~CPU_SSE3;
_cpuFeatures &= ~CPU_SSSE3; _cpuFeatures &= ~CPU_SSSE3;
...@@ -261,7 +263,7 @@ void VM_Version::get_processor_features() { ...@@ -261,7 +263,7 @@ void VM_Version::get_processor_features() {
} }
char buf[256]; char buf[256];
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s", jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
cores_per_cpu(), threads_per_core(), cores_per_cpu(), threads_per_core(),
cpu_family(), _model, _stepping, cpu_family(), _model, _stepping,
(supports_cmov() ? ", cmov" : ""), (supports_cmov() ? ", cmov" : ""),
...@@ -272,7 +274,8 @@ void VM_Version::get_processor_features() { ...@@ -272,7 +274,8 @@ void VM_Version::get_processor_features() {
(supports_sse2() ? ", sse2" : ""), (supports_sse2() ? ", sse2" : ""),
(supports_sse3() ? ", sse3" : ""), (supports_sse3() ? ", sse3" : ""),
(supports_ssse3()? ", ssse3": ""), (supports_ssse3()? ", ssse3": ""),
(supports_sse4() ? ", sse4" : ""), (supports_sse4_1() ? ", sse4.1" : ""),
(supports_sse4_2() ? ", sse4.2" : ""),
(supports_mmx_ext() ? ", mmxext" : ""), (supports_mmx_ext() ? ", mmxext" : ""),
(supports_3dnow() ? ", 3dnow" : ""), (supports_3dnow() ? ", 3dnow" : ""),
(supports_3dnow2() ? ", 3dnowext" : ""), (supports_3dnow2() ? ", 3dnowext" : ""),
...@@ -285,7 +288,7 @@ void VM_Version::get_processor_features() { ...@@ -285,7 +288,7 @@ void VM_Version::get_processor_features() {
// older Pentiums which do not support it. // older Pentiums which do not support it.
if( UseSSE > 4 ) UseSSE=4; if( UseSSE > 4 ) UseSSE=4;
if( UseSSE < 0 ) UseSSE=0; if( UseSSE < 0 ) UseSSE=0;
if( !supports_sse4() ) // Drop to 3 if no SSE4 support if( !supports_sse4_1() ) // Drop to 3 if no SSE4 support
UseSSE = MIN2((intx)3,UseSSE); UseSSE = MIN2((intx)3,UseSSE);
if( !supports_sse3() ) // Drop to 2 if no SSE3 support if( !supports_sse3() ) // Drop to 2 if no SSE3 support
UseSSE = MIN2((intx)2,UseSSE); UseSSE = MIN2((intx)2,UseSSE);
...@@ -375,6 +378,14 @@ void VM_Version::get_processor_features() { ...@@ -375,6 +378,14 @@ void VM_Version::get_processor_features() {
MaxLoopPad = 11; MaxLoopPad = 11;
} }
#endif // COMPILER2 #endif // COMPILER2
if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) {
UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus
}
if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus
if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) {
UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
}
}
} }
} }
...@@ -413,7 +424,7 @@ void VM_Version::get_processor_features() { ...@@ -413,7 +424,7 @@ void VM_Version::get_processor_features() {
#ifndef PRODUCT #ifndef PRODUCT
if (PrintMiscellaneous && Verbose) { if (PrintMiscellaneous && Verbose) {
tty->print_cr("Logical CPUs per package: %u", tty->print_cr("Logical CPUs per core: %u",
logical_processors_per_package()); logical_processors_per_package());
tty->print_cr("UseSSE=%d",UseSSE); tty->print_cr("UseSSE=%d",UseSSE);
tty->print("Allocation: "); tty->print("Allocation: ");
......
...@@ -68,9 +68,9 @@ public: ...@@ -68,9 +68,9 @@ public:
cmpxchg16: 1, cmpxchg16: 1,
: 4, : 4,
dca : 1, dca : 1,
: 4, sse4_1 : 1,
popcnt : 1, sse4_2 : 1,
: 8; : 11;
} bits; } bits;
}; };
...@@ -177,8 +177,9 @@ protected: ...@@ -177,8 +177,9 @@ protected:
CPU_SSE2 = (1 << 7), CPU_SSE2 = (1 << 7),
CPU_SSE3 = (1 << 8), // sse3 comes from cpuid 1 (ECX) CPU_SSE3 = (1 << 8), // sse3 comes from cpuid 1 (ECX)
CPU_SSSE3= (1 << 9), CPU_SSSE3= (1 << 9),
CPU_SSE4 = (1 <<10), CPU_SSE4A= (1 <<10),
CPU_SSE4A= (1 <<11) CPU_SSE4_1 = (1 << 11),
CPU_SSE4_2 = (1 << 12)
} cpuFeatureFlags; } cpuFeatureFlags;
// cpuid information block. All info derived from executing cpuid with // cpuid information block. All info derived from executing cpuid with
...@@ -240,22 +241,14 @@ protected: ...@@ -240,22 +241,14 @@ protected:
static CpuidInfo _cpuid_info; static CpuidInfo _cpuid_info;
// Extractors and predicates // Extractors and predicates
static bool is_extended_cpu_family() {
const uint32_t Extended_Cpu_Family = 0xf;
return _cpuid_info.std_cpuid1_rax.bits.family == Extended_Cpu_Family;
}
static uint32_t extended_cpu_family() { static uint32_t extended_cpu_family() {
uint32_t result = _cpuid_info.std_cpuid1_rax.bits.family; uint32_t result = _cpuid_info.std_cpuid1_rax.bits.family;
if (is_extended_cpu_family()) { result += _cpuid_info.std_cpuid1_rax.bits.ext_family;
result += _cpuid_info.std_cpuid1_rax.bits.ext_family;
}
return result; return result;
} }
static uint32_t extended_cpu_model() { static uint32_t extended_cpu_model() {
uint32_t result = _cpuid_info.std_cpuid1_rax.bits.model; uint32_t result = _cpuid_info.std_cpuid1_rax.bits.model;
if (is_extended_cpu_family()) { result |= _cpuid_info.std_cpuid1_rax.bits.ext_model << 4;
result |= _cpuid_info.std_cpuid1_rax.bits.ext_model << 4;
}
return result; return result;
} }
static uint32_t cpu_stepping() { static uint32_t cpu_stepping() {
...@@ -293,6 +286,10 @@ protected: ...@@ -293,6 +286,10 @@ protected:
result |= CPU_SSSE3; result |= CPU_SSSE3;
if (is_amd() && _cpuid_info.ext_cpuid1_rcx.bits.sse4a != 0) if (is_amd() && _cpuid_info.ext_cpuid1_rcx.bits.sse4a != 0)
result |= CPU_SSE4A; result |= CPU_SSE4A;
if (_cpuid_info.std_cpuid1_rcx.bits.sse4_1 != 0)
result |= CPU_SSE4_1;
if (_cpuid_info.std_cpuid1_rcx.bits.sse4_2 != 0)
result |= CPU_SSE4_2;
return result; return result;
} }
...@@ -380,7 +377,8 @@ public: ...@@ -380,7 +377,8 @@ public:
static bool supports_sse2() { return (_cpuFeatures & CPU_SSE2) != 0; } static bool supports_sse2() { return (_cpuFeatures & CPU_SSE2) != 0; }
static bool supports_sse3() { return (_cpuFeatures & CPU_SSE3) != 0; } static bool supports_sse3() { return (_cpuFeatures & CPU_SSE3) != 0; }
static bool supports_ssse3() { return (_cpuFeatures & CPU_SSSE3)!= 0; } static bool supports_ssse3() { return (_cpuFeatures & CPU_SSSE3)!= 0; }
static bool supports_sse4() { return (_cpuFeatures & CPU_SSE4) != 0; } static bool supports_sse4_1() { return (_cpuFeatures & CPU_SSE4_1) != 0; }
static bool supports_sse4_2() { return (_cpuFeatures & CPU_SSE4_2) != 0; }
// //
// AMD features // AMD features
// //
......
...@@ -186,8 +186,10 @@ void VM_Version::get_processor_features() { ...@@ -186,8 +186,10 @@ void VM_Version::get_processor_features() {
if (!VM_Version::supports_sse2()) { if (!VM_Version::supports_sse2()) {
vm_exit_during_initialization("Unknown x64 processor: SSE2 not supported"); vm_exit_during_initialization("Unknown x64 processor: SSE2 not supported");
} }
if (UseSSE < 4) if (UseSSE < 4) {
_cpuFeatures &= ~CPU_SSE4; _cpuFeatures &= ~CPU_SSE4_1;
_cpuFeatures &= ~CPU_SSE4_2;
}
if (UseSSE < 3) { if (UseSSE < 3) {
_cpuFeatures &= ~CPU_SSE3; _cpuFeatures &= ~CPU_SSE3;
_cpuFeatures &= ~CPU_SSSE3; _cpuFeatures &= ~CPU_SSSE3;
...@@ -204,7 +206,7 @@ void VM_Version::get_processor_features() { ...@@ -204,7 +206,7 @@ void VM_Version::get_processor_features() {
} }
char buf[256]; char buf[256];
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s", jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
cores_per_cpu(), threads_per_core(), cores_per_cpu(), threads_per_core(),
cpu_family(), _model, _stepping, cpu_family(), _model, _stepping,
(supports_cmov() ? ", cmov" : ""), (supports_cmov() ? ", cmov" : ""),
...@@ -215,7 +217,8 @@ void VM_Version::get_processor_features() { ...@@ -215,7 +217,8 @@ void VM_Version::get_processor_features() {
(supports_sse2() ? ", sse2" : ""), (supports_sse2() ? ", sse2" : ""),
(supports_sse3() ? ", sse3" : ""), (supports_sse3() ? ", sse3" : ""),
(supports_ssse3()? ", ssse3": ""), (supports_ssse3()? ", ssse3": ""),
(supports_sse4() ? ", sse4" : ""), (supports_sse4_1() ? ", sse4.1" : ""),
(supports_sse4_2() ? ", sse4.2" : ""),
(supports_mmx_ext() ? ", mmxext" : ""), (supports_mmx_ext() ? ", mmxext" : ""),
(supports_3dnow() ? ", 3dnow" : ""), (supports_3dnow() ? ", 3dnow" : ""),
(supports_3dnow2() ? ", 3dnowext" : ""), (supports_3dnow2() ? ", 3dnowext" : ""),
...@@ -228,7 +231,7 @@ void VM_Version::get_processor_features() { ...@@ -228,7 +231,7 @@ void VM_Version::get_processor_features() {
// older Pentiums which do not support it. // older Pentiums which do not support it.
if( UseSSE > 4 ) UseSSE=4; if( UseSSE > 4 ) UseSSE=4;
if( UseSSE < 0 ) UseSSE=0; if( UseSSE < 0 ) UseSSE=0;
if( !supports_sse4() ) // Drop to 3 if no SSE4 support if( !supports_sse4_1() ) // Drop to 3 if no SSE4 support
UseSSE = MIN2((intx)3,UseSSE); UseSSE = MIN2((intx)3,UseSSE);
if( !supports_sse3() ) // Drop to 2 if no SSE3 support if( !supports_sse3() ) // Drop to 2 if no SSE3 support
UseSSE = MIN2((intx)2,UseSSE); UseSSE = MIN2((intx)2,UseSSE);
...@@ -314,6 +317,14 @@ void VM_Version::get_processor_features() { ...@@ -314,6 +317,14 @@ void VM_Version::get_processor_features() {
MaxLoopPad = 11; MaxLoopPad = 11;
} }
#endif // COMPILER2 #endif // COMPILER2
if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) {
UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus
}
if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus
if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) {
UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
}
}
} }
} }
...@@ -355,7 +366,7 @@ void VM_Version::get_processor_features() { ...@@ -355,7 +366,7 @@ void VM_Version::get_processor_features() {
#ifndef PRODUCT #ifndef PRODUCT
if (PrintMiscellaneous && Verbose) { if (PrintMiscellaneous && Verbose) {
tty->print_cr("Logical CPUs per package: %u", tty->print_cr("Logical CPUs per core: %u",
logical_processors_per_package()); logical_processors_per_package());
tty->print_cr("UseSSE=%d",UseSSE); tty->print_cr("UseSSE=%d",UseSSE);
tty->print("Allocation: "); tty->print("Allocation: ");
......
...@@ -68,9 +68,9 @@ public: ...@@ -68,9 +68,9 @@ public:
cmpxchg16: 1, cmpxchg16: 1,
: 4, : 4,
dca : 1, dca : 1,
: 4, sse4_1 : 1,
popcnt : 1, sse4_2 : 1,
: 8; : 11;
} bits; } bits;
}; };
...@@ -177,8 +177,9 @@ protected: ...@@ -177,8 +177,9 @@ protected:
CPU_SSE2 = (1 << 7), CPU_SSE2 = (1 << 7),
CPU_SSE3 = (1 << 8), CPU_SSE3 = (1 << 8),
CPU_SSSE3= (1 << 9), CPU_SSSE3= (1 << 9),
CPU_SSE4 = (1 <<10), CPU_SSE4A= (1 <<10),
CPU_SSE4A= (1 <<11) CPU_SSE4_1 = (1 << 11),
CPU_SSE4_2 = (1 << 12)
} cpuFeatureFlags; } cpuFeatureFlags;
// cpuid information block. All info derived from executing cpuid with // cpuid information block. All info derived from executing cpuid with
...@@ -240,22 +241,14 @@ protected: ...@@ -240,22 +241,14 @@ protected:
static CpuidInfo _cpuid_info; static CpuidInfo _cpuid_info;
// Extractors and predicates // Extractors and predicates
static bool is_extended_cpu_family() {
const uint32_t Extended_Cpu_Family = 0xf;
return _cpuid_info.std_cpuid1_eax.bits.family == Extended_Cpu_Family;
}
static uint32_t extended_cpu_family() { static uint32_t extended_cpu_family() {
uint32_t result = _cpuid_info.std_cpuid1_eax.bits.family; uint32_t result = _cpuid_info.std_cpuid1_eax.bits.family;
if (is_extended_cpu_family()) { result += _cpuid_info.std_cpuid1_eax.bits.ext_family;
result += _cpuid_info.std_cpuid1_eax.bits.ext_family;
}
return result; return result;
} }
static uint32_t extended_cpu_model() { static uint32_t extended_cpu_model() {
uint32_t result = _cpuid_info.std_cpuid1_eax.bits.model; uint32_t result = _cpuid_info.std_cpuid1_eax.bits.model;
if (is_extended_cpu_family()) { result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4;
result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4;
}
return result; return result;
} }
static uint32_t cpu_stepping() { static uint32_t cpu_stepping() {
...@@ -293,6 +286,10 @@ protected: ...@@ -293,6 +286,10 @@ protected:
result |= CPU_SSSE3; result |= CPU_SSSE3;
if (is_amd() && _cpuid_info.ext_cpuid1_ecx.bits.sse4a != 0) if (is_amd() && _cpuid_info.ext_cpuid1_ecx.bits.sse4a != 0)
result |= CPU_SSE4A; result |= CPU_SSE4A;
if (_cpuid_info.std_cpuid1_ecx.bits.sse4_1 != 0)
result |= CPU_SSE4_1;
if (_cpuid_info.std_cpuid1_ecx.bits.sse4_2 != 0)
result |= CPU_SSE4_2;
return result; return result;
} }
...@@ -380,7 +377,8 @@ public: ...@@ -380,7 +377,8 @@ public:
static bool supports_sse2() { return (_cpuFeatures & CPU_SSE2) != 0; } static bool supports_sse2() { return (_cpuFeatures & CPU_SSE2) != 0; }
static bool supports_sse3() { return (_cpuFeatures & CPU_SSE3) != 0; } static bool supports_sse3() { return (_cpuFeatures & CPU_SSE3) != 0; }
static bool supports_ssse3() { return (_cpuFeatures & CPU_SSSE3)!= 0; } static bool supports_ssse3() { return (_cpuFeatures & CPU_SSSE3)!= 0; }
static bool supports_sse4() { return (_cpuFeatures & CPU_SSE4) != 0; } static bool supports_sse4_1() { return (_cpuFeatures & CPU_SSE4_1) != 0; }
static bool supports_sse4_2() { return (_cpuFeatures & CPU_SSE4_2) != 0; }
// //
// AMD features // AMD features
// //
......
...@@ -3758,7 +3758,7 @@ int set_lwp_priority (int ThreadID, int lwpid, int newPrio ) ...@@ -3758,7 +3758,7 @@ int set_lwp_priority (int ThreadID, int lwpid, int newPrio )
int maxClamped = MIN2(iaLimits.maxPrio, (int)iaInfo->ia_uprilim); int maxClamped = MIN2(iaLimits.maxPrio, (int)iaInfo->ia_uprilim);
iaInfo->ia_upri = scale_to_lwp_priority(iaLimits.minPrio, maxClamped, newPrio); iaInfo->ia_upri = scale_to_lwp_priority(iaLimits.minPrio, maxClamped, newPrio);
iaInfo->ia_uprilim = IA_NOCHANGE; iaInfo->ia_uprilim = IA_NOCHANGE;
iaInfo->ia_nice = IA_NOCHANGE; // iaInfo->ia_nice = IA_NOCHANGE;
iaInfo->ia_mode = IA_NOCHANGE; iaInfo->ia_mode = IA_NOCHANGE;
if (ThreadPriorityVerbose) { if (ThreadPriorityVerbose) {
tty->print_cr ("IA: [%d...%d] %d->%d\n", tty->print_cr ("IA: [%d...%d] %d->%d\n",
......
...@@ -991,6 +991,12 @@ class CommandLineFlags { ...@@ -991,6 +991,12 @@ class CommandLineFlags {
product(bool, UseXmmI2F, false, \ product(bool, UseXmmI2F, false, \
"Use SSE2 CVTDQ2PS instruction to convert Integer to Float") \ "Use SSE2 CVTDQ2PS instruction to convert Integer to Float") \
\ \
product(bool, UseXMMForArrayCopy, false, \
"Use SSE2 MOVQ instruction for Arraycopy") \
\
product(bool, UseUnalignedLoadStores, false, \
"Use SSE2 MOVDQU instruction for Arraycopy") \
\
product(intx, FieldsAllocationStyle, 1, \ product(intx, FieldsAllocationStyle, 1, \
"0 - type based with oops first, 1 - with oops last") \ "0 - type based with oops first, 1 - with oops last") \
\ \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册