Commit 2c8dad1c authored by: kvn

7119644: Increase superword's vector size up to 256 bits

Summary: Increase vector size up to 256-bits for YMM AVX registers on x86.
Reviewed-by: never, twisti, roland
Parent 190ebde9
(This file's diff is collapsed.)
/*
* Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -217,6 +217,8 @@ void VM_Version::initialize() {
// Currently not supported anywhere.
FLAG_SET_DEFAULT(UseFPUForSpilling, false);
MaxVectorSize = 8;
assert((InteriorEntryAlignment % relocInfo::addr_unit()) == 0, "alignment is not a multiple of NOP size");
#endif
......
......@@ -1637,6 +1637,13 @@ void Assembler::movaps(XMMRegister dst, XMMRegister src) {
emit_byte(0xC0 | encode);
}
void Assembler::movlhps(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse(), ""));
int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE);
emit_byte(0x16);
emit_byte(0xC0 | encode);
}
void Assembler::movb(Register dst, Address src) {
NOT_LP64(assert(dst->has_byte_register(), "must have byte register"));
InstructionMark im(this);
......@@ -1686,6 +1693,14 @@ void Assembler::movdl(XMMRegister dst, Address src) {
emit_operand(dst, src);
}
void Assembler::movdl(Address dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
InstructionMark im(this);
simd_prefix(dst, src, VEX_SIMD_66);
emit_byte(0x7E);
emit_operand(src, dst);
}
void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
......@@ -1716,6 +1731,35 @@ void Assembler::movdqu(Address dst, XMMRegister src) {
emit_operand(src, dst);
}
// Move Unaligned 256bit Vector
void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) {
assert(UseAVX, "");
bool vector256 = true;
int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector256);
emit_byte(0x6F);
emit_byte(0xC0 | encode);
}
void Assembler::vmovdqu(XMMRegister dst, Address src) {
assert(UseAVX, "");
InstructionMark im(this);
bool vector256 = true;
vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector256);
emit_byte(0x6F);
emit_operand(dst, src);
}
void Assembler::vmovdqu(Address dst, XMMRegister src) {
assert(UseAVX, "");
InstructionMark im(this);
bool vector256 = true;
// swap src<->dst for encoding
assert(src != xnoreg, "sanity");
vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector256);
emit_byte(0x7F);
emit_operand(src, dst);
}
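As a usage illustration (not part of this patch), the three vmovdqu overloads cover the register-register, load, and store forms; a hypothetical stub that copies 32 bytes could combine the load and store forms:

// Hypothetical sketch; masm, src, and dst are illustrative names.
// Both moves are VEX.256-encoded, so UseAVX is required.
void copy32(MacroAssembler* masm, Register src, Register dst) {
  masm->vmovdqu(xmm0, Address(src, 0));  // unaligned 256-bit load
  masm->vmovdqu(Address(dst, 0), xmm0);  // unaligned 256-bit store
}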
// Uses zero extension on 64bit
void Assembler::movl(Register dst, int32_t imm32) {
......@@ -3112,6 +3156,13 @@ void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src) {
emit_operand(dst, src);
}
void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
assert(VM_Version::supports_avx(), "");
int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256);
emit_byte(0x57);
emit_byte(0xC0 | encode);
}
void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src) {
assert(VM_Version::supports_avx(), "");
InstructionMark im(this);
......@@ -3120,6 +3171,30 @@ void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src) {
emit_operand(dst, src);
}
void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
assert(VM_Version::supports_avx(), "");
int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_NONE, vector256);
emit_byte(0x57);
emit_byte(0xC0 | encode);
}
void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
assert(VM_Version::supports_avx(), "");
bool vector256 = true;
int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A);
emit_byte(0x18);
emit_byte(0xC0 | encode);
// 0x00 - insert into lower 128 bits
// 0x01 - insert into upper 128 bits
emit_byte(0x01);
}
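For context (illustrative, not from the patch): because this helper hard-codes imm8 = 0x01, it always targets the upper half, so a 256-bit value can be assembled from two 128-bit halves. The fragment below uses HotSpot's usual "__" shorthand for _masm->; lo and hi are hypothetical registers holding source addresses.

__ movdqu(xmm0, Address(lo, 0));    // lower 128 bits of the result
__ movdqu(xmm1, Address(hi, 0));    // upper 128 bits, staged in xmm1
__ vinsertf128h(xmm0, xmm0, xmm1);  // ymm0 = hi:lo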
void Assembler::vzeroupper() {
assert(VM_Version::supports_avx(), "");
(void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE);
emit_byte(0x77);
}
#ifndef _LP64
// 32bit only pieces of the assembler
......
/*
* Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -591,8 +591,9 @@ private:
void vex_prefix(XMMRegister dst, XMMRegister nds, Address src,
VexSimdPrefix pre, bool vector256 = false) {
vex_prefix(src, nds->encoding(), dst->encoding(),
pre, VEX_OPCODE_0F, false, vector256);
int dst_enc = dst->encoding();
int nds_enc = nds->is_valid() ? nds->encoding() : 0;
vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector256);
}
int vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
......@@ -600,9 +601,12 @@ private:
bool vex_w, bool vector256);
int vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
VexSimdPrefix pre, bool vector256 = false) {
return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
pre, VEX_OPCODE_0F, false, vector256);
VexSimdPrefix pre, bool vector256 = false,
VexOpcode opc = VEX_OPCODE_0F) {
int src_enc = src->encoding();
int dst_enc = dst->encoding();
int nds_enc = nds->is_valid() ? nds->encoding() : 0;
return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, false, vector256);
}
void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr,
......@@ -1261,6 +1265,7 @@ private:
void movdl(XMMRegister dst, Register src);
void movdl(Register dst, XMMRegister src);
void movdl(XMMRegister dst, Address src);
void movdl(Address dst, XMMRegister src);
// Move Double Quadword
void movdq(XMMRegister dst, Register src);
......@@ -1274,6 +1279,14 @@ private:
void movdqu(XMMRegister dst, Address src);
void movdqu(XMMRegister dst, XMMRegister src);
// Move Unaligned 256bit Vector
void vmovdqu(Address dst, XMMRegister src);
void vmovdqu(XMMRegister dst, Address src);
void vmovdqu(XMMRegister dst, XMMRegister src);
// Move lower 64bit to high 64bit in 128bit register
void movlhps(XMMRegister dst, XMMRegister src);
void movl(Register dst, int32_t imm32);
void movl(Address dst, int32_t imm32);
void movl(Register dst, Register src);
......@@ -1615,6 +1628,17 @@ private:
void vxorpd(XMMRegister dst, XMMRegister nds, Address src);
void vxorps(XMMRegister dst, XMMRegister nds, Address src);
// AVX Vector instructions.
void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
// AVX instruction which is used to clear upper 128 bits of YMM registers and
// to avoid transaction penalty between AVX and SSE states. There is no
// penalty if legacy SSE instructions are encoded using VEX prefix because
// they always clear upper 128 bits. It should be used before calling
// runtime code and native libraries.
void vzeroupper();
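A minimal sketch of the intended call-site discipline (the helper name below is hypothetical, not from this patch):

// After 256-bit AVX work, clear the upper YMM state before entering
// code that may run legacy-encoded SSE instructions.
__ vmovdqu(xmm0, Address(rsi, 0));
__ vzeroupper();
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, runtime_helper)));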
protected:
// The next instructions require 16-byte address alignment in SSE mode.
......@@ -2529,9 +2553,13 @@ public:
void vsubss(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vsubss(dst, nds, src); }
void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
// AVX Vector instructions
void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorpd(dst, nds, src, vector256); }
void vxorpd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorpd(dst, nds, src); }
void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorps(dst, nds, src, vector256); }
void vxorps(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorps(dst, nds, src); }
void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src);
......
/*
* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -35,7 +35,7 @@ const int ConcreteRegisterImpl::max_gpr = RegisterImpl::number_of_registers << 1
const int ConcreteRegisterImpl::max_fpr = ConcreteRegisterImpl::max_gpr +
2 * FloatRegisterImpl::number_of_registers;
const int ConcreteRegisterImpl::max_xmm = ConcreteRegisterImpl::max_fpr +
2 * XMMRegisterImpl::number_of_registers;
8 * XMMRegisterImpl::number_of_registers;
const char* RegisterImpl::name() const {
const char* names[number_of_registers] = {
#ifndef AMD64
......
/*
* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -158,7 +158,7 @@ class XMMRegisterImpl: public AbstractRegisterImpl {
XMMRegister successor() const { return as_XMMRegister(encoding() + 1); }
// accessors
int encoding() const { assert(is_valid(), "invalid register"); return (intptr_t)this; }
int encoding() const { assert(is_valid(), err_msg("invalid register (%d)", (int)(intptr_t)this )); return (intptr_t)this; }
bool is_valid() const { return 0 <= (intptr_t)this && (intptr_t)this < number_of_registers; }
const char* name() const;
};
......@@ -216,7 +216,7 @@ class ConcreteRegisterImpl : public AbstractRegisterImpl {
RegisterImpl::number_of_registers + // "H" half of a 64bit register
#endif // AMD64
2 * FloatRegisterImpl::number_of_registers +
2 * XMMRegisterImpl::number_of_registers +
8 * XMMRegisterImpl::number_of_registers +
1 // eflags
};
......
......@@ -467,6 +467,32 @@ void VM_Version::get_processor_features() {
if (!supports_avx ()) // Drop to 0 if no AVX support
UseAVX = 0;
#ifdef COMPILER2
if (UseFPUForSpilling) {
if (UseSSE < 2) {
// Only supported with SSE2+
FLAG_SET_DEFAULT(UseFPUForSpilling, false);
}
}
if (MaxVectorSize > 0) {
if (!is_power_of_2(MaxVectorSize)) {
warning("MaxVectorSize must be a power of 2");
FLAG_SET_DEFAULT(MaxVectorSize, 32);
}
if (MaxVectorSize > 32) {
FLAG_SET_DEFAULT(MaxVectorSize, 32);
}
if (MaxVectorSize > 16 && UseAVX == 0) {
// Only supported with AVX+
FLAG_SET_DEFAULT(MaxVectorSize, 16);
}
if (UseSSE < 2) {
// Only supported with SSE2+
FLAG_SET_DEFAULT(MaxVectorSize, 0);
}
}
#endif
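Net effect of this block, for reference: MaxVectorSize ends up a power of two no larger than 32, is capped at 16 unless AVX is enabled, and drops to 0 without SSE2.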
// On new cpus instructions which update whole XMM register should be used
// to prevent partial register stall due to dependencies on high half.
//
......@@ -544,6 +570,12 @@ void VM_Version::get_processor_features() {
}
}
#ifdef COMPILER2
if (MaxVectorSize > 16) {
// Limit vectors size to 16 bytes on current AMD cpus.
FLAG_SET_DEFAULT(MaxVectorSize, 16);
}
#endif // COMPILER2
}
if( is_intel() ) { // Intel cpus specific settings
......@@ -606,15 +638,6 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UsePopCountInstruction, false);
}
#ifdef COMPILER2
if (UseFPUForSpilling) {
if (UseSSE < 2) {
// Only supported with SSE2+
FLAG_SET_DEFAULT(UseFPUForSpilling, false);
}
}
#endif
assert(0 <= ReadPrefetchInstr && ReadPrefetchInstr <= 3, "invalid value");
assert(0 <= AllocatePrefetchInstr && AllocatePrefetchInstr <= 3, "invalid value");
......
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2006, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -48,8 +48,9 @@ void VMRegImpl::set_regName() {
XMMRegister xreg = ::as_XMMRegister(0);
for ( ; i < ConcreteRegisterImpl::max_xmm ; ) {
regName[i++] = xreg->name();
regName[i++] = xreg->name();
for (int j = 0 ; j < 8 ; j++) {
regName[i++] = xreg->name();
}
xreg = xreg->successor();
}
for ( ; i < ConcreteRegisterImpl::number_of_registers ; i ++ ) {
......
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2006, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -39,7 +39,7 @@ inline VMReg FloatRegisterImpl::as_VMReg() {
}
inline VMReg XMMRegisterImpl::as_VMReg() {
return VMRegImpl::as_VMReg((encoding() << 1) + ConcreteRegisterImpl::max_fpr);
return VMRegImpl::as_VMReg((encoding() << 3) + ConcreteRegisterImpl::max_fpr);
}
......@@ -75,7 +75,7 @@ inline FloatRegister VMRegImpl::as_FloatRegister() {
inline XMMRegister VMRegImpl::as_XMMRegister() {
assert( is_XMMRegister() && is_even(value()), "must be" );
// Yuk
return ::as_XMMRegister((value() - ConcreteRegisterImpl::max_fpr) >> 1);
return ::as_XMMRegister((value() - ConcreteRegisterImpl::max_fpr) >> 3);
}
inline bool VMRegImpl::is_concrete() {
......
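A standalone model of the new mapping (illustrative; max_fpr stands in for ConcreteRegisterImpl::max_fpr) shows why the shift width changed from 1 to 3: each XMM register now owns 8 consecutive 32-bit VMReg slots, enough to describe a full 256-bit YMM register.

#include <cassert>
// Model of the 8-slots-per-register VMReg <-> XMM encoding round trip.
int xmm_to_vmreg(int xmm_enc, int max_fpr) { return (xmm_enc << 3) + max_fpr; }
int vmreg_to_xmm(int vmreg,   int max_fpr) { return (vmreg - max_fpr) >> 3; }
int main() {
  const int max_fpr = 112;  // illustrative value only
  for (int enc = 0; enc < 16; enc++)
    assert(vmreg_to_xmm(xmm_to_vmreg(enc, max_fpr), max_fpr) == enc);
  return 0;
}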
(Diffs for 3 files are collapsed.)
/*
* Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -115,6 +115,12 @@ void ADLParser::parse() {
parse_err(SYNERR, "expected one of - instruct, operand, ins_attrib, op_attrib, source, register, pipeline, encode\n Found %s",ident);
}
}
// Add reg_class spill_regs after parsing.
RegisterForm *regBlock = _AD.get_registers();
if (regBlock == NULL) {
parse_err(SEMERR, "Did not declare 'register' definitions");
}
regBlock->addSpillRegClass();
// Done with parsing, check consistency.
......@@ -768,11 +774,12 @@ void ADLParser::source_hpp_parse(void) {
//------------------------------reg_parse--------------------------------------
void ADLParser::reg_parse(void) {
// Create the RegisterForm for the architecture description.
RegisterForm *regBlock = new RegisterForm(); // Build new Source object
regBlock->_linenum = linenum();
_AD.addForm(regBlock);
RegisterForm *regBlock = _AD.get_registers(); // Information about registers encoding
if (regBlock == NULL) {
// Create the RegisterForm for the architecture description.
regBlock = new RegisterForm(); // Build new Source object
_AD.addForm(regBlock);
}
skipws(); // Skip leading whitespace
if (_curchar == '%' && *(_ptr+1) == '{') {
......@@ -796,15 +803,11 @@ void ADLParser::reg_parse(void) {
parse_err(SYNERR, "Missing %c{ ... %c} block after register keyword.\n",'%','%');
return;
}
// Add reg_class spill_regs
regBlock->addSpillRegClass();
}
//------------------------------encode_parse-----------------------------------
void ADLParser::encode_parse(void) {
EncodeForm *encBlock; // Information about instruction/operand encoding
char *desc = NULL; // String representation of encode rule
_AD.getForm(&encBlock);
if ( encBlock == NULL) {
......
//
// Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
......@@ -911,12 +911,24 @@ const char *ArchDesc::getIdealType(const char *idealOp) {
// Find last character in idealOp, it specifies the type
char last_char = 0;
const char *ptr = idealOp;
for( ; *ptr != '\0'; ++ptr) {
for (; *ptr != '\0'; ++ptr) {
last_char = *ptr;
}
// Match Vector types.
if (strncmp(idealOp, "Vec",3)==0) {
switch(last_char) {
case 'S': return "TypeVect::VECTS";
case 'D': return "TypeVect::VECTD";
case 'X': return "TypeVect::VECTX";
case 'Y': return "TypeVect::VECTY";
default:
internal_err("Vector type %s with unrecognized type\n",idealOp);
}
}
// !!!!!
switch( last_char ) {
switch(last_char) {
case 'I': return "TypeInt::INT";
case 'P': return "TypePtr::BOTTOM";
case 'N': return "TypeNarrowOop::BOTTOM";
......
/*
* Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -265,47 +265,22 @@ Form::DataType Form::is_load_from_memory(const char *opType) const {
if( strcmp(opType,"LoadN")==0 ) return Form::idealN;
if( strcmp(opType,"LoadRange")==0 ) return Form::idealI;
if( strcmp(opType,"LoadS")==0 ) return Form::idealS;
if( strcmp(opType,"Load16B")==0 ) return Form::idealB;
if( strcmp(opType,"Load8B")==0 ) return Form::idealB;
if( strcmp(opType,"Load4B")==0 ) return Form::idealB;
if( strcmp(opType,"Load8C")==0 ) return Form::idealC;
if( strcmp(opType,"Load4C")==0 ) return Form::idealC;
if( strcmp(opType,"Load2C")==0 ) return Form::idealC;
if( strcmp(opType,"Load8S")==0 ) return Form::idealS;
if( strcmp(opType,"Load4S")==0 ) return Form::idealS;
if( strcmp(opType,"Load2S")==0 ) return Form::idealS;
if( strcmp(opType,"Load2D")==0 ) return Form::idealD;
if( strcmp(opType,"Load4F")==0 ) return Form::idealF;
if( strcmp(opType,"Load2F")==0 ) return Form::idealF;
if( strcmp(opType,"Load4I")==0 ) return Form::idealI;
if( strcmp(opType,"Load2I")==0 ) return Form::idealI;
if( strcmp(opType,"Load2L")==0 ) return Form::idealL;
if( strcmp(opType,"LoadVector")==0 ) return Form::idealV;
assert( strcmp(opType,"Load") != 0, "Must type Loads" );
return Form::none;
}
Form::DataType Form::is_store_to_memory(const char *opType) const {
if( strcmp(opType,"StoreB")==0) return Form::idealB;
if( strcmp(opType,"StoreCM")==0) return Form::idealB;
if( strcmp(opType,"StoreCM")==0) return Form::idealB;
if( strcmp(opType,"StoreC")==0) return Form::idealC;
if( strcmp(opType,"StoreD")==0) return Form::idealD;
if( strcmp(opType,"StoreF")==0) return Form::idealF;
if( strcmp(opType,"StoreI")==0) return Form::idealI;
if( strcmp(opType,"StoreL")==0) return Form::idealL;
if( strcmp(opType,"StoreP")==0) return Form::idealP;
if( strcmp(opType,"StoreN")==0) return Form::idealN;
if( strcmp(opType,"Store16B")==0) return Form::idealB;
if( strcmp(opType,"Store8B")==0) return Form::idealB;
if( strcmp(opType,"Store4B")==0) return Form::idealB;
if( strcmp(opType,"Store8C")==0) return Form::idealC;
if( strcmp(opType,"Store4C")==0) return Form::idealC;
if( strcmp(opType,"Store2C")==0) return Form::idealC;
if( strcmp(opType,"Store2D")==0) return Form::idealD;
if( strcmp(opType,"Store4F")==0) return Form::idealF;
if( strcmp(opType,"Store2F")==0) return Form::idealF;
if( strcmp(opType,"Store4I")==0) return Form::idealI;
if( strcmp(opType,"Store2I")==0) return Form::idealI;
if( strcmp(opType,"Store2L")==0) return Form::idealL;
if( strcmp(opType,"StoreN")==0) return Form::idealN;
if( strcmp(opType,"StoreVector")==0 ) return Form::idealV;
assert( strcmp(opType,"Store") != 0, "Must type Stores" );
return Form::none;
}
......
/*
* Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -172,7 +172,8 @@ public:
idealB = 6, // Byte type
idealC = 7, // Char type
idealS = 8, // String type
idealN = 9 // Narrow oop types
idealN = 9, // Narrow oop types
idealV = 10 // Vector type
};
// Convert ideal name to a DataType, return DataType::none if not a 'ConX'
Form::DataType ideal_to_const_type(const char *ideal_type_name) const;
......
/*
* Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -66,7 +66,7 @@ AllocClass *RegisterForm::addAllocClass(char *className) {
// for spill-slots/regs.
void RegisterForm::addSpillRegClass() {
// Stack slots start at the next available even register number.
_reg_ctr = (_reg_ctr+1) & ~1;
_reg_ctr = (_reg_ctr+7) & ~7;
const char *rc_name = "stack_slots";
RegClass *reg_class = new RegClass(rc_name);
reg_class->_stack_or_reg = true;
......@@ -150,9 +150,14 @@ bool RegisterForm::verify() {
int RegisterForm::RegMask_Size() {
// Need at least this many words
int words_for_regs = (_reg_ctr + 31)>>5;
// Add a few for incoming & outgoing arguments to calls.
// The array of Register Mask bits should be large enough to cover
// all the machine registers and all parameters that need to be passed
// on the stack (stack registers) up to some interesting limit. Methods
// that need more parameters will NOT be compiled. On Intel, the limit
// is something like 90+ parameters.
// Add a few (3 words == 96 bits) for incoming & outgoing arguments to calls.
// Round up to the next doubleword size.
return (words_for_regs + 2 + 1) & ~1;
return (words_for_regs + 3 + 1) & ~1;
}
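A worked example of the new sizing (standalone model, illustrative values): with _reg_ctr == 96 register bits, words_for_regs == (96 + 31) >> 5 == 3; adding 3 words for call arguments and rounding up to an even count gives (3 + 3 + 1) & ~1 == 6 words, i.e. a 192-bit register mask.

int regmask_size(int reg_ctr) {
  int words_for_regs = (reg_ctr + 31) >> 5;  // 32 mask bits per word
  return (words_for_regs + 3 + 1) & ~1;      // + 96 arg bits, round to even
}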
void RegisterForm::dump() { // Debug printer
......
/*
* Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -432,6 +432,14 @@ Form::DataType InstructForm::is_ideal_store() const {
return _matrule->is_ideal_store();
}
// Return 'true' if this instruction matches an ideal vector node
bool InstructForm::is_vector() const {
if( _matrule == NULL ) return false;
return _matrule->is_vector();
}
// Return the input register that must match the output register
// If this is not required, return 0
uint InstructForm::two_address(FormDict &globals) {
......@@ -751,6 +759,9 @@ bool InstructForm::captures_bottom_type(FormDict &globals) const {
if (needs_base_oop_edge(globals)) return true;
if (is_vector()) return true;
if (is_mach_constant()) return true;
return false;
}
......@@ -3381,11 +3392,8 @@ int MatchNode::needs_ideal_memory_edge(FormDict &globals) const {
"StoreI","StoreL","StoreP","StoreN","StoreD","StoreF" ,
"StoreB","StoreC","Store" ,"StoreFP",
"LoadI", "LoadUI2L", "LoadL", "LoadP" ,"LoadN", "LoadD" ,"LoadF" ,
"LoadB" , "LoadUB", "LoadUS" ,"LoadS" ,"Load" ,
"Store4I","Store2I","Store2L","Store2D","Store4F","Store2F","Store16B",
"Store8B","Store4B","Store8C","Store4C","Store2C",
"Load4I" ,"Load2I" ,"Load2L" ,"Load2D" ,"Load4F" ,"Load2F" ,"Load16B" ,
"Load8B" ,"Load4B" ,"Load8C" ,"Load4C" ,"Load2C" ,"Load8S", "Load4S","Load2S",
"LoadB" , "LoadUB", "LoadUS" ,"LoadS" ,"Load" ,
"StoreVector", "LoadVector",
"LoadRange", "LoadKlass", "LoadNKlass", "LoadL_unaligned", "LoadD_unaligned",
"LoadPLocked",
"StorePConditional", "StoreIConditional", "StoreLConditional",
......@@ -3822,6 +3830,10 @@ bool MatchRule::is_base_register(FormDict &globals) const {
strcmp(opType,"RegL")==0 ||
strcmp(opType,"RegF")==0 ||
strcmp(opType,"RegD")==0 ||
strcmp(opType,"VecS")==0 ||
strcmp(opType,"VecD")==0 ||
strcmp(opType,"VecX")==0 ||
strcmp(opType,"VecY")==0 ||
strcmp(opType,"Reg" )==0) ) {
return 1;
}
......@@ -3938,19 +3950,12 @@ int MatchRule::is_expensive() const {
strcmp(opType,"ReverseBytesL")==0 ||
strcmp(opType,"ReverseBytesUS")==0 ||
strcmp(opType,"ReverseBytesS")==0 ||
strcmp(opType,"Replicate16B")==0 ||
strcmp(opType,"Replicate8B")==0 ||
strcmp(opType,"Replicate4B")==0 ||
strcmp(opType,"Replicate8C")==0 ||
strcmp(opType,"Replicate4C")==0 ||
strcmp(opType,"Replicate8S")==0 ||
strcmp(opType,"Replicate4S")==0 ||
strcmp(opType,"Replicate4I")==0 ||
strcmp(opType,"Replicate2I")==0 ||
strcmp(opType,"Replicate2L")==0 ||
strcmp(opType,"Replicate4F")==0 ||
strcmp(opType,"Replicate2F")==0 ||
strcmp(opType,"Replicate2D")==0 ||
strcmp(opType,"ReplicateB")==0 ||
strcmp(opType,"ReplicateS")==0 ||
strcmp(opType,"ReplicateI")==0 ||
strcmp(opType,"ReplicateL")==0 ||
strcmp(opType,"ReplicateF")==0 ||
strcmp(opType,"ReplicateD")==0 ||
0 /* 0 to line up columns nicely */ )
return 1;
}
......@@ -4034,6 +4039,23 @@ Form::DataType MatchRule::is_ideal_load() const {
return ideal_load;
}
bool MatchRule::is_vector() const {
if( _rChild ) {
const char *opType = _rChild->_opType;
if( strcmp(opType,"ReplicateB")==0 ||
strcmp(opType,"ReplicateS")==0 ||
strcmp(opType,"ReplicateI")==0 ||
strcmp(opType,"ReplicateL")==0 ||
strcmp(opType,"ReplicateF")==0 ||
strcmp(opType,"ReplicateD")==0 ||
strcmp(opType,"LoadVector")==0 ||
strcmp(opType,"StoreVector")==0 ||
0 /* 0 to line up columns nicely */ )
return true;
}
return false;
}
bool MatchRule::skip_antidep_check() const {
// Some loads operate on what is effectively immutable memory so we
......
/*
* Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -160,6 +160,7 @@ public:
virtual bool is_ideal_safepoint() const; // node matches 'SafePoint'
virtual bool is_ideal_nop() const; // node matches 'Nop'
virtual bool is_ideal_control() const; // control node
virtual bool is_vector() const; // vector instruction
virtual Form::CallType is_ideal_call() const; // matches ideal 'Call'
virtual Form::DataType is_ideal_load() const; // node matches ideal 'LoadXNode'
......@@ -1011,6 +1012,7 @@ public:
bool is_ideal_goto() const; // node matches ideal 'Goto'
bool is_ideal_loopEnd() const; // node matches ideal 'LoopEnd'
bool is_ideal_bool() const; // node matches ideal 'Bool'
bool is_vector() const; // vector instruction
Form::DataType is_ideal_load() const;// node matches ideal 'LoadXNode'
// Should antidep checks be disabled for this rule
// See definition of MatchRule::skip_antidep_check
......
/*
* Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -250,6 +250,7 @@ int main(int argc, char *argv[])
AD.addInclude(AD._HPP_file, "opto/node.hpp");
AD.addInclude(AD._HPP_file, "opto/regalloc.hpp");
AD.addInclude(AD._HPP_file, "opto/subnode.hpp");
AD.addInclude(AD._HPP_file, "opto/vectornode.hpp");
AD.addInclude(AD._CPP_CLONE_file, "precompiled.hpp");
AD.addInclude(AD._CPP_CLONE_file, "adfiles", get_basename(AD._HPP_file._name));
AD.addInclude(AD._CPP_EXPAND_file, "precompiled.hpp");
......
/*
* Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -27,7 +27,7 @@
#include "code/vmreg.hpp"
// First VMReg value that could refer to a stack slot
VMReg VMRegImpl::stack0 = (VMReg)(intptr_t)((ConcreteRegisterImpl::number_of_registers + 1) & ~1);
VMReg VMRegImpl::stack0 = (VMReg)(intptr_t)((ConcreteRegisterImpl::number_of_registers + 7) & ~7);
// VMRegs are 4 bytes wide on all platforms
const int VMRegImpl::stack_slot_size = 4;
......
......@@ -81,6 +81,13 @@
product(intx, MaxLoopPad, (OptoLoopAlignment-1), \
"Align a loop if padding size in bytes is less or equal to this value") \
\
product(intx, MaxVectorSize, 32, \
"Max vector size in bytes, " \
"actual size could be less depending on elements type") \
\
product(bool, AlignVector, false, \
"Perform vector store/load alignment in loop") \
\
product(intx, NumberOfLoopInstrToAlign, 4, \
"Number of first instructions in a loop to align") \
\
......
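Both new flags are ordinary product flags and can be set on the command line in the standard -XX form, subject to the clamping in VM_Version::get_processor_features() shown earlier; for example (MyApp is a placeholder):

  java -XX:MaxVectorSize=16 -XX:+AlignVector MyApp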
......@@ -75,6 +75,7 @@ void LRG::dump( ) const {
// Flags
if( _is_oop ) tty->print("Oop ");
if( _is_float ) tty->print("Float ");
if( _is_vector ) tty->print("Vector ");
if( _was_spilled1 ) tty->print("Spilled ");
if( _was_spilled2 ) tty->print("Spilled2 ");
if( _direct_conflict ) tty->print("Direct_conflict ");
......@@ -479,16 +480,18 @@ void PhaseChaitin::Register_Allocate() {
// Move important info out of the live_arena to longer lasting storage.
alloc_node_regs(_names.Size());
for( uint i=0; i < _names.Size(); i++ ) {
if( _names[i] ) { // Live range associated with Node?
LRG &lrg = lrgs( _names[i] );
if( lrg.num_regs() == 1 ) {
_node_regs[i].set1( lrg.reg() );
for (uint i=0; i < _names.Size(); i++) {
if (_names[i]) { // Live range associated with Node?
LRG &lrg = lrgs(_names[i]);
if (!lrg.alive()) {
_node_regs[i].set_bad();
} else if (lrg.num_regs() == 1) {
_node_regs[i].set1(lrg.reg());
} else { // Must be a register-pair
if( !lrg._fat_proj ) { // Must be aligned adjacent register pair
if (!lrg._fat_proj) { // Must be aligned adjacent register pair
// Live ranges record the highest register in their mask.
// We want the low register for the AD file writer's convenience.
_node_regs[i].set2( OptoReg::add(lrg.reg(),-1) );
_node_regs[i].set2( OptoReg::add(lrg.reg(),(1-lrg.num_regs())) );
} else { // Misaligned; extract 2 bits
OptoReg::Name hi = lrg.reg(); // Get hi register
lrg.Remove(hi); // Yank from mask
......@@ -568,7 +571,7 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
// Check for float-vs-int live range (used in register-pressure
// calculations)
const Type *n_type = n->bottom_type();
if( n_type->is_floatingpoint() )
if (n_type->is_floatingpoint())
lrg._is_float = 1;
// Check for twice prior spilling. Once prior spilling might have
......@@ -599,18 +602,28 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
// Limit result register mask to acceptable registers
const RegMask &rm = n->out_RegMask();
lrg.AND( rm );
int ireg = n->ideal_reg();
assert( !n->bottom_type()->isa_oop_ptr() || ireg == Op_RegP,
"oops must be in Op_RegP's" );
// Check for vector live range (only if vector register is used).
// On SPARC vector uses RegD which could be misaligned so it is not
// processed as a vector in RA.
if (RegMask::is_vector(ireg))
lrg._is_vector = 1;
assert(n_type->isa_vect() == NULL || lrg._is_vector || ireg == Op_RegD,
"vector must be in vector registers");
// Check for bound register masks
const RegMask &lrgmask = lrg.mask();
if( lrgmask.is_bound1() || lrgmask.is_bound2() )
if (lrgmask.is_bound(ireg))
lrg._is_bound = 1;
// Check for maximum frequency value
if( lrg._maxfreq < b->_freq )
if (lrg._maxfreq < b->_freq)
lrg._maxfreq = b->_freq;
int ireg = n->ideal_reg();
assert( !n->bottom_type()->isa_oop_ptr() || ireg == Op_RegP,
"oops must be in Op_RegP's" );
// Check for oop-iness, or long/double
// Check for multi-kill projection
switch( ireg ) {
......@@ -689,7 +702,7 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
// AND changes how we count interferences. A mis-aligned
// double can interfere with TWO aligned pairs, or effectively
// FOUR registers!
if( rm.is_misaligned_Pair() ) {
if (rm.is_misaligned_pair()) {
lrg._fat_proj = 1;
lrg._is_bound = 1;
}
......@@ -706,6 +719,33 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
lrg.set_reg_pressure(1);
#endif
break;
case Op_VecS:
assert(Matcher::vector_size_supported(T_BYTE,4), "sanity");
assert(RegMask::num_registers(Op_VecS) == RegMask::SlotsPerVecS, "sanity");
lrg.set_num_regs(RegMask::SlotsPerVecS);
lrg.set_reg_pressure(1);
break;
case Op_VecD:
assert(Matcher::vector_size_supported(T_FLOAT,RegMask::SlotsPerVecD), "sanity");
assert(RegMask::num_registers(Op_VecD) == RegMask::SlotsPerVecD, "sanity");
assert(lrgmask.is_aligned_sets(RegMask::SlotsPerVecD), "vector should be aligned");
lrg.set_num_regs(RegMask::SlotsPerVecD);
lrg.set_reg_pressure(1);
break;
case Op_VecX:
assert(Matcher::vector_size_supported(T_FLOAT,RegMask::SlotsPerVecX), "sanity");
assert(RegMask::num_registers(Op_VecX) == RegMask::SlotsPerVecX, "sanity");
assert(lrgmask.is_aligned_sets(RegMask::SlotsPerVecX), "vector should be aligned");
lrg.set_num_regs(RegMask::SlotsPerVecX);
lrg.set_reg_pressure(1);
break;
case Op_VecY:
assert(Matcher::vector_size_supported(T_FLOAT,RegMask::SlotsPerVecY), "sanity");
assert(RegMask::num_registers(Op_VecY) == RegMask::SlotsPerVecY, "sanity");
assert(lrgmask.is_aligned_sets(RegMask::SlotsPerVecY), "vector should be aligned");
lrg.set_num_regs(RegMask::SlotsPerVecY);
lrg.set_reg_pressure(1);
break;
default:
ShouldNotReachHere();
}
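For context: the SlotsPerVec* constants referenced here come from regmask.hpp (not shown in this view) and count 32-bit mask slots per vector, so SlotsPerVecS == 1, SlotsPerVecD == 2, SlotsPerVecX == 4, and SlotsPerVecY == 8; note that every vector live range is given a register pressure of 1 regardless of width.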
......@@ -763,24 +803,38 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
} else {
lrg.AND( rm );
}
// Check for bound register masks
const RegMask &lrgmask = lrg.mask();
if( lrgmask.is_bound1() || lrgmask.is_bound2() )
int kreg = n->in(k)->ideal_reg();
bool is_vect = RegMask::is_vector(kreg);
assert(n->in(k)->bottom_type()->isa_vect() == NULL ||
is_vect || kreg == Op_RegD,
"vector must be in vector registers");
if (lrgmask.is_bound(kreg))
lrg._is_bound = 1;
// If this use of a double forces a mis-aligned double,
// flag as '_fat_proj' - really flag as allowing misalignment
// AND changes how we count interferences. A mis-aligned
// double can interfere with TWO aligned pairs, or effectively
// FOUR registers!
if( lrg.num_regs() == 2 && !lrg._fat_proj && rm.is_misaligned_Pair() ) {
#ifdef ASSERT
if (is_vect) {
assert(lrgmask.is_aligned_sets(lrg.num_regs()), "vector should be aligned");
assert(!lrg._fat_proj, "sanity");
assert(RegMask::num_registers(kreg) == lrg.num_regs(), "sanity");
}
#endif
if (!is_vect && lrg.num_regs() == 2 && !lrg._fat_proj && rm.is_misaligned_pair()) {
lrg._fat_proj = 1;
lrg._is_bound = 1;
}
// if the LRG is an unaligned pair, we will have to spill
// so clear the LRG's register mask if it is not already spilled
if ( !n->is_SpillCopy() &&
(lrg._def == NULL || lrg.is_multidef() || !lrg._def->is_SpillCopy()) &&
lrgmask.is_misaligned_Pair()) {
if (!is_vect && !n->is_SpillCopy() &&
(lrg._def == NULL || lrg.is_multidef() || !lrg._def->is_SpillCopy()) &&
lrgmask.is_misaligned_pair()) {
lrg.Clear();
}
......@@ -793,12 +847,14 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
} // end for all blocks
// Final per-liverange setup
for( uint i2=0; i2<_maxlrg; i2++ ) {
for (uint i2=0; i2<_maxlrg; i2++) {
LRG &lrg = lrgs(i2);
if( lrg.num_regs() == 2 && !lrg._fat_proj )
lrg.ClearToPairs();
assert(!lrg._is_vector || !lrg._fat_proj, "sanity");
if (lrg.num_regs() > 1 && !lrg._fat_proj) {
lrg.clear_to_sets();
}
lrg.compute_set_mask_size();
if( lrg.not_free() ) { // Handle case where we lose from the start
if (lrg.not_free()) { // Handle case where we lose from the start
lrg.set_reg(OptoReg::Name(LRG::SPILL_REG));
lrg._direct_conflict = 1;
}
......@@ -1104,22 +1160,17 @@ OptoReg::Name PhaseChaitin::bias_color( LRG &lrg, int chunk ) {
// Choose a color which is legal for him
RegMask tempmask = lrg.mask();
tempmask.AND(lrgs(copy_lrg).mask());
OptoReg::Name reg;
if( lrg.num_regs() == 1 ) {
reg = tempmask.find_first_elem();
} else {
tempmask.ClearToPairs();
reg = tempmask.find_first_pair();
}
if( OptoReg::is_valid(reg) )
tempmask.clear_to_sets(lrg.num_regs());
OptoReg::Name reg = tempmask.find_first_set(lrg.num_regs());
if (OptoReg::is_valid(reg))
return reg;
}
}
// If no bias info exists, just go with the register selection ordering
if( lrg.num_regs() == 2 ) {
// Find an aligned pair
return OptoReg::add(lrg.mask().find_first_pair(),chunk);
if (lrg._is_vector || lrg.num_regs() == 2) {
// Find an aligned set
return OptoReg::add(lrg.mask().find_first_set(lrg.num_regs()),chunk);
}
// CNC - Fun hack. Alternate 1st and 2nd selection. Enables post-allocate
......@@ -1149,6 +1200,7 @@ OptoReg::Name PhaseChaitin::choose_color( LRG &lrg, int chunk ) {
// Use a heuristic to "bias" the color choice
return bias_color(lrg, chunk);
assert(!lrg._is_vector, "should be not vector here" );
assert( lrg.num_regs() >= 2, "dead live ranges do not color" );
// Fat-proj case or misaligned double argument.
......@@ -1238,14 +1290,16 @@ uint PhaseChaitin::Select( ) {
}
//assert(is_allstack == lrg->mask().is_AllStack(), "nbrs must not change AllStackedness");
// Aligned pairs need aligned masks
if( lrg->num_regs() == 2 && !lrg->_fat_proj )
lrg->ClearToPairs();
assert(!lrg->_is_vector || !lrg->_fat_proj, "sanity");
if (lrg->num_regs() > 1 && !lrg->_fat_proj) {
lrg->clear_to_sets();
}
// Check if a color is available and if so pick the color
OptoReg::Name reg = choose_color( *lrg, chunk );
#ifdef SPARC
debug_only(lrg->compute_set_mask_size());
assert(lrg->num_regs() != 2 || lrg->is_bound() || is_even(reg-1), "allocate all doubles aligned");
assert(lrg->num_regs() < 2 || lrg->is_bound() || is_even(reg-1), "allocate all doubles aligned");
#endif
//---------------
......@@ -1277,17 +1331,16 @@ uint PhaseChaitin::Select( ) {
// If the live range is not bound, then we actually had some choices
// to make. In this case, the mask has more bits in it than the colors
// chosen. Restrict the mask to just what was picked.
if( lrg->num_regs() == 1 ) { // Size 1 live range
lrg->Clear(); // Clear the mask
lrg->Insert(reg); // Set regmask to match selected reg
lrg->set_mask_size(1);
} else if( !lrg->_fat_proj ) {
// For pairs, also insert the low bit of the pair
assert( lrg->num_regs() == 2, "unbound fatproj???" );
int n_regs = lrg->num_regs();
assert(!lrg->_is_vector || !lrg->_fat_proj, "sanity");
if (n_regs == 1 || !lrg->_fat_proj) {
assert(!lrg->_is_vector || n_regs <= RegMask::SlotsPerVecY, "sanity");
lrg->Clear(); // Clear the mask
lrg->Insert(reg); // Set regmask to match selected reg
lrg->Insert(OptoReg::add(reg,-1));
lrg->set_mask_size(2);
// For vectors and pairs, also insert the low bit of the pair
for (int i = 1; i < n_regs; i++)
lrg->Insert(OptoReg::add(reg,-i));
lrg->set_mask_size(n_regs);
} else { // Else fatproj
// mask must be equal to fatproj bits, by definition
}
......@@ -1860,12 +1913,20 @@ char *PhaseChaitin::dump_register( const Node *n, char *buf ) const {
sprintf(buf,"L%d",lidx); // No register binding yet
} else if( !lidx ) { // Special, not allocated value
strcpy(buf,"Special");
} else if( (lrgs(lidx).num_regs() == 1)
? !lrgs(lidx).mask().is_bound1()
: !lrgs(lidx).mask().is_bound2() ) {
sprintf(buf,"L%d",lidx); // No register binding yet
} else { // Hah! We have a bound machine register
print_reg( lrgs(lidx).reg(), this, buf );
} else {
if (lrgs(lidx)._is_vector) {
if (lrgs(lidx).mask().is_bound_set(lrgs(lidx).num_regs()))
print_reg( lrgs(lidx).reg(), this, buf ); // a bound machine register
else
sprintf(buf,"L%d",lidx); // No register binding yet
} else if( (lrgs(lidx).num_regs() == 1)
? lrgs(lidx).mask().is_bound1()
: lrgs(lidx).mask().is_bound_pair() ) {
// Hah! We have a bound machine register
print_reg( lrgs(lidx).reg(), this, buf );
} else {
sprintf(buf,"L%d",lidx); // No register binding yet
}
}
}
return buf+strlen(buf);
......
/*
* Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -99,8 +99,15 @@ public:
void set_mask_size( int size ) {
assert((size == 65535) || (size == (int)_mask.Size()), "");
_mask_size = size;
debug_only(_msize_valid=1;)
debug_only( if( _num_regs == 2 && !_fat_proj ) _mask.VerifyPairs(); )
#ifdef ASSERT
_msize_valid=1;
if (_is_vector) {
assert(!_fat_proj, "sanity");
_mask.verify_sets(_num_regs);
} else if (_num_regs == 2 && !_fat_proj) {
_mask.verify_pairs();
}
#endif
}
void compute_set_mask_size() { set_mask_size(compute_mask_size()); }
int mask_size() const { assert( _msize_valid, "mask size not valid" );
......@@ -116,7 +123,8 @@ public:
void Set_All() { _mask.Set_All(); debug_only(_msize_valid=1); _mask_size = RegMask::CHUNK_SIZE; }
void Insert( OptoReg::Name reg ) { _mask.Insert(reg); debug_only(_msize_valid=0;) }
void Remove( OptoReg::Name reg ) { _mask.Remove(reg); debug_only(_msize_valid=0;) }
void ClearToPairs() { _mask.ClearToPairs(); debug_only(_msize_valid=0;) }
void clear_to_pairs() { _mask.clear_to_pairs(); debug_only(_msize_valid=0;) }
void clear_to_sets() { _mask.clear_to_sets(_num_regs); debug_only(_msize_valid=0;) }
// Number of registers this live range uses when it colors
private:
......@@ -150,6 +158,7 @@ public:
uint _is_oop:1, // Live-range holds an oop
_is_float:1, // True if in float registers
_is_vector:1, // True if in vector registers
_was_spilled1:1, // True if prior spilling on def
_was_spilled2:1, // True if twice prior spilling on def
_is_bound:1, // live range starts life with no
......
/*
* Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -245,14 +245,12 @@ macro(XorI)
macro(XorL)
macro(Vector)
macro(AddVB)
macro(AddVC)
macro(AddVS)
macro(AddVI)
macro(AddVL)
macro(AddVF)
macro(AddVD)
macro(SubVB)
macro(SubVC)
macro(SubVS)
macro(SubVI)
macro(SubVL)
......@@ -263,74 +261,36 @@ macro(MulVD)
macro(DivVF)
macro(DivVD)
macro(LShiftVB)
macro(LShiftVC)
macro(LShiftVS)
macro(LShiftVI)
macro(URShiftVB)
macro(URShiftVC)
macro(URShiftVS)
macro(URShiftVI)
macro(RShiftVB)
macro(RShiftVS)
macro(RShiftVI)
macro(AndV)
macro(OrV)
macro(XorV)
macro(VectorLoad)
macro(Load16B)
macro(Load8B)
macro(Load4B)
macro(Load8C)
macro(Load4C)
macro(Load2C)
macro(Load8S)
macro(Load4S)
macro(Load2S)
macro(Load4I)
macro(Load2I)
macro(Load2L)
macro(Load4F)
macro(Load2F)
macro(Load2D)
macro(VectorStore)
macro(Store16B)
macro(Store8B)
macro(Store4B)
macro(Store8C)
macro(Store4C)
macro(Store2C)
macro(Store4I)
macro(Store2I)
macro(Store2L)
macro(Store4F)
macro(Store2F)
macro(Store2D)
macro(LoadVector)
macro(StoreVector)
macro(Pack)
macro(PackB)
macro(PackS)
macro(PackC)
macro(PackI)
macro(PackL)
macro(PackF)
macro(PackD)
macro(Pack2x1B)
macro(Pack2x2B)
macro(Replicate16B)
macro(Replicate8B)
macro(Replicate4B)
macro(Replicate8S)
macro(Replicate4S)
macro(Replicate2S)
macro(Replicate8C)
macro(Replicate4C)
macro(Replicate2C)
macro(Replicate4I)
macro(Replicate2I)
macro(Replicate2L)
macro(Replicate4F)
macro(Replicate2F)
macro(Replicate2D)
macro(Pack2L)
macro(Pack2D)
macro(ReplicateB)
macro(ReplicateS)
macro(ReplicateI)
macro(ReplicateL)
macro(ReplicateF)
macro(ReplicateD)
macro(Extract)
macro(ExtractB)
macro(ExtractS)
macro(ExtractUB)
macro(ExtractC)
macro(ExtractS)
macro(ExtractI)
macro(ExtractL)
macro(ExtractF)
......
......@@ -2591,38 +2591,12 @@ static void final_graph_reshaping_impl( Node *n, Final_Reshape_Counts &frc ) {
}
break;
case Op_Load16B:
case Op_Load8B:
case Op_Load4B:
case Op_Load8S:
case Op_Load4S:
case Op_Load2S:
case Op_Load8C:
case Op_Load4C:
case Op_Load2C:
case Op_Load4I:
case Op_Load2I:
case Op_Load2L:
case Op_Load4F:
case Op_Load2F:
case Op_Load2D:
case Op_Store16B:
case Op_Store8B:
case Op_Store4B:
case Op_Store8C:
case Op_Store4C:
case Op_Store2C:
case Op_Store4I:
case Op_Store2I:
case Op_Store2L:
case Op_Store4F:
case Op_Store2F:
case Op_Store2D:
case Op_LoadVector:
case Op_StoreVector:
break;
case Op_PackB:
case Op_PackS:
case Op_PackC:
case Op_PackI:
case Op_PackF:
case Op_PackL:
......
/*
* Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -416,6 +416,7 @@ uint PhaseChaitin::count_int_pressure( IndexSet *liveout ) {
if( lrgs(lidx).mask().is_UP() &&
lrgs(lidx).mask_size() &&
!lrgs(lidx)._is_float &&
!lrgs(lidx)._is_vector &&
lrgs(lidx).mask().overlap(*Matcher::idealreg2regmask[Op_RegI]) )
cnt += lrgs(lidx).reg_pressure();
}
......@@ -430,7 +431,7 @@ uint PhaseChaitin::count_float_pressure( IndexSet *liveout ) {
while ((lidx = elements.next()) != 0) {
if( lrgs(lidx).mask().is_UP() &&
lrgs(lidx).mask_size() &&
lrgs(lidx)._is_float )
(lrgs(lidx)._is_float || lrgs(lidx)._is_vector))
cnt += lrgs(lidx).reg_pressure();
}
return cnt;
......@@ -439,8 +440,8 @@ uint PhaseChaitin::count_float_pressure( IndexSet *liveout ) {
//------------------------------lower_pressure---------------------------------
// Adjust register pressure down by 1. Capture last hi-to-low transition,
static void lower_pressure( LRG *lrg, uint where, Block *b, uint *pressure, uint *hrp_index ) {
if( lrg->mask().is_UP() && lrg->mask_size() ) {
if( lrg->_is_float ) {
if (lrg->mask().is_UP() && lrg->mask_size()) {
if (lrg->_is_float || lrg->_is_vector) {
pressure[1] -= lrg->reg_pressure();
if( pressure[1] == (uint)FLOATPRESSURE ) {
hrp_index[1] = where;
......@@ -522,8 +523,8 @@ uint PhaseChaitin::build_ifg_physical( ResourceArea *a ) {
LRG &lrg = lrgs(lidx);
lrg._area += cost;
// Compute initial register pressure
if( lrg.mask().is_UP() && lrg.mask_size() ) {
if( lrg._is_float ) { // Count float pressure
if (lrg.mask().is_UP() && lrg.mask_size()) {
if (lrg._is_float || lrg._is_vector) { // Count float pressure
pressure[1] += lrg.reg_pressure();
#ifdef EXACT_PRESSURE
if( pressure[1] > b->_freg_pressure )
......@@ -681,13 +682,10 @@ uint PhaseChaitin::build_ifg_physical( ResourceArea *a ) {
// according to its bindings.
const RegMask &rmask = lrgs(r).mask();
if( lrgs(r).is_bound() && !(n->rematerialize()) && rmask.is_NotEmpty() ) {
// Smear odd bits; leave only aligned pairs of bits.
RegMask r2mask = rmask;
r2mask.SmearToPairs();
// Check for common case
int r_size = lrgs(r).num_regs();
OptoReg::Name r_reg = (r_size == 1) ? rmask.find_first_elem() : OptoReg::Physical;
// Smear odd bits
IndexSetIterator elements(&liveout);
uint l;
while ((l = elements.next()) != 0) {
......@@ -701,10 +699,15 @@ uint PhaseChaitin::build_ifg_physical( ResourceArea *a ) {
// Remove the bits from LRG 'r' from LRG 'l' so 'l' no
// longer interferes with 'r'. If 'l' requires aligned
// adjacent pairs, subtract out bit pairs.
if( lrg.num_regs() == 2 && !lrg._fat_proj ) {
assert(!lrg._is_vector || !lrg._fat_proj, "sanity");
if (lrg.num_regs() > 1 && !lrg._fat_proj) {
RegMask r2mask = rmask;
// Leave only aligned set of bits.
r2mask.smear_to_sets(lrg.num_regs());
// It includes vector case.
lrg.SUBTRACT( r2mask );
lrg.compute_set_mask_size();
} else if( r_size != 1 ) {
} else if( r_size != 1 ) { // fat proj
lrg.SUBTRACT( rmask );
lrg.compute_set_mask_size();
} else { // Common case: size 1 bound removal
......@@ -763,8 +766,8 @@ uint PhaseChaitin::build_ifg_physical( ResourceArea *a ) {
// Newly live things assumed live from here to top of block
lrg._area += cost;
// Adjust register pressure
if( lrg.mask().is_UP() && lrg.mask_size() ) {
if( lrg._is_float ) {
if (lrg.mask().is_UP() && lrg.mask_size()) {
if (lrg._is_float || lrg._is_vector) {
pressure[1] += lrg.reg_pressure();
#ifdef EXACT_PRESSURE
if( pressure[1] > b->_freg_pressure )
......
/*
* Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -139,6 +139,7 @@ void Block::implicit_null_check(PhaseCFG *cfg, Node *proj, Node *val, int allowe
int iop = mach->ideal_Opcode();
switch( iop ) {
case Op_LoadB:
case Op_LoadUB:
case Op_LoadUS:
case Op_LoadD:
case Op_LoadF:
......@@ -445,6 +446,11 @@ Node *Block::select(PhaseCFG *cfg, Node_List &worklist, GrowableArray<int> &read
if( e->is_MachNullCheck() && e->in(1) == n )
continue;
// Schedule IV increment last.
if (e->is_Mach() && e->as_Mach()->ideal_Opcode() == Op_CountedLoopEnd &&
e->in(1)->in(1) == n && n->is_iteratively_computed())
continue;
uint n_choice = 2;
// See if this instruction is consumed by a branch. If so, then (as the
......
......@@ -2751,7 +2751,8 @@ int PhaseIdealLoop::build_loop_tree_impl( Node *n, int pre_order ) {
// Do not count uncommon calls
if( !n->is_CallStaticJava() || !n->as_CallStaticJava()->_name ) {
Node *iff = n->in(0)->in(0);
if( !iff->is_If() ||
// Do not count calls in vectorized loops.
if( UseSuperWord || !iff->is_If() ||
(n->in(0)->Opcode() == Op_IfFalse &&
(1.0 - iff->as_If()->_prob) >= 0.01) ||
(iff->as_If()->_prob >= 0.01) )
......@@ -3216,7 +3217,8 @@ void PhaseIdealLoop::build_loop_late_post( Node *n ) {
case Op_ModF:
case Op_ModD:
case Op_LoadB: // Same with Loads; they can sink
case Op_LoadUS: // during loop optimizations.
case Op_LoadUB: // during loop optimizations.
case Op_LoadUS:
case Op_LoadD:
case Op_LoadF:
case Op_LoadI:
......
/*
* Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -439,9 +439,9 @@ bool MachNode::rematerialize() const {
// Don't remateralize somebody with bound inputs - it stretches a
// fixed register lifetime.
uint idx = oper_input_base();
if( req() > idx ) {
if (req() > idx) {
const RegMask &rm = in_RegMask(idx);
if( rm.is_bound1() || rm.is_bound2() )
if (rm.is_bound(ideal_reg()))
return false;
}
......
/*
* Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -319,6 +319,7 @@ public:
class MachTypeNode : public MachNode {
virtual uint size_of() const { return sizeof(*this); } // Size is bigger
public:
MachTypeNode( ) {}
const Type *_bottom_type;
virtual const class Type *bottom_type() const { return _bottom_type; }
......@@ -370,12 +371,12 @@ public:
//------------------------------MachConstantNode-------------------------------
// Machine node that holds a constant which is stored in the constant table.
class MachConstantNode : public MachNode {
class MachConstantNode : public MachTypeNode {
protected:
Compile::Constant _constant; // This node's constant.
public:
MachConstantNode() : MachNode() {
MachConstantNode() : MachTypeNode() {
init_class_id(Class_MachConstant);
}
......
/*
* Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -35,6 +35,7 @@
#include "opto/rootnode.hpp"
#include "opto/runtime.hpp"
#include "opto/type.hpp"
#include "opto/vectornode.hpp"
#include "runtime/atomic.hpp"
#include "runtime/os.hpp"
#ifdef TARGET_ARCH_MODEL_x86_32
......@@ -58,18 +59,6 @@
OptoReg::Name OptoReg::c_frame_pointer;
const int Matcher::base2reg[Type::lastype] = {
Node::NotAMachineReg,0,0, Op_RegI, Op_RegL, 0, Op_RegN,
Node::NotAMachineReg, Node::NotAMachineReg, /* tuple, array */
Op_RegP, Op_RegP, Op_RegP, Op_RegP, Op_RegP, Op_RegP, /* the pointers */
0, 0/*abio*/,
Op_RegP /* Return address */, 0, /* the memories */
Op_RegF, Op_RegF, Op_RegF, Op_RegD, Op_RegD, Op_RegD,
0 /*bottom*/
};
const RegMask *Matcher::idealreg2regmask[_last_machine_leaf];
RegMask Matcher::mreg2regmask[_last_Mach_Reg];
RegMask Matcher::STACK_ONLY_mask;
......@@ -107,6 +96,10 @@ Matcher::Matcher( Node_List &proj_list ) :
idealreg2spillmask [Op_RegF] = NULL;
idealreg2spillmask [Op_RegD] = NULL;
idealreg2spillmask [Op_RegP] = NULL;
idealreg2spillmask [Op_VecS] = NULL;
idealreg2spillmask [Op_VecD] = NULL;
idealreg2spillmask [Op_VecX] = NULL;
idealreg2spillmask [Op_VecY] = NULL;
idealreg2debugmask [Op_RegI] = NULL;
idealreg2debugmask [Op_RegN] = NULL;
......@@ -114,6 +107,10 @@ Matcher::Matcher( Node_List &proj_list ) :
idealreg2debugmask [Op_RegF] = NULL;
idealreg2debugmask [Op_RegD] = NULL;
idealreg2debugmask [Op_RegP] = NULL;
idealreg2debugmask [Op_VecS] = NULL;
idealreg2debugmask [Op_VecD] = NULL;
idealreg2debugmask [Op_VecX] = NULL;
idealreg2debugmask [Op_VecY] = NULL;
idealreg2mhdebugmask[Op_RegI] = NULL;
idealreg2mhdebugmask[Op_RegN] = NULL;
......@@ -121,6 +118,10 @@ Matcher::Matcher( Node_List &proj_list ) :
idealreg2mhdebugmask[Op_RegF] = NULL;
idealreg2mhdebugmask[Op_RegD] = NULL;
idealreg2mhdebugmask[Op_RegP] = NULL;
idealreg2mhdebugmask[Op_VecS] = NULL;
idealreg2mhdebugmask[Op_VecD] = NULL;
idealreg2mhdebugmask[Op_VecX] = NULL;
idealreg2mhdebugmask[Op_VecY] = NULL;
debug_only(_mem_node = NULL;) // Ideal memory node consumed by mach node
}
......@@ -134,7 +135,7 @@ OptoReg::Name Matcher::warp_incoming_stk_arg( VMReg reg ) {
warped = OptoReg::add(warped, C->out_preserve_stack_slots());
if( warped >= _in_arg_limit )
_in_arg_limit = OptoReg::add(warped, 1); // Bump max stack slot seen
if (!RegMask::can_represent(warped)) {
if (!RegMask::can_represent_arg(warped)) {
// the compiler cannot represent this method's calling sequence
C->record_method_not_compilable_all_tiers("unsupported incoming calling sequence");
return OptoReg::Bad;
......@@ -302,7 +303,7 @@ void Matcher::match( ) {
_out_arg_limit = OptoReg::add(_new_SP, C->out_preserve_stack_slots());
assert( is_even(_out_arg_limit), "out_preserve must be even" );
if (!RegMask::can_represent(OptoReg::add(_out_arg_limit,-1))) {
if (!RegMask::can_represent_arg(OptoReg::add(_out_arg_limit,-1))) {
// the compiler cannot represent this method's calling sequence
C->record_method_not_compilable("must be able to represent all call arguments in reg mask");
}
......@@ -428,7 +429,7 @@ static RegMask *init_input_masks( uint size, RegMask &ret_adr, RegMask &fp ) {
void Matcher::init_first_stack_mask() {
// Allocate storage for spill masks as masks for the appropriate load type.
RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * 3*6);
RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * (3*6+4));
idealreg2spillmask [Op_RegN] = &rms[0];
idealreg2spillmask [Op_RegI] = &rms[1];
......@@ -451,6 +452,11 @@ void Matcher::init_first_stack_mask() {
idealreg2mhdebugmask[Op_RegD] = &rms[16];
idealreg2mhdebugmask[Op_RegP] = &rms[17];
idealreg2spillmask [Op_VecS] = &rms[18];
idealreg2spillmask [Op_VecD] = &rms[19];
idealreg2spillmask [Op_VecX] = &rms[20];
idealreg2spillmask [Op_VecY] = &rms[21];
OptoReg::Name i;
// At first, start with the empty mask
......@@ -462,7 +468,7 @@ void Matcher::init_first_stack_mask() {
C->FIRST_STACK_mask().Insert(i);
// Add in all bits past the outgoing argument area
guarantee(RegMask::can_represent(OptoReg::add(_out_arg_limit,-1)),
guarantee(RegMask::can_represent_arg(OptoReg::add(_out_arg_limit,-1)),
"must be able to represent all call arguments in reg mask");
init = _out_arg_limit;
for (i = init; RegMask::can_represent(i); i = OptoReg::add(i,1))
......@@ -472,21 +478,48 @@ void Matcher::init_first_stack_mask() {
C->FIRST_STACK_mask().set_AllStack();
// Make spill masks. Registers for their class, plus FIRST_STACK_mask.
RegMask aligned_stack_mask = C->FIRST_STACK_mask();
// Keep spill masks aligned.
aligned_stack_mask.clear_to_pairs();
assert(aligned_stack_mask.is_AllStack(), "should be infinite stack");
*idealreg2spillmask[Op_RegP] = *idealreg2regmask[Op_RegP];
#ifdef _LP64
*idealreg2spillmask[Op_RegN] = *idealreg2regmask[Op_RegN];
idealreg2spillmask[Op_RegN]->OR(C->FIRST_STACK_mask());
idealreg2spillmask[Op_RegP]->OR(aligned_stack_mask);
#else
idealreg2spillmask[Op_RegP]->OR(C->FIRST_STACK_mask());
#endif
*idealreg2spillmask[Op_RegI] = *idealreg2regmask[Op_RegI];
idealreg2spillmask[Op_RegI]->OR(C->FIRST_STACK_mask());
*idealreg2spillmask[Op_RegL] = *idealreg2regmask[Op_RegL];
idealreg2spillmask[Op_RegL]->OR(C->FIRST_STACK_mask());
idealreg2spillmask[Op_RegL]->OR(aligned_stack_mask);
*idealreg2spillmask[Op_RegF] = *idealreg2regmask[Op_RegF];
idealreg2spillmask[Op_RegF]->OR(C->FIRST_STACK_mask());
*idealreg2spillmask[Op_RegD] = *idealreg2regmask[Op_RegD];
idealreg2spillmask[Op_RegD]->OR(C->FIRST_STACK_mask());
*idealreg2spillmask[Op_RegP] = *idealreg2regmask[Op_RegP];
idealreg2spillmask[Op_RegP]->OR(C->FIRST_STACK_mask());
idealreg2spillmask[Op_RegD]->OR(aligned_stack_mask);
if (Matcher::vector_size_supported(T_BYTE,4)) {
*idealreg2spillmask[Op_VecS] = *idealreg2regmask[Op_VecS];
idealreg2spillmask[Op_VecS]->OR(C->FIRST_STACK_mask());
}
if (Matcher::vector_size_supported(T_FLOAT,2)) {
*idealreg2spillmask[Op_VecD] = *idealreg2regmask[Op_VecD];
idealreg2spillmask[Op_VecD]->OR(aligned_stack_mask);
}
if (Matcher::vector_size_supported(T_FLOAT,4)) {
aligned_stack_mask.clear_to_sets(RegMask::SlotsPerVecX);
assert(aligned_stack_mask.is_AllStack(), "should be infinite stack");
*idealreg2spillmask[Op_VecX] = *idealreg2regmask[Op_VecX];
idealreg2spillmask[Op_VecX]->OR(aligned_stack_mask);
}
if (Matcher::vector_size_supported(T_FLOAT,8)) {
aligned_stack_mask.clear_to_sets(RegMask::SlotsPerVecY);
assert(aligned_stack_mask.is_AllStack(), "should be infinite stack");
*idealreg2spillmask[Op_VecY] = *idealreg2regmask[Op_VecY];
idealreg2spillmask[Op_VecY]->OR(aligned_stack_mask);
}
if (UseFPUForSpilling) {
// This mask logic assumes that the spill operations are
// symmetric and that the registers involved are the same size.
......@@ -807,6 +840,25 @@ void Matcher::init_spill_mask( Node *ret ) {
idealreg2regmask[Op_RegF] = &spillF->out_RegMask();
idealreg2regmask[Op_RegD] = &spillD->out_RegMask();
idealreg2regmask[Op_RegP] = &spillP->out_RegMask();
// Vector regmasks.
if (Matcher::vector_size_supported(T_BYTE,4)) {
TypeVect::VECTS = TypeVect::make(T_BYTE, 4);
MachNode *spillVectS = match_tree(new (C, 3) LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTS));
idealreg2regmask[Op_VecS] = &spillVectS->out_RegMask();
}
if (Matcher::vector_size_supported(T_FLOAT,2)) {
MachNode *spillVectD = match_tree(new (C, 3) LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTD));
idealreg2regmask[Op_VecD] = &spillVectD->out_RegMask();
}
if (Matcher::vector_size_supported(T_FLOAT,4)) {
MachNode *spillVectX = match_tree(new (C, 3) LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTX));
idealreg2regmask[Op_VecX] = &spillVectX->out_RegMask();
}
if (Matcher::vector_size_supported(T_FLOAT,8)) {
MachNode *spillVectY = match_tree(new (C, 3) LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTY));
idealreg2regmask[Op_VecY] = &spillVectY->out_RegMask();
}
}
#ifdef ASSERT
......@@ -1063,7 +1115,7 @@ OptoReg::Name Matcher::warp_outgoing_stk_arg( VMReg reg, OptoReg::Name begin_out
// that is killed by the call.
if( warped >= out_arg_limit_per_call )
out_arg_limit_per_call = OptoReg::add(warped,1);
if (!RegMask::can_represent(warped)) {
if (!RegMask::can_represent_arg(warped)) {
C->record_method_not_compilable_all_tiers("unsupported calling sequence");
return OptoReg::Bad;
}
......@@ -1251,7 +1303,7 @@ MachNode *Matcher::match_sfpt( SafePointNode *sfpt ) {
// this killed area.
uint r_cnt = mcall->tf()->range()->cnt();
MachProjNode *proj = new (C, 1) MachProjNode( mcall, r_cnt+10000, RegMask::Empty, MachProjNode::fat_proj );
if (!RegMask::can_represent(OptoReg::Name(out_arg_limit_per_call-1))) {
if (!RegMask::can_represent_arg(OptoReg::Name(out_arg_limit_per_call-1))) {
C->record_method_not_compilable_all_tiers("unsupported outgoing calling sequence");
} else {
for (int i = begin_out_arg_area; i < out_arg_limit_per_call; i++)
......
/*
* Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -250,10 +250,21 @@ public:
static const bool convL2FSupported(void);
// Vector width in bytes
static const uint vector_width_in_bytes(void);
static const int vector_width_in_bytes(BasicType bt);
// Limits on vector size (number of elements).
static const int max_vector_size(const BasicType bt);
static const int min_vector_size(const BasicType bt);
static const bool vector_size_supported(const BasicType bt, int size) {
return (Matcher::max_vector_size(bt) >= size &&
Matcher::min_vector_size(bt) <= size);
}
// Vector ideal reg
static const uint vector_ideal_reg(void);
static const int vector_ideal_reg(int len);
// CPU supports misaligned vectors store/load.
static const bool misaligned_vectors_ok();
// Used to determine a "low complexity" 64-bit constant. (Zero is simple.)
// The standard of comparison is one (StoreL ConL) vs. two (StoreI ConI).
......
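A hedged sketch of how a platform might answer these queries (the real x86 definitions live in the AD files, which are collapsed in this view; the bodies below are assumptions for illustration only):

const int Matcher::vector_width_in_bytes(BasicType bt) {
  // Assumed policy: track MaxVectorSize, capped by the instruction set.
  return MIN2((int)MaxVectorSize, UseAVX > 0 ? 32 : 16);
}
const int Matcher::vector_ideal_reg(int size) {
  switch (size) {           // size in bytes
    case  4: return Op_VecS;
    case  8: return Op_VecD;
    case 16: return Op_VecX;
    case 32: return Op_VecY;
  }
  ShouldNotReachHere();
  return 0;
}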
/*
* Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -1543,6 +1543,7 @@ const Type *LoadNode::Value( PhaseTransform *phase ) const {
// had an original form like p1:(AddP x x (LShiftL quux 3)), where the
// expression (LShiftL quux 3) independently optimized to the constant 8.
if ((t->isa_int() == NULL) && (t->isa_long() == NULL)
&& (_type->isa_vect() == NULL)
&& Opcode() != Op_LoadKlass && Opcode() != Op_LoadNKlass) {
// t might actually be lower than _type, if _type is a unique
// concrete subclass of abstract class t.
......
(Diffs for 40 files are collapsed.)