From f375da7cf526e5598b85592ad6a4a85bb73fa1ae Mon Sep 17 00:00:00 2001 From: kvn Date: Mon, 20 Aug 2012 09:07:21 -0700 Subject: [PATCH] 6340864: Implement vectorization optimizations in hotspot-server Summary: Added asm encoding and mach nodes for vector arithmetic instructions on x86. Reviewed-by: roland --- src/cpu/x86/vm/assembler_x86.cpp | 1175 ++++++---- src/cpu/x86/vm/assembler_x86.hpp | 207 +- src/cpu/x86/vm/x86.ad | 2443 ++++++++++++++++++++- src/cpu/x86/vm/x86_32.ad | 16 - src/cpu/x86/vm/x86_64.ad | 46 +- src/share/vm/opto/classes.hpp | 8 + src/share/vm/opto/loopnode.cpp | 2 + src/share/vm/opto/superword.cpp | 101 +- src/share/vm/opto/vectornode.cpp | 64 + src/share/vm/opto/vectornode.hpp | 93 +- test/compiler/6340864/TestByteVect.java | 1274 +++++++++++ test/compiler/6340864/TestDoubleVect.java | 560 +++++ test/compiler/6340864/TestFloatVect.java | 560 +++++ test/compiler/6340864/TestIntVect.java | 1012 +++++++++ test/compiler/6340864/TestLongVect.java | 917 ++++++++ test/compiler/6340864/TestShortVect.java | 1127 ++++++++++ 16 files changed, 9053 insertions(+), 552 deletions(-) create mode 100644 test/compiler/6340864/TestByteVect.java create mode 100644 test/compiler/6340864/TestDoubleVect.java create mode 100644 test/compiler/6340864/TestFloatVect.java create mode 100644 test/compiler/6340864/TestIntVect.java create mode 100644 test/compiler/6340864/TestLongVect.java create mode 100644 test/compiler/6340864/TestShortVect.java diff --git a/src/cpu/x86/vm/assembler_x86.cpp b/src/cpu/x86/vm/assembler_x86.cpp index f88f9c62b..abf837339 100644 --- a/src/cpu/x86/vm/assembler_x86.cpp +++ b/src/cpu/x86/vm/assembler_x86.cpp @@ -999,32 +999,22 @@ void Assembler::addr_nop_8() { void Assembler::addsd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2); - emit_byte(0x58); - emit_byte(0xC0 | encode); + emit_simd_arith(0x58, dst, src, VEX_SIMD_F2); } void 
Assembler::addsd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_F2); - emit_byte(0x58); - emit_operand(dst, src); + emit_simd_arith(0x58, dst, src, VEX_SIMD_F2); } void Assembler::addss(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3); - emit_byte(0x58); - emit_byte(0xC0 | encode); + emit_simd_arith(0x58, dst, src, VEX_SIMD_F3); } void Assembler::addss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_F3); - emit_byte(0x58); - emit_operand(dst, src); + emit_simd_arith(0x58, dst, src, VEX_SIMD_F3); } void Assembler::andl(Address dst, int32_t imm32) { @@ -1052,36 +1042,6 @@ void Assembler::andl(Register dst, Register src) { emit_arith(0x23, 0xC0, dst, src); } -void Assembler::andpd(XMMRegister dst, Address src) { - NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_66); - emit_byte(0x54); - emit_operand(dst, src); -} - -void Assembler::andpd(XMMRegister dst, XMMRegister src) { - NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66); - emit_byte(0x54); - emit_byte(0xC0 | encode); -} - -void Assembler::andps(XMMRegister dst, Address src) { - NOT_LP64(assert(VM_Version::supports_sse(), "")); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_NONE); - emit_byte(0x54); - emit_operand(dst, src); -} - -void Assembler::andps(XMMRegister dst, XMMRegister src) { - NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_NONE); - emit_byte(0x54); - emit_byte(0xC0 | encode); -} - void Assembler::bsfl(Register dst, Register src) { int encode = prefix_and_encode(dst->encoding(), 
src->encoding()); emit_byte(0x0F); @@ -1246,61 +1206,42 @@ void Assembler::comisd(XMMRegister dst, Address src) { // NOTE: dbx seems to decode this as comiss even though the // 0x66 is there. Strangly ucomisd comes out correct NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_66); - emit_byte(0x2F); - emit_operand(dst, src); + emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66); } void Assembler::comisd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66); - emit_byte(0x2F); - emit_byte(0xC0 | encode); + emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66); } void Assembler::comiss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_NONE); - emit_byte(0x2F); - emit_operand(dst, src); + emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE); } void Assembler::comiss(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE); - emit_byte(0x2F); - emit_byte(0xC0 | encode); + emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE); } void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3); - emit_byte(0xE6); - emit_byte(0xC0 | encode); + emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3); } void Assembler::cvtdq2ps(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE); - emit_byte(0x5B); - emit_byte(0xC0 | encode); + emit_simd_arith_nonds(0x5B, dst, src, VEX_SIMD_NONE); } void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, 
VEX_SIMD_F2); - emit_byte(0x5A); - emit_byte(0xC0 | encode); + emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2); } void Assembler::cvtsd2ss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_F2); - emit_byte(0x5A); - emit_operand(dst, src); + emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2); } void Assembler::cvtsi2sdl(XMMRegister dst, Register src) { @@ -1312,10 +1253,7 @@ void Assembler::cvtsi2sdl(XMMRegister dst, Register src) { void Assembler::cvtsi2sdl(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_F2); - emit_byte(0x2A); - emit_operand(dst, src); + emit_simd_arith(0x2A, dst, src, VEX_SIMD_F2); } void Assembler::cvtsi2ssl(XMMRegister dst, Register src) { @@ -1327,25 +1265,17 @@ void Assembler::cvtsi2ssl(XMMRegister dst, Register src) { void Assembler::cvtsi2ssl(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_F3); - emit_byte(0x2A); - emit_operand(dst, src); + emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3); } void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3); - emit_byte(0x5A); - emit_byte(0xC0 | encode); + emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3); } void Assembler::cvtss2sd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_F3); - emit_byte(0x5A); - emit_operand(dst, src); + emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3); } @@ -1373,32 +1303,22 @@ void Assembler::decl(Address dst) { void Assembler::divsd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionMark im(this); - simd_prefix(dst, dst, src, 
VEX_SIMD_F2); - emit_byte(0x5E); - emit_operand(dst, src); + emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2); } void Assembler::divsd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2); - emit_byte(0x5E); - emit_byte(0xC0 | encode); + emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2); } void Assembler::divss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_F3); - emit_byte(0x5E); - emit_operand(dst, src); + emit_simd_arith(0x5E, dst, src, VEX_SIMD_F3); } void Assembler::divss(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3); - emit_byte(0x5E); - emit_byte(0xC0 | encode); + emit_simd_arith(0x5E, dst, src, VEX_SIMD_F3); } void Assembler::emms() { @@ -1634,16 +1554,12 @@ void Assembler::mov(Register dst, Register src) { void Assembler::movapd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66); - emit_byte(0x28); - emit_byte(0xC0 | encode); + emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_66); } void Assembler::movaps(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE); - emit_byte(0x28); - emit_byte(0xC0 | encode); + emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_NONE); } void Assembler::movlhps(XMMRegister dst, XMMRegister src) { @@ -1712,24 +1628,17 @@ void Assembler::movdl(Address dst, XMMRegister src) { void Assembler::movdqa(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66); - emit_byte(0x6F); - emit_byte(0xC0 | encode); + emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_66); } void 
Assembler::movdqu(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_F3); - emit_byte(0x6F); - emit_operand(dst, src); + emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3); } void Assembler::movdqu(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3); - emit_byte(0x6F); - emit_byte(0xC0 | encode); + emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3); } void Assembler::movdqu(Address dst, XMMRegister src) { @@ -1810,10 +1719,7 @@ void Assembler::movl(Address dst, Register src) { // The selection is done in MacroAssembler::movdbl() and movflt(). void Assembler::movlpd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_66); - emit_byte(0x12); - emit_operand(dst, src); + emit_simd_arith(0x12, dst, src, VEX_SIMD_66); } void Assembler::movq( MMXRegister dst, Address src ) { @@ -1870,17 +1776,12 @@ void Assembler::movsbl(Register dst, Register src) { // movsxb void Assembler::movsd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2); - emit_byte(0x10); - emit_byte(0xC0 | encode); + emit_simd_arith(0x10, dst, src, VEX_SIMD_F2); } void Assembler::movsd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_F2); - emit_byte(0x10); - emit_operand(dst, src); + emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F2); } void Assembler::movsd(Address dst, XMMRegister src) { @@ -1893,17 +1794,12 @@ void Assembler::movsd(Address dst, XMMRegister src) { void Assembler::movss(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, 
VEX_SIMD_F3); - emit_byte(0x10); - emit_byte(0xC0 | encode); + emit_simd_arith(0x10, dst, src, VEX_SIMD_F3); } void Assembler::movss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_F3); - emit_byte(0x10); - emit_operand(dst, src); + emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F3); } void Assembler::movss(Address dst, XMMRegister src) { @@ -2001,32 +1897,22 @@ void Assembler::mull(Register src) { void Assembler::mulsd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_F2); - emit_byte(0x59); - emit_operand(dst, src); + emit_simd_arith(0x59, dst, src, VEX_SIMD_F2); } void Assembler::mulsd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2); - emit_byte(0x59); - emit_byte(0xC0 | encode); + emit_simd_arith(0x59, dst, src, VEX_SIMD_F2); } void Assembler::mulss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_F3); - emit_byte(0x59); - emit_operand(dst, src); + emit_simd_arith(0x59, dst, src, VEX_SIMD_F3); } void Assembler::mulss(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3); - emit_byte(0x59); - emit_byte(0xC0 | encode); + emit_simd_arith(0x59, dst, src, VEX_SIMD_F3); } void Assembler::negl(Register dst) { @@ -2315,17 +2201,12 @@ void Assembler::orl(Register dst, Register src) { void Assembler::packuswb(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_66); - emit_byte(0x67); - emit_operand(dst, src); + 
emit_simd_arith(0x67, dst, src, VEX_SIMD_66); } void Assembler::packuswb(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66); - emit_byte(0x67); - emit_byte(0xC0 | encode); + emit_simd_arith(0x67, dst, src, VEX_SIMD_66); } void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) { @@ -2339,7 +2220,7 @@ void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) { void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse4_2(), ""); - int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A); emit_byte(0x61); emit_byte(0xC0 | encode); emit_byte(imm8); @@ -2355,7 +2236,7 @@ void Assembler::pmovzxbw(XMMRegister dst, Address src) { void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_sse4_1(), ""); - int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38); emit_byte(0x30); emit_byte(0xC0 | encode); } @@ -2456,28 +2337,10 @@ void Assembler::prefix(Prefix p) { a_byte(p); } -void Assembler::por(XMMRegister dst, XMMRegister src) { - NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66); - emit_byte(0xEB); - emit_byte(0xC0 | encode); -} - -void Assembler::por(XMMRegister dst, Address src) { - NOT_LP64(assert(VM_Version::supports_sse2(), "")); - assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_66); - emit_byte(0xEB); - emit_operand(dst, src); -} - void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) { assert(isByte(mode), "invalid value"); NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = 
simd_prefix_and_encode(dst, src, VEX_SIMD_66); - emit_byte(0x70); - emit_byte(0xC0 | encode); + emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_66); emit_byte(mode & 0xFF); } @@ -2496,9 +2359,7 @@ void Assembler::pshufd(XMMRegister dst, Address src, int mode) { void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) { assert(isByte(mode), "invalid value"); NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2); - emit_byte(0x70); - emit_byte(0xC0 | encode); + emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_F2); emit_byte(mode & 0xFF); } @@ -2513,18 +2374,6 @@ void Assembler::pshuflw(XMMRegister dst, Address src, int mode) { emit_byte(mode & 0xFF); } -void Assembler::psrlq(XMMRegister dst, int shift) { - // Shift 64 bit value logically right by specified number of bits. - // HMM Table D-1 says sse2 or mmx. - // Do not confuse it with psrldq SSE2 instruction which - // shifts 128 bit value in xmm register by number of bytes. - NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66); - emit_byte(0x73); - emit_byte(0xC0 | encode); - emit_byte(shift); -} - void Assembler::psrldq(XMMRegister dst, int shift) { // Shift 128 bit value in xmm register by number of bytes. 
NOT_LP64(assert(VM_Version::supports_sse2(), "")); @@ -2545,7 +2394,7 @@ void Assembler::ptest(XMMRegister dst, Address src) { void Assembler::ptest(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_sse4_1(), ""); - int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38); emit_byte(0x17); emit_byte(0xC0 | encode); } @@ -2553,40 +2402,28 @@ void Assembler::ptest(XMMRegister dst, XMMRegister src) { void Assembler::punpcklbw(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_66); - emit_byte(0x60); - emit_operand(dst, src); + emit_simd_arith(0x60, dst, src, VEX_SIMD_66); } void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66); - emit_byte(0x60); - emit_byte(0xC0 | encode); + emit_simd_arith(0x60, dst, src, VEX_SIMD_66); } void Assembler::punpckldq(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_66); - emit_byte(0x62); - emit_operand(dst, src); + emit_simd_arith(0x62, dst, src, VEX_SIMD_66); } void Assembler::punpckldq(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66); - emit_byte(0x62); - emit_byte(0xC0 | encode); + emit_simd_arith(0x62, dst, src, VEX_SIMD_66); } void Assembler::punpcklqdq(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66); - emit_byte(0x6C); - emit_byte(0xC0 | encode); + 
emit_simd_arith(0x6C, dst, src, VEX_SIMD_66); } void Assembler::push(int32_t imm32) { @@ -2616,22 +2453,6 @@ void Assembler::pushl(Address src) { } #endif -void Assembler::pxor(XMMRegister dst, Address src) { - NOT_LP64(assert(VM_Version::supports_sse2(), "")); - assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_66); - emit_byte(0xEF); - emit_operand(dst, src); -} - -void Assembler::pxor(XMMRegister dst, XMMRegister src) { - NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66); - emit_byte(0xEF); - emit_byte(0xC0 | encode); -} - void Assembler::rcll(Register dst, int imm8) { assert(isShiftCount(imm8), "illegal shift count"); int encode = prefix_and_encode(dst->encoding()); @@ -2790,32 +2611,22 @@ void Assembler::smovl() { void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2); - emit_byte(0x51); - emit_byte(0xC0 | encode); + emit_simd_arith(0x51, dst, src, VEX_SIMD_F2); } void Assembler::sqrtsd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_F2); - emit_byte(0x51); - emit_operand(dst, src); + emit_simd_arith(0x51, dst, src, VEX_SIMD_F2); } void Assembler::sqrtss(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3); - emit_byte(0x51); - emit_byte(0xC0 | encode); + emit_simd_arith(0x51, dst, src, VEX_SIMD_F3); } void Assembler::sqrtss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_F3); - emit_byte(0x51); - emit_operand(dst, src); + emit_simd_arith(0x51, dst, src, VEX_SIMD_F3); } void Assembler::stmxcsr( 
Address dst) { @@ -2865,32 +2676,22 @@ void Assembler::subl(Register dst, Register src) { void Assembler::subsd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2); - emit_byte(0x5C); - emit_byte(0xC0 | encode); + emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2); } void Assembler::subsd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_F2); - emit_byte(0x5C); - emit_operand(dst, src); + emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2); } void Assembler::subss(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3); - emit_byte(0x5C); - emit_byte(0xC0 | encode); + emit_simd_arith(0x5C, dst, src, VEX_SIMD_F3); } void Assembler::subss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_F3); - emit_byte(0x5C); - emit_operand(dst, src); + emit_simd_arith(0x5C, dst, src, VEX_SIMD_F3); } void Assembler::testb(Register dst, int imm8) { @@ -2928,32 +2729,22 @@ void Assembler::testl(Register dst, Address src) { void Assembler::ucomisd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_66); - emit_byte(0x2E); - emit_operand(dst, src); + emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66); } void Assembler::ucomisd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66); - emit_byte(0x2E); - emit_byte(0xC0 | encode); + emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66); } void Assembler::ucomiss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - InstructionMark im(this); - 
simd_prefix(dst, src, VEX_SIMD_NONE); - emit_byte(0x2E); - emit_operand(dst, src); + emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE); } void Assembler::ucomiss(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE); - emit_byte(0x2E); - emit_byte(0xC0 | encode); + emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE); } @@ -2995,226 +2786,729 @@ void Assembler::xorl(Register dst, Register src) { emit_arith(0x33, 0xC0, dst, src); } -void Assembler::xorpd(XMMRegister dst, XMMRegister src) { - NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66); - emit_byte(0x57); - emit_byte(0xC0 | encode); -} - -void Assembler::xorpd(XMMRegister dst, Address src) { - NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_66); - emit_byte(0x57); - emit_operand(dst, src); -} - - -void Assembler::xorps(XMMRegister dst, XMMRegister src) { - NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_NONE); - emit_byte(0x57); - emit_byte(0xC0 | encode); -} - -void Assembler::xorps(XMMRegister dst, Address src) { - NOT_LP64(assert(VM_Version::supports_sse(), "")); - InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_NONE); - emit_byte(0x57); - emit_operand(dst, src); -} -// AVX 3-operands non destructive source instructions (encoded with VEX prefix) +// AVX 3-operands scalar float-point arithmetic instructions void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); - InstructionMark im(this); - vex_prefix(dst, nds, src, VEX_SIMD_F2); - emit_byte(0x58); - emit_operand(dst, src); + emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false); } void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { 
assert(VM_Version::supports_avx(), ""); - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2); - emit_byte(0x58); - emit_byte(0xC0 | encode); + emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false); } void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); - InstructionMark im(this); - vex_prefix(dst, nds, src, VEX_SIMD_F3); - emit_byte(0x58); - emit_operand(dst, src); + emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false); } void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) { assert(VM_Version::supports_avx(), ""); - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3); - emit_byte(0x58); - emit_byte(0xC0 | encode); -} - -void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src) { - assert(VM_Version::supports_avx(), ""); - InstructionMark im(this); - vex_prefix(dst, nds, src, VEX_SIMD_66); // 128-bit vector - emit_byte(0x54); - emit_operand(dst, src); -} - -void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src) { - assert(VM_Version::supports_avx(), ""); - InstructionMark im(this); - vex_prefix(dst, nds, src, VEX_SIMD_NONE); // 128-bit vector - emit_byte(0x54); - emit_operand(dst, src); + emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false); } void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); - InstructionMark im(this); - vex_prefix(dst, nds, src, VEX_SIMD_F2); - emit_byte(0x5E); - emit_operand(dst, src); + emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false); } void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { assert(VM_Version::supports_avx(), ""); - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2); - emit_byte(0x5E); - emit_byte(0xC0 | encode); + emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false); } void Assembler::vdivss(XMMRegister dst, 
XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); - InstructionMark im(this); - vex_prefix(dst, nds, src, VEX_SIMD_F3); - emit_byte(0x5E); - emit_operand(dst, src); + emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false); } void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) { assert(VM_Version::supports_avx(), ""); - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3); - emit_byte(0x5E); - emit_byte(0xC0 | encode); + emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false); } void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); - InstructionMark im(this); - vex_prefix(dst, nds, src, VEX_SIMD_F2); - emit_byte(0x59); - emit_operand(dst, src); + emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false); } void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { assert(VM_Version::supports_avx(), ""); - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2); - emit_byte(0x59); - emit_byte(0xC0 | encode); + emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false); } void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) { - InstructionMark im(this); - vex_prefix(dst, nds, src, VEX_SIMD_F3); - emit_byte(0x59); - emit_operand(dst, src); + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false); } void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) { assert(VM_Version::supports_avx(), ""); - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3); - emit_byte(0x59); - emit_byte(0xC0 | encode); + emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false); } - void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); - InstructionMark im(this); - vex_prefix(dst, nds, src, VEX_SIMD_F2); - emit_byte(0x5C); - 
emit_operand(dst, src); + emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false); } void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { assert(VM_Version::supports_avx(), ""); - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2); - emit_byte(0x5C); - emit_byte(0xC0 | encode); + emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false); } void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); - InstructionMark im(this); - vex_prefix(dst, nds, src, VEX_SIMD_F3); - emit_byte(0x5C); - emit_operand(dst, src); + emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false); } void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) { assert(VM_Version::supports_avx(), ""); - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3); - emit_byte(0x5C); - emit_byte(0xC0 | encode); + emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false); } -void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src) { - assert(VM_Version::supports_avx(), ""); - InstructionMark im(this); - vex_prefix(dst, nds, src, VEX_SIMD_66); // 128-bit vector - emit_byte(0x57); - emit_operand(dst, src); +//====================VECTOR ARITHMETIC===================================== + +// Float-point vector arithmetic + +void Assembler::addpd(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0x58, dst, src, VEX_SIMD_66); } -void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { - assert(VM_Version::supports_avx(), ""); - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256); - emit_byte(0x57); - emit_byte(0xC0 | encode); +void Assembler::addps(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0x58, dst, src, VEX_SIMD_NONE); } -void 
Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src) { +void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { assert(VM_Version::supports_avx(), ""); - InstructionMark im(this); - vex_prefix(dst, nds, src, VEX_SIMD_NONE); // 128-bit vector - emit_byte(0x57); - emit_operand(dst, src); + emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector256); } -void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { +void Assembler::vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { assert(VM_Version::supports_avx(), ""); - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_NONE, vector256); - emit_byte(0x57); - emit_byte(0xC0 | encode); + emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector256); } -void Assembler::vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { - assert(VM_Version::supports_avx2() || (!vector256) && VM_Version::supports_avx(), ""); - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256); - emit_byte(0xEF); - emit_byte(0xC0 | encode); +void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector256); } -void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) { +void Assembler::vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { assert(VM_Version::supports_avx(), ""); - bool vector256 = true; - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A); - emit_byte(0x18); - emit_byte(0xC0 | encode); - // 0x00 - insert into lower 128 bits - // 0x01 - insert into upper 128 bits - emit_byte(0x01); + emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector256); } -void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) { - assert(VM_Version::supports_avx2(), ""); - bool 
vector256 = true; +void Assembler::subpd(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0x5C, dst, src, VEX_SIMD_66); +} + +void Assembler::subps(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0x5C, dst, src, VEX_SIMD_NONE); +} + +void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector256); +} + +void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector256); +} + +void Assembler::mulpd(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0x59, dst, src, VEX_SIMD_66); +} + +void Assembler::mulps(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0x59, dst, src, VEX_SIMD_NONE); +} + +void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector256); +} + +void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + 
assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector256); +} + +void Assembler::divpd(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0x5E, dst, src, VEX_SIMD_66); +} + +void Assembler::divps(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0x5E, dst, src, VEX_SIMD_NONE); +} + +void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector256); +} + +void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector256); +} + +void Assembler::andpd(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0x54, dst, src, VEX_SIMD_66); +} + +void Assembler::andps(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse(), "")); + emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE); +} + +void Assembler::andps(XMMRegister dst, Address src) { + NOT_LP64(assert(VM_Version::supports_sse(), "")); + emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE); +} + +void Assembler::andpd(XMMRegister dst, Address src) { + 
NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0x54, dst, src, VEX_SIMD_66); +} + +void Assembler::vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector256); +} + +void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector256); +} + +void Assembler::xorpd(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0x57, dst, src, VEX_SIMD_66); +} + +void Assembler::xorps(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse(), "")); + emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE); +} + +void Assembler::xorpd(XMMRegister dst, Address src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0x57, dst, src, VEX_SIMD_66); +} + +void Assembler::xorps(XMMRegister dst, Address src) { + NOT_LP64(assert(VM_Version::supports_sse(), "")); + emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE); +} + +void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector256); +} + +void 
Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx(), ""); + emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector256); +} + + +// Integer vector arithmetic +void Assembler::paddb(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xFC, dst, src, VEX_SIMD_66); +} + +void Assembler::paddw(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xFD, dst, src, VEX_SIMD_66); +} + +void Assembler::paddd(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xFE, dst, src, VEX_SIMD_66); +} + +void Assembler::paddq(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xD4, dst, src, VEX_SIMD_66); +} + +void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, 
XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::psubb(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xF8, dst, src, VEX_SIMD_66); +} + +void Assembler::psubw(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xF9, dst, src, VEX_SIMD_66); +} + +void Assembler::psubd(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xFA, dst, src, VEX_SIMD_66); +} + +void Assembler::psubq(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xFB, dst, src, 
VEX_SIMD_66); +} + +void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector256); +} 
+ +void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::pmullw(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xD5, dst, src, VEX_SIMD_66); +} + +void Assembler::pmulld(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse4_1(), ""); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0x40); + emit_byte(0xC0 | encode); +} + +void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38); + emit_byte(0x40); + emit_byte(0xC0 | encode); +} + +void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + InstructionMark im(this); + int dst_enc = dst->encoding(); + int nds_enc = nds->is_valid() ? 
nds->encoding() : 0; + vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector256); + emit_byte(0x40); + emit_operand(dst, src); +} + +// Shift packed integers left by specified number of bits. +void Assembler::psllw(XMMRegister dst, int shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + // XMM6 is for /6 encoding: 66 0F 71 /6 ib + int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66); + emit_byte(0x71); + emit_byte(0xC0 | encode); + emit_byte(shift & 0xFF); +} + +void Assembler::pslld(XMMRegister dst, int shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + // XMM6 is for /6 encoding: 66 0F 72 /6 ib + int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66); + emit_byte(0x72); + emit_byte(0xC0 | encode); + emit_byte(shift & 0xFF); +} + +void Assembler::psllq(XMMRegister dst, int shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + // XMM6 is for /6 encoding: 66 0F 73 /6 ib + int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66); + emit_byte(0x73); + emit_byte(0xC0 | encode); + emit_byte(shift & 0xFF); +} + +void Assembler::psllw(XMMRegister dst, XMMRegister shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xF1, dst, shift, VEX_SIMD_66); +} + +void Assembler::pslld(XMMRegister dst, XMMRegister shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xF2, dst, shift, VEX_SIMD_66); +} + +void Assembler::psllq(XMMRegister dst, XMMRegister shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xF3, dst, shift, VEX_SIMD_66); +} + +void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + // XMM6 is for /6 encoding: 66 0F 71 /6 ib + emit_vex_arith(0x71, xmm6, dst, src, VEX_SIMD_66, vector256); + emit_byte(shift & 0xFF); +} + +void 
Assembler::vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + // XMM6 is for /6 encoding: 66 0F 72 /6 ib + emit_vex_arith(0x72, xmm6, dst, src, VEX_SIMD_66, vector256); + emit_byte(shift & 0xFF); +} + +void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + // XMM6 is for /6 encoding: 66 0F 73 /6 ib + emit_vex_arith(0x73, xmm6, dst, src, VEX_SIMD_66, vector256); + emit_byte(shift & 0xFF); +} + +void Assembler::vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xF1, dst, src, shift, VEX_SIMD_66, vector256); +} + +void Assembler::vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xF2, dst, src, shift, VEX_SIMD_66, vector256); +} + +void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xF3, dst, src, shift, VEX_SIMD_66, vector256); +} + +// Shift packed integers logically right by specified number of bits. 
+void Assembler::psrlw(XMMRegister dst, int shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + // XMM2 is for /2 encoding: 66 0F 71 /2 ib + int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66); + emit_byte(0x71); + emit_byte(0xC0 | encode); + emit_byte(shift & 0xFF); +} + +void Assembler::psrld(XMMRegister dst, int shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + // XMM2 is for /2 encoding: 66 0F 72 /2 ib + int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66); + emit_byte(0x72); + emit_byte(0xC0 | encode); + emit_byte(shift & 0xFF); +} + +void Assembler::psrlq(XMMRegister dst, int shift) { + // Do not confuse it with psrldq SSE2 instruction which + // shifts 128 bit value in xmm register by number of bytes. + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + // XMM2 is for /2 encoding: 66 0F 73 /2 ib + int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66); + emit_byte(0x73); + emit_byte(0xC0 | encode); + emit_byte(shift & 0xFF); +} + +void Assembler::psrlw(XMMRegister dst, XMMRegister shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xD1, dst, shift, VEX_SIMD_66); +} + +void Assembler::psrld(XMMRegister dst, XMMRegister shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xD2, dst, shift, VEX_SIMD_66); +} + +void Assembler::psrlq(XMMRegister dst, XMMRegister shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xD3, dst, shift, VEX_SIMD_66); +} + +void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + // XMM2 is for /2 encoding: 66 0F 71 /2 ib + emit_vex_arith(0x71, xmm2, dst, src, VEX_SIMD_66, vector256); + emit_byte(shift & 0xFF); +} + +void Assembler::vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256) { + assert(VM_Version::supports_avx() &&
!vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + // XMM2 is for /2 encoding: 66 0F 72 /2 ib + emit_vex_arith(0x72, xmm2, dst, src, VEX_SIMD_66, vector256); + emit_byte(shift & 0xFF); +} + +void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + // XMM2 is for /2 encoding: 66 0F 73 /2 ib + emit_vex_arith(0x73, xmm2, dst, src, VEX_SIMD_66, vector256); + emit_byte(shift & 0xFF); +} + +void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xD1, dst, src, shift, VEX_SIMD_66, vector256); +} + +void Assembler::vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xD2, dst, src, shift, VEX_SIMD_66, vector256); +} + +void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xD3, dst, src, shift, VEX_SIMD_66, vector256); +} + +// Shift packed integers arithmetically right by specified number of bits.
+void Assembler::psraw(XMMRegister dst, int shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + // XMM4 is for /4 encoding: 66 0F 71 /4 ib + int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66); + emit_byte(0x71); + emit_byte(0xC0 | encode); + emit_byte(shift & 0xFF); +} + +void Assembler::psrad(XMMRegister dst, int shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + // XMM4 is for /4 encoding: 66 0F 72 /4 ib + int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66); + emit_byte(0x72); + emit_byte(0xC0 | encode); + emit_byte(shift & 0xFF); +} + +void Assembler::psraw(XMMRegister dst, XMMRegister shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xE1, dst, shift, VEX_SIMD_66); +} + +void Assembler::psrad(XMMRegister dst, XMMRegister shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xE2, dst, shift, VEX_SIMD_66); +} + +void Assembler::vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + // XMM4 is for /4 encoding: 66 0F 71 /4 ib + emit_vex_arith(0x71, xmm4, dst, src, VEX_SIMD_66, vector256); + emit_byte(shift & 0xFF); +} + +void Assembler::vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + // XMM4 is for /4 encoding: 66 0F 72 /4 ib + emit_vex_arith(0x72, xmm4, dst, src, VEX_SIMD_66, vector256); + emit_byte(shift & 0xFF); +} + +void Assembler::vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xE1, dst, src, shift, VEX_SIMD_66, vector256); +} + +void Assembler::vpsrad(XMMRegister dst, XMMRegister src, XMMRegister
shift, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xE2, dst, src, shift, VEX_SIMD_66, vector256); +} + + +// AND packed integers +void Assembler::pand(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xDB, dst, src, VEX_SIMD_66); +} + +void Assembler::vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::por(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xEB, dst, src, VEX_SIMD_66); +} + +void Assembler::vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::pxor(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_simd_arith(0xEF, dst, src, VEX_SIMD_66); +} + +void Assembler::vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx() && 
!vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector256); +} + +void Assembler::vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector256); +} + + +void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) { + assert(VM_Version::supports_avx(), ""); + bool vector256 = true; + int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A); + emit_byte(0x18); + emit_byte(0xC0 | encode); + // 0x00 - insert into lower 128 bits + // 0x01 - insert into upper 128 bits + emit_byte(0x01); +} + +void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) { + assert(VM_Version::supports_avx2(), ""); + bool vector256 = true; int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A); emit_byte(0x38); emit_byte(0xC0 | encode); @@ -3805,6 +4099,49 @@ int Assembler::simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegis } } +void Assembler::emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre) { + InstructionMark im(this); + simd_prefix(dst, dst, src, pre); + emit_byte(opcode); + emit_operand(dst, src); +} + +void Assembler::emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre) { + int encode = simd_prefix_and_encode(dst, dst, src, pre); + emit_byte(opcode); + emit_byte(0xC0 | encode); +} + +// Versions with no second source register (non-destructive source). 
+void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre) { + InstructionMark im(this); + simd_prefix(dst, xnoreg, src, pre); + emit_byte(opcode); + emit_operand(dst, src); +} + +void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre) { + int encode = simd_prefix_and_encode(dst, xnoreg, src, pre); + emit_byte(opcode); + emit_byte(0xC0 | encode); +} + +// 3-operands AVX instructions +void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds, + Address src, VexSimdPrefix pre, bool vector256) { + InstructionMark im(this); + vex_prefix(dst, nds, src, pre, vector256); + emit_byte(opcode); + emit_operand(dst, src); +} + +void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds, + XMMRegister src, VexSimdPrefix pre, bool vector256) { + int encode = vex_prefix_and_encode(dst, nds, src, pre, vector256); + emit_byte(opcode); + emit_byte(0xC0 | encode); +} + #ifndef _LP64 void Assembler::incl(Register dst) { @@ -7968,21 +8305,21 @@ void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src } } -void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { +void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) { if (reachable(src)) { - vandpd(dst, nds, as_Address(src)); + vandpd(dst, nds, as_Address(src), vector256); } else { lea(rscratch1, src); - vandpd(dst, nds, Address(rscratch1, 0)); + vandpd(dst, nds, Address(rscratch1, 0), vector256); } } -void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src) { +void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) { if (reachable(src)) { - vandps(dst, nds, as_Address(src)); + vandps(dst, nds, as_Address(src), vector256); } else { lea(rscratch1, src); - vandps(dst, nds, Address(rscratch1, 0)); + vandps(dst, nds, Address(rscratch1, 0), vector256); } } @@ 
-8040,21 +8377,21 @@ void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src } } -void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { +void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) { if (reachable(src)) { - vxorpd(dst, nds, as_Address(src)); + vxorpd(dst, nds, as_Address(src), vector256); } else { lea(rscratch1, src); - vxorpd(dst, nds, Address(rscratch1, 0)); + vxorpd(dst, nds, Address(rscratch1, 0), vector256); } } -void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src) { +void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) { if (reachable(src)) { - vxorps(dst, nds, as_Address(src)); + vxorps(dst, nds, as_Address(src), vector256); } else { lea(rscratch1, src); - vxorps(dst, nds, Address(rscratch1, 0)); + vxorps(dst, nds, Address(rscratch1, 0), vector256); } } diff --git a/src/cpu/x86/vm/assembler_x86.hpp b/src/cpu/x86/vm/assembler_x86.hpp index 0d8746543..d06f499ca 100644 --- a/src/cpu/x86/vm/assembler_x86.hpp +++ b/src/cpu/x86/vm/assembler_x86.hpp @@ -617,6 +617,7 @@ private: VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) { simd_prefix(dst, xnoreg, src, pre, opc); } + void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre) { simd_prefix(src, dst, pre); } @@ -626,16 +627,10 @@ private: simd_prefix(dst, nds, src, pre, VEX_OPCODE_0F, rex_w); } - int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F, bool rex_w = false, bool vector256 = false); - int simd_prefix_and_encode(XMMRegister dst, XMMRegister src, - VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) { - return simd_prefix_and_encode(dst, xnoreg, src, pre, opc); - } - // Move/convert 32-bit integer value. 
int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, Register src, VexSimdPrefix pre) { @@ -677,6 +672,15 @@ private: void emit_arith(int op1, int op2, Register dst, jobject obj); void emit_arith(int op1, int op2, Register dst, Register src); + void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre); + void emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre); + void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre); + void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre); + void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds, + Address src, VexSimdPrefix pre, bool vector256); + void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds, + XMMRegister src, VexSimdPrefix pre, bool vector256); + void emit_operand(Register reg, Register base, Register index, Address::ScaleFactor scale, int disp, @@ -891,12 +895,6 @@ private: void andq(Register dst, Address src); void andq(Register dst, Register src); - // Bitwise Logical AND of Packed Double-Precision Floating-Point Values - void andpd(XMMRegister dst, XMMRegister src); - - // Bitwise Logical AND of Packed Single-Precision Floating-Point Values - void andps(XMMRegister dst, XMMRegister src); - void bsfl(Register dst, Register src); void bsrl(Register dst, Register src); @@ -1436,10 +1434,6 @@ private: void prefetcht2(Address src); void prefetchw(Address src); - // POR - Bitwise logical OR - void por(XMMRegister dst, XMMRegister src); - void por(XMMRegister dst, Address src); - // Shuffle Packed Doublewords void pshufd(XMMRegister dst, XMMRegister src, int mode); void pshufd(XMMRegister dst, Address src, int mode); @@ -1448,9 +1442,6 @@ private: void pshuflw(XMMRegister dst, XMMRegister src, int mode); void pshuflw(XMMRegister dst, Address src, int mode); - // Shift Right by bits Logical Quadword Immediate - void psrlq(XMMRegister dst, int shift); - // Shift Right by 
bytes Logical DoubleQuadword Immediate void psrldq(XMMRegister dst, int shift); @@ -1475,10 +1466,6 @@ private: void pushq(Address src); - // Xor Packed Byte Integer Values - void pxor(XMMRegister dst, Address src); - void pxor(XMMRegister dst, XMMRegister src); - void rcll(Register dst, int imm8); void rclq(Register dst, int imm8); @@ -1601,15 +1588,10 @@ private: void xorq(Register dst, Address src); void xorq(Register dst, Register src); - // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values - void xorpd(XMMRegister dst, XMMRegister src); - - // Bitwise Logical XOR of Packed Single-Precision Floating-Point Values - void xorps(XMMRegister dst, XMMRegister src); - void set_byte_if_not_zero(Register dst); // sets reg to 1 if not zero, otherwise 0 // AVX 3-operands scalar instructions (encoded with VEX prefix) + void vaddsd(XMMRegister dst, XMMRegister nds, Address src); void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src); void vaddss(XMMRegister dst, XMMRegister nds, Address src); @@ -1627,14 +1609,147 @@ private: void vsubss(XMMRegister dst, XMMRegister nds, Address src); void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src); - // AVX Vector instrucitons. 
- void vandpd(XMMRegister dst, XMMRegister nds, Address src); - void vandps(XMMRegister dst, XMMRegister nds, Address src); - void vxorpd(XMMRegister dst, XMMRegister nds, Address src); - void vxorps(XMMRegister dst, XMMRegister nds, Address src); + + //====================VECTOR ARITHMETIC===================================== + + // Add Packed Floating-Point Values + void addpd(XMMRegister dst, XMMRegister src); + void addps(XMMRegister dst, XMMRegister src); + void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + + // Subtract Packed Floating-Point Values + void subpd(XMMRegister dst, XMMRegister src); + void subps(XMMRegister dst, XMMRegister src); + void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + + // Multiply Packed Floating-Point Values + void mulpd(XMMRegister dst, XMMRegister src); + void mulps(XMMRegister dst, XMMRegister src); + void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + + // Divide Packed Floating-Point Values + void divpd(XMMRegister dst, XMMRegister src); + void divps(XMMRegister dst, XMMRegister src); + void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool 
vector256); + void vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + + // Bitwise Logical AND of Packed Floating-Point Values + void andpd(XMMRegister dst, XMMRegister src); + void andps(XMMRegister dst, XMMRegister src); + void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + + // Bitwise Logical XOR of Packed Floating-Point Values + void xorpd(XMMRegister dst, XMMRegister src); + void xorps(XMMRegister dst, XMMRegister src); void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + + // Add packed integers + void paddb(XMMRegister dst, XMMRegister src); + void paddw(XMMRegister dst, XMMRegister src); + void paddd(XMMRegister dst, XMMRegister src); + void paddq(XMMRegister dst, XMMRegister src); + void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + + // Sub packed integers 
+ void psubb(XMMRegister dst, XMMRegister src); + void psubw(XMMRegister dst, XMMRegister src); + void psubd(XMMRegister dst, XMMRegister src); + void psubq(XMMRegister dst, XMMRegister src); + void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vpsubw(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + + // Multiply packed integers (only shorts and ints) + void pmullw(XMMRegister dst, XMMRegister src); + void pmulld(XMMRegister dst, XMMRegister src); + void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + void vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + + // Shift left packed integers + void psllw(XMMRegister dst, int shift); + void pslld(XMMRegister dst, int shift); + void psllq(XMMRegister dst, int shift); + void psllw(XMMRegister dst, XMMRegister shift); + void pslld(XMMRegister dst, XMMRegister shift); + void psllq(XMMRegister dst, XMMRegister shift); + void vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256); + void vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256); + void vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256); + void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); + void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, 
bool vector256); + void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); + + // Logical shift right packed integers + void psrlw(XMMRegister dst, int shift); + void psrld(XMMRegister dst, int shift); + void psrlq(XMMRegister dst, int shift); + void psrlw(XMMRegister dst, XMMRegister shift); + void psrld(XMMRegister dst, XMMRegister shift); + void psrlq(XMMRegister dst, XMMRegister shift); + void vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256); + void vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256); + void vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256); + void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); + void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); + void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); + + // Arithmetic shift right packed integers (only shorts and ints, no instructions for longs) + void psraw(XMMRegister dst, int shift); + void psrad(XMMRegister dst, int shift); + void psraw(XMMRegister dst, XMMRegister shift); + void psrad(XMMRegister dst, XMMRegister shift); + void vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256); + void vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256); + void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); + void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256); + + // And packed integers + void pand(XMMRegister dst, XMMRegister src); + void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + + // Or packed integers + void por(XMMRegister dst, XMMRegister src); + void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + + // Xor packed integers + void 
pxor(XMMRegister dst, XMMRegister src); void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + + // Copy low 128bit into high 128bit of YMM registers. void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src); void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src); @@ -2532,11 +2647,13 @@ public: void vaddss(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vaddss(dst, nds, src); } void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src); - void vandpd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vandpd(dst, nds, src); } - void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src); + void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vandpd(dst, nds, src, vector256); } + void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vandpd(dst, nds, src, vector256); } + void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256); - void vandps(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vandps(dst, nds, src); } - void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src); + void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vandps(dst, nds, src, vector256); } + void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vandps(dst, nds, src, vector256); } + void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256); void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivsd(dst, nds, src); } void vdivsd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vdivsd(dst, nds, src); } @@ -2565,12 +2682,12 @@ public: // AVX Vector instructions void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorpd(dst, nds, src, vector256); } - void 
vxorpd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorpd(dst, nds, src); } - void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src); + void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vxorpd(dst, nds, src, vector256); } + void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256); void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorps(dst, nds, src, vector256); } - void vxorps(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorps(dst, nds, src); } - void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src); + void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vxorps(dst, nds, src, vector256); } + void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256); void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { if (UseAVX > 1 || !vector256) // vpxor 256 bit is available only in AVX2 @@ -2578,6 +2695,12 @@ public: else Assembler::vxorpd(dst, nds, src, vector256); } + void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { + if (UseAVX > 1 || !vector256) // vpxor 256 bit is available only in AVX2 + Assembler::vpxor(dst, nds, src, vector256); + else + Assembler::vxorpd(dst, nds, src, vector256); + } // Move packed integer values from low 128 bit to hign 128 bit in 256 bit vector. 
void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) { diff --git a/src/cpu/x86/vm/x86.ad b/src/cpu/x86/vm/x86.ad index de512c37b..6bb14ef16 100644 --- a/src/cpu/x86/vm/x86.ad +++ b/src/cpu/x86/vm/x86.ad @@ -500,6 +500,25 @@ const int Matcher::base2reg[Type::lastype] = { 0 /*bottom*/ }; +const bool Matcher::match_rule_supported(int opcode) { + if (!has_match_rule(opcode)) + return false; + + switch (opcode) { + case Op_PopCountI: + case Op_PopCountL: + if (!UsePopCountInstruction) + return false; + break; // PopCount ops must not fall through to the Op_MulVI check + case Op_MulVI: + if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX + return false; + break; + } + + return true; // Per default match rules are supported. +} + // Max vector size in bytes. 0 if not supported. const int Matcher::vector_width_in_bytes(BasicType bt) { assert(is_java_primitive(bt), "only primitive type vectors"); @@ -1439,8 +1457,9 @@ instruct absF_reg_reg(regF dst, regF src) %{ ins_cost(150); format %{ "vandps $dst, $src, [0x7fffffff]\t# abs float by sign masking" %} ins_encode %{ + bool vector256 = false; __ vandps($dst$$XMMRegister, $src$$XMMRegister, - ExternalAddress(float_signmask())); + ExternalAddress(float_signmask()), vector256); %} ins_pipe(pipe_slow); %} @@ -1464,8 +1483,9 @@ instruct absD_reg_reg(regD dst, regD src) %{ format %{ "vandpd $dst, $src, [0x7fffffffffffffff]\t" "# abs double by sign masking" %} ins_encode %{ + bool vector256 = false; __ vandpd($dst$$XMMRegister, $src$$XMMRegister, - ExternalAddress(double_signmask())); + ExternalAddress(double_signmask()), vector256); %} ins_pipe(pipe_slow); %} @@ -1487,8 +1507,9 @@ instruct negF_reg_reg(regF dst, regF src) %{ ins_cost(150); format %{ "vxorps $dst, $src, [0x80000000]\t# neg float by sign flipping" %} ins_encode %{ + bool vector256 = false; __ vxorps($dst$$XMMRegister, $src$$XMMRegister, - ExternalAddress(float_signflip())); + ExternalAddress(float_signflip()), vector256); %} ins_pipe(pipe_slow); %} @@ -1512,8 +1533,9 @@ instruct negD_reg_reg(regD dst, regD
src) %{ format %{ "vxorpd $dst, $src, [0x8000000000000000]\t" "# neg double by sign flipping" %} ins_encode %{ + bool vector256 = false; __ vxorpd($dst$$XMMRegister, $src$$XMMRegister, - ExternalAddress(double_signflip())); + ExternalAddress(double_signflip()), vector256); %} ins_pipe(pipe_slow); %} @@ -2382,3 +2404,2416 @@ instruct Repl4D_zero(vecY dst, immD0 zero) %{ ins_pipe( fpu_reg_reg ); %} +// ====================VECTOR ARITHMETIC======================================= + +// --------------------------------- ADD -------------------------------------- + +// Bytes vector add +instruct vadd4B(vecS dst, vecS src) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (AddVB dst src)); + format %{ "paddb $dst,$src\t! add packed4B" %} + ins_encode %{ + __ paddb($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (AddVB src1 src2)); + format %{ "vpaddb $dst,$src1,$src2\t! add packed4B" %} + ins_encode %{ + bool vector256 = false; + __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd8B(vecD dst, vecD src) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (AddVB dst src)); + format %{ "paddb $dst,$src\t! add packed8B" %} + ins_encode %{ + __ paddb($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (AddVB src1 src2)); + format %{ "vpaddb $dst,$src1,$src2\t! 
add packed8B" %} + ins_encode %{ + bool vector256 = false; + __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd16B(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length() == 16); + match(Set dst (AddVB dst src)); + format %{ "paddb $dst,$src\t! add packed16B" %} + ins_encode %{ + __ paddb($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 16); + match(Set dst (AddVB src1 src2)); + format %{ "vpaddb $dst,$src1,$src2\t! add packed16B" %} + ins_encode %{ + bool vector256 = false; + __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 16); + match(Set dst (AddVB src (LoadVector mem))); + format %{ "vpaddb $dst,$src,$mem\t! add packed16B" %} + ins_encode %{ + bool vector256 = false; + __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 32); + match(Set dst (AddVB src1 src2)); + format %{ "vpaddb $dst,$src1,$src2\t! add packed32B" %} + ins_encode %{ + bool vector256 = true; + __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 32); + match(Set dst (AddVB src (LoadVector mem))); + format %{ "vpaddb $dst,$src,$mem\t! 
add packed32B" %} + ins_encode %{ + bool vector256 = true; + __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// Shorts/Chars vector add +instruct vadd2S(vecS dst, vecS src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (AddVS dst src)); + format %{ "paddw $dst,$src\t! add packed2S" %} + ins_encode %{ + __ paddw($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (AddVS src1 src2)); + format %{ "vpaddw $dst,$src1,$src2\t! add packed2S" %} + ins_encode %{ + bool vector256 = false; + __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd4S(vecD dst, vecD src) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (AddVS dst src)); + format %{ "paddw $dst,$src\t! add packed4S" %} + ins_encode %{ + __ paddw($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (AddVS src1 src2)); + format %{ "vpaddw $dst,$src1,$src2\t! add packed4S" %} + ins_encode %{ + bool vector256 = false; + __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd8S(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (AddVS dst src)); + format %{ "paddw $dst,$src\t! add packed8S" %} + ins_encode %{ + __ paddw($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (AddVS src1 src2)); + format %{ "vpaddw $dst,$src1,$src2\t! 
add packed8S" %} + ins_encode %{ + bool vector256 = false; + __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (AddVS src (LoadVector mem))); + format %{ "vpaddw $dst,$src,$mem\t! add packed8S" %} + ins_encode %{ + bool vector256 = false; + __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 16); + match(Set dst (AddVS src1 src2)); + format %{ "vpaddw $dst,$src1,$src2\t! add packed16S" %} + ins_encode %{ + bool vector256 = true; + __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 16); + match(Set dst (AddVS src (LoadVector mem))); + format %{ "vpaddw $dst,$src,$mem\t! add packed16S" %} + ins_encode %{ + bool vector256 = true; + __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// Integers vector add +instruct vadd2I(vecD dst, vecD src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (AddVI dst src)); + format %{ "paddd $dst,$src\t! add packed2I" %} + ins_encode %{ + __ paddd($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (AddVI src1 src2)); + format %{ "vpaddd $dst,$src1,$src2\t! 
add packed2I" %} + ins_encode %{ + bool vector256 = false; + __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd4I(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (AddVI dst src)); + format %{ "paddd $dst,$src\t! add packed4I" %} + ins_encode %{ + __ paddd($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (AddVI src1 src2)); + format %{ "vpaddd $dst,$src1,$src2\t! add packed4I" %} + ins_encode %{ + bool vector256 = false; + __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (AddVI src (LoadVector mem))); + format %{ "vpaddd $dst,$src,$mem\t! add packed4I" %} + ins_encode %{ + bool vector256 = false; + __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 8); + match(Set dst (AddVI src1 src2)); + format %{ "vpaddd $dst,$src1,$src2\t! add packed8I" %} + ins_encode %{ + bool vector256 = true; + __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 8); + match(Set dst (AddVI src (LoadVector mem))); + format %{ "vpaddd $dst,$src,$mem\t! 
add packed8I" %} + ins_encode %{ + bool vector256 = true; + __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// Longs vector add +instruct vadd2L(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (AddVL dst src)); + format %{ "paddq $dst,$src\t! add packed2L" %} + ins_encode %{ + __ paddq($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (AddVL src1 src2)); + format %{ "vpaddq $dst,$src1,$src2\t! add packed2L" %} + ins_encode %{ + bool vector256 = false; + __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (AddVL src (LoadVector mem))); + format %{ "vpaddq $dst,$src,$mem\t! add packed2L" %} + ins_encode %{ + bool vector256 = false; + __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 4); + match(Set dst (AddVL src1 src2)); + format %{ "vpaddq $dst,$src1,$src2\t! add packed4L" %} + ins_encode %{ + bool vector256 = true; + __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 4); + match(Set dst (AddVL src (LoadVector mem))); + format %{ "vpaddq $dst,$src,$mem\t! 
add packed4L" %} + ins_encode %{ + bool vector256 = true; + __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// Floats vector add +instruct vadd2F(vecD dst, vecD src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (AddVF dst src)); + format %{ "addps $dst,$src\t! add packed2F" %} + ins_encode %{ + __ addps($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (AddVF src1 src2)); + format %{ "vaddps $dst,$src1,$src2\t! add packed2F" %} + ins_encode %{ + bool vector256 = false; + __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd4F(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (AddVF dst src)); + format %{ "addps $dst,$src\t! add packed4F" %} + ins_encode %{ + __ addps($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (AddVF src1 src2)); + format %{ "vaddps $dst,$src1,$src2\t! add packed4F" %} + ins_encode %{ + bool vector256 = false; + __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (AddVF src (LoadVector mem))); + format %{ "vaddps $dst,$src,$mem\t! 
add packed4F" %} + ins_encode %{ + bool vector256 = false; + __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (AddVF src1 src2)); + format %{ "vaddps $dst,$src1,$src2\t! add packed8F" %} + ins_encode %{ + bool vector256 = true; + __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (AddVF src (LoadVector mem))); + format %{ "vaddps $dst,$src,$mem\t! add packed8F" %} + ins_encode %{ + bool vector256 = true; + __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// Doubles vector add +instruct vadd2D(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (AddVD dst src)); + format %{ "addpd $dst,$src\t! add packed2D" %} + ins_encode %{ + __ addpd($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (AddVD src1 src2)); + format %{ "vaddpd $dst,$src1,$src2\t! add packed2D" %} + ins_encode %{ + bool vector256 = false; + __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (AddVD src (LoadVector mem))); + format %{ "vaddpd $dst,$src,$mem\t! 
add packed2D" %} + ins_encode %{ + bool vector256 = false; + __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (AddVD src1 src2)); + format %{ "vaddpd $dst,$src1,$src2\t! add packed4D" %} + ins_encode %{ + bool vector256 = true; + __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (AddVD src (LoadVector mem))); + format %{ "vaddpd $dst,$src,$mem\t! add packed4D" %} + ins_encode %{ + bool vector256 = true; + __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// --------------------------------- SUB -------------------------------------- + +// Bytes vector sub +instruct vsub4B(vecS dst, vecS src) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (SubVB dst src)); + format %{ "psubb $dst,$src\t! sub packed4B" %} + ins_encode %{ + __ psubb($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (SubVB src1 src2)); + format %{ "vpsubb $dst,$src1,$src2\t! sub packed4B" %} + ins_encode %{ + bool vector256 = false; + __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub8B(vecD dst, vecD src) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (SubVB dst src)); + format %{ "psubb $dst,$src\t! 
sub packed8B" %} + ins_encode %{ + __ psubb($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (SubVB src1 src2)); + format %{ "vpsubb $dst,$src1,$src2\t! sub packed8B" %} + ins_encode %{ + bool vector256 = false; + __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub16B(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length() == 16); + match(Set dst (SubVB dst src)); + format %{ "psubb $dst,$src\t! sub packed16B" %} + ins_encode %{ + __ psubb($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 16); + match(Set dst (SubVB src1 src2)); + format %{ "vpsubb $dst,$src1,$src2\t! sub packed16B" %} + ins_encode %{ + bool vector256 = false; + __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 16); + match(Set dst (SubVB src (LoadVector mem))); + format %{ "vpsubb $dst,$src,$mem\t! sub packed16B" %} + ins_encode %{ + bool vector256 = false; + __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 32); + match(Set dst (SubVB src1 src2)); + format %{ "vpsubb $dst,$src1,$src2\t! 
sub packed32B" %} + ins_encode %{ + bool vector256 = true; + __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 32); + match(Set dst (SubVB src (LoadVector mem))); + format %{ "vpsubb $dst,$src,$mem\t! sub packed32B" %} + ins_encode %{ + bool vector256 = true; + __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// Shorts/Chars vector sub +instruct vsub2S(vecS dst, vecS src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (SubVS dst src)); + format %{ "psubw $dst,$src\t! sub packed2S" %} + ins_encode %{ + __ psubw($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (SubVS src1 src2)); + format %{ "vpsubw $dst,$src1,$src2\t! sub packed2S" %} + ins_encode %{ + bool vector256 = false; + __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub4S(vecD dst, vecD src) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (SubVS dst src)); + format %{ "psubw $dst,$src\t! sub packed4S" %} + ins_encode %{ + __ psubw($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (SubVS src1 src2)); + format %{ "vpsubw $dst,$src1,$src2\t! 
sub packed4S" %} + ins_encode %{ + bool vector256 = false; + __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub8S(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (SubVS dst src)); + format %{ "psubw $dst,$src\t! sub packed8S" %} + ins_encode %{ + __ psubw($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (SubVS src1 src2)); + format %{ "vpsubw $dst,$src1,$src2\t! sub packed8S" %} + ins_encode %{ + bool vector256 = false; + __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (SubVS src (LoadVector mem))); + format %{ "vpsubw $dst,$src,$mem\t! sub packed8S" %} + ins_encode %{ + bool vector256 = false; + __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 16); + match(Set dst (SubVS src1 src2)); + format %{ "vpsubw $dst,$src1,$src2\t! sub packed16S" %} + ins_encode %{ + bool vector256 = true; + __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 16); + match(Set dst (SubVS src (LoadVector mem))); + format %{ "vpsubw $dst,$src,$mem\t! 
sub packed16S" %} + ins_encode %{ + bool vector256 = true; + __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// Integers vector sub +instruct vsub2I(vecD dst, vecD src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (SubVI dst src)); + format %{ "psubd $dst,$src\t! sub packed2I" %} + ins_encode %{ + __ psubd($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (SubVI src1 src2)); + format %{ "vpsubd $dst,$src1,$src2\t! sub packed2I" %} + ins_encode %{ + bool vector256 = false; + __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub4I(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (SubVI dst src)); + format %{ "psubd $dst,$src\t! sub packed4I" %} + ins_encode %{ + __ psubd($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (SubVI src1 src2)); + format %{ "vpsubd $dst,$src1,$src2\t! sub packed4I" %} + ins_encode %{ + bool vector256 = false; + __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (SubVI src (LoadVector mem))); + format %{ "vpsubd $dst,$src,$mem\t! 
sub packed4I" %} + ins_encode %{ + bool vector256 = false; + __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 8); + match(Set dst (SubVI src1 src2)); + format %{ "vpsubd $dst,$src1,$src2\t! sub packed8I" %} + ins_encode %{ + bool vector256 = true; + __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 8); + match(Set dst (SubVI src (LoadVector mem))); + format %{ "vpsubd $dst,$src,$mem\t! sub packed8I" %} + ins_encode %{ + bool vector256 = true; + __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// Longs vector sub +instruct vsub2L(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (SubVL dst src)); + format %{ "psubq $dst,$src\t! sub packed2L" %} + ins_encode %{ + __ psubq($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (SubVL src1 src2)); + format %{ "vpsubq $dst,$src1,$src2\t! sub packed2L" %} + ins_encode %{ + bool vector256 = false; + __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (SubVL src (LoadVector mem))); + format %{ "vpsubq $dst,$src,$mem\t! 
sub packed2L" %} + ins_encode %{ + bool vector256 = false; + __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 4); + match(Set dst (SubVL src1 src2)); + format %{ "vpsubq $dst,$src1,$src2\t! sub packed4L" %} + ins_encode %{ + bool vector256 = true; + __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 4); + match(Set dst (SubVL src (LoadVector mem))); + format %{ "vpsubq $dst,$src,$mem\t! sub packed4L" %} + ins_encode %{ + bool vector256 = true; + __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// Floats vector sub +instruct vsub2F(vecD dst, vecD src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (SubVF dst src)); + format %{ "subps $dst,$src\t! sub packed2F" %} + ins_encode %{ + __ subps($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (SubVF src1 src2)); + format %{ "vsubps $dst,$src1,$src2\t! sub packed2F" %} + ins_encode %{ + bool vector256 = false; + __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub4F(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (SubVF dst src)); + format %{ "subps $dst,$src\t! 
sub packed4F" %} + ins_encode %{ + __ subps($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (SubVF src1 src2)); + format %{ "vsubps $dst,$src1,$src2\t! sub packed4F" %} + ins_encode %{ + bool vector256 = false; + __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (SubVF src (LoadVector mem))); + format %{ "vsubps $dst,$src,$mem\t! sub packed4F" %} + ins_encode %{ + bool vector256 = false; + __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (SubVF src1 src2)); + format %{ "vsubps $dst,$src1,$src2\t! sub packed8F" %} + ins_encode %{ + bool vector256 = true; + __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (SubVF src (LoadVector mem))); + format %{ "vsubps $dst,$src,$mem\t! sub packed8F" %} + ins_encode %{ + bool vector256 = true; + __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// Doubles vector sub +instruct vsub2D(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (SubVD dst src)); + format %{ "subpd $dst,$src\t! 
sub packed2D" %} + ins_encode %{ + __ subpd($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (SubVD src1 src2)); + format %{ "vsubpd $dst,$src1,$src2\t! sub packed2D" %} + ins_encode %{ + bool vector256 = false; + __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (SubVD src (LoadVector mem))); + format %{ "vsubpd $dst,$src,$mem\t! sub packed2D" %} + ins_encode %{ + bool vector256 = false; + __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (SubVD src1 src2)); + format %{ "vsubpd $dst,$src1,$src2\t! sub packed4D" %} + ins_encode %{ + bool vector256 = true; + __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (SubVD src (LoadVector mem))); + format %{ "vsubpd $dst,$src,$mem\t! sub packed4D" %} + ins_encode %{ + bool vector256 = true; + __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// --------------------------------- MUL -------------------------------------- + +// Shorts/Chars vector mul +instruct vmul2S(vecS dst, vecS src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (MulVS dst src)); + format %{ "pmullw $dst,$src\t! 
mul packed2S" %} + ins_encode %{ + __ pmullw($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (MulVS src1 src2)); + format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %} + ins_encode %{ + bool vector256 = false; + __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul4S(vecD dst, vecD src) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (MulVS dst src)); + format %{ "pmullw $dst,$src\t! mul packed4S" %} + ins_encode %{ + __ pmullw($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (MulVS src1 src2)); + format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %} + ins_encode %{ + bool vector256 = false; + __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul8S(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (MulVS dst src)); + format %{ "pmullw $dst,$src\t! mul packed8S" %} + ins_encode %{ + __ pmullw($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (MulVS src1 src2)); + format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %} + ins_encode %{ + bool vector256 = false; + __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (MulVS src (LoadVector mem))); + format %{ "vpmullw $dst,$src,$mem\t! 
mul packed8S" %} + ins_encode %{ + bool vector256 = false; + __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 16); + match(Set dst (MulVS src1 src2)); + format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %} + ins_encode %{ + bool vector256 = true; + __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 16); + match(Set dst (MulVS src (LoadVector mem))); + format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %} + ins_encode %{ + bool vector256 = true; + __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// Integers vector mul (sse4_1) +instruct vmul2I(vecD dst, vecD src) %{ + predicate(UseSSE > 3 && n->as_Vector()->length() == 2); + match(Set dst (MulVI dst src)); + format %{ "pmulld $dst,$src\t! mul packed2I" %} + ins_encode %{ + __ pmulld($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (MulVI src1 src2)); + format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %} + ins_encode %{ + bool vector256 = false; + __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul4I(vecX dst, vecX src) %{ + predicate(UseSSE > 3 && n->as_Vector()->length() == 4); + match(Set dst (MulVI dst src)); + format %{ "pmulld $dst,$src\t! 
mul packed4I" %} + ins_encode %{ + __ pmulld($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (MulVI src1 src2)); + format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %} + ins_encode %{ + bool vector256 = false; + __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (MulVI src (LoadVector mem))); + format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %} + ins_encode %{ + bool vector256 = false; + __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 8); + match(Set dst (MulVI src1 src2)); + format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %} + ins_encode %{ + bool vector256 = true; + __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 8); + match(Set dst (MulVI src (LoadVector mem))); + format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %} + ins_encode %{ + bool vector256 = true; + __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// Floats vector mul +instruct vmul2F(vecD dst, vecD src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (MulVF dst src)); + format %{ "mulps $dst,$src\t! 
mul packed2F" %} + ins_encode %{ + __ mulps($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (MulVF src1 src2)); + format %{ "vmulps $dst,$src1,$src2\t! mul packed2F" %} + ins_encode %{ + bool vector256 = false; + __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul4F(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (MulVF dst src)); + format %{ "mulps $dst,$src\t! mul packed4F" %} + ins_encode %{ + __ mulps($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (MulVF src1 src2)); + format %{ "vmulps $dst,$src1,$src2\t! mul packed4F" %} + ins_encode %{ + bool vector256 = false; + __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (MulVF src (LoadVector mem))); + format %{ "vmulps $dst,$src,$mem\t! mul packed4F" %} + ins_encode %{ + bool vector256 = false; + __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (MulVF src1 src2)); + format %{ "vmulps $dst,$src1,$src2\t! 
mul packed8F" %} + ins_encode %{ + bool vector256 = true; + __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (MulVF src (LoadVector mem))); + format %{ "vmulps $dst,$src,$mem\t! mul packed8F" %} + ins_encode %{ + bool vector256 = true; + __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// Doubles vector mul +instruct vmul2D(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (MulVD dst src)); + format %{ "mulpd $dst,$src\t! mul packed2D" %} + ins_encode %{ + __ mulpd($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (MulVD src1 src2)); + format %{ "vmulpd $dst,$src1,$src2\t! mul packed2D" %} + ins_encode %{ + bool vector256 = false; + __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (MulVD src (LoadVector mem))); + format %{ "vmulpd $dst,$src,$mem\t! mul packed2D" %} + ins_encode %{ + bool vector256 = false; + __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (MulVD src1 src2)); + format %{ "vmulpd $dst,$src1,$src2\t! 
mul packed4D" %} + ins_encode %{ + bool vector256 = true; + __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (MulVD src (LoadVector mem))); + format %{ "vmulpd $dst,$src,$mem\t! mul packed4D" %} + ins_encode %{ + bool vector256 = true; + __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// --------------------------------- DIV -------------------------------------- + +// Floats vector div +instruct vdiv2F(vecD dst, vecD src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (DivVF dst src)); + format %{ "divps $dst,$src\t! div packed2F" %} + ins_encode %{ + __ divps($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (DivVF src1 src2)); + format %{ "vdivps $dst,$src1,$src2\t! div packed2F" %} + ins_encode %{ + bool vector256 = false; + __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vdiv4F(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (DivVF dst src)); + format %{ "divps $dst,$src\t! div packed4F" %} + ins_encode %{ + __ divps($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (DivVF src1 src2)); + format %{ "vdivps $dst,$src1,$src2\t! 
div packed4F" %} + ins_encode %{ + bool vector256 = false; + __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (DivVF src (LoadVector mem))); + format %{ "vdivps $dst,$src,$mem\t! div packed4F" %} + ins_encode %{ + bool vector256 = false; + __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (DivVF src1 src2)); + format %{ "vdivps $dst,$src1,$src2\t! div packed8F" %} + ins_encode %{ + bool vector256 = true; + __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (DivVF src (LoadVector mem))); + format %{ "vdivps $dst,$src,$mem\t! div packed8F" %} + ins_encode %{ + bool vector256 = true; + __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// Doubles vector div +instruct vdiv2D(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (DivVD dst src)); + format %{ "divpd $dst,$src\t! div packed2D" %} + ins_encode %{ + __ divpd($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (DivVD src1 src2)); + format %{ "vdivpd $dst,$src1,$src2\t! 
div packed2D" %} + ins_encode %{ + bool vector256 = false; + __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (DivVD src (LoadVector mem))); + format %{ "vdivpd $dst,$src,$mem\t! div packed2D" %} + ins_encode %{ + bool vector256 = false; + __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (DivVD src1 src2)); + format %{ "vdivpd $dst,$src1,$src2\t! div packed4D" %} + ins_encode %{ + bool vector256 = true; + __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (DivVD src (LoadVector mem))); + format %{ "vdivpd $dst,$src,$mem\t! div packed4D" %} + ins_encode %{ + bool vector256 = true; + __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// ------------------------------ LeftShift ----------------------------------- + +// Shorts/Chars vector left shift +instruct vsll2S(vecS dst, regF shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (LShiftVS dst shift)); + format %{ "psllw $dst,$shift\t! left shift packed2S" %} + ins_encode %{ + __ psllw($dst$$XMMRegister, $shift$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll2S_imm(vecS dst, immI8 shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (LShiftVS dst shift)); + format %{ "psllw $dst,$shift\t! 
left shift packed2S" %} + ins_encode %{ + __ psllw($dst$$XMMRegister, (int)$shift$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll2S_reg(vecS dst, vecS src, regF shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (LShiftVS src shift)); + format %{ "vpsllw $dst,$src,$shift\t! left shift packed2S" %} + ins_encode %{ + bool vector256 = false; + __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (LShiftVS src shift)); + format %{ "vpsllw $dst,$src,$shift\t! left shift packed2S" %} + ins_encode %{ + bool vector256 = false; + __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll4S(vecD dst, regF shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (LShiftVS dst shift)); + format %{ "psllw $dst,$shift\t! left shift packed4S" %} + ins_encode %{ + __ psllw($dst$$XMMRegister, $shift$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll4S_imm(vecD dst, immI8 shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (LShiftVS dst shift)); + format %{ "psllw $dst,$shift\t! left shift packed4S" %} + ins_encode %{ + __ psllw($dst$$XMMRegister, (int)$shift$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll4S_reg(vecD dst, vecD src, regF shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (LShiftVS src shift)); + format %{ "vpsllw $dst,$src,$shift\t! 
left shift packed4S" %} + ins_encode %{ + bool vector256 = false; + __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (LShiftVS src shift)); + format %{ "vpsllw $dst,$src,$shift\t! left shift packed4S" %} + ins_encode %{ + bool vector256 = false; + __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll8S(vecX dst, regF shift) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (LShiftVS dst shift)); + format %{ "psllw $dst,$shift\t! left shift packed8S" %} + ins_encode %{ + __ psllw($dst$$XMMRegister, $shift$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll8S_imm(vecX dst, immI8 shift) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (LShiftVS dst shift)); + format %{ "psllw $dst,$shift\t! left shift packed8S" %} + ins_encode %{ + __ psllw($dst$$XMMRegister, (int)$shift$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll8S_reg(vecX dst, vecX src, regF shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (LShiftVS src shift)); + format %{ "vpsllw $dst,$src,$shift\t! left shift packed8S" %} + ins_encode %{ + bool vector256 = false; + __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (LShiftVS src shift)); + format %{ "vpsllw $dst,$src,$shift\t! 
left shift packed8S" %} + ins_encode %{ + bool vector256 = false; + __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll16S_reg(vecY dst, vecY src, regF shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 16); + match(Set dst (LShiftVS src shift)); + format %{ "vpsllw $dst,$src,$shift\t! left shift packed16S" %} + ins_encode %{ + bool vector256 = true; + __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 16); + match(Set dst (LShiftVS src shift)); + format %{ "vpsllw $dst,$src,$shift\t! left shift packed16S" %} + ins_encode %{ + bool vector256 = true; + __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// Integers vector left shift +instruct vsll2I(vecD dst, regF shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (LShiftVI dst shift)); + format %{ "pslld $dst,$shift\t! left shift packed2I" %} + ins_encode %{ + __ pslld($dst$$XMMRegister, $shift$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll2I_imm(vecD dst, immI8 shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (LShiftVI dst shift)); + format %{ "pslld $dst,$shift\t! left shift packed2I" %} + ins_encode %{ + __ pslld($dst$$XMMRegister, (int)$shift$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll2I_reg(vecD dst, vecD src, regF shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (LShiftVI src shift)); + format %{ "vpslld $dst,$src,$shift\t! 
left shift packed2I" %} + ins_encode %{ + bool vector256 = false; + __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (LShiftVI src shift)); + format %{ "vpslld $dst,$src,$shift\t! left shift packed2I" %} + ins_encode %{ + bool vector256 = false; + __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll4I(vecX dst, regF shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (LShiftVI dst shift)); + format %{ "pslld $dst,$shift\t! left shift packed4I" %} + ins_encode %{ + __ pslld($dst$$XMMRegister, $shift$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll4I_imm(vecX dst, immI8 shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (LShiftVI dst shift)); + format %{ "pslld $dst,$shift\t! left shift packed4I" %} + ins_encode %{ + __ pslld($dst$$XMMRegister, (int)$shift$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll4I_reg(vecX dst, vecX src, regF shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (LShiftVI src shift)); + format %{ "vpslld $dst,$src,$shift\t! left shift packed4I" %} + ins_encode %{ + bool vector256 = false; + __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (LShiftVI src shift)); + format %{ "vpslld $dst,$src,$shift\t! 
left shift packed4I" %} + ins_encode %{ + bool vector256 = false; + __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll8I_reg(vecY dst, vecY src, regF shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 8); + match(Set dst (LShiftVI src shift)); + format %{ "vpslld $dst,$src,$shift\t! left shift packed8I" %} + ins_encode %{ + bool vector256 = true; + __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 8); + match(Set dst (LShiftVI src shift)); + format %{ "vpslld $dst,$src,$shift\t! left shift packed8I" %} + ins_encode %{ + bool vector256 = true; + __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// Longs vector left shift +instruct vsll2L(vecX dst, regF shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (LShiftVL dst shift)); + format %{ "psllq $dst,$shift\t! left shift packed2L" %} + ins_encode %{ + __ psllq($dst$$XMMRegister, $shift$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll2L_imm(vecX dst, immI8 shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (LShiftVL dst shift)); + format %{ "psllq $dst,$shift\t! left shift packed2L" %} + ins_encode %{ + __ psllq($dst$$XMMRegister, (int)$shift$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll2L_reg(vecX dst, vecX src, regF shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (LShiftVL src shift)); + format %{ "vpsllq $dst,$src,$shift\t! 
left shift packed2L" %} + ins_encode %{ + bool vector256 = false; + __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (LShiftVL src shift)); + format %{ "vpsllq $dst,$src,$shift\t! left shift packed2L" %} + ins_encode %{ + bool vector256 = false; + __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll4L_reg(vecY dst, vecY src, regF shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 4); + match(Set dst (LShiftVL src shift)); + format %{ "vpsllq $dst,$src,$shift\t! left shift packed4L" %} + ins_encode %{ + bool vector256 = true; + __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 4); + match(Set dst (LShiftVL src shift)); + format %{ "vpsllq $dst,$src,$shift\t! left shift packed4L" %} + ins_encode %{ + bool vector256 = true; + __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// ----------------------- LogicalRightShift ----------------------------------- + +// Shorts/Chars vector logical right shift produces incorrect Java result +// for negative data because java code convert short value into int with +// sign extension before a shift. + +// Integers vector logical right shift +instruct vsrl2I(vecD dst, regF shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (URShiftVI dst shift)); + format %{ "psrld $dst,$shift\t! 
logical right shift packed2I" %} + ins_encode %{ + __ psrld($dst$$XMMRegister, $shift$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl2I_imm(vecD dst, immI8 shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (URShiftVI dst shift)); + format %{ "psrld $dst,$shift\t! logical right shift packed2I" %} + ins_encode %{ + __ psrld($dst$$XMMRegister, (int)$shift$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl2I_reg(vecD dst, vecD src, regF shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (URShiftVI src shift)); + format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed2I" %} + ins_encode %{ + bool vector256 = false; + __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (URShiftVI src shift)); + format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed2I" %} + ins_encode %{ + bool vector256 = false; + __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl4I(vecX dst, regF shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (URShiftVI dst shift)); + format %{ "psrld $dst,$shift\t! logical right shift packed4I" %} + ins_encode %{ + __ psrld($dst$$XMMRegister, $shift$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl4I_imm(vecX dst, immI8 shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (URShiftVI dst shift)); + format %{ "psrld $dst,$shift\t! 
logical right shift packed4I" %} + ins_encode %{ + __ psrld($dst$$XMMRegister, (int)$shift$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl4I_reg(vecX dst, vecX src, regF shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (URShiftVI src shift)); + format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed4I" %} + ins_encode %{ + bool vector256 = false; + __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (URShiftVI src shift)); + format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed4I" %} + ins_encode %{ + bool vector256 = false; + __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl8I_reg(vecY dst, vecY src, regF shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 8); + match(Set dst (URShiftVI src shift)); + format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed8I" %} + ins_encode %{ + bool vector256 = true; + __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 8); + match(Set dst (URShiftVI src shift)); + format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed8I" %} + ins_encode %{ + bool vector256 = true; + __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// Longs vector logical right shift +instruct vsrl2L(vecX dst, regF shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (URShiftVL dst shift)); + format %{ "psrlq $dst,$shift\t! 
logical right shift packed2L" %} + ins_encode %{ + __ psrlq($dst$$XMMRegister, $shift$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl2L_imm(vecX dst, immI8 shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (URShiftVL dst shift)); + format %{ "psrlq $dst,$shift\t! logical right shift packed2L" %} + ins_encode %{ + __ psrlq($dst$$XMMRegister, (int)$shift$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl2L_reg(vecX dst, vecX src, regF shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (URShiftVL src shift)); + format %{ "vpsrlq $dst,$src,$shift\t! logical right shift packed2L" %} + ins_encode %{ + bool vector256 = false; + __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (URShiftVL src shift)); + format %{ "vpsrlq $dst,$src,$shift\t! logical right shift packed2L" %} + ins_encode %{ + bool vector256 = false; + __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl4L_reg(vecY dst, vecY src, regF shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 4); + match(Set dst (URShiftVL src shift)); + format %{ "vpsrlq $dst,$src,$shift\t! logical right shift packed4L" %} + ins_encode %{ + bool vector256 = true; + __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 4); + match(Set dst (URShiftVL src shift)); + format %{ "vpsrlq $dst,$src,$shift\t! 
logical right shift packed4L" %} + ins_encode %{ + bool vector256 = true; + __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// ------------------- ArithmeticRightShift ----------------------------------- + +// Shorts/Chars vector arithmetic right shift +instruct vsra2S(vecS dst, regF shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (RShiftVS dst shift)); + format %{ "psraw $dst,$shift\t! arithmetic right shift packed2S" %} + ins_encode %{ + __ psraw($dst$$XMMRegister, $shift$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra2S_imm(vecS dst, immI8 shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (RShiftVS dst shift)); + format %{ "psraw $dst,$shift\t! arithmetic right shift packed2S" %} + ins_encode %{ + __ psraw($dst$$XMMRegister, (int)$shift$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra2S_reg(vecS dst, vecS src, regF shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (RShiftVS src shift)); + format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed2S" %} + ins_encode %{ + bool vector256 = false; + __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (RShiftVS src shift)); + format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed2S" %} + ins_encode %{ + bool vector256 = false; + __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra4S(vecD dst, regF shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (RShiftVS dst shift)); + format %{ "psraw $dst,$shift\t! 
arithmetic right shift packed4S" %} + ins_encode %{ + __ psraw($dst$$XMMRegister, $shift$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra4S_imm(vecD dst, immI8 shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (RShiftVS dst shift)); + format %{ "psraw $dst,$shift\t! arithmetic right shift packed4S" %} + ins_encode %{ + __ psraw($dst$$XMMRegister, (int)$shift$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra4S_reg(vecD dst, vecD src, regF shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (RShiftVS src shift)); + format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed4S" %} + ins_encode %{ + bool vector256 = false; + __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (RShiftVS src shift)); + format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed4S" %} + ins_encode %{ + bool vector256 = false; + __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra8S(vecX dst, regF shift) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (RShiftVS dst shift)); + format %{ "psraw $dst,$shift\t! arithmetic right shift packed8S" %} + ins_encode %{ + __ psraw($dst$$XMMRegister, $shift$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra8S_imm(vecX dst, immI8 shift) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (RShiftVS dst shift)); + format %{ "psraw $dst,$shift\t! 
arithmetic right shift packed8S" %} + ins_encode %{ + __ psraw($dst$$XMMRegister, (int)$shift$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra8S_reg(vecX dst, vecX src, regF shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (RShiftVS src shift)); + format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed8S" %} + ins_encode %{ + bool vector256 = false; + __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (RShiftVS src shift)); + format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed8S" %} + ins_encode %{ + bool vector256 = false; + __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra16S_reg(vecY dst, vecY src, regF shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 16); + match(Set dst (RShiftVS src shift)); + format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed16S" %} + ins_encode %{ + bool vector256 = true; + __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 16); + match(Set dst (RShiftVS src shift)); + format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed16S" %} + ins_encode %{ + bool vector256 = true; + __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// Integers vector arithmetic right shift +instruct vsra2I(vecD dst, regF shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (RShiftVI dst shift)); + format %{ "psrad $dst,$shift\t! 
arithmetic right shift packed2I" %} + ins_encode %{ + __ psrad($dst$$XMMRegister, $shift$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra2I_imm(vecD dst, immI8 shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (RShiftVI dst shift)); + format %{ "psrad $dst,$shift\t! arithmetic right shift packed2I" %} + ins_encode %{ + __ psrad($dst$$XMMRegister, (int)$shift$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra2I_reg(vecD dst, vecD src, regF shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (RShiftVI src shift)); + format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed2I" %} + ins_encode %{ + bool vector256 = false; + __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (RShiftVI src shift)); + format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed2I" %} + ins_encode %{ + bool vector256 = false; + __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra4I(vecX dst, regF shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (RShiftVI dst shift)); + format %{ "psrad $dst,$shift\t! arithmetic right shift packed4I" %} + ins_encode %{ + __ psrad($dst$$XMMRegister, $shift$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra4I_imm(vecX dst, immI8 shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (RShiftVI dst shift)); + format %{ "psrad $dst,$shift\t! 
arithmetic right shift packed4I" %} + ins_encode %{ + __ psrad($dst$$XMMRegister, (int)$shift$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra4I_reg(vecX dst, vecX src, regF shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (RShiftVI src shift)); + format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed4I" %} + ins_encode %{ + bool vector256 = false; + __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (RShiftVI src shift)); + format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed4I" %} + ins_encode %{ + bool vector256 = false; + __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra8I_reg(vecY dst, vecY src, regF shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 8); + match(Set dst (RShiftVI src shift)); + format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed8I" %} + ins_encode %{ + bool vector256 = true; + __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 8); + match(Set dst (RShiftVI src shift)); + format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed8I" %} + ins_encode %{ + bool vector256 = true; + __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// There are no longs vector arithmetic right shift instructions. 
+ + +// --------------------------------- AND -------------------------------------- + +instruct vand4B(vecS dst, vecS src) %{ + predicate(n->as_Vector()->length_in_bytes() == 4); + match(Set dst (AndV dst src)); + format %{ "pand $dst,$src\t! and vectors (4 bytes)" %} + ins_encode %{ + __ pand($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4); + match(Set dst (AndV src1 src2)); + format %{ "vpand $dst,$src1,$src2\t! and vectors (4 bytes)" %} + ins_encode %{ + bool vector256 = false; + __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vand8B(vecD dst, vecD src) %{ + predicate(n->as_Vector()->length_in_bytes() == 8); + match(Set dst (AndV dst src)); + format %{ "pand $dst,$src\t! and vectors (8 bytes)" %} + ins_encode %{ + __ pand($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8); + match(Set dst (AndV src1 src2)); + format %{ "vpand $dst,$src1,$src2\t! and vectors (8 bytes)" %} + ins_encode %{ + bool vector256 = false; + __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vand16B(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length_in_bytes() == 16); + match(Set dst (AndV dst src)); + format %{ "pand $dst,$src\t! and vectors (16 bytes)" %} + ins_encode %{ + __ pand($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16); + match(Set dst (AndV src1 src2)); + format %{ "vpand $dst,$src1,$src2\t! 
and vectors (16 bytes)" %} + ins_encode %{ + bool vector256 = false; + __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vand16B_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16); + match(Set dst (AndV src (LoadVector mem))); + format %{ "vpand $dst,$src,$mem\t! and vectors (16 bytes)" %} + ins_encode %{ + bool vector256 = false; + __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32); + match(Set dst (AndV src1 src2)); + format %{ "vpand $dst,$src1,$src2\t! and vectors (32 bytes)" %} + ins_encode %{ + bool vector256 = true; + __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vand32B_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32); + match(Set dst (AndV src (LoadVector mem))); + format %{ "vpand $dst,$src,$mem\t! and vectors (32 bytes)" %} + ins_encode %{ + bool vector256 = true; + __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// --------------------------------- OR --------------------------------------- + +instruct vor4B(vecS dst, vecS src) %{ + predicate(n->as_Vector()->length_in_bytes() == 4); + match(Set dst (OrV dst src)); + format %{ "por $dst,$src\t! or vectors (4 bytes)" %} + ins_encode %{ + __ por($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4); + match(Set dst (OrV src1 src2)); + format %{ "vpor $dst,$src1,$src2\t! 
or vectors (4 bytes)" %} + ins_encode %{ + bool vector256 = false; + __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vor8B(vecD dst, vecD src) %{ + predicate(n->as_Vector()->length_in_bytes() == 8); + match(Set dst (OrV dst src)); + format %{ "por $dst,$src\t! or vectors (8 bytes)" %} + ins_encode %{ + __ por($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8); + match(Set dst (OrV src1 src2)); + format %{ "vpor $dst,$src1,$src2\t! or vectors (8 bytes)" %} + ins_encode %{ + bool vector256 = false; + __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vor16B(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length_in_bytes() == 16); + match(Set dst (OrV dst src)); + format %{ "por $dst,$src\t! or vectors (16 bytes)" %} + ins_encode %{ + __ por($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16); + match(Set dst (OrV src1 src2)); + format %{ "vpor $dst,$src1,$src2\t! or vectors (16 bytes)" %} + ins_encode %{ + bool vector256 = false; + __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vor16B_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16); + match(Set dst (OrV src (LoadVector mem))); + format %{ "vpor $dst,$src,$mem\t! 
or vectors (16 bytes)" %} + ins_encode %{ + bool vector256 = false; + __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32); + match(Set dst (OrV src1 src2)); + format %{ "vpor $dst,$src1,$src2\t! or vectors (32 bytes)" %} + ins_encode %{ + bool vector256 = true; + __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vor32B_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32); + match(Set dst (OrV src (LoadVector mem))); + format %{ "vpor $dst,$src,$mem\t! or vectors (32 bytes)" %} + ins_encode %{ + bool vector256 = true; + __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +// --------------------------------- XOR -------------------------------------- + +instruct vxor4B(vecS dst, vecS src) %{ + predicate(n->as_Vector()->length_in_bytes() == 4); + match(Set dst (XorV dst src)); + format %{ "pxor $dst,$src\t! xor vectors (4 bytes)" %} + ins_encode %{ + __ pxor($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4); + match(Set dst (XorV src1 src2)); + format %{ "vpxor $dst,$src1,$src2\t! xor vectors (4 bytes)" %} + ins_encode %{ + bool vector256 = false; + __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vxor8B(vecD dst, vecD src) %{ + predicate(n->as_Vector()->length_in_bytes() == 8); + match(Set dst (XorV dst src)); + format %{ "pxor $dst,$src\t! 
xor vectors (8 bytes)" %} + ins_encode %{ + __ pxor($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8); + match(Set dst (XorV src1 src2)); + format %{ "vpxor $dst,$src1,$src2\t! xor vectors (8 bytes)" %} + ins_encode %{ + bool vector256 = false; + __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vxor16B(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length_in_bytes() == 16); + match(Set dst (XorV dst src)); + format %{ "pxor $dst,$src\t! xor vectors (16 bytes)" %} + ins_encode %{ + __ pxor($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16); + match(Set dst (XorV src1 src2)); + format %{ "vpxor $dst,$src1,$src2\t! xor vectors (16 bytes)" %} + ins_encode %{ + bool vector256 = false; + __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16); + match(Set dst (XorV src (LoadVector mem))); + format %{ "vpxor $dst,$src,$mem\t! xor vectors (16 bytes)" %} + ins_encode %{ + bool vector256 = false; + __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32); + match(Set dst (XorV src1 src2)); + format %{ "vpxor $dst,$src1,$src2\t! 
xor vectors (32 bytes)" %} + ins_encode %{ + bool vector256 = true; + __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256); + %} + ins_pipe( pipe_slow ); +%} + +instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{ + predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32); + match(Set dst (XorV src (LoadVector mem))); + format %{ "vpxor $dst,$src,$mem\t! xor vectors (32 bytes)" %} + ins_encode %{ + bool vector256 = true; + __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256); + %} + ins_pipe( pipe_slow ); +%} + diff --git a/src/cpu/x86/vm/x86_32.ad b/src/cpu/x86/vm/x86_32.ad index b46d3d688..02e5b3224 100644 --- a/src/cpu/x86/vm/x86_32.ad +++ b/src/cpu/x86/vm/x86_32.ad @@ -1367,22 +1367,6 @@ int emit_deopt_handler(CodeBuffer& cbuf) { return offset; } - -const bool Matcher::match_rule_supported(int opcode) { - if (!has_match_rule(opcode)) - return false; - - switch (opcode) { - case Op_PopCountI: - case Op_PopCountL: - if (!UsePopCountInstruction) - return false; - break; - } - - return true; // Per default match rules are supported. -} - int Matcher::regnum_to_fpu_offset(int regnum) { return regnum - 32; // The FP registers are in the second chunk } diff --git a/src/cpu/x86/vm/x86_64.ad b/src/cpu/x86/vm/x86_64.ad index 68c6d3540..96b71b615 100644 --- a/src/cpu/x86/vm/x86_64.ad +++ b/src/cpu/x86/vm/x86_64.ad @@ -1513,22 +1513,6 @@ int emit_deopt_handler(CodeBuffer& cbuf) return offset; } - -const bool Matcher::match_rule_supported(int opcode) { - if (!has_match_rule(opcode)) - return false; - - switch (opcode) { - case Op_PopCountI: - case Op_PopCountL: - if (!UsePopCountInstruction) - return false; - break; - } - - return true; // Per default match rules are supported. 
-} - int Matcher::regnum_to_fpu_offset(int regnum) { return regnum - 32; // The FP registers are in the second chunk @@ -6427,6 +6411,31 @@ instruct castP2X(rRegL dst, rRegP src) ins_pipe(ialu_reg_reg); // XXX %} +// Convert oop into int for vectors alignment masking +instruct convP2I(rRegI dst, rRegP src) +%{ + match(Set dst (ConvL2I (CastP2X src))); + + format %{ "movl $dst, $src\t# ptr -> int" %} + ins_encode %{ + __ movl($dst$$Register, $src$$Register); + %} + ins_pipe(ialu_reg_reg); // XXX +%} + +// Convert compressed oop into int for vectors alignment masking +// in case of 32bit oops (heap < 4Gb). +instruct convN2I(rRegI dst, rRegN src) +%{ + predicate(Universe::narrow_oop_shift() == 0); + match(Set dst (ConvL2I (CastP2X (DecodeN src)))); + + format %{ "movl $dst, $src\t# compressed ptr -> int" %} + ins_encode %{ + __ movl($dst$$Register, $src$$Register); + %} + ins_pipe(ialu_reg_reg); // XXX +%} // Convert oop pointer into compressed form instruct encodeHeapOop(rRegN dst, rRegP src, rFlagsReg cr) %{ @@ -10049,11 +10058,10 @@ instruct MoveD2L_reg_reg(rRegL dst, regD src) %{ ins_pipe( pipe_slow ); %} -// The next instructions have long latency and use Int unit. Set high cost. 
instruct MoveI2F_reg_reg(regF dst, rRegI src) %{ match(Set dst (MoveI2F src)); effect(DEF dst, USE src); - ins_cost(300); + ins_cost(100); format %{ "movd $dst,$src\t# MoveI2F" %} ins_encode %{ __ movdl($dst$$XMMRegister, $src$$Register); @@ -10064,7 +10072,7 @@ instruct MoveI2F_reg_reg(regF dst, rRegI src) %{ instruct MoveL2D_reg_reg(regD dst, rRegL src) %{ match(Set dst (MoveL2D src)); effect(DEF dst, USE src); - ins_cost(300); + ins_cost(100); format %{ "movd $dst,$src\t# MoveL2D" %} ins_encode %{ __ movdq($dst$$XMMRegister, $src$$Register); diff --git a/src/share/vm/opto/classes.hpp b/src/share/vm/opto/classes.hpp index bdf18b51f..02c62f68a 100644 --- a/src/share/vm/opto/classes.hpp +++ b/src/share/vm/opto/classes.hpp @@ -256,6 +256,8 @@ macro(SubVI) macro(SubVL) macro(SubVF) macro(SubVD) +macro(MulVS) +macro(MulVI) macro(MulVF) macro(MulVD) macro(DivVF) @@ -263,9 +265,15 @@ macro(DivVD) macro(LShiftVB) macro(LShiftVS) macro(LShiftVI) +macro(LShiftVL) macro(RShiftVB) macro(RShiftVS) macro(RShiftVI) +macro(RShiftVL) +macro(URShiftVB) +macro(URShiftVS) +macro(URShiftVI) +macro(URShiftVL) macro(AndV) macro(OrV) macro(XorV) diff --git a/src/share/vm/opto/loopnode.cpp b/src/share/vm/opto/loopnode.cpp index 43def7314..ae267c589 100644 --- a/src/share/vm/opto/loopnode.cpp +++ b/src/share/vm/opto/loopnode.cpp @@ -1773,6 +1773,8 @@ void IdealLoopTree::dump_head( ) const { if (stride_con > 0) tty->print("+"); tty->print("%d", stride_con); + tty->print(" (%d iters) ", (int)cl->profile_trip_cnt()); + if (cl->is_pre_loop ()) tty->print(" pre" ); if (cl->is_main_loop()) tty->print(" main"); if (cl->is_post_loop()) tty->print(" post"); diff --git a/src/share/vm/opto/superword.cpp b/src/share/vm/opto/superword.cpp index cfeb47e50..62c1a8362 100644 --- a/src/share/vm/opto/superword.cpp +++ b/src/share/vm/opto/superword.cpp @@ -1357,6 +1357,12 @@ void SuperWord::output() { // Promote operands to vector Node* in1 = vector_opd(p, 1); Node* in2 = vector_opd(p, 2); + if 
(VectorNode::is_invariant_vector(in1) && (n->is_Add() || n->is_Mul())) { + // Move invariant vector input into second position to avoid register spilling. + Node* tmp = in1; + in1 = in2; + in2 = tmp; + } vn = VectorNode::make(_phase->C, opc, in1, in2, vlen, velt_basic_type(n)); } else { ShouldNotReachHere(); @@ -1400,6 +1406,36 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) { if (opd->is_Vector() || opd->is_LoadVector()) { return opd; // input is matching vector } + if ((opd_idx == 2) && VectorNode::is_shift(p0)) { + // No vector is needed for shift count. + // Vector instructions do not mask shift count, do it here. + Compile* C = _phase->C; + Node* cnt = opd; + juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1); + const TypeInt* t = opd->find_int_type(); + if (t != NULL && t->is_con()) { + juint shift = t->get_con(); + if (shift > mask) { // Unsigned cmp + cnt = ConNode::make(C, TypeInt::make(shift & mask)); + } + } else { + if (t == NULL || t->_lo < 0 || t->_hi > (int)mask) { + cnt = ConNode::make(C, TypeInt::make(mask)); + _phase->_igvn.register_new_node_with_optimizer(cnt); + cnt = new (C, 3) AndINode(opd, cnt); + _phase->_igvn.register_new_node_with_optimizer(cnt); + _phase->set_ctrl(cnt, _phase->get_ctrl(opd)); + } + assert(opd->bottom_type()->isa_int(), "int type only"); + // Move non constant shift count into XMM register. + cnt = new (_phase->C, 2) MoveI2FNode(cnt); + } + if (cnt != opd) { + _phase->_igvn.register_new_node_with_optimizer(cnt); + _phase->set_ctrl(cnt, _phase->get_ctrl(opd)); + } + return cnt; + } assert(!opd->is_StoreVector(), "such vector is not expected here"); // Convert scalar input to vector with the same number of elements as // p0's vector. 
Use p0's type because size of operand's container in @@ -1718,38 +1754,28 @@ void SuperWord::compute_vector_element_type() { for (int i = _block.length() - 1; i >= 0; i--) { Node* n = _block.at(i); // Only integer types need be examined - if (n->bottom_type()->isa_int()) { + const Type* vt = velt_type(n); + if (vt->basic_type() == T_INT) { uint start, end; vector_opd_range(n, &start, &end); const Type* vt = velt_type(n); for (uint j = start; j < end; j++) { Node* in = n->in(j); - // Don't propagate through a type conversion - if (n->bottom_type() != in->bottom_type()) - continue; - switch(in->Opcode()) { - case Op_AddI: case Op_AddL: - case Op_SubI: case Op_SubL: - case Op_MulI: case Op_MulL: - case Op_AndI: case Op_AndL: - case Op_OrI: case Op_OrL: - case Op_XorI: case Op_XorL: - case Op_LShiftI: case Op_LShiftL: - case Op_CMoveI: case Op_CMoveL: - if (in_bb(in)) { - bool same_type = true; - for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) { - Node *use = in->fast_out(k); - if (!in_bb(use) || !same_velt_type(use, n)) { - same_type = false; - break; - } - } - if (same_type) { - set_velt_type(in, vt); + // Don't propagate through a memory + if (!in->is_Mem() && in_bb(in) && velt_type(in)->basic_type() == T_INT && + data_size(n) < data_size(in)) { + bool same_type = true; + for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) { + Node *use = in->fast_out(k); + if (!in_bb(use) || !same_velt_type(use, n)) { + same_type = false; + break; } } + if (same_type) { + set_velt_type(in, vt); + } } } } @@ -1792,10 +1818,8 @@ const Type* SuperWord::container_type(Node* n) { } const Type* t = _igvn.type(n); if (t->basic_type() == T_INT) { - if (t->higher_equal(TypeInt::BOOL)) return TypeInt::BOOL; - if (t->higher_equal(TypeInt::BYTE)) return TypeInt::BYTE; - if (t->higher_equal(TypeInt::CHAR)) return TypeInt::CHAR; - if (t->higher_equal(TypeInt::SHORT)) return TypeInt::SHORT; + // A narrow type of arithmetic operations will be determined by + // 
propagating the type of memory operations. return TypeInt::INT; } return t; @@ -1940,7 +1964,7 @@ void SuperWord::align_initial_loop_index(MemNode* align_to_ref) { // lim0 == original pre loop limit // V == v_align (power of 2) // invar == extra invariant piece of the address expression - // e == k [ +/- invar ] + // e == offset [ +/- invar ] // // When reassociating expressions involving '%' the basic rules are: // (a - b) % k == 0 => a % k == b % k @@ -1993,13 +2017,12 @@ void SuperWord::align_initial_loop_index(MemNode* align_to_ref) { int elt_size = align_to_ref_p.memory_size(); int v_align = vw / elt_size; assert(v_align > 1, "sanity"); - int k = align_to_ref_p.offset_in_bytes() / elt_size; - - Node *kn = _igvn.intcon(k); + int offset = align_to_ref_p.offset_in_bytes() / elt_size; + Node *offsn = _igvn.intcon(offset); - Node *e = kn; + Node *e = offsn; if (align_to_ref_p.invar() != NULL) { - // incorporate any extra invariant piece producing k +/- invar >>> log2(elt) + // incorporate any extra invariant piece producing (offset +/- invar) >>> log2(elt) Node* log2_elt = _igvn.intcon(exact_log2(elt_size)); Node* aref = new (_phase->C, 3) URShiftINode(align_to_ref_p.invar(), log2_elt); _phase->_igvn.register_new_node_with_optimizer(aref); @@ -2014,15 +2037,15 @@ void SuperWord::align_initial_loop_index(MemNode* align_to_ref) { } if (vw > ObjectAlignmentInBytes) { // incorporate base e +/- base && Mask >>> log2(elt) - Node* mask = _igvn.MakeConX(~(-1 << exact_log2(vw))); Node* xbase = new(_phase->C, 2) CastP2XNode(NULL, align_to_ref_p.base()); _phase->_igvn.register_new_node_with_optimizer(xbase); - Node* masked_xbase = new (_phase->C, 3) AndXNode(xbase, mask); - _phase->_igvn.register_new_node_with_optimizer(masked_xbase); #ifdef _LP64 - masked_xbase = new (_phase->C, 2) ConvL2INode(masked_xbase); - _phase->_igvn.register_new_node_with_optimizer(masked_xbase); + xbase = new (_phase->C, 2) ConvL2INode(xbase); + _phase->_igvn.register_new_node_with_optimizer(xbase); 
#endif + Node* mask = _igvn.intcon(vw-1); + Node* masked_xbase = new (_phase->C, 3) AndINode(xbase, mask); + _phase->_igvn.register_new_node_with_optimizer(masked_xbase); Node* log2_elt = _igvn.intcon(exact_log2(elt_size)); Node* bref = new (_phase->C, 3) URShiftINode(masked_xbase, log2_elt); _phase->_igvn.register_new_node_with_optimizer(bref); diff --git a/src/share/vm/opto/vectornode.cpp b/src/share/vm/opto/vectornode.cpp index c786754cd..a0144df90 100644 --- a/src/share/vm/opto/vectornode.cpp +++ b/src/share/vm/opto/vectornode.cpp @@ -69,6 +69,15 @@ int VectorNode::opcode(int sopc, uint vlen, BasicType bt) { case Op_SubD: assert(bt == T_DOUBLE, "must be"); return Op_SubVD; + case Op_MulI: + switch (bt) { + case T_BOOLEAN: + case T_BYTE: return 0; // Unimplemented + case T_CHAR: + case T_SHORT: return Op_MulVS; + case T_INT: return Matcher::match_rule_supported(Op_MulVI) ? Op_MulVI : 0; // SSE4_1 + } + ShouldNotReachHere(); case Op_MulF: assert(bt == T_FLOAT, "must be"); return Op_MulVF; @@ -90,6 +99,9 @@ int VectorNode::opcode(int sopc, uint vlen, BasicType bt) { case T_INT: return Op_LShiftVI; } ShouldNotReachHere(); + case Op_LShiftL: + assert(bt == T_LONG, "must be"); + return Op_LShiftVL; case Op_RShiftI: switch (bt) { case T_BOOLEAN: @@ -99,6 +111,21 @@ int VectorNode::opcode(int sopc, uint vlen, BasicType bt) { case T_INT: return Op_RShiftVI; } ShouldNotReachHere(); + case Op_RShiftL: + assert(bt == T_LONG, "must be"); + return Op_RShiftVL; + case Op_URShiftI: + switch (bt) { + case T_BOOLEAN: + case T_BYTE: return Op_URShiftVB; + case T_CHAR: + case T_SHORT: return Op_URShiftVS; + case T_INT: return Op_URShiftVI; + } + ShouldNotReachHere(); + case Op_URShiftL: + assert(bt == T_LONG, "must be"); + return Op_URShiftVL; case Op_AndI: case Op_AndL: return Op_AndV; @@ -140,6 +167,34 @@ bool VectorNode::implemented(int opc, uint vlen, BasicType bt) { return false; } +bool VectorNode::is_shift(Node* n) { + switch (n->Opcode()) { + case Op_LShiftI: + case 
Op_LShiftL: + case Op_RShiftI: + case Op_RShiftL: + case Op_URShiftI: + case Op_URShiftL: + return true; + } + return false; +} + +// Check if input is loop invariant vector. +bool VectorNode::is_invariant_vector(Node* n) { + // Only Replicate vector nodes are loop invariant for now. + switch (n->Opcode()) { + case Op_ReplicateB: + case Op_ReplicateS: + case Op_ReplicateI: + case Op_ReplicateL: + case Op_ReplicateF: + case Op_ReplicateD: + return true; + } + return false; +} + // Return the vector version of a scalar operation node. VectorNode* VectorNode::make(Compile* C, int opc, Node* n1, Node* n2, uint vlen, BasicType bt) { const TypeVect* vt = TypeVect::make(bt, vlen); @@ -160,6 +215,8 @@ VectorNode* VectorNode::make(Compile* C, int opc, Node* n1, Node* n2, uint vlen, case Op_SubVF: return new (C, 3) SubVFNode(n1, n2, vt); case Op_SubVD: return new (C, 3) SubVDNode(n1, n2, vt); + case Op_MulVS: return new (C, 3) MulVSNode(n1, n2, vt); + case Op_MulVI: return new (C, 3) MulVINode(n1, n2, vt); case Op_MulVF: return new (C, 3) MulVFNode(n1, n2, vt); case Op_MulVD: return new (C, 3) MulVDNode(n1, n2, vt); @@ -169,10 +226,17 @@ VectorNode* VectorNode::make(Compile* C, int opc, Node* n1, Node* n2, uint vlen, case Op_LShiftVB: return new (C, 3) LShiftVBNode(n1, n2, vt); case Op_LShiftVS: return new (C, 3) LShiftVSNode(n1, n2, vt); case Op_LShiftVI: return new (C, 3) LShiftVINode(n1, n2, vt); + case Op_LShiftVL: return new (C, 3) LShiftVLNode(n1, n2, vt); case Op_RShiftVB: return new (C, 3) RShiftVBNode(n1, n2, vt); case Op_RShiftVS: return new (C, 3) RShiftVSNode(n1, n2, vt); case Op_RShiftVI: return new (C, 3) RShiftVINode(n1, n2, vt); + case Op_RShiftVL: return new (C, 3) RShiftVLNode(n1, n2, vt); + + case Op_URShiftVB: return new (C, 3) URShiftVBNode(n1, n2, vt); + case Op_URShiftVS: return new (C, 3) URShiftVSNode(n1, n2, vt); + case Op_URShiftVI: return new (C, 3) URShiftVINode(n1, n2, vt); + case Op_URShiftVL: return new (C, 3) URShiftVLNode(n1, n2, vt); case
Op_AndV: return new (C, 3) AndVNode(n1, n2, vt); case Op_OrV: return new (C, 3) OrVNode (n1, n2, vt); diff --git a/src/share/vm/opto/vectornode.hpp b/src/share/vm/opto/vectornode.hpp index 602ee94c5..9b01e638c 100644 --- a/src/share/vm/opto/vectornode.hpp +++ b/src/share/vm/opto/vectornode.hpp @@ -46,6 +46,7 @@ class VectorNode : public TypeNode { const TypeVect* vect_type() const { return type()->is_vect(); } uint length() const { return vect_type()->length(); } // Vector length + uint length_in_bytes() const { return vect_type()->length_in_bytes(); } virtual int Opcode() const; @@ -57,7 +58,8 @@ class VectorNode : public TypeNode { static int opcode(int opc, uint vlen, BasicType bt); static bool implemented(int opc, uint vlen, BasicType bt); - + static bool is_shift(Node* n); + static bool is_invariant_vector(Node* n); }; //===========================Vector=ALU=Operations==================================== @@ -158,6 +160,22 @@ class SubVDNode : public VectorNode { virtual int Opcode() const; }; +//------------------------------MulVSNode--------------------------------------- +// Vector multiply short +class MulVSNode : public VectorNode { + public: + MulVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} + virtual int Opcode() const; +}; + +//------------------------------MulVINode--------------------------------------- +// Vector multiply int +class MulVINode : public VectorNode { + public: + MulVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} + virtual int Opcode() const; +}; + //------------------------------MulVFNode--------------------------------------- // Vector multiply float class MulVFNode : public VectorNode { @@ -191,7 +209,7 @@ class DivVDNode : public VectorNode { }; //------------------------------LShiftVBNode--------------------------------------- -// Vector lshift byte +// Vector left shift bytes class LShiftVBNode : public VectorNode { public: LShiftVBNode(Node* in1, Node* in2, const 
TypeVect* vt) : VectorNode(in1,in2,vt) {} @@ -199,7 +217,7 @@ class LShiftVBNode : public VectorNode { }; //------------------------------LShiftVSNode--------------------------------------- -// Vector lshift shorts +// Vector left shift shorts class LShiftVSNode : public VectorNode { public: LShiftVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} @@ -207,39 +225,88 @@ class LShiftVSNode : public VectorNode { }; //------------------------------LShiftVINode--------------------------------------- -// Vector lshift ints +// Vector left shift ints class LShiftVINode : public VectorNode { public: LShiftVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} virtual int Opcode() const; }; -//------------------------------URShiftVBNode--------------------------------------- -// Vector urshift bytes +//------------------------------LShiftVLNode--------------------------------------- +// Vector left shift longs +class LShiftVLNode : public VectorNode { + public: + LShiftVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} + virtual int Opcode() const; +}; + +//------------------------------RShiftVBNode--------------------------------------- +// Vector right arithmetic (signed) shift bytes class RShiftVBNode : public VectorNode { public: RShiftVBNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} virtual int Opcode() const; }; -//------------------------------URShiftVSNode--------------------------------------- -// Vector urshift shorts +//------------------------------RShiftVSNode--------------------------------------- +// Vector right arithmetic (signed) shift shorts class RShiftVSNode : public VectorNode { public: RShiftVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} virtual int Opcode() const; }; -//------------------------------URShiftVINode--------------------------------------- -// Vector urshift ints 
+//------------------------------RShiftVINode--------------------------------------- +// Vector right arithmetic (signed) shift ints class RShiftVINode : public VectorNode { public: RShiftVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} virtual int Opcode() const; }; +//------------------------------RShiftVLNode--------------------------------------- +// Vector right arithmetic (signed) shift longs +class RShiftVLNode : public VectorNode { + public: + RShiftVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} + virtual int Opcode() const; +}; + +//------------------------------URShiftVBNode--------------------------------------- +// Vector right logical (unsigned) shift bytes +class URShiftVBNode : public VectorNode { + public: + URShiftVBNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} + virtual int Opcode() const; +}; + +//------------------------------URShiftVSNode--------------------------------------- +// Vector right logical (unsigned) shift shorts +class URShiftVSNode : public VectorNode { + public: + URShiftVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} + virtual int Opcode() const; +}; + +//------------------------------URShiftVINode--------------------------------------- +// Vector right logical (unsigned) shift ints +class URShiftVINode : public VectorNode { + public: + URShiftVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} + virtual int Opcode() const; +}; + +//------------------------------URShiftVLNode--------------------------------------- +// Vector right logical (unsigned) shift longs +class URShiftVLNode : public VectorNode { + public: + URShiftVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} + virtual int Opcode() const; +}; + + //------------------------------AndVNode--------------------------------------- -// Vector and +// Vector and integer class AndVNode : public VectorNode { 
public: AndVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} @@ -247,7 +314,7 @@ class AndVNode : public VectorNode { }; //------------------------------OrVNode--------------------------------------- -// Vector or +// Vector or integer class OrVNode : public VectorNode { public: OrVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} @@ -255,7 +322,7 @@ class OrVNode : public VectorNode { }; //------------------------------XorVNode--------------------------------------- -// Vector xor +// Vector xor integer class XorVNode : public VectorNode { public: XorVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {} diff --git a/test/compiler/6340864/TestByteVect.java b/test/compiler/6340864/TestByteVect.java new file mode 100644 index 000000000..ec4ba9fcd --- /dev/null +++ b/test/compiler/6340864/TestByteVect.java @@ -0,0 +1,1274 @@ +/* + * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +/** + * @test + * @bug 6340864 + * @summary Implement vectorization optimizations in hotspot-server + * + * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestByteVect + */ + +public class TestByteVect { + private static final int ARRLEN = 997; + private static final int ITERS = 11000; + private static final int ADD_INIT = 0; + private static final int BIT_MASK = 0xB7; + private static final int VALUE = 3; + private static final int SHIFT = 8; + + public static void main(String args[]) { + System.out.println("Testing Byte vectors"); + int errn = test(); + if (errn > 0) { + System.err.println("FAILED: " + errn + " errors"); + System.exit(97); + } + System.out.println("PASSED"); + } + + static int test() { + byte[] a0 = new byte[ARRLEN]; + byte[] a1 = new byte[ARRLEN]; + byte[] a2 = new byte[ARRLEN]; + byte[] a3 = new byte[ARRLEN]; + byte[] a4 = new byte[ARRLEN]; + short[] p2 = new short[ARRLEN/2]; + int[] p4 = new int[ARRLEN/4]; + long[] p8 = new long[ARRLEN/8]; + // Initialize + int gold_sum = 0; + for (int i=0; i>>VALUE)); + } + test_srlv(a0, a1, VALUE); + for (int i=0; i>>VALUE)); + } + + test_srac(a0, a1); + for (int i=0; i>VALUE)); + } + test_srav(a0, a1, VALUE); + for (int i=0; i>VALUE)); + } + + test_sllc_n(a0, a1); + for (int i=0; i>>(-VALUE))); + } + test_srlv(a0, a1, -VALUE); + for (int i=0; i>>(-VALUE))); + } + + test_srac_n(a0, a1); + for (int i=0; i>(-VALUE))); + } + test_srav(a0, a1, -VALUE); + for (int i=0; i>(-VALUE))); + } + + test_sllc_o(a0, a1); + for (int i=0; i>>SHIFT)); + } + test_srlv(a0, a1, SHIFT); + for (int i=0; i>>SHIFT)); + } + + test_srac_o(a0, a1); + for (int i=0; i>SHIFT)); + } + test_srav(a0, a1, SHIFT); + for (int i=0; i>SHIFT)); + } + + test_sllc_on(a0, a1); + for (int i=0; i>>(-SHIFT))); + } + test_srlv(a0, a1, -SHIFT); + for (int i=0; i>>(-SHIFT))); + } + + test_srac_on(a0, a1); + for (int i=0; i>(-SHIFT))); + } + test_srav(a0, a1, -SHIFT); + for (int i=0; i>(-SHIFT))); + } + + test_pack2(p2, a1); + for (int i=0; i 0) 
+ return errn; + + System.out.println("Time"); + long start, end; + + start = System.currentTimeMillis(); + for (int i=0; i>>VALUE); + } + } + static void test_srlc_n(byte[] a0, byte[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (byte)(a1[i]>>>(-VALUE)); + } + } + static void test_srlc_o(byte[] a0, byte[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (byte)(a1[i]>>>SHIFT); + } + } + static void test_srlc_on(byte[] a0, byte[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (byte)(a1[i]>>>(-SHIFT)); + } + } + static void test_srlv(byte[] a0, byte[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (byte)(a1[i]>>>b); + } + } + + static void test_srac(byte[] a0, byte[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (byte)(a1[i]>>VALUE); + } + } + static void test_srac_n(byte[] a0, byte[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (byte)(a1[i]>>(-VALUE)); + } + } + static void test_srac_o(byte[] a0, byte[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (byte)(a1[i]>>SHIFT); + } + } + static void test_srac_on(byte[] a0, byte[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (byte)(a1[i]>>(-SHIFT)); + } + } + static void test_srav(byte[] a0, byte[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (byte)(a1[i]>>b); + } + } + + static void test_pack2(short[] p2, byte[] a1) { + if (p2.length*2 > a1.length) return; + for (int i = 0; i < p2.length; i+=1) { + short l0 = (short)a1[i*2+0]; + short l1 = (short)a1[i*2+1]; + p2[i] = (short)((l1 << 8) | (l0 & 0xFF)); + } + } + static void test_unpack2(byte[] a0, short[] p2) { + if (p2.length*2 > a0.length) return; + for (int i = 0; i < p2.length; i+=1) { + short l = p2[i]; + a0[i*2+0] = (byte)(l & 0xFF); + a0[i*2+1] = (byte)(l >> 8); + } + } + static void test_pack2_swap(short[] p2, byte[] a1) { + if (p2.length*2 > a1.length) return; + for (int i = 0; i < p2.length; i+=1) { + short l0 = (short)a1[i*2+0]; + short l1 = 
(short)a1[i*2+1]; + p2[i] = (short)((l0 << 8) | (l1 & 0xFF)); + } + } + static void test_unpack2_swap(byte[] a0, short[] p2) { + if (p2.length*2 > a0.length) return; + for (int i = 0; i < p2.length; i+=1) { + short l = p2[i]; + a0[i*2+0] = (byte)(l >> 8); + a0[i*2+1] = (byte)(l & 0xFF); + } + } + + static void test_pack4(int[] p4, byte[] a1) { + if (p4.length*4 > a1.length) return; + for (int i = 0; i < p4.length; i+=1) { + int l0 = (int)a1[i*4+0]; + int l1 = (int)a1[i*4+1]; + int l2 = (int)a1[i*4+2]; + int l3 = (int)a1[i*4+3]; + p4[i] = (l0 & 0xFF) | + ((l1 & 0xFF) << 8) | + ((l2 & 0xFF) << 16) | + ((l3 & 0xFF) << 24); + } + } + static void test_unpack4(byte[] a0, int[] p4) { + if (p4.length*4 > a0.length) return; + for (int i = 0; i < p4.length; i+=1) { + int l = p4[i]; + a0[i*4+0] = (byte)(l & 0xFF); + a0[i*4+1] = (byte)(l >> 8); + a0[i*4+2] = (byte)(l >> 16); + a0[i*4+3] = (byte)(l >> 24); + } + } + static void test_pack4_swap(int[] p4, byte[] a1) { + if (p4.length*4 > a1.length) return; + for (int i = 0; i < p4.length; i+=1) { + int l0 = (int)a1[i*4+0]; + int l1 = (int)a1[i*4+1]; + int l2 = (int)a1[i*4+2]; + int l3 = (int)a1[i*4+3]; + p4[i] = (l3 & 0xFF) | + ((l2 & 0xFF) << 8) | + ((l1 & 0xFF) << 16) | + ((l0 & 0xFF) << 24); + } + } + static void test_unpack4_swap(byte[] a0, int[] p4) { + if (p4.length*4 > a0.length) return; + for (int i = 0; i < p4.length; i+=1) { + int l = p4[i]; + a0[i*4+0] = (byte)(l >> 24); + a0[i*4+1] = (byte)(l >> 16); + a0[i*4+2] = (byte)(l >> 8); + a0[i*4+3] = (byte)(l & 0xFF); + } + } + + static void test_pack8(long[] p8, byte[] a1) { + if (p8.length*8 > a1.length) return; + for (int i = 0; i < p8.length; i+=1) { + long l0 = (long)a1[i*8+0]; + long l1 = (long)a1[i*8+1]; + long l2 = (long)a1[i*8+2]; + long l3 = (long)a1[i*8+3]; + long l4 = (long)a1[i*8+4]; + long l5 = (long)a1[i*8+5]; + long l6 = (long)a1[i*8+6]; + long l7 = (long)a1[i*8+7]; + p8[i] = (l0 & 0xFFl) | + ((l1 & 0xFFl) << 8) | + ((l2 & 0xFFl) << 16) | + ((l3 & 0xFFl) << 
24) | + ((l4 & 0xFFl) << 32) | + ((l5 & 0xFFl) << 40) | + ((l6 & 0xFFl) << 48) | + ((l7 & 0xFFl) << 56); + } + } + static void test_unpack8(byte[] a0, long[] p8) { + if (p8.length*8 > a0.length) return; + for (int i = 0; i < p8.length; i+=1) { + long l = p8[i]; + a0[i*8+0] = (byte)(l & 0xFFl); + a0[i*8+1] = (byte)(l >> 8); + a0[i*8+2] = (byte)(l >> 16); + a0[i*8+3] = (byte)(l >> 24); + a0[i*8+4] = (byte)(l >> 32); + a0[i*8+5] = (byte)(l >> 40); + a0[i*8+6] = (byte)(l >> 48); + a0[i*8+7] = (byte)(l >> 56); + } + } + static void test_pack8_swap(long[] p8, byte[] a1) { + if (p8.length*8 > a1.length) return; + for (int i = 0; i < p8.length; i+=1) { + long l0 = (long)a1[i*8+0]; + long l1 = (long)a1[i*8+1]; + long l2 = (long)a1[i*8+2]; + long l3 = (long)a1[i*8+3]; + long l4 = (long)a1[i*8+4]; + long l5 = (long)a1[i*8+5]; + long l6 = (long)a1[i*8+6]; + long l7 = (long)a1[i*8+7]; + p8[i] = (l7 & 0xFFl) | + ((l6 & 0xFFl) << 8) | + ((l5 & 0xFFl) << 16) | + ((l4 & 0xFFl) << 24) | + ((l3 & 0xFFl) << 32) | + ((l2 & 0xFFl) << 40) | + ((l1 & 0xFFl) << 48) | + ((l0 & 0xFFl) << 56); + } + } + static void test_unpack8_swap(byte[] a0, long[] p8) { + if (p8.length*8 > a0.length) return; + for (int i = 0; i < p8.length; i+=1) { + long l = p8[i]; + a0[i*8+0] = (byte)(l >> 56); + a0[i*8+1] = (byte)(l >> 48); + a0[i*8+2] = (byte)(l >> 40); + a0[i*8+3] = (byte)(l >> 32); + a0[i*8+4] = (byte)(l >> 24); + a0[i*8+5] = (byte)(l >> 16); + a0[i*8+6] = (byte)(l >> 8); + a0[i*8+7] = (byte)(l & 0xFFl); + } + } + + static int verify(String text, int i, byte elem, byte val) { + if (elem != val) { + System.err.println(text + "[" + i + "] = " + elem + " != " + val); + return 1; + } + return 0; + } + + static int verify(String text, int i, short elem, short val) { + if (elem != val) { + System.err.println(text + "[" + i + "] = " + elem + " != " + val); + return 1; + } + return 0; + } + + static int verify(String text, int i, int elem, int val) { + if (elem != val) { + System.err.println(text + "[" + i + 
"] = " + Integer.toHexString(elem) + " != " + Integer.toHexString(val)); + return 1; + } + return 0; + } + + static int verify(String text, int i, long elem, long val) { + if (elem != val) { + System.err.println(text + "[" + i + "] = " + Long.toHexString(elem) + " != " + Long.toHexString(val)); + return 1; + } + return 0; + } +} diff --git a/test/compiler/6340864/TestDoubleVect.java b/test/compiler/6340864/TestDoubleVect.java new file mode 100644 index 000000000..db0f460dd --- /dev/null +++ b/test/compiler/6340864/TestDoubleVect.java @@ -0,0 +1,560 @@ +/* + * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +/** + * @test + * @bug 6340864 + * @summary Implement vectorization optimizations in hotspot-server + * + * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestDoubleVect + */ + +public class TestDoubleVect { + private static final int ARRLEN = 997; + private static final int ITERS = 11000; + private static final double ADD_INIT = -7500.; + private static final double VALUE = 15.; + + public static void main(String args[]) { + System.out.println("Testing Double vectors"); + int errn = test(); + if (errn > 0) { + System.err.println("FAILED: " + errn + " errors"); + System.exit(97); + } + System.out.println("PASSED"); + } + + static int test() { + double[] a0 = new double[ARRLEN]; + double[] a1 = new double[ARRLEN]; + double[] a2 = new double[ARRLEN]; + double[] a3 = new double[ARRLEN]; + // Initialize + double gold_sum = 0; + for (int i=0; i 0) + return errn; + + System.out.println("Time"); + long start, end; + + start = System.currentTimeMillis(); + for (int i=0; i 0) { + System.err.println("FAILED: " + errn + " errors"); + System.exit(97); + } + System.out.println("PASSED"); + } + + static int test() { + float[] a0 = new float[ARRLEN]; + float[] a1 = new float[ARRLEN]; + float[] a2 = new float[ARRLEN]; + float[] a3 = new float[ARRLEN]; + // Initialize + float gold_sum = 0; + for (int i=0; i 0) + return errn; + + System.out.println("Time"); + long start, end; + + start = System.currentTimeMillis(); + for (int i=0; i 0) { + System.err.println("FAILED: " + errn + " errors"); + System.exit(97); + } + System.out.println("PASSED"); + } + + static int test() { + int[] a0 = new int[ARRLEN]; + int[] a1 = new int[ARRLEN]; + int[] a2 = new int[ARRLEN]; + int[] a3 = new int[ARRLEN]; + int[] a4 = new int[ARRLEN]; + long[] p2 = new long[ARRLEN/2]; + // Initialize + int gold_sum = 0; + for (int i=0; i>>VALUE)); + } + test_srlv(a0, a1, VALUE); + for (int i=0; i>>VALUE)); + } + + test_srac(a0, a1); + for (int i=0; i>VALUE)); + } + test_srav(a0, a1, VALUE); + for (int 
i=0; i>VALUE)); + } + + test_sllc_n(a0, a1); + for (int i=0; i>>(-VALUE))); + } + test_srlv(a0, a1, -VALUE); + for (int i=0; i>>(-VALUE))); + } + + test_srac_n(a0, a1); + for (int i=0; i>(-VALUE))); + } + test_srav(a0, a1, -VALUE); + for (int i=0; i>(-VALUE))); + } + + test_sllc_o(a0, a1); + for (int i=0; i>>SHIFT)); + } + test_srlv(a0, a1, SHIFT); + for (int i=0; i>>SHIFT)); + } + + test_srac_o(a0, a1); + for (int i=0; i>SHIFT)); + } + test_srav(a0, a1, SHIFT); + for (int i=0; i>SHIFT)); + } + + test_sllc_on(a0, a1); + for (int i=0; i>>(-SHIFT))); + } + test_srlv(a0, a1, -SHIFT); + for (int i=0; i>>(-SHIFT))); + } + + test_srac_on(a0, a1); + for (int i=0; i>(-SHIFT))); + } + test_srav(a0, a1, -SHIFT); + for (int i=0; i>(-SHIFT))); + } + + test_pack2(p2, a1); + for (int i=0; i 0) + return errn; + + System.out.println("Time"); + long start, end; + + start = System.currentTimeMillis(); + for (int i=0; i>>VALUE); + } + } + static void test_srlc_n(int[] a0, int[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (int)(a1[i]>>>(-VALUE)); + } + } + static void test_srlc_o(int[] a0, int[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (int)(a1[i]>>>SHIFT); + } + } + static void test_srlc_on(int[] a0, int[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (int)(a1[i]>>>(-SHIFT)); + } + } + static void test_srlv(int[] a0, int[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (int)(a1[i]>>>b); + } + } + + static void test_srac(int[] a0, int[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (int)(a1[i]>>VALUE); + } + } + static void test_srac_n(int[] a0, int[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (int)(a1[i]>>(-VALUE)); + } + } + static void test_srac_o(int[] a0, int[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (int)(a1[i]>>SHIFT); + } + } + static void test_srac_on(int[] a0, int[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (int)(a1[i]>>(-SHIFT)); + } + } + static void 
test_srav(int[] a0, int[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (int)(a1[i]>>b); + } + } + + static void test_pack2(long[] p2, int[] a1) { + if (p2.length*2 > a1.length) return; + for (int i = 0; i < p2.length; i+=1) { + long l0 = (long)a1[i*2+0]; + long l1 = (long)a1[i*2+1]; + p2[i] = (l1 << 32) | (l0 & 0xFFFFFFFFl); + } + } + static void test_unpack2(int[] a0, long[] p2) { + if (p2.length*2 > a0.length) return; + for (int i = 0; i < p2.length; i+=1) { + long l = p2[i]; + a0[i*2+0] = (int)(l & 0xFFFFFFFFl); + a0[i*2+1] = (int)(l >> 32); + } + } + static void test_pack2_swap(long[] p2, int[] a1) { + if (p2.length*2 > a1.length) return; + for (int i = 0; i < p2.length; i+=1) { + long l0 = (long)a1[i*2+0]; + long l1 = (long)a1[i*2+1]; + p2[i] = (l0 << 32) | (l1 & 0xFFFFFFFFl); + } + } + static void test_unpack2_swap(int[] a0, long[] p2) { + if (p2.length*2 > a0.length) return; + for (int i = 0; i < p2.length; i+=1) { + long l = p2[i]; + a0[i*2+0] = (int)(l >> 32); + a0[i*2+1] = (int)(l & 0xFFFFFFFFl); + } + } + + static int verify(String text, int i, int elem, int val) { + if (elem != val) { + System.err.println(text + "[" + i + "] = " + elem + " != " + val); + return 1; + } + return 0; + } + + static int verify(String text, int i, long elem, long val) { + if (elem != val) { + System.err.println(text + "[" + i + "] = " + Long.toHexString(elem) + " != " + Long.toHexString(val)); + return 1; + } + return 0; + } +} diff --git a/test/compiler/6340864/TestLongVect.java b/test/compiler/6340864/TestLongVect.java new file mode 100644 index 000000000..70b41f4b4 --- /dev/null +++ b/test/compiler/6340864/TestLongVect.java @@ -0,0 +1,917 @@ +/* + * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +/** + * @test + * @bug 6340864 + * @summary Implement vectorization optimizations in hotspot-server + * + * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestLongVect + */ + +public class TestLongVect { + private static final int ARRLEN = 997; + private static final int ITERS = 11000; + private static final long ADD_INIT = Long.MAX_VALUE-500; + private static final long BIT_MASK = 0xEC80F731EC80F731L; + private static final int VALUE = 31; + private static final int SHIFT = 64; + + public static void main(String args[]) { + System.out.println("Testing Long vectors"); + int errn = test(); + if (errn > 0) { + System.err.println("FAILED: " + errn + " errors"); + System.exit(97); + } + System.out.println("PASSED"); + } + + static int test() { + long[] a0 = new long[ARRLEN]; + long[] a1 = new long[ARRLEN]; + long[] a2 = new long[ARRLEN]; + long[] a3 = new long[ARRLEN]; + long[] a4 = new long[ARRLEN]; + // Initialize + long gold_sum = 0; + for (int i=0; i>>VALUE)); + } + test_srlv(a0, a1, VALUE); + for (int i=0; i>>VALUE)); + } + + test_srac(a0, a1); + 
for (int i=0; i>VALUE)); + } + test_srav(a0, a1, VALUE); + for (int i=0; i>VALUE)); + } + + test_sllc_n(a0, a1); + for (int i=0; i>>(-VALUE))); + } + test_srlv(a0, a1, -VALUE); + for (int i=0; i>>(-VALUE))); + } + + test_srac_n(a0, a1); + for (int i=0; i>(-VALUE))); + } + test_srav(a0, a1, -VALUE); + for (int i=0; i>(-VALUE))); + } + + test_sllc_o(a0, a1); + for (int i=0; i>>SHIFT)); + } + test_srlv(a0, a1, SHIFT); + for (int i=0; i>>SHIFT)); + } + + test_srac_o(a0, a1); + for (int i=0; i>SHIFT)); + } + test_srav(a0, a1, SHIFT); + for (int i=0; i>SHIFT)); + } + + test_sllc_on(a0, a1); + for (int i=0; i>>(-SHIFT))); + } + test_srlv(a0, a1, -SHIFT); + for (int i=0; i>>(-SHIFT))); + } + + test_srac_on(a0, a1); + for (int i=0; i>(-SHIFT))); + } + test_srav(a0, a1, -SHIFT); + for (int i=0; i>(-SHIFT))); + } + + } + + if (errn > 0) + return errn; + + System.out.println("Time"); + long start, end; + + start = System.currentTimeMillis(); + for (int i=0; i>>VALUE); + } + } + static void test_srlc_n(long[] a0, long[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (long)(a1[i]>>>(-VALUE)); + } + } + static void test_srlc_o(long[] a0, long[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (long)(a1[i]>>>SHIFT); + } + } + static void test_srlc_on(long[] a0, long[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (long)(a1[i]>>>(-SHIFT)); + } + } + static void test_srlv(long[] a0, long[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (long)(a1[i]>>>b); + } + } + + static void test_srac(long[] a0, long[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (long)(a1[i]>>VALUE); + } + } + static void test_srac_n(long[] a0, long[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (long)(a1[i]>>(-VALUE)); + } + } + static void test_srac_o(long[] a0, long[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (long)(a1[i]>>SHIFT); + } + } + static void test_srac_on(long[] a0, long[] a1) { + for (int i = 0; i < a0.length; 
i+=1) { + a0[i] = (long)(a1[i]>>(-SHIFT)); + } + } + static void test_srav(long[] a0, long[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (long)(a1[i]>>b); + } + } + + static int verify(String text, int i, long elem, long val) { + if (elem != val) { + System.err.println(text + "[" + i + "] = " + elem + " != " + val); + return 1; + } + return 0; + } +} diff --git a/test/compiler/6340864/TestShortVect.java b/test/compiler/6340864/TestShortVect.java new file mode 100644 index 000000000..a688e0de0 --- /dev/null +++ b/test/compiler/6340864/TestShortVect.java @@ -0,0 +1,1127 @@ +/* + * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +/** + * @test + * @bug 6340864 + * @summary Implement vectorization optimizations in hotspot-server + * + * @run main/othervm/timeout=400 -Xbatch -Xmx64m TestShortVect + */ + +public class TestShortVect { + private static final int ARRLEN = 997; + private static final int ITERS = 11000; + private static final int ADD_INIT = Short.MAX_VALUE-500; + private static final int BIT_MASK = 0xB731; + private static final int VALUE = 7; + private static final int SHIFT = 16; + + public static void main(String args[]) { + System.out.println("Testing Short vectors"); + int errn = test(); + if (errn > 0) { + System.err.println("FAILED: " + errn + " errors"); + System.exit(97); + } + System.out.println("PASSED"); + } + + static int test() { + short[] a0 = new short[ARRLEN]; + short[] a1 = new short[ARRLEN]; + short[] a2 = new short[ARRLEN]; + short[] a3 = new short[ARRLEN]; + short[] a4 = new short[ARRLEN]; + int[] p2 = new int[ARRLEN/2]; + long[] p4 = new long[ARRLEN/4]; + // Initialize + int gold_sum = 0; + for (int i=0; i>>VALUE)); + } + test_srlv(a0, a1, VALUE); + for (int i=0; i>>VALUE)); + } + + test_srac(a0, a1); + for (int i=0; i>VALUE)); + } + test_srav(a0, a1, VALUE); + for (int i=0; i>VALUE)); + } + + test_sllc_n(a0, a1); + for (int i=0; i>>(-VALUE))); + } + test_srlv(a0, a1, -VALUE); + for (int i=0; i>>(-VALUE))); + } + + test_srac_n(a0, a1); + for (int i=0; i>(-VALUE))); + } + test_srav(a0, a1, -VALUE); + for (int i=0; i>(-VALUE))); + } + + test_sllc_o(a0, a1); + for (int i=0; i>>SHIFT)); + } + test_srlv(a0, a1, SHIFT); + for (int i=0; i>>SHIFT)); + } + + test_srac_o(a0, a1); + for (int i=0; i>SHIFT)); + } + test_srav(a0, a1, SHIFT); + for (int i=0; i>SHIFT)); + } + + test_sllc_on(a0, a1); + for (int i=0; i>>(-SHIFT))); + } + test_srlv(a0, a1, -SHIFT); + for (int i=0; i>>(-SHIFT))); + } + + test_srac_on(a0, a1); + for (int i=0; i>(-SHIFT))); + } + test_srav(a0, a1, -SHIFT); + for (int i=0; i>(-SHIFT))); + } + + test_pack2(p2, a1); + for (int i=0; i 0) + 
return errn; + + System.out.println("Time"); + long start, end; + + start = System.currentTimeMillis(); + for (int i=0; i>>VALUE); + } + } + static void test_srlc_n(short[] a0, short[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (short)(a1[i]>>>(-VALUE)); + } + } + static void test_srlc_o(short[] a0, short[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (short)(a1[i]>>>SHIFT); + } + } + static void test_srlc_on(short[] a0, short[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (short)(a1[i]>>>(-SHIFT)); + } + } + static void test_srlv(short[] a0, short[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (short)(a1[i]>>>b); + } + } + + static void test_srac(short[] a0, short[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (short)(a1[i]>>VALUE); + } + } + static void test_srac_n(short[] a0, short[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (short)(a1[i]>>(-VALUE)); + } + } + static void test_srac_o(short[] a0, short[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (short)(a1[i]>>SHIFT); + } + } + static void test_srac_on(short[] a0, short[] a1) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (short)(a1[i]>>(-SHIFT)); + } + } + static void test_srav(short[] a0, short[] a1, int b) { + for (int i = 0; i < a0.length; i+=1) { + a0[i] = (short)(a1[i]>>b); + } + } + + static void test_pack2(int[] p2, short[] a1) { + if (p2.length*2 > a1.length) return; + for (int i = 0; i < p2.length; i+=1) { + int l0 = (int)a1[i*2+0]; + int l1 = (int)a1[i*2+1]; + p2[i] = (l1 << 16) | (l0 & 0xFFFF); + } + } + static void test_unpack2(short[] a0, int[] p2) { + if (p2.length*2 > a0.length) return; + for (int i = 0; i < p2.length; i+=1) { + int l = p2[i]; + a0[i*2+0] = (short)(l & 0xFFFF); + a0[i*2+1] = (short)(l >> 16); + } + } + static void test_pack2_swap(int[] p2, short[] a1) { + if (p2.length*2 > a1.length) return; + for (int i = 0; i < p2.length; i+=1) { + int l0 = (int)a1[i*2+0]; + int l1 = 
(int)a1[i*2+1]; + p2[i] = (l0 << 16) | (l1 & 0xFFFF); + } + } + static void test_unpack2_swap(short[] a0, int[] p2) { + if (p2.length*2 > a0.length) return; + for (int i = 0; i < p2.length; i+=1) { + int l = p2[i]; + a0[i*2+0] = (short)(l >> 16); + a0[i*2+1] = (short)(l & 0xFFFF); + } + } + + static void test_pack4(long[] p4, short[] a1) { + if (p4.length*4 > a1.length) return; + for (int i = 0; i < p4.length; i+=1) { + long l0 = (long)a1[i*4+0]; + long l1 = (long)a1[i*4+1]; + long l2 = (long)a1[i*4+2]; + long l3 = (long)a1[i*4+3]; + p4[i] = (l0 & 0xFFFFl) | + ((l1 & 0xFFFFl) << 16) | + ((l2 & 0xFFFFl) << 32) | + ((l3 & 0xFFFFl) << 48); + } + } + static void test_unpack4(short[] a0, long[] p4) { + if (p4.length*4 > a0.length) return; + for (int i = 0; i < p4.length; i+=1) { + long l = p4[i]; + a0[i*4+0] = (short)(l & 0xFFFFl); + a0[i*4+1] = (short)(l >> 16); + a0[i*4+2] = (short)(l >> 32); + a0[i*4+3] = (short)(l >> 48); + } + } + static void test_pack4_swap(long[] p4, short[] a1) { + if (p4.length*4 > a1.length) return; + for (int i = 0; i < p4.length; i+=1) { + long l0 = (long)a1[i*4+0]; + long l1 = (long)a1[i*4+1]; + long l2 = (long)a1[i*4+2]; + long l3 = (long)a1[i*4+3]; + p4[i] = (l3 & 0xFFFFl) | + ((l2 & 0xFFFFl) << 16) | + ((l1 & 0xFFFFl) << 32) | + ((l0 & 0xFFFFl) << 48); + } + } + static void test_unpack4_swap(short[] a0, long[] p4) { + if (p4.length*4 > a0.length) return; + for (int i = 0; i < p4.length; i+=1) { + long l = p4[i]; + a0[i*4+0] = (short)(l >> 48); + a0[i*4+1] = (short)(l >> 32); + a0[i*4+2] = (short)(l >> 16); + a0[i*4+3] = (short)(l & 0xFFFFl); + } + } + + static int verify(String text, int i, short elem, short val) { + if (elem != val) { + System.err.println(text + "[" + i + "] = " + elem + " != " + val); + return 1; + } + return 0; + } + + static int verify(String text, int i, int elem, int val) { + if (elem != val) { + System.err.println(text + "[" + i + "] = " + Integer.toHexString(elem) + " != " + Integer.toHexString(val)); + return 1; 
+ } + return 0; + } + + static int verify(String text, int i, long elem, long val) { + if (elem != val) { + System.err.println(text + "[" + i + "] = " + Long.toHexString(elem) + " != " + Long.toHexString(val)); + return 1; + } + return 0; + } +} -- GitLab