提交 f375da7c 编写于 作者: K kvn

6340864: Implement vectorization optimizations in hotspot-server

Summary: Added asm encoding and mach nodes for vector arithmetic instructions on x86.
Reviewed-by: roland
上级 04cd771f
此差异已折叠。
......@@ -617,6 +617,7 @@ private:
VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
simd_prefix(dst, xnoreg, src, pre, opc);
}
void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre) {
simd_prefix(src, dst, pre);
}
......@@ -626,16 +627,10 @@ private:
simd_prefix(dst, nds, src, pre, VEX_OPCODE_0F, rex_w);
}
int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
bool rex_w = false, bool vector256 = false);
int simd_prefix_and_encode(XMMRegister dst, XMMRegister src,
VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
return simd_prefix_and_encode(dst, xnoreg, src, pre, opc);
}
// Move/convert 32-bit integer value.
int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, Register src,
VexSimdPrefix pre) {
......@@ -677,6 +672,15 @@ private:
void emit_arith(int op1, int op2, Register dst, jobject obj);
void emit_arith(int op1, int op2, Register dst, Register src);
void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre);
void emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre);
void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre);
void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre);
void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
Address src, VexSimdPrefix pre, bool vector256);
void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
XMMRegister src, VexSimdPrefix pre, bool vector256);
void emit_operand(Register reg,
Register base, Register index, Address::ScaleFactor scale,
int disp,
......@@ -891,12 +895,6 @@ private:
void andq(Register dst, Address src);
void andq(Register dst, Register src);
// Bitwise Logical AND of Packed Double-Precision Floating-Point Values
void andpd(XMMRegister dst, XMMRegister src);
// Bitwise Logical AND of Packed Single-Precision Floating-Point Values
void andps(XMMRegister dst, XMMRegister src);
void bsfl(Register dst, Register src);
void bsrl(Register dst, Register src);
......@@ -1436,10 +1434,6 @@ private:
void prefetcht2(Address src);
void prefetchw(Address src);
// POR - Bitwise logical OR
void por(XMMRegister dst, XMMRegister src);
void por(XMMRegister dst, Address src);
// Shuffle Packed Doublewords
void pshufd(XMMRegister dst, XMMRegister src, int mode);
void pshufd(XMMRegister dst, Address src, int mode);
......@@ -1448,9 +1442,6 @@ private:
void pshuflw(XMMRegister dst, XMMRegister src, int mode);
void pshuflw(XMMRegister dst, Address src, int mode);
// Shift Right by bits Logical Quadword Immediate
void psrlq(XMMRegister dst, int shift);
// Shift Right by bytes Logical DoubleQuadword Immediate
void psrldq(XMMRegister dst, int shift);
......@@ -1475,10 +1466,6 @@ private:
void pushq(Address src);
// Xor Packed Byte Integer Values
void pxor(XMMRegister dst, Address src);
void pxor(XMMRegister dst, XMMRegister src);
void rcll(Register dst, int imm8);
void rclq(Register dst, int imm8);
......@@ -1601,15 +1588,10 @@ private:
void xorq(Register dst, Address src);
void xorq(Register dst, Register src);
// Bitwise Logical XOR of Packed Double-Precision Floating-Point Values
void xorpd(XMMRegister dst, XMMRegister src);
// Bitwise Logical XOR of Packed Single-Precision Floating-Point Values
void xorps(XMMRegister dst, XMMRegister src);
void set_byte_if_not_zero(Register dst); // sets reg to 1 if not zero, otherwise 0
// AVX 3-operands scalar instructions (encoded with VEX prefix)
void vaddsd(XMMRegister dst, XMMRegister nds, Address src);
void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vaddss(XMMRegister dst, XMMRegister nds, Address src);
......@@ -1627,14 +1609,147 @@ private:
void vsubss(XMMRegister dst, XMMRegister nds, Address src);
void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);
// AVX Vector instrucitons.
void vandpd(XMMRegister dst, XMMRegister nds, Address src);
void vandps(XMMRegister dst, XMMRegister nds, Address src);
void vxorpd(XMMRegister dst, XMMRegister nds, Address src);
void vxorps(XMMRegister dst, XMMRegister nds, Address src);
//====================VECTOR ARITHMETIC=====================================
// Add Packed Floating-Point Values
void addpd(XMMRegister dst, XMMRegister src);
void addps(XMMRegister dst, XMMRegister src);
void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
// Subtract Packed Floating-Point Values
void subpd(XMMRegister dst, XMMRegister src);
void subps(XMMRegister dst, XMMRegister src);
void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
// Multiply Packed Floating-Point Values
void mulpd(XMMRegister dst, XMMRegister src);
void mulps(XMMRegister dst, XMMRegister src);
void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
// Divide Packed Floating-Point Values
void divpd(XMMRegister dst, XMMRegister src);
void divps(XMMRegister dst, XMMRegister src);
void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
// Bitwise Logical AND of Packed Floating-Point Values
void andpd(XMMRegister dst, XMMRegister src);
void andps(XMMRegister dst, XMMRegister src);
void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
// Bitwise Logical XOR of Packed Floating-Point Values
void xorpd(XMMRegister dst, XMMRegister src);
void xorps(XMMRegister dst, XMMRegister src);
void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
// Add packed integers
void paddb(XMMRegister dst, XMMRegister src);
void paddw(XMMRegister dst, XMMRegister src);
void paddd(XMMRegister dst, XMMRegister src);
void paddq(XMMRegister dst, XMMRegister src);
void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
// Sub packed integers
void psubb(XMMRegister dst, XMMRegister src);
void psubw(XMMRegister dst, XMMRegister src);
void psubd(XMMRegister dst, XMMRegister src);
void psubq(XMMRegister dst, XMMRegister src);
void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpsubw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
// Multiply packed integers (only shorts and ints)
void pmullw(XMMRegister dst, XMMRegister src);
void pmulld(XMMRegister dst, XMMRegister src);
void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
// Shift left packed integers
void psllw(XMMRegister dst, int shift);
void pslld(XMMRegister dst, int shift);
void psllq(XMMRegister dst, int shift);
void psllw(XMMRegister dst, XMMRegister shift);
void pslld(XMMRegister dst, XMMRegister shift);
void psllq(XMMRegister dst, XMMRegister shift);
void vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
void vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256);
void vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256);
void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
// Logical shift right packed integers
void psrlw(XMMRegister dst, int shift);
void psrld(XMMRegister dst, int shift);
void psrlq(XMMRegister dst, int shift);
void psrlw(XMMRegister dst, XMMRegister shift);
void psrld(XMMRegister dst, XMMRegister shift);
void psrlq(XMMRegister dst, XMMRegister shift);
void vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
void vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256);
void vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256);
void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
// Arithmetic shift right packed integers (only shorts and ints, no instructions for longs)
void psraw(XMMRegister dst, int shift);
void psrad(XMMRegister dst, int shift);
void psraw(XMMRegister dst, XMMRegister shift);
void psrad(XMMRegister dst, XMMRegister shift);
void vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
void vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256);
void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
// And packed integers
void pand(XMMRegister dst, XMMRegister src);
void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
// Or packed integers
void por(XMMRegister dst, XMMRegister src);
void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
// Xor packed integers
void pxor(XMMRegister dst, XMMRegister src);
void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
// Copy low 128bit into high 128bit of YMM registers.
void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
......@@ -2532,11 +2647,13 @@ public:
void vaddss(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vaddss(dst, nds, src); }
void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
void vandpd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vandpd(dst, nds, src); }
void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vandpd(dst, nds, src, vector256); }
void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vandpd(dst, nds, src, vector256); }
void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
void vandps(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vandps(dst, nds, src); }
void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src);
void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vandps(dst, nds, src, vector256); }
void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vandps(dst, nds, src, vector256); }
void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivsd(dst, nds, src); }
void vdivsd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vdivsd(dst, nds, src); }
......@@ -2565,12 +2682,12 @@ public:
// AVX Vector instructions
void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorpd(dst, nds, src, vector256); }
void vxorpd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorpd(dst, nds, src); }
void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vxorpd(dst, nds, src, vector256); }
void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorps(dst, nds, src, vector256); }
void vxorps(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorps(dst, nds, src); }
void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src);
void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vxorps(dst, nds, src, vector256); }
void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
if (UseAVX > 1 || !vector256) // vpxor 256 bit is available only in AVX2
......@@ -2578,6 +2695,12 @@ public:
else
Assembler::vxorpd(dst, nds, src, vector256);
}
void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
if (UseAVX > 1 || !vector256) // vpxor 256 bit is available only in AVX2
Assembler::vpxor(dst, nds, src, vector256);
else
Assembler::vxorpd(dst, nds, src, vector256);
}
// Move packed integer values from low 128 bit to hign 128 bit in 256 bit vector.
void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
......
此差异已折叠。
......@@ -1367,22 +1367,6 @@ int emit_deopt_handler(CodeBuffer& cbuf) {
return offset;
}
const bool Matcher::match_rule_supported(int opcode) {
if (!has_match_rule(opcode))
return false;
switch (opcode) {
case Op_PopCountI:
case Op_PopCountL:
if (!UsePopCountInstruction)
return false;
break;
}
return true; // Per default match rules are supported.
}
int Matcher::regnum_to_fpu_offset(int regnum) {
return regnum - 32; // The FP registers are in the second chunk
}
......
......@@ -1513,22 +1513,6 @@ int emit_deopt_handler(CodeBuffer& cbuf)
return offset;
}
const bool Matcher::match_rule_supported(int opcode) {
if (!has_match_rule(opcode))
return false;
switch (opcode) {
case Op_PopCountI:
case Op_PopCountL:
if (!UsePopCountInstruction)
return false;
break;
}
return true; // Per default match rules are supported.
}
int Matcher::regnum_to_fpu_offset(int regnum)
{
return regnum - 32; // The FP registers are in the second chunk
......@@ -6427,6 +6411,31 @@ instruct castP2X(rRegL dst, rRegP src)
ins_pipe(ialu_reg_reg); // XXX
%}
// Convert oop into int for vectors alignment masking
instruct convP2I(rRegI dst, rRegP src)
%{
match(Set dst (ConvL2I (CastP2X src)));
format %{ "movl $dst, $src\t# ptr -> int" %}
ins_encode %{
__ movl($dst$$Register, $src$$Register);
%}
ins_pipe(ialu_reg_reg); // XXX
%}
// Convert compressed oop into int for vectors alignment masking
// in case of 32bit oops (heap < 4Gb).
instruct convN2I(rRegI dst, rRegN src)
%{
predicate(Universe::narrow_oop_shift() == 0);
match(Set dst (ConvL2I (CastP2X (DecodeN src))));
format %{ "movl $dst, $src\t# compressed ptr -> int" %}
ins_encode %{
__ movl($dst$$Register, $src$$Register);
%}
ins_pipe(ialu_reg_reg); // XXX
%}
// Convert oop pointer into compressed form
instruct encodeHeapOop(rRegN dst, rRegP src, rFlagsReg cr) %{
......@@ -10049,11 +10058,10 @@ instruct MoveD2L_reg_reg(rRegL dst, regD src) %{
ins_pipe( pipe_slow );
%}
// The next instructions have long latency and use Int unit. Set high cost.
instruct MoveI2F_reg_reg(regF dst, rRegI src) %{
match(Set dst (MoveI2F src));
effect(DEF dst, USE src);
ins_cost(300);
ins_cost(100);
format %{ "movd $dst,$src\t# MoveI2F" %}
ins_encode %{
__ movdl($dst$$XMMRegister, $src$$Register);
......@@ -10064,7 +10072,7 @@ instruct MoveI2F_reg_reg(regF dst, rRegI src) %{
instruct MoveL2D_reg_reg(regD dst, rRegL src) %{
match(Set dst (MoveL2D src));
effect(DEF dst, USE src);
ins_cost(300);
ins_cost(100);
format %{ "movd $dst,$src\t# MoveL2D" %}
ins_encode %{
__ movdq($dst$$XMMRegister, $src$$Register);
......
......@@ -256,6 +256,8 @@ macro(SubVI)
macro(SubVL)
macro(SubVF)
macro(SubVD)
macro(MulVS)
macro(MulVI)
macro(MulVF)
macro(MulVD)
macro(DivVF)
......@@ -263,9 +265,15 @@ macro(DivVD)
macro(LShiftVB)
macro(LShiftVS)
macro(LShiftVI)
macro(LShiftVL)
macro(RShiftVB)
macro(RShiftVS)
macro(RShiftVI)
macro(RShiftVL)
macro(URShiftVB)
macro(URShiftVS)
macro(URShiftVI)
macro(URShiftVL)
macro(AndV)
macro(OrV)
macro(XorV)
......
......@@ -1773,6 +1773,8 @@ void IdealLoopTree::dump_head( ) const {
if (stride_con > 0) tty->print("+");
tty->print("%d", stride_con);
tty->print(" (%d iters) ", (int)cl->profile_trip_cnt());
if (cl->is_pre_loop ()) tty->print(" pre" );
if (cl->is_main_loop()) tty->print(" main");
if (cl->is_post_loop()) tty->print(" post");
......
......@@ -1357,6 +1357,12 @@ void SuperWord::output() {
// Promote operands to vector
Node* in1 = vector_opd(p, 1);
Node* in2 = vector_opd(p, 2);
if (VectorNode::is_invariant_vector(in1) && (n->is_Add() || n->is_Mul())) {
// Move invariant vector input into second position to avoid register spilling.
Node* tmp = in1;
in1 = in2;
in2 = tmp;
}
vn = VectorNode::make(_phase->C, opc, in1, in2, vlen, velt_basic_type(n));
} else {
ShouldNotReachHere();
......@@ -1400,6 +1406,36 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
if (opd->is_Vector() || opd->is_LoadVector()) {
return opd; // input is matching vector
}
if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
// No vector is needed for shift count.
// Vector instructions do not mask shift count, do it here.
Compile* C = _phase->C;
Node* cnt = opd;
juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
const TypeInt* t = opd->find_int_type();
if (t != NULL && t->is_con()) {
juint shift = t->get_con();
if (shift > mask) { // Unsigned cmp
cnt = ConNode::make(C, TypeInt::make(shift & mask));
}
} else {
if (t == NULL || t->_lo < 0 || t->_hi > (int)mask) {
cnt = ConNode::make(C, TypeInt::make(mask));
_phase->_igvn.register_new_node_with_optimizer(cnt);
cnt = new (C, 3) AndINode(opd, cnt);
_phase->_igvn.register_new_node_with_optimizer(cnt);
_phase->set_ctrl(cnt, _phase->get_ctrl(opd));
}
assert(opd->bottom_type()->isa_int(), "int type only");
// Move non constant shift count into XMM register.
cnt = new (_phase->C, 2) MoveI2FNode(cnt);
}
if (cnt != opd) {
_phase->_igvn.register_new_node_with_optimizer(cnt);
_phase->set_ctrl(cnt, _phase->get_ctrl(opd));
}
return cnt;
}
assert(!opd->is_StoreVector(), "such vector is not expected here");
// Convert scalar input to vector with the same number of elements as
// p0's vector. Use p0's type because size of operand's container in
......@@ -1718,26 +1754,17 @@ void SuperWord::compute_vector_element_type() {
for (int i = _block.length() - 1; i >= 0; i--) {
Node* n = _block.at(i);
// Only integer types need be examined
if (n->bottom_type()->isa_int()) {
const Type* vt = velt_type(n);
if (vt->basic_type() == T_INT) {
uint start, end;
vector_opd_range(n, &start, &end);
const Type* vt = velt_type(n);
for (uint j = start; j < end; j++) {
Node* in = n->in(j);
// Don't propagate through a type conversion
if (n->bottom_type() != in->bottom_type())
continue;
switch(in->Opcode()) {
case Op_AddI: case Op_AddL:
case Op_SubI: case Op_SubL:
case Op_MulI: case Op_MulL:
case Op_AndI: case Op_AndL:
case Op_OrI: case Op_OrL:
case Op_XorI: case Op_XorL:
case Op_LShiftI: case Op_LShiftL:
case Op_CMoveI: case Op_CMoveL:
if (in_bb(in)) {
// Don't propagate through a memory
if (!in->is_Mem() && in_bb(in) && velt_type(in)->basic_type() == T_INT &&
data_size(n) < data_size(in)) {
bool same_type = true;
for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
Node *use = in->fast_out(k);
......@@ -1753,7 +1780,6 @@ void SuperWord::compute_vector_element_type() {
}
}
}
}
#ifndef PRODUCT
if (TraceSuperWord && Verbose) {
for (int i = 0; i < _block.length(); i++) {
......@@ -1792,10 +1818,8 @@ const Type* SuperWord::container_type(Node* n) {
}
const Type* t = _igvn.type(n);
if (t->basic_type() == T_INT) {
if (t->higher_equal(TypeInt::BOOL)) return TypeInt::BOOL;
if (t->higher_equal(TypeInt::BYTE)) return TypeInt::BYTE;
if (t->higher_equal(TypeInt::CHAR)) return TypeInt::CHAR;
if (t->higher_equal(TypeInt::SHORT)) return TypeInt::SHORT;
// A narrow type of arithmetic operations will be determined by
// propagating the type of memory operations.
return TypeInt::INT;
}
return t;
......@@ -1940,7 +1964,7 @@ void SuperWord::align_initial_loop_index(MemNode* align_to_ref) {
// lim0 == original pre loop limit
// V == v_align (power of 2)
// invar == extra invariant piece of the address expression
// e == k [ +/- invar ]
// e == offset [ +/- invar ]
//
// When reassociating expressions involving '%' the basic rules are:
// (a - b) % k == 0 => a % k == b % k
......@@ -1993,13 +2017,12 @@ void SuperWord::align_initial_loop_index(MemNode* align_to_ref) {
int elt_size = align_to_ref_p.memory_size();
int v_align = vw / elt_size;
assert(v_align > 1, "sanity");
int k = align_to_ref_p.offset_in_bytes() / elt_size;
Node *kn = _igvn.intcon(k);
int offset = align_to_ref_p.offset_in_bytes() / elt_size;
Node *offsn = _igvn.intcon(offset);
Node *e = kn;
Node *e = offsn;
if (align_to_ref_p.invar() != NULL) {
// incorporate any extra invariant piece producing k +/- invar >>> log2(elt)
// incorporate any extra invariant piece producing (offset +/- invar) >>> log2(elt)
Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
Node* aref = new (_phase->C, 3) URShiftINode(align_to_ref_p.invar(), log2_elt);
_phase->_igvn.register_new_node_with_optimizer(aref);
......@@ -2014,15 +2037,15 @@ void SuperWord::align_initial_loop_index(MemNode* align_to_ref) {
}
if (vw > ObjectAlignmentInBytes) {
// incorporate base e +/- base && Mask >>> log2(elt)
Node* mask = _igvn.MakeConX(~(-1 << exact_log2(vw)));
Node* xbase = new(_phase->C, 2) CastP2XNode(NULL, align_to_ref_p.base());
_phase->_igvn.register_new_node_with_optimizer(xbase);
Node* masked_xbase = new (_phase->C, 3) AndXNode(xbase, mask);
_phase->_igvn.register_new_node_with_optimizer(masked_xbase);
#ifdef _LP64
masked_xbase = new (_phase->C, 2) ConvL2INode(masked_xbase);
_phase->_igvn.register_new_node_with_optimizer(masked_xbase);
xbase = new (_phase->C, 2) ConvL2INode(xbase);
_phase->_igvn.register_new_node_with_optimizer(xbase);
#endif
Node* mask = _igvn.intcon(vw-1);
Node* masked_xbase = new (_phase->C, 3) AndINode(xbase, mask);
_phase->_igvn.register_new_node_with_optimizer(masked_xbase);
Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
Node* bref = new (_phase->C, 3) URShiftINode(masked_xbase, log2_elt);
_phase->_igvn.register_new_node_with_optimizer(bref);
......
......@@ -69,6 +69,15 @@ int VectorNode::opcode(int sopc, uint vlen, BasicType bt) {
case Op_SubD:
assert(bt == T_DOUBLE, "must be");
return Op_SubVD;
case Op_MulI:
switch (bt) {
case T_BOOLEAN:
case T_BYTE: return 0; // Unimplemented
case T_CHAR:
case T_SHORT: return Op_MulVS;
case T_INT: return Matcher::match_rule_supported(Op_MulVI) ? Op_MulVI : 0; // SSE4_1
}
ShouldNotReachHere();
case Op_MulF:
assert(bt == T_FLOAT, "must be");
return Op_MulVF;
......@@ -90,6 +99,9 @@ int VectorNode::opcode(int sopc, uint vlen, BasicType bt) {
case T_INT: return Op_LShiftVI;
}
ShouldNotReachHere();
case Op_LShiftL:
assert(bt == T_LONG, "must be");
return Op_LShiftVL;
case Op_RShiftI:
switch (bt) {
case T_BOOLEAN:
......@@ -99,6 +111,21 @@ int VectorNode::opcode(int sopc, uint vlen, BasicType bt) {
case T_INT: return Op_RShiftVI;
}
ShouldNotReachHere();
case Op_RShiftL:
assert(bt == T_LONG, "must be");
return Op_RShiftVL;
case Op_URShiftI:
switch (bt) {
case T_BOOLEAN:
case T_BYTE: return Op_URShiftVB;
case T_CHAR:
case T_SHORT: return Op_URShiftVS;
case T_INT: return Op_URShiftVI;
}
ShouldNotReachHere();
case Op_URShiftL:
assert(bt == T_LONG, "must be");
return Op_URShiftVL;
case Op_AndI:
case Op_AndL:
return Op_AndV;
......@@ -140,6 +167,34 @@ bool VectorNode::implemented(int opc, uint vlen, BasicType bt) {
return false;
}
bool VectorNode::is_shift(Node* n) {
switch (n->Opcode()) {
case Op_LShiftI:
case Op_LShiftL:
case Op_RShiftI:
case Op_RShiftL:
case Op_URShiftI:
case Op_URShiftL:
return true;
}
return false;
}
// Check if input is loop invarient vector.
bool VectorNode::is_invariant_vector(Node* n) {
// Only Replicate vector nodes are loop invarient for now.
switch (n->Opcode()) {
case Op_ReplicateB:
case Op_ReplicateS:
case Op_ReplicateI:
case Op_ReplicateL:
case Op_ReplicateF:
case Op_ReplicateD:
return true;
}
return false;
}
// Return the vector version of a scalar operation node.
VectorNode* VectorNode::make(Compile* C, int opc, Node* n1, Node* n2, uint vlen, BasicType bt) {
const TypeVect* vt = TypeVect::make(bt, vlen);
......@@ -160,6 +215,8 @@ VectorNode* VectorNode::make(Compile* C, int opc, Node* n1, Node* n2, uint vlen,
case Op_SubVF: return new (C, 3) SubVFNode(n1, n2, vt);
case Op_SubVD: return new (C, 3) SubVDNode(n1, n2, vt);
case Op_MulVS: return new (C, 3) MulVSNode(n1, n2, vt);
case Op_MulVI: return new (C, 3) MulVINode(n1, n2, vt);
case Op_MulVF: return new (C, 3) MulVFNode(n1, n2, vt);
case Op_MulVD: return new (C, 3) MulVDNode(n1, n2, vt);
......@@ -169,10 +226,17 @@ VectorNode* VectorNode::make(Compile* C, int opc, Node* n1, Node* n2, uint vlen,
case Op_LShiftVB: return new (C, 3) LShiftVBNode(n1, n2, vt);
case Op_LShiftVS: return new (C, 3) LShiftVSNode(n1, n2, vt);
case Op_LShiftVI: return new (C, 3) LShiftVINode(n1, n2, vt);
case Op_LShiftVL: return new (C, 3) LShiftVLNode(n1, n2, vt);
case Op_RShiftVB: return new (C, 3) RShiftVBNode(n1, n2, vt);
case Op_RShiftVS: return new (C, 3) RShiftVSNode(n1, n2, vt);
case Op_RShiftVI: return new (C, 3) RShiftVINode(n1, n2, vt);
case Op_RShiftVL: return new (C, 3) RShiftVLNode(n1, n2, vt);
case Op_URShiftVB: return new (C, 3) URShiftVBNode(n1, n2, vt);
case Op_URShiftVS: return new (C, 3) URShiftVSNode(n1, n2, vt);
case Op_URShiftVI: return new (C, 3) URShiftVINode(n1, n2, vt);
case Op_URShiftVL: return new (C, 3) URShiftVLNode(n1, n2, vt);
case Op_AndV: return new (C, 3) AndVNode(n1, n2, vt);
case Op_OrV: return new (C, 3) OrVNode (n1, n2, vt);
......
......@@ -46,6 +46,7 @@ class VectorNode : public TypeNode {
const TypeVect* vect_type() const { return type()->is_vect(); }
uint length() const { return vect_type()->length(); } // Vector length
uint length_in_bytes() const { return vect_type()->length_in_bytes(); }
virtual int Opcode() const;
......@@ -57,7 +58,8 @@ class VectorNode : public TypeNode {
static int opcode(int opc, uint vlen, BasicType bt);
static bool implemented(int opc, uint vlen, BasicType bt);
static bool is_shift(Node* n);
static bool is_invariant_vector(Node* n);
};
//===========================Vector=ALU=Operations====================================
......@@ -158,6 +160,22 @@ class SubVDNode : public VectorNode {
virtual int Opcode() const;
};
//------------------------------MulVSNode---------------------------------------
// Vector multiply short
class MulVSNode : public VectorNode {
public:
MulVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
virtual int Opcode() const;
};
//------------------------------MulVINode---------------------------------------
// Vector multiply int
class MulVINode : public VectorNode {
public:
MulVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
virtual int Opcode() const;
};
//------------------------------MulVFNode---------------------------------------
// Vector multiply float
class MulVFNode : public VectorNode {
......@@ -191,7 +209,7 @@ class DivVDNode : public VectorNode {
};
//------------------------------LShiftVBNode---------------------------------------
// Vector lshift byte
// Vector left shift bytes
class LShiftVBNode : public VectorNode {
public:
LShiftVBNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
......@@ -199,7 +217,7 @@ class LShiftVBNode : public VectorNode {
};
//------------------------------LShiftVSNode---------------------------------------
// Vector lshift shorts
// Vector left shift shorts
class LShiftVSNode : public VectorNode {
public:
LShiftVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
......@@ -207,39 +225,88 @@ class LShiftVSNode : public VectorNode {
};
//------------------------------LShiftVINode---------------------------------------
// Vector lshift ints
// Vector left shift ints
class LShiftVINode : public VectorNode {
public:
LShiftVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
virtual int Opcode() const;
};
//------------------------------URShiftVBNode---------------------------------------
// Vector urshift bytes
//------------------------------LShiftVLNode---------------------------------------
// Vector left shift longs
class LShiftVLNode : public VectorNode {
public:
LShiftVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
virtual int Opcode() const;
};
//------------------------------RShiftVBNode---------------------------------------
// Vector right arithmetic (signed) shift bytes
class RShiftVBNode : public VectorNode {
public:
RShiftVBNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
virtual int Opcode() const;
};
//------------------------------URShiftVSNode---------------------------------------
// Vector urshift shorts
//------------------------------RShiftVSNode---------------------------------------
// Vector right arithmetic (signed) shift shorts
class RShiftVSNode : public VectorNode {
public:
RShiftVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
virtual int Opcode() const;
};
//------------------------------URShiftVINode---------------------------------------
// Vector urshift ints
//------------------------------RShiftVINode---------------------------------------
// Vector right arithmetic (signed) shift ints
class RShiftVINode : public VectorNode {
public:
RShiftVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
virtual int Opcode() const;
};
//------------------------------RShiftVLNode---------------------------------------
// Vector right arithmetic (signed) shift longs
class RShiftVLNode : public VectorNode {
public:
RShiftVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
virtual int Opcode() const;
};
//------------------------------URShiftVBNode---------------------------------------
// Vector right logical (unsigned) shift bytes
class URShiftVBNode : public VectorNode {
public:
URShiftVBNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
virtual int Opcode() const;
};
//------------------------------URShiftVSNode---------------------------------------
// Vector right logical (unsigned) shift shorts
class URShiftVSNode : public VectorNode {
public:
URShiftVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
virtual int Opcode() const;
};
//------------------------------URShiftVINode---------------------------------------
// Vector right logical (unsigned) shift ints
class URShiftVINode : public VectorNode {
public:
URShiftVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
virtual int Opcode() const;
};
//------------------------------URShiftVLNode---------------------------------------
// Vector right logical (unsigned) shift longs
class URShiftVLNode : public VectorNode {
public:
URShiftVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
virtual int Opcode() const;
};
//------------------------------AndVNode---------------------------------------
// Vector and
// Vector and integer
class AndVNode : public VectorNode {
public:
AndVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
......@@ -247,7 +314,7 @@ class AndVNode : public VectorNode {
};
//------------------------------OrVNode---------------------------------------
// Vector or
// Vector or integer
class OrVNode : public VectorNode {
public:
OrVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
......@@ -255,7 +322,7 @@ class OrVNode : public VectorNode {
};
//------------------------------XorVNode---------------------------------------
// Vector xor
// Vector xor integer
class XorVNode : public VectorNode {
public:
XorVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册