Commit 6eb45ee0 authored by kvn

6954029: Improve implicit null check generation with compressed oops

Summary: Hoist DecodeN instruction above null check
Reviewed-by: never, twisti
Parent 7eaa5edc
@@ -1760,6 +1760,12 @@ const int Matcher::init_array_short_size = 8 * BytesPerLong;
 // registers? True for Intel but false for most RISCs
 const bool Matcher::clone_shift_expressions = false;
 
+bool Matcher::narrow_oop_use_complex_address() {
+  NOT_LP64(ShouldNotCallThis());
+  assert(UseCompressedOops, "only for compressed oops code");
+  return false;
+}
+
 // Is it better to copy float constants, or load them directly from memory?
 // Intel can load a float constant from a direct address, requiring no
 // extra registers. Most RISCs will have to materialize an address into a
......
@@ -65,13 +65,6 @@ void VM_Version::initialize() {
     FLAG_SET_DEFAULT(UseInlineCaches, false);
   }
 #ifdef _LP64
-  // Single issue niagara1 is slower for CompressedOops
-  // but niagaras after that it's fine.
-  if (!is_niagara1_plus()) {
-    if (FLAG_IS_DEFAULT(UseCompressedOops)) {
-      FLAG_SET_ERGO(bool, UseCompressedOops, false);
-    }
-  }
   // 32-bit oops don't make sense for the 64-bit VM on sparc
   // since the 32-bit VM has the same registers and smaller objects.
   Universe::set_narrow_oop_shift(LogMinObjAlignmentInBytes);
......
@@ -1377,6 +1377,12 @@ const int Matcher::init_array_short_size = 8 * BytesPerLong;
 // registers? True for Intel but false for most RISCs
 const bool Matcher::clone_shift_expressions = true;
 
+bool Matcher::narrow_oop_use_complex_address() {
+  ShouldNotCallThis();
+  return true;
+}
+
 // Is it better to copy float constants, or load them directly from memory?
 // Intel can load a float constant from a direct address, requiring no
 // extra registers. Most RISCs will have to materialize an address into a
......
@@ -2037,6 +2037,11 @@ const int Matcher::init_array_short_size = 8 * BytesPerLong;
 // into registers? True for Intel but false for most RISCs
 const bool Matcher::clone_shift_expressions = true;
 
+bool Matcher::narrow_oop_use_complex_address() {
+  assert(UseCompressedOops, "only for compressed oops code");
+  return (LogMinObjAlignmentInBytes <= 3);
+}
+
 // Is it better to copy float constants, or load them directly from
 // memory? Intel can load a float constant from a direct address,
 // requiring no extra registers. Most RISCs will have to materialize
......
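The three `narrow_oop_use_complex_address()` variants above all answer the same question: can the matcher fold the decode of a narrow oop, `base + (narrow << shift)`, straight into a user's addressing mode? A minimal sketch of that arithmetic and of the x86-64 constraint follows; `heap_base`, `oop_shift`, and the function names are illustrative stand-ins, not HotSpot code.

```cpp
#include <cstdint>

// Illustrative stand-ins for Universe::narrow_oop_base() and
// LogMinObjAlignmentInBytes (assumed values, not HotSpot APIs).
static uint8_t* heap_base = nullptr;
static int      oop_shift = 3;

// A narrow oop decodes as base + (narrow << shift).
inline void* decode_narrow_oop(uint32_t narrow) {
  return heap_base + ((uintptr_t)narrow << oop_shift);
}

// x86-64 can express [base + index*scale + disp] with scale 1, 2, 4 or 8,
// i.e. a shift of at most 3 -- which is exactly the
// (LogMinObjAlignmentInBytes <= 3) test in the x86_64.ad hunk above.
// SPARC has no such scaled addressing mode, hence 'return false' there.
inline bool use_complex_address() {
  return oop_shift <= 3;
}
```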
@@ -2176,14 +2176,14 @@ static void final_graph_reshaping_impl( Node *n, Final_Reshape_Counts &frc ) {
 
 #ifdef _LP64
   case Op_CastPP:
-    if (n->in(1)->is_DecodeN() && Universe::narrow_oop_use_implicit_null_checks()) {
+    if (n->in(1)->is_DecodeN() && Matcher::gen_narrow_oop_implicit_null_checks()) {
       Compile* C = Compile::current();
       Node* in1 = n->in(1);
       const Type* t = n->bottom_type();
       Node* new_in1 = in1->clone();
       new_in1->as_DecodeN()->set_type(t);
 
-      if (!Matcher::clone_shift_expressions) {
+      if (!Matcher::narrow_oop_use_complex_address()) {
         //
         // x86, ARM and friends can handle 2 adds in addressing mode
         // and Matcher can fold a DecodeN node into address by using
@@ -2231,8 +2231,12 @@ static void final_graph_reshaping_impl( Node *n, Final_Reshape_Counts &frc ) {
           new_in2 = in2->in(1);
         } else if (in2->Opcode() == Op_ConP) {
           const Type* t = in2->bottom_type();
-          if (t == TypePtr::NULL_PTR && Universe::narrow_oop_use_implicit_null_checks()) {
-            new_in2 = ConNode::make(C, TypeNarrowOop::NULL_PTR);
+          if (t == TypePtr::NULL_PTR) {
+            // Don't convert CmpP null check into CmpN if compressed
+            // oops implicit null check is not generated.
+            // This will allow to generate normal oop implicit null check.
+            if (Matcher::gen_narrow_oop_implicit_null_checks())
+              new_in2 = ConNode::make(C, TypeNarrowOop::NULL_PTR);
             //
             // This transformation together with CastPP transformation above
             // will generated code for implicit NULL checks for compressed oops.
@@ -2289,9 +2293,9 @@ static void final_graph_reshaping_impl( Node *n, Final_Reshape_Counts &frc ) {
   case Op_DecodeN:
     assert(!n->in(1)->is_EncodeP(), "should be optimized out");
-    // DecodeN could be pinned on Sparc where it can't be fold into
+    // DecodeN could be pinned when it can't be fold into
     // an address expression, see the code for Op_CastPP above.
-    assert(n->in(0) == NULL || !Matcher::clone_shift_expressions, "no control except on sparc");
+    assert(n->in(0) == NULL || !Matcher::narrow_oop_use_complex_address(), "no control");
     break;
 
   case Op_EncodeP: {
@@ -2496,6 +2500,10 @@ static void final_graph_reshaping_walk( Node_Stack &nstack, Node *root, Final_Re
     }
   }
 
+  // Skip next transformation if compressed oops are not used.
+  if (!UseCompressedOops || !Matcher::gen_narrow_oop_implicit_null_checks())
+    return;
+
   // Go over safepoints nodes to skip DecodeN nodes for debug edges.
   // It could be done for an uncommon traps or any safepoints/calls
   // if the DecodeN node is referenced only in a debug info.
......
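The compile.cpp hunks above only turn `CmpP(DecodeN x, NULL)` into a narrow compare when a narrow-oop implicit null check will actually be generated. The conversion is legal at all because narrow value 0 encodes the null oop, so testing the 32-bit value is equivalent to decoding it and testing the resulting pointer; the narrow compare simply skips the decode on the checked path. A minimal sketch with ad hoc names (not HotSpot code):

```cpp
#include <cstdint>

static uint8_t* heap_base = (uint8_t*)0x800000000;  // assumed non-null base
static int      oop_shift = 3;

// Full decode: narrow 0 must map back to the null pointer, so with a
// non-null heap base the decode needs an explicit zero test.
inline void* decode(uint32_t narrow) {
  return narrow == 0 ? nullptr
                     : heap_base + ((uintptr_t)narrow << oop_shift);
}

// CmpP(DecodeN x, NULL): decode first, then compare the 64-bit pointer.
bool is_null_cmpp(uint32_t narrow) { return decode(narrow) == nullptr; }

// CmpN(x, 0): same answer, but no decode is needed at all.
bool is_null_cmpn(uint32_t narrow) { return narrow == 0; }
```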
@@ -437,7 +437,7 @@ Node *ConstraintCastNode::Ideal_DU_postCCP( PhaseCCP *ccp ) {
 // If not converting int->oop, throw away cast after constant propagation
 Node *CastPPNode::Ideal_DU_postCCP( PhaseCCP *ccp ) {
   const Type *t = ccp->type(in(1));
-  if (!t->isa_oop_ptr() || (in(1)->is_DecodeN() && Universe::narrow_oop_use_implicit_null_checks())) {
+  if (!t->isa_oop_ptr() || (in(1)->is_DecodeN() && Matcher::gen_narrow_oop_implicit_null_checks())) {
     return NULL; // do not transform raw pointers or narrow oops
   }
   return ConstraintCastNode::Ideal_DU_postCCP(ccp);
......
@@ -32,7 +32,8 @@
 // with suitable memory ops nearby.  Use the memory op to do the NULL check.
 // I can generate a memory op if there is not one nearby.
 // The proj is the control projection for the not-null case.
-// The val is the pointer being checked for nullness.
+// The val is the pointer being checked for nullness or
+// decodeHeapOop_not_null node if it did not fold into address.
 void Block::implicit_null_check(PhaseCFG *cfg, Node *proj, Node *val, int allowed_reasons) {
   // Assume if null check need for 0 offset then always needed
   // Intel solaris doesn't support any null checks yet and no
@@ -96,6 +97,13 @@ void Block::implicit_null_check(PhaseCFG *cfg, Node *proj, Node *val, int allowe
     }
   }
 
+  // Check for decodeHeapOop_not_null node which did not fold into address
+  bool is_decoden = ((intptr_t)val) & 1;
+  val = (Node*)(((intptr_t)val) & ~1);
+
+  assert(!is_decoden || (val->in(0) == NULL) && val->is_Mach() &&
+         (val->as_Mach()->ideal_Opcode() == Op_DecodeN), "sanity");
+
 // Search the successor block for a load or store who's base value is also
 // the tested value.  There may be several.
 Node_List *out = new Node_List(Thread::current()->resource_area());
@@ -148,7 +156,8 @@ void Block::implicit_null_check(PhaseCFG *cfg, Node *proj, Node *val, int allowe
     if( !mach->needs_anti_dependence_check() )
       continue;  // Not an memory op; skip it
     {
-      // Check that value is used in memory address.
+      // Check that value is used in memory address in
+      // instructions with embedded load (CmpP val1,(val2+off)).
       Node* base;
      Node* index;
      const MachOper* oper = mach->memory_inputs(base, index);
@@ -213,7 +222,11 @@ void Block::implicit_null_check(PhaseCFG *cfg, Node *proj, Node *val, int allowe
     uint vidx = 0;              // Capture index of value into memop
     uint j;
     for( j = mach->req()-1; j > 0; j-- ) {
-      if( mach->in(j) == val ) vidx = j;
+      if( mach->in(j) == val ) {
+        vidx = j;
+        // Ignore DecodeN val which could be hoisted to where needed.
+        if( is_decoden ) continue;
+      }
       // Block of memory-op input
       Block *inb = cfg->_bbs[mach->in(j)->_idx];
       Block *b = this;          // Start from nul check
@@ -270,6 +283,26 @@ void Block::implicit_null_check(PhaseCFG *cfg, Node *proj, Node *val, int allowe
   extern int implicit_null_checks;
   implicit_null_checks++;
 
+  if( is_decoden ) {
+    // Check if we need to hoist decodeHeapOop_not_null first.
+    Block *valb = cfg->_bbs[val->_idx];
+    if( this != valb && this->_dom_depth < valb->_dom_depth ) {
+      // Hoist it up to the end of the test block.
+      valb->find_remove(val);
+      this->add_inst(val);
+      cfg->_bbs.map(val->_idx,this);
+      // DecodeN on x86 may kill flags. Check for flag-killing projections
+      // that also need to be hoisted.
+      for (DUIterator_Fast jmax, j = val->fast_outs(jmax); j < jmax; j++) {
+        Node* n = val->fast_out(j);
+        if( n->Opcode() == Op_MachProj ) {
+          cfg->_bbs[n->_idx]->find_remove(n);
+          this->add_inst(n);
+          cfg->_bbs.map(n->_idx,this);
+        }
+      }
+    }
+  }
+
   // Hoist the memory candidate up to the end of the test block.
   Block *old_block = cfg->_bbs[best->_idx];
   old_block->find_remove(best);
......
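Both lcm.cpp (above) and matcher.cpp (below) smuggle an extra boolean, "this val is a hoisted DecodeN", through an existing `Node*` slot by setting the pointer's low bit; this is safe because `Node` objects are at least 2-byte aligned, so bit 0 of a genuine pointer is always free. A self-contained sketch of the idiom, with `MyNode` standing in for `Node`:

```cpp
#include <cassert>
#include <cstdint>

struct MyNode { int dummy; };  // stand-in for HotSpot's Node

inline MyNode* tag(MyNode* p)       { return (MyNode*)(((intptr_t)p) | 1); }
inline bool    is_tagged(MyNode* p) { return ((intptr_t)p) & 1; }
inline MyNode* untag(MyNode* p)     { return (MyNode*)(((intptr_t)p) & ~1); }

int main() {
  MyNode n;
  MyNode* val = tag(&n);             // mark: val is the DecodeN special case
  bool is_decoden = is_tagged(val);  // mirrors lcm.cpp: ((intptr_t)val) & 1
  val = untag(val);                  // mirrors: (Node*)(((intptr_t)val) & ~1)
  assert(is_decoden && val == &n);
  return 0;
}
```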
@@ -1334,7 +1334,7 @@ static bool match_into_reg( const Node *n, Node *m, Node *control, int i, bool s
       if( j == max_scan )       // No post-domination before scan end?
         return true;            // Then break the match tree up
     }
-    if (m->is_DecodeN() && Matcher::clone_shift_expressions) {
+    if (m->is_DecodeN() && Matcher::narrow_oop_use_complex_address()) {
       // These are commonly used in address expressions and can
       // efficiently fold into them on X64 in some cases.
       return false;
@@ -2110,8 +2110,8 @@ void Matcher::collect_null_checks( Node *proj, Node *orig_proj ) {
     _null_check_tests.push(proj);
     Node* val = cmp->in(1);
 #ifdef _LP64
-    if (UseCompressedOops && !Matcher::clone_shift_expressions &&
-        val->bottom_type()->isa_narrowoop()) {
+    if (val->bottom_type()->isa_narrowoop() &&
+        !Matcher::narrow_oop_use_complex_address()) {
       //
       // Look for DecodeN node which should be pinned to orig_proj.
       // On platforms (Sparc) which can not handle 2 adds
@@ -2127,6 +2127,9 @@ void Matcher::collect_null_checks( Node *proj, Node *orig_proj ) {
         if (d->is_DecodeN() && d->in(1) == val) {
           val = d;
           val->set_req(0, NULL); // Unpin now.
+          // Mark this as special case to distinguish from
+          // a regular case: CmpP(DecodeN, NULL).
+          val = (Node*)(((intptr_t)val) | 1);
           break;
         }
       }
@@ -2146,9 +2149,21 @@ void Matcher::validate_null_checks( ) {
   for( uint i=0; i < cnt; i+=2 ) {
     Node *test = _null_check_tests[i];
     Node *val = _null_check_tests[i+1];
+    bool is_decoden = ((intptr_t)val) & 1;
+    val = (Node*)(((intptr_t)val) & ~1);
     if (has_new_node(val)) {
+      Node* new_val = new_node(val);
+      if (is_decoden) {
+        assert(val->is_DecodeN() && val->in(0) == NULL, "sanity");
+        // Note: new_val may have a control edge if
+        // the original ideal node DecodeN was matched before
+        // it was unpinned in Matcher::collect_null_checks().
+        // Unpin the mach node and mark it.
+        new_val->set_req(0, NULL);
+        new_val = (Node*)(((intptr_t)new_val) | 1);
+      }
       // Is a match-tree root, so replace with the matched value
-      _null_check_tests.map(i+1, new_node(val));
+      _null_check_tests.map(i+1, new_val);
     } else {
       // Yank from candidate list
       _null_check_tests.map(i+1,_null_check_tests[--cnt]);
......
@@ -352,6 +352,38 @@ public:
   // registers?  True for Intel but false for most RISCs
   static const bool clone_shift_expressions;
 
+  static bool narrow_oop_use_complex_address();
+
+  // Generate implicit null check for narrow oops if it can fold
+  // into address expression (x64).
+  //
+  // [R12 + narrow_oop_reg<<3 + offset] // fold into address expression
+  // NullCheck narrow_oop_reg
+  //
+  // When narrow oops can't fold into address expression (Sparc) and
+  // base is not null use decode_not_null and normal implicit null check.
+  // Note, decode_not_null node can be used here since it is referenced
+  // only on non null path but it requires special handling, see
+  // collect_null_checks():
+  //
+  // decode_not_null narrow_oop_reg, oop_reg // 'shift' and 'add base'
+  // [oop_reg + offset]
+  // NullCheck oop_reg
+  //
+  // With Zero base and when narrow oops can not fold into address
+  // expression use normal implicit null check since only shift
+  // is needed to decode narrow oop.
+  //
+  // decode narrow_oop_reg, oop_reg // only 'shift'
+  // [oop_reg + offset]
+  // NullCheck oop_reg
+  //
+  inline static bool gen_narrow_oop_implicit_null_checks() {
+    return Universe::narrow_oop_use_implicit_null_checks() &&
+           (narrow_oop_use_complex_address() ||
+            Universe::narrow_oop_base() != NULL);
+  }
+
 // Is it better to copy float constants, or load them directly from memory?
 // Intel can load a float constant from a direct address, requiring no
 // extra registers. Most RISCs will have to materialize an address into a
......
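The three code shapes in the comment block above map one-for-one onto the new predicate's decision table. A hedged restatement with illustrative names (`vm_ok` stands in for `Universe::narrow_oop_use_implicit_null_checks()`, the other parameters for the two conditions it combines):

```cpp
#include <cstdio>

// Decision table (illustrative, mirrors the inline function above):
//   complex_addr  base_nonnull  -> narrow-oop implicit check?
//   true          (any)            yes: fold DecodeN into the address
//   false         true             yes: hoist decode_not_null, check oop
//   false         false            no:  plain decode + normal oop check
bool gen_narrow_oop_implicit_null_checks(bool vm_ok,
                                         bool complex_addr,
                                         bool base_nonnull) {
  return vm_ok && (complex_addr || base_nonnull);
}

int main() {
  for (int c = 0; c <= 1; c++)
    for (int b = 0; b <= 1; b++)
      printf("complex=%d base_nonnull=%d -> %d\n", c, b,
             gen_narrow_oop_implicit_null_checks(true, c, b));
  return 0;
}
```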