/*
 * Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "incls/_precompiled.incl"
#include "incls/_coalesce.cpp.incl"

//=============================================================================
//------------------------------reset_uf_map-----------------------------------
// Record 'maxlrg' as the current live range count and reinitialize the
// Union-Find map so that every live range below it maps to itself.
void PhaseChaitin::reset_uf_map( uint maxlrg ) {
  _maxlrg = maxlrg;
  // The Union-Find mapping must be at least this large
  _uf_map.extend(_maxlrg, 0);
  // Identity mapping: every live range is its own representative
  uint lrg = 0;
  while( lrg < _maxlrg ) {
    _uf_map.map(lrg, lrg);
    lrg++;
  }
}

//------------------------------compress_uf_map--------------------------------
// Make all Nodes map directly to their final live range; no need for
// the Union-Find mapping after this call.
void PhaseChaitin::compress_uf_map_for_nodes( ) {
  // For all Nodes, compress mapping
  uint unique = _names.Size();
  for( uint i=0; i<unique; i++ ) {
    uint lrg = _names[i];
    uint compressed_lrg = Find(lrg);
    if( lrg != compressed_lrg )
      _names.map(i,compressed_lrg);
  }
}

//------------------------------Find_compress----------------------------------
// Union-find lookup in the style of Tarjan's algorithm: returns the
// self-mapped root reached from 'lrg' and repoints every entry on the walked
// chain directly at that root.
uint PhaseChaitin::Find_compress( uint lrg ) {
  // Pass 1: follow the chain of equivalences until a fixed-point is found
  uint root = lrg;
  for( uint parent = _uf_map[root]; parent != root; parent = _uf_map[root] ) {
    assert( parent < root, "always union smaller" );
    root = parent;
  }
  // Pass 2: walk the same chain again, mapping each link to the root
  while( lrg != root ) {
    uint parent = _uf_map[lrg];
    _uf_map.map(lrg, root);
    lrg = parent;
  }
  return lrg;                   // lrg == root here
}

//------------------------------Find_compress----------------------------------
// Node flavor of Find_compress: resolves the live range recorded for 'n' to
// its root and stores that root back into the Node->live-range table.
uint PhaseChaitin::Find_compress( const Node *n ) {
  const uint root = Find_compress(_names[n->_idx]);
  _names.map(n->_idx, root);
  return root;
}

//------------------------------Find_const-------------------------------------
// Read-only union-find lookup: follows the chain to its root without path
// compression, so repeated lookups have bad asymptotic behavior.
uint PhaseChaitin::Find_const( uint lrg ) const {
  // The zero LRG is ignored.  An index off the end is returned unchanged;
  // this happens during debugging dumps when brand new live ranges exist
  // but the allocator has not been told about them yet.
  if( lrg == 0 || lrg >= _maxlrg ) return lrg;
  uint cur    = lrg;
  uint parent = _uf_map[cur];
  while( parent != cur ) {      // Stop at the self-mapped entry
    assert( parent < cur, "always union smaller" );
    cur    = parent;
    parent = _uf_map[cur];
  }
  return cur;
}

//------------------------------Find_const-------------------------------------
// Node flavor of Find_const (no path compression).  A Node whose index lies
// beyond the names table is not mapped (usual for a debug dump) and yields
// the zero live range.
uint PhaseChaitin::Find_const( const Node *n ) const {
  const uint idx = n->_idx;
  return ( idx < _names.Size() ) ? Find_const( _names[idx] ) : 0;
}

//------------------------------Union------------------------------------------
// union 2 sets together.
void PhaseChaitin::Union( const Node *src_n, const Node *dst_n ) {
  uint src = Find(src_n);
  uint dst = Find(dst_n);
  assert( src, "" );
  assert( dst, "" );
  assert( src < _maxlrg, "oob" );
  assert( dst < _maxlrg, "oob" );
  assert( src < dst, "always union smaller" );
  _uf_map.map(dst,src);
}

//------------------------------new_lrg----------------------------------------
void PhaseChaitin::new_lrg( const Node *x, uint lrg ) {
  // Make the Node->LRG mapping
  _names.extend(x->_idx,lrg);
  // Make the Union-Find mapping an identity function
  _uf_map.extend(lrg,lrg);
}

//------------------------------clone_projs------------------------------------
T
twisti 已提交
126
// After cloning some rematerialized instruction, clone any MachProj's that
D
duke 已提交
127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475
// follow it.  Example: Intel zero is XOR, kills flags.  Sparc FP constants
// use G3 as an address temp.
int PhaseChaitin::clone_projs( Block *b, uint idx, Node *con, Node *copy, uint &maxlrg ) {
  Block *bcon = _cfg._bbs[con->_idx];
  uint cindex = bcon->find_node(con);
  Node *con_next = bcon->_nodes[cindex+1];
  if( con_next->in(0) != con || con_next->Opcode() != Op_MachProj )
    return false;               // No MachProj's follow

  // Copy kills after the cloned constant
  Node *kills = con_next->clone();
  kills->set_req( 0, copy );
  b->_nodes.insert( idx, kills );
  _cfg._bbs.map( kills->_idx, b );
  new_lrg( kills, maxlrg++ );
  return true;
}

//------------------------------compact----------------------------------------
// Renumber the live ranges to compact them.  Makes the IFG smaller.
void PhaseChaitin::compact() {
  // Current the _uf_map contains a series of short chains which are headed
  // by a self-cycle.  All the chains run from big numbers to little numbers.
  // The Find() call chases the chains & shortens them for the next Find call.
  // We are going to change this structure slightly.  Numbers above a moving
  // wave 'i' are unchanged.  Numbers below 'j' point directly to their
  // compacted live range with no further chaining.  There are no chains or
  // cycles below 'i', so the Find call no longer works.
  uint j=1;
  uint i;
  for( i=1; i < _maxlrg; i++ ) {
    uint lr = _uf_map[i];
    // Ignore unallocated live ranges
    if( !lr ) continue;
    assert( lr <= i, "" );
    _uf_map.map(i, ( lr == i ) ? j++ : _uf_map[lr]);
  }
  if( false )                  // PrintOptoCompactLiveRanges
    printf("Compacted %d LRs from %d\n",i-j,i);
  // Now change the Node->LR mapping to reflect the compacted names
  uint unique = _names.Size();
  for( i=0; i<unique; i++ )
    _names.map(i,_uf_map[_names[i]]);

  // Reset the Union-Find mapping
  reset_uf_map(j);

}

//=============================================================================
//------------------------------Dump-------------------------------------------
#ifndef PRODUCT
void PhaseCoalesce::dump( Node *n ) const {
  // Prints the node as "L<live range>/N<node index> ".
  // NOTE(review): this const function looks up the live range through
  // _phc.Find(); confirm that the lookup used here is the intended one.
  const uint lrg = _phc.Find(n);
  tty->print("L%d/N%d ",lrg,n->_idx);
}

//------------------------------dump-------------------------------------------
void PhaseCoalesce::dump() const {
  // I know I have a block layout now, so I can print blocks in a loop
  for( uint i=0; i<_phc._cfg._num_blocks; i++ ) {
    uint j;
    Block *b = _phc._cfg._blocks[i];
    // Print a nice block header
    tty->print("B%d: ",b->_pre_order);
    for( j=1; j<b->num_preds(); j++ )
      tty->print("B%d ", _phc._cfg._bbs[b->pred(j)->_idx]->_pre_order);
    tty->print("-> ");
    for( j=0; j<b->_num_succs; j++ )
      tty->print("B%d ",b->_succs[j]->_pre_order);
    tty->print(" IDom: B%d/#%d\n", b->_idom ? b->_idom->_pre_order : 0, b->_dom_depth);
    uint cnt = b->_nodes.size();
    for( j=0; j<cnt; j++ ) {
      Node *n = b->_nodes[j];
      dump( n );
      tty->print("\t%s\t",n->Name());

      // Dump the inputs
      uint k;                   // Exit value of loop
      for( k=0; k<n->req(); k++ ) // For all required inputs
        if( n->in(k) ) dump( n->in(k) );
        else tty->print("_ ");
      int any_prec = 0;
      for( ; k<n->len(); k++ )          // For all precedence inputs
        if( n->in(k) ) {
          if( !any_prec++ ) tty->print(" |");
          dump( n->in(k) );
        }

      // Dump node-specific info
      n->dump_spec(tty);
      tty->print("\n");

    }
    tty->print("\n");
  }
}
#endif

//------------------------------combine_these_two------------------------------
// Combine the live ranges def'd by these 2 Nodes.  N2 is an input to N1.
// Nothing happens when the two already share a live range, when they
// interfere, when merging would be an oop->int cast, or when their register
// masks do not overlap.
void PhaseCoalesce::combine_these_two( Node *n1, Node *n2 ) {
  uint lr1 = _phc.Find(n1);
  uint lr2 = _phc.Find(n2);
  if( lr1 == lr2 ) return;                          // Same live range already
  if( _phc._ifg->test_edge_sq( lr1, lr2 ) ) return; // They interfere

  LRG *lrg1 = &_phc.lrgs(lr1);
  LRG *lrg2 = &_phc.lrgs(lr2);

  // Reject an oop->int cast; oop->oop, int->int, AND int->oop are OK.
  //
  // Why is int->oop OK?  It declares a raw-pointer as an oop, which in
  // general is a bad thing.  However, int->oop conversions only happen at GC
  // points, so the lifetime of the misclassified raw-pointer is from the
  // CheckCastPP (that converts it to an oop) backwards up through a merge
  // point and into the slow-path call, and around the diamond up to the
  // heap-top check and back down into the slow-path call.  The misclassified
  // raw pointer is NOT live across the slow-path call, and so does not appear
  // in any GC info, so the fact that it is misclassified is OK.
  if( !lrg1->_is_oop && lrg2->_is_oop ) return;

  // The final masks must be compatible
  if( !lrg1->mask().overlap( lrg2->mask() ) ) return;

  // Merge larger into smaller: make (lr1, n1, lrg1) the smaller numbered one
  if( lr1 > lr2 ) {
    uint lr_tmp = lr1;
    lr1 = lr2;
    lr2 = lr_tmp;

    Node *n_tmp = n1;
    n1 = n2;
    n2 = n_tmp;

    LRG *lrg_tmp = lrg1;
    lrg1 = lrg2;
    lrg2 = lrg_tmp;
  }

  // Union lr2 into lr1
  _phc.Union( n1, n2 );
  // The survivor keeps the larger of the two max frequencies
  if (lrg1->_maxfreq < lrg2->_maxfreq) {
    lrg1->_maxfreq = lrg2->_maxfreq;
  }
  // Merge in the IFG
  _phc._ifg->Union( lr1, lr2 );
  // Combine register restrictions
  lrg1->AND(lrg2->mask());
}

//------------------------------coalesce_driver--------------------------------
// Copy coalescing
void PhaseCoalesce::coalesce_driver( ) {

  verify();
  // Coalesce from high frequency to low
  for( uint i=0; i<_phc._cfg._num_blocks; i++ )
    coalesce( _phc._blks[i] );

}

//------------------------------insert_copy_with_overlap-----------------------
// I am inserting copies to come out of SSA form.  In the general case, I am
// doing a parallel renaming.  I'm in the Named world now, so I can't do a
// general parallel renaming.  All the copies now use "names" (live-ranges)
// to carry values instead of the explicit use-def chains.  Suppose I need to
// insert 2 copies into the same block.  They copy L161->L128 and L128->L132.
// If I insert them in the wrong order then L128 will get clobbered before it
// can get used by the second copy.  This cannot happen in the SSA model;
// direct use-def chains get me the right value.  It DOES happen in the named
// model so I have to handle the reordering of copies.
//
// In general, I need to topo-sort the placed copies to avoid conflicts.
// It's possible to have a closed cycle of copies (e.g., recirculating the
// same values around a loop).  In this case I need a temp to break the cycle.
//
//   b        - block that receives the copy
//   copy     - the copy Node to place (not yet in the block)
//   dst_name - live range the copy defines
//   src_name - live range the copy reads
void PhaseAggressiveCoalesce::insert_copy_with_overlap( Block *b, Node *copy, uint dst_name, uint src_name ) {

  // Scan backwards for the locations of the last use of the dst_name.
  // I am about to clobber the dst_name, so the copy must be inserted
  // after the last use.  Last use is really first-use on a backwards scan.
  uint i = b->end_idx()-1;
  while( 1 ) {
    Node *n = b->_nodes[i];
    // Check for end of virtual copies; this is also the end of the
    // parallel renaming effort.
    if( n->_idx < _unique ) break;
    uint idx = n->is_Copy();
    assert( idx || n->is_Con() || n->Opcode() == Op_MachProj, "Only copies during parallel renaming" );
    // Stop at a copy whose source is in the dst_name live range
    if( idx && _phc.Find(n->in(idx)) == dst_name ) break;
    i--;
  }
  // Index of the last use of dst_name, or of the first original Node reached
  uint last_use_idx = i;

  // Also search for any kill of src_name that exits the block.
  // Since the copy uses src_name, I have to come before any kill.
  // Default: no kill found, i.e. the "kill" is at the block end.
  uint kill_src_idx = b->end_idx();
  // There can be only 1 kill that exits any block and that is
  // the last kill.  Thus it is the first kill on a backwards scan.
  i = b->end_idx()-1;
  while( 1 ) {
    Node *n = b->_nodes[i];
    // Check for end of virtual copies; this is also the end of the
    // parallel renaming effort.
    if( n->_idx < _unique ) break;
    assert( n->is_Copy() || n->is_Con() || n->Opcode() == Op_MachProj, "Only copies during parallel renaming" );
    // A newly inserted Node that defines src_name kills it
    if( _phc.Find(n) == src_name ) {
      kill_src_idx = i;
      break;
    }
    i--;
  }
  // Need a temp?  Last use of dst comes after the kill of src?
  if( last_use_idx >= kill_src_idx ) {
    // Need to break a cycle with a temp
    uint idx = copy->is_Copy();
    Node *tmp = copy->clone();
    // The temp gets a brand new live range
    _phc.new_lrg(tmp,_phc._maxlrg++);
    // Insert new temp between copy and source
    tmp ->set_req(idx,copy->in(idx));
    copy->set_req(idx,tmp);
    // Save source in temp early, before source is killed
    b->_nodes.insert(kill_src_idx,tmp);
    _phc._cfg._bbs.map( tmp->_idx, b );
    // The temp was inserted at or before the last use, shifting it down by one
    last_use_idx++;
  }

  // Insert just after last use
  b->_nodes.insert(last_use_idx+1,copy);
}

//------------------------------insert_copies----------------------------------
// Turn the remaining "virtual" copies into real ones after coalescing.
// In order, this:
//   1. compresses the Node->live-range mapping and rewrites each block's
//      live set so it names compressed live ranges;
//   2. for every instruction, bypasses and removes explicit copies whose
//      source and destination ended up in the same live range;
//   3. for every Phi input whose live range differs from the Phi's, places a
//      copy (or a rematerialized constant) in the matching predecessor block;
//   4. for every 2-address instruction whose tied input has a different live
//      range, places a copy (or a rematerialized constant) just before it;
//   5. in low-frequency or uncommon blocks, splits high-frequency live ranges
//      at debug uses when they are not live into any successor.
// The 'matcher' parameter is not referenced in this body; spill masks are
// taken from C->matcher().
void PhaseAggressiveCoalesce::insert_copies( Matcher &matcher ) {
  // We do LRGs compressing and fix a liveout data only here since the other
  // place in Split() is guarded by the assert which we never hit.
  _phc.compress_uf_map_for_nodes();
  // Fix block's liveout data for compressed live ranges.
  for(uint lrg = 1; lrg < _phc._maxlrg; lrg++ ) {
    uint compressed_lrg = _phc.Find(lrg);
    if( lrg != compressed_lrg ) {
      for( uint bidx = 0; bidx < _phc._cfg._num_blocks; bidx++ ) {
        IndexSet *liveout = _phc._live->live(_phc._cfg._blocks[bidx]);
        if( liveout->member(lrg) ) {
          liveout->remove(lrg);
          liveout->insert(compressed_lrg);
        }
      }
    }
  }

  // All new nodes added are actual copies to replace virtual copies.
  // Nodes with index less than '_unique' are original, non-virtual Nodes.
  _unique = C->unique();

  for( uint i=0; i<_phc._cfg._num_blocks; i++ ) {
    Block *b = _phc._cfg._blocks[i];
    uint cnt = b->num_preds();  // Number of inputs to the Phi

    for( uint l = 1; l<b->_nodes.size(); l++ ) {
      Node *n = b->_nodes[l];

      // Do not use removed-copies, use copied value instead
      uint ncnt = n->req();
      for( uint k = 1; k<ncnt; k++ ) {
        Node *copy = n->in(k);
        uint cidx = copy->is_Copy();
        if( cidx ) {
          Node *def = copy->in(cidx);
          if( _phc.Find(copy) == _phc.Find(def) )
            n->set_req(k,def);
        }
      }

      // Remove any explicit copies that get coalesced.
      uint cidx = n->is_Copy();
      if( cidx ) {
        Node *def = n->in(cidx);
        if( _phc.Find(n) == _phc.Find(def) ) {
          n->replace_by(def);
          n->set_req(cidx,NULL);
          b->_nodes.remove(l);
          l--;                  // Revisit this slot; it now holds the next Node
          continue;
        }
      }

      if( n->is_Phi() ) {
        // Get the chosen name for the Phi
        uint phi_name = _phc.Find( n );
        // Ignore the pre-allocated specials
        if( !phi_name ) continue;
        // Check for mismatch inputs to Phi
        for( uint j = 1; j<cnt; j++ ) {
          Node *m = n->in(j);
          uint src_name = _phc.Find(m);
          if( src_name != phi_name ) {
            Block *pred = _phc._cfg._bbs[b->pred(j)->_idx];
            Node *copy;
            assert(!m->is_Con() || m->is_Mach(), "all Con must be Mach");
            // Rematerialize constants instead of copying them
            if( m->is_Mach() && m->as_Mach()->is_Con() &&
                m->as_Mach()->rematerialize() ) {
              copy = m->clone();
              // Insert the copy in the predecessor basic block
              pred->add_inst(copy);
              // Copy any flags as well
              _phc.clone_projs( pred, pred->end_idx(), m, copy, _phc._maxlrg );
            } else {
              const RegMask *rm = C->matcher()->idealreg2spillmask[m->ideal_reg()];
              copy = new (C) MachSpillCopyNode(m,*rm,*rm);
              // Find a good place to insert.  Kinda tricky, use a subroutine
              insert_copy_with_overlap(pred,copy,phi_name,src_name);
            }
            // Insert the copy in the use-def chain
            n->set_req( j, copy );
            _phc._cfg._bbs.map( copy->_idx, pred );
            // Extend ("register allocate") the names array for the copy.
            _phc._names.extend( copy->_idx, phi_name );
          } // End of if Phi names do not match
        } // End of for all inputs to Phi
      } else { // End of if Phi

        // Now check for 2-address instructions
        uint idx;
        if( n->is_Mach() && (idx=n->as_Mach()->two_adr()) ) {
          // Get the chosen name for the Node
          uint name = _phc.Find( n );
          assert( name, "no 2-address specials" );
          // Check for name mis-match on the 2-address input
          Node *m = n->in(idx);
          if( _phc.Find(m) != name ) {
            Node *copy;
            assert(!m->is_Con() || m->is_Mach(), "all Con must be Mach");
            // At this point it is unsafe to extend live ranges (6550579).
            // Rematerialize only constants as we do for Phi above.
            if( m->is_Mach() && m->as_Mach()->is_Con() &&
                m->as_Mach()->rematerialize() ) {
              copy = m->clone();
              // Insert the copy in the basic block, just before us
              b->_nodes.insert( l++, copy );
              // A cloned projection lands between the copy and us
              if( _phc.clone_projs( b, l, m, copy, _phc._maxlrg ) )
                l++;
            } else {
              const RegMask *rm = C->matcher()->idealreg2spillmask[m->ideal_reg()];
              copy = new (C) MachSpillCopyNode( m, *rm, *rm );
              // Insert the copy in the basic block, just before us
              b->_nodes.insert( l++, copy );
            }
            // Insert the copy in the use-def chain
            n->set_req(idx, copy );
            // Extend ("register allocate") the names array for the copy.
            _phc._names.extend( copy->_idx, name );
            _phc._cfg._bbs.map( copy->_idx, b );
          }

        } // End of is two-adr

        // Insert a copy at a debug use for a lrg which has high frequency
        if( b->_freq < OPTO_DEBUG_SPLIT_FREQ || b->is_uncommon(_phc._cfg._bbs) ) {
          // Walk the debug inputs to the node and check for lrg freq
          JVMState* jvms = n->jvms();
          // Without a JVMState both bounds are equal, so the loop is skipped
          uint debug_start = jvms ? jvms->debug_start() : 999999;
          uint debug_end   = jvms ? jvms->debug_end()   : 999999;
          for(uint inpidx = debug_start; inpidx < debug_end; inpidx++) {
            // Do not split monitors; they are only needed for debug table
            // entries and need no code.
            if( jvms->is_monitor_use(inpidx) ) continue;
            Node *inp = n->in(inpidx);
            uint nidx = _phc.n2lidx(inp);
            LRG &lrg = lrgs(nidx);

            // If this lrg has a high frequency use/def
            if( lrg._maxfreq >= _phc.high_frequency_lrg() ) {
              // If the live range is also live out of this block (like it
              // would be for a fast/slow idiom), the normal spill mechanism
              // does an excellent job.  If it is not live out of this block
              // (like it would be for debug info to uncommon trap) splitting
              // the live range now allows a better allocation in the high
              // frequency blocks.
              //   Build_IFG_virtual has converted the live sets to
              // live-IN info, not live-OUT info.
              uint k;
              for( k=0; k < b->_num_succs; k++ )
                if( _phc._live->live(b->_succs[k])->member( nidx ) )
                  break;      // Live in to some successor block?
              if( k < b->_num_succs )
                continue;     // Live out; do not pre-split
              // Split the lrg at this use
              const RegMask *rm = C->matcher()->idealreg2spillmask[inp->ideal_reg()];
              Node *copy = new (C) MachSpillCopyNode( inp, *rm, *rm );
              // Insert the copy in the use-def chain
              n->set_req(inpidx, copy );
              // Insert the copy in the basic block, just before us
              b->_nodes.insert( l++, copy );
              // Extend ("register allocate") the names array for the copy.
              _phc.new_lrg( copy, _phc._maxlrg++ );
              _phc._cfg._bbs.map( copy->_idx, b );
              //tty->print_cr("Split a debug use in Aggressive Coalesce");
            }  // End of if high frequency use/def
          }  // End of for all debug inputs
        }  // End of if low frequency safepoint

      } // End of if Phi

    } // End of for all instructions
  } // End of for all blocks
}

//=============================================================================
//------------------------------coalesce---------------------------------------
// Aggressive (but pessimistic) copy coalescing of a single block

// The following coalesce pass represents a single round of aggressive
// pessimistic coalesce.  "Aggressive" means no attempt to preserve
// colorability when coalescing.  This occasionally means more spills, but
// it also means fewer rounds of coalescing for better code - and that means
// faster compiles.

// "Pessimistic" means we do not hit the fixed point in one pass (and we are
// reaching for the least fixed point to boot).  This is typically solved
// with a few more rounds of coalescing, but the compiler must run fast.  We
// could optimistically coalesce everything touching PhiNodes together
// into one big live range, then check for self-interference.  Everywhere
// the live range interferes with self it would have to be split.  Finding
// the right split points can be done with some heuristics (based on
// expected frequency of edges in the live range).  In short, it's a real
// research problem and the timeline is too short to allow such research.
// Further thoughts: (1) build the LR in a pass, (2) find self-interference
// in another pass, (3) per each self-conflict, split, (4) split by finding
// the low-cost cut (min-cut) of the LR, (5) edges in the LR are weighted
// according to the GCM algorithm (or just exec freq on CFG edges).

void PhaseAggressiveCoalesce::coalesce( Block *b ) {
  // Copies are still "virtual": none has been made explicit yet.  Instead,
  // Phi functions of successor blocks have mis-matched live-ranges, and every
  // failed coalesce means a copy has to be inserted later to line up the
  // live-ranges.  So start with the Phis in the successor blocks.
  for( uint s = 0; s < b->_num_succs; s++ ) {
    Block *succ = b->_succs[s];
    // Find index of 'b' in the successor's predecessors
    uint pred_idx = 1;
    while( _phc._cfg._bbs[succ->pred(pred_idx)->_idx] != b ) {
      pred_idx++;
    }
    // Visit the Phis of the successor; the walk stops at the first non-Phi
    for( uint k = 1; k < succ->_nodes.size(); k++ ) {
      Node *phi = succ->_nodes[k];
      if( !phi->is_Phi() ) break;
      combine_these_two( phi, phi->in(pred_idx) );
    }
  } // End of for all successor blocks


  // Check _this_ block for 2-address instructions and copies.
  const uint last = b->end_idx();
  for( uint i = 1; i < last; i++ ) {
    Node *n = b->_nodes[i];
    if( !n->is_Mach() ) continue;
    MachNode *mach = n->as_Mach();
    // 2-address instructions have a virtual Copy matching their input
    // to their output
    const uint two_adr_idx = mach->two_adr();
    if( two_adr_idx == 0 ) continue;
    combine_these_two( mach, mach->in(two_adr_idx) );
  } // End of for all instructions in block
}

//=============================================================================
//------------------------------PhaseConservativeCoalesce----------------------
// Construct on top of PhaseCoalesce and initialize the scratch set _ulr with
// the allocator's current live range count (_phc._maxlrg).
PhaseConservativeCoalesce::PhaseConservativeCoalesce( PhaseChaitin &chaitin ) : PhaseCoalesce(chaitin) {
  _ulr.initialize(_phc._maxlrg);
}

//------------------------------verify-----------------------------------------
// In ASSERT builds, calls _phc.set_was_low(); in all other builds the body
// is empty.
void PhaseConservativeCoalesce::verify() {
#ifdef ASSERT
  _phc.set_was_low();
#endif
}

//------------------------------union_helper-----------------------------------
// Perform the bookkeeping for coalescing live range lr2 into lr1:
// union the two in the union-find tree, fold lr2's def/oop/frequency info
// into lr1 and blank out lr2, remove the now useless dst_copy from block 'b'
// at position 'bindex', and add lr1 to the live sets of the blocks walked
// from 'b' back to src_copy's block.
//
//   lr1_node, lr2_node - Nodes naming lr1 and lr2, handed to _phc.Union
//   src_def            - the value that defines the combined live range
//   dst_copy           - the copy being removed; must be b->_nodes[bindex]
//   src_copy           - the copy at the far end of the chain; its block
//                        ends the liveness walk
void PhaseConservativeCoalesce::union_helper( Node *lr1_node, Node *lr2_node, uint lr1, uint lr2, Node *src_def, Node *dst_copy, Node *src_copy, Block *b, uint bindex ) {
  // Join live ranges.  Merge larger into smaller.  Union lr2 into lr1 in the
  // union-find tree
  _phc.Union( lr1_node, lr2_node );

  // Single-def live range ONLY if both live ranges are single-def.
  // If both are single def, then src_def powers one live range
  // and dst_copy powers the other.  After merging, src_def powers
  // the combined live range.
  lrgs(lr1)._def = (lrgs(lr1).is_multidef() ||
                        lrgs(lr2).is_multidef() )
    ? NodeSentinel : src_def;
  lrgs(lr2)._def = NULL;    // No def for lrg 2
  lrgs(lr2).Clear();        // Force empty mask for LRG 2
  //lrgs(lr2)._size = 0;      // Live-range 2 goes dead
  lrgs(lr1)._is_oop |= lrgs(lr2)._is_oop;
  lrgs(lr2)._is_oop = 0;    // In particular, not an oop for GC info

  // The combined live range keeps the larger max frequency
  if (lrgs(lr1)._maxfreq < lrgs(lr2)._maxfreq)
    lrgs(lr1)._maxfreq = lrgs(lr2)._maxfreq;

  // Copy original value instead.  Intermediate copies go dead, and
  // the dst_copy becomes useless.
  int didx = dst_copy->is_Copy();
  dst_copy->set_req( didx, src_def );
  // Add copy to free list
  // _phc.free_spillcopy(b->_nodes[bindex]);
  assert( b->_nodes[bindex] == dst_copy, "" );
  dst_copy->replace_by( dst_copy->in(didx) );
  dst_copy->set_req( didx, NULL);
  b->_nodes.remove(bindex);
  // Removing a Node shifts later positions down by one; keep the block's
  // _ihrp_index/_fhrp_index in step when they lie after the removed slot
  if( bindex < b->_ihrp_index ) b->_ihrp_index--;
  if( bindex < b->_fhrp_index ) b->_fhrp_index--;

  // Stretched lr1; add it to liveness of intermediate blocks
  Block *b2 = _phc._cfg._bbs[src_copy->_idx];
  while( b != b2 ) {
    b = _phc._cfg._bbs[b->pred(1)->_idx];
    _phc._live->live(b)->insert(lr1);
  }
}

//------------------------------compute_separating_interferences---------------
// Factored code from copy_copy that computes extra interferences from
// lengthening a live range by double-coalescing.
//
// Walks backwards, one instruction at a time, from position 'bindex' in block
// 'b' along the copy chain that runs from dst_copy to src_copy, stepping into
// the predecessor block whenever a block start is reached.  Every instruction
// met on the way that is not part of the copy chain is checked against the
// combined register mask 'rm'.
//
//   rm         - combined register mask; narrowed in place when the walk
//                crosses a bound def
//   reg_degree - degree of the combined live range on entry
//   rm_size    - size of 'rm' on entry; recomputed locally when 'rm' shrinks
//   lr1, lr2   - the two live ranges being coalesced
//
// Returns the updated degree, or max_juint when the coalesce must not be
// done.  New neighbors found on the way are recorded in _ulr.
uint PhaseConservativeCoalesce::compute_separating_interferences(Node *dst_copy, Node *src_copy, Block *b, uint bindex, RegMask &rm, uint reg_degree, uint rm_size, uint lr1, uint lr2 ) {

  assert(!lrgs(lr1)._fat_proj, "cannot coalesce fat_proj");
  assert(!lrgs(lr2)._fat_proj, "cannot coalesce fat_proj");
  Node *prev_copy = dst_copy->in(dst_copy->is_Copy());
  Block *b2 = b;
  uint bindex2 = bindex;
  while( 1 ) {
    // Find previous instruction
    bindex2--;                  // Chain backwards 1 instruction
    while( bindex2 == 0 ) {     // At block start, find prior block
      assert( b2->num_preds() == 2, "cannot double coalesce across c-flow" );
      b2 = _phc._cfg._bbs[b2->pred(1)->_idx];
      bindex2 = b2->end_idx()-1;
    }
    // Get prior instruction
    assert(bindex2 < b2->_nodes.size(), "index out of bounds");
    Node *x = b2->_nodes[bindex2];
    if( x == prev_copy ) {      // Previous copy in copy chain?
      if( prev_copy == src_copy)// Found end of chain and all interferences
        break;                  // So break out of loop
      // Else work back one in copy chain
      prev_copy = prev_copy->in(prev_copy->is_Copy());
    } else {                    // Else collect interferences
      uint lidx = _phc.Find(x);
      // Found another def of live-range being stretched?
      if( lidx == lr1 ) return max_juint;
      if( lidx == lr2 ) return max_juint;

      // If we attempt to coalesce across a bound def
      if( lrgs(lidx).is_bound() ) {
        // Do not let the coalesced LRG expect to get the bound color
        rm.SUBTRACT( lrgs(lidx).mask() );
        // Recompute rm_size
        rm_size = rm.Size();
        //if( rm._flags ) rm_size += 1000000;
        if( reg_degree >= rm_size ) return max_juint;
      }
      if( rm.overlap(lrgs(lidx).mask()) ) {
        // Insert lidx into union LRG; returns TRUE if actually inserted
        if( _ulr.insert(lidx) ) {
          // Infinite-stack neighbors do not alter colorability, as they
          // can always color to some other color.
          if( !lrgs(lidx).mask().is_AllStack() ) {
            // If this coalesce will make any new neighbor uncolorable,
            // do not coalesce.
            if( lrgs(lidx).just_lo_degree() )
              return max_juint;
            // Bump our degree
            if( ++reg_degree >= rm_size )
              return max_juint;
          } // End of if not infinite-stack neighbor
        } // End of if actually inserted
      } // End of if live range overlaps
    } // End of else collect interferences for 1 node
  } // End of while forever, scan back for interferences
  return reg_degree;
}

//------------------------------update_ifg-------------------------------------
// Patch the interference graph after lr2 has been folded into lr1.
//   lr1, lr2     - the surviving and the absorbed live range numbers
//   n_lr1, n_lr2 - neighbor sets to walk for lr1 and lr2 (copy_copy hands in
//                  _phc._ifg->neighbors() of each)
// The combined neighbor set is read from _ulr.  Whenever an edge is really
// removed or inserted, the neighbor's degree is adjusted by the
// compute_degree() of the live range on the other end of that edge.
void PhaseConservativeCoalesce::update_ifg(uint lr1, uint lr2, IndexSet *n_lr1, IndexSet *n_lr2) {
  // Step 1: neighbors of lr1 that did not make it into the union set
  // (the constrained register mask dropped them) must forget about lr1.
  IndexSetIterator lr1_iter(n_lr1);
  LRG &lrg1 = lrgs(lr1);
  for( uint nbr = lr1_iter.next(); nbr != 0; nbr = lr1_iter.next() ) {
    if( _ulr.member(nbr) ) {
      continue;                 // Still a neighbor of the combined live range
    }
    if( !_phc._ifg->neighbors(nbr)->remove(lr1) ) {
      continue;                 // Nothing was removed; degree is unchanged
    }
    lrgs(nbr).inc_degree( -lrg1.compute_degree(lrgs(nbr)) );
  }

  // Step 2: lr2 is now known as lr1, so take lr2 out of the neighbor set
  // of everything it used to interfere with.
  IndexSetIterator lr2_iter(n_lr2);
  LRG &lrg2 = lrgs(lr2);
  for( uint nbr = lr2_iter.next(); nbr != 0; nbr = lr2_iter.next() ) {
    if( !_phc._ifg->neighbors(nbr)->remove(lr2) ) {
      continue;
    }
    lrgs(nbr).inc_degree( -lrg2.compute_degree(lrgs(nbr)) );
  }

  // Step 3: every member of the union set interferes with the combined
  // live range; add lr1 wherever it is not already recorded.
  IndexSetIterator union_iter(&_ulr);
  for( uint nbr = union_iter.next(); nbr != 0; nbr = union_iter.next() ) {
    if( !_phc._ifg->neighbors(nbr)->insert(lr1) ) {
      continue;
    }
    lrgs(nbr).inc_degree( lrg1.compute_degree(lrgs(nbr)) );
  }
}

//------------------------------record_bias------------------------------------
// Remember lr1 and lr2 as copy-bias partners of each other.  A live range
// whose _copy_bias is already non-zero keeps the value it has.
static void record_bias( const PhaseIFG *ifg, int lr1, int lr2 ) {
  // Bias lr1 towards lr2, unless lr1 is already biased
  if( ifg->lrgs(lr1)._copy_bias == 0 ) {
    ifg->lrgs(lr1)._copy_bias = lr2;
  }
  // And the other way around
  if( ifg->lrgs(lr2)._copy_bias == 0 ) {
    ifg->lrgs(lr2)._copy_bias = lr1;
  }
}

//------------------------------copy_copy--------------------------------------
// See if I can coalesce a series of multiple copies together.  I need the
// final dest copy and the original src copy.  They can be the same Node.
// Compute the compatible register masks.
//
//   dst_copy, src_copy - last and first copy of the chain; both must be
//                        spill copies or nothing is attempted.
//   b, bindex          - block and node index handed on to
//                        compute_separating_interferences() and
//                        union_helper(); coalesce() passes the block it is
//                        scanning and the index of the copy within it.
//
// Returns true when the live range of dst_copy and the live range feeding
// src_copy were merged, false when the attempt was abandoned.  When the
// attempt fails on the colorability checks the two live ranges are tagged
// with record_bias() instead.
bool PhaseConservativeCoalesce::copy_copy( Node *dst_copy, Node *src_copy, Block *b, uint bindex ) {

  if( !dst_copy->is_SpillCopy() ) return false;
  if( !src_copy->is_SpillCopy() ) return false;
  Node *src_def = src_copy->in(src_copy->is_Copy());
  uint lr1 = _phc.Find(dst_copy);
  uint lr2 = _phc.Find(src_def );

  // Same live ranges already?
  if( lr1 == lr2 ) return false;

  // Interfere?
  if( _phc._ifg->test_edge_sq( lr1, lr2 ) ) return false;

  // Not an oop->int cast; oop->oop, int->int, AND int->oop are OK.
  if( !lrgs(lr1)._is_oop && lrgs(lr2)._is_oop ) // not an oop->int cast
    return false;

  // Coalescing between an aligned live range and a mis-aligned live range?
  // No, no!  Alignment changes how we count degree.
  if( lrgs(lr1)._fat_proj != lrgs(lr2)._fat_proj )
    return false;

  // Sort; use smaller live-range number.  The nodes are swapped along with
  // the numbers so that lr1_node/lr2_node keep matching lr1/lr2.
  Node *lr1_node = dst_copy;
  Node *lr2_node = src_def;
  if( lr1 > lr2 ) {
    uint tmp = lr1; lr1 = lr2; lr2 = tmp;
    lr1_node = src_def;  lr2_node = dst_copy;
  }

  // Check for compatibility of the 2 live ranges by
  // intersecting their allowed register sets.
  RegMask rm = lrgs(lr1).mask();
  rm.AND(lrgs(lr2).mask());
  // Number of bits free
  uint rm_size = rm.Size();

  // If we can use any stack slot, then effective size is infinite
  if( rm.is_AllStack() ) rm_size += 1000000;
  // Incompatible masks, no way to coalesce
  if( rm_size == 0 ) return false;

  // Another early bail-out test is when we are double-coalescing and the
  // 2 copies are separated by some control flow.
  if( dst_copy != src_copy ) {
    Block *src_b = _phc._cfg._bbs[src_copy->_idx];
    Block *b2 = b;
    // Walk back through pred(1) of each block until src_copy's block is reached
    while( b2 != src_b ) {
      if( b2->num_preds() > 2 ){// Found merge-point
        _phc._lost_opp_cflow_coalesce++;
        // extra record_bias commented out because Chris believes it is not
        // productive.  Since we can record only 1 bias, we want to choose one
        // that stands a chance of working and this one probably does not.
        //record_bias( _phc._lrgs, lr1, lr2 );
        return false;           // Too hard to find all interferences
      }
      b2 = _phc._cfg._bbs[b2->pred(1)->_idx];
    }
  }

  // Union the two interference sets together into '_ulr'
  uint reg_degree = _ulr.lrg_union( lr1, lr2, rm_size, _phc._ifg, rm );

  // Too many neighbors for the available registers: remember the bias only
  if( reg_degree >= rm_size ) {
    record_bias( _phc._ifg, lr1, lr2 );
    return false;
  }

  // Now I need to compute all the interferences between dst_copy and
  // src_copy.  I'm not willing to visit the entire interference graph, so
  // I limit my search to things in dst_copy's block or in a straight
  // line of previous blocks.  I give up at merge points or when I get
  // more interferences than my degree.  I can stop when I find src_copy.
  if( dst_copy != src_copy ) {
    reg_degree = compute_separating_interferences(dst_copy, src_copy, b, bindex, rm, rm_size, reg_degree, lr1, lr2 );
    // max_juint is the "give up" result of the scan
    if( reg_degree == max_juint ) {
      record_bias( _phc._ifg, lr1, lr2 );
      return false;
    }
  } // End of if dst_copy & src_copy are different


  // ---- THE COMBINED LRG IS COLORABLE ----

  // YEAH - Now coalesce this copy away
  assert( lrgs(lr1).num_regs() == lrgs(lr2).num_regs(),   "" );

  IndexSet *n_lr1 = _phc._ifg->neighbors(lr1);
  IndexSet *n_lr2 = _phc._ifg->neighbors(lr2);

  // Update the interference graph
  update_ifg(lr1, lr2, n_lr1, n_lr2);

  _ulr.remove(lr1);

  // Uncomment the following code to trace Coalescing in great detail.
  //
  //if (false) {
  //  tty->cr();
  //  tty->print_cr("#######################################");
  //  tty->print_cr("union %d and %d", lr1, lr2);
  //  n_lr1->dump();
  //  n_lr2->dump();
  //  tty->print_cr("resulting set is");
  //  _ulr.dump();
  //}

  // Replace n_lr1 with the new combined live range.  _ulr will use
  // n_lr1's old memory on the next iteration.  n_lr2 is cleared to
  // send its internal memory to the free list.
  _ulr.swap(n_lr1);
  _ulr.clear();
  n_lr2->clear();

  lrgs(lr1).set_degree( _phc._ifg->effective_degree(lr1) );
  lrgs(lr2).set_degree( 0 );

  // Join live ranges.  Merge larger into smaller.  Union lr2 into lr1 in the
  // union-find tree
  union_helper( lr1_node, lr2_node, lr1, lr2, src_def, dst_copy, src_copy, b, bindex );
  // Combine register restrictions
  lrgs(lr1).set_mask(rm);
  lrgs(lr1).compute_set_mask_size();
  lrgs(lr1)._cost += lrgs(lr2)._cost;
  lrgs(lr1)._area += lrgs(lr2)._area;

  // While it's uncommon to successfully coalesce live ranges that started out
  // being not-lo-degree, it can happen.  In any case the combined coalesced
  // live range better Simplify nicely.
  lrgs(lr1)._was_lo = 1;

  // kinda expensive to do all the time
  //tty->print_cr("warning: slow verify happening");
  //_phc._ifg->verify( &_phc );
  return true;
}

//------------------------------coalesce---------------------------------------
// Conservative (but pessimistic) copy coalescing of a single block
void PhaseConservativeCoalesce::coalesce( Block *b ) {
  // Uncommon blocks are not worth the effort
  if( b->is_uncommon(_phc._cfg._bbs) ) {
    return;
  }

  // Scan nodes 1 .. end_idx()-1 of the block.  The index advances only when
  // nothing was coalesced at the current slot; after a success the same
  // slot is looked at again.
  uint i = 1;
  while( i < b->end_idx() ) {
    // Look for an actual copy and try to coalesce it into its input,
    // provided the use and the copy's input are compatible.
    Node *copy1 = b->_nodes[i];
    uint idx1 = copy1->is_Copy();
    if( idx1 != 0 && copy_copy(copy1,copy1,b,i) ) {
      PhaseChaitin::_conserv_coalesce++;  // Collect stats on success
      continue;                 // Retry, same location in block
    }

    /* do not attempt pairs.  About 1/2 of all pairs can be removed by
       post-alloc.  The other set are too few to bother.
    if( idx1 != 0 ) {
      Node *copy2 = copy1->in(idx1);
      uint idx2 = copy2->is_Copy();
      if( idx2 != 0 && copy_copy(copy1,copy2,b,i) ) {
        PhaseChaitin::_conserv_coalesce_pair++; // Collect stats on success
        continue;               // Retry, same location in block
      }
    }
    */

    i++;                        // Not a copy, or could not coalesce it
  }
}