chaitin.cpp revision 3239
0N/A * Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved. 0N/A * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 0N/A * This code is free software; you can redistribute it and/or modify it 0N/A * under the terms of the GNU General Public License version 2 only, as 0N/A * published by the Free Software Foundation. 0N/A * This code is distributed in the hope that it will be useful, but WITHOUT 0N/A * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 0N/A * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 0N/A * version 2 for more details (a copy is included in the LICENSE file that 0N/A * accompanied this code). 0N/A * You should have received a copy of the GNU General Public License version 0N/A * 2 along with this work; if not, write to the Free Software Foundation, 0N/A * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 0N/A * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 0N/A//============================================================================= 0N/A//------------------------------score------------------------------------------ 0N/A// Compute score from cost and area. Low score is best to spill. 0N/A // Scale _area by RegisterCostAreaRatio/64K then subtract from cost. 0N/A // Bigger area lowers score, encourages spilling this live range. 0N/A // Bigger cost raise score, prevents spilling this live range. 0N/A // (Note: 1/65536 is the magic constant below; I dont trust the C optimizer 0N/A // to turn a divide by a constant into a multiply by the reciprical). 0N/A // Account for area. Basically, LRGs covering large areas are better 0N/A // to spill because more other LRGs get freed up. 0N/A if(
_area ==
0.0 )
// No area? Then no progress to spill 0N/A return score +
1e30;
// to make progress again. 0N/A return score +
1e17;
// Probably no progress to spill 0N/A return score +
1e10;
// Likely no progress to spill 0N/A//------------------------------LRG_List--------------------------------------- //------------------------------Chaitin---------------------------------------- // Build a list of basic blocks, sorted by frequency // Experiment with sorting strategies to speed compilation // Bump by three orders of magnitude each time // Sort blocks into buckets // Assign block to end of list for appropriate bucket break;
// kick out of inner loop // Dump buckets into final block array for( j = 0; j <
buckcnt[i]; j++ ) {
// Above the OLD FP (and in registers) are the incoming arguments. Stack // slots in this area are called "arg_slots". Above the NEW FP (and in // registers) is the outgoing argument area; above that is the spill/temp // area. These are all "frame_slots". Arg_slots start at the zero // stack_slots and count up to the known arg_size. Frame_slots start at // the stack_slot #arg_size and go up. After allocation I map stack // slots to actual offsets. Stack-slots in the arg_slot area are biased // by the frame_size; stack-slots in the frame_slot area are biased by 0. // Need live-ness for the IFG; need the IFG for coalescing. If the // liveness is JUST for coalescing, then I can get some mileage by renaming // all copy-related live ranges low and then using the max copy-related // live range as a cut-off for LIVE and the IFG. In other words, I can // build a subset of LIVE and IFG just for copies. // Need IFG for coalescing and coloring // Come out of SSA world to the Named world. Assign (virtual) registers to // Nodes. Use the same register for all inputs and the output of PhiNodes // - effectively ending SSA form. This requires either coalescing live // ranges or inserting copies. For the moment, we insert "virtual copies" // - we pretend there is a copy prior to each Phi in predecessor blocks. // We will attempt to coalesce such "virtual copies" before we manifest // Veify the graph before RA. _live =
NULL;
// Mark live as being not available _live = &
live;
// Mark LIVE as being available // Base pointers are currently "used" by instructions which define new // derived pointers. This makes base pointers live up to the where the // derived pointer is made, but not beyond. Really, they need to be live // across any GC point where the derived value is live. So this code looks // at all the GC points, and "stretches" the live range of any base pointer // Since some live range stretched, I need to recompute live // Create the interference graph using virtual copies // Aggressive (but pessimistic) copy coalescing. // This pass works on virtual copies. Any virtual copies which are not // coalesced get manifested as actual copies // The IFG is/was triangular. I am 'squaring it up' so Union can run // faster. Union requires a 'for all' operation which is slow on the // triangular adjacency matrix (quick reminder: the IFG is 'sparse' - // meaning I can visit all the Nodes neighbors less than a Node in time // O(# of neighbors), but I have to visit all the Nodes greater than a // given Node and search them for an instance, i.e., time O(#MaxLRG)). // Insert un-coalesced copies. Visit all Phis. Where inputs to a Phi do // not match the Phi itself, insert a copy. // After aggressive coalesce, attempt a first cut at coloring. // To color, we need the IFG and for that we need LIVE. // Build physical interference graph // If we have a guaranteed spill, might as well spill now // Bail out if unique gets too large (ie - unique > MaxNodeLimit) // Bail out if unique gets too large (ie - unique > MaxNodeLimit - 2*NodeLimitFudgeFactor) compact();
// Compact LRGs; return new lower max lrg // Only do conservative coalescing if requested // Conservative (and pessimistic) copy coalescing of those spills // If max live ranges greater than cutoff, don't color the stack. // This cutoff can be larger than below since it is only done once. // Prepare for Simplify & Select // Simplify the InterFerence Graph by removing LRGs of low degree. // LRGs of low degree are trivially colorable. // Select colors by re-inserting LRGs back into the IFG in reverse order. // Return whether or not something spills. // If we spill, split and recycle the entire thing // Bail out if unique gets too large (ie - unique > MaxNodeLimit - 2*NodeLimitFudgeFactor) compact();
// Compact LRGs; return new lower max lrg // Nuke the live-ness and interference graph and LiveRanGe info // Create LiveRanGe array. // Intersect register masks for all USEs and DEFs // Only do conservative coalescing if requested // Conservative (and pessimistic) copy coalescing // Check for few live ranges determines how aggressive coalesce is. // Simplify the InterFerence Graph by removing LRGs of low degree. // LRGs of low degree are trivially colorable. // Select colors by re-inserting LRGs back into the IFG in reverse order. // Return whether or not something spills. // Count number of Simplify-Select trips per coloring success. // Peephole remove copies // Veify the graph after RA. // max_reg is past the largest *register* used. // Convert that to a frame_slot number. // This frame must preserve the required fp alignment // Move important info out of the live_arena to longer lasting storage. if(
_names[i] ) {
// Live range associated with Node? }
else {
// Must be a register-pair if( !
lrg.
_fat_proj ) {
// Must be aligned adjacent register pair // Live ranges record the highest register in their mask. // We want the low register for the AD file writer's convenience. }
else {
// Misaligned; extract 2 bits//------------------------------de_ssa----------------------------------------- // Set initial Names for all Nodes. Most Nodes get the virtual register // number. A few get the ZERO live range number. These do not // get allocated, but instead rely on correct scheduling to ensure that // only one instance is simultaneously live at a time. // Handle all the normal Nodes in the block // Pre-color to the zero live range, or pick virtual register // Reset the Union-Find mapping to be identity //------------------------------gather_lrg_masks------------------------------- // Gather LiveRanGe information, including register masks. Modification of // cisc spillable in_RegMasks should not be done before AggressiveCoalesce. // Nail down the frame pointer live range // Get virtual register number, same as LiveRanGe index if(
vreg ) {
// No vreg means un-allocable (e.g. memory) // Check for float-vs-int live range (used in register-pressure // Check for twice prior spilling. Once prior spilling might have // spilled 'soft', 2nd prior spill should have spilled 'hard' and // further spilling is unlikely to make progress. // collect defs for MultiDef printing // Check for a single def LRG; these can spill nicely // via rematerialization. Flag as NULL for no def found // yet, or 'n' for single def or -1 for many defs. // Limit result register mask to acceptable registers // Check for bound register masks // Check for maximum frequency value "oops must be in Op_RegP's" );
// Check for multi-kill projection // Fat projections have size equal to number of registers killed // Register pressure is tracked relative to the maximum values // suggested for that platform, INTPRESSURE and FLOATPRESSURE, // and relative to other types which compete for the same regs. // The following table contains suggested values based on the // architectures as defined in each .ad file. // INTPRESSURE and FLOATPRESSURE may be tuned differently for // compile-speed or performance. // SPARC and SPARCV9 reg_pressures are at 2 instead of 1 // since .ad registers are defined as high and low halves. // These reg_pressure values remain compatible with the code // in is_high_pressure() which relates get_invalid_mask_size(), // Block::_reg_pressure and INTPRESSURE, FLOATPRESSURE. // SPARC -d32 has 24 registers available for integral values, // but only 10 of these are safe for 64-bit longs. // Using set_reg_pressure(2) for both int and long means // the allocator will believe it can fit 26 longs into // registers. Using 2 for longs and 1 for ints means the // allocator will attempt to put 52 integers into registers. // The settings below limit this problem to methods with // many long values which are being run on 32-bit SPARC. // ------------------- reg_pressure -------------------- // Each entry is reg_pressure_per_value,number_of_regs // RegL RegI RegFlags RegF RegD INTPRESSURE FLOATPRESSURE // SPARC 2 2 2 2 2 48 (24) 52 (26) // SPARCV9 2 2 2 2 2 48 (24) 52 (26) // ----------------------------------------------------- case Op_RegL:
// Check for long or double // Define platform specific register pressure // If this def of a double forces a mis-aligned double, // flag as '_fat_proj' - really flag as allowing misalignment // AND changes how we count interferences. A mis-aligned // double can interfere with TWO aligned pairs, or effectively case 0:
// not an ideal register // Now do the same for inputs // Setup for CISC SPILLING // Convert operand number to edge index number // Prepare register mask for each input // If this instruction is CISC Spillable, add the flags // bit to its appropriate input // // Testing for floating point code shape // Node *test = n->in(k); // if( test->is_Mach() ) { // MachNode *m = test->as_Mach(); // int op = m->ideal_Opcode(); // if (n->is_Call() && (op == Op_AddF || op == Op_MulF) ) { // Limit result register mask to acceptable registers. // Do not limit registers from uncommon uses before // AggressiveCoalesce. This effectively pre-virtual-splits // around uncommon uses of common defs. // Since we are BEFORE aggressive coalesce, leave the register // mask untrimmed by the call. This encourages more coalescing. // Later, AFTER aggressive, this live range will have to spill // but the spiller handles slow-path calls very nicely. // Check for bound register masks // If this use of a double forces a mis-aligned double, // flag as '_fat_proj' - really flag as allowing misalignment // AND changes how we count interferences. A mis-aligned // double can interfere with TWO aligned pairs, or effectively // if the LRG is an unaligned pair, we will have to spill // so clear the LRG's register mask if it is not already spilled // Check for maximum frequency value }
// End for all allocated inputs }
// end for all instructions // Final per-liverange setup if(
lrg.
not_free() ) {
// Handle case where we lose from the start //------------------------------set_was_low------------------------------------ // Set the was-lo-degree bit. Conservative coalescing should not change the // colorability of the graph. If any live range was of low-degree before // coalescing, it should Simplify. This call sets the was-lo-degree bit. // The bit is checked in Simplify. }
else {
// Else check the Brigg's assertion // Brigg's observation is that the lo-degree neighbors of a // hi-degree live range will not interfere with the color choices // of said hi-degree live range. The Simplify reverse-stack-coloring // order takes care of the details. Hence you do not have to count // low-degree neighbors when determining if this guy colors. lrgs(i).
_was_lo =
1;
// Low degree via the briggs assertion //------------------------------cache_lrg_info--------------------------------- // Compute cost/area ratio, in case we spill. Build the lo-degree list. // Check for being of low degree: means we can be trivially colored. // Low degree, dead or must-spill guys just get to simplify right away // Split low degree list into those guys that must get a // register and those that can go to register or stack. // The idea is LRGs that can go register or stack color first when // they have a good chance of getting a register. The register-only // lo-degree live ranges always get a register. }
else {
// Else high degree//------------------------------Pre-Simplify----------------------------------- // Simplify the IFG by removing LRGs of low degree that have NO copies // Warm up the lo-degree no-copy list // Put the simplified guy on the simplified list. // Yank this guy from the IFG. // If any neighbors' degrees fall below their number of // allowed registers, then put that neighbor on the low degree // list. Note that 'degree' can only fall and 'numregs' is // unchanged by this action. Thus the two are equal at most once, // so LRGs hit the lo-degree worklists at most once. // Check for just becoming of-low-degree }
// End of while lo-degree no_copy worklist not empty // No more lo-degree no-copy live ranges to simplify //------------------------------Simplify--------------------------------------- // Simplify the IFG by removing LRGs of low degree. while(
1 ) {
// Repeat till simplified it all // May want to explore simplifying lo_degree before _lo_stk_degree. // This might result in more spills coloring into registers during // If possible, pull from lo_stk first // Put the simplified guy on the simplified list. // If this guy is "at risk" then mark his current neighbors // Yank this guy from the IFG. // If any neighbors' degrees fall below their number of // allowed registers, then put that neighbor on the low degree // list. Note that 'degree' can only fall and 'numregs' is // unchanged by this action. Thus the two are equal at most once, // so LRGs hit the lo-degree worklist at most once. // Check for just becoming of-low-degree just counting registers. // _must_spill live ranges are already on the low degree list. // Pull from hi-degree list // Check for got everything: is hi-degree list empty? // Time to pick a potential spill guy // It's just vaguely possible to move hi-degree to lo-degree without // going through a just-lo-degree stage: If you remove a double from // a float live range it's degree will drop by 2 and you can skip the // just-lo-degree stage. It's very rare (shows up after 5000+ methods // in -Xcomp of Java2Demo). So just choose this guy to simplify next. // wins. Ties happen because all live ranges in question have spilled // a few times before and the spill-score adds a huge number which // washes out the low order bits. We are choosing the lesser of 2 // evils; in this case pick largest area to spill. // Ties also happen when live ranges are defined and used only inside // one block. In which case their area is 0 and score set to max. // In such case choose bound live range over unbound to free registers // or with smaller cost to spill. // The live range we choose for spilling is either hi-degree, or very // rarely it can be low-degree. If we choose a hi-degree live range // there better not be any lo-degree choices. // Pull from hi-degree list // Jam him on the lo-degree list, despite his high degree. // Maybe he'll get a color, and maybe he'll spill. // Only Select() will know. }
// End of while not simplified everything//------------------------------bias_color------------------------------------- // Choose a color using the biasing heuristic // Check for "at_risk" LRG's // Walk the colored neighbors of the "at_risk" candidate // Choose a color which is both legal and already taken by a neighbor // of the "at_risk" candidate in order to improve the chances of the // "at_risk" candidate of coloring // If this LRG's register is legal for us, choose it (
reg&
1) ==
1) )
// or aligned (adjacent reg is available since we already cleared-to-pairs) // And it is legal for you, (
reg&
1) ==
1) )
// or aligned (adjacent reg is available since we already cleared-to-pairs) }
else if(
chunk == 0 ) {
// Choose a color which is legal for him // If no bias info exists, just go with the register selection ordering // CNC - Fun hack. Alternate 1st and 2nd selection. Enables post-allocate // copy removal to remove many more copies, by preventing a just-assigned // register from being repeatedly assigned. // This 'Remove; find; Insert' idiom is an expensive way to find the // SECOND element in the mask. //------------------------------choose_color----------------------------------- // Choose a color in the current chunk // Use a heuristic to "bias" the color choice // Fat-proj case or misaligned double argument. // Return the highest element in the set. //------------------------------Select----------------------------------------- // Select colors by re-inserting LRGs back into the IFG. LRGs are re-inserted // in reverse order of removal. As long as nothing of hi-degree was yanked, // everything going back is guaranteed a color. Select that color. If some // hi-degree LRG cannot get a color then we record that we must spill. // Pull next LRG from the simplified list - in reverse order of removal // Re-insert into the IFG // capture allstackedness flag before mask is hacked // Yeah, yeah, yeah, I know, I know. I can refactor this // to avoid the GOTO, although the refactored code will not // be much clearer. We arrive here IFF we have a stack-based // live range that cannot color in the current chunk, and it // has to move into the next free stack chunk. int chunk = 0;
// Current chunk is first chunk // Remove neighbor colors // Note that neighbor might be a spill_reg. In this case, exclusion // of its color will be a no-op, since the spill_reg chunk is in outer // space. Also, if neighbor is in a different chunk, this exclusion // will be a no-op. (Later on, if lrg runs out of possible colors in // its chunk, a new chunk of color may be tried, in which case // examination of neighbors is started again, at retry_next_chunk.) // Only subtract masks in the same chunk //assert(is_allstack == lrg->mask().is_AllStack(), "nbrs must not change AllStackedness"); // Aligned pairs need aligned masks // Check if a color is available and if so pick the color // If we fail to color and the AllStack flag is set, trigger // a chunk-rollover event // Bump register mask up to next stack chunk // Record selected register // Fold reg back into normal space // If the live range is not bound, then we actually had some choices // to make. In this case, the mask has more bits in it than the colors // chosen. Restrict the mask to just what was picked. // For pairs, also insert the low bit of the pair // mask must be equal to fatproj bits, by definition // Note that reg is the highest-numbered register in the newly-bound mask. }
// end color available case // Live range is live and no colors available // Assign the special spillreg register // Do not empty the regmask; leave mask_size lying around // for use during Spilling //------------------------------copy_was_spilled------------------------------- // Copy 'was_spilled'-edness from the source Node to the dst Node. //------------------------------set_was_spilled-------------------------------- // Set the 'spilled_once' or 'spilled_twice' flag on a node. //------------------------------fixup_spills----------------------------------- // Convert Ideal spill instructions into proper FramePtr + offset Loads and // Stores. Use-def chains are NOT preserved, but Node->LRG->reg maps are. // This function does only cisc spill work. // Grab the Frame Pointer // For all instructions in block C->
top() == n ||
// Or the random TOP node n->
is_Proj(),
// Or a fat-proj kill node "No dead instructions after post-alloc" );
// Convert operand number to edge index number // Doubles record the HIGH register of an adjacent pair. // This is a CISC Spill, get stack offset and construct new node // Bailout if we might exceed node limit when spilling this instruction }
// End of for all instructions }
// End of for all blocks//------------------------------find_base_for_derived-------------------------- // Helper to stretch above; recursively discover the base Node for a // given derived Node. Easy for AddP-related machine nodes, but needs // to be recursive for derived Phis. // See if already computed; if so return it // See if this happens to be a base. // NOTE: we use TypePtr instead of TypeOopPtr because we can have // pointers derived from NULL! These are always along paths that // can't happen at run-time but the optimizer cannot deduce it so // we have to handle it gracefully. // If its an OOP with a non-zero offset, then it is derived. // Derived is NULL+offset? Base is NULL! // Initialize it once and make it shared: // set control to _root and place it into Start block // (where top() node is placed). // Check for AddP-related opcodes // Recursively find bases for Phis. // First check to see if we can avoid a base Phi here. // Went to the end without finding any different bases? if( i ==
derived->
req() ) {
// No need for a base Phi here // Now we see we need a base-Phi here to merge the bases // Search the current block for an existing base-Phi for( i =
1; i <= b->
end_idx(); i++ ) {
// Search for matching Phi if( !
phi->
is_Phi() ) {
// Found end of Phis with no match? if( j ==
base->
req() ) {
// All inputs match? base =
phi;
// Then use existing 'phi' and drop 'base' // Cache info for later passes //------------------------------stretch_base_pointer_live_ranges--------------- // At each Safepoint, insert extra debug edges for each pair of derived value/ // base pointer that is live across the Safepoint for oopmap building. The // edge pairs get added in after sfpt->jvmtail()->oopoff(), but are in the // For all blocks in RPO do... // Note use of deep-copy constructor. I cannot hammer the original // liveout bits, because they are needed by the following coalesce pass. // Pre-split compares of loop-phis. Loop-phis form a cycle we would // like to see in the same register. Compare uses the loop-phi and so // extends its live range BUT cannot be part of the cycle. If this // extended live range overlaps with the update of the loop-phi value // we need both alive at the same time -- which requires at least 1 // copy. But because Intel has only 2-address registers we end up with // at least 2 copies, one before the loop-phi update instruction and // one after. Instead we split the input to the compare just after the // Get value being defined if(
lidx &&
lidx <
_maxlrg /* Ignore the occasional brand-new live range */) {
// Remove from live-out set // Copies do not define a new value and so do not interfere. // Remove the copies source from the liveout set before interfering. // Now scan for a live derived pointer // Find reaching DEF for base and derived values // This works because we are still in SSA during this call. // If its an OOP with a non-zero offset, then it is derived. // Add reaching DEFs of derived pointer and base pointer as a // See if the base pointer is already live to this point. // Since I'm working on the SSA form, live-ness amounts to // reaching def's. So if I find the base's live range then // I know the base's def reaches here. // Base pointer is not currently live. Since I stretched // the base pointer to here and it crosses basic-block // boundaries, the global live info is now incorrect. }
// End of if base pointer is not live to debug info }
// End of scan all live data for derived ptrs crossing GC point }
// End of if found a GC point if( !n->
is_Phi() ) {
// Phi function uses come from prior block for(
uint k =
1; k < n->
req(); k++ ) {
}
// End of forall instructions in block }
// End of forall blocks // If I created a new live range I need to recompute live //------------------------------add_reference---------------------------------- // Extend the node to LRG mapping //------------------------------dump------------------------------------------- if(
_node_regs ) {
// Got a post-allocation copy of allocation? for( k = 0; k < n->
req(); k++) {
// Data MultiNode's can have projections with no real registers. // Don't die while dumping them. for( ; k < n->
len(); k++ ) {
// Print live-out info at end of block tty->
print(
"--- Chaitin -- argsize: %d framesize: %d ---\n",
tty->
print(
"--- Live RanGe Array ---\n");
// Dump lo-stk-degree list //------------------------------dump_degree_lists------------------------------ // Dump lo-stk-degree list //------------------------------dump_simplified-------------------------------- //------------------------------dump_register---------------------------------- // Dump a register name into a buffer. Be intelligent if we get called // before allocation is complete. if( !
this ) {
// Not got anything? // Post allocation, use direct mappings, no LRG info available }
else if( !
lidx ) {
// Special, not allocated value }
else {
// Hah! We have a bound machine register//----------------------dump_for_spill_split_recycle-------------------------- // Display which live ranges need to be split and the allocator's state //------------------------------dump_frame------------------------------------ // Incoming arguments in registers dump for(
int k = 0; k <
argcnt; k++ ) {
// Check for un-owned padding above incoming args // Incoming argument area dump for( j = 0; j <
argcnt; j++) {
// Old outgoing preserve area // Preserved slots are present on x86 // Outgoing argument area dump // Outgoing new preserve area //------------------------------dump_bb---------------------------------------- //------------------------------dump_lrg--------------------------------------- tty->
print(
"Attempt to print live range index beyond max live range.\n");
if (!m)
continue;
// be robust in the dumper }
// End of per-block dump//------------------------------print_chaitin_statistics------------------------------- tty->
print(
"Conservatively coalesced %d copies, %d pairs",
tty->
print_cr(
"Lost coalesce opportunity, %d private-private, and %d cflow interfered.",
tty->
print_cr(
"Used cisc instruction %d, remained in register %d",