superword.cpp revision 579
579N/A * Copyright 2007-2009 Sun Microsystems, Inc. All Rights Reserved. 0N/A * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 0N/A * This code is free software; you can redistribute it and/or modify it 0N/A * under the terms of the GNU General Public License version 2 only, as 0N/A * published by the Free Software Foundation. 0N/A * This code is distributed in the hope that it will be useful, but WITHOUT 0N/A * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 0N/A * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 0N/A * version 2 for more details (a copy is included in the LICENSE file that 0N/A * accompanied this code). 0N/A * You should have received a copy of the GNU General Public License version 0N/A * 2 along with this work; if not, write to the Free Software Foundation, 0N/A * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 0N/A * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, 0N/A * CA 95054 USA or visit www.sun.com if you need additional information or 0N/A * have any questions. 0N/A#
include "incls/_precompiled.incl" 0N/A// S U P E R W O R D T R A N S F O R M 0N/A//============================================================================= 0N/A//------------------------------SuperWord--------------------------- 0N/A//------------------------------transform_loop--------------------------- 0N/A // Check for no control flow in body (other than exit) 105N/A // Make sure the are no extra control users of the loop backedge 0N/A // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit)))) 0N/A // Do vectors exist on this architecture? 0N/A init();
// initialize data structures 0N/A // For now, define one block which is the entire loop body 0N/A//------------------------------SLP_extract--------------------------- 0N/A// Extract the superword level parallelism 0N/A// 1) A reverse post-order of nodes in the block is constructed. By scanning 0N/A// this list from first to last, all definitions are visited before their uses. 0N/A// 2) A point-to-point dependence graph is constructed between memory references. 0N/A// This simplies the upcoming "independence" checker. 0N/A// 3) The maximum depth in the node graph from the beginning of the block 0N/A// to each node is computed. This is used to prune the graph search 0N/A// in the independence checker. 0N/A// 4) For integer types, the necessary bit width is propagated backwards 0N/A// from stores to allow packed operations on byte, char, and short 0N/A// integers. This reverses the promotion to type "int" that javac 0N/A// did for operations like: char c1,c2,c3; c1 = c2 + c3. 0N/A// 5) One of the memory references is picked to be an aligned vector reference. 0N/A// The pre-loop trip count is adjusted to align this reference in the 0N/A// 6) The initial set of pack pairs is seeded with memory references. 0N/A// 7) The set of pack pairs is extended by following use->def and def->use links. 0N/A// 8) The pairs are combined into vector sized packs. 0N/A// 9) Reorder the memory slices to co-locate members of the memory packs. 0N/A// 10) Generate ideal vector nodes for the final set of packs and where necessary, 0N/A// inserting scalar promotion, vector creation from multiple scalars, and 0N/A// extraction of scalar values from vectors. 0N/A // Attempt vectorization 0N/A//------------------------------find_adjacent_refs--------------------------- 0N/A// Find the adjacent memory references and create pack pairs for them. 0N/A// This is the initial set of packs that will then be extended by 0N/A// following use->def and def->use links. The align positions are 0N/A// assigned relative to the reference "align_to_ref" 0N/A // Get list of memory operations 0N/A // Find a memory reference to align to. The pre-loop trip count 0N/A // is modified to align this reference to a vector-aligned address 72N/A tty->
print_cr(
"\noffset = %d iv_adjustment = %d elt_align = %d scale = %d iv_stride = %d",
0N/A // Set alignment relative to "align_to_ref" 0N/A // Create initial pack pairs of memory operations 0N/A//------------------------------find_align_to_ref--------------------------- 0N/A// Find a memory reference to align the loop induction variable to. 0N/A// Looks first at stores then at loads, looking for a memory reference 0N/A// with the largest number of references similar to it. 0N/A // Count number of comparable memory ops 0N/A // Discard if pre loop can't align this reference 0N/A // Find Store (or Load) with the greatest number of "comparable" references 0N/A // If no stores, look at loads 0N/A//------------------------------ref_is_alignable--------------------------- 0N/A// Can the preloop align the reference to position zero in the vector? 0N/A return true;
// no induction variable 0N/A // Stride one accesses are alignable. 0N/A // If initial offset from start of object is computable, 0N/A // compute alignment within the vector. 0N/A//---------------------------dependence_graph--------------------------- 0N/A// Construct dependency graph. 0N/A// Add dependence edges to load/store nodes for memory dependence 0N/A// A.out()->DependNode.in(1) and DependNode.out()->B.prec(x) 0N/A // First, assign a dependence node to each memory node 0N/A // For each memory slice, create the dependences 0N/A // Get slice in predecessor order (last is first) 0N/A // Make the slice dependent on the root 0N/A // Create a sink for the slice 0N/A // Now visit each pair of memory ops, creating the edges 0N/A // If no dependency yet, use slice 0N/A for (
int k = j -
1; k >= 0; k--) {
0N/A // Create a runtime check to disambiguate 0N/A // Possibly same address 0N/A//---------------------------mem_slice_preds--------------------------- 0N/A// Return a memory slice (node list) in predecessor order starting at "start" 0N/A // Either unrolling is causing a memory edge not to disappear, 0N/A // or need to run igvn.optimize() again before SLP 0N/A // Ditto. Not sure what else to check further. 0N/A // StoreCM has an input edge used as a precedence edge. 0N/A // Maybe an issue when oop stores are vectorized. 0N/A//------------------------------stmts_can_pack--------------------------- 0N/A// Can s1 and s2 be in a pack with s1 immediately preceeding s2 and 0N/A// s1 aligned at "align" 0N/A//------------------------------exists_at--------------------------- 0N/A// Does s exist in a pack at position pos? 0N/A//------------------------------are_adjacent_refs--------------------------- 0N/A// Is s1 immediately before s2 in memory? 0N/A // FIXME - co_locate_pack fails on Stores in different mem-slices, so 0N/A // only pack memops that are in the same alias set until that's fixed. 0N/A//------------------------------isomorphic--------------------------- 0N/A// Are s1 and s2 similar? 0N/A//------------------------------independent--------------------------- 0N/A// Is there no data path from s1 to s2 or s2 to s1? 0N/A // assert(s1->Opcode() == s2->Opcode(), "check isomorphic first"); 0N/A//------------------------------independent_path------------------------------ 0N/A// Helper for independent 0N/A if (
dp >=
1000)
return false;
// stop deep recursion 0N/A//------------------------------set_alignment--------------------------- 0N/A//------------------------------data_size--------------------------- 0N/A//------------------------------extend_packlist--------------------------- 0N/A// Extend packset by following use->def and def->use links from pack members. 0N/A//------------------------------follow_use_defs--------------------------- 0N/A// Extend the packset by visiting operand definitions of nodes in pack p 0N/A//------------------------------follow_def_uses--------------------------- 0N/A// Extend the packset by visiting uses of nodes in pack p 0N/A//---------------------------opnd_positions_match------------------------- 0N/A// Is the use of d1 in u1 at the same operand position as d2 in u2? 0N/A//------------------------------est_savings--------------------------- 0N/A// Estimate the savings from executing s1 and s2 as a pack 0N/A int save =
2 -
1;
// 2 operations per instruction in packed form 0N/A//------------------------------costs--------------------------- 0N/A//------------------------------combine_packs--------------------------- 0N/A// Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last 0N/A//-----------------------------construct_my_pack_map-------------------------- 0N/A// Construct the map from nodes to packs. Only valid after the 0N/A// point where a node is only in one pack (after combine_packs). 0N/A//------------------------------filter_packs--------------------------- 0N/A// Remove packs that are not implemented or not profitable. 0N/A // Remove packs that are not implemented 0N/A // Remove packs that are not profitable 0N/A//------------------------------implemented--------------------------- 0N/A// Can code be generated for pack p? 0N/A//------------------------------profitable--------------------------- 0N/A// For pack p, are all operands and all uses (with in the block) vector? 0N/A // Return false if some input is not vector and inside block 0N/A // For now, return false if not scalar promotion case (inputs are the same.) 0N/A // Later, implement PackNode and allow differring, non-vector inputs 0N/A // (maybe just the ones from outside the block.) 0N/A // For now, return false if not all uses are vector. 0N/A // Later, implement ExtractNode and allow non-vector uses (maybe 0N/A // just the ones outside the block.) 0N/A//------------------------------schedule--------------------------- 0N/A// Adjust the memory graph for the packed operations 0N/A // Co-locate in the memory graph the members of each memory pack 0N/A//------------------------------co_locate_pack--------------------------- 0N/A// Within a pack, move stores down to the last executed store, 0N/A// and move loads up to the first executed load. 0N/A // Push Stores down towards last executed pack member 0N/A // Forward users of my memory state to my input memory state 0N/A --i;
// deleted this edge; rescan position 0N/A // put current immediately before insert_pt 0N/A // Pull Loads up towards first executed pack member 0N/A // Give each load same memory state as first 0N/A//------------------------------output--------------------------- 0N/A// Convert packs into vector node operations 0N/A // MUST ENSURE main loop's initial value is properly aligned: 0N/A // (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0 0N/A // Insert extract (unpack) operations for scalar uses 0N/A // Promote value to be stored to vector 0N/A // Promote operands to vector 0N/A//------------------------------vector_opd--------------------------- 0N/A// Create a vector operand for the nodes in pack p for operand: in(opd_idx) 0N/A // Convert scalar input to vector. Use p0's type because it's container 0N/A // maybe smaller than the operand's container. 0N/A // Insert pack operation 0N/A//------------------------------insert_extracts--------------------------- 0N/A// If a use of pack p is not a vector use, then replace the 0N/A// use with an extract operation. 0N/A // Inspect each use of each pack member. For each use that is 0N/A // not a vector use, replace the use with an extract operation. 0N/A // Insert extract operation 0N/A//------------------------------is_vector_use--------------------------- 0N/A// Is use->in(u_idx) a vector use? 0N/A // check for scalar promotion 0N/A//------------------------------construct_bb--------------------------- 0N/A// Construct reverse postorder list of block members 0N/A // Find non-control nodes with no inputs from within block, 0N/A // create a temporary map from node _idx to bb_idx for use 0N/A // by the visited and post_visited sets, 0N/A // and count number of nodes in block. 0N/A // Find memory slices (head and tail) 0N/A // Create an RPO list of nodes in block 0N/A // Push all non-control nodes with no inputs from within block, then control entry 0N/A // Do a depth first walk over out edges 0N/A // forward arc in graph 0N/A // cross or back arc 0N/A // Don't go around backedge 0N/A // There were no additional uses, post visit node now 0N/A _stk.
pop();
// Remove post-visited node from stack 0N/A // Create real map of block indices for nodes 0N/A//------------------------------initialize_bb--------------------------- 0N/A// Initialize per node info 0N/A//------------------------------bb_insert_after--------------------------- 0N/A// Insert n into block after pos 0N/A // Adjust map from node->_idx to _block index 0N/A//------------------------------compute_max_depth--------------------------- 0N/A// Compute max depth for expressions from beginning of block 0N/A// Use to prune search paths during test for independence. 0N/A//-------------------------compute_vector_element_type----------------------- 0N/A// Compute necessary vector element type for expressions 0N/A// This propagates backwards a narrower integer type when the 0N/A// upper bits of the value are not needed. 0N/A// Example: char a,b,c; a = b + c; 0N/A// Normally the type of the add is integer, but for packed character 0N/A// operations the type of the add needs to be char. 0N/A // Propagate narrowed type backwards through operations 0N/A // that don't depend on higher order bits 0N/A // Only integer types need be examined 0N/A // Don't propagate through a type conversion 0N/A//------------------------------memory_alignment--------------------------- 0N/A// Alignment within a vector memory reference 0N/A//---------------------------container_type--------------------------- 0N/A// Smallest type containing range of values 0N/A//-------------------------vector_opd_range----------------------- 0N/A// (Start, end] half-open range defining which operands are vector 0N/A//------------------------------in_packset--------------------------- 0N/A// Are s1 and s2 in a pack pair and ordered as s1,s2? 0N/A//------------------------------in_pack--------------------------- 0N/A//------------------------------remove_pack_at--------------------------- 0N/A// Remove the pack at position pos in the packset 0N/A//------------------------------executed_first--------------------------- 0N/A// Return the node executed first in pack p. Uses the RPO block list 0N/A// to determine order. 0N/A//------------------------------executed_last--------------------------- 0N/A// Return the node executed last in pack p. 0N/A//----------------------------align_initial_loop_index--------------------------- 0N/A// Adjust pre-loop limit so that in main loop, a load/store reference 0N/A// to align_to_ref will be a position zero in the vector. 0N/A// (iv + k) mod vector_align == 0 0N/A // Where we put new limit calculations 0N/A // Ensure the original loop limit is available from the 0N/A // pre-loop Opaque1 node. 72N/A // lim0 == original pre loop limit 72N/A // V == v_align (power of 2) 72N/A // invar == extra invariant piece of the address expression 72N/A // e == k [ +/- invar ] 72N/A // When reassociating expressions involving '%' the basic rules are: 72N/A // (a - b) % k == 0 => a % k == b % k 72N/A // (a + b) % k == 0 => a % k == (k - b) % k 72N/A // For stride > 0 && scale > 0, 72N/A // Derive the new pre-loop limit "lim" such that the two constraints: 72N/A // (1) lim = lim0 + N (where N is some positive integer < V) 72N/A // (2) (e + lim) % V == 0 72N/A // Substituting (1) into (2), 72N/A // (e + lim0 + N) % V == 0 72N/A // N = (V - (e + lim0)) % V 72N/A // substitute back into (1), so that new limit 72N/A // lim = lim0 + (V - (e + lim0)) % V 72N/A // For stride > 0 && scale < 0 72N/A // (e - lim) % V == 0 72N/A // Solving for lim: 72N/A // (e - lim0 - N) % V == 0 72N/A // N = (e - lim0) % V 72N/A // lim = lim0 + (e - lim0) % V 72N/A // For stride < 0 && scale > 0 72N/A // (e + lim) % V == 0 72N/A // Solving for lim: 72N/A // (e + lim0 - N) % V == 0 72N/A // N = (e + lim0) % V 72N/A // lim = lim0 - (e + lim0) % V 72N/A // For stride < 0 && scale < 0 72N/A // (e - lim) % V == 0 72N/A // Solving for lim: 72N/A // (e - lim0 + N) % V == 0 72N/A // N = (V - (e - lim0)) % V 72N/A // lim = lim0 - (V - (e - lim0)) % V 72N/A // incorporate any extra invariant piece producing k +/- invar >>> log2(elt) 72N/A // compute e +/- lim0 72N/A // compute V - (e +/- lim0) 72N/A // compute N = (exp) % V 72N/A // substitute back into (1), so that new limit 0N/A//----------------------------get_pre_loop_end--------------------------- 0N/A// Find pre loop end from main loop. Returns null if none. 0N/A//------------------------------init--------------------------- 0N/A//------------------------------print_packset--------------------------- 0N/A//------------------------------print_pack--------------------------- 0N/A//------------------------------print_bb--------------------------- 0N/A//------------------------------print_stmt--------------------------- 0N/A//------------------------------blank--------------------------- 0N/A//==============================SWPointer=========================== 0N/A//----------------------------SWPointer------------------------ 0N/A // Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant) 0N/A for (
int i = 0; i <
3; i++) {
0N/A break;
// stop looking at addp's 0N/A// Following is used to create a temporary object during 0N/A// the pattern match of an address expression. 0N/A//------------------------scaled_iv_plus_offset-------------------- 0N/A// Match: k*iv + offset 0N/A// where: k is a constant that maybe zero, and 0N/A// offset is (k2 [+/- invariant]) where k2 maybe zero and invariant is optional 0N/A//----------------------------scaled_iv------------------------ 0N/A// Match: k*iv where k is a constant that's not zero 0N/A return false;
// already found a scale 0N/A // Need to preserve the current _offset value, so 0N/A // create a temporary object for this expression subtree. 0N/A // Hacky, so should re-engineer the address pattern match. 0N/A//----------------------------offset_plus_k------------------------ 0N/A// Match: offset is (k [+/- invariant]) 0N/A// where k maybe zero and invariant is optional, but not both. 0N/A // Okay if value fits into an int 0N/A//----------------------------print------------------------ 0N/A tty->
print(
"base: %d adr: %d scale: %d offset: %d invar: %c%d\n",
0N/A// ========================= OrderedPair ===================== 0N/A// ========================= SWNodeInfo ===================== 0N/A// ============================ DepGraph =========================== 0N/A//------------------------------make_node--------------------------- 0N/A// Make a new dependence graph node for an ideal node. 0N/A//------------------------------make_edge--------------------------- 0N/A// Make a new dependence graph edge from dpred -> dsucc 0N/A// ========================== DepMem ======================== 0N/A//------------------------------in_cnt--------------------------- 0N/A//------------------------------out_cnt--------------------------- 0N/A//------------------------------print----------------------------- 0N/A// =========================== DepEdge ========================= 0N/A//------------------------------DepPreds--------------------------- 0N/A// =========================== DepPreds ========================= 0N/A// Iterator over predecessor edges in the dependence graph. 0N/A//------------------------------DepPreds--------------------------- 0N/A//------------------------------next--------------------------- 0N/A// =========================== DepSuccs ========================= 0N/A// Iterator over successor edges in the dependence graph. 0N/A//------------------------------DepSuccs--------------------------- 0N/A//-------------------------------next---------------------------