superword.cpp revision 1505
#
include "incls/_precompiled.incl"// S U P E R W O R D T R A N S F O R M //============================================================================= //------------------------------SuperWord--------------------------- //------------------------------transform_loop--------------------------- // Check for no control flow in body (other than exit) // Make sure the are no extra control users of the loop backedge // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit)))) // Do vectors exist on this architecture? init();
// initialize data structures // For now, define one block which is the entire loop body //------------------------------SLP_extract--------------------------- // Extract the superword level parallelism // 1) A reverse post-order of nodes in the block is constructed. By scanning // this list from first to last, all definitions are visited before their uses. // 2) A point-to-point dependence graph is constructed between memory references. // This simplies the upcoming "independence" checker. // 3) The maximum depth in the node graph from the beginning of the block // to each node is computed. This is used to prune the graph search // in the independence checker. // 4) For integer types, the necessary bit width is propagated backwards // from stores to allow packed operations on byte, char, and short // integers. This reverses the promotion to type "int" that javac // did for operations like: char c1,c2,c3; c1 = c2 + c3. // 5) One of the memory references is picked to be an aligned vector reference. // The pre-loop trip count is adjusted to align this reference in the // 6) The initial set of pack pairs is seeded with memory references. // 7) The set of pack pairs is extended by following use->def and def->use links. // 8) The pairs are combined into vector sized packs. // 9) Reorder the memory slices to co-locate members of the memory packs. // 10) Generate ideal vector nodes for the final set of packs and where necessary, // inserting scalar promotion, vector creation from multiple scalars, and // extraction of scalar values from vectors. //------------------------------find_adjacent_refs--------------------------- // Find the adjacent memory references and create pack pairs for them. // This is the initial set of packs that will then be extended by // following use->def and def->use links. The align positions are // assigned relative to the reference "align_to_ref" // Get list of memory operations // Find a memory reference to align to. The pre-loop trip count // is modified to align this reference to a vector-aligned address tty->
print_cr(
"\noffset = %d iv_adjustment = %d elt_align = %d scale = %d iv_stride = %d",
// Set alignment relative to "align_to_ref" // Create initial pack pairs of memory operations //------------------------------find_align_to_ref--------------------------- // Find a memory reference to align the loop induction variable to. // Looks first at stores then at loads, looking for a memory reference // with the largest number of references similar to it. // Count number of comparable memory ops // Discard if pre loop can't align this reference // Find Store (or Load) with the greatest number of "comparable" references // If no stores, look at loads tty->
print_cr(
"\nVector memops after find_align_to_refs");
//------------------------------ref_is_alignable--------------------------- // Can the preloop align the reference to position zero in the vector? return true;
// no induction variable // Stride one accesses are alignable. // If initial offset from start of object is computable, // compute alignment within the vector. //---------------------------dependence_graph--------------------------- // Construct dependency graph. // Add dependence edges to load/store nodes for memory dependence // A.out()->DependNode.in(1) and DependNode.out()->B.prec(x) // First, assign a dependence node to each memory node // For each memory slice, create the dependences // Get slice in predecessor order (last is first) // Make the slice dependent on the root // Create a sink for the slice // Now visit each pair of memory ops, creating the edges // If no dependency yet, use slice for (
int k = j -
1; k >= 0; k--) {
// Create a runtime check to disambiguate //---------------------------mem_slice_preds--------------------------- // Return a memory slice (node list) in predecessor order starting at "start" // Either unrolling is causing a memory edge not to disappear, // or need to run igvn.optimize() again before SLP // Ditto. Not sure what else to check further. // StoreCM has an input edge used as a precedence edge. // Maybe an issue when oop stores are vectorized. //------------------------------stmts_can_pack--------------------------- // Can s1 and s2 be in a pack with s1 immediately preceding s2 and // Do not use superword for non-primitives //------------------------------exists_at--------------------------- // Does s exist in a pack at position pos? //------------------------------are_adjacent_refs--------------------------- // Is s1 immediately before s2 in memory? // Do not use superword for non-primitives // FIXME - co_locate_pack fails on Stores in different mem-slices, so // only pack memops that are in the same alias set until that's fixed. //------------------------------isomorphic--------------------------- // Are s1 and s2 similar? if (
s1->
in(0) !=
s2->
in(0))
return false;
//------------------------------independent--------------------------- // Is there no data path from s1 to s2 or s2 to s1? // assert(s1->Opcode() == s2->Opcode(), "check isomorphic first"); //------------------------------independent_path------------------------------ // Helper for independent if (
dp >=
1000)
return false;
// stop deep recursion //------------------------------set_alignment--------------------------- //------------------------------data_size--------------------------- //------------------------------extend_packlist--------------------------- // Extend packset by following use->def and def->use links from pack members. //------------------------------follow_use_defs--------------------------- // Extend the packset by visiting operand definitions of nodes in pack p //------------------------------follow_def_uses--------------------------- // Extend the packset by visiting uses of nodes in pack p //---------------------------opnd_positions_match------------------------- // Is the use of d1 in u1 at the same operand position as d2 in u2? //------------------------------est_savings--------------------------- // Estimate the savings from executing s1 and s2 as a pack int save =
2 -
1;
// 2 operations per instruction in packed form //------------------------------costs--------------------------- //------------------------------combine_packs--------------------------- // Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last //-----------------------------construct_my_pack_map-------------------------- // Construct the map from nodes to packs. Only valid after the // point where a node is only in one pack (after combine_packs). //------------------------------filter_packs--------------------------- // Remove packs that are not implemented or not profitable. // Remove packs that are not implemented // Remove packs that are not profitable //------------------------------implemented--------------------------- // Can code be generated for pack p? //------------------------------profitable--------------------------- // For pack p, are all operands and all uses (with in the block) vector? // Return false if some input is not vector and inside block // For now, return false if not scalar promotion case (inputs are the same.) // Later, implement PackNode and allow differing, non-vector inputs // (maybe just the ones from outside the block.) // For now, return false if not all uses are vector. // Later, implement ExtractNode and allow non-vector uses (maybe // just the ones outside the block.) //------------------------------schedule--------------------------- // Adjust the memory graph for the packed operations // Co-locate in the memory graph the members of each memory pack //-------------------------------remove_and_insert------------------- //remove "current" from its current position in the memory graph and insert //it after the appropriate insertion point (lip or uip) //remove current_store from its current position in the memmory graph if (
use ==
prev) {
// connect prev to my_mem --i;
//deleted this edge; rescan position // all uses of insert_pt's memory state should use current's instead --i;
//deleted this edge; rescan position uint pos;
//lip (lower insert point) must be the last one in the memory slice //connect current to insert_pt //------------------------------co_locate_pack---------------------------------- // To schedule a store pack, we need to move any sandwiched memory ops either before // or after the pack, based upon dependence information: // (1) If any store in the pack depends on the sandwiched memory op, the // sandwiched memory op must be scheduled BEFORE the pack; // (2) If a sandwiched memory op depends on any store in the pack, the // sandwiched memory op must be scheduled AFTER the pack; // (3) If a sandwiched memory op (say, memA) depends on another sandwiched // memory op (say memB), memB must be scheduled before memA. So, if memA is // scheduled before the pack, memB must also be scheduled before the pack; // (4) If there is no dependence restriction for a sandwiched memory op, we simply // schedule this store AFTER the pack // (5) We know there is no dependence cycle, so there in no other case; // (6) Finally, all memory ops in another single pack should be moved in the same direction. // To schedule a load pack, we use the memory state of either the first or the last load in // the pack, based on the dependence constraint. // determine which memory operations should be scheduled before the pack for (
uint j = 0; j< i; j++) {
//start scheduling from "last" to "first" // Forward users of my memory state (except "previous) to my input memory state --i;
// deleted this edge; rescan position }
else {
// !in_pack(current, pk) ==> a sandwiched store // all loads in the pack should have the same memory state. By default, // we use the memory state of the last load. However, if any load could // not be moved down due to the dependence constraint, we use the memory // state of the first load. // Give each load the same memory state //------------------------------output--------------------------- // Convert packs into vector node operations // MUST ENSURE main loop's initial value is properly aligned: // (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0 // Insert extract (unpack) operations for scalar uses // Promote value to be stored to vector }
else if (n->
req() ==
3) {
// Promote operands to vector //------------------------------vector_opd--------------------------- // Create a vector operand for the nodes in pack p for operand: in(opd_idx) // Convert scalar input to vector. Use p0's type because it's container // maybe smaller than the operand's container. //------------------------------insert_extracts--------------------------- // If a use of pack p is not a vector use, then replace the // use with an extract operation. // Inspect each use of each pack member. For each use that is // not a vector use, replace the use with an extract operation. // Insert extract operation //------------------------------is_vector_use--------------------------- // Is use->in(u_idx) a vector use? // check for scalar promotion //------------------------------construct_bb--------------------------- // Construct reverse postorder list of block members // Find non-control nodes with no inputs from within block, // create a temporary map from node _idx to bb_idx for use // by the visited and post_visited sets, // and count number of nodes in block. for (
uint j = 0; j < n->
req(); j++) {
// Find memory slices (head and tail) // Create an RPO list of nodes in block // Push all non-control nodes with no inputs from within block, then control entry // Do a depth first walk over out edges // Don't go around backedge // There were no additional uses, post visit node now _stk.
pop();
// Remove node from stack _stk.
pop();
// Remove post-visited node from stack // Create real map of block indices for nodes //------------------------------initialize_bb--------------------------- // Initialize per node info //------------------------------bb_insert_after--------------------------- // Insert n into block after pos // Adjust map from node->_idx to _block index //------------------------------compute_max_depth--------------------------- // Compute max depth for expressions from beginning of block // Use to prune search paths during test for independence. //-------------------------compute_vector_element_type----------------------- // Compute necessary vector element type for expressions // This propagates backwards a narrower integer type when the // upper bits of the value are not needed. // Example: char a,b,c; a = b + c; // Normally the type of the add is integer, but for packed character // operations the type of the add needs to be char. // Propagate narrowed type backwards through operations // that don't depend on higher order bits // Only integer types need be examined // Don't propagate through a type conversion //------------------------------memory_alignment--------------------------- // Alignment within a vector memory reference //---------------------------container_type--------------------------- // Smallest type containing range of values //-------------------------vector_opd_range----------------------- // (Start, end] half-open range defining which operands are vector *
end = n->
req();
// default is all operands//------------------------------in_packset--------------------------- // Are s1 and s2 in a pack pair and ordered as s1,s2? //------------------------------in_pack--------------------------- //------------------------------remove_pack_at--------------------------- // Remove the pack at position pos in the packset //------------------------------executed_first--------------------------- // Return the node executed first in pack p. Uses the RPO block list //------------------------------executed_last--------------------------- // Return the node executed last in pack p. //----------------------------align_initial_loop_index--------------------------- // Adjust pre-loop limit so that in main loop, a load/store reference // to align_to_ref will be a position zero in the vector. // (iv + k) mod vector_align == 0 // Where we put new limit calculations // Ensure the original loop limit is available from the // pre-loop Opaque1 node. // lim0 == original pre loop limit // V == v_align (power of 2) // invar == extra invariant piece of the address expression // When reassociating expressions involving '%' the basic rules are: // (a - b) % k == 0 => a % k == b % k // (a + b) % k == 0 => a % k == (k - b) % k // For stride > 0 && scale > 0, // Derive the new pre-loop limit "lim" such that the two constraints: // (1) lim = lim0 + N (where N is some positive integer < V) // (2) (e + lim) % V == 0 // Substituting (1) into (2), // (e + lim0 + N) % V == 0 // N = (V - (e + lim0)) % V // substitute back into (1), so that new limit // lim = lim0 + (V - (e + lim0)) % V // For stride > 0 && scale < 0 // (e - lim0 - N) % V == 0 // lim = lim0 + (e - lim0) % V // For stride < 0 && scale > 0 // (e + lim0 - N) % V == 0 // lim = lim0 - (e + lim0) % V // For stride < 0 && scale < 0 // (e - lim0 + N) % V == 0 // N = (V - (e - lim0)) % V // lim = lim0 - (V - (e - lim0)) % V // incorporate any extra invariant piece producing k +/- invar >>> log2(elt) // compute V - (e +/- lim0) // substitute back into (1), so that new limit //----------------------------get_pre_loop_end--------------------------- // Find pre loop end from main loop. Returns null if none. //------------------------------init--------------------------- //------------------------------print_packset--------------------------- //------------------------------print_pack--------------------------- //------------------------------print_bb--------------------------- //------------------------------print_stmt--------------------------- //------------------------------blank--------------------------- //==============================SWPointer=========================== //----------------------------SWPointer------------------------ // Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant) //unsafe reference could not be aligned appropriately without runtime checking for (
int i = 0; i <
3; i++) {
break;
// stop looking at addp's // Following is used to create a temporary object during // the pattern match of an address expression. //------------------------scaled_iv_plus_offset-------------------- // where: k is a constant that maybe zero, and // offset is (k2 [+/- invariant]) where k2 maybe zero and invariant is optional //----------------------------scaled_iv------------------------ // Match: k*iv where k is a constant that's not zero return false;
// already found a scale // Need to preserve the current _offset value, so // create a temporary object for this expression subtree. // Hacky, so should re-engineer the address pattern match. //----------------------------offset_plus_k------------------------ // Match: offset is (k [+/- invariant]) // where k maybe zero and invariant is optional, but not both. // Okay if value fits into an int if (
_invar !=
NULL)
return false;
// already have an invariant //----------------------------print------------------------ tty->
print(
"base: %d adr: %d scale: %d offset: %d invar: %c%d\n",
// ========================= OrderedPair ===================== // ========================= SWNodeInfo ===================== // ============================ DepGraph =========================== //------------------------------make_node--------------------------- // Make a new dependence graph node for an ideal node. //------------------------------make_edge--------------------------- // Make a new dependence graph edge from dpred -> dsucc // ========================== DepMem ======================== //------------------------------in_cnt--------------------------- //------------------------------out_cnt--------------------------- //------------------------------print----------------------------- // =========================== DepEdge ========================= //------------------------------DepPreds--------------------------- // =========================== DepPreds ========================= // Iterator over predecessor edges in the dependence graph. //------------------------------DepPreds--------------------------- //------------------------------next--------------------------- // =========================== DepSuccs ========================= // Iterator over successor edges in the dependence graph. //------------------------------DepSuccs--------------------------- //-------------------------------next---------------------------