assembler_sparc.cpp revision 665
665N/A * Copyright 1997-2009 Sun Microsystems, Inc. All Rights Reserved. 0N/A * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 0N/A * This code is free software; you can redistribute it and/or modify it 0N/A * under the terms of the GNU General Public License version 2 only, as 0N/A * published by the Free Software Foundation. 0N/A * This code is distributed in the hope that it will be useful, but WITHOUT 0N/A * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 0N/A * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 0N/A * version 2 for more details (a copy is included in the LICENSE file that 0N/A * accompanied this code). 0N/A * You should have received a copy of the GNU General Public License version 0N/A * 2 along with this work; if not, write to the Free Software Foundation, 0N/A * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 0N/A * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, 0N/A * CA 95054 USA or visit www.sun.com if you need additional information or 0N/A * have any questions. 0N/A#
include "incls/_precompiled.incl" 0N/A// Implementation of Address 0N/A// Warning: In LP64 mode, _disp will occupy more than 10 bits. 0N/A// This is inconsistent with the other constructors but op 0N/A// codes such as ld or ldx, only access disp() to get their 0N/A {
"A0",
"P0"}, {
"A1",
"P1"}, {
"A2",
"P2"}, {
"A3",
"P3"}, {
"A4",
"P4"},
0N/A {
"A5",
"P5"}, {
"A6",
"P6"}, {
"A7",
"P7"}, {
"A8",
"P8"}, {
"A9",
"P9"},
0N/A default: s =
"????";
break;
0N/A default: s =
"????";
break;
0N/A// Patch instruction inst at offset inst_pos to refer to dest_pos 0N/A// and return the resulting instruction. 0N/A// We should have pcs, not offsets, but since all is relative, it will work out 0N/A int m;
// mask for displacement field 0N/A int v;
// new value for displacement field 0N/A// Return the offset of the branch destination of instruction inst 0N/A// Should have pcs, but since all is relative, it works out. 0N/A return 0x00;
// illegal instruction 0x00000000 0N/A// Generate a bunch 'o stuff (including v9's 0N/A// Generate a bunch 'o stuff unique to V8 0N/A// Implementation of MacroAssembler 0N/A // provoke OS NULL exception if reg = NULL by 0N/A // accessing M[reg] w/o changing any registers 0N/A // nothing to do, (later) access of M[reg + offset] 0N/A // will provoke OS NULL exception if reg = NULL 0N/A // This can only be traceable if r1 & r2 are visible after a window save 0N/A // get nearby pc, store jmp target 0N/A // This can only be traceable if r1 is visible after a window save 0N/A // get nearby pc, store jmp target 0N/A// This code sequence is relocatable to any address, even on LP64. 0N/A // Force fixed length sethi because NativeJump and NativeFarCall don't handle 0N/A // variable length instruction streams. 0N/A // Must do the add here so relocation can find the remainder of the 0N/A // value to be relocated. 0N/A // get nearby pc, store jmp target 0N/A// Convert to C varargs format 0N/A // spill register-resident args to their memory slots 0N/A // (SPARC calling convention requires callers to have already preallocated these) 0N/A // Note that the inArg might in fact be an outgoing argument, 0N/A // if a leaf routine or stub does some tricky argument shuffling. 0N/A // This routine must work even though one of the saved arguments 0N/A // is in the d register (e.g., set_varargs(Argument(0, false), O0)). 0N/A // return the address of the first memory slot 0N/A// Conditional breakpoint (for assertion checks in assembly code) 0N/A// We want to use ST_BREAKPOINT here, but the debugger is confused by it. 0N/A// flush windows (except current) using flushw instruction if avail. 0N/A// Write serialization page so VM thread can do a pseudo remote membar 0N/A// We use the current thread pointer to calculate a thread specific 0N/A// offset to write to within the page. This minimizes bus traffic 0N/A// due to cache line collision. 0N/A // Get the condition codes the V8 way. 
0N/A // This is a test of V8 which has icc but not xcc 0N/A // so mask off the xcc bits 0N/A // Compare condition codes from the V8 and V9 ways. 0N/A // Write out the saved condition codes the V8 way 0N/A // Read back the condition codes using the V9 instruction 0N/A // This is a test of V8 which has icc but not xcc 0N/A // so mask off the xcc bits 0N/A // Compare the V8 way with the V9 way. 0N/A // Test code sequence used on V8. Do not move above rdccr. 0N/A // Test code sequence used on V8. Do not move below wrccr. 0N/A// a hook for debugging 0N/A// call this when G2_thread is not known to be valid 0N/A // NOTE: this chops off the heads of the 64-bit O registers. 0N/A // make sure G2_thread contains the right value 0N/A#
endif /* CC_INTERP */ 0N/A // Save & restore possible 64-bit Long arguments in G-regs 0N/A // G2 restored below 0N/A // Save & restore possible 64-bit Long arguments in G-regs 0N/A // smash G2_thread, as if the VM were about to anyway 0N/A // do it the slow way 0N/A// %%% maybe get rid of [re]set_last_Java_frame 0N/A // Always set last_Java_pc and flags first because once last_Java_sp is visible 0N/A // has_last_Java_frame is true and users will look at the rest of the fields. 0N/A // (Note: flags should always be zero before we get here so doesn't need to be set.) 0N/A // Verify that flags was zeroed on return to Java 0N/A stop(
"last_Java_pc not zeroed before leaving Java");
0N/A // Verify that flags was zeroed on return to Java 0N/A stop(
"flags not zeroed before leaving Java");
0N/A // When returning from calling out from Java mode the frame anchor's last_Java_pc 0N/A // will always be set to NULL. It is set here so that if we are doing a call to 0N/A // native (not VM) that we capture the known pc and don't have to rely on the 0N/A // native call having a standard frame linkage where we can find the pc. 0N/A // Make sure that we have an odd stack 0N/A stop(
"Stack Not Biased in set_last_Java_frame");
0N/A // check that it WAS previously set 0N/A#
endif /* CC_INTERP */ 0N/A // Always return last_Java_pc to zero 0N/A // Always null flags after return to Java 0N/A // determine last_java_sp register 0N/A // debugging support 0N/A // 64-bit last_java_sp is biased! 0N/A // check for pending exceptions. use Gtemp as scratch register. 0N/A // get oop result if there is one and reset the value in the thread 0N/A // we use O7 linkage so that forward_exception_entry has the issuing PC 0N/A // O0 is reserved for the thread 0N/A // O0 is reserved for the thread 0N/A // O0 is reserved for the thread 0N/A// Note: The following call_VM overloadings are useful when a "save" 0N/A// has already been performed by a stub, and the last Java frame is 0N/A// the previous one. In that case, last_java_sp must be passed as FP 0N/A // O0 is reserved for the thread 0N/A // O0 is reserved for the thread 0N/A // O0 is reserved for the thread 0N/A// We require that C code which does not return a value in vm_result will 0N/A// leave it undisturbed. 0N/A // Check that we are not overwriting any other oop. 0N/A#
endif /* CC_INTERP */ 0N/A// %%% Note: The following six instructions have been moved, 0N/A// They will be refactored at a later date. 0N/A // if addr of local, do not need to load it 0N/A// ForceRelocatable = 1; 0N/A if ( a.
hi32() &
0x3ff )
// Any bits? 0N/A if ( a.
low32() &
0xFFFFFC00 ) {
// done? 0N/A if( (a.
low32() >>
20) &
0xfff ) {
// Any bits set? 0N/A // Pad out the instruction sequence so it can be 0N/A or3(
G0,
value, d);
// setsw (this leaves upper 32 bits sign-extended) 0N/A // (A negative value could be loaded in 2 insns with sethi/xor, 0N/A // but it would take a more complex relocation.) 0N/A// %%% End of moved six set instructions. 0N/A // (Matcher::isSimpleConstant64 knows about the following optimizations.) 0N/A// compute size in bytes of sparc frame, given 0N/A// number of extraWords 0N/A// save_frame: given number of "extra" words in frame, 0N/A// issue approp. save instruction (p 200, v8 manual) 0N/A // The trick here is to use precisely the same memory word 0N/A // that trap handlers also use to save the register. 0N/A // This word cannot be used for any other purpose, but 0N/A // it works fine to save the register's value, whether or not 0N/A // an interrupt flushes register windows at any given moment! 164N/A // Assembler::sethi(0x3fffff, d); 164N/A // Don't add relocation for 'add'. Do patching during 'sethi' processing. 0N/A for ( j = 0; j <
8; ++j )
0N/A for ( j = 0; j <
8; ++j )
0N/A for ( j = 0; j <
8; ++j )
0N/A for ( j = 0; j <
8; ++j )
0N/A // print out floats with compression 0N/A for (j = 0; j <
32; ) {
0N/A // and doubles (evens only) 0N/A for (j = 0; j <
32; ) {
0N/A for (i = 0; i <
8; ++i) {
0N/A for (i = 0; i <
32; ++i) {
0N/A for (
int i =
1; i <
8; ++i) {
0N/A for (
int j = 0; j <
32; ++j) {
0N/A// pushes double TOS element of FPU stack on CPU stack; pops from FPU stack 0N/A // %%%%%% need to implement this 0N/A// pops double TOS element from CPU stack and pushes on FPU stack 0N/A // %%%%%% need to implement this 0N/A // %%%%%% need to implement this 0N/A // plausibility check for oops 0N/A if (
reg ==
G0)
return;
// always NULL, which is always an oop 0N/A // Call indirectly to solve generation ordering problem 0N/A // Make some space on stack above the current register window. 0N/A // Enough to hold 8 64-bit registers. 0N/A // Save some 64-bit registers; a normal 'save' chops the heads off 0N/A // of 64-bit longs in the 32-bit build. 0N/A mov(
reg,
O0);
// Move arg into O0; arg might be in O7 which is about to be crushed 0N/A // Load address to call to into O7 0N/A // Register call to verify_oop_subroutine 0N/A // recover frame size 0N/A // plausibility check for oops 0N/A // Call indirectly to solve generation ordering problem 0N/A // Make some space on stack above the current register window. 0N/A // Enough to hold 8 64-bit registers. 0N/A // Save some 64-bit registers; a normal 'save' chops the heads off 0N/A // of 64-bit longs in the 32-bit build. 0N/A // Load address to call to into O7 0N/A // Register call to verify_oop_subroutine 0N/A // recover frame size 0N/A// This macro is expanded just once; it creates shared code. Contract: 0N/A// receives an oop in O0. Must restore O0 & O7 from TLS. Must not smash ANY 0N/A// registers, including flags. May not use a register 'save', as this blows 0N/A// the high bits of the O-regs if they contain Long values. Acts as a 'leaf' 0N/A // Leaf call; no frame. 0N/A // O0 and O7 were saved already (O0 in O0's TLS home, O7 in O5's TLS home). 0N/A // O0 is now the oop to be checked. O7 is the return address. 0N/A // Save some more registers for temps. 0N/A {
// count number of verifies 0N/A // mark lower end of faulting range 0N/A // We can't check the mark oop because it could be in the process of 0N/A // locking or unlocking while this is running. 0N/A // assert((obj & oop_mask) == oop_bits); 0N/A // the null_or_fail case is useless; must test for null separately 0N/A // Check the klassOop of this object for being in the right area of memory. 0N/A // Cannot do the load in the delay above slot in case O0 is null 0N/A // assert((klass & klass_mask) == klass_bits); 0N/A // Check the klass's klass 0N/A // mark upper end of faulting range 0N/A //----------------------- 0N/A // Restore prior 64-bit registers 0N/A retl();
// Leaf return; restore prior O7 in delay slot 0N/A //----------------------- 0N/A //----------------------- 0N/A // stop_subroutine expects message pointer in I1. 0N/A // Restore prior 64-bit registers 0N/A // factor long stop-sequence into subroutine to save space 0N/A // call indirectly to solve generation ordering problem 0N/A // save frame first to get O7 for return address 0N/A // add one word to size in case struct is odd number of words long 0N/A // It must be doubleword-aligned for storing doubles into it. 0N/A // stop_subroutine expects message pointer in I1. 0N/A // factor long stop-sequence into subroutine to save space 0N/A // call indirectly to solve generation ordering problem 0N/A // unnoticeable results in the output files. 0N/A // restore(); done in callee to save space! 0N/A// delayed()->restore(); 0N/A // We must be able to turn interactive prompting off 0N/A // in order to run automated test scripts on the VM 0N/A // Use the flag ShowMessageBoxOnError 0N/A char* b =
new char[
1024];
0N/A // for the sake of the debugger, stick a PC on the current frame 0N/A // (this assumes that the caller has performed an extra "save") 0N/A // We expect pointer to message in I1. Caller must set it up in O1 0N/A // In order to get locks work, we need to fake a in_VM state 0N/A// --------------------------------------------------------- 0N/A// compares register with zero and branches. NOT FOR USE WITH 64-bit POINTERS 0N/A// Compares a pointer register with zero and branches on null. 0N/A// Does a test & branch on 32-bit systems and a register-branch on 64-bit. 0N/A// instruction sequences factored across compiler & interpreter 0N/A // And, with an unsigned comparison, it does not matter if the numbers 0N/A // are negative or not. 0N/A // E.g., -2 cmp -1: the low parts are 0xfffffffe and 0xffffffff. 0N/A // The second one is bigger (unsignedly). 0N/A // Other notes: The first move in each triplet can be unconditional 0N/A // (and therefore probably prefetchable). 0N/A // And the equals case for the high part does not need testing, 0N/A // since that triplet is reached only after finding the high halves differ. 0N/A "register alias checks");
0N/A // This code can be optimized to use the 64 bit shifts in V9. 0N/A // Here we use the 32 bit shifts. 0N/A // shift < 32 bits, Ralt_count = Rcount-31 0N/A // We get the transfer bits by shifting right by 32-count the low 0N/A // register. This is done by shifting right by 31-count and then by one 0N/A // more to take care of the special (rare) case where count is zero 0N/A // (shifting by 32 would not work). 0N/A // The order of the next two instructions is critical in the case where 0N/A // Rin and Rout are the same and should not be reversed. 0N/A // shift >= 32 bits, Ralt_count = Rcount-32 0N/A "register alias checks");
0N/A // This code can be optimized to use the 64 bit shifts in V9. 0N/A // Here we use the 32 bit shifts. 0N/A // shift < 32 bits, Ralt_count = Rcount-31 0N/A // We get the transfer bits by shifting left by 32-count the high 0N/A // register. This is done by shifting left by 31-count and then by one 0N/A // more to take care of the special (rare) case where count is zero 0N/A // (shifting by 32 would not work). 0N/A // The order of the next two instructions is critical in the case where 0N/A // Rin and Rout are the same and should not be reversed. 0N/A // shift >= 32 bits, Ralt_count = Rcount-32 0N/A "register alias checks");
0N/A // This code can be optimized to use the 64 bit shifts in V9. 0N/A // Here we use the 32 bit shifts. 0N/A // shift < 32 bits, Ralt_count = Rcount-31 0N/A // We get the transfer bits by shifting left by 32-count the high 0N/A // register. This is done by shifting left by 31-count and then by one 0N/A // more to take care of the special (rare) case where count is zero 0N/A // (shifting by 32 would not work). 0N/A // The order of the next two instructions is critical in the case where 0N/A // Rin and Rout are the same and should not be reversed. 0N/A // shift >= 32 bits, Ralt_count = Rcount-32 0N/A //fb(lt, true, pn, done); delayed()->set( -1, Rresult ); 0N/A // number() does a sanity check on the alignment. 0N/A // number() does a sanity check on the alignment. 0N/A // number() does a sanity check on the alignment. 0N/A // number() does a sanity check on the alignment. 0N/A // number() does a sanity check on the alignment. 0N/A // number() does a sanity check on the alignment. 0N/A// Use for 64 bit operation. 0N/A // store ptr_reg as the new top value 0N/A// [RGV] This routine does not handle 64 bit operations. 0N/A// use casx_under_lock() or casx directly!!! 0N/A // store ptr_reg as the new top value 0N/A // If the register is not an out nor global, it is not visible 0N/A // after the save. Allocate a register for it, save its 0N/A // value in the register save area (the save may not flush 0N/A // registers to the save area). 0N/A // Initialize yield counter 0N/A // Save the regs and make space for a C call 0N/A // reset the counter 0N/A // did we get the lock? 0N/A // yes, got lock. do we have the same top? 622N/A // load indirectly to solve generation ordering problem 623N/A// Look up the method for a megamorphic invokeinterface call. 623N/A// The target method is determined by <intf_klass, itable_index>. 623N/A// The receiver klass is in recv_klass. 623N/A// On success, the result will be in method_result, and execution falls through. 
623N/A// On failure, execution transfers to the given label. 623N/A "caller must use same register for non-constant itable index as for method");
623N/A // Compute start of first itableOffsetEntry (which is at the end of the vtable) 623N/A // %%% We should store the aligned, prescaled offset in the klassoop. 623N/A // Then the next several instructions would fold away. 623N/A // hoist first instruction of round_to(scan_temp, BytesPerLong): 623N/A // Round up to align_object_offset boundary 623N/A // see code for instanceKlass::start_of_itable! 623N/A // Was: round_to(scan_temp, BytesPerLong); 623N/A // Hoisted: add(scan_temp, BytesPerLong-1, scan_temp); 623N/A // Adjust recv_klass by scaled itable_index, so we can free itable_index. 623N/A // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 623N/A // if (scan->interface() == intf) { 623N/A // result = (klass + scan->offset() + itable_index); 623N/A // %%%% Could load both offset and interface in one ldx, if they were 623N/A // in the opposite order. This would save a load. 623N/A // Check that this entry is non-null. A null entry means that 623N/A // the receiver class doesn't implement the interface, and wasn't the 623N/A // same as when the caller was compiled. 623N/A // (invert the test to fall through to found_method...) 623N/A // scan_temp[-scan_step] points to the vtable offset we need 644N/A "at most one NULL in the batch, usually");
644N/A // Support for the instanceof hack, which uses delay slots to 644N/A // set a destination register to zero or one. 644N/A // Hacked ba(), which may only be used just before L_fallthrough. 644N/A // If the pointers are equal, we are done (e.g., String[] elements). 644N/A // This self-check enables sharing of secondary supertype arrays among 644N/A // non-primary types such as array-of-interface. Otherwise, each such 644N/A // type would need its own customized SSA. 644N/A // We move this check to the front of the fast path because many 644N/A // type checks are in fact trivially successful in this manner, 644N/A // so we get a nicely predicted branch right at the start of the check. 644N/A // Check the supertype display: 644N/A // The super check offset is always positive... 644N/A // This check has worked decisively for primary supers. 644N/A // Secondary supers are sought in the super_cache ('super_cache_addr'). 644N/A // (Secondary supers are interfaces and very deeply nested subtypes.) 644N/A // This works in the same check above because of a tricky aliasing 644N/A // between the super_cache and the primary super display elements. 644N/A // (The 'super_check_addr' can address either, as the case requires.) 644N/A // Note that the cache is updated below if it does not help us find 644N/A // what we need immediately. 644N/A // So if it was a primary super, we can just fail immediately. 644N/A // Otherwise, it's the slow path for us (no success at this point). 644N/A // if !do_bool_sets, sneak the next cmp into the delay slot: 644N/A // Need a slow path; fast failure is impossible. 644N/A // No slow path; it's a fast decision. 644N/A // a couple of useful fields in sub_klass: 644N/A // Do a linear scan of the secondary super-klass chain. 644N/A // This code is rarely used, so simplicity is a virtue here. 644N/A // We will consult the secondary-super array. 644N/A // Compress superclass if necessary. 
644N/A // The superclass is never null; it would be a basic system error if a null 644N/A // pointer were to sneak in here. Note that we have already loaded the 644N/A // Klass::super_check_offset from the super_klass in the fast path, 644N/A // so if there is a null in that register, we are already in the afterlife. 644N/A // Load the array length. (Positive movl does right thing on LP64.) 644N/A // Check for empty secondary super list 644N/A // Skip the array header in all array accesses. 644N/A // Load next super to check 644N/A // Don't use load_heap_oop; we don't want to decode the element. 644N/A // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list 644N/A // A miss means we are NOT a subtype and need to keep looping 644N/A // Falling out the bottom means we found a hit; we ARE a subtype 644N/A // Success. Cache the super we found and proceed in triumph. 0N/A // See whether the lock is currently biased toward our thread and 0N/A // whether the epoch is still valid 0N/A // Note that the runtime guarantees sufficient alignment of JavaThread 0N/A // pointers to allow age to be placed into low bits 0N/A // Reload mark_reg as we may need it later 0N/A // At this point we know that the header has the bias pattern and 0N/A // that we are not the bias owner in the current epoch. We need to 0N/A // figure out more details about the state of the header in order to 0N/A // know what operations can be legally performed on the object's 0N/A // If the low three bits in the xor result aren't clear, that means 0N/A // the prototype header is no longer biased and we have to revoke 0N/A // the bias on this object. 0N/A // Biasing is still enabled for this data type. See whether the 0N/A // epoch of the current bias is still valid, meaning that the epoch 0N/A // bits of the mark word are equal to the epoch bits of the 0N/A // prototype header. (Note that the prototype header's epoch bits 0N/A // only change at a safepoint.) 
If not, attempt to rebias the object 0N/A // toward the current thread. Note that we must be absolutely sure 0N/A // that the current epoch is invalid in order to do this because 0N/A // otherwise the manipulations it performs on the mark word are 0N/A // The epoch of the current bias is still valid but we know nothing 0N/A // about the owner; it might be set or it might be clear. Try to 0N/A // acquire the bias of the object using an atomic operation. If this 0N/A // fails we will go in to the runtime to revoke the object's bias. 0N/A // Note that we first construct the presumed unbiased header so we 0N/A // don't accidentally blow away another thread's valid bias. 0N/A // If the biasing toward our thread failed, this means that 0N/A // another thread succeeded in biasing it toward itself and we 0N/A // need to revoke that bias. The revocation will occur in the 0N/A // interpreter runtime in the slow case. 0N/A // At this point we know the epoch has expired, meaning that the 0N/A // current "bias owner", if any, is actually invalid. Under these 0N/A // circumstances _only_, we are allowed to use the current header's 0N/A // value as the comparison value when doing the cas to acquire the 0N/A // bias in the current epoch. In other words, we allow transfer of 0N/A // the bias from one thread to another directly in this situation. 0N/A // FIXME: due to a lack of registers we currently blow away the age 0N/A // bits in this situation. Should attempt to preserve them. 0N/A // If the biasing toward our thread failed, this means that 0N/A // another thread succeeded in biasing it toward itself and we 0N/A // need to revoke that bias. The revocation will occur in the 0N/A // interpreter runtime in the slow case. 0N/A // The prototype mark in the klass doesn't have the bias bit set any 0N/A // more, indicating that objects of this data type are not supposed 0N/A // to be biased any more. 
We are going to try to reset the mark of 0N/A // this object to the prototype value and fall through to the 0N/A // CAS-based locking scheme. Note that if our CAS fails, it means 0N/A // that another thread raced us for the privilege of revoking the 0N/A // bias of this particular object, so it's okay to continue in the 0N/A // normal locking code. 0N/A // FIXME: due to a lack of registers we currently blow away the age 0N/A // bits in this situation. Should attempt to preserve them. 0N/A // Fall through to the normal CAS-based lock, because no matter what 0N/A // the result of the above CAS, some thread must have succeeded in 0N/A // removing the bias bit from the object's header. 0N/A // Check for biased locking unlock case, which is a no-op 0N/A // Note: we do not have to check the thread ID for two reasons. 0N/A // First, the interpreter checks for IllegalMonitorStateException at 0N/A // a higher level. Second, if the bias was revoked while we held the 0N/A // lock, the object could not be rebiased toward another thread, so 0N/A // the bias bit would be clear. 0N/A// CASN -- 32-64 bit switch hitter similar to the synthetic CASN provided by 0N/A// compiler_lock_object() and compiler_unlock_object() are direct transliterations 0N/A// of i486.ad fast_lock() and fast_unlock(). See those methods for detailed comments. 0N/A// The code could be tightened up considerably. 0N/A// box->dhw disposition - post-conditions at DONE_LABEL. 0N/A// - Successful inflated lock: box->dhw != 0. 0N/A// Any non-zero value suffices. 0N/A// Consider G2_thread, rsp, boxReg, or unused_mark() 0N/A// - Successful Stack-lock: box->dhw == mark. 0N/A// box->dhw must contain the displaced mark word value 0N/A// - Failure -- icc.ZFlag == 0 and box->dhw is undefined. 0N/A// The slow-path fast_enter() and slow_enter() operators 0N/A// are responsible for setting box->dhw = NonZero (typically ::unused_mark). 
0N/A// - Biased: box->dhw is undefined 0N/A// SPARC refworkload performance - specifically jetstream and scimark - is 0N/A// extremely sensitive to the size of the code emitted by compiler_lock_object 0N/A// and compiler_unlock_object. Critically, the key factor is code size, not path 0N/A// length. (Simple experiments to pad CLO with unexecuted NOPs demonstrate the 0N/A // Fetch object's markword 0N/A // Save Rbox in Rscratch to be used for the cas operation 0N/A // set Rmark to markOop | markOopDesc::unlocked_value 0N/A // Initialize the box. (Must happen before we update the object mark!) 0N/A // compare object markOop with Rmark and if equal exchange Rscratch with object markOop 0N/A // hence we are done 0N/A // we did not find an unlocked object so see if this is a recursive case 0N/A // sub(Rscratch, SP, Rscratch); 0N/A // Triage: biased, stack-locked, neutral, inflated 0N/A // Invariant: if control reaches this point in the emitted stream 0N/A // then Rmark has not been modified. 0N/A // Store mark into displaced mark field in the on-stack basic-lock "box" 0N/A // Critically, this must happen before the CAS 0N/A // Maximize the ST-CAS distance to minimize the ST-before-CAS penalty. 0N/A // Try stack-lock acquisition. 0N/A // Beware: the 1st instruction is in a delay slot 0N/A // Stack-lock attempt failed - check for recursive stack-lock. 0N/A // See the comments below about how we might remove this case. 0N/A // If m->owner != null goto IsLocked 0N/A // Pessimistic form: Test-and-CAS vs CAS 0N/A // The optimistic form avoids RTS->RTO cache line upgrades. 0N/A // m->owner == null : it's unlocked. 0N/A // Try to CAS m->owner from null to Self 0N/A // Invariant: if we acquire the lock then _recursions should be 0. 
0N/A // Intentional fall-through into done 0N/A // Aggressively avoid the Store-before-CAS penalty 0N/A // Defer the store into box->dhw until after the CAS 0N/A// Anticipate CAS -- Avoid RTS->RTO upgrade 0N/A// prefetch (mark_addr, Assembler::severalWritesAndPossiblyReads) ; 0N/A // Triage: biased, stack-locked, neutral, inflated 0N/A // Invariant: if control reaches this point in the emitted stream 0N/A // then Rmark has not been modified. 0N/A // Try stack-lock acquisition. 0N/A // Transiently install BUSY (0) encoding in the mark word. 0N/A // if the CAS of 0 into the mark was successful then we execute: 0N/A // ST box->dhw = mark -- save fetched mark in on-stack basiclock box 0N/A // ST obj->mark = box -- overwrite transient 0 value 0N/A // This presumes TSO, of course. 0N/A// prefetch (mark_addr, Assembler::severalWritesAndPossiblyReads) ; 0N/A // Stack-lock attempt failed - check for recursive stack-lock. 0N/A // Tests show that we can remove the recursive case with no impact 0N/A // on refworkload 0.83. If we need to reduce the size of the code 0N/A // emitted by compiler_lock_object() the recursive case is perfect 0N/A // A more extreme idea is to always inflate on stack-lock recursion. 0N/A // This lets us eliminate the recursive checks in compiler_lock_object 0N/A // and compiler_unlock_object and the (box->dhw == 0) encoding. 0N/A // and showed a performance *increase*. In the same experiment I eliminated 0N/A // the fast-path stack-lock code from the interpreter and always passed 0N/A // RScratch contains the fetched obj->mark value from the failed CASN. 0N/A // Accounting needs the Rscratch register 0N/A // If m->owner != null goto IsLocked 0N/A // Test-and-CAS vs CAS 0N/A // Pessimistic form avoids futile (doomed) CAS attempts 0N/A // The optimistic form avoids RTS->RTO cache line upgrades. 0N/A // m->owner == null : it's unlocked. 0N/A // Try to CAS m->owner from null to Self 0N/A // Invariant: if we acquire the lock then _recursions should be 0. 
0N/A // ST box->displaced_header = NonZero. 0N/A // Any non-zero value suffices: 0N/A // unused_mark(), G2_thread, RBox, RScratch, rsp, etc. 0N/A // Intentional fall-through into done 0N/A // Test first if it is a fast recursive unlock 0N/A // Check if it is still a light weight lock, this is true if we see 0N/A // the stack address of the basicLock in the markOop of the object 0N/A // Beware ... If the aggregate size of the code emitted by CLO and CUO 0N/A // is too large, performance rolls abruptly off a cliff. 0N/A // This could be related to inlining policies, code cache management, or 0N/A // TODO: eliminate redundant LDs of obj->mark 0N/A delayed()->
nop() ;
// consider: relocate fetch of mark, above, into this DS 0N/A // Conceptually we need a #loadstore|#storestore "release" MEMBAR before 0N/A // the ST of 0 into _owner which releases the lock. This prevents loads 0N/A // and stores within the critical section from reordering (floating) 0N/A // past the store that releases the lock. But TSO is a strong memory model 0N/A // and that particular flavor of barrier is a noop, so we can safely elide it. 0N/A // Note that we use 1-0 locking by default for the inflated case. We 0N/A // close the resultant (and rare) race by having contended threads in 0N/A // monitorenter periodically poll _owner. 0N/A // invert icc.zf and goto done 0N/A // Consider: we could replace the expensive CAS in the exit 0N/A // path with a simple ST of the displaced mark value fetched from 0N/A // the on-stack basiclock box. That admits a race where a thread T2 0N/A // in the slow lock path -- inflating with monitor M -- could race a 0N/A // thread T1 in the fast unlock path, resulting in a missed wakeup for T2. 0N/A // More precisely T1 in the stack-lock unlock path could "stomp" the 0N/A // inflated mark value M installed by T2, resulting in an orphan 0N/A // object monitor M and T2 becoming stranded. We can remedy that situation 0N/A // by having T2 periodically poll the object's mark word using timed wait 0N/A // operations. If T2 discovers that a stomp has occurred it vacates 0N/A // the monitor M and wakes any other threads stranded on the now-orphan M. 0N/A // In addition the monitor scavenger, which performs deflation, 0N/A // would also need to check for orphan monitors and stranded threads. 0N/A // Finally, inflation is also used when T2 needs to assign a hashCode 0N/A // to O and O is stack-locked by T1. The "stomp" race could cause 0N/A // an assigned hashCode value to be lost. 
We can avoid that condition 0N/A // and provide the necessary hashCode stability invariants by ensuring 0N/A // that hashCode generation is idempotent between copying GCs. 0N/A // For example we could compute the hashCode of an object O as 0N/A // O's heap address XOR some high quality RNG value that is refreshed 0N/A // at GC-time. The monitor scavenger would install the hashCode 0N/A // found in any orphan monitors. Again, the mechanism admits a 0N/A // lost-update "stomp" WAW race but detects and recovers as needed. 0N/A // A prototype implementation showed excellent results, although 0N/A // the scavenger and timeout code was rather involved. 0N/A // Intentional fall through into done ... 0N/A // %%%%% need to implement this 0N/A // %%%%% need to implement this 0N/A // %%%%% need to implement this 0N/A // %%%%% need to implement this 0N/A // %%%%% need to implement this 0N/A // %%%%% need to implement this 0N/A // %%%%% need to implement this 0N/A // %%%%% need to implement this 0N/A Register obj,
// result: pointer to object after successful allocation 0N/A // make sure arguments make sense 342N/A // No allocation in the shared eden. 342N/A // note: we need both top & top_addr! 342N/A // make sure eden top is properly aligned 342N/A // size is unknown at compile time 342N/A // size is known at compile time 342N/A // Compare obj with the value at top_addr; if still equal, swap the value of 342N/A // end with the value at top_addr. If not equal, read the value at top_addr 342N/A // if someone beat us on the allocation, try again, otherwise continue 342N/A // make sure eden top is properly aligned 0N/A Register obj,
// result: pointer to object after successful allocation 0N/A // make sure arguments make sense 0N/A // calculate amount of free space 0N/A // calculate the new top pointer 0N/A // make sure new free pointer is properly aligned 0N/A stop(
"updated TLAB free is not properly aligned");
0N/A // update the tlab top pointer 0N/A // No allocation in the shared eden. 0N/A // calculate amount of free space 0N/A // Retain tlab and allocate object in shared space if 0N/A // the amount free in the tlab is too large to discard. 0N/A // increment waste limit to prevent getting stuck on this slow path 0N/A // increment number of slow_allocations 0N/A // increment number of refills 0N/A // accumulate wastage 0N/A // if tlab is currently allocated (top or end != null) then 0N/A // fill [top, end + alignment_reserve) with array object 0N/A // set klass to intArrayKlass 167N/A // store klass last. concurrent gcs assumes klass length is valid if 167N/A // klass field is not null. 0N/A // refill the tlab with an eden allocation 0N/A // add object_size ?? 0N/A // check that tlab_size (t1) is still valid 0N/A // Note some conditions are synonyms for others 0N/A// Writes to stack successive pages until offset reached to check for 0N/A// stack overflow + shadow pages. This clobbers tsp and scratch. 0N/A // Use stack pointer in temp stack pointer 0N/A // Bang stack for total size given plus stack shadow page size. 0N/A // Bang one page at a time because a large size can overflow yellow and 0N/A // red zones (the bang will fail but stack overflow handling can't tell that 0N/A // it was a stack overflow bang vs a regular segv). 0N/A // Bang down shadow pages too. 0N/A // The -1 because we already subtracted 1 page. 342N/A/////////////////////////////////////////////////////////////////////////////////// 342N/A// The calls to this don't work. We'd need to do a fair amount of work to 342N/A "check sizes in assembly below");
342N/A // If the branch is taken, no harm in executing this in the delay slot. 342N/A // Use return-from-leaf 342N/A // This should be rare enough that we can afford to save all the 342N/A // scratch registers that the calling context might be using. 342N/A // We need the value of O0 above (for the write into the buffer), so we 342N/A // save and restore it. 342N/A // Since the call will overwrite O7, we save and restore that, as well. 342N/A // satb_log_barrier(tmp, obj, offset, preserve_o_regs); 342N/A // satb_log_barrier_work0(tmp, filtered); 342N/A // Check on whether to annul. 342N/A // satb_log_barrier_work1(tmp, offset); 342N/A // satb_log_barrier_work2(obj, tmp, offset); 342N/A // satb_log_barrier_work3(tmp, filtered, preserve_o_regs); 342N/A // Save G-regs that target may use. 342N/A // Restore G-regs that target may have used. 342N/A // Check on whether to annul. 342N/A // OK, it's not filtered, so we'll need to call enqueue. In the normal 342N/A // case, pre_val will be a scratch G-reg, but there's some cases in which 342N/A // it's an O-reg. In the first case, do a normal call. In the latter, 342N/A // do a save here and call the frameless version. 342N/A "Or we need to think harder.");
342N/A " (%5.2f%% intra-HR, %5.2f%% null, %5.2f%% popular).",
342N/A// This gets to assume that o0 contains the object address. 342N/A // Get O1 + O2 into a reg by itself -- useful in the take-the-branch 342N/A // case, harmless if not. 342N/A // We didn't take the branch, so we're already dirty: return. 342N/A // Use return-from-leaf 342N/A // If the branch is taken, no harm in executing this in the delay slot. 342N/A // Use return-from-leaf 342N/A // This should be rare enough that we can afford to save all the 342N/A // scratch registers that the calling context might be using. 342N/A // We need the value of O3 above (for the write into the buffer), so we 342N/A // save and restore it. 342N/A // Since the call will overwrite O7, we save and restore that, as well. 342N/A // XXX Should have a guarantee here about not going off the end! 342N/A // Does it already do so? Do an experiment... 342N/A // This is a sleazy hack: I'm temporarily hijacking G2, which I 342N/A // Save G-regs that target may use. 342N/A // Restore G-regs that target may have used. 342N/A // XXX Should I predict this taken or not? Does it matter? 342N/A // Now we decide how to generate the card table write. If we're 342N/A // enqueueing, we call out to a generated function. Otherwise, we do it 342N/A // If the "store_addr" register is an "in" or "local" register, move it to 342N/A // a scratch reg so we can pass it as an argument. 342N/A // Pick a scratch register different from "tmp". 342N/A // Make sure we use up the delay slot! 342N/A/////////////////////////////////////////////////////////////////////////////////// 342N/A // If we're writing constant NULL, we can skip the write barrier. 622N/A// Loading values by size and signed-ness 622N/A case ~
8:
// fall through: 113N/A // The number of bytes in this code is used by 113N/A // MachCallDynamicJavaNode::ret_addr_offset() 113N/A // if this changes, change that. 113N/A // optimize for frequent case src == dst 113N/A // could be moved before branch, and annulate delay, 113N/A // but may add some unneeded work decoding null 178N/A // Also do not verify_oop as this is called by verify_oop. 178N/A // Also do not verify_oop as this is called by verify_oop. 113N/A // call indirectly to solve generation ordering problem