assembler_x86.cpp revision 304
196N/A * Copyright 1997-2008 Sun Microsystems, Inc. All Rights Reserved. 0N/A * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 0N/A * This code is free software; you can redistribute it and/or modify it 0N/A * under the terms of the GNU General Public License version 2 only, as 0N/A * published by the Free Software Foundation. 0N/A * This code is distributed in the hope that it will be useful, but WITHOUT 0N/A * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 0N/A * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 0N/A * version 2 for more details (a copy is included in the LICENSE file that 0N/A * accompanied this code). 0N/A * You should have received a copy of the GNU General Public License version 0N/A * 2 along with this work; if not, write to the Free Software Foundation, 0N/A * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 0N/A * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, 0N/A * CA 95054 USA or visit www.sun.com if you need additional information or 0N/A * have any questions. 0N/A#
include "incls/_precompiled.incl" 0N/A// Implementation of AddressLiteral 0N/A // Oops are a special case. Normally they would be their own section 0N/A // but in cases like icBuffer they are literals in the code stream that 0N/A // we don't have a section for. We use none so that we get a literal address 0N/A // which is always patchable. 0N/A// Implementation of Address 0N/A // Not implementable on 64bit machines 0N/A // Should have been handled higher up the call chain. 304N/A// exceedingly dangerous constructor 0N/A// exceedingly dangerous constructor 0N/A// Convert the raw encoding form into the form expected by the constructor for 0N/A// Address. An index of 4 (rsp) corresponds to having no index, so convert 0N/A// that to noreg for the Address constructor. 0N/A// Implementation of Assembler 0N/A// make this go away someday 0N/A // Do not use AbstractAssembler::relocate, which is not intended for 0N/A // embedded words. Instead, relocate to the enclosing instruction. 0N/A // hack. call32 is too wide for mask so use disp32 0N/A assert((
op1 &
0x02) == 0,
"sign-extension bit should not be set");
0N/A// immediate-to-memory forms 0N/A assert((
op1 &
0x02) == 0,
"sign-extension bit should not be set");
0N/A assert((
op1 &
0x02) == 0,
"sign-extension bit should not be set");
304N/A // Encode the registers as needed in the fields they are used in 0N/A // [base + index*scale + disp] 0N/A // [base + index*scale] 0N/A // [00 reg 100][ss index base] 0N/A // [base + index*scale + imm8] 0N/A // [01 reg 100][ss index base] imm8 304N/A // [base + index*scale + disp32] 304N/A // [10 reg 100][ss index base] disp32 0N/A // [00 reg 100][00 100 100] 304N/A // [01 reg 100][00 100 100] disp8 304N/A // [10 reg 100][00 100 100] disp32 304N/A // [10 reg base] disp32 0N/A // [index*scale + disp] 304N/A // [00 reg 100][ss index 101] disp32 304N/A // [disp] (64bit) RIP-RELATIVE (32bit) abs 304N/A // Note that the RIP-rel. correction applies to the generated 304N/A // disp field, but _not_ to the target address in the rspec. 304N/A // disp was created by converting the target address minus the pc 304N/A // at the start of the instruction. That needs more correction here. 304N/A // intptr_t disp = target - next_ip; 304N/A // Do rip-rel adjustment for 64bit 304N/A "must be 32bit offset (RIP relative address)");
304N/A // [00 reg 100][00 100 101] disp32 0N/A// Secret local extension to Assembler::WhichOperand: 0N/A // Decode the given instruction, and return the address of 0N/A // an embedded 32-bit operand word. 0N/A // If "which" is disp32_operand, selects the displacement portion 0N/A // of an effective address specifier. 304N/A // If "which" is imm64_operand, selects the trailing immediate constant. 0N/A // If "which" is call32_operand, selects the displacement of a call or jump. 0N/A // Caller is responsible for ensuring that there is such an operand, 304N/A // and that it is 32/64 bits wide. 0N/A // If "which" is end_pc_operand, find the end of the instruction. 304N/A int tail_size = 0;
// other random bytes (#32, #16, etc.) at end of insn 0N/A // These convenience macros generate groups of "case" labels for the switch. 304N/A#
define REP4(x) (x)+0:
case (x)+
1:
case (x)+
2:
case (x)+
3 304N/A#
define REP8(x) (x)+0:
case (x)+
1:
case (x)+
2:
case (x)+
3: \
0N/A case (x)+
4:
case (x)+
5:
case (x)+
6:
case (x)+
7 304N/A case 0xFF:
// pushq a; decl a; incl a; call a; jmp a 0N/A case 0x88:
// movb a, r 0N/A case 0x89:
// movl a, r 0N/A case 0x8A:
// movb r, a 0N/A case 0x8B:
// movl r, a 0N/A case 0x8F:
// popl a 304N/A case 0x68:
// pushq #32 0N/A return ip;
// not produced by emit_operand 0N/A case 0x66:
// movw ... (size prefix) 0N/A case 0x8B:
// movw r, a 0N/A case 0x89:
// movw a, r 0N/A case 0xC7:
// movw a, #16 0N/A ip--;
// reparse the 0x0F 304N/A case REP8(
0xB8):
// movl/q r, #32/#64(oop?) 304N/A // these asserts are somewhat nonsensical 0N/A case 0x69:
// imul r, a, #32 0N/A case 0xC7:
// movl a, #32(oop?) 0N/A case 0x0F:
// movx..., etc. 0N/A case 0x12:
// movlps 0N/A case 0x28:
// movaps 0N/A case 0x2E:
// ucomiss 0N/A case 0x2F:
// comiss 0N/A case 0x55:
// andnps 0N/A case 0xAE:
// ldmxcsr a 304N/A // 64bit side says these have both operands but that doesn't 0N/A case 0xAD:
// shrd r, a, %cl 0N/A case 0xAF:
// imul r, a 304N/A case 0xBE:
// movsbl r, a (movsxb) 304N/A case 0xBF:
// movswl r, a (movsxw) 304N/A case 0xB6:
// movzbl r, a (movzxb) 304N/A case 0xB7:
// movzwl r, a (movzxw) 0N/A case 0xB0:
// cmpxchgb 0N/A case 0xB1:
// cmpxchg 0N/A case 0xC7:
// cmpxchg8 0N/A // fall out of the switch to decode the address 0N/A case 0xAC:
// shrd r, a, #8 0N/A case 0x81:
// addl a, #32; addl r, #32 0N/A // also: orl, adcl, sbbl, andl, subl, xorl, cmpl 304N/A // on 32bit in the case of cmpl, the imm might be an oop 0N/A case 0x83:
// addl a, #8; addl r, #8 0N/A // also: orl, adcl, sbbl, andl, subl, xorl, cmpl 0N/A case 0xD9:
// fnstcw a 0N/A case REP4(
0x00):
// addb a, r; addl a, r; addb r, a; addl r, a 304N/A case 0x87:
// xchg r, a 304N/A case 0x85:
// test r, a 0N/A case 0xC1:
// sal a, #8; sar a, #8; shl a, #8; shr a, #8 0N/A case 0xC6:
// movb a, #8 0N/A case 0x80:
// cmpb a, #8 0N/A case 0x6B:
// imul r, a, #8 0N/A case 0xE8:
// call rdisp32 0N/A case 0xE9:
// jmp rdisp32 0N/A case 0xD1:
// sal a, 1; sar a, 1; shl a, 1; shr a, 1 0N/A case 0xD3:
// sal a, %cl; sar a, %cl; shl a, %cl; shr a, %cl 0N/A case 0xD9:
// fld_s a; fst_s a; fstp_s a; fldcw a 0N/A case 0xDD:
// fld_d a; fst_d a; fstp_d a 0N/A case 0xDB:
// fild_s a; fistp_s a; fld_x a; fstp_x a 0N/A case 0xDF:
// fild_d a; fistp_d a 0N/A case 0xD8:
// fadd_s a; fsubr_s a; fmul_s a; fdivr_s a; fcomp_s a 0N/A case 0xDC:
// fadd_d a; fsubr_d a; fmul_d a; fdivr_d a; fcomp_d a 0N/A case 0xDE:
// faddp_d a; fsubrp_d a; fmulp_d a; fdivrp_d a; fcompp_d a 0N/A case 0xF3:
// For SSE 0N/A case 0xF2:
// For SSE2 304N/A // assert(which != imm_operand || has_imm32, "instruction has no imm32 field"); 0N/A // parse the output of emit_operand 0N/A // now ip points at the disp (if any) 0N/A // [00 reg 100][ss index base] 304N/A // [00 reg 100][00 100 esp] 0N/A // [00 reg 100][ss index 101][disp32] 0N/A // [00 reg 101] [disp32] 0N/A return ip;
// caller wants the disp32 0N/A ip +=
4;
// skip the disp32 0N/A // [01 reg 100][ss index base][disp8] 304N/A // [01 reg 100][00 100 esp][disp8] 0N/A // [01 reg base] [disp8] 0N/A ip +=
1;
// skip the disp8 0N/A // [10 reg 100][ss index base][disp32] 304N/A // [10 reg 100][00 100 esp][disp32] 0N/A // [10 reg base] [disp32] 0N/A return ip;
// caller wants the disp32 0N/A ip +=
4;
// skip the disp32 0N/A // [11 reg base] (not a memory addressing mode) 0N/A // Secretly share code with locate_operand: 0N/A // assert(format == imm32_operand, "cannot specify a nonzero format"); 304N/A// work around gcc (3.2.1-7a) bug 0N/A assert(0 <= i && i <
8,
"illegal stack offset");
304N/A// Now the Assembler instruction (identical for 32/64 bits) 0N/A // 4 bytes: NOP DWORD PTR [EAX+0] 0N/A // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset 0N/A // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset 0N/A // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset 304N/A // suspect disp32 is always good 304N/A // 1110 1000 #32-bit disp 304N/A // 1110 1000 #32-bit disp 304N/A // This was originally using a 32bit register encoding 304N/A // and surely we want 64bit! 304N/A // this is a 32bit encoding but in 64bit mode the default 304N/A // operand size is 64bit so there is no need for the 304N/A // wide prefix. So prefix only happens if we use the 304N/A // this may be true but dbx disassembles it as if it 304N/A // int encode = prefix_and_encode(dst->encoding()); 304N/A // if (offset() != x) assert(dst->encoding() >= 8, "what?"); 304N/A // Technically, should use call32_operand, but this format is 304N/A // implied by the fact that we're emitting a call instruction. 304N/A// The 32-bit cmpxchg compares the value at adr with the contents of rax, 304N/A// and stores reg into adr if so; otherwise, the value at adr is loaded into rax. 304N/A// The ZF is set if the compared values were equal, and cleared otherwise. 304N/A // caveat: no instructionmark, so this isn't relocatable. 304N/A // Emit a synthetic, non-atomic, CAS equivalent. 304N/A // Beware. The synthetic form sets all ICCs, not just ZF. 304N/A // cmpxchg r,[m] is equivalent to rax = CAS (m, rax, r) 304N/A // NOTE: dbx seems to decode this as comiss even though the 304N/A // 0x66 is there. Strangely ucomisd comes out correct 304N/A // Don't use it directly. Use MacroAssembler::decrement() instead. 304N/A // Don't use it directly. Use MacroAssembler::increment() instead. 304N/A // 0111 tttn #8-bit disp 304N/A // 0000 1111 1000 tttn #32-bit disp 304N/A "must be 32bit offset (call4)");
304N/A // Note: could eliminate cond. jumps to this jump if condition 304N/A // is the same however, seems to be rather unlikely case. 304N/A // Note: use jccb() if label to be bound is very close to get 304N/A // an 8-bit displacement 304N/A "Dispacement too large for a short jmp");
304N/A // 0111 tttn #8-bit disp 304N/A // By default, forward jumps are always 32-bit displacements, since 304N/A // we can't yet know where the label will be bound. If you're sure that 304N/A // the forward jump will not run beyond 256 bytes, use jmpb to 304N/A // force an 8-bit displacement. 304N/A "Dispacement too large for a short jmp");
304N/A // Emit either nothing, a NOP, or a NOP: prefix 304N/A // Memory barriers are only needed on multiprocessors 304N/A // All usable chips support "locked" instructions which suffice 304N/A // as barriers, and are much faster than the alternative of 304N/A // using cpuid instruction. We use here a locked add [esp],0. 304N/A // This is conveniently otherwise a no-op except for blowing 304N/A // flags (which we save and restore.) 304N/A// Uses zero extension on 64bit 304N/A// New cpus require to use movsd and movss to avoid partial register stall 304N/A// when loading from memory. But for old Opteron use movlpd instead of movsd. 304N/A// The selection is done in MacroAssembler::movdbl() and movflt(). 304N/A // workaround gcc (3.2.1-7a) bug 304N/A // In that version of gcc with only an emit_operand(MMX, Address) 304N/A // gcc will tail jump and try and reverse the parameters completely 304N/A // obliterating dst in the process. By having a version available 304N/A // that doesn't need to swap the args at the tail jump the bug is 304N/A // The fancy nops aren't currently recognized by debuggers making it a 304N/A // pain to disassemble code while debugging. 
If asserts are on clearly 304N/A // speed is not an issue so simply use the single byte traditional nop 0N/A // Using multi-bytes nops "0x0F 0x1F [address]" for Intel 0N/A // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding) 0N/A // 4: 0x0F 0x1F 0x40 0x00 0N/A // 5: 0x0F 0x1F 0x44 0x00 0x00 0N/A // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00 0N/A // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0N/A // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0N/A // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0N/A // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0N/A // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0N/A // The rest coding is Intel specific - don't use consecutive address nops 0N/A // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90 0N/A // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90 0N/A // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90 0N/A // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90 0N/A // For Intel don't generate consecutive address nops (mix with regular nops) 0N/A // Don't use "0x0F 0x1F 0x00" - need patching safe padding 0N/A // Using multi-bytes nops "0x0F 0x1F [address]" for AMD. 
0N/A // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding) 0N/A // 4: 0x0F 0x1F 0x40 0x00 0N/A // 5: 0x0F 0x1F 0x44 0x00 0x00 0N/A // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00 0N/A // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0N/A // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0N/A // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0N/A // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0N/A // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0N/A // The rest coding is AMD specific - use consecutive address nops 0N/A // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00 0N/A // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00 0N/A // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0N/A // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0N/A // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0N/A // Size prefixes (0x66) are added for larger sizes 0N/A // Generate first nop for size between 21-12 0N/A // Generate second nop for size between 11-1 0N/A // Don't use "0x0F 0x1F 0x00" - need patching safe padding 0N/A // Using nops with size prefixes "0x66 0x90". 
0N/A // From AMD Optimization Guide: 0N/A // 3: 0x66 0x66 0x90 0N/A // 4: 0x66 0x66 0x66 0x90 0N/A // 5: 0x66 0x66 0x90 0x66 0x90 0N/A // 6: 0x66 0x66 0x90 0x66 0x66 0x90 0N/A // 7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0N/A // 8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90 0N/A // 9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90 0N/A // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90 304N/A // NOTE: this will adjust stack by 8byte on 64bits 304N/A // HMM Table D-1 says sse2 or mmx 304N/A // in 64bits we push 64bits onto the stack but only 304N/A // take a 32bit immediate 304N/A // Note this will push 64bit on 64bit 304N/A// copies data from [esi] to [edi] using rcx pointer sized words 304N/A// sets rcx pointer sized words with rax value at [edi] 304N/A// scans rcx pointer sized words at [edi] for occurrence of rax 304N/A// scans rcx 4 byte words at [edi] for occurrence of rax 304N/A // Not supported in 64bit mode 0N/A// copies a single word from [esi] to [edi] 304N/A // HMM Table D-1 says sse2 304N/A // NOT_LP64(assert(VM_Version::supports_sse(), "")); 304N/A // not using emit_arith because test 304N/A // doesn't support sign-extension of 304N/A// 32bit only pieces of the assembler 304N/A // NO PREFIX AS NEVER 64BIT 304N/A // NO PREFIX AS NEVER 64BIT (not even 32bit versions of 64bit regs 304N/A// The 64-bit (32bit platform) cmpxchg compares the value at adr with the contents of rdx:rax, 304N/A// and stores rcx:rbx into adr if so; otherwise, the value at adr is loaded 304N/A// into rdx:rax. The ZF is set if the compared values were equal, and cleared otherwise. 304N/A // Don't use it directly. Use MacroAssembler::decrementl() instead. 304N/A// 64bit typically doesn't use the x87 but needs to for the trig funcs 304N/A// Note: The Intel manual (Pentium Processor User's Manual, Vol.3, 1994) 304N/A// is erroneous for some of the floating-point instructions below. 304N/A emit_farith(
0xDE,
0xF8, i);
// ST(0) <- ST(0) / ST(1) and pop (Intel manual wrong) 304N/A emit_farith(
0xDE,
0xF0, i);
// ST(0) <- ST(1) / ST(0) and pop (Intel manual wrong) 304N/A emit_farith(
0xDE,
0xE8, i);
// ST(0) <- ST(0) - ST(1) and pop (Intel manual wrong) 304N/A emit_farith(
0xDE,
0xE0, i);
// ST(0) <- ST(1) - ST(0) and pop (Intel manual wrong) 0N/A // make sure the instruction is supported (introduced for P6, together with cmov) 0N/A // make sure the instruction is supported (introduced for P6, together with cmov) 304N/A // Don't use it directly. Use MacroAssembler::incrementl() instead. 304N/A// 64bit only pieces of the assembler 304N/A// This should only be used by 64bit instructions that can use rip-relative 304N/A// it cannot be used by instructions that want an immediate value. 304N/A // None will force a 64bit literal to the code stream. Likely a placeholder 304N/A // for something that will be patched later and we need to certain it will 304N/A // always be reachable. 304N/A // This should be rip relative and easily reachable. 304N/A // This should be rip relative within the code cache and easily 304N/A // reachable until we get huge code caches. (At which point 304N/A // ic code is going to have issues). 304N/A // Stress the correction code 304N/A // Must be runtimecall reloc, see if it is in the codecache 304N/A // Flipping stuff in the codecache to be unreachable causes issues 304N/A // with things like inline caches where the additional instructions 304N/A // are now (possibly a temp buffer) and where we might end up 304N/A // anywhere in the codeCache then we are always reachable. 304N/A // to be more pessimistic. 304N/A // Because rip relative is a disp + address_of_next_instruction and we 304N/A // don't know the value of address_of_next_instruction we apply a fudge factor 304N/A // to make sure we will be ok no matter the size of the instruction we get placed into. 304N/A // We don't have to fudge the checks above here because they are already worst case. 304N/A // 12 == override/rex byte, opcode byte, rm byte, sib byte, a 4-byte disp , 4-byte literal 304N/A // + 4 because better safe than sorry. 304N/A // Do not use AbstractAssembler::relocate, which is not intended for 304N/A // embedded words. 
Instead, relocate to the enclosing instruction. 304N/A // Don't use it directly. Use MacroAssembler::decrementl() instead. 304N/A // Use two-byte form (one-byte form is a REX prefix in 64-bit mode) 304N/A // Don't use it directly. Use MacroAssembler::decrementq() instead. 304N/A // Use two-byte form (one-byte form is a REX prefix in 64-bit mode) 304N/A // Don't use it directly. Use MacroAssembler::decrementq() instead. 304N/A // Don't use it directly. Use MacroAssembler::incrementl() instead. 304N/A // Use two-byte form (one-byte form is a REX prefix in 64-bit mode) 304N/A // Don't use it directly. Use MacroAssembler::incrementq() instead. 304N/A // Use two-byte form (one-byte form is a REX prefix in 64-bit mode) 304N/A // Don't use it directly. Use MacroAssembler::incrementq() instead. 304N/A // dbx shows movslq(rcx, 3) as movq $0x0000000049000000,(%rbx) 304N/A // and movslq(r8, 3); as movl $0x0000000048000000,(%rbx) 304N/A // as a result we shouldn't use until tested at runtime... 304N/A // we have to store original rsp. ABI says that 128 bytes 304N/A // below rsp are local scratch. 304N/A // not using emit_arith because test 304N/A // doesn't support sign-extension of 0N/A// Implementation of MacroAssembler 304N/A// First all the versions that have distinct versions depending on 32/64 bit 304N/A// Unless the difference is trivial (1 line or so). 304N/A // See whether the lock is currently biased toward our thread and 304N/A // whether the epoch is still valid 304N/A // Note that the runtime guarantees sufficient alignment of JavaThread 304N/A // pointers to allow age to be placed into low bits 304N/A // First check to see whether biasing is even enabled for this object 304N/A // The bias pattern is present in the object's header. Need to check 304N/A // whether the bias owner and the epoch are both still current. 
304N/A // Note that because there is no current thread register on x86 we 304N/A // need to store off the mark word we read out of the object to 304N/A // avoid reloading it and needing to recheck invariants below. This 304N/A // store is unfortunate but it makes the overall code shorter and 304N/A // At this point we know that the header has the bias pattern and 304N/A // that we are not the bias owner in the current epoch. We need to 304N/A // figure out more details about the state of the header in order to 304N/A // know what operations can be legally performed on the object's 304N/A // If the low three bits in the xor result aren't clear, that means 304N/A // the prototype header is no longer biased and we have to revoke 304N/A // the bias on this object. 304N/A // Biasing is still enabled for this data type. See whether the 304N/A // epoch of the current bias is still valid, meaning that the epoch 304N/A // bits of the mark word are equal to the epoch bits of the 304N/A // prototype header. (Note that the prototype header's epoch bits 304N/A // only change at a safepoint.) If not, attempt to rebias the object 304N/A // toward the current thread. Note that we must be absolutely sure 304N/A // that the current epoch is invalid in order to do this because 304N/A // otherwise the manipulations it performs on the mark word are 304N/A // The epoch of the current bias is still valid but we know nothing 304N/A // about the owner; it might be set or it might be clear. Try to 304N/A // acquire the bias of the object using an atomic operation. If this 304N/A // fails we will go in to the runtime to revoke the object's bias. 304N/A // Note that we first construct the presumed unbiased header so we 304N/A // don't accidentally blow away another thread's valid bias. 304N/A // If the biasing toward our thread failed, this means that 304N/A // another thread succeeded in biasing it toward itself and we 304N/A // need to revoke that bias. 
The revocation will occur in the 304N/A // interpreter runtime in the slow case. 304N/A // At this point we know the epoch has expired, meaning that the 304N/A // current "bias owner", if any, is actually invalid. Under these 304N/A // circumstances _only_, we are allowed to use the current header's 304N/A // value as the comparison value when doing the cas to acquire the 304N/A // bias in the current epoch. In other words, we allow transfer of 304N/A // the bias from one thread to another directly in this situation. 304N/A // FIXME: due to a lack of registers we currently blow away the age 304N/A // bits in this situation. Should attempt to preserve them. 304N/A // If the biasing toward our thread failed, then another thread 304N/A // succeeded in biasing it toward itself and we need to revoke that 304N/A // bias. The revocation will occur in the runtime in the slow case. 304N/A // The prototype mark in the klass doesn't have the bias bit set any 304N/A // more, indicating that objects of this data type are not supposed 304N/A // to be biased any more. We are going to try to reset the mark of 304N/A // this object to the prototype value and fall through to the 304N/A // CAS-based locking scheme. Note that if our CAS fails, it means 304N/A // that another thread raced us for the privilege of revoking the 304N/A // bias of this particular object, so it's okay to continue in the 304N/A // normal locking code. 304N/A // FIXME: due to a lack of registers we currently blow away the age 304N/A // bits in this situation. Should attempt to preserve them. 304N/A // Fall through to the normal CAS-based lock, because no matter what 304N/A // the result of the above CAS, some thread must have succeeded in 304N/A // removing the bias bit from the object's header. 304N/A // According to Intel Doc. AP-526, "Integer Divide", p.18. 
0N/A // A 5 byte nop that is safe for patching (see patch_verified_entry) 304N/A // set parity bit if FPU flag C2 is set (via rax) 304N/A // set parity bit if FPU flag C2 is set (via rax) 0N/A// 32bit can do a case table jump in one instruction but we no longer allow the base 0N/A// to be installed in the Address class 304N/A// Note: y_lo will be destroyed 304N/A // Long compare for Java (semantics as described in JVM spec.) 304N/A // x_hi is the return register 0N/A // leal(dst, as_Address(adr)); 304N/A // see note in movl as to why we must use a move 0N/A // Multiplication of two Java long values stored on the stack 0N/A // as illustrated below. Result is in rdx:rax. 0N/A // rsp ---> [ ?? ] \ \ 0N/A // .... | y_rsp_offset | 0N/A // [ y_lo ] / (in bytes) | x_rsp_offset 0N/A // [ y_hi ] | (in bytes) 0N/A // Basic idea: lo(result) = lo(x_lo * y_lo) 0N/A // hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi) 0N/A // load x_hi, y_hi and check if quick 0N/A // multiplication is possible 0N/A // do full multiplication 0N/A // Java shift left long support (semantics as described in JVM spec., p.305) 0N/A // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n)) 0N/A // shift value is in rcx ! 0N/A andl(s,
0x3f);
// s := s & 0x3f (s < 0x40) 0N/A // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n! 0N/A // Java shift right long support (semantics as described in JVM spec., p.306 & p.310) 0N/A // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n)) 0N/A andl(s,
0x3f);
// s := s & 0x3f (s < 0x40) 0N/A // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n! 304N/A// src should NEVER be a real pointer. Use AddressLiteral for true pointers 304N/A // In order to get locks to work, we need to fake a in_VM state 304N/A // To see where a verify_oop failed, get $ebx+40/X for this frame. 304N/A // This is the value of eip which points to where verify_oop will return. 304N/A // push address of message 304N/A // push address of message 304N/A // amd64 always does this as a pc-rel 304N/A // we can be absolute or disp based on the instruction type 304N/A // See whether the lock is currently biased toward our thread and 304N/A // whether the epoch is still valid 304N/A // Note that the runtime guarantees sufficient alignment of JavaThread 304N/A // pointers to allow age to be placed into low bits 304N/A // First check to see whether biasing is even enabled for this object 304N/A // The bias pattern is present in the object's header. Need to check 304N/A // whether the bias owner and the epoch are both still current. 304N/A // At this point we know that the header has the bias pattern and 304N/A // that we are not the bias owner in the current epoch. We need to 304N/A // figure out more details about the state of the header in order to 304N/A // know what operations can be legally performed on the object's 304N/A // If the low three bits in the xor result aren't clear, that means 304N/A // the prototype header is no longer biased and we have to revoke 304N/A // the bias on this object. 304N/A // Biasing is still enabled for this data type. See whether the 304N/A // epoch of the current bias is still valid, meaning that the epoch 304N/A // bits of the mark word are equal to the epoch bits of the 304N/A // prototype header. (Note that the prototype header's epoch bits 304N/A // only change at a safepoint.) If not, attempt to rebias the object 304N/A // toward the current thread. 
Note that we must be absolutely sure 304N/A // that the current epoch is invalid in order to do this because 304N/A // otherwise the manipulations it performs on the mark word are 304N/A // The epoch of the current bias is still valid but we know nothing 304N/A // about the owner; it might be set or it might be clear. Try to 304N/A // acquire the bias of the object using an atomic operation. If this 304N/A // fails we will go in to the runtime to revoke the object's bias. 304N/A // Note that we first construct the presumed unbiased header so we 304N/A // don't accidentally blow away another thread's valid bias. 304N/A // If the biasing toward our thread failed, this means that 304N/A // another thread succeeded in biasing it toward itself and we 304N/A // need to revoke that bias. The revocation will occur in the 304N/A // interpreter runtime in the slow case. 304N/A // At this point we know the epoch has expired, meaning that the 304N/A // current "bias owner", if any, is actually invalid. Under these 304N/A // circumstances _only_, we are allowed to use the current header's 304N/A // value as the comparison value when doing the cas to acquire the 304N/A // bias in the current epoch. In other words, we allow transfer of 304N/A // the bias from one thread to another directly in this situation. 304N/A // FIXME: due to a lack of registers we currently blow away the age 304N/A // bits in this situation. Should attempt to preserve them. 304N/A // If the biasing toward our thread failed, then another thread 304N/A // succeeded in biasing it toward itself and we need to revoke that 304N/A // bias. The revocation will occur in the runtime in the slow case. 304N/A // The prototype mark in the klass doesn't have the bias bit set any 304N/A // more, indicating that objects of this data type are not supposed 304N/A // to be biased any more. 
We are going to try to reset the mark of 304N/A // this object to the prototype value and fall through to the 304N/A // CAS-based locking scheme. Note that if our CAS fails, it means 304N/A // that another thread raced us for the privilege of revoking the 304N/A // bias of this particular object, so it's okay to continue in the 304N/A // normal locking code. 304N/A // FIXME: due to a lack of registers we currently blow away the age 304N/A // bits in this situation. Should attempt to preserve them. 304N/A // Fall through to the normal CAS-based lock, because no matter what 304N/A // the result of the above CAS, some thread must have succeeded in 304N/A // removing the bias bit from the object's header. 304N/A // Windows always allocates space for its register args 304N/A // Align stack if necessary 304N/A // restore stack pointer 304N/A // Full implementation of Java ldiv and lrem; checks for special 304N/A // case as described in JVM spec., p.243 & p.271. The function 304N/A // returns the (pc) offset of the idivl instruction - may be needed 304N/A // for implicit exceptions. 304N/A // normal case special case 304N/A // input : rax: dividend min_long 304N/A // output: rax: quotient (= rax idiv reg) min_long 304N/A // rdx: remainder (= rax irem reg) 0 304N/A // check for special case 304N/A // normal and special case exit 304N/A // A 5 byte nop that is safe for patching (see patch_verified_entry) 304N/A // Recommended sequence from 'Software Optimization Guide for the AMD 304N/A// 32bit can do a case table jump in one instruction but we no longer allow the base 304N/A// to be installed in the Address class 304N/A // %%% is this really better? Why not on 32bit too? 304N/A// src should NEVER be a real pointer. 
Use AddressLiteral for true pointers 304N/A// These are mostly for initializing NULL 304N/A // we must set sp to zero to clear frame 304N/A // must clear fp, so that compiled frames are not confused; it is 304N/A // possible that we need it only for debugging 304N/A // determine last_java_sp register 304N/A // last_java_fp is optional 304N/A // last_java_pc is optional 304N/A andq(
rsp, -
16);
// align stack as required by push_CPU_state and call 304N/A // In order to get locks to work, we need to fake a in_VM state 304N/A // To see where a verify_oop failed, get $ebx+40/X for this frame. 304N/A // XXX correct this offset for amd64 304N/A // This is the value of eip which points to where verify_oop will return. 304N/A// Now versions that are common to 32/64 bit 304N/A// Writes to stack successive pages until offset reached to check for 304N/A// stack overflow + shadow pages. This clobbers tmp. 304N/A // Bang stack for total size given plus shadow page size. 304N/A // Bang one page at a time because large size can bang beyond yellow and 304N/A // Bang down shadow pages too. 304N/A // The -1 because we already subtracted 1 page. 304N/A // this could be any sized move but this is can be a debugging crumb 304N/A // so the bigger the better. 304N/A // Check for biased locking unlock case, which is a no-op 304N/A // Note: we do not have to check the thread ID for two reasons. 304N/A // First, the interpreter checks for IllegalMonitorStateException at 304N/A // a higher level. Second, if the bias was revoked while we held the 304N/A // lock, the object could not be rebiased toward another thread, so 304N/A // the bias bit would be clear. 304N/A // implements x == 0 ? 
0 : 1 304N/A // note: must only look at least-significant byte of x 304N/A // since C-style booleans are stored in one byte 304N/A// Wouldn't need if AddressLiteral version had new name 304N/A// Implementation of call_VM versions 304N/A // determine java_thread register 304N/A // determine last_java_sp register 304N/A // push java thread (becomes first argument of C function) 304N/A // set last Java frame before call 304N/A // Only interpreter should have to set fp 304N/A // do the call, remove parameters 304N/A // restore the thread (cannot use the pushed argument since arguments 304N/A // may be overwritten by C code generated by an optimizing compiler); 304N/A // however can use the register value directly if it is callee saved. 304N/A // rdi & rsi (also r15) are callee saved -> nothing to do 304N/A stop(
"MacroAssembler::call_VM_base: rdi not callee saved?");
304N/A // reset last Java frame 304N/A // Only interpreter should have to clear fp 304N/A // C++ interp handles this in the interpreter 304N/A // check for pending exceptions (java_thread is set upon return) 304N/A // This used to conditionally jump to forward_exception however it is 304N/A // possible if we relocate that the branch will not reach. So we must jump 304N/A // around so we can always reach 304N/A // get oop result if there is one and reset the value in the thread 304N/A // Calculate the value for last_Java_sp 304N/A // somewhat subtle. call_VM does an intermediate call 304N/A // which places a return address on the stack just under the 304N/A // stack pointer as the user finsihed with it. This allows 304N/A // use to retrieve last_Java_pc from last_Java_sp[-1]. 304N/A // On 32bit we then have to push additional args on the stack to accomplish 304N/A // the actual requested call. On 64bit call_VM only can use register args 304N/A // so the only extra space is the return address that call_VM created. 304N/A // This hopefully explains the calculations here. 304N/A // We've pushed one address, correct last_Java_sp 304N/A }
else {
// unordered is greater 304N/A }
else {
// unordered is greater 304N/A // moves src2's literal address 304N/A // Full implementation of Java idiv and irem; checks for 304N/A // special case as described in JVM spec., p.243 & p.271. 304N/A // The function returns the (pc) offset of the idivl 304N/A // instruction - may be needed for implicit exceptions. 304N/A // normal case special case 304N/A // input : rax,: dividend min_int 304N/A // reg: divisor (may not be rax,/rdx) -1 304N/A // output: rax,: quotient (= rax, idiv reg) min_int 304N/A // rdx: remainder (= rax, irem reg) 0 304N/A // check for special case 304N/A xorl(
rdx,
rdx);
// prepare rdx for possible special case (where remainder = 0) 304N/A // normal and special case exit 304N/A// !defined(COMPILER2) is because of stupid core builds 304N/A#
endif // !LP64 || C1 || !C2 304N/A// Defines obj, preserves var_size_in_bytes 304N/A // if end < obj then we wrapped around => object too long => slow case 304N/A // Compare obj with the top addr, and if still equal, store the new top addr in 304N/A // end at the address of the top addr pointer. Sets ZF if was equal, and clears 304N/A // it otherwise. Use lock prefix for atomicity on MPs. 0N/A // convert FPU condition into eflags condition via rax, 0N/A // condition codes set as follows: 0N/A // CF (corresponds to C0) if x < y 0N/A // PF (corresponds to C2) if unordered 0N/A // ZF (corresponds to C3) if x = y 0N/A }
else {
// unordered is greater 304N/A // Note: fxch & fpop to get rid of ST1 304N/A // (otherwise FPU stack could overflow eventually) 304N/A // 0111 tttn #8-bit disp 304N/A // 0000 1111 1000 tttn #32-bit disp 304N/A// word => int32 which seems bad for 64bit 304N/A // This is dubious to me since it seems safe to do a signed 16 => 64 bit 304N/A // version but this is what 64bit has always done. This seems to imply 304N/A // that users are only using 32bits worth. 304N/A // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16, 304N/A // and "3.9 Partial Register Penalties", p. 22). 304N/A // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16, 304N/A // and "3.9 Partial Register Penalties", p. 22). 0N/A// C++ bool manipulation 0N/A if(
sizeof(
bool) ==
1)
0N/A else if(
sizeof(
bool) ==
2)
0N/A else if(
sizeof(
bool) ==
4)
0N/A if(
sizeof(
bool) ==
1)
0N/A else if(
sizeof(
bool) ==
2)
0N/A else if(
sizeof(
bool) ==
4)
0N/A if(
sizeof(
bool) ==
1)
0N/A else if(
sizeof(
bool) ==
2)
0N/A else if(
sizeof(
bool) ==
4)
304N/A// src should NEVER be a real pointer. Use AddressLiteral for true pointers 304N/A // provoke OS NULL exception if reg = NULL by 304N/A // accessing M[reg] w/o changing any (non-CC) registers 304N/A // NOTE: cmpl is plenty here to provoke a segv 304N/A // Note: should probably use testl(rax, Address(reg, 0)); 304N/A // may be shorter code (however, this version of 304N/A // testl needs to be implemented first) 304N/A // nothing to do, (later) access of M[reg + offset] 304N/A // will provoke OS NULL exception if reg = NULL 304N/A // instead of directly emitting a breakpoint, call os:breakpoint for better debugability 304N/A // (e.g., MSVC can't call ps() otherwise) 304N/A// Save Integer and Float state 304N/A// Warning: Stack must be 16 byte aligned (64bit) 304N/A // Push flags first because pusha kills them 304N/A // Make sure rsp stays 16-byte aligned 304N/A // determine java_thread register 304N/A // we must set sp to zero to clear frame 304N/A// Write serialization page so VM thread can do a pseudo remote membar. 304N/A// We use the current thread pointer to calculate a thread specific 304N/A// offset to write to within the page. This minimizes bus traffic 304N/A// due to cache line collision. 304N/A// When entering C land, the rbp, & rsp of the last Java frame have to be recorded 304N/A// in the (thread-local) JavaThread object. When leaving C land, the last Java fp 304N/A// has to be reset to 0. This is required to allow proper stack traversal. 304N/A // determine java_thread register 304N/A // determine last_java_sp register 304N/A // last_java_fp is optional 304N/A // last_java_pc is optional 304N/A // Does a store check for the oop in register obj. The content of 304N/A // register obj is destroyed afterwards. 
304N/A// split the store check operation so that other instructions can be scheduled inbetween 304N/A // The calculation for byte_map_base is as follows: 304N/A // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift); 304N/A // So this essentially converts an address to a displacement and 304N/A // it will never need to be relocated. On 64bit however the value may be too 304N/A // large for a 32bit displacement 304N/A // By doing it as an ExternalAddress disp could be converted to a rip-relative 304N/A // displacement and done in a single instruction given favorable mapping and 304N/A // a smarter version of as_Address. Worst case it is two instructions which 304N/A // is no worse off then loading disp into a register and doing as a simple 304N/A // We can't do as ExternalAddress as the only style since if disp == 0 we'll 304N/A // assert since NULL isn't acceptable in a reloci (see 6644928). In any case 304N/A // in some cases we'll get a single instruction version. 304N/A// C++ bool manipulation 0N/A if(
sizeof(
bool) ==
1)
0N/A else if(
sizeof(
bool) ==
2) {
0N/A // testw implementation needed for two byte bools 0N/A }
else if(
sizeof(
bool) ==
4)
304N/A// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 304N/A // update the tlab top pointer 304N/A // recover var_size_in_bytes if necessary 304N/A// Preserves rbx, and rdx. 304N/A // No allocation in the shared eden. 304N/A // calculate amount of free space 304N/A // Retain tlab and allocate object in shared space if 304N/A // the amount free in the tlab is too large to discard. 304N/A // %%% yuck as movptr... 304N/A // increment number of slow_allocations 304N/A // increment number of refills 304N/A // accumulate wastage -- t1 is amount free in tlab 304N/A // if tlab is currently allocated (top or end != null) then 304N/A // fill [top, end + alignment_reserve) with array object 304N/A // set up the mark word 304N/A // set the length to the remaining space 304N/A // set klass to intArrayKlass 304N/A // dubious reloc why not an oop reloc? 304N/A // store klass last. concurrent gcs assumes klass length is valid if 304N/A // klass field is not null. 304N/A // refill the tlab with an eden allocation 304N/A // Check that t1 was preserved in eden_allocate. 304N/Astatic const double pi_4 =
0.7853981633974483;
304N/A // A hand-coded argument reduction for values in fabs(pi/4, pi/2) 304N/A // was attempted in this code; unfortunately it appears that the 304N/A // switch to 80-bit precision and back causes this to be 304N/A // unprofitable compared with simply performing a runtime call if 304N/A // the argument is out of the (-pi/4, pi/4) range. 304N/A // fcmp needs a temporary so preserve rbx, 304N/A // fastest case: -pi/4 <= x <= pi/4 304N/A // slow case: runtime call 304N/A // Preserve registers across runtime call 304N/A // Must preserve all other FPU regs (could alternatively convert 304N/A // SharedRuntime::dsin and dcos into assembly routines known not to trash 304N/A // FPU state, but can not trust C compiler) 304N/A // NOTE that in this case we also push the incoming argument to 304N/A // the stack and restore it later; we also use this stack slot to 304N/A // hold the return value from dsin or dcos. 304N/A // NOTE: we must not use call_VM_leaf here because that requires a 304N/A // complete interpreter frame in debug mode -- same bug as 4387334 304N/A // MacroAssembler::call_VM_leaf_base is perfectly safe and will 304N/A // Need to add stack banging before this runtime call if it needs to 304N/A // be taken; however, there is no generic stack banging routine at 304N/A // the MacroAssembler level 304N/A // Must save return value to stack and then restore entire FPU stack 304N/A // Come here with result in F-TOS 0N/A // Pass register number to verify_oop_subroutine 304N/A // avoid using pushptr, as it modifies scratch registers 304N/A // and our contract is not to modify anything 0N/A // call indirectly to solve generation ordering problem 0N/A // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord); 0N/A // Pass register number to verify_oop_subroutine 0N/A // addr may contain rsp so we will have to adjust it based on the push 304N/A // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which 304N/A // stores rax 
into addr which is backwards of what was intended. 0N/A // pass msg argument 304N/A // avoid using pushptr, as it modifies scratch registers 304N/A // and our contract is not to modify anything 0N/A // call indirectly to solve generation ordering problem 0N/A // Caller pops the arguments and restores rax, from the stack 0N/A case 0:
rc =
"round near";
break;
0N/A case 1:
rc =
"round down";
break;
0N/A case 2:
rc =
"round up ";
break;
0N/A case 3:
rc =
"chop ";
break;
0N/A // precision control 0N/A case 0:
pc =
"24 bits ";
break;
0N/A case 1:
pc =
"reserved";
break;
0N/A case 2:
pc =
"53 bits ";
break;
0N/A case 3:
pc =
"64 bits ";
break;
0N/A c[0] = (
C3()) ?
'3' :
'-';
0N/A c[
1] = (
C2()) ?
'2' :
'-';
0N/A c[
2] = (
C1()) ?
'1' :
'-';
0N/A c[
3] = (
C0()) ?
'0' :
'-';
0N/A case 0:
return "valid";
0N/A case 1:
return "zero";
0N/A case 2:
return "special";
0N/A case 3:
return "empty";
0N/A // print computation registers 0N/A printf(
"%c r%d = ST%d = ", (j == 0 ?
'*' :
' '), i, j);
0N/A // print control registers 0N/A // computation registers 0N/A // control registers 0N/A printf(
"--------------------------------------------------\n");
0N/A printf(
"--------------------------------------------------\n");
0N/A // For leaf calls, only verify that the top few elements remain empty. 0N/A // We only need 1 empty at the top for C2 code. 0N/A return true;
// All other stack states do not matter 0N/A "bad FPU control word");
0N/A // compute stack depth 0N/A // stack not contiguous 0N/A printf(
"%s: stack not contiguous at ST%d\n", s, i);
0N/A // check if computed stack depth corresponds to expected stack depth 0N/A // expected stack depth is -stack_depth or less 0N/A // too many elements on the stack 0N/A // expected stack depth is stack_depth 0N/A // wrong stack depth 0N/A // everything is cool 0N/A // pass message string s 0N/A int3();
// break if error condition 304N/A // Store to klass gap in destination 304N/A stop(
"MacroAssembler::encode_heap_oop: heap base corrupted?");
304N/A stop(
"null oop passed to encode_heap_oop_not_null");
304N/A stop(
"null oop passed to encode_heap_oop_not_null2");
304N/A stop(
"MacroAssembler::decode_heap_oop: heap base corrupted?");
304N/A // alternate decoding probably a wash. 304N/A // Cannot assert, unverified entry point counts instructions (see .ad file) 304N/A // vtableStubs also counts instructions in pd_code_size_limit. 304N/A // Also do not verify_oop as this is called by verify_oop. 304N/A // Cannot assert, unverified entry point counts instructions (see .ad file) 304N/A // vtableStubs also counts instructions in pd_code_size_limit. 304N/A // Also do not verify_oop as this is called by verify_oop. 0N/A // Note some conditions are synonyms for others