assembler_x86.cpp revision 3932
3050N/A * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved. 0N/A * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 0N/A * This code is free software; you can redistribute it and/or modify it 0N/A * under the terms of the GNU General Public License version 2 only, as 0N/A * published by the Free Software Foundation. 0N/A * This code is distributed in the hope that it will be useful, but WITHOUT 0N/A * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 0N/A * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 0N/A * version 2 for more details (a copy is included in the LICENSE file that 0N/A * accompanied this code). 0N/A * You should have received a copy of the GNU General Public License version 0N/A * 2 along with this work; if not, write to the Free Software Foundation, 0N/A * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 1472N/A * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 0N/A// Implementation of AddressLiteral 0N/A // Oops are a special case. Normally they would be their own section 0N/A // but in cases like icBuffer they are literals in the code stream that 0N/A // we don't have a section for. We use none so that we get a literal address 0N/A // which is always patchable. 0N/A// Implementation of Address 0N/A // Not implementable on 64bit machines 0N/A // Should have been handled higher up the call chain. 304N/A// exceedingly dangerous constructor 0N/A// exceedingly dangerous constructor 0N/A// Convert the raw encoding form into the form expected by the constructor for 0N/A// Address. An index of 4 (rsp) corresponds to having no index, so convert 0N/A// that to noreg for the Address constructor. 0N/A// Implementation of Assembler 0N/A// make this go away someday 0N/A // Do not use AbstractAssembler::relocate, which is not intended for 0N/A // embedded words. Instead, relocate to the enclosing instruction. 0N/A // hack. 
call32 is too wide for mask so use disp32 0N/A assert((
op1 &
0x02) == 0,
"sign-extension bit should not be set");
3236N/A// Force generation of a 4 byte immediate value even if it fits into 8bit 0N/A// immediate-to-memory forms 0N/A assert((
op1 &
0x02) == 0,
"sign-extension bit should not be set");
0N/A assert((
op1 &
0x02) == 0,
"sign-extension bit should not be set");
304N/A // Encode the registers as needed in the fields they are used in 0N/A // [base + index*scale + disp] 0N/A // [base + index*scale] 0N/A // [00 reg 100][ss index base] 0N/A // [base + index*scale + imm8] 0N/A // [01 reg 100][ss index base] imm8 304N/A // [base + index*scale + disp32] 304N/A // [10 reg 100][ss index base] disp32 0N/A // [00 reg 100][00 100 100] 304N/A // [01 reg 100][00 100 100] disp8 304N/A // [10 reg 100][00 100 100] disp32 304N/A // [10 reg base] disp32 0N/A // [index*scale + disp] 304N/A // [00 reg 100][ss index 101] disp32 304N/A // [disp] (64bit) RIP-RELATIVE (32bit) abs 304N/A // Note that the RIP-rel. correction applies to the generated 304N/A // disp field, but _not_ to the target address in the rspec. 304N/A // disp was created by converting the target address minus the pc 304N/A // at the start of the instruction. That needs more correction here. 304N/A // intptr_t disp = target - next_ip; 304N/A // Do rip-rel adjustment for 64bit 304N/A "must be 32bit offset (RIP relative address)");
304N/A // [00 reg 100][00 100 101] disp32 0N/A// Secret local extension to Assembler::WhichOperand: 0N/A // Decode the given instruction, and return the address of 0N/A // an embedded 32-bit operand word. 0N/A // If "which" is disp32_operand, selects the displacement portion 0N/A // of an effective address specifier. 304N/A // If "which" is imm64_operand, selects the trailing immediate constant. 0N/A // If "which" is call32_operand, selects the displacement of a call or jump. 0N/A // Caller is responsible for ensuring that there is such an operand, 304N/A // and that it is 32/64 bits wide. 0N/A // If "which" is end_pc_operand, find the end of the instruction. 304N/A int tail_size = 0;
// other random bytes (#32, #16, etc.) at end of insn 0N/A // These convenience macros generate groups of "case" labels for the switch. 304N/A#
define REP4(x) (x)+0:
case (x)+
1:
case (x)+
2:
case (x)+
3 304N/A#
define REP8(x) (x)+0:
case (x)+
1:
case (x)+
2:
case (x)+
3: \
0N/A case (x)+
4:
case (x)+
5:
case (x)+
6:
case (x)+
7 304N/A case 0xFF:
// pushq a; decl a; incl a; call a; jmp a 0N/A case 0x88:
// movb a, r 0N/A case 0x89:
// movl a, r 0N/A case 0x8A:
// movb r, a 0N/A case 0x8B:
// movl r, a 0N/A case 0x8F:
// popl a 304N/A case 0x68:
// pushq #32 0N/A return ip;
// not produced by emit_operand 0N/A case 0x66:
// movw ... (size prefix) 0N/A case 0x8B:
// movw r, a 0N/A case 0x89:
// movw a, r 0N/A case 0xC7:
// movw a, #16 0N/A ip--;
// reparse the 0x0F 304N/A case REP8(
0xB8):
// movl/q r, #32/#64(oop?) 304N/A // these asserts are somewhat nonsensical 0N/A case 0x69:
// imul r, a, #32 0N/A case 0xC7:
// movl a, #32(oop?) 0N/A case 0x0F:
// movx..., etc. 3039N/A case 0x38:
// ptest, pmovzxbw 3039N/A case 0x70:
// pshufd r, r/a, #8 0N/A case 0x12:
// movlps 0N/A case 0x28:
// movaps 0N/A case 0x2E:
// ucomiss 0N/A case 0x2F:
// comiss 0N/A case 0x55:
// andnps 3039N/A case 0xAE:
// ldmxcsr, stmxcsr, fxrstor, fxsave, clflush 0N/A case 0xAD:
// shrd r, a, %cl 0N/A case 0xAF:
// imul r, a 304N/A case 0xBE:
// movsbl r, a (movsxb) 304N/A case 0xBF:
// movswl r, a (movsxw) 304N/A case 0xB6:
// movzbl r, a (movzxb) 304N/A case 0xB7:
// movzwl r, a (movzxw) 0N/A case 0xB0:
// cmpxchgb 0N/A case 0xB1:
// cmpxchg 0N/A case 0xC7:
// cmpxchg8 0N/A // fall out of the switch to decode the address 3039N/A case 0xC4:
// pinsrw r, a, #8 3039N/A case 0xC5:
// pextrw r, r, #8 0N/A case 0xAC:
// shrd r, a, #8 0N/A case 0x81:
// addl a, #32; addl r, #32 0N/A // also: orl, adcl, sbbl, andl, subl, xorl, cmpl 304N/A // on 32bit in the case of cmpl, the imm might be an oop 0N/A case 0x83:
// addl a, #8; addl r, #8 0N/A // also: orl, adcl, sbbl, andl, subl, xorl, cmpl 0N/A case 0xD9:
// fnstcw a 0N/A case REP4(
0x00):
// addb a, r; addl a, r; addb r, a; addl r, a 304N/A case 0x87:
// xchg r, a 304N/A case 0x85:
// test r, a 0N/A case 0xC1:
// sal a, #8; sar a, #8; shl a, #8; shr a, #8 0N/A case 0xC6:
// movb a, #8 0N/A case 0x80:
// cmpb a, #8 0N/A case 0x6B:
// imul r, a, #8 3039N/A // C4 and C5 are also used as opcodes for PINSRW and PEXTRW instructions 3039N/A // but they have prefix 0x0F and processed when 0x0F processed above. 3039N/A // In 32-bit mode the VEX first byte C4 and C5 alias onto LDS and LES 3039N/A // instructions (these instructions are not supported in 64-bit mode). 3039N/A // To distinguish them bits [7:6] are set in the VEX second byte since 3039N/A // ModRM byte can not be of the form 11xxxxxx in 32-bit mode. To set 3039N/A // those VEX bits REX and vvvv bits are inverted. 3039N/A // Fortunately C2 doesn't generate these instructions so we don't need 3039N/A // to check for them in product version. 3039N/A // To find the end of instruction (which == end_pc_operand). 3039N/A case 0x61:
// pcmpestri r, r/a, #8 3039N/A case 0x70:
// pshufd r, r/a, #8 0N/A case 0xD1:
// sal a, 1; sar a, 1; shl a, 1; shr a, 1 0N/A case 0xD3:
// sal a, %cl; sar a, %cl; shl a, %cl; shr a, %cl 0N/A case 0xD9:
// fld_s a; fst_s a; fstp_s a; fldcw a 0N/A case 0xDD:
// fld_d a; fst_d a; fstp_d a 0N/A case 0xDB:
// fild_s a; fistp_s a; fld_x a; fstp_x a 0N/A case 0xDF:
// fild_d a; fistp_d a 0N/A case 0xD8:
// fadd_s a; fsubr_s a; fmul_s a; fdivr_s a; fcomp_s a 0N/A case 0xDC:
// fadd_d a; fsubr_d a; fmul_d a; fdivr_d a; fcomp_d a 0N/A case 0xDE:
// faddp_d a; fsubrp_d a; fmulp_d a; fdivrp_d a; fcompp_d a 0N/A case 0xF3:
// For SSE 0N/A case 0xF2:
// For SSE2 304N/A // assert(which != imm_operand || has_imm32, "instruction has no imm32 field"); 0N/A // parse the output of emit_operand 0N/A // now ip points at the disp (if any) 0N/A // [00 reg 100][ss index base] 304N/A // [00 reg 100][00 100 esp] 0N/A // [00 reg 100][ss index 101][disp32] 0N/A // [00 reg 101] [disp32] 0N/A return ip;
// caller wants the disp32 0N/A ip +=
4;
// skip the disp32 0N/A // [01 reg 100][ss index base][disp8] 304N/A // [01 reg 100][00 100 esp][disp8] 0N/A // [01 reg base] [disp8] 0N/A ip +=
1;
// skip the disp8 0N/A // [10 reg 100][ss index base][disp32] 304N/A // [10 reg 100][00 100 esp][disp32] 0N/A // [10 reg base] [disp32] 0N/A return ip;
// caller wants the disp32 0N/A ip +=
4;
// skip the disp32 0N/A // [11 reg base] (not a memory addressing mode) 0N/A // Secretly share code with locate_operand: 0N/A // assert(format == imm32_operand, "cannot specify a nonzero format"); 304N/A// work around gcc (3.2.1-7a) bug 0N/A assert(0 <= i && i <
8,
"illegal stack offset");
1988N/A// Now the Assembler instructions (identical for 32/64 bits) 0N/A // 4 bytes: NOP DWORD PTR [EAX+0] 0N/A // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset 0N/A // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset 0N/A // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset 304N/A // suspect disp32 is always good 304N/A // 1110 1000 #32-bit disp 304N/A // 1110 1000 #32-bit disp 304N/A // Technically, should use call32_operand, but this format is 304N/A // implied by the fact that we're emitting a call instruction. 304N/A// The 32-bit cmpxchg compares the value at adr with the contents of rax, 304N/A// and stores reg into adr if so; otherwise, the value at adr is loaded into rax,. 304N/A// The ZF is set if the compared values were equal, and cleared otherwise. 304N/A // caveat: no instructionmark, so this isn't relocatable. 304N/A // Emit a synthetic, non-atomic, CAS equivalent. 304N/A // Beware. The synthetic form sets all ICCs, not just ZF. 304N/A // cmpxchg r,[m] is equivalent to rax, = CAS (m, rax, r) 304N/A // NOTE: dbx seems to decode this as comiss even though the 304N/A // 0x66 is there. Strangly ucomisd comes out correct 304N/A // Don't use it directly. Use MacroAssembler::decrement() instead. 304N/A // Don't use it directly. Use MacroAssembler::increment() instead. 304N/A // 0111 tttn #8-bit disp 304N/A // 0000 1111 1000 tttn #32-bit disp 304N/A "must be 32bit offset (call4)");
304N/A // Note: could eliminate cond. jumps to this jump if condition 304N/A // is the same however, seems to be rather unlikely case. 304N/A // Note: use jccb() if label to be bound is very close to get 304N/A // an 8-bit displacement 304N/A // 0111 tttn #8-bit disp 304N/A // By default, forward jumps are always 32-bit displacements, since 304N/A // we can't yet know where the label will be bound. If you're sure that 304N/A // the forward jump will not run beyond 256 bytes, use jmpb to 304N/A // force an 8-bit displacement. 304N/A // Emit either nothing, a NOP, or a NOP: prefix 671N/A// Emit mfence instruction 3845N/A// Move Unaligned 256bit Vector 3845N/A // swap src<->dst for encoding 304N/A// Uses zero extension on 64bit 304N/A// New cpus require to use movsd and movss to avoid partial register stall 304N/A// when loading from memory. But for old Opteron use movlpd instead of movsd. 304N/A// The selection is done in MacroAssembler::movdbl() and movflt(). 304N/A // workaround gcc (3.2.1-7a) bug 304N/A // In that version of gcc with only an emit_operand(MMX, Address) 304N/A // gcc will tail jump and try and reverse the parameters completely 304N/A // obliterating dst in the process. By having a version available 304N/A // that doesn't need to swap the args at the tail jump the bug is 304N/A // The fancy nops aren't currently recognized by debuggers making it a 304N/A // pain to disassemble code while debugging. 
If asserts are on clearly 304N/A // speed is not an issue so simply use the single byte traditional nop 0N/A // Using multi-bytes nops "0x0F 0x1F [address]" for Intel 0N/A // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding) 0N/A // 4: 0x0F 0x1F 0x40 0x00 0N/A // 5: 0x0F 0x1F 0x44 0x00 0x00 0N/A // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00 0N/A // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0N/A // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0N/A // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0N/A // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0N/A // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0N/A // The rest coding is Intel specific - don't use consecutive address nops 0N/A // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90 0N/A // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90 0N/A // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90 0N/A // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90 0N/A // For Intel don't generate consecutive addess nops (mix with regular nops) 0N/A // Don't use "0x0F 0x1F 0x00" - need patching safe padding 0N/A // Using multi-bytes nops "0x0F 0x1F [address]" for AMD. 
0N/A // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding) 0N/A // 4: 0x0F 0x1F 0x40 0x00 0N/A // 5: 0x0F 0x1F 0x44 0x00 0x00 0N/A // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00 0N/A // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0N/A // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0N/A // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0N/A // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0N/A // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0N/A // The rest coding is AMD specific - use consecutive address nops 0N/A // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00 0N/A // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00 0N/A // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0N/A // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0N/A // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0N/A // Size prefixes (0x66) are added for larger sizes 0N/A // Generate first nop for size between 21-12 0N/A // Generate second nop for size between 11-1 0N/A // Don't use "0x0F 0x1F 0x00" - need patching safe padding 0N/A // Using nops with size prefixes "0x66 0x90". 0N/A // From AMD Optimization Guide: 0N/A // 3: 0x66 0x66 0x90 0N/A // 4: 0x66 0x66 0x66 0x90 0N/A // 5: 0x66 0x66 0x90 0x66 0x90 0N/A // 6: 0x66 0x66 0x90 0x66 0x66 0x90 0N/A // 7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0N/A // 8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90 0N/A // 9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90 0N/A // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90 304N/A // NOTE: this will adjust stack by 8byte on 64bits 2167N/A // Shift 64 bit value logically right by specified number of bits. 2167N/A // HMM Table D-1 says sse2 or mmx. 2167N/A // Do not confuse it with psrldq SSE2 instruction which 2167N/A // shifts 128 bit value in xmm register by number of bytes. 2167N/A // Shift 128 bit value in xmm register by number of bytes. 
304N/A // in 64bits we push 64bits onto the stack but only 304N/A // take a 32bit immediate 304N/A // Note this will push 64bit on 64bit 304N/A// copies data from [esi] to [edi] using rcx pointer sized words 304N/A// sets rcx pointer sized words with rax, value at [edi] 304N/A// scans rcx pointer sized words at [edi] for occurance of rax, 304N/A// scans rcx 4 byte words at [edi] for occurance of rax, 304N/A // Not supported in 64bit mode 0N/A// copies a single word from [esi] to [edi] 3236N/A// Force generation of a 4 byte immediate value even if it fits into 8bit 304N/A // not using emit_arith because test 304N/A // doesn't support sign-extension of 3041N/A// AVX 3-operands non destructive source instructions (encoded with VEX prefix) 3845N/A // 0x00 - insert into lower 128 bits 3845N/A // 0x01 - insert into upper 128 bits 3891N/A // 0x00 - insert into lower 128 bits 3891N/A // 0x01 - insert into upper 128 bits 304N/A// 32bit only pieces of the assembler 304N/A // NO PREFIX AS NEVER 64BIT 304N/A // NO PREFIX AS NEVER 64BIT (not even 32bit versions of 64bit regs 304N/A// The 64-bit (32bit platform) cmpxchg compares the value at adr with the contents of rdx:rax, 304N/A// and stores rcx:rbx into adr if so; otherwise, the value at adr is loaded 304N/A// into rdx:rax. The ZF is set if the compared values were equal, and cleared otherwise. 304N/A // Don't use it directly. Use MacroAssembler::decrementl() instead. 304N/A// 64bit typically doesn't use the x87 but needs to for the trig funcs 304N/A// Note: The Intel manual (Pentium Processor User's Manual, Vol.3, 1994) 304N/A// is erroneous for some of the floating-point instructions below. 304N/A emit_farith(
0xDE,
0xF8, i);
// ST(0) <- ST(0) / ST(1) and pop (Intel manual wrong) 304N/A emit_farith(
0xDE,
0xF0, i);
// ST(0) <- ST(1) / ST(0) and pop (Intel manual wrong) 304N/A emit_farith(
0xDE,
0xE8, i);
// ST(0) <- ST(0) - ST(1) and pop (Intel manual wrong) 304N/A emit_farith(
0xDE,
0xE0, i);
// ST(0) <- ST(1) - ST(0) and pop (Intel manual wrong) 0N/A // make sure the instruction is supported (introduced for P6, together with cmov) 0N/A // make sure the instruction is supported (introduced for P6, together with cmov) 3039N/A// SSE SIMD prefix byte values corresponding to VexSimdPrefix encoding. 3039N/A// SSE opcode second byte values (first is 0x0F) corresponding to VexOpcode encoding. 3039N/A// Generate SSE legacy REX prefix and SIMD opcode based on VEX encoding. 304N/A // Don't use it directly. Use MacroAssembler::incrementl() instead. 304N/A// 64bit only pieces of the assembler 304N/A// This should only be used by 64bit instructions that can use rip-relative 304N/A// it cannot be used by instructions that want an immediate value. 304N/A // None will force a 64bit literal to the code stream. Likely a placeholder 304N/A // for something that will be patched later and we need to certain it will 304N/A // always be reachable. 304N/A // This should be rip relative and easily reachable. 304N/A // This should be rip relative within the code cache and easily 304N/A // reachable until we get huge code caches. (At which point 304N/A // ic code is going to have issues). 304N/A // Stress the correction code 304N/A // Must be runtimecall reloc, see if it is in the codecache 304N/A // Flipping stuff in the codecache to be unreachable causes issues 304N/A // with things like inline caches where the additional instructions 304N/A // are now (possibly a temp buffer) and where we might end up 304N/A // anywhere in the codeCache then we are always reachable. 304N/A // to be more pessimistic. 304N/A // Because rip relative is a disp + address_of_next_instruction and we 304N/A // don't know the value of address_of_next_instruction we apply a fudge factor 304N/A // to make sure we will be ok no matter the size of the instruction we get placed into. 304N/A // We don't have to fudge the checks above here because they are already worst case. 
304N/A // 12 == override/rex byte, opcode byte, rm byte, sib byte, a 4-byte disp , 4-byte literal 304N/A // + 4 because better safe than sorry. 2251N/A// Check if the polling page is not reachable from the code cache using rip-relative 304N/A // Do not use AbstractAssembler::relocate, which is not intended for 304N/A // embedded words. Instead, relocate to the enclosing instruction. 304N/A // Don't use it directly. Use MacroAssembler::decrementl() instead. 304N/A // Use two-byte form (one-byte form is a REX prefix in 64-bit mode) 304N/A // Don't use it directly. Use MacroAssembler::decrementq() instead. 304N/A // Use two-byte form (one-byte from is a REX prefix in 64-bit mode) 304N/A // Don't use it directly. Use MacroAssembler::decrementq() instead. 304N/A // Don't use it directly. Use MacroAssembler::incrementl() instead. 304N/A // Use two-byte form (one-byte from is a REX prefix in 64-bit mode) 304N/A // Don't use it directly. Use MacroAssembler::incrementq() instead. 304N/A // Use two-byte form (one-byte from is a REX prefix in 64-bit mode) 304N/A // Don't use it directly. Use MacroAssembler::incrementq() instead. 304N/A // dbx shows movslq(rcx, 3) as movq $0x0000000049000000,(%rbx) 304N/A // and movslq(r8, 3); as movl $0x0000000048000000,(%rbx) 304N/A // as a result we shouldn't use until tested at runtime... 304N/A // we have to store original rsp. ABI says that 128 bytes 304N/A // below rsp are local scratch. 3236N/A// Force generation of a 4 byte immediate value even if it fits into 8bit 304N/A // not using emit_arith because test 304N/A // doesn't support sign-extension of 0N/A// Implementation of MacroAssembler 304N/A// First all the versions that have distinct versions depending on 32/64 bit 304N/A// Unless the difference is trivial (1 line or so). 
304N/A // See whether the lock is currently biased toward our thread and 304N/A // whether the epoch is still valid 304N/A // Note that the runtime guarantees sufficient alignment of JavaThread 304N/A // pointers to allow age to be placed into low bits 304N/A // First check to see whether biasing is even enabled for this object 304N/A // The bias pattern is present in the object's header. Need to check 304N/A // whether the bias owner and the epoch are both still current. 304N/A // Note that because there is no current thread register on x86 we 304N/A // need to store off the mark word we read out of the object to 304N/A // avoid reloading it and needing to recheck invariants below. This 304N/A // store is unfortunate but it makes the overall code shorter and 304N/A // At this point we know that the header has the bias pattern and 304N/A // that we are not the bias owner in the current epoch. We need to 304N/A // figure out more details about the state of the header in order to 304N/A // know what operations can be legally performed on the object's 304N/A // If the low three bits in the xor result aren't clear, that means 304N/A // the prototype header is no longer biased and we have to revoke 304N/A // the bias on this object. 304N/A // Biasing is still enabled for this data type. See whether the 304N/A // epoch of the current bias is still valid, meaning that the epoch 304N/A // bits of the mark word are equal to the epoch bits of the 304N/A // prototype header. (Note that the prototype header's epoch bits 304N/A // only change at a safepoint.) If not, attempt to rebias the object 304N/A // toward the current thread. Note that we must be absolutely sure 304N/A // that the current epoch is invalid in order to do this because 304N/A // otherwise the manipulations it performs on the mark word are 304N/A // The epoch of the current bias is still valid but we know nothing 304N/A // about the owner; it might be set or it might be clear. 
Try to 304N/A // acquire the bias of the object using an atomic operation. If this 304N/A // fails we will go in to the runtime to revoke the object's bias. 304N/A // Note that we first construct the presumed unbiased header so we 304N/A // don't accidentally blow away another thread's valid bias. 304N/A // If the biasing toward our thread failed, this means that 304N/A // another thread succeeded in biasing it toward itself and we 304N/A // need to revoke that bias. The revocation will occur in the 304N/A // interpreter runtime in the slow case. 304N/A // At this point we know the epoch has expired, meaning that the 304N/A // current "bias owner", if any, is actually invalid. Under these 304N/A // circumstances _only_, we are allowed to use the current header's 304N/A // value as the comparison value when doing the cas to acquire the 304N/A // bias in the current epoch. In other words, we allow transfer of 304N/A // the bias from one thread to another directly in this situation. 304N/A // FIXME: due to a lack of registers we currently blow away the age 304N/A // bits in this situation. Should attempt to preserve them. 304N/A // If the biasing toward our thread failed, then another thread 304N/A // succeeded in biasing it toward itself and we need to revoke that 304N/A // bias. The revocation will occur in the runtime in the slow case. 304N/A // The prototype mark in the klass doesn't have the bias bit set any 304N/A // more, indicating that objects of this data type are not supposed 304N/A // to be biased any more. We are going to try to reset the mark of 304N/A // this object to the prototype value and fall through to the 304N/A // CAS-based locking scheme. Note that if our CAS fails, it means 304N/A // that another thread raced us for the privilege of revoking the 304N/A // bias of this particular object, so it's okay to continue in the 304N/A // normal locking code. 
304N/A // FIXME: due to a lack of registers we currently blow away the age 304N/A // bits in this situation. Should attempt to preserve them. 304N/A // Fall through to the normal CAS-based lock, because no matter what 304N/A // the result of the above CAS, some thread must have succeeded in 304N/A // removing the bias bit from the object's header. 304N/A // According to Intel Doc. AP-526, "Integer Divide", p.18. 304N/A // set parity bit if FPU flag C2 is set (via rax) 304N/A // set parity bit if FPU flag C2 is set (via rax) 0N/A// 32bit can do a case table jump in one instruction but we no longer allow the base 0N/A// to be installed in the Address class 304N/A// Note: y_lo will be destroyed 304N/A // Long compare for Java (semantics as described in JVM spec.) 304N/A // x_hi is the return register 0N/A // leal(dst, as_Address(adr)); 304N/A // see note in movl as to why we must use a move 0N/A // Multiplication of two Java long values stored on the stack 0N/A // as illustrated below. Result is in rdx:rax. 0N/A // rsp ---> [ ?? ] \ \ 0N/A // .... | y_rsp_offset | 0N/A // [ y_lo ] / (in bytes) | x_rsp_offset 0N/A // [ y_hi ] | (in bytes) 0N/A // Basic idea: lo(result) = lo(x_lo * y_lo) 0N/A // hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi) 0N/A // load x_hi, y_hi and check if quick 0N/A // multiplication is possible 0N/A // do full multiplication 0N/A // Java shift left long support (semantics as described in JVM spec., p.305) 0N/A // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n)) 0N/A // shift value is in rcx ! 0N/A andl(s,
0x3f);
// s := s & 0x3f (s < 0x40) 0N/A // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n! 0N/A // Java shift right long support (semantics as described in JVM spec., p.306 & p.310) 0N/A // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n)) 0N/A andl(s,
0x3f);
// s := s & 0x3f (s < 0x40) 0N/A // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n! 304N/A// src should NEVER be a real pointer. Use AddressLiteral for true pointers 304N/A // In order to get locks to work, we need to fake a in_VM state 304N/A // To see where a verify_oop failed, get $ebx+40/X for this frame. 304N/A // This is the value of eip which points to where verify_oop will return. 3932N/A // Print some words near top of staack. 3932N/A // Print some instructions around pc: 304N/A // push address of message 304N/A // push address of message 304N/A // amd64 always does this as a pc-rel 304N/A // we can be absolute or disp based on the instruction type 304N/A // See whether the lock is currently biased toward our thread and 304N/A // whether the epoch is still valid 304N/A // Note that the runtime guarantees sufficient alignment of JavaThread 304N/A // pointers to allow age to be placed into low bits 304N/A // First check to see whether biasing is even enabled for this object 304N/A // The bias pattern is present in the object's header. Need to check 304N/A // whether the bias owner and the epoch are both still current. 304N/A // At this point we know that the header has the bias pattern and 304N/A // that we are not the bias owner in the current epoch. We need to 304N/A // figure out more details about the state of the header in order to 304N/A // know what operations can be legally performed on the object's 304N/A // If the low three bits in the xor result aren't clear, that means 304N/A // the prototype header is no longer biased and we have to revoke 304N/A // the bias on this object. 304N/A // Biasing is still enabled for this data type. See whether the 304N/A // epoch of the current bias is still valid, meaning that the epoch 304N/A // bits of the mark word are equal to the epoch bits of the 304N/A // prototype header. (Note that the prototype header's epoch bits 304N/A // only change at a safepoint.) 
If not, attempt to rebias the object 304N/A // toward the current thread. Note that we must be absolutely sure 304N/A // that the current epoch is invalid in order to do this because 304N/A // otherwise the manipulations it performs on the mark word are 304N/A // The epoch of the current bias is still valid but we know nothing 304N/A // about the owner; it might be set or it might be clear. Try to 304N/A // acquire the bias of the object using an atomic operation. If this 304N/A // fails we will go in to the runtime to revoke the object's bias. 304N/A // Note that we first construct the presumed unbiased header so we 304N/A // don't accidentally blow away another thread's valid bias. 304N/A // If the biasing toward our thread failed, this means that 304N/A // another thread succeeded in biasing it toward itself and we 304N/A // need to revoke that bias. The revocation will occur in the 304N/A // interpreter runtime in the slow case. 304N/A // At this point we know the epoch has expired, meaning that the 304N/A // current "bias owner", if any, is actually invalid. Under these 304N/A // circumstances _only_, we are allowed to use the current header's 304N/A // value as the comparison value when doing the cas to acquire the 304N/A // bias in the current epoch. In other words, we allow transfer of 304N/A // the bias from one thread to another directly in this situation. 304N/A // FIXME: due to a lack of registers we currently blow away the age 304N/A // bits in this situation. Should attempt to preserve them. 304N/A // If the biasing toward our thread failed, then another thread 304N/A // succeeded in biasing it toward itself and we need to revoke that 304N/A // bias. The revocation will occur in the runtime in the slow case. 304N/A // The prototype mark in the klass doesn't have the bias bit set any 304N/A // more, indicating that objects of this data type are not supposed 304N/A // to be biased any more. 
We are going to try to reset the mark of 304N/A // this object to the prototype value and fall through to the 304N/A // CAS-based locking scheme. Note that if our CAS fails, it means 304N/A // that another thread raced us for the privilege of revoking the 304N/A // bias of this particular object, so it's okay to continue in the 304N/A // normal locking code. 304N/A // FIXME: due to a lack of registers we currently blow away the age 304N/A // bits in this situation. Should attempt to preserve them. 304N/A // Fall through to the normal CAS-based lock, because no matter what 304N/A // the result of the above CAS, some thread must have succeeded in 304N/A // removing the bias bit from the object's header. 304N/A // Windows always allocates space for it's register args 304N/A // Align stack if necessary 304N/A // restore stack pointer 304N/A // Full implementation of Java ldiv and lrem; checks for special 304N/A // case as described in JVM spec., p.243 & p.271. The function 304N/A // returns the (pc) offset of the idivl instruction - may be needed 304N/A // for implicit exceptions. 304N/A // normal case special case 304N/A // input : rax: dividend min_long 304N/A // output: rax: quotient (= rax idiv reg) min_long 304N/A // rdx: remainder (= rax irem reg) 0 304N/A // check for special case 304N/A // normal and special case exit 304N/A// 32bit can do a case table jump in one instruction but we no longer allow the base 304N/A// to be installed in the Address class 304N/A // %%% is this really better? Why not on 32bit too? 304N/A// src should NEVER be a real pointer. Use AddressLiteral for true pointers 304N/A// These are mostly for initializing NULL 304N/A // we must set sp to zero to clear frame 304N/A // must clear fp, so that compiled frames are not confused; it is 304N/A // possible that we need it only for debugging 304N/A // determine last_java_sp register 304N/A // last_java_fp is optional 304N/A // last_java_pc is optional 304N/A andq(
rsp, -
16);
// align stack as required by push_CPU_state and call 3932N/A andq(
rsp, -
16);
// align stack as required by push_CPU_state and call 304N/A // In order to get locks to work, we need to fake a in_VM state 304N/A // To see where a verify_oop failed, get $ebx+40/X for this frame. 304N/A // XXX correct this offset for amd64 304N/A // This is the value of eip which points to where verify_oop will return. 3932N/A // Print some words near top of staack. 3932N/A // Print some instructions around pc: 304N/A// Now versions that are common to 32/64 bit 3039N/A // Used in sign-masking with aligned address. 3039N/A // Used in sign-masking with aligned address. 304N/A// Writes to stack successive pages until offset reached to check for 304N/A// stack overflow + shadow pages. This clobbers tmp. 304N/A // Bang stack for total size given plus shadow page size. 304N/A // Bang one page at a time because large size can bang beyond yellow and 304N/A // Bang down shadow pages too. 304N/A // The -1 because we already subtracted 1 page. 304N/A // this could be any sized move but this is can be a debugging crumb 304N/A // so the bigger the better. 304N/A // Check for biased locking unlock case, which is a no-op 304N/A // Note: we do not have to check the thread ID for two reasons. 304N/A // First, the interpreter checks for IllegalMonitorStateException at 304N/A // a higher level. Second, if the bias was revoked while we held the 304N/A // lock, the object could not be rebiased toward another thread, so 304N/A // the bias bit would be clear. 304N/A // implements x == 0 ? 
0 : 1 304N/A // note: must only look at least-significant byte of x 304N/A // since C-style booleans are stored in one byte 304N/A// Wouldn't need if AddressLiteral version had new name 304N/A// Implementation of call_VM versions 304N/A // determine java_thread register 304N/A // determine last_java_sp register 2990N/A // TraceBytecodes does not use r12 but saves it over the call, so don't verify 304N/A // push java thread (becomes first argument of C function) 304N/A // set last Java frame before call 304N/A // Only interpreter should have to set fp 304N/A // do the call, remove parameters 304N/A // restore the thread (cannot use the pushed argument since arguments 304N/A // may be overwritten by C code generated by an optimizing compiler); 304N/A // however can use the register value directly if it is callee saved. 304N/A // rdi & rsi (also r15) are callee saved -> nothing to do 3932N/A STOP(
"MacroAssembler::call_VM_base: rdi not callee saved?");
304N/A // reset last Java frame 304N/A // Only interpreter should have to clear fp 304N/A // C++ interp handles this in the interpreter 304N/A // check for pending exceptions (java_thread is set upon return) 304N/A // This used to conditionally jump to forward_exception however it is 304N/A // possible if we relocate that the branch will not reach. So we must jump 304N/A // around so we can always reach 304N/A // get oop result if there is one and reset the value in the thread 304N/A // Calculate the value for last_Java_sp 304N/A // somewhat subtle. call_VM does an intermediate call 304N/A // which places a return address on the stack just under the 304N/A // stack pointer as the user finished with it. This allows 304N/A // us to retrieve last_Java_pc from last_Java_sp[-1]. 304N/A // On 32bit we then have to push additional args on the stack to accomplish 304N/A // the actual requested call. On 64bit call_VM only can use register args 304N/A // so the only extra space is the return address that call_VM created. 304N/A // This hopefully explains the calculations here. 304N/A // We've pushed one address, correct last_Java_sp 304N/A }
else {
// unordered is greater 304N/A }
else {
// unordered is greater 304N/A // moves src2's literal address 304N/A // Full implementation of Java idiv and irem; checks for 304N/A // special case as described in JVM spec., p.243 & p.271. 304N/A // The function returns the (pc) offset of the idivl 304N/A // instruction - may be needed for implicit exceptions. 304N/A // normal case special case 304N/A // input : rax,: dividend min_int 304N/A // reg: divisor (may not be rax,/rdx) -1 304N/A // output: rax,: quotient (= rax, idiv reg) min_int 304N/A // rdx: remainder (= rax, irem reg) 0 304N/A // check for special case 304N/A xorl(
rdx,
rdx);
// prepare rdx for possible special case (where remainder = 0) 304N/A // normal and special case exit 304N/A// !defined(COMPILER2) is because of stupid core builds 304N/A#
endif // !LP64 || C1 || !C2 304N/A// Defines obj, preserves var_size_in_bytes 362N/A // if end < obj then we wrapped around => object too long => slow case 362N/A // Compare obj with the top addr, and if still equal, store the new top addr in 362N/A // end at the address of the top addr pointer. Sets ZF if was equal, and clears 362N/A // it otherwise. Use lock prefix for atomicity on MPs. 3236N/A// A 5 byte nop that is safe for patching (see patch_verified_entry) 0N/A // convert FPU condition into eflags condition via rax, 0N/A // condition codes set as follows: 0N/A // CF (corresponds to C0) if x < y 0N/A // PF (corresponds to C2) if unordered 0N/A // ZF (corresponds to C3) if x = y 0N/A }
else {
// unordered is greater 3752N/A // computes 2^X. Stack: X ... 3752N/A // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and 3752N/A // keep it on the thread's stack to compute 2^int(X) later 3752N/A // then compute 2^(X-int(X)) as (2^(X-int(X)-1+1) 3752N/A // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X)) 3752N/A // computes 2^(int(X)): add exponent bias (1023) to int(X), then 3752N/A // shift int(X)+1023 to exponent position. 3752N/A // Exponent is limited to 11 bits if int(X)+1023 does not fit in 11 3752N/A // bits, set result to NaN. 0x000 and 0x7FF are reserved exponent 3752N/A // values so detect them and set result to NaN. 3752N/A movl(
rcx, -
2048);
// 11 bit mask and valid NaN binary encoding 3752N/A // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN. 3752N/A // Check that 1 < int(X)+1023+1 < 2048 3752N/A // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048 3752N/A // 2- (int(X)+1023+1)&-2048 != 0 3752N/A // 3- (int(X)+1023+1)&-2048 != 1 3752N/A // Do 2- first because addl just updated the flags. 3752N/A // computes X^Y = 2^(Y * log2(X)) 3752N/A // if fast computation is not possible, result is NaN. Requires 3752N/A // fallback from user of this macro. 3808N/A // increase precision for intermediate steps of the computation 3752N/A // computes exp(X) = 2^(X * log2(e)) 3752N/A // if fast computation is not possible, result is NaN. Requires 3752N/A // fallback from user of this macro. 3808N/A // increase precision for intermediate steps of the computation 3752N/A // pow and exp needs 2 extra registers on the fpu stack. 3752N/A // fcmp needs a temporary so preserve rdx, 3752N/A fld_s(0);
// duplicate argument for runtime call. Stack: X X 3752N/A // exp(X) not equal to itself: exp(X) is NaN go to slow case. 3752N/A // get rid of duplicate argument. Stack: exp(X) 3752N/A fld_s(
1);
// duplicate arguments for runtime call. Stack: Y X Y 3752N/A // X^Y not equal to itself: X^Y is NaN go to slow case. 3752N/A // get rid of duplicate arguments. Stack: X^Y 3752N/A // For X^Y, when X < 0, Y has to be an integer and the final 3752N/A // result depends on whether it's odd or even. We just checked 3752N/A // that int(Y) == Y. We move int(Y) to gp registers as a 64 bit 3752N/A // integer to test its parity. If int(Y) is huge and doesn't fit 3752N/A // in the 64 bit integer range, the integer indefinite value will 3752N/A // end up in the gp registers. Huge numbers are all even, the 3752N/A // integer indefinite number is even so it's fine. 3752N/A // Let's check we don't end up with an integer indefinite number 3752N/A // when not expected. First test for huge numbers: check whether 3752N/A // int(Y)+1 == int(Y) which is true for very large numbers and 3752N/A // those are all even. A 64 bit integer is guaranteed to not 3752N/A // overflow for numbers where y+1 != y (when precision is set to 3752N/A // trip to memory to force the precision down from double extended 3752N/A // move int(Y) as 64 bit integer to thread's stack 3752N/A // Y is huge so we know it's even. It may not fit in a 64 bit 3752N/A // integer and we don't want the debug code below to see the 3752N/A // integer indefinite value so overwrite int(Y) on the thread's 3752N/A fld_s(
1);
// duplicate arguments for runtime call. Stack: Y X Y 3752N/A // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case. 3752N/A // Check that int(Y) is not integer indefinite value (int 3752N/A // overflow). Shouldn't happen because for values that would 3752N/A // overflow, 1+int(Y)==Y which was tested earlier. 3932N/A STOP(
"integer indefinite value shouldn't be seen here");
3932N/A STOP(
"integer indefinite value shouldn't be seen here");
3752N/A // get rid of duplicate arguments. Stack: X^Y 3752N/A // X <= 0, Y even: X^Y = -abs(X)^Y 3752N/A // Come here with result in F-TOS 304N/A // Note: fxch & fpop to get rid of ST1 304N/A // (otherwise FPU stack could overflow eventually) 304N/A // 0111 tttn #8-bit disp 304N/A // 0000 1111 1000 tttn #32-bit disp 622N/A// Note: load_signed_short used to be called load_signed_word. 622N/A// Although the 'w' in x86 opcodes refers to the term "word" in the assembler 622N/A// manual, which means 16 bits, that usage is found nowhere in HotSpot code. 622N/A// The term "word" in HotSpot means a 32- or 64-bit machine word. 304N/A // This is dubious to me since it seems safe to do a signed 16 => 64 bit 304N/A // version but this is what 64bit has always done. This seems to imply 304N/A // that users are only using 32bits worth. 304N/A // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16, 304N/A // and "3.9 Partial Register Penalties", p. 22). 622N/A// Note: load_unsigned_short used to be called load_unsigned_word. 304N/A // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16, 304N/A // and "3.9 Partial Register Penalties", p. 22). 0N/A// C++ bool manipulation 0N/A if(
sizeof(
bool) ==
1)
0N/A else if(
sizeof(
bool) ==
2)
0N/A else if(
sizeof(
bool) ==
4)
0N/A if(
sizeof(
bool) ==
1)
0N/A else if(
sizeof(
bool) ==
2)
0N/A else if(
sizeof(
bool) ==
4)
0N/A if(
sizeof(
bool) ==
1)
0N/A else if(
sizeof(
bool) ==
2)
0N/A else if(
sizeof(
bool) ==
4)
304N/A// src should NEVER be a real pointer. Use AddressLiteral for true pointers 304N/A // provoke OS NULL exception if reg = NULL by 304N/A // accessing M[reg] w/o changing any (non-CC) registers 304N/A // NOTE: cmpl is plenty here to provoke a segv 304N/A // Note: should probably use testl(rax, Address(reg, 0)); 304N/A // may be shorter code (however, this version of 304N/A // testl needs to be implemented first) 304N/A // nothing to do, (later) access of M[reg + offset] 304N/A // will provoke OS NULL exception if reg = NULL 304N/A // instead of directly emitting a breakpoint, call os:breakpoint for better debugability 304N/A // (e.g., MSVC can't call ps() otherwise) 304N/A// Save Integer and Float state 304N/A// Warning: Stack must be 16 byte aligned (64bit) 304N/A // Push flags first because pusha kills them 304N/A // Make sure rsp stays 16-byte aligned 304N/A // determine java_thread register 304N/A // we must set sp to zero to clear frame 304N/A// Write serialization page so VM thread can do a pseudo remote membar. 304N/A// We use the current thread pointer to calculate a thread specific 304N/A// offset to write to within the page. This minimizes bus traffic 304N/A// due to cache line collision. 606N/A // Size of store must match masking code above 304N/A// When entering C land, the rbp, & rsp of the last Java frame have to be recorded 304N/A// in the (thread-local) JavaThread object. When leaving C land, the last Java fp 304N/A// has to be reset to 0. This is required to allow proper stack traversal. 304N/A // determine java_thread register 304N/A // determine last_java_sp register 304N/A // last_java_fp is optional 304N/A // last_java_pc is optional 3041N/A // Used in sign-bit flipping with aligned address. 3041N/A // Used in sign-bit flipping with aligned address. 
3041N/A// AVX 3-operands instructions 362N/A////////////////////////////////////////////////////////////////////////////////// 2346N/A // If expand_call is true then we expand the call_VM_leaf macro 2346N/A // directly to skip generating the check by 2346N/A // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp. 2346N/A // Do we need to load the previous value? 2346N/A // Is the previous value null? 362N/A // Can we store original value in the thread's buffer? 2346N/A // (The index field is typed as size_t.) 2346N/A // Record the previous value 362N/A // save the live input values 2346N/A // Calling the runtime using the regular call_VM_leaf mechanism generates 2346N/A // code (generated by InterpreterMacroAssember::call_VM_leaf_base) 2346N/A // that checks that the *(ebp+frame::interpreter_frame_last_sp) == NULL. 2346N/A // If we care generating the pre-barrier without a frame (e.g. in the 2346N/A // intrinsified Reference.get() routine) then ebp might be pointing to 2346N/A // the caller frame and so this check will most likely fail at runtime. 2346N/A // Expanding the call directly bypasses the generation of the check. 2346N/A // So when we do not have have a full interpreter frame on the stack 2346N/A // expand_call should be passed true. 2346N/A // save the live input values 362N/A // Does store cross heap regions? 362N/A // crosses regions, storing NULL? 362N/A // storing region crossing non-NULL, is card already dirty? 362N/A // get the address of the card 362N/A // storing a region crossing, non-NULL oop, card is clean. 362N/A // save the live input values 362N/A////////////////////////////////////////////////////////////////////////////////// 304N/A // Does a store check for the oop in register obj. The content of 304N/A // register obj is destroyed afterwards. 
304N/A// split the store check operation so that other instructions can be scheduled inbetween 304N/A // The calculation for byte_map_base is as follows: 304N/A // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift); 304N/A // So this essentially converts an address to a displacement and 304N/A // it will never need to be relocated. On 64bit however the value may be too 304N/A // large for a 32bit displacement 304N/A // By doing it as an ExternalAddress disp could be converted to a rip-relative 304N/A // displacement and done in a single instruction given favorable mapping and 304N/A // a smarter version of as_Address. Worst case it is two instructions which 304N/A // is no worse off then loading disp into a register and doing as a simple 304N/A // We can't do as ExternalAddress as the only style since if disp == 0 we'll 304N/A // assert since NULL isn't acceptable in a reloci (see 6644928). In any case 304N/A // in some cases we'll get a single instruction version. 3236N/A// Force generation of a 4 byte immediate value even if it fits into 8bit 304N/A// C++ bool manipulation 0N/A if(
sizeof(
bool) ==
1)
0N/A else if(
sizeof(
bool) ==
2) {
0N/A // testw implementation needed for two byte bools 0N/A }
else if(
sizeof(
bool) ==
4)
304N/A// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 304N/A // update the tlab top pointer 304N/A // recover var_size_in_bytes if necessary 304N/A// Preserves rbx, and rdx. 304N/A // No allocation in the shared eden. 304N/A // calculate amount of free space 304N/A // Retain tlab and allocate object in shared space if 304N/A // the amount free in the tlab is too large to discard. 304N/A // %%% yuck as movptr... 304N/A // increment number of slow_allocations 304N/A // increment number of refills 304N/A // accumulate wastage -- t1 is amount free in tlab 304N/A // if tlab is currently allocated (top or end != null) then 304N/A // fill [top, end + alignment_reserve) with array object 304N/A // set up the mark word 304N/A // set the length to the remaining space 304N/A // set klass to intArrayKlass 304N/A // dubious reloc why not an oop reloc? 304N/A // store klass last. concurrent gcs assumes klass length is valid if 304N/A // klass field is not null. 304N/A // refill the tlab with an eden allocation 1988N/A // allocate new tlab, address returned in top 304N/A // Check that t1 was preserved in eden_allocate. 3752N/A // if we are coming from c1, xmm registers may be live 3752N/A // Preserve registers across runtime call 3752N/A // Must preserve all other FPU regs (could alternatively convert 3752N/A // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash 3752N/A // FPU state, but can not trust C compiler) 3752N/A // NOTE that in this case we also push the incoming argument(s) to 3752N/A // the stack and restore it later; we also use this stack slot to 3752N/A // hold the return value from dsin, dcos etc. 
3752N/A // NOTE: we must not use call_VM_leaf here because that requires a 3752N/A // complete interpreter frame in debug mode -- same bug as 4387334 3752N/A // MacroAssembler::call_VM_leaf_base is perfectly safe and will 3752N/A // Need to add stack banging before this runtime call if it needs to 3752N/A // be taken; however, there is no generic stack banging routine at 3752N/A // the MacroAssembler level 3752N/A // Must save return value to stack and then restore entire FPU 3752N/A // stack except incoming arguments 304N/Astatic const double pi_4 =
0.7853981633974483;
304N/A // A hand-coded argument reduction for values in fabs(pi/4, pi/2) 304N/A // was attempted in this code; unfortunately it appears that the 304N/A // switch to 80-bit precision and back causes this to be 304N/A // unprofitable compared with simply performing a runtime call if 304N/A // the argument is out of the (-pi/4, pi/4) range. 304N/A // fcmp needs a temporary so preserve rbx, 520N/A // fastest case: -pi/4 <= x <= pi/4 304N/A // slow case: runtime call 304N/A // Come here with result in F-TOS 623N/A// Look up the method for a megamorphic invokeinterface call. 623N/A// The target method is determined by <intf_klass, itable_index>. 623N/A// The receiver klass is in recv_klass. 623N/A// On success, the result will be in method_result, and execution falls through. 623N/A// On failure, execution transfers to the given label. 623N/A "caller must use same register for non-constant itable index as for method");
623N/A // Compute start of first itableOffsetEntry (which is at the end of the vtable) 623N/A // %%% Could store the aligned, prescaled offset in the klassoop. 623N/A // Round up to align_object_offset boundary 623N/A // see code for instanceKlass::start_of_itable! 623N/A // Adjust recv_klass by scaled itable_index, so we can free itable_index. 623N/A // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 623N/A // if (scan->interface() == intf) { 623N/A // result = (klass + scan->offset() + itable_index); 623N/A // (invert the test to fall through to found_method...) 623N/A // Check that the previous entry is non-null. A null entry means that 623N/A // the receiver class doesn't implement the interface, and wasn't the 623N/A // same as when the caller was compiled. 644N/A // Hacked jcc, which "knows" that L_fallthrough, at least, is in 644N/A // range of a jccb. If this routine grows larger, reconsider at 644N/A // least some of these. 644N/A // Hacked jmp, which may only be used just before L_fallthrough. 644N/A // If the pointers are equal, we are done (e.g., String[] elements). 644N/A // This self-check enables sharing of secondary supertype arrays among 644N/A // non-primary types such as array-of-interface. Otherwise, each such 644N/A // type would need its own customized SSA. 644N/A // We move this check to the front of the fast path because many 644N/A // type checks are in fact trivially successful in this manner, 644N/A // so we get a nicely predicted branch right at the start of the check. 644N/A // Check the supertype display: 644N/A // Positive movl does right thing on LP64. 644N/A // This check has worked decisively for primary supers. 644N/A // Secondary supers are sought in the super_cache ('super_cache_addr'). 644N/A // (Secondary supers are interfaces and very deeply nested subtypes.) 
644N/A // This works in the same check above because of a tricky aliasing 644N/A // between the super_cache and the primary super display elements. 644N/A // (The 'super_check_addr' can address either, as the case requires.) 644N/A // Note that the cache is updated below if it does not help us find 644N/A // what we need immediately. 644N/A // So if it was a primary super, we can just fail immediately. 644N/A // Otherwise, it's the slow path for us (no success at this point). 644N/A // Need a slow path; fast failure is impossible. 644N/A // No slow path; it's a fast decision. 644N/A // a couple of useful fields in sub_klass: 644N/A // Do a linear scan of the secondary super-klass chain. 644N/A // This code is rarely used, so simplicity is a virtue here. 644N/A // The repne_scan instruction uses fixed registers, which we must spill. 644N/A // Don't worry too much about pre-existing connections with the input regs. 644N/A // Get super_klass value into rax (even if it was in rdi or rcx). 644N/A // We will consult the secondary-super array. 644N/A // Load the array length. (Positive movl does right thing on LP64.) 644N/A // Skip to start of data. 644N/A // Scan RCX words at [RDI] for an occurrence of RAX. 644N/A // Set NZ/Z based on last compare. 1604N/A // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does 1604N/A // not change flags (only scas instruction which is repeated sets flags). 1604N/A // Set Z = 0 (not equal) before 'repne' to indicate that class was not found. 644N/A // This part is tricky, as values in supers array could be 32 or 64 bit wide 644N/A // and we store values in objArrays always encoded, thus we need to encode 644N/A // the value of rax before repne. Note that rax is dead after the repne. 644N/A // The superclass is never null; it would be a basic system error if a null 644N/A // pointer were to sneak in here. 
Note that we have already loaded the 644N/A // Klass::super_check_offset from the super_klass in the fast path, 644N/A // so if there is a null in that register, we are already in the afterlife. 644N/A // Unspill the temp. registers: 644N/A // Special hack for the AD files: rdi is guaranteed non-zero. 644N/A // Success. Cache the super we found and proceed in triumph. 0N/A // Pass register number to verify_oop_subroutine 304N/A // avoid using pushptr, as it modifies scratch registers 304N/A // and our contract is not to modify anything 0N/A // call indirectly to solve generation ordering problem 1503N/A // Caller pops the arguments (oop, message) and restores rax, r10 622N/A // load indirectly to solve generation ordering problem 710N/A // cf. TemplateTable::prepare_invoke(), if (load_receiver). 0N/A // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord); 0N/A // Pass register number to verify_oop_subroutine 0N/A // addr may contain rsp so we will have to adjust it based on the push 2311N/A // we just did (and on 64 bit we do two pushes) 304N/A // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which 304N/A // stores rax into addr which is backwards of what was intended. 0N/A // pass msg argument 304N/A // avoid using pushptr, as it modifies scratch registers 304N/A // and our contract is not to modify anything 0N/A // call indirectly to solve generation ordering problem 1503N/A // Caller pops the arguments (addr, message) and restores rax, r10. 0N/A case 0:
rc =
"round near";
break;
0N/A case 1:
rc =
"round down";
break;
0N/A case 2:
rc =
"round up ";
break;
0N/A case 3:
rc =
"chop ";
break;
0N/A // precision control 0N/A case 0:
pc =
"24 bits ";
break;
0N/A case 1:
pc =
"reserved";
break;
0N/A case 2:
pc =
"53 bits ";
break;
0N/A case 3:
pc =
"64 bits ";
break;
0N/A c[0] = (
C3()) ?
'3' :
'-';
0N/A c[
1] = (
C2()) ?
'2' :
'-';
0N/A c[
2] = (
C1()) ?
'1' :
'-';
0N/A c[
3] = (
C0()) ?
'0' :
'-';
0N/A case 0:
return "valid";
0N/A case 1:
return "zero";
0N/A case 2:
return "special";
0N/A case 3:
return "empty";
0N/A // print computation registers 0N/A printf(
"%c r%d = ST%d = ", (j == 0 ?
'*' :
' '), i, j);
0N/A // print control registers 0N/A // computation registers 0N/A // control registers 0N/A printf(
"--------------------------------------------------\n");
0N/A printf(
"--------------------------------------------------\n");
0N/A // For leaf calls, only verify that the top few elements remain empty. 0N/A // We only need 1 empty at the top for C2 code. 0N/A return true;
// All other stack states do not matter 0N/A "bad FPU control word");
0N/A // compute stack depth 0N/A // stack not contiguous 0N/A printf(
"%s: stack not contiguous at ST%d\n", s, i);
0N/A // check if computed stack depth corresponds to expected stack depth 0N/A // expected stack depth is -stack_depth or less 0N/A // too many elements on the stack 0N/A // expected stack depth is stack_depth 0N/A // wrong stack depth 0N/A // everything is cool 0N/A // pass message string s 0N/A int3();
// break if error condition 1491N/A // OK to use shift since we don't need to preserve flags. 2311N/A// Doesn't do verification, generates fixed size code 304N/A // Store to klass gap in destination 3932N/A STOP(
"null oop passed to encode_heap_oop_not_null");
3932N/A STOP(
"null oop passed to encode_heap_oop_not_null2");
1491N/A // Note: it will change flags 304N/A // Cannot assert, unverified entry point counts instructions (see .ad file) 304N/A // vtableStubs also counts instructions in pd_code_size_limit. 304N/A // Also do not verify_oop as this is called by verify_oop. 1491N/A // Note: it will change flags 304N/A // Cannot assert, unverified entry point counts instructions (see .ad file) 304N/A // vtableStubs also counts instructions in pd_code_size_limit. 304N/A // Also do not verify_oop as this is called by verify_oop. 3236N/A// C2 compiled method's prolog code. 3236N/A // WARNING: Initial instruction MUST be 5 bytes or longer so that 3236N/A // NativeJump::patch_verified_entry will be able to patch out the entry 3236N/A // code safely. The push to verify stack depth is ok at 5 bytes, 3236N/A // the frame allocation can be either 3 or 6 bytes. So if we don't do 3236N/A // stack bang then we must use the 6 byte frame allocation even if 3236N/A // Remove word for return addr 3236N/A // Calls to C2R adapters often do not accept exceptional returns. 3236N/A // We require that their callers must bang for them. But be careful, because 3236N/A // some VM calls (such as call site linkage) can use several kilobytes of 3236N/A // stack. But the stack safety zone should account for that. 3236N/A // See bugs 4446381, 4468289, 4497237. 3236N/A // We always push rbp, so that on return to interpreter rbp, will be 3236N/A // restored correctly and we can correct the stack. 3236N/A // Create frame (force generation of a 4 byte immediate value) 3236N/A // If method sets FPU control word do it now 2167N/A// IndexOf for constant substrings with size >= 8 chars 2167N/A// which don't need to be loaded through stack. 
2167N/A // This method uses pcmpestri inxtruction with bound registers 986N/A // rax - substring length (elements count) 2167N/A // rdx - string length (elements count) 2167N/A // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2167N/A // rcx - matched index in string 2167N/A // Note, inline_string_indexOf() generates checks: 2167N/A // Reload substr for rescan, this code 2167N/A // is executed only for large substrings (> 8 chars) 2167N/A // We came here after the beginning of the substring was 2167N/A // matched but the rest of it was not so we need to search 2167N/A // again. Start from the next element after the previous match. 2167N/A // cnt2 is number of substring reminding elements and 2167N/A // cnt1 is number of string reminding elements when cmp failed. 2167N/A // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2167N/A // Scan string for start of substr in 16-byte vectors 2167N/A // Found a potential substr 2167N/A // Matched whole vector if first element matched (tmp(rcx) == 0). 2167N/A // After pcmpestri tmp(rcx) contains matched element index 2167N/A // Compute start addr of substr 2167N/A // Make sure string is still long enough 2167N/A // Left less then substring. 2167N/A // This code is optimized for the case when whole substring 2167N/A // is matched if its head is matched. 2167N/A // Reload only string if does not match 2167N/A // Compare the rest of substring (> 8 chars). 2167N/A // First 8 chars are already matched. 2167N/A // Back-up strings to avoid reading beyond substring: 2167N/A // calculate index in register to avoid integer overflow (int_cnt2*2) 2167N/A // Need to reload strings pointers if not matched whole vector 2167N/A // Fall through if found full substring 2167N/A // Found result if we matched full small substring. 2167N/A// Small strings are loaded through stack if they cross page boundary. 
2167N/A // int_cnt2 is length of small (< 8 chars) constant substring 2167N/A // or (-1) for non constant substring in which case its length 2167N/A // Note, inline_string_indexOf() generates checks: 2167N/A // This method uses pcmpestri instruction with bound registers 2167N/A // rax - substring length (elements count) 986N/A // rdx - string length (elements count) 986N/A // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 986N/A // rcx - matched index in string 2167N/A {
//======================================================== 2167N/A // We don't know where these strings are located 2167N/A // and we can't read beyond them. Load them through stack. 2167N/A }
else {
// cnt2 = { 3, 5, 6, 7 } 2167N/A // Array header size is 12 bytes in 32-bit VM 2167N/A // + 6 bytes for 3 chars == 18 bytes, 2167N/A // enough space to load vec and shift. 2167N/A }
else {
// not constant substring 2167N/A // We can read beyond string if str+16 does not cross page boundary 2167N/A // since heaps are aligned and mapped by pages. 2167N/A // Move small strings to stack to allow load 16 bytes into vec. 2167N/A // Check cross page boundary. 2167N/A // Small (< 8 chars) constant substrings are loaded already. 2167N/A //======================================================== 2167N/A // String saved at sp+1*wordSize 2167N/A // Substr saved at sp+2*wordSize 2167N/A // Substr count saved at sp+3*wordSize 2167N/A // Reload substr for rescan, this code 2167N/A // is executed only for large substrings (> 8 chars) 2167N/A // We came here after the beginning of the substring was 2167N/A // matched but the rest of it was not so we need to search 2167N/A // again. Start from the next element after the previous match. 2167N/A // Scan string for start of substr in 16-byte vectors 2167N/A // Back-up string to avoid reading beyond string. 2167N/A // Found a potential substr 2167N/A // After pcmpestri tmp(rcx) contains matched element index 986N/A // Make sure string is still long enough 2167N/A // Left less than substring. 2167N/A // Compute start addr of substr 2167N/A // Repeat search for small substring (< 8 chars) 2167N/A // from new point without reloading substring. 2167N/A // Have to check that we don't read beyond string. 2167N/A // Fall through if matched whole substring. 2167N/A // Found result if we matched whole substring. 2167N/A // Repeat search for small substring (<= 8 chars) 2167N/A // from new point 'str1' without reloading substring. 2167N/A // Have to check that we don't read beyond string. 2167N/A // Compare the rest of substring (> 8 chars). 2167N/A // First 8 chars are already matched. 2167N/A // Need to reload strings pointers if not matched whole vector 2167N/A // Back-up strings to avoid reading beyond substring. 986N/A // Compute the minimum of the string lengths and the 986N/A // difference of the string lengths (stack). 
986N/A // Do the conditional move stuff 986N/A // Is the minimum length zero? 986N/A // Load first characters 986N/A // Compare first characters 986N/A // Check after comparing first character to see if strings are equivalent 986N/A // Check if the strings start at same location 986N/A // Check if the length difference is zero (from stack) 986N/A // Strings might not be equivalent 986N/A // Setup to compare 16-byte vectors 2134N/A // rax - negative string length (elements count) 2134N/A // rdx - string length (elements count) 2134N/A // pcmpmask - cmp mode: 11000 (string compare with negated result) 2134N/A // + 00 (unsigned bytes) or + 01 (unsigned shorts) 2134N/A // rcx - first mismatched element index 2134N/A // After pcmpestri cnt1(rcx) contains mismatched element index 2134N/A // compare wide vectors tail 986N/A // Mismatched characters in the vectors 986N/A // Fallthru to tail compare 986N/A // Shift str2 and str1 to the end of the arrays, negate min 2134N/A // Compare the rest of the elements 986N/A // Strings are equal up to min length. Return the length difference. 986N/A // Discard the stored length difference 986N/A// Compare char[] arrays aligned to 4 bytes or substrings. 986N/A // Check the input args 986N/A // Need additional checks for arrays_equals. 986N/A // With SSE4.2, use double quad vector compare 986N/A // Compare 16-byte vectors 986N/A // Fallthru to tail compare 986N/A // Compare 4-byte vectors 986N/A // Compare trailing char (final 2 bytes), if any 1683N/A // align source address at 4 bytes address boundary 1683N/A // One byte misalignment happens only for byte arrays 1683N/A // Two bytes misalignment happens only for byte and short (char) arrays 1683N/A for (
int i = 0; i <
32; i +=
4) {
1683N/A // length is too short, just fill qwords 1683N/A // fall through to fill 4 bytes 1683N/A // align to 8 bytes, we know we are 4 byte aligned to start 1683N/A // length is too short, just fill qwords 0N/A // Note some conditions are synonyms for others