assembler_x86.cpp revision 2578
2N/A * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved. 2N/A * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 2N/A * This code is free software; you can redistribute it and/or modify it 2N/A * under the terms of the GNU General Public License version 2 only, as 2N/A * published by the Free Software Foundation. 2N/A * This code is distributed in the hope that it will be useful, but WITHOUT 2N/A * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 2N/A * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 2N/A * version 2 for more details (a copy is included in the LICENSE file that 2N/A * accompanied this code). 2N/A * You should have received a copy of the GNU General Public License version 2N/A * 2 along with this work; if not, write to the Free Software Foundation, 2N/A * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 2N/A * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 4194N/A// Implementation of AddressLiteral // Oops are a special case. Normally they would be their own section // but in cases like icBuffer they are literals in the code stream that // we don't have a section for. We use none so that we get a literal address // which is always patchable. // Implementation of Address // Not implementable on 64bit machines // Should have been handled higher up the call chain. // exceedingly dangerous constructor // exceedingly dangerous constructor // Convert the raw encoding form into the form expected by the constructor for // Address. An index of 4 (rsp) corresponds to having no index, so convert // that to noreg for the Address constructor. // Implementation of Assembler // make this go away someday // Do not use AbstractAssembler::relocate, which is not intended for // embedded words. Instead, relocate to the enclosing instruction. // hack. call32 is too wide for mask so use disp32 assert((
op1 &
0x01) == 0,
"should be 8bit operation");
assert((
op1 &
0x01) ==
1,
"should be 32bit operation");
assert((
op1 &
0x02) == 0,
"sign-extension bit should not be set");
// immediate-to-memory forms assert((
op1 &
0x01) ==
1,
"should be 32bit operation");
assert((
op1 &
0x02) == 0,
"sign-extension bit should not be set");
assert((
op1 &
0x01) ==
1,
"should be 32bit operation");
assert((
op1 &
0x02) == 0,
"sign-extension bit should not be set");
// Encode the registers as needed in the fields they are used in // [base + index*scale + disp] // [00 reg 100][ss index base] // [base + index*scale + imm8] // [01 reg 100][ss index base] imm8 // [base + index*scale + disp32] // [10 reg 100][ss index base] disp32 // [00 reg 100][00 100 100] // [01 reg 100][00 100 100] disp8 // [10 reg 100][00 100 100] disp32 // [00 reg 100][ss index 101] disp32 // [disp] (64bit) RIP-RELATIVE (32bit) abs // Note that the RIP-rel. correction applies to the generated // disp field, but _not_ to the target address in the rspec. // disp was created by converting the target address minus the pc // at the start of the instruction. That needs more correction here. // intptr_t disp = target - next_ip; // Do rip-rel adjustment for 64bit "must be 32bit offset (RIP relative address)");
// 32bit never did this, did everything as the rip-rel/disp code above // [00 reg 100][00 100 101] disp32 // Secret local extension to Assembler::WhichOperand: // Decode the given instruction, and return the address of // an embedded 32-bit operand word. // If "which" is disp32_operand, selects the displacement portion // of an effective address specifier. // If "which" is imm64_operand, selects the trailing immediate constant. // If "which" is call32_operand, selects the displacement of a call or jump. // Caller is responsible for ensuring that there is such an operand, // and that it is 32/64 bits wide. // If "which" is end_pc_operand, find the end of the instruction. int tail_size = 0;
// other random bytes (#32, #16, etc.) at end of insn // These convenience macros generate groups of "case" labels for the switch. #
define REP4(x) (x)+0:
case (x)+
1:
case (x)+
2:
case (x)+
3#
define REP8(x) (x)+0:
case (x)+
1:
case (x)+
2:
case (x)+
3: \
case (x)+
4:
case (x)+
5:
case (x)+
6:
case (x)+
7 case 0xFF:
// pushq a; decl a; incl a; call a; jmp a return ip;
// not produced by emit_operand case 0x66:
// movw ... (size prefix) case 0xC7:
// movw a, #16 case 0x0F:
// several SSE/SSE2 variants ip--;
// reparse the 0x0F case REP8(
0xB8):
// movl/q r, #32/#64(oop?) // these asserts are somewhat nonsensical case 0x69:
// imul r, a, #32 case 0xC7:
// movl a, #32(oop?) case 0x0F:
// movx..., etc. // 64bit side says it these have both operands but that doesn't case 0xAD:
// shrd r, a, %cl case 0xBE:
// movsbl r, a (movsxb) case 0xBF:
// movswl r, a (movsxw) case 0xB6:
// movzbl r, a (movzxb) case 0xB7:
// movzwl r, a (movzxw) case REP16(
0x40):
// cmovl cc, r, a case REP16(
0x90):
// setcc a // fall out of the switch to decode the address case 0xAC:
// shrd r, a, #8 case REP16(
0x80):
// jcc rdisp32 case 0x81:
// addl a, #32; addl r, #32 // also: orl, adcl, sbbl, andl, subl, xorl, cmpl // on 32bit in the case of cmpl, the imm might be an oop case 0x83:
// addl a, #8; addl r, #8 // also: orl, adcl, sbbl, andl, subl, xorl, cmpl case REP4(
0x00):
// addb a, r; addl a, r; addb r, a; addl r, a case REP4(
0x10):
// adc... case REP4(
0x20):
// and... case REP4(
0x30):
// xor... case REP4(
0x08):
// or... case REP4(
0x18):
// sbb... case REP4(
0x28):
// sub... case REP4(
0x38):
// cmp... case 0xC1:
// sal a, #8; sar a, #8; shl a, #8; shr a, #8 case 0x6B:
// imul r, a, #8 case 0xE8:
// call rdisp32 case 0xE9:
// jmp rdisp32 case 0xD1:
// sal a, 1; sar a, 1; shl a, 1; shr a, 1 case 0xD3:
// sal a, %cl; sar a, %cl; shl a, %cl; shr a, %cl case 0xD9:
// fld_s a; fst_s a; fstp_s a; fldcw a case 0xDD:
// fld_d a; fst_d a; fstp_d a case 0xDB:
// fild_s a; fistp_s a; fld_x a; fstp_x a case 0xDF:
// fild_d a; fistp_d a case 0xD8:
// fadd_s a; fsubr_s a; fmul_s a; fdivr_s a; fcomp_s a case 0xDC:
// fadd_d a; fsubr_d a; fmul_d a; fdivr_d a; fcomp_d a case 0xDE:
// faddp_d a; fsubrp_d a; fmulp_d a; fdivrp_d a; fcompp_d a // assert(which != imm_operand || has_imm32, "instruction has no imm32 field"); // parse the output of emit_operand base =
op3 &
0x07;
// refetch the base // now ip points at the disp (if any) // [00 reg 100][ss index base] // [00 reg 100][00 100 esp] // [00 reg 100][ss index 101][disp32] return ip;
// caller wants the disp32 ip +=
4;
// skip the disp32 // [01 reg 100][ss index base][disp8] // [01 reg 100][00 100 esp][disp8] ip +=
1;
// skip the disp8 // [10 reg 100][ss index base][disp32] // [10 reg 100][00 100 esp][disp32] // [10 reg base] [disp32] return ip;
// caller wants the disp32 ip +=
4;
// skip the disp32 // [11 reg base] (not a memory addressing mode) // Secretly share code with locate_operand: // assert(format == imm32_operand, "cannot specify a nonzero format"); assert(
opnd ==
pc(),
"must put operand where relocs can find it");
// work around gcc (3.2.1-7a) bug assert(0 <= i && i <
8,
"illegal stack offset");
// Now the Assembler instructions (identical for 32/64 bits) // 4 bytes: NOP DWORD PTR [EAX+0] emit_byte(
0x40);
// emit_rm(cbuf, 0x1, EAX_enc, EAX_enc); // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset emit_byte(
0x44);
// emit_rm(cbuf, 0x1, EAX_enc, 0x4); emit_byte(
0x00);
// emit_rm(cbuf, 0x0, EAX_enc, EAX_enc); // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset emit_byte(
0x80);
// emit_rm(cbuf, 0x2, EAX_enc, EAX_enc); // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset emit_byte(
0x84);
// emit_rm(cbuf, 0x2, EAX_enc, 0x4); emit_byte(
0x00);
// emit_rm(cbuf, 0x0, EAX_enc, EAX_enc); // suspect disp32 is always good // 1110 1000 #32-bit disp // 1110 1000 #32-bit disp // This was originally using a 32bit register encoding // and surely we want 64bit! // this is a 32bit encoding but in 64bit mode the default // operand size is 64bit so there is no need for the // wide prefix. So prefix only happens if we use the // new registers. Much like push/pop. // this may be true but dbx disassembles it as if it // int encode = prefix_and_encode(dst->encoding()); // if (offset() != x) assert(dst->encoding() >= 8, "what?"); // Technically, should use call32_operand, but this format is // implied by the fact that we're emitting a call instruction. // The 32-bit cmpxchg compares the value at adr with the contents of rax, // and stores reg into adr if so; otherwise, the value at adr is loaded into rax,. // The ZF is set if the compared values were equal, and cleared otherwise. // caveat: no instructionmark, so this isn't relocatable. // Emit a synthetic, non-atomic, CAS equivalent. // Beware. The synthetic form sets all ICCs, not just ZF. // cmpxchg r,[m] is equivalent to rax, = CAS (m, rax, r) // NOTE: dbx seems to decode this as comiss even though the // 0x66 is there. Strangely ucomisd comes out correct // Don't use it directly. Use MacroAssembler::decrement() instead. // Don't use it directly. Use MacroAssembler::increment() instead. // 0000 1111 1000 tttn #32-bit disp "must be 32bit offset (call4)");
// Note: could eliminate cond. jumps to this jump if condition // is the same however, seems to be rather unlikely case. // Note: use jccb() if label to be bound is very close to get "Dispacement too large for a short jmp");
// By default, forward jumps are always 32-bit displacements, since // we can't yet know where the label will be bound. If you're sure that // the forward jump will not run beyond 256 bytes, use jmpb to // force an 8-bit displacement. "Dispacement too large for a short jmp");
// Emit either nothing, a NOP, or a NOP: prefix // Emit mfence instruction // swap src/dst to get correct prefix // Uses zero extension on 64bit // New cpus require to use movsd and movss to avoid partial register stall // when loading from memory. But for old Opteron use movlpd instead of movsd. // The selection is done in MacroAssembler::movdbl() and movflt(). // workaround gcc (3.2.1-7a) bug // In that version of gcc with only an emit_operand(MMX, Address) // gcc will tail jump and try and reverse the parameters completely // obliterating dst in the process. By having a version available // that doesn't need to swap the args at the tail jump the bug is // The fancy nops aren't currently recognized by debuggers making it a // pain to disassemble code while debugging. If asserts are on clearly // speed is not an issue so simply use the single byte traditional nop // Using multi-bytes nops "0x0F 0x1F [address]" for Intel // 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding) // 4: 0x0F 0x1F 0x40 0x00 // 5: 0x0F 0x1F 0x44 0x00 0x00 // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00 // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 // The rest coding is Intel specific - don't use consecutive address nops // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90 // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90 // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90 // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90 // For Intel don't generate consecutive addess nops (mix with regular nops) // Don't use "0x0F 0x1F 0x00" - need patching safe padding // Using multi-bytes nops "0x0F 0x1F [address]" for AMD. 
// 3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding) // 4: 0x0F 0x1F 0x40 0x00 // 5: 0x0F 0x1F 0x44 0x00 0x00 // 6: 0x66 0x0F 0x1F 0x44 0x00 0x00 // 7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 // 8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 // 9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 // The rest coding is AMD specific - use consecutive address nops // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00 // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00 // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 // Size prefixes (0x66) are added for larger sizes // Generate first nop for size between 21-12 // Generate second nop for size between 11-1 // Don't use "0x0F 0x1F 0x00" - need patching safe padding // Using nops with size prefixes "0x66 0x90". // From AMD Optimization Guide: // 4: 0x66 0x66 0x66 0x90 // 5: 0x66 0x66 0x90 0x66 0x90 // 6: 0x66 0x66 0x90 0x66 0x66 0x90 // 7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 // 8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90 // 9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90 // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90 // NOTE: this will adjust stack by 8byte on 64bits // Shift 64 bit value logically right by specified number of bits. // HMM Table D-1 says sse2 or mmx. // Do not confuse it with psrldq SSE2 instruction which // shifts 128 bit value in xmm register by number of bytes. // Shift 128 bit value in xmm register by number of bytes. 
// in 64bits we push 64bits onto the stack but only // take a 32bit immediate // Note this will push 64bit on 64bit // copies data from [esi] to [edi] using rcx pointer sized words // sets rcx pointer sized words with rax, value at [edi] // scans rcx pointer sized words at [edi] for occurrence of rax, // scans rcx 4 byte words at [edi] for occurrence of rax, // Not supported in 64bit mode // copies a single word from [esi] to [edi] // HMM Table D-1 says sse2 // NOT_LP64(assert(VM_Version::supports_sse(), "")); // HMM Table D-1 says sse2 // NOT_LP64(assert(VM_Version::supports_sse(), "")); // not using emit_arith because test // doesn't support sign-extension of // 32bit only pieces of the assembler // NO PREFIX AS NEVER 64BIT // NO PREFIX AS NEVER 64BIT (not even 32bit versions of 64bit regs // The 64-bit (32bit platform) cmpxchg compares the value at adr with the contents of rdx:rax, // and stores rcx:rbx into adr if so; otherwise, the value at adr is loaded // into rdx:rax. The ZF is set if the compared values were equal, and cleared otherwise. // Don't use it directly. Use MacroAssembler::decrementl() instead. // 64bit typically doesn't use the x87 but needs to for the trig funcs // Note: The Intel manual (Pentium Processor User's Manual, Vol.3, 1994) // is erroneous for some of the floating-point instructions below. emit_farith(
0xDE,
0xF8, i);
// ST(0) <- ST(0) / ST(1) and pop (Intel manual wrong) emit_farith(
0xDE,
0xF0, i);
// ST(0) <- ST(1) / ST(0) and pop (Intel manual wrong) emit_farith(
0xDE,
0xE8, i);
// ST(0) <- ST(0) - ST(1) and pop (Intel manual wrong) emit_farith(
0xDE,
0xE0, i);
// ST(0) <- ST(1) - ST(0) and pop (Intel manual wrong) // make sure the instruction is supported (introduced for P6, together with cmov) // make sure the instruction is supported (introduced for P6, together with cmov) // Don't use it directly. Use MacroAssembler::incrementl() instead. // 64bit only pieces of the assembler // This should only be used by 64bit instructions that can use rip-relative // it cannot be used by instructions that want an immediate value. // None will force a 64bit literal to the code stream. Likely a placeholder // for something that will be patched later and we need to certain it will // This should be rip relative and easily reachable. // This should be rip relative within the code cache and easily // reachable until we get huge code caches. (At which point // ic code is going to have issues). // Stress the correction code // Must be runtimecall reloc, see if it is in the codecache // Flipping stuff in the codecache to be unreachable causes issues // with things like inline caches where the additional instructions // are now (possibly a temp buffer) and where we might end up // anywhere in the codeCache then we are always reachable. // This would have to change if we ever save/restore shared code // to be more pessimistic. // Because rip relative is a disp + address_of_next_instruction and we // don't know the value of address_of_next_instruction we apply a fudge factor // to make sure we will be ok no matter the size of the instruction we get placed into. // We don't have to fudge the checks above here because they are already worst case. // 12 == override/rex byte, opcode byte, rm byte, sib byte, a 4-byte disp , 4-byte literal // + 4 because better safe than sorry. const int fudge =
12 +
4;
// Check if the polling page is not reachable from the code cache using rip-relative // Do not use AbstractAssembler::relocate, which is not intended for // embedded words. Instead, relocate to the enclosing instruction. // Don't use it directly. Use MacroAssembler::decrementl() instead. // Use two-byte form (one-byte form is a REX prefix in 64-bit mode) // Don't use it directly. Use MacroAssembler::decrementq() instead. // Use two-byte form (one-byte from is a REX prefix in 64-bit mode) // Don't use it directly. Use MacroAssembler::decrementq() instead. // Don't use it directly. Use MacroAssembler::incrementl() instead. // Use two-byte form (one-byte from is a REX prefix in 64-bit mode) // Don't use it directly. Use MacroAssembler::incrementq() instead. // Use two-byte form (one-byte from is a REX prefix in 64-bit mode) // Don't use it directly. Use MacroAssembler::incrementq() instead. // swap src/dst to get correct prefix // dbx shows movslq(rcx, 3) as movq $0x0000000049000000,(%rbx) // and movslq(r8, 3); as movl $0x0000000048000000,(%rbx) // as a result we shouldn't use until tested at runtime... // we have to store original rsp. ABI says that 128 bytes // below rsp are local scratch. // not using emit_arith because test // doesn't support sign-extension of // Implementation of MacroAssembler // First all the versions that have distinct versions depending on 32/64 bit // Unless the difference is trivial (1 line or so). // See whether the lock is currently biased toward our thread and // whether the epoch is still valid // Note that the runtime guarantees sufficient alignment of JavaThread // pointers to allow age to be placed into low bits // First check to see whether biasing is even enabled for this object // The bias pattern is present in the object's header. Need to check // whether the bias owner and the epoch are both still current. 
// Note that because there is no current thread register on x86 we // need to store off the mark word we read out of the object to // avoid reloading it and needing to recheck invariants below. This // store is unfortunate but it makes the overall code shorter and // At this point we know that the header has the bias pattern and // that we are not the bias owner in the current epoch. We need to // figure out more details about the state of the header in order to // know what operations can be legally performed on the object's // If the low three bits in the xor result aren't clear, that means // the prototype header is no longer biased and we have to revoke // the bias on this object. // Biasing is still enabled for this data type. See whether the // epoch of the current bias is still valid, meaning that the epoch // bits of the mark word are equal to the epoch bits of the // prototype header. (Note that the prototype header's epoch bits // only change at a safepoint.) If not, attempt to rebias the object // toward the current thread. Note that we must be absolutely sure // that the current epoch is invalid in order to do this because // otherwise the manipulations it performs on the mark word are // The epoch of the current bias is still valid but we know nothing // about the owner; it might be set or it might be clear. Try to // acquire the bias of the object using an atomic operation. If this // fails we will go in to the runtime to revoke the object's bias. // Note that we first construct the presumed unbiased header so we // don't accidentally blow away another thread's valid bias. // If the biasing toward our thread failed, this means that // another thread succeeded in biasing it toward itself and we // need to revoke that bias. The revocation will occur in the // interpreter runtime in the slow case. // At this point we know the epoch has expired, meaning that the // current "bias owner", if any, is actually invalid. 
Under these // circumstances _only_, we are allowed to use the current header's // value as the comparison value when doing the cas to acquire the // bias in the current epoch. In other words, we allow transfer of // the bias from one thread to another directly in this situation. // FIXME: due to a lack of registers we currently blow away the age // bits in this situation. Should attempt to preserve them. // If the biasing toward our thread failed, then another thread // succeeded in biasing it toward itself and we need to revoke that // bias. The revocation will occur in the runtime in the slow case. // The prototype mark in the klass doesn't have the bias bit set any // more, indicating that objects of this data type are not supposed // to be biased any more. We are going to try to reset the mark of // this object to the prototype value and fall through to the // CAS-based locking scheme. Note that if our CAS fails, it means // that another thread raced us for the privilege of revoking the // bias of this particular object, so it's okay to continue in the // FIXME: due to a lack of registers we currently blow away the age // bits in this situation. Should attempt to preserve them. // Fall through to the normal CAS-based lock, because no matter what // the result of the above CAS, some thread must have succeeded in // removing the bias bit from the object's header. // According to Intel Doc. AP-526, "Integer Divide", p.18. // A 5 byte nop that is safe for patching (see patch_verified_entry) // set parity bit if FPU flag C2 is set (via rax) // set parity bit if FPU flag C2 is set (via rax) // 32bit can do a case table jump in one instruction but we no longer allow the base // to be installed in the Address class // Note: y_lo will be destroyed // Long compare for Java (semantics as described in JVM spec.) 
// x_hi is the return register // leal(dst, as_Address(adr)); // see note in movl as to why we must use a move // Multiplication of two Java long values stored on the stack // as illustrated below. Result is in rdx:rax. // [ y_lo ] / (in bytes) | x_rsp_offset // Basic idea: lo(result) = lo(x_lo * y_lo) // hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi) // load x_hi, y_hi and check if quick // multiplication is possible orl(
rbx,
rcx);
// rbx, = 0 <=> x_hi = 0 and y_hi = 0 // do full multiplication bind(
quick);
// note: rbx, = 0 if quick multiply! // Java shift left long support (semantics as described in JVM spec., p.305) // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n)) // shift value is in rcx ! andl(s,
0x3f);
// s := s & 0x3f (s < 0x40) cmpl(s, n);
// if (s < n) // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n! bind(L);
// s (mod n) < n // Java shift right long support (semantics as described in JVM spec., p.306 & p.310) // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n)) andl(s,
0x3f);
// s := s & 0x3f (s < 0x40) cmpl(s, n);
// if (s < n) // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n! bind(L);
// s (mod n) < n // src should NEVER be a real pointer. Use AddressLiteral for true pointers // In order to get locks to work, we need to fake a in_VM state // To see where a verify_oop failed, get $ebx+40/X for this frame. // This is the value of eip which points to where verify_oop will return. assert(
false,
"start up GDB");
::
tty->
print_cr(
"=============== DEBUG MESSAGE: %s ================\n",
msg);
// push address of message pusha();
// push registers // push address of message // amd64 always does this as a pc-rel // we can be absolute or disp based on the instruction type // jmp/call are displacements others are absolute // See whether the lock is currently biased toward our thread and // whether the epoch is still valid // Note that the runtime guarantees sufficient alignment of JavaThread // pointers to allow age to be placed into low bits // First check to see whether biasing is even enabled for this object // The bias pattern is present in the object's header. Need to check // whether the bias owner and the epoch are both still current. // At this point we know that the header has the bias pattern and // that we are not the bias owner in the current epoch. We need to // figure out more details about the state of the header in order to // know what operations can be legally performed on the object's // If the low three bits in the xor result aren't clear, that means // the prototype header is no longer biased and we have to revoke // the bias on this object. // Biasing is still enabled for this data type. See whether the // epoch of the current bias is still valid, meaning that the epoch // bits of the mark word are equal to the epoch bits of the // prototype header. (Note that the prototype header's epoch bits // only change at a safepoint.) If not, attempt to rebias the object // toward the current thread. Note that we must be absolutely sure // that the current epoch is invalid in order to do this because // otherwise the manipulations it performs on the mark word are // The epoch of the current bias is still valid but we know nothing // about the owner; it might be set or it might be clear. Try to // acquire the bias of the object using an atomic operation. If this // fails we will go in to the runtime to revoke the object's bias. // Note that we first construct the presumed unbiased header so we // don't accidentally blow away another thread's valid bias. 
// If the biasing toward our thread failed, this means that // another thread succeeded in biasing it toward itself and we // need to revoke that bias. The revocation will occur in the // interpreter runtime in the slow case. // At this point we know the epoch has expired, meaning that the // current "bias owner", if any, is actually invalid. Under these // circumstances _only_, we are allowed to use the current header's // value as the comparison value when doing the cas to acquire the // bias in the current epoch. In other words, we allow transfer of // the bias from one thread to another directly in this situation. // FIXME: due to a lack of registers we currently blow away the age // bits in this situation. Should attempt to preserve them. // If the biasing toward our thread failed, then another thread // succeeded in biasing it toward itself and we need to revoke that // bias. The revocation will occur in the runtime in the slow case. // The prototype mark in the klass doesn't have the bias bit set any // more, indicating that objects of this data type are not supposed // to be biased any more. We are going to try to reset the mark of // this object to the prototype value and fall through to the // CAS-based locking scheme. Note that if our CAS fails, it means // that another thread raced us for the privilege of revoking the // bias of this particular object, so it's okay to continue in the // FIXME: due to a lack of registers we currently blow away the age // bits in this situation. Should attempt to preserve them. // Fall through to the normal CAS-based lock, because no matter what // the result of the above CAS, some thread must have succeeded in // removing the bias bit from the object's header. // Windows always allocates space for it's register args // Align stack if necessary // Full implementation of Java ldiv and lrem; checks for special // case as described in JVM spec., p.243 & p.271. 
The function // returns the (pc) offset of the idivl instruction - may be needed // for implicit exceptions. // normal case special case // input : rax: dividend min_long // reg: divisor (may not be eax/edx) -1 // output: rax: quotient (= rax idiv reg) min_long // rdx: remainder (= rax irem reg) 0 // check for special case xorl(
rdx,
rdx);
// prepare rdx for possible special case (where // normal and special case exit if (
value == 0) { ;
return; }
if (
value == 0) { ;
return; }
// A 5 byte nop that is safe for patching (see patch_verified_entry) // Recommended sequence from 'Software Optimization Guide for the AMD if (
value == 0) { ;
return; }
if (
value == 0) { ;
return; }
// 32bit can do a case table jump in one instruction but we no longer allow the base // to be installed in the Address class // %%% is this really better? Why not on 32bit too? // src should NEVER be a real pointer. Use AddressLiteral for true pointers // These are mostly for initializing NULL // we must set sp to zero to clear frame // must clear fp, so that compiled frames are not confused; it is // possible that we need it only for debugging // determine last_java_sp register // last_java_fp is optional // last_java_pc is optional pusha();
// get regs on stack andq(
rsp, -
16);
// align stack as required by ABI andq(
rsp, -
16);
// align stack as required by push_CPU_state and call // In order to get locks to work, we need to fake a in_VM state // To see where a verify_oop failed, get $ebx+40/X for this frame. // XXX correct this offset for amd64 // This is the value of eip which points to where verify_oop will return. ::
tty->
print_cr(
"=============== DEBUG MESSAGE: %s ================\n",
// Now versions that are common to 32/64 bit // Writes to stack successive pages until offset reached to check for // stack overflow + shadow pages. This clobbers tmp. // Bang stack for total size given plus shadow page size. // Bang one page at a time because large size can bang beyond yellow and // Bang down shadow pages too. // The -1 because we already subtracted 1 page. // this could be any sized move but this is can be a debugging crumb // so the bigger the better. // Check for biased locking unlock case, which is a no-op // Note: we do not have to check the thread ID for two reasons. // First, the interpreter checks for IllegalMonitorStateException at // a higher level. Second, if the bias was revoked while we held the // lock, the object could not be rebiased toward another thread, so // the bias bit would be clear. // implements x == 0 ? 0 : 1 // note: must only look at least-significant byte of x // since C-style booleans are stored in one byte // Wouldn't need if AddressLiteral version had new name // Implementation of call_VM versions // determine java_thread register // determine last_java_sp register // push java thread (becomes first argument of C function) // set last Java frame before call // Only interpreter should have to set fp // do the call, remove parameters // restore the thread (cannot use the pushed argument since arguments // may be overwritten by C code generated by an optimizing compiler); // however can use the register value directly if it is callee saved. // rdi & rsi (also r15) are callee saved -> nothing to do stop(
"MacroAssembler::call_VM_base: rdi not callee saved?");
// Only interpreter should have to clear fp // C++ interp handles this in the interpreter // check for pending exceptions (java_thread is set upon return) // This used to conditionally jump to forward_exception however it is // possible if we relocate that the branch will not reach. So we must jump // around so we can always reach // get oop result if there is one and reset the value in the thread // Calculate the value for last_Java_sp // somewhat subtle. call_VM does an intermediate call // which places a return address on the stack just under the // stack pointer as the user finished with it. This allows // us to retrieve last_Java_pc from last_Java_sp[-1]. // On 32bit we then have to push additional args on the stack to accomplish // the actual requested call. On 64bit call_VM only can use register args // so the only extra space is the return address that call_VM created. // This hopefully explains the calculations here. // We've pushed one address, correct last_Java_sp }
else {
// unordered is greater }
else {
// unordered is greater // moves src2's literal address // Full implementation of Java idiv and irem; checks for // special case as described in JVM spec., p.243 & p.271. // The function returns the (pc) offset of the idivl // instruction - may be needed for implicit exceptions. // normal case special case // input : rax,: dividend min_int // reg: divisor (may not be rax,/rdx) -1 // output: rax,: quotient (= rax, idiv reg) min_int // rdx: remainder (= rax, irem reg) 0 // check for special case xorl(
rdx,
rdx);
// prepare rdx for possible special case (where remainder = 0) // normal and special case exit if (
value == 0) { ;
return; }
if (
value == 0) { ;
return; }
// !defined(COMPILER2) is because of stupid core builds for (
int i =
8; i-- > 0; )
ffree(i);
#
endif // !LP64 || C1 || !C2// Defines obj, preserves var_size_in_bytes // if end < obj then we wrapped around => object too long => slow case // Compare obj with the top addr, and if still equal, store the new top addr in // end at the address of the top addr pointer. Sets ZF if was equal, and clears // it otherwise. Use lock prefix for atomicity on MPs. // convert FPU condition into eflags condition via rax, // condition codes set as follows: // CF (corresponds to C0) if x < y // PF (corresponds to C2) if unordered // ZF (corresponds to C3) if x = y }
else {
// unordered is greater // Note: fxch & fpop to get rid of ST1 // (otherwise FPU stack could overflow eventually) if (
value == 0) { ;
return; }
if (
value == 0) { ;
return; }
// 0000 1111 1000 tttn #32-bit disp warning(
"reversing conditional branch");
// Note: load_signed_short used to be called load_signed_word. // Although the 'w' in x86 opcodes refers to the term "word" in the assembler // manual, which means 16 bits, that usage is found nowhere in HotSpot code. // The term "word" in HotSpot means a 32- or 64-bit machine word. // This is dubious to me since it seems safe to do a signed 16 => 64 bit // version but this is what 64bit has always done. This seems to imply // that users are only using 32bits worth. // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16, // and "3.9 Partial Register Penalties", p. 22). // Note: load_unsigned_short used to be called load_unsigned_word. // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16, // and "3.9 Partial Register Penalties", p. 22). else if(
sizeof(
bool) ==
2)
else if(
sizeof(
bool) ==
4)
else if(
sizeof(
bool) ==
2)
else if(
sizeof(
bool) ==
4)
else if(
sizeof(
bool) ==
2)
else if(
sizeof(
bool) ==
4)
// src should NEVER be a real pointer. Use AddressLiteral for true pointers // provoke OS NULL exception if reg = NULL by // accessing M[reg] w/o changing any (non-CC) registers // NOTE: cmpl is plenty here to provoke a segv // Note: should probably use testl(rax, Address(reg, 0)); // may be shorter code (however, this version of // testl needs to be implemented first) // nothing to do, (later) access of M[reg + offset] // will provoke OS NULL exception if reg = NULL // instead of directly emitting a breakpoint, call os:breakpoint for better debugability // (e.g., MSVC can't call ps() otherwise) // Save Integer and Float state // Warning: Stack must be 16 byte aligned (64bit) // Push flags first because pusha kills them // Make sure rsp stays 16-byte aligned // determine java_thread register // we must set sp to zero to clear frame // Write serialization page so VM thread can do a pseudo remote membar. // We use the current thread pointer to calculate a thread specific // offset to write to within the page. This minimizes bus traffic // due to cache line collision. // Size of store must match masking code above // When entering C land, the rbp, & rsp of the last Java frame have to be recorded // in the (thread-local) JavaThread object. When leaving C land, the last Java fp // has to be reset to 0. This is required to allow proper stack traversal. // determine java_thread register // determine last_java_sp register // last_java_fp is optional // last_java_pc is optional ////////////////////////////////////////////////////////////////////////////////// // If expand_call is true then we expand the call_VM_leaf macro // directly to skip generating the check by // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp. // Do we need to load the previous value? // Is the previous value null? // Can we store original value in the thread's buffer? // (The index field is typed as size_t.) 
// Record the previous value // save the live input values // Calling the runtime using the regular call_VM_leaf mechanism generates // code (generated by InterpreterMacroAssember::call_VM_leaf_base) // that checks that the *(ebp+frame::interpreter_frame_last_sp) == NULL. // If we care generating the pre-barrier without a frame (e.g. in the // intrinsified Reference.get() routine) then ebp might be pointing to // the caller frame and so this check will most likely fail at runtime. // Expanding the call directly bypasses the generation of the check. // So when we do not have have a full interpreter frame on the stack // expand_call should be passed true. // save the live input values // Does store cross heap regions? // crosses regions, storing NULL? // storing region crossing non-NULL, is card already dirty? // get the address of the card // storing a region crossing, non-NULL oop, card is clean. // save the live input values ////////////////////////////////////////////////////////////////////////////////// // Does a store check for the oop in register obj. The content of // register obj is destroyed afterwards. // split the store check operation so that other instructions can be scheduled inbetween // The calculation for byte_map_base is as follows: // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift); // So this essentially converts an address to a displacement and // it will never need to be relocated. On 64bit however the value may be too // large for a 32bit displacement // By doing it as an ExternalAddress disp could be converted to a rip-relative // displacement and done in a single instruction given favorable mapping and // a smarter version of as_Address. Worst case it is two instructions which // is no worse off then loading disp into a register and doing as a simple // We can't do as ExternalAddress as the only style since if disp == 0 we'll // assert since NULL isn't acceptable in a reloci (see 6644928). 
In any case // in some cases we'll get a single instruction version. else if(
sizeof(
bool) ==
2) {
// testw implementation needed for two byte bools }
else if(
sizeof(
bool) ==
4)
// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. // update the tlab top pointer // recover var_size_in_bytes if necessary // Preserves rbx, and rdx. // No allocation in the shared eden. // calculate amount of free space // Retain tlab and allocate object in shared space if // the amount free in the tlab is too large to discard. // increment number of slow_allocations // increment number of refills // accumulate wastage -- t1 is amount free in tlab // if tlab is currently allocated (top or end != null) then // fill [top, end + alignment_reserve) with array object // set the length to the remaining space // set klass to intArrayKlass // dubious reloc why not an oop reloc? // store klass last. concurrent gcs assumes klass length is valid if // klass field is not null. // refill the tlab with an eden allocation // allocate new tlab, address returned in top // Check that t1 was preserved in eden_allocate. stop(
"assert(t1 != tlab size)");
static const double pi_4 =
0.7853981633974483;
// A hand-coded argument reduction for values in fabs(pi/4, pi/2) // was attempted in this code; unfortunately it appears that the // switch to 80-bit precision and back causes this to be // unprofitable compared with simply performing a runtime call if // the argument is out of the (-pi/4, pi/4) range. // fcmp needs a temporary so preserve rbx, fld_s(
1);
// Stack: X PI/4 X fabs();
// Stack: |X| PI/4 X // fastest case: -pi/4 <= x <= pi/4 assert(
false,
"bad intrinsic");
// slow case: runtime call // Preserve registers across runtime call // Must preserve all other FPU regs (could alternatively convert // SharedRuntime::dsin and dcos into assembly routines known not to trash // FPU state, but can not trust C compiler) // NOTE that in this case we also push the incoming argument to // the stack and restore it later; we also use this stack slot to // hold the return value from dsin or dcos. // NOTE: we must not use call_VM_leaf here because that requires a // complete interpreter frame in debug mode -- same bug as 4387334 // MacroAssembler::call_VM_leaf_base is perfectly safe and will // Need to add stack banging before this runtime call if it needs to // be taken; however, there is no generic stack banging routine at // the MacroAssembler level assert(
false,
"bad intrinsic");
// Must save return value to stack and then restore entire FPU stack // Come here with result in F-TOS // Look up the method for a megamorphic invokeinterface call. // The target method is determined by <intf_klass, itable_index>. // The receiver klass is in recv_klass. // On success, the result will be in method_result, and execution falls through. // On failure, execution transfers to the given label. "caller must use same register for non-constant itable index as for method");
// Compute start of first itableOffsetEntry (which is at the end of the vtable) // %%% Could store the aligned, prescaled offset in the klassoop. // Round up to align_object_offset boundary // see code for instanceKlass::start_of_itable! // Adjust recv_klass by scaled itable_index, so we can free itable_index. // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { // if (scan->interface() == intf) { // result = (klass + scan->offset() + itable_index); // (invert the test to fall through to found_method...) // Check that the previous entry is non-null. A null entry means that // the receiver class doesn't implement the interface, and wasn't the // same as when the caller was compiled. // Hacked jcc, which "knows" that L_fallthrough, at least, is in // range of a jccb. If this routine grows larger, reconsider at // Hacked jmp, which may only be used just before L_fallthrough. // If the pointers are equal, we are done (e.g., String[] elements). // This self-check enables sharing of secondary supertype arrays among // non-primary types such as array-of-interface. Otherwise, each such // type would need its own customized SSA. // We move this check to the front of the fast path because many // type checks are in fact trivially successful in this manner, // so we get a nicely predicted branch right at the start of the check. // Check the supertype display: // Positive movl does right thing on LP64. // This check has worked decisively for primary supers. // Secondary supers are sought in the super_cache ('super_cache_addr'). // (Secondary supers are interfaces and very deeply nested subtypes.) // This works in the same check above because of a tricky aliasing // between the super_cache and the primary super display elements. // (The 'super_check_addr' can address either, as the case requires.) // Note that the cache is updated below if it does not help us find // what we need immediately. 
// So if it was a primary super, we can just fail immediately. // Otherwise, it's the slow path for us (no success at this point). // Need a slow path; fast failure is impossible. // No slow path; it's a fast decision. // a couple of useful fields in sub_klass: // Do a linear scan of the secondary super-klass chain. // This code is rarely used, so simplicity is a virtue here. // The repne_scan instruction uses fixed registers, which we must spill. // Don't worry too much about pre-existing connections with the input regs. // Get super_klass value into rax (even if it was in rdi or rcx). // We will consult the secondary-super array. // Load the array length. (Positive movl does right thing on LP64.) // Skip to start of data. // Scan RCX words at [RDI] for an occurrence of RAX. // Set NZ/Z based on last compare. // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does // not change flags (only scas instruction which is repeated sets flags). // Set Z = 0 (not equal) before 'repne' to indicate that class was not found. // This part is tricky, as values in supers array could be 32 or 64 bit wide // and we store values in objArrays always encoded, thus we need to encode // the value of rax before repne. Note that rax is dead after the repne. // The superclass is never null; it would be a basic system error if a null // pointer were to sneak in here. Note that we have already loaded the // Klass::super_check_offset from the super_klass in the fast path, // so if there is a null in that register, we are already in the afterlife. // Unspill the temp. registers: // Special hack for the AD files: rdi is guaranteed non-zero. // Success. Cache the super we found and proceed in triumph. // Pass register number to verify_oop_subroutine char* b =
new char[
strlen(s) +
50];
push(
reg);
// pass register argument // avoid using pushptr, as it modifies scratch registers // and our contract is not to modify anything // call indirectly to solve generation ordering problem // Caller pops the arguments (oop, message) and restores rax, r10 // load indirectly to solve generation ordering problem char*
buf =
new char[
40];
// - rax ('check' register): required MethodType // - rdx, rsi, or ?: killable temp // compare method type against that of the receiver // A method handle has a "vmslots" field which gives the size of its // argument list in JVM stack slots. This field is either located directly // in every method handle, or else is indirectly accessed through the // method handle's MethodType. This macro hides the distinction. // load mh.type.form.vmslots // hoist vmslots into every mh to avoid dependent load chain // - rdx: killable temp (interpreted only) // - rax: killable temp (compiled only) // pick out the interpreted side of the handler // NOTE: vmentry is not an oop! // for the various stubs which take control at this point, // see MethodHandles::generate_method_handle_stub // cf. TemplateTable::prepare_invoke(), if (load_receiver). // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord); // Pass register number to verify_oop_subroutine char* b =
new char[
strlen(s) +
50];
sprintf(b,
"verify_oop_addr: %s", s);
// addr may contain rsp so we will have to adjust it based on the push // we just did (and on 64 bit we do two pushes) // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which // stores rax into addr which is backwards of what was intended. // avoid using pushptr, as it modifies scratch registers // and our contract is not to modify anything // call indirectly to solve generation ordering problem // Caller pops the arguments (addr, message) and restores rax, r10. stop(
"assert(top >= start)");
stop(
"assert(top <= end)");
case 0:
rc =
"round near";
break;
case 1:
rc =
"round down";
break;
case 2:
rc =
"round up ";
break;
case 3:
rc =
"chop ";
break;
case 0:
pc =
"24 bits ";
break;
case 1:
pc =
"reserved";
break;
case 2:
pc =
"53 bits ";
break;
case 3:
pc =
"64 bits ";
break;
// Returns true when bit 15 of _value is set.
// NOTE(review): bit 15 is where the x87 status word keeps its B (busy)
// flag; presumably _value holds that word -- confirm against the class.
bool busy() const {
  const bool bit_is_set = ((_value >> 15) & 1) == 1;
  return bit_is_set;
}
// Returns true when bit 14 of _value is set.
// NOTE(review): bit 14 is the C3 condition-code position in the x87
// status word; presumably _value holds that word -- confirm.
bool C3() const {
  const bool bit_is_set = ((_value >> 14) & 1) == 1;
  return bit_is_set;
}
// Returns true when bit 10 of _value is set.
// NOTE(review): bit 10 is the C2 condition-code position in the x87
// status word; presumably _value holds that word -- confirm.
bool C2() const {
  const bool bit_is_set = ((_value >> 10) & 1) == 1;
  return bit_is_set;
}
// Returns true when bit 9 of _value is set.
// NOTE(review): bit 9 is the C1 condition-code position in the x87
// status word; presumably _value holds that word -- confirm.
bool C1() const {
  const bool bit_is_set = ((_value >> 9) & 1) == 1;
  return bit_is_set;
}
// Returns true when bit 8 of _value is set.
// NOTE(review): bit 8 is the C0 condition-code position in the x87
// status word; presumably _value holds that word -- confirm.
bool C0() const {
  const bool bit_is_set = ((_value >> 8) & 1) == 1;
  return bit_is_set;
}
// Extracts the 3-bit field stored in bits 11..13 of _value (result 0..7).
// NOTE(review): those bits are the TOP-of-stack field in the x87 status
// word; presumably _value holds that word -- confirm.
int top() const {
  // Shift the field down to bit 0 first, then keep only its three bits.
  return 7 & (_value >> 11);
}
c[0] = (
C3()) ?
'3' :
'-';
c[
1] = (
C2()) ?
'2' :
'-';
c[
2] = (
C1()) ?
'1' :
'-';
c[
3] = (
C0()) ?
'0' :
'-';
printf(
"%04x flags = %s, cc = %s, top = %d",
_value &
0xFFFF, f, c,
top());
case 2:
return "special";
// print computation registers printf(
"%c r%d = ST%d = ", (j == 0 ?
'*' :
' '), i, j);
// print control registers bool sign()
const {
return ((
_value >>
7) &
1) != 0; }
// Returns true when bit 6 of _value is set.
// NOTE(review): bit 6 is the ZF position in x86 EFLAGS; presumably
// _value holds a flags image -- confirm against the enclosing class.
bool zero() const {
  const bool bit_is_set = ((_value >> 6) & 1) == 1;
  return bit_is_set;
}
// Returns true when bit 0 (the least-significant bit) of _value is set.
// NOTE(review): bit 0 is the CF position in x86 EFLAGS; presumably
// _value holds a flags image -- confirm against the enclosing class.
bool carry() const {
  const bool bit_is_set = ((_value >> 0) & 1) == 1;
  return bit_is_set;
}
f[
2] = (
sign ()) ?
'S' :
'-';
f[
3] = (
zero ()) ?
'Z' :
'-';
f[
5] = (
parity ()) ?
'P' :
'-';
f[
6] = (
carry ()) ?
'C' :
'-';
printf(
"--------------------------------------------------\n");
printf(
"--------------------------------------------------\n");
// For leaf calls, only verify that the top few elements remain empty. // We only need 1 empty at the top for C2 code. return true;
// All other stack states do not matter printf(
"%s: stack not contiguous at ST%d\n", s, i);
// check if computed stack depth corresponds to expected stack depth // expected stack depth is -stack_depth or less // too many elements on the stack // expected stack depth is stack_depth int3();
// break if error condition // OK to use shift since we don't need to preserve flags. // Doesn't do verification, generates fixed size code // Used for storing NULLs. // Store to klass gap in destination verify_heapbase(
"MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
stop(
"null oop passed to encode_heap_oop_not_null");
verify_oop(r,
"broken oop in encode_heap_oop_not_null");
verify_heapbase(
"MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
stop(
"null oop passed to encode_heap_oop_not_null2");
// Note: it will change flags // Cannot assert, unverified entry point counts instructions (see .ad file) // vtableStubs also counts instructions in pd_code_size_limit. // Also do not verify_oop as this is called by verify_oop. // Note: it will change flags // Cannot assert, unverified entry point counts instructions (see .ad file) // vtableStubs also counts instructions in pd_code_size_limit. // Also do not verify_oop as this is called by verify_oop. // IndexOf for constant substrings with size >= 8 chars // which don't need to be loaded through stack. // This method uses pcmpestri instruction with bound registers // rax - substring length (elements count) // rdx - string length (elements count) // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) // rcx - matched index in string // Note, inline_string_indexOf() generates checks: // Reload substr for rescan, this code // is executed only for large substrings (> 8 chars) negptr(
cnt2);
// Jumped here with negative cnt2, convert to positive // We came here after the beginning of the substring was // matched but the rest of it was not so we need to search // again. Start from the next element after the previous match. // cnt2 is number of substring remaining elements and // cnt1 is number of string remaining elements when cmp failed. // Restored cnt1 = cnt1 - cnt2 + int_cnt2 // Scan string for start of substr in 16-byte vectors // Found a potential substr // Matched whole vector if first element matched (tmp(rcx) == 0). // After pcmpestri tmp(rcx) contains matched element index // Compute start addr of substr // Make sure string is still long enough // Left less than substring. // This code is optimized for the case when whole substring // is matched if its head is matched. // Reload only string if does not match // Compare the rest of substring (> 8 chars). // First 8 chars are already matched. cmpl(
cnt2, -
8);
// Do not read beyond substring // Back-up strings to avoid reading beyond substring: // cnt1 = cnt1 - cnt2 + 8 // calculate index in register to avoid integer overflow (int_cnt2*2) // Need to reload strings pointers if not matched whole vector // Fall through if found full substring // Found result if we matched full small substring. // Small strings are loaded through stack if they cross page boundary. // int_cnt2 is length of small (< 8 chars) constant substring // or (-1) for non constant substring in which case its length // Note, inline_string_indexOf() generates checks: // This method uses pcmpestri instruction with bound registers // rax - substring length (elements count) // rdx - string length (elements count) // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) // rcx - matched index in string {
//======================================================== // We don't know where these strings are located // and we can't read beyond them. Load them through stack. if (
int_cnt2 > 0) {
// small (< 8 chars) constant substring }
else if (
int_cnt2 ==
2) {
// Two chars }
else if (
int_cnt2 ==
4) {
// Four chars }
else {
// cnt2 = { 3, 5, 6, 7 } // Array header size is 12 bytes in 32-bit VM // + 6 bytes for 3 chars == 18 bytes, // enough space to load vec and shift. }
else {
// not constant substring // We can read beyond string if srt+16 does not cross page boundary // since heaps are aligned and mapped by pages. // Move small strings to stack to allow load 16 bytes into vec. // Check cross page boundary. // Small (< 8 chars) constant substrings are loaded already. //======================================================== if (
int_cnt2 < 0) {
// Only for non constant substring // String saved at sp+1*wordSize // Substr saved at sp+2*wordSize // Substr count saved at sp+3*wordSize // Reload substr for rescan, this code // is executed only for large substrings (> 8 chars) // We came here after the beginning of the substring was // matched but the rest of it was not so we need to search // again. Start from the next element after the previous match. // Scan string for start of substr in 16-byte vectors cmpl(
cnt1,
8);
// Do not read beyond string // Back-up string to avoid reading beyond string. // Found a potential substr // After pcmpestri tmp(rcx) contains matched element index // Make sure string is still long enough // Left less then substring. // Compute start addr of substr if (
int_cnt2 > 0) {
// Constant substring // Repeat search for small substring (< 8 chars) // from new point without reloading substring. // Have to check that we don't read beyond string. // Fall through if matched whole substring. // Found result if we matched whole substring. // Repeat search for small substring (<= 8 chars) // from new point 'str1' without reloading substring. // Have to check that we don't read beyond string. // Compare the rest of substring (> 8 chars). // First 8 chars are already matched. // Need to reload strings pointers if not matched whole vector cmpl(
cnt2,
8);
// Do not read beyond substring // Back-up strings to avoid reading beyond substring. // Compute the minimum of the string lengths and the // difference of the string lengths (stack). // Do the conditional move stuff // Is the minimum length zero? // Compare first characters // Check after comparing first character to see if strings are equivalent // Check if the strings start at same location // Check if the length difference is zero (from stack) // Strings might not be equivalent // Advance to next element // Setup to compare 16-byte vectors // rax - negative string length (elements count) // rdx - string length (elements count) // pcmpmask - cmp mode: 11000 (string compare with negated result) // + 00 (unsigned bytes) or + 01 (unsigned shorts) // rcx - first mismatched element index // After pcmpestri cnt1(rcx) contains mismatched element index // compare wide vectors tail // Mismatched characters in the vectors // Fallthru to tail compare // Shift str2 and str1 to the end of the arrays, negate min // Compare the rest of the elements // Strings are equal up to min length. Return the length difference. // Discard the stored length difference // Compare char[] arrays aligned to 4 bytes or substrings. // Need additional checks for arrays_equals. // With SSE4.2, use double quad vector compare // Compare 16-byte vectors andl(
result,
0x0000000e);
// tail count (in bytes) andl(
limit,
0xfffffff0);
// vector count (in bytes) // Fallthru to tail compare // Compare 4-byte vectors andl(
limit,
0xfffffffc);
// vector count (in bytes) // Compare trailing char (final 2 bytes), if any // align source address at 4 bytes address boundary // One byte misalignment happens only for byte arrays // Two bytes misalignment happens only for byte and short (char) arrays for (
int i = 0; i <
32; i +=
4) {
// length is too short, just fill qwords // fall through to fill 4 bytes // align to 8 bytes, we know we are 4 byte aligned to start // length is too short, just fill qwords // Note some conditions are synonyms for others