aboutsummaryrefslogtreecommitdiff
path: root/src/cpu/x86/vm/x86_32.ad
diff options
context:
space:
mode:
Diffstat (limited to 'src/cpu/x86/vm/x86_32.ad')
-rw-r--r--src/cpu/x86/vm/x86_32.ad561
1 files changed, 14 insertions, 547 deletions
diff --git a/src/cpu/x86/vm/x86_32.ad b/src/cpu/x86/vm/x86_32.ad
index 90c1d899f..382d09b07 100644
--- a/src/cpu/x86/vm/x86_32.ad
+++ b/src/cpu/x86/vm/x86_32.ad
@@ -2910,542 +2910,6 @@ encode %{
emit_d8 (cbuf,0 );
%}
-
- // Because the transitions from emitted code to the runtime
- // monitorenter/exit helper stubs are so slow it's critical that
- // we inline both the stack-locking fast-path and the inflated fast path.
- //
- // See also: cmpFastLock and cmpFastUnlock.
- //
- // What follows is a specialized inline transliteration of the code
- // in slow_enter() and slow_exit(). If we're concerned about I$ bloat
- // another option would be to emit TrySlowEnter and TrySlowExit methods
- // at startup-time. These methods would accept arguments as
- // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
- // indications in the icc.ZFlag. Fast_Lock and Fast_Unlock would simply
- // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
- // In practice, however, the # of lock sites is bounded and is usually small.
- // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
- // if the processor uses simple bimodal branch predictors keyed by EIP,
- // since the helper routines would be called from multiple synchronization
- // sites.
- //
- // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
- // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
- // to those specialized methods. That'd give us a mostly platform-independent
- // implementation that the JITs could optimize and inline at their pleasure.
- // Done correctly, the only time we'd need to cross to native code would be
- // to park() or unpark() threads. We'd also need a few more unsafe operators
- // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
- // (b) explicit barriers or fence operations.
- //
- // TODO:
- //
- // * Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
- // This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
- // Given TLAB allocation, Self is usually manifested in a register, so passing it into
- // the lock operators would typically be faster than reifying Self.
- //
- // * Ideally I'd define the primitives as:
- // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
- // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
- // Unfortunately ADLC bugs prevent us from expressing the ideal form.
- // Instead, we're stuck with rather awkward and brittle register assignments below.
- // Furthermore the register assignments are overconstrained, possibly resulting in
- // sub-optimal code near the synchronization site.
- //
- // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
- // Alternately, use a better sp-proximity test.
- //
- // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
- // Either one is sufficient to uniquely identify a thread.
- // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
- //
- // * Intrinsify notify() and notifyAll() for the common cases where the
- // object is locked by the calling thread but the waitlist is empty.
- // This would avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
- //
- // * use jccb and jmpb instead of jcc and jmp to improve code density.
- // But beware of excessive branch density on AMD Opterons.
- //
- // * Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
- // or failure of the fast-path. If the fast-path fails then we pass
- // control to the slow-path, typically in C. In Fast_Lock and
- // Fast_Unlock we often branch to DONE_LABEL, just to find that C2
- // will emit a conditional branch immediately after the node.
- // So we have branches to branches and lots of ICC.ZF games.
- // Instead, it might be better to have C2 pass a "FailureLabel"
- // into Fast_Lock and Fast_Unlock. In the case of success, control
- // will drop through the node. ICC.ZF is undefined at exit.
- // In the case of failure, the node will branch directly to the
- // FailureLabel
-
-
- // obj: object to lock
- // box: on-stack box address (displaced header location) - KILLED
- // rax: tmp -- KILLED
- // scr: tmp -- KILLED
- enc_class Fast_Lock( eRegP obj, eRegP box, eAXRegI tmp, eRegP scr ) %{
-
- Register objReg = as_Register($obj$$reg);
- Register boxReg = as_Register($box$$reg);
- Register tmpReg = as_Register($tmp$$reg);
- Register scrReg = as_Register($scr$$reg);
-
- // Ensure the register assignments are disjoint
- guarantee (objReg != boxReg, "") ;
- guarantee (objReg != tmpReg, "") ;
- guarantee (objReg != scrReg, "") ;
- guarantee (boxReg != tmpReg, "") ;
- guarantee (boxReg != scrReg, "") ;
- guarantee (tmpReg == as_Register(EAX_enc), "") ;
-
- MacroAssembler masm(&cbuf);
-
- if (_counters != NULL) {
- masm.atomic_incl(ExternalAddress((address) _counters->total_entry_count_addr()));
- }
- if (EmitSync & 1) {
- // set box->dhw = unused_mark (3)
- // Force all sync thru slow-path: slow_enter() and slow_exit()
- masm.movptr (Address(boxReg, 0), int32_t(markOopDesc::unused_mark())) ;
- masm.cmpptr (rsp, (int32_t)0) ;
- } else
- if (EmitSync & 2) {
- Label DONE_LABEL ;
- if (UseBiasedLocking) {
- // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
- masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
- }
-
- masm.movptr(tmpReg, Address(objReg, 0)) ; // fetch markword
- masm.orptr (tmpReg, 0x1);
- masm.movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg
- masm.jcc(Assembler::equal, DONE_LABEL);
- // Recursive locking
- masm.subptr(tmpReg, rsp);
- masm.andptr(tmpReg, (int32_t) 0xFFFFF003 );
- masm.movptr(Address(boxReg, 0), tmpReg);
- masm.bind(DONE_LABEL) ;
- } else {
- // Possible cases that we'll encounter in fast_lock
- // ------------------------------------------------
- // * Inflated
- // -- unlocked
- // -- Locked
- // = by self
- // = by other
- // * biased
- // -- by Self
- // -- by other
- // * neutral
- // * stack-locked
- // -- by self
- // = sp-proximity test hits
- // = sp-proximity test generates false-negative
- // -- by other
- //
-
- Label IsInflated, DONE_LABEL, PopDone ;
-
- // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
- // order to reduce the number of conditional branches in the most common cases.
- // Beware -- there's a subtle invariant that fetch of the markword
- // at [FETCH], below, will never observe a biased encoding (*101b).
- // If this invariant is not held we risk exclusion (safety) failure.
- if (UseBiasedLocking && !UseOptoBiasInlining) {
- masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
- }
-
- masm.movptr(tmpReg, Address(objReg, 0)) ; // [FETCH]
- masm.testptr(tmpReg, 0x02) ; // Inflated v (Stack-locked or neutral)
- masm.jccb (Assembler::notZero, IsInflated) ;
-
- // Attempt stack-locking ...
- masm.orptr (tmpReg, 0x1);
- masm.movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg
- if (_counters != NULL) {
- masm.cond_inc32(Assembler::equal,
- ExternalAddress((address)_counters->fast_path_entry_count_addr()));
- }
- masm.jccb (Assembler::equal, DONE_LABEL);
-
- // Recursive locking
- masm.subptr(tmpReg, rsp);
- masm.andptr(tmpReg, 0xFFFFF003 );
- masm.movptr(Address(boxReg, 0), tmpReg);
- if (_counters != NULL) {
- masm.cond_inc32(Assembler::equal,
- ExternalAddress((address)_counters->fast_path_entry_count_addr()));
- }
- masm.jmp (DONE_LABEL) ;
-
- masm.bind (IsInflated) ;
-
- // The object is inflated.
- //
- // TODO-FIXME: eliminate the ugly use of manifest constants:
- // Use markOopDesc::monitor_value instead of "2".
- // use markOop::unused_mark() instead of "3".
- // The tmpReg value is an objectMonitor reference ORed with
- // markOopDesc::monitor_value (2). We can either convert tmpReg to an
- // objectmonitor pointer by masking off the "2" bit or we can just
- // use tmpReg as an objectmonitor pointer but bias the objectmonitor
- // field offsets with "-2" to compensate for and annul the low-order tag bit.
- //
- // I use the latter as it avoids AGI stalls.
- // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
- // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
- //
- #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)
-
- // boxReg refers to the on-stack BasicLock in the current frame.
- // We'd like to write:
- // set box->_displaced_header = markOop::unused_mark(). Any non-0 value suffices.
- // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
- // additional latency as we have another ST in the store buffer that must drain.
-
- if (EmitSync & 8192) {
- masm.movptr(Address(boxReg, 0), 3) ; // results in ST-before-CAS penalty
- masm.get_thread (scrReg) ;
- masm.movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
- masm.movptr(tmpReg, NULL_WORD); // consider: xor vs mov
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
- } else
- if ((EmitSync & 128) == 0) { // avoid ST-before-CAS
- masm.movptr(scrReg, boxReg) ;
- masm.movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
-
- // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
- if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
- // prefetchw [eax + Offset(_owner)-2]
- masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
- }
-
- if ((EmitSync & 64) == 0) {
- // Optimistic form: consider XORL tmpReg,tmpReg
- masm.movptr(tmpReg, NULL_WORD) ;
- } else {
- // Can suffer RTS->RTO upgrades on shared or cold $ lines
- // Test-And-CAS instead of CAS
- masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; // rax, = m->_owner
- masm.testptr(tmpReg, tmpReg) ; // Locked ?
- masm.jccb (Assembler::notZero, DONE_LABEL) ;
- }
-
- // Appears unlocked - try to swing _owner from null to non-null.
- // Ideally, I'd manifest "Self" with get_thread and then attempt
- // to CAS the register containing Self into m->Owner.
- // But we don't have enough registers, so instead we can either try to CAS
- // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
- // we later store "Self" into m->Owner. Transiently storing a stack address
- // (rsp or the address of the box) into m->owner is harmless.
- // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
- masm.movptr(Address(scrReg, 0), 3) ; // box->_displaced_header = 3
- masm.jccb (Assembler::notZero, DONE_LABEL) ;
- masm.get_thread (scrReg) ; // beware: clobbers ICCs
- masm.movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg) ;
- masm.xorptr(boxReg, boxReg) ; // set icc.ZFlag = 1 to indicate success
-
- // If the CAS fails we can either retry or pass control to the slow-path.
- // We use the latter tactic.
- // Pass the CAS result in the icc.ZFlag into DONE_LABEL
- // If the CAS was successful ...
- // Self has acquired the lock
- // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
- // Intentional fall-through into DONE_LABEL ...
- } else {
- masm.movptr(Address(boxReg, 0), 3) ; // results in ST-before-CAS penalty
- masm.movptr(boxReg, tmpReg) ;
-
- // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
- if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
- // prefetchw [eax + Offset(_owner)-2]
- masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
- }
-
- if ((EmitSync & 64) == 0) {
- // Optimistic form
- masm.xorptr (tmpReg, tmpReg) ;
- } else {
- // Can suffer RTS->RTO upgrades on shared or cold $ lines
- masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; // rax, = m->_owner
- masm.testptr(tmpReg, tmpReg) ; // Locked ?
- masm.jccb (Assembler::notZero, DONE_LABEL) ;
- }
-
- // Appears unlocked - try to swing _owner from null to non-null.
- // Use either "Self" (in scr) or rsp as thread identity in _owner.
- // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
- masm.get_thread (scrReg) ;
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
-
- // If the CAS fails we can either retry or pass control to the slow-path.
- // We use the latter tactic.
- // Pass the CAS result in the icc.ZFlag into DONE_LABEL
- // If the CAS was successful ...
- // Self has acquired the lock
- // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
- // Intentional fall-through into DONE_LABEL ...
- }
-
- // DONE_LABEL is a hot target - we'd really like to place it at the
- // start of cache line by padding with NOPs.
- // See the AMD and Intel software optimization manuals for the
- // most efficient "long" NOP encodings.
- // Unfortunately none of our alignment mechanisms suffice.
- masm.bind(DONE_LABEL);
-
- // Avoid branch-to-branch on AMD processors
- // This appears to be superstition.
- if (EmitSync & 32) masm.nop() ;
-
-
- // At DONE_LABEL the icc ZFlag is set as follows ...
- // Fast_Unlock uses the same protocol.
- // ZFlag == 1 -> Success
- // ZFlag == 0 -> Failure - force control through the slow-path
- }
- %}
-
- // obj: object to unlock
- // box: box address (displaced header location), killed. Must be EAX.
- // rbx: killed tmp; cannot be obj nor box.
- //
- // Some commentary on balanced locking:
- //
- // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
- // Methods that don't have provably balanced locking are forced to run in the
- // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
- // The interpreter provides two properties:
- // I1: At return-time the interpreter automatically and quietly unlocks any
- // objects acquired by the current activation (frame). Recall that the
- // interpreter maintains an on-stack list of locks currently held by
- // a frame.
- // I2: If a method attempts to unlock an object that is not held by
- // the frame the interpreter throws IMSX.
- //
- // Let's say A(), which has provably balanced locking, acquires O and then calls B().
- // B() doesn't have provably balanced locking so it runs in the interpreter.
- // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
- // is still locked by A().
- //
- // The only other source of unbalanced locking would be JNI. The "Java Native Interface:
- // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
- // should not be unlocked by "normal" java-level locking and vice-versa. The specification
- // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
-
- enc_class Fast_Unlock( nabxRegP obj, eAXRegP box, eRegP tmp) %{
-
- Register objReg = as_Register($obj$$reg);
- Register boxReg = as_Register($box$$reg);
- Register tmpReg = as_Register($tmp$$reg);
-
- guarantee (objReg != boxReg, "") ;
- guarantee (objReg != tmpReg, "") ;
- guarantee (boxReg != tmpReg, "") ;
- guarantee (boxReg == as_Register(EAX_enc), "") ;
- MacroAssembler masm(&cbuf);
-
- if (EmitSync & 4) {
- // Disable - inhibit all inlining. Force control through the slow-path
- masm.cmpptr (rsp, 0) ;
- } else
- if (EmitSync & 8) {
- Label DONE_LABEL ;
- if (UseBiasedLocking) {
- masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
- }
- // classic stack-locking code ...
- masm.movptr(tmpReg, Address(boxReg, 0)) ;
- masm.testptr(tmpReg, tmpReg) ;
- masm.jcc (Assembler::zero, DONE_LABEL) ;
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses EAX which is box
- masm.bind(DONE_LABEL);
- } else {
- Label DONE_LABEL, Stacked, CheckSucc, Inflated ;
-
- // Critically, the biased locking test must have precedence over
- // and appear before the (box->dhw == 0) recursive stack-lock test.
- if (UseBiasedLocking && !UseOptoBiasInlining) {
- masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
- }
-
- masm.cmpptr(Address(boxReg, 0), 0) ; // Examine the displaced header
- masm.movptr(tmpReg, Address(objReg, 0)) ; // Examine the object's markword
- masm.jccb (Assembler::zero, DONE_LABEL) ; // 0 indicates recursive stack-lock
-
- masm.testptr(tmpReg, 0x02) ; // Inflated?
- masm.jccb (Assembler::zero, Stacked) ;
-
- masm.bind (Inflated) ;
- // It's inflated.
- // Despite our balanced locking property we still check that m->_owner == Self
- // as java routines or native JNI code called by this thread might
- // have released the lock.
- // Refer to the comments in synchronizer.cpp for how we might encode extra
- // state in _succ so we can avoid fetching EntryList|cxq.
- //
- // I'd like to add more cases in fast_lock() and fast_unlock() --
- // such as recursive enter and exit -- but we have to be wary of
- // I$ bloat, T$ effects and BP$ effects.
- //
- // If there's no contention try a 1-0 exit. That is, exit without
- // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
- // we detect and recover from the race that the 1-0 exit admits.
- //
- // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
- // before it STs null into _owner, releasing the lock. Updates
- // to data protected by the critical section must be visible before
- // we drop the lock (and thus before any other thread could acquire
- // the lock and observe the fields protected by the lock).
- // IA32's memory-model is SPO, so STs are ordered with respect to
- // each other and there's no need for an explicit barrier (fence).
- // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
-
- masm.get_thread (boxReg) ;
- if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
- // prefetchw [ebx + Offset(_owner)-2]
- masm.prefetchw(Address(rbx, ObjectMonitor::owner_offset_in_bytes()-2));
- }
-
- // Note that we could employ various encoding schemes to reduce
- // the number of loads below (currently 4) to just 2 or 3.
- // Refer to the comments in synchronizer.cpp.
- // In practice the chain of fetches doesn't seem to impact performance, however.
- if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
- // Attempt to reduce branch density - AMD's branch predictor.
- masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
- masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
- masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
- masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
- masm.jccb (Assembler::notZero, DONE_LABEL) ;
- masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
- masm.jmpb (DONE_LABEL) ;
- } else {
- masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
- masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
- masm.jccb (Assembler::notZero, DONE_LABEL) ;
- masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
- masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
- masm.jccb (Assembler::notZero, CheckSucc) ;
- masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
- masm.jmpb (DONE_LABEL) ;
- }
-
- // The following code fragment (EmitSync & 65536) improves the performance of
- // contended applications and contended synchronization microbenchmarks.
- // Unfortunately the emission of the code - even though not executed - causes regressions
- // in scimark and jetstream, evidently because of $ effects. Replacing the code
- // with an equal number of never-executed NOPs results in the same regression.
- // We leave it off by default.
-
- if ((EmitSync & 65536) != 0) {
- Label LSuccess, LGoSlowPath ;
-
- masm.bind (CheckSucc) ;
-
- // Optional pre-test ... it's safe to elide this
- if ((EmitSync & 16) == 0) {
- masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
- masm.jccb (Assembler::zero, LGoSlowPath) ;
- }
-
- // We have a classic Dekker-style idiom:
- // ST m->_owner = 0 ; MEMBAR; LD m->_succ
- // There are a number of ways to implement the barrier:
- // (1) lock:andl &m->_owner, 0
- // is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
- // LOCK: ANDL [ebx+Offset(_Owner)-2], 0
- // Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
- // (2) If supported, an explicit MFENCE is appealing.
- // In older IA32 processors MFENCE is slower than lock:add or xchg
- // particularly if the write-buffer is full as might be the case
- // if stores closely precede the fence or fence-equivalent instruction.
- // In more modern implementations MFENCE appears faster, however.
- // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
- // The $lines underlying the top-of-stack should be in M-state.
- // The locked add instruction is serializing, of course.
- // (4) Use xchg, which is serializing
- // mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
- // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
- // The integer condition codes will tell us if succ was 0.
- // Since _succ and _owner should reside in the same $line and
- // we just stored into _owner, it's likely that the $line
- // remains in M-state for the lock:orl.
- //
- // We currently use (3), although it's likely that switching to (2)
- // is correct for the future.
-
- masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
- if (os::is_MP()) {
- if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
- masm.mfence();
- } else {
- masm.lock () ; masm.addptr(Address(rsp, 0), 0) ;
- }
- }
- // Ratify _succ remains non-null
- masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
- masm.jccb (Assembler::notZero, LSuccess) ;
-
- masm.xorptr(boxReg, boxReg) ; // box is really EAX
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
- masm.jccb (Assembler::notEqual, LSuccess) ;
- // Since we're low on registers we installed rsp as a placeholder in _owner.
- // Now install Self over rsp. This is safe as we're transitioning from
- // non-null to non-null
- masm.get_thread (boxReg) ;
- masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg) ;
- // Intentional fall-through into LGoSlowPath ...
-
- masm.bind (LGoSlowPath) ;
- masm.orptr(boxReg, 1) ; // set ICC.ZF=0 to indicate failure
- masm.jmpb (DONE_LABEL) ;
-
- masm.bind (LSuccess) ;
- masm.xorptr(boxReg, boxReg) ; // set ICC.ZF=1 to indicate success
- masm.jmpb (DONE_LABEL) ;
- }
-
- masm.bind (Stacked) ;
- // It's not inflated and it's not recursively stack-locked and it's not biased.
- // It must be stack-locked.
- // Try to reset the header to displaced header.
- // The "box" value on the stack is stable, so we can reload
- // and be assured we observe the same value as above.
- masm.movptr(tmpReg, Address(boxReg, 0)) ;
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses EAX which is box
- // Intentional fall-through into DONE_LABEL
-
-
- // DONE_LABEL is a hot target - we'd really like to place it at the
- // start of cache line by padding with NOPs.
- // See the AMD and Intel software optimization manuals for the
- // most efficient "long" NOP encodings.
- // Unfortunately none of our alignment mechanisms suffice.
- if ((EmitSync & 65536) == 0) {
- masm.bind (CheckSucc) ;
- }
- masm.bind(DONE_LABEL);
-
- // Avoid branch to branch on AMD processors
- if (EmitSync & 32768) { masm.nop() ; }
- }
- %}
-
-
enc_class enc_pop_rdx() %{
emit_opcode(cbuf,0x5A);
%}
@@ -13147,23 +12611,26 @@ instruct RethrowException()
// inlined locking and unlocking
-
-instruct cmpFastLock( eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{
- match( Set cr (FastLock object box) );
- effect( TEMP tmp, TEMP scr, USE_KILL box );
+instruct cmpFastLock(eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{
+ match(Set cr (FastLock object box));
+ effect(TEMP tmp, TEMP scr, USE_KILL box);
ins_cost(300);
format %{ "FASTLOCK $object,$box\t! kills $box,$tmp,$scr" %}
- ins_encode( Fast_Lock(object,box,tmp,scr) );
- ins_pipe( pipe_slow );
+ ins_encode %{
+ __ fast_lock($object$$Register, $box$$Register, $tmp$$Register, $scr$$Register, _counters);
+ %}
+ ins_pipe(pipe_slow);
%}
-instruct cmpFastUnlock( eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp ) %{
- match( Set cr (FastUnlock object box) );
- effect( TEMP tmp, USE_KILL box );
+instruct cmpFastUnlock(eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp ) %{
+ match(Set cr (FastUnlock object box));
+ effect(TEMP tmp, USE_KILL box);
ins_cost(300);
format %{ "FASTUNLOCK $object,$box\t! kills $box,$tmp" %}
- ins_encode( Fast_Unlock(object,box,tmp) );
- ins_pipe( pipe_slow );
+ ins_encode %{
+ __ fast_unlock($object$$Register, $box$$Register, $tmp$$Register);
+ %}
+ ins_pipe(pipe_slow);
%}