/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2004 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/asm_linkage.h>
#include <sys/machthread.h>
#include <sys/privregs.h>
#include <sys/fpras_impl.h>
#if !defined(lint)
#include "assym.h"
#endif /* lint */
/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/copyin/copyout routines.
 *
* On entry:
*
* ! Determine whether to use the FP register version
* ! or the leaf routine version depending on size
* ! of copy and flags. Set up error handling accordingly.
* ! The transition point depends on whether the src and
* ! dst addresses can be aligned to long word, word,
* ! half word, or byte boundaries.
* !
* ! WARNING: <Register usage convention>
* ! For FP version, %l6 holds previous error handling and
* ! a flag: TRAMP_FLAG (low bits)
* ! for leaf routine version, %o4 holds those values.
* ! So either %l6 or %o4 is reserved and not available for
* ! any other use.
*
* if (length <= VIS_COPY_THRESHOLD) ! start with a quick test
* go to small_copy; ! to speed short copies
*
 * if (src,dst long word alignable) {
 *	if (hw_copy_limit_8 == 0)	! hw_copy disabled
 *		go to small_copy;
 *	if (length <= hw_copy_limit_8)
 *		go to small_copy;
 *	go to FPBLK_copy;
 * }
* if (src,dst not alignable) {
* if (hw_copy_limit_1 == 0) ! hw_copy disabled
* go to small_copy;
* if (length <= hw_copy_limit_1)
* go to small_copy;
* go to FPBLK_copy;
* }
* if (src,dst halfword alignable) {
* if (hw_copy_limit_2 == 0) ! hw_copy disabled
* go to small_copy;
* if (length <= hw_copy_limit_2)
* go to small_copy;
* go to FPBLK_copy;
* }
* if (src,dst word alignable) {
* if (hw_copy_limit_4 == 0) ! hw_copy disabled
* go to small_copy;
* if (length <= hw_copy_limit_4)
* go to small_copy;
* go to FPBLK_copy;
* }
*
* small_copy:
* Setup_leaf_rtn_error_handler; ! diffs for each entry point
*
* if (count <= 3) ! fast path for tiny copies
* go to sm_left; ! special finish up code
* else
* if (count > CHKSIZE) ! medium sized copies
* go to sm_med ! tuned by alignment
* if(src&dst not both word aligned) {
* sm_movebytes:
* move byte by byte in 4-way unrolled loop
* fall into sm_left;
* sm_left:
* move 0-3 bytes byte at a time as needed.
* restore error handler and exit.
*
* } else { ! src&dst are word aligned
* check for at least 8 bytes left,
* move word at a time, unrolled by 2
* when fewer than 8 bytes left,
* sm_half: move half word at a time while 2 or more bytes left
* sm_byte: move final byte if necessary
* sm_exit:
* restore error handler and exit.
* }
*
* ! Medium length cases with at least CHKSIZE bytes available
* ! method: line up src and dst as best possible, then
* ! move data in 4-way unrolled loops.
*
* sm_med:
* if(src&dst unalignable)
* go to sm_movebytes
* if(src&dst halfword alignable)
* go to sm_movehalf
* if(src&dst word alignable)
* go to sm_moveword
* ! fall into long word movement
* move bytes until src is word aligned
* if not long word aligned, move a word
* move long words in 4-way unrolled loop until < 32 bytes left
* move long words in 1-way unrolled loop until < 8 bytes left
* if zero bytes left, goto sm_exit
* if one byte left, go to sm_byte
* else go to sm_half
*
* sm_moveword:
* move bytes until src is word aligned
* move words in 4-way unrolled loop until < 16 bytes left
* move words in 1-way unrolled loop until < 4 bytes left
* if zero bytes left, goto sm_exit
* if one byte left, go to sm_byte
* else go to sm_half
*
* sm_movehalf:
* move a byte if needed to align src on halfword
* move halfwords in 4-way unrolled loop until < 8 bytes left
* if zero bytes left, goto sm_exit
* if one byte left, go to sm_byte
* else go to sm_half
*
*
* FPBLK_copy:
* %l6 = curthread->t_lofault;
* if (%l6 != NULL) {
* membar #Sync
* curthread->t_lofault = .copyerr;
* caller_error_handler = TRUE ! %l6 |= 2
* }
*
* ! for FPU testing we must not migrate cpus
* if (curthread->t_lwp == NULL) {
* ! Kernel threads do not have pcb's in which to store
* ! the floating point state, so disallow preemption during
* ! the copy. This also prevents cpu migration.
* kpreempt_disable(curthread);
* } else {
* thread_nomigrate();
* }
*
* old_fprs = %fprs;
* old_gsr = %gsr;
* if (%fprs.fef) {
* %fprs.fef = 1;
* save current fpregs on stack using blockstore
* } else {
* %fprs.fef = 1;
* }
*
*
* do_blockcopy_here;
*
* In lofault handler:
* curthread->t_lofault = .copyerr2;
* Continue on with the normal exit handler
*
* On normal exit:
* %gsr = old_gsr;
* if (old_fprs & FPRS_FEF)
* restore fpregs from stack using blockload
* else
* zero fpregs
* %fprs = old_fprs;
* membar #Sync
* curthread->t_lofault = (%l6 & ~3);
* ! will always have a current thread
* if (curthread->t_lwp == NULL)
* kpreempt_enable(curthread);
* else
* thread_allowmigrate();
* return (0)
*
* In second lofault handler (.copyerr2):
 * We've tried to restore fp state from the stack and failed.  To
 * avoid returning with corrupted fp state, we will panic.
*/
/*
* Comments about optimization choices
*
* The initial optimization decision in this code is to determine
* whether to use the FP registers for a copy or not. If we don't
* use the FP registers, we can execute the copy as a leaf routine,
* saving a register save and restore. Also, less elaborate setup
* is required, allowing short copies to be completed more quickly.
* For longer copies, especially unaligned ones (where the src and
* dst do not align to allow simple ldx,stx operation), the FP
* registers allow much faster copy operations.
*
 * The estimated extra cost of the FP path will vary depending on the
 * alignment boundary, the amount of src data remaining after the last
 * full dst cache line is moved, whether the FP registers need to be
 * saved, and some other minor issues.  The average additional overhead
 * is estimated to be 400 clocks.  Since each non-repeated/predicted tst
 * and branch costs around 10 clocks, an elaborate calculation would slow
 * down all longer copies and only benefit a small portion of medium
 * sized copies.  Rather than incur such cost, we chose fixed transition
 * points for each of the alignment choices.
*
* For the inner loop, here is a comparison of the per cache line
* costs for each alignment when src&dst are in cache:
*
* byte aligned: 108 clocks slower for non-FPBLK
* half aligned: 44 clocks slower for non-FPBLK
* word aligned: 12 clocks slower for non-FPBLK
* long aligned: 4 clocks >>faster<< for non-FPBLK
*
* The long aligned loop runs faster because it does no prefetching.
* That wins if the data is not in cache or there is too little
* data to gain much benefit from prefetching. But when there
* is more data and that data is not in cache, failing to prefetch
* can run much slower. In addition, there is a 2 Kbyte store queue
* which will cause the non-FPBLK inner loop to slow for larger copies.
* The exact tradeoff is strongly load and application dependent, with
* increasing risk of a customer visible performance regression if the
* non-FPBLK code is used for larger copies. Studies of synthetic in-cache
* vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
* upper limit for the non-FPBLK code. To minimize performance regression
* risk while still gaining the primary benefits of the improvements to
* the non-FPBLK code, we set an upper bound of 1024 bytes for the various
* hw_copy_limit_*. Later experimental studies using different values
* of hw_copy_limit_* can be used to make further adjustments if
* appropriate.
*
* hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
* hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
* hw_copy_limit_4 = src and dst are word aligned but not longword aligned
* hw_copy_limit_8 = src and dst are longword aligned
*
* To say that src and dst are word aligned means that after
* some initial alignment activity of moving 0 to 3 bytes,
* both the src and dst will be on word boundaries so that
* word loads and stores may be used.
*
 * Recommended initial values as of Mar 2004 (based on testing with
 * Cheetah+ (900MHz), Cheetah++ (1200MHz), and Jaguar (1050MHz)):
* hw_copy_limit_1 = 256
* hw_copy_limit_2 = 512
* hw_copy_limit_4 = 1024
* hw_copy_limit_8 = 1024 (or 1536 on some systems)
*
*
* If hw_copy_limit_? is set to zero, then use of FPBLK copy is
* disabled for that alignment choice.
 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256),
 * the value of VIS_COPY_THRESHOLD is used.
 * It is not envisioned that hw_copy_limit_? will be changed in the field.
* It is provided to allow for disabling FPBLK copies and to allow
* easy testing of alternate values on future HW implementations
* that might have different cache sizes, clock rates or instruction
* timing rules.
*
 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
 * threshold to speed up all shorter copies (less than 256 bytes).  That
 * saves an alignment test, memory reference, and enabling test
 * for all short copies, or an estimated 24 clocks.
*
* The order in which these limits are checked does matter since each
* non-predicted tst and branch costs around 10 clocks.
* If src and dst are randomly selected addresses,
* 4 of 8 will not be alignable.
* 2 of 8 will be half word alignable.
* 1 of 8 will be word alignable.
* 1 of 8 will be long word alignable.
 * But, tests on running kernels show that the src and dst addresses
 * passed to the copy code are typically not on random alignments.
 * Structure copies and
* copies of larger data sizes are often on long word boundaries.
* So we test the long word alignment case first, then
* the byte alignment, then halfword, then word alignment.
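 *
 * As an illustrative sketch only (not taken from the code below), the
 * alignability tests and limit checks can be expressed in C roughly as
 * follows; two addresses are alignable to a given size when the
 * corresponding low-order bits of (src ^ dst) are zero:
 *
 *	diff = (uintptr_t)src ^ (uintptr_t)dst;
 *	if ((diff & 7) == 0)			! long word alignable
 *		limit = hw_copy_limit_8;
 *	else if ((diff & 1) != 0)		! not even halfword alignable
 *		limit = hw_copy_limit_1;
 *	else if ((diff & 3) != 0)		! halfword alignable only
 *		limit = hw_copy_limit_2;
 *	else					! word alignable
 *		limit = hw_copy_limit_4;
 *	if (limit == 0 || length <= limit)
 *		use the plain (non-FPBLK) loops;
 *	else
 *		use the FPBLK copy;
 *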
*
* Several times, tests for length are made to split the code
* into subcases. These tests often allow later tests to be
* avoided. For example, within the non-FPBLK copy, we first
* check for tiny copies of 3 bytes or less. That allows us
* to use a 4-way unrolled loop for the general byte copy case
* without a test on loop entry.
* We subdivide the non-FPBLK case further into CHKSIZE bytes and less
 * vs longer cases.  For the really short case, we don't attempt to
 * align src and dst.  We try to minimize special case tests in
* the shortest loops as each test adds a significant percentage
* to the total time.
*
* For the medium sized cases, we allow ourselves to adjust the
* src and dst alignment and provide special cases for each of
* the four adjusted alignment cases. The CHKSIZE that was used
* to decide between short and medium size was chosen to be 39
* as that allows for the worst case of 7 bytes of alignment
* shift and 4 times 8 bytes for the first long word unrolling.
* That knowledge saves an initial test for length on entry into
* the medium cases. If the general loop unrolling factor were
 * to be increased, this number would also need to be adjusted.
*
* For all cases in the non-FPBLK code where it is known that at
* least 4 chunks of data are available for movement, the
* loop is unrolled by four. This 4-way loop runs in 8 clocks
* or 2 clocks per data element. Due to limitations of the
* branch instruction on Cheetah, Jaguar, and Panther, the
* minimum time for a small, tight loop is 3 clocks. So
* the 4-way loop runs 50% faster than the fastest non-unrolled
* loop.
*
 * Instruction alignment is forced by use of .align 16 directives
 * and nops which are not executed in the code.  This
 * combination of operations shifts the alignment of following
 * loops to ensure that loops are aligned so that their instructions
* fall within the minimum number of 4 instruction fetch groups.
* If instructions are inserted or removed between the .align
* instruction and the unrolled loops, then the alignment needs
* to be readjusted. Misaligned loops can add a clock per loop
* iteration to the loop timing.
*
* In a few cases, code is duplicated to avoid a branch. Since
* a non-predicted tst and branch takes 10 clocks, this savings
* is judged an appropriate time-space tradeoff.
*
* Within the FPBLK-code, the prefetch method in the inner
* loop needs to be explained as it is not standard. Two
* prefetches are issued for each cache line instead of one.
* The primary one is at the maximum reach of 8 cache lines.
* Most of the time, that maximum prefetch reach gives the
* cache line more time to reach the processor for systems with
* higher processor clocks. But, sometimes memory interference
* can cause that prefetch to be dropped. Putting a second
* prefetch at a reach of 5 cache lines catches the drops
* three iterations later and shows a measured improvement
* in performance over any similar loop with a single prefetch.
* The prefetches are placed in the loop so they overlap with
* non-memory instructions, so that there is no extra cost
* when the data is already in-cache.
*
*/
/*
* Notes on preserving existing fp state and on membars.
*
* When a copyOP decides to use fp we may have to preserve existing
* floating point state. It is not the caller's state that we need to
* preserve - the rest of the kernel does not use fp and, anyway, fp
* registers are volatile across a call. Some examples:
*
 * - userland has fp state and is interrupted (device interrupt
 *   or trap) and within the interrupt/trap handling we use
 *   bcopy()
* - another (higher level) interrupt or trap handler uses bcopy
* while a bcopy from an earlier interrupt is still active
* - an asynchronous error trap occurs while fp state exists (in
* userland or in kernel copy) and the tl0 component of the handling
* uses bcopy
* - a user process with fp state incurs a copy-on-write fault and
* hwblkpagecopy always uses fp
*
* We therefore need a per-call place in which to preserve fp state -
* using our stack is ideal (and since fp copy cannot be leaf optimized
* because of calls it makes, this is no hardship).
*
* In Cheetah BLD is blocking, #LoadLoad/#LoadStore/#StoreStore are
* nops (those semantics always apply) and #StoreLoad is implemented
* as a membar #Sync.
*
* It is possible that the owner of the fp state has a block load or
* block store still "in flight" at the time we come to preserve that
* state. Block loads are blocking in Cheetah pipelines so we do not
* need to sync with them. In preserving fp regs we will use block stores
* (which are not blocking in Cheetah pipelines) so we require a membar #Sync
* after storing state (so that our subsequent use of those registers
* does not modify them before the block stores complete); this membar
* also serves to sync with block stores the owner of the fp state has
* initiated.
*
 * When we have finished fp copy (with its repeated block stores)
* we must membar #Sync so that our block stores may complete before
* we either restore the original fp state into the fp registers or
* return to a caller which may initiate other fp operations that could
* modify the fp regs we used before the block stores complete.
*
* Synchronous faults (eg, unresolvable DMMU miss) that occur while
* t_lofault is not NULL will not panic but will instead trampoline
* to the registered lofault handler. There is no need for any
* membars for these - eg, our store to t_lofault will always be visible to
* ourselves and it is our cpu which will take any trap.
*
* Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
* while t_lofault is not NULL will also not panic. Since we're copying
* to or from userland the extent of the damage is known - the destination
* buffer is incomplete. So trap handlers will trampoline to the lofault
* handler in this case which should take some form of error action to
* avoid using the incomplete buffer. The trap handler also flags the
* fault so that later return-from-trap handling (for the trap that brought
 * this thread into the kernel in the first place) can notify the process
 * of the fault.
*
* Asynchronous faults (eg, uncorrectable ECC error from memory) can
* result in deferred error traps - the trap is taken sometime after
* the event and the trap PC may not be the PC of the faulting access.
 * Delivery of such pending traps can be forced by a membar #Sync, acting
 * as an error barrier.  To preserve the kernel/copy-data
 * separation described in the preceding paragraph we must force delivery
 * of deferred traps affecting kernel state before we install a lofault
 * handler (if we interpose a new lofault handler on an existing one there
 * is no need to repeat this), and we must force delivery of deferred
 * errors affecting the lofault-protected region before we clear t_lofault.
 * Failure to do so results in lost kernel state being interpreted as
 * affecting copy data, or in an error affecting copy data being
 * interpreted as lost kernel state.
*
* Since the copy operations may preserve and later restore floating
* point state that does not belong to the caller (see examples above),
* we must be careful in how we do this in order to prevent corruption
* of another program.
*
* To make sure that floating point state is always saved and restored
* correctly, the following "big rules" must be followed when the floating
* point registers will be used:
*
* 1. %l6 always holds the caller's lofault handler. Also in this register,
* Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
* use. Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
* lofault handler was set coming in.
*
* 2. The FPUSED flag indicates that all FP state has been successfully stored
* on the stack. It should not be set until this save has been completed.
*
* 3. The FPUSED flag should not be cleared on exit until all FP state has
* been restored from the stack. If an error occurs while restoring
* data from the stack, the error handler can check this flag to see if
* a restore is necessary.
*
* 4. Code run under the new lofault handler must be kept to a minimum. In
* particular, any calls to FP_ALLOWMIGRATE, which could result in a call
* to kpreempt(), should not be made until after the lofault handler has
* been restored.
*/
/*
* VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
* to "break even" using FP/VIS-accelerated memory operations.
* The FPBLK code assumes a minimum number of bytes are available
* to be moved on entry. Check that code carefully before
* reducing VIS_COPY_THRESHOLD below 256.
*/
/*
* This shadows sys/machsystm.h which can't be included due to the lack of
* _ASM guards in include files it references. Change it here, change it there.
*/
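/*
 * Minimal sketch of the definition this comment refers to, assuming the
 * value of 256 implied by the discussion above (hw_copy_limit_? values
 * below VIS_COPY_THRESHOLD (256) are rounded up to it).
 */
#define	VIS_COPY_THRESHOLD	256	/* assumed value; see comments above */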
/*
* TEST for very short copies
* Be aware that the maximum unroll for the short unaligned case
* is SHORTCOPY+1
*/
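/*
 * Sketch of the assumed sizes: the pseudo-code above takes the special
 * finish-up path for counts of 3 or less (so the byte loop can be
 * unrolled SHORTCOPY+1 = 4 ways), and the optimization notes state that
 * CHKSIZE was chosen to be 39.
 */
#define	SHORTCOPY	3	/* assumed from "if (count <= 3)" above */
#define	CHKSIZE		39	/* value stated in the optimization notes */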
/*
* Indicates that we're to trampoline to the error handler.
* Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
* kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
*/
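/*
 * Sketch of the flag encoding implied by the "big rules" comment below
 * and by the "t_lofault = (%l6 & ~3)" pseudo-code above: both flags live
 * in the low two bits of the saved lofault value.
 */
#define	FPUSED_FLAG	1	/* assumed; "Bit 1 (FPUSED_FLAG)" */
#define	TRAMP_FLAG	2	/* assumed; "%l6 |= 2" in the pseudo-code */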
/*
* Number of outstanding prefetches.
* Testing with 1200 MHz Cheetah+ and Jaguar gives best results with
* two prefetches, one with a reach of 8*BLOCK_SIZE+8 and one with a
 * reach of 5*BLOCK_SIZE. The double prefetch gives a typical improvement
* of 5% for large copies as compared to a single prefetch. The reason
* for the improvement is that with Cheetah and Jaguar, some prefetches
* are dropped due to the prefetch queue being full. The second prefetch
* reduces the number of cache lines that are dropped.
* Do not remove the double prefetch or change either CHEETAH_PREFETCH
* or CHEETAH_2ND_PREFETCH without extensive performance tests to prove
* there is no loss of performance.
*/
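/*
 * Sketch of the prefetch depths described above: a primary prefetch with
 * a reach of 8 cache lines and a second prefetch with a reach of 5.
 */
#define	CHEETAH_PREFETCH	8	/* assumed; "reach of 8*BLOCK_SIZE+8" */
#define	CHEETAH_2ND_PREFETCH	5	/* assumed; "reach of 5*BLOCK_SIZE" */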
/*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and 2 64-bit temp locations.
 * All copy functions use two quadrants of fp registers; to assure a
 * block-aligned two block buffer in which to save we must reserve
 * three blocks on stack.  Not all functions preserve %fprs on stack
 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
*
* _______________________________________ <-- %fp + STACK_BIAS
 * | We may need to preserve 2 quadrants  |
 * | of fp regs, but since we do so with  |
 * | BST/BLD we need room in which to     |
 * | align to VIS_BLOCKSIZE bytes.  So    |
 * | this area is 3 * VIS_BLOCKSIZE.      | <-- - SAVED_FPREGS_OFFSET
* |-------------------------------------|
* | 8 bytes to save %fprs | <-- - SAVED_FPRS_OFFSET
* |-------------------------------------|
* | 8 bytes to save %gsr | <-- - SAVED_GSR_OFFSET
* ---------------------------------------
*/
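/*
 * Sketch of the frame layout pictured above, assuming VIS_BLOCKSIZE
 * (64 bytes) is provided by the included headers: three blocks for the
 * alignable fp register save area plus two 8-byte slots for %fprs and
 * %gsr, all offsets negative from %fp + STACK_BIAS.
 */
#define	SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)		/* assumed */
#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)	/* assumed */
#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)		/* assumed */
#define	HWCOPYFRAMESIZE		SAVED_GSR_OFFSET		/* assumed */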
/*
* Common macros used by the various versions of the block copy
* routines in this file.
*/
/*
* In FP copies if we do not have preserved data to restore over
* the fp regs we used then we must zero those regs to avoid
* exposing portions of the data to later threads (data security).
*
* Copy functions use either quadrants 1 and 3 or 2 and 4.
*
* FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
* FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
*
* The instructions below are quicker than repeated fzero instructions
* since they can dispatch down two fp pipelines.
*/
#define FZEROQ1Q3 \
#define FZEROQ2Q4 \
/*
* Used to save and restore in-use fp registers when we want to use FP
* and find fp already in use and copy size still large enough to justify
* the additional overhead of this save and restore.
*
* A membar #Sync is needed before save to sync fp ops initiated before
* the call to the copy function (by whoever has fp in use); for example
* an earlier block load to the quadrant we are about to save may still be
* "in flight". A membar #Sync is required at the end of the save to
* sync our block store (the copy code is about to begin ldd's to the
* first quadrant). Note, however, that since Cheetah pipeline block load
* is blocking we can omit the initial membar before saving fp state (they're
* commented below in case of future porting to a chip that does not block
* on block load).
*
* Similarly: a membar #Sync before restore allows the block stores of
* the copy operation to complete before we fill the quadrants with their
* original data, and a membar #Sync after restore lets the block loads
* of the restore complete before we return to whoever has the fp regs
* in use. To avoid repeated membar #Sync we make it the responsibility
* of the copy code to membar #Sync immediately after copy is complete
* and before using the BLD_*_FROMSTACK macro.
*/
#if !defined(lint)
/* membar #Sync */ ;\
/* membar #Sync - provided at copy completion */ ;\
/* membar #Sync */ ;\
/* membar #Sync - provided at copy completion */ ;\
#endif
/*
* FP_NOMIGRATE and FP_ALLOWMIGRATE. Prevent migration (or, stronger,
* prevent preemption if there is no t_lwp to save FP state to on context
* switch) before commencing a FP copy, and reallow it on completion or
* in error trampoline paths when we were using FP copy.
*
* Both macros may call other functions, so be aware that all outputs are
* forfeit after using these macros. For this reason we do not pass registers
* to use - we just use any outputs we want.
*
* For fpRAS we need to perform the fpRAS mechanism test on the same
* CPU as we use for the copy operation, both so that we validate the
* CPU we perform the copy on and so that we know which CPU failed
* if a failure is detected. Hence we need to be bound to "our" CPU.
 * This could be achieved through disabling preemption (and we do it that
* way for threads with no t_lwp) but for larger copies this may hold
* higher priority threads off of cpu for too long (eg, realtime). So we
* make use of the lightweight t_nomigrate mechanism where we can (ie, when
* we have a t_lwp).
*
* Pseudo code:
*
* FP_NOMIGRATE:
*
* if (curthread->t_lwp) {
* thread_nomigrate();
* } else {
* kpreempt_disable();
* }
*
* FP_ALLOWMIGRATE:
*
* if (curthread->t_lwp) {
* thread_allowmigrate();
* } else {
* kpreempt_enable();
* }
*/
nop ;\
nop ;\
label1: ;\
nop ;\
nop ;\
label1: ;\
nop ;\
/*
* Copy a block of storage, returning an error code if `from' or
* `to' takes a kernel pagefault which cannot be resolved.
* Returns errno value on pagefault error, 0 if all ok
*/
#if defined(lint)
/* ARGSUSED */
int
kcopy(const void *from, void *to, size_t count)
{ return(0); }
#else /* lint */
.seg ".text"
.align 4
.kcopy_2:
.kcopy_4:
.kcopy_8:
/*
* We got here because of a fault during bcopy_more, called from kcopy or bcopy.
* Errno value is in %g1. bcopy_more uses fp quadrants 1 and 3.
*/
.copyerr:
4:
	!
	! Need to cater for the different expectations of kcopy
	! and bcopy.  kcopy will *always* set a t_lofault handler;
	! if it fires, we're expected to just return the error code
	! and *not* to invoke any existing error handler.  As far as
	! bcopy is concerned, we only set t_lofault if there was an
	! existing lofault handler; in that case we're expected to
	! invoke it after restoring the previous t_lofault value.
	!
1:
3:
!
! We're here via bcopy. There *must* have been an error handler
! in place otherwise we would have died a nasty death already.
!
jmp %l6 ! goto real handler
restore %g0, 0, %o0 ! dispose of copy window
/*
 * We got here because of a fault while restoring fp state in .copyerr.
 * We can't safely continue, so we panic with the message below.
 */
.asciz "Unable to restore fp state after copy operation"
.align 4
.copyerr2:
/*
* We got here because of a fault during a small kcopy or bcopy.
* No floating point registers are used by the small copies.
* Errno value is in %g1.
*/
1:
3:
#endif /* lint */
/*
* Copy a block of storage - must not overlap (from + len <= to).
* Registers: l6 - saved t_lofault
* (for short copies, o4 - saved t_lofault)
*
* Copy a page of memory.
* Assumes double word alignment and a count >= 256.
*/
#if defined(lint)
/* ARGSUSED */
void
bcopy(const void *from, void *to, size_t count)
{}
#else /* lint */
.bcopy_2:
.bcopy_4:
.bcopy_8:
.align 16
.align 16
.align 16
.bc_med:
!
!
.align 16
!
!
.align 16
!
!
/*
* The _more entry points are not intended to be used directly by
* any caller from outside this file. They are provided to allow
* profiling and dtrace of the portions of the copy code that uses
* the floating point registers.
* This entry is particularly important as DTRACE (at least as of
* 4/2004) does not support leaf functions.
*/
!
! We've already captured whether t_lofault was zero on entry.
! We need to mark ourselves as being from bcopy since both
	! kcopy and bcopy use the same code path. If TRAMP_FLAG is set,
	! the error handler knows it must trampoline to the previously
	! installed handler instead of simply returning.
!
/*
* Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
* Also, use of FP registers has been tested to be enabled
*/
.do_copy:
2:
#if CHEETAH_PREFETCH > 4
#endif
#if CHEETAH_PREFETCH > 5
#endif
#if CHEETAH_PREFETCH > 6
#endif
#if CHEETAH_PREFETCH > 7
#endif
.align 16
1:
3:
2:
.bcb_exit:
4:
2:
#endif /* lint */
/*
* Block copy with possibly overlapped operands.
*/
#if defined(lint)
/*ARGSUSED*/
void
ovbcopy(const void *from, void *to, size_t count)
{}
#else /* lint */
retl ! return
1:
!
!
.ov_fwd:
retl ! return
!
!
.ov_bkwd:
retl ! return
#endif /* lint */
/*
* hwblkpagecopy()
*
* Copies exactly one page. This routine assumes the caller (ppcopy)
* has already disabled kernel preemption and has checked
* use_hw_bcopy. Preventing preemption also prevents cpu migration.
*/
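/*
 * Illustrative sketch only of the caller contract described above
 * (ppcopy is the real caller; the variable names here are hypothetical):
 *
 *	kpreempt_disable();			! also prevents migration
 *	if (use_hw_bcopy)
 *		hwblkpagecopy(src_va, dst_va);
 *	else
 *		bcopy(src_va, dst_va, PAGESIZE);
 *	kpreempt_enable();
 */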
#ifdef lint
/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{ }
#else /* lint */
#if CHEETAH_PREFETCH > 4
#endif
#if CHEETAH_PREFETCH > 5
#endif
#if CHEETAH_PREFETCH > 6
#endif
#if CHEETAH_PREFETCH > 7
#endif
.align 16
2:
ba 3f
2: FZEROQ1Q3
#endif /* lint */
/*
* Transfer data to and from user space -
* Note that these routines can cause faults
* It is assumed that the kernel has nothing at
* less than KERNELBASE in the virtual address space.
*
 * Note that copyin(9F) and copyout(9F) are part of the
 * DDI/DKI, which specifies that they return '-1' on error and '0' on
 * success; the errno describing the failure is not visible to the caller.
 *
 * Sigh.
 *
 * So there are two extremely similar routines - xcopyin() and xcopyout()
* which return the errno that we've faithfully computed. This
* allows other callers (e.g. uiomove(9F)) to work correctly.
* Given that these are used pretty heavily, we expand the calling
* sequences inline for all flavours (rather than making wrappers).
*
* There are also stub routines for xcopyout_little and xcopyin_little,
* which currently are intended to handle requests of <= 16 bytes from
* do_unaligned. Future enhancement to make them handle 8k pages efficiently
* is left as an exercise...
*/
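/*
 * Illustrative sketch only of how callers typically consume the two
 * return conventions described above (the EFAULT substitution is an
 * assumption, not taken from this file):
 *
 *	if (copyin(uaddr, kaddr, len) != 0)
 *		return (EFAULT);		! reason for failure is lost
 *
 *	error = xcopyin(uaddr, kaddr, len);	! 0 or the errno
 *	if (error != 0)
 *		return (error);			! what uiomove(9F) needs
 */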
/*
* Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
*
* General theory of operation:
*
* The only difference between copy{in,out} and
* xcopy{in,out} is in the error handling routine they invoke
* when a memory access error occurs. xcopyOP returns the errno
* while copyOP returns -1 (see above). copy{in,out}_noerr set
* a special flag (by oring the TRAMP_FLAG into the fault handler address)
* if they are called with a fault handler already in place. That flag
* causes the default handlers to trampoline to the previous handler
* upon an error.
*
 * None of the copyops routines grabs a register window until it's
 * decided that we need to do a HW block copy operation.  This saves a
 * window save/restore for the shorter copies.
 *
 * This code uses a set of 4 limits (hw_copy_limit_1/2/4/8) to decide
 * when a copy is large enough to be worth handing to the FPBLK code.
 * If the value for a particular limit is zero, the copy will be performed
 * by the plain copy loops rather than FPBLK.
*
* See the description of bcopy above for more details of the
* data copying algorithm and the default limits.
*
*/
/*
* Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
*/
#if defined(lint)
#else /* lint */
/*
* We save the arguments in the following registers in case of a fault:
* kaddr - %l1
* uaddr - %l2
* count - %l3
*/
/*
* Generic copyio fault handler. This is the first line of defense when a
* fault occurs in (x)copyin/(x)copyout. In order for this to function
* properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
* This allows us to share common code for all the flavors of the copy
* operations, including the _noerr versions.
*
* Note that this function will restore the original input parameters before
* calling REAL_LOFAULT. So the real handler can vector to the appropriate
* member of the t_copyop structure, if needed.
*/
4:
1:
#endif
#if defined(lint)
/*ARGSUSED*/
int
copyout(const void *kaddr, void *uaddr, size_t count)
{ return (0); }
#else /* lint */
.align 16
.align 16
.align 16
.align 16
.co_med:
!
!
.align 16
!
!
.align 16
!
!
/*
* We got here because of a fault during short copyout.
*/
3:
/*
* The _more entry points are not intended to be used directly by
* any caller from outside this file. They are provided to allow
* profiling and dtrace of the portions of the copy code that uses
* the floating point registers.
* This entry is particularly important as DTRACE (at least as of
* 4/2004) does not support leaf functions.
*/
/*
* Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
*/
2:
#if CHEETAH_PREFETCH > 4
#endif
#if CHEETAH_PREFETCH > 5
#endif
#if CHEETAH_PREFETCH > 6
#endif
#if CHEETAH_PREFETCH > 7
#endif
.align 16
1:
3:
2:
4:
4:
1:
/*
* We got here because of a fault during copyout.
*/
2:
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyout(const void *kaddr, void *uaddr, size_t count)
{ return (0); }
#else /* lint */
/*
* We got here because of fault during xcopyout
* Errno value is in ERRNO
*/
2:
3:
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyout_little(const void *kaddr, void *uaddr, size_t count)
{ return (0); }
#else /* lint */
2:
#endif /* lint */
/*
* Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
*/
#if defined(lint)
/*ARGSUSED*/
int
copyin(const void *uaddr, void *kaddr, size_t count)
{ return (0); }
#else /* lint */
.copyin_2:
.copyin_4:
.copyin_8:
.align 16
.align 16
.align 16
.align 16
.ci_med:
!
!
.align 16
!
!
.align 16
!
!
3:
/*
* The _more entry points are not intended to be used directly by
* any caller from outside this file. They are provided to allow
* profiling and dtrace of the portions of the copy code that uses
* the floating point registers.
* This entry is particularly important as DTRACE (at least as of
* 4/2004) does not support leaf functions.
*/
/*
* Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
*/
2:
#if CHEETAH_PREFETCH > 4
#endif
#if CHEETAH_PREFETCH > 5
#endif
#if CHEETAH_PREFETCH > 6
#endif
#if CHEETAH_PREFETCH > 7
#endif
.align 16
1:
3:
2:
4:
4:
1:
/*
* We got here because of a fault during copyin
*/
2:
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyin(const void *uaddr, void *kaddr, size_t count)
{ return (0); }
#else /* lint */
/*
* We got here because of fault during xcopyin
* Errno value is in ERRNO
*/
2:
3:
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyin_little(const void *uaddr, void *kaddr, size_t count)
{ return (0); }
#else /* lint */
2:
#endif /* lint */
/*
* Copy a block of storage - must not overlap (from + len <= to).
* No fault handler installed (to be called under on_fault())
*/
#if defined(lint)
/* ARGSUSED */
void
copyin_noerr(const void *uaddr, void *kaddr, size_t count)
{}
#else /* lint */
#endif /* lint */
/*
* Copy a block of storage - must not overlap (from + len <= to).
* No fault handler installed (to be called under on_fault())
*/
#if defined(lint)
/* ARGSUSED */
void
copyout_noerr(const void *kaddr, void *uaddr, size_t count)
{}
#else /* lint */
#endif /* lint */
/*
* hwblkclr - clears block-aligned, block-multiple-sized regions that are
* longer than 256 bytes in length using spitfire's block stores. If
* the criteria for using this routine are not met then it calls bzero
* and returns 1. Otherwise 0 is returned indicating success.
* Caller is responsible for ensuring use_hw_bzero is true and that
* kpreempt_disable() has been called.
*/
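/*
 * Sketch (illustrative only) of the entry criteria described above:
 *
 *	if (((uintptr_t)addr & (VIS_BLOCKSIZE - 1)) != 0 ||
 *	    (len & (VIS_BLOCKSIZE - 1)) != 0 || len <= 256) {
 *		bzero(addr, len);		! criteria not met
 *		return (1);
 *	}
 *	... clear the region using block stores ...
 *	return (0);
 */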
#ifdef lint
/*ARGSUSED*/
int
hwblkclr(void *addr, size_t len)
{
	return(0);
}
#else /* lint */
.pz_zinst:
bz,a .pz_finished
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
void
hw_pa_bcopy32(uint64_t src, uint64_t dst)
{}
#else /*!lint */
/*
* Copy 32 bytes of data from src (%o0) to dst (%o1)
* using physical addresses.
*/
#endif /* lint */
#if defined(lint)
int use_hw_bcopy = 1;
int use_hw_bzero = 1;
#else /* !lint */
	.word	1			! use_hw_bcopy
	.word	1			! use_hw_bzero
	.word	0			! hw_copy_limit_1
	.word	0			! hw_copy_limit_2
	.word	0			! hw_copy_limit_4
	.word	0			! hw_copy_limit_8
.align 64
.section ".text"
#endif /* !lint */