/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>
#include <sys/rockasi.h>
#if !defined(lint)
#include "assym.h"
#endif /* lint */
/*
* VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
* to "break even" using FP/VIS-accelerated memory operations.
* The FPBLK code assumes a minimum number of bytes are available
* to be moved on entry. Check that code carefully before
* reducing VIS_COPY_THRESHOLD below 256.
*/
/*
* This shadows sys/machsystm.h which can't be included due to
* the lack of _ASM guards in include files it references.
* Change it here, change it there.
*/
#define VIS_COPY_THRESHOLD 256
/*
* TEST for very short copies
* Be aware that the maximum unroll for the short unaligned case
* is SHORTCOPY+1
*/
#define SHORTCOPY 3
#define CHKSIZE 39
/*
 * TRAMP_FLAG indicates that we are to trampoline to the previous
 * error handler. Entry points bcopy, copyin_noerr, and copyout_noerr
 * use this flag; kcopy, copyout, xcopyout, copyin, and xcopyin do not.
 */
#define FPUSED_FLAG 1
#define TRAMP_FLAG 2
#define MASK_FLAGS 3
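/*
 * A sketch of the flag convention (C pseudocode, not literal code):
 * while a copy is in progress, the flags above are OR'd into the saved
 * t_lofault value held in a register (%l6, or %o4 for the small copies)
 * and stripped with MASK_FLAGS before t_lofault is restored. This works
 * because t_lofault holds an instruction address, which is at least
 * 4-byte aligned, leaving the low two bits free:
 *
 *	saved = curthread->t_lofault;
 *	if (fp_registers_in_use)
 *		saved |= FPUSED_FLAG;
 *	if (must_trampoline_to_prior_handler)
 *		saved |= TRAMP_FLAG;
 *	...
 *	curthread->t_lofault = saved & ~MASK_FLAGS;	// restore on exit
 */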
/*
* LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault
* handler was set
*/
#define LOFAULT_SET 2
/*
* Number of outstanding prefetches.
* Testing with 1200 MHz Cheetah+ and Jaguar gives best results with
* two prefetches, one with a reach of 8*BLOCK_SIZE+8 and one with a
 * reach of 5*BLOCK_SIZE. The double prefetch gives a typical improvement
* of 5% for large copies as compared to a single prefetch. The reason
* for the improvement is that with Cheetah and Jaguar, some prefetches
* are dropped due to the prefetch queue being full. The second prefetch
* reduces the number of cache lines that are dropped.
* Do not remove the double prefetch or change either FIRST_PREFETCH
* or SECOND_PREFETCH without extensive performance tests to prove
* there is no loss of performance.
 * XXX: For ROCK, the prefetch depth can be up to 16, but we are
 * sticking with 8 for now pending further clarity on this.
*/
#define FIRST_PREFETCH 8
#define SECOND_PREFETCH 5
#define VIS_BLOCKSIZE 64
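/*
 * Worked example of the resulting prefetch distances (a reading aid,
 * not new tuning): with VIS_BLOCKSIZE == 64, the two prefetch streams
 * in the block-copy loops below reach 8*64 + 8 = 520 bytes and
 * 5*64 = 320 bytes ahead of the current source pointer (the "+ 8"
 * appears at the FIRST_PREFETCH sites).
 */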
/*
 * Size of stack frame in order to accommodate a 64-byte aligned
* floating-point register save area and 2 64-bit temp locations.
 * All copy functions use two quadrants of fp registers; to ensure a
 * block-aligned two-block buffer in which to save them we must reserve
 * three blocks on stack. Not all functions preserve %fprs on stack
 * or need to preserve %gsr, but we use HWCOPYFRAMESIZE for all.
*
* _______________________________________ <-- %fp + STACK_BIAS
* | We may need to preserve 2 quadrants |
* | of fp regs, but since we do so with |
* | BST/BLD we need room in which to |
* | align to VIS_BLOCKSIZE bytes. So |
* | this area is 3 * VIS_BLOCKSIZE. | <-- - SAVED_FPREGS_OFFSET
* |-------------------------------------|
* | 8 bytes to save %fprs | <-- - SAVED_FPRS_OFFSET
* |-------------------------------------|
* | 8 bytes to save %gsr | <-- - SAVED_GSR_OFFSET
* ---------------------------------------
*/
#define HWCOPYFRAMESIZE ((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
#define SAVED_FPREGS_OFFSET (VIS_BLOCKSIZE * 3)
#define SAVED_FPREGS_ADJUST ((VIS_BLOCKSIZE * 2) - 1)
#define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 8)
#define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 8)
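/*
 * Worked example of the numbers above (a sketch, assuming the usual
 * VIS_BLOCKSIZE of 64): HWCOPYFRAMESIZE = 64*3 + 2*8 = 208 bytes.
 * The save macros compute %fp + STACK_BIAS - SAVED_FPREGS_ADJUST
 * (i.e., subtract 127) and round down with "and ... -VIS_BLOCKSIZE",
 * which always yields a 64-byte aligned pointer whose two 64-byte
 * blocks fit inside the 3 * VIS_BLOCKSIZE region, no matter how %fp
 * itself is aligned.
 */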
#define ICACHE_LINE_SIZE 64
#define MEDIUM_MAX 255
#define MED_WMAX 256 /* max copy for medium word-aligned case */
#define MED_MAX 256 /* max copy for medium longword-aligned case */
#define PAGE_MASK 8191
#define ST_CACHE_ALIGN 127
#ifndef BSTORE_SIZE
#define BSTORE_SIZE 256 /* min copy size for block store */
#endif
/*
* Common macros used by the various versions of the block copy
* routines in this file.
*/
/*
* In FP copies if we do not have preserved data to restore over
* the fp regs we used then we must zero those regs to avoid
* exposing portions of the data to later threads (data security).
*
* Copy functions use either quadrants 1 and 3 or 2 and 4.
*
 * FZEROQ3Q4: Zero quadrants 3 and 4, i.e., %d32 - %d46 and %d48 - %d62
*
*/
#define FZEROQ3Q4 \
movxtod %g0, %d32 ;\
movxtod %g0, %d34 ;\
fsrc1 %d32, %d36 ;\
fsrc1 %d32, %d38 ;\
fsrc1 %d32, %d40 ;\
fsrc1 %d32, %d42 ;\
fsrc1 %d32, %d44 ;\
fsrc1 %d32, %d46 ;\
fsrc1 %d32, %d48 ;\
fsrc1 %d32, %d50 ;\
fsrc1 %d32, %d52 ;\
fsrc1 %d32, %d54 ;\
fsrc1 %d32, %d56 ;\
fsrc1 %d32, %d58 ;\
fsrc1 %d32, %d60 ;\
fsrc1 %d32, %d62
/*
* Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
* Used to save and restore in-use fp registers when we want to use FP
* and find fp already in use and copy size still large enough to justify
* the additional overhead of this save and restore.
*
* A membar #Sync is needed before save to sync fp ops initiated before
* the call to the copy function (by whoever has fp in use); for example
* an earlier block load to the quadrant we are about to save may still be
* "in flight". A membar #Sync is required at the end of the save to
* sync our block store (the copy code is about to begin ldd's to the
* first quadrant). Note, however, that since Cheetah pipeline block load
* is blocking we can omit the initial membar before saving fp state (they're
* commented below in case of future porting to a chip that does not block
* on block load).
*
* Similarly: a membar #Sync before restore allows the block stores of
* the copy operation to complete before we fill the quadrants with their
* original data, and a membar #Sync after restore lets the block loads
* of the restore complete before we return to whoever has the fp regs
* in use. To avoid repeated membar #Sync we make it the responsibility
* of the copy code to membar #Sync immediately after copy is complete
* and before using the BLD_*_FROMSTACK macro.
*/
#if !defined(lint)
#define BST_FPQ3Q4_TOSTACK(tmp1) \
/* membar #Sync */ ;\
add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
stda %d32, [tmp1]ASI_BLK_P ;\
add tmp1, VIS_BLOCKSIZE, tmp1 ;\
stda %d48, [tmp1]ASI_BLK_P ;\
membar #Sync
#define BLD_FPQ3Q4_FROMSTACK(tmp1) \
/* membar #Sync - provided at copy completion */ ;\
add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
ldda [tmp1]ASI_BLK_P, %d32 ;\
add tmp1, VIS_BLOCKSIZE, tmp1 ;\
ldda [tmp1]ASI_BLK_P, %d48 ;\
membar #Sync
#endif
/*
* FP_NOMIGRATE and FP_ALLOWMIGRATE. Prevent migration (or, stronger,
* prevent preemption if there is no t_lwp to save FP state to on context
* switch) before commencing a FP copy, and reallow it on completion or
* in error trampoline paths when we were using FP copy.
*
* Both macros may call other functions, so be aware that all outputs are
* forfeit after using these macros. For this reason we do not pass registers
* to use - we just use any outputs we want.
*
* For fpRAS we need to perform the fpRAS mechanism test on the same
* CPU as we use for the copy operation, both so that we validate the
* CPU we perform the copy on and so that we know which CPU failed
* if a failure is detected. Hence we need to be bound to "our" CPU.
 * This could be achieved by disabling preemption (and we do it that
 * way for threads with no t_lwp) but for larger copies this may hold
* higher priority threads off of cpu for too long (eg, realtime). So we
* make use of the lightweight t_nomigrate mechanism where we can (ie, when
* we have a t_lwp).
*
* Pseudo code:
*
* FP_NOMIGRATE:
*
* if (curthread->t_lwp) {
* thread_nomigrate();
* } else {
* kpreempt_disable();
* }
*
* FP_ALLOWMIGRATE:
*
* if (curthread->t_lwp) {
* thread_allowmigrate();
* } else {
* kpreempt_enable();
* }
*/
#define FP_NOMIGRATE(label1, label2) \
ldn [THREAD_REG + T_LWP], %o0 ;\
brz,a,pn %o0, label1/**/f ;\
ldsb [THREAD_REG + T_PREEMPT], %o1 ;\
call thread_nomigrate ;\
nop ;\
ba label2/**/f ;\
nop ;\
label1: ;\
inc %o1 ;\
stb %o1, [THREAD_REG + T_PREEMPT] ;\
label2:
#define FP_ALLOWMIGRATE(label1, label2) \
ldn [THREAD_REG + T_LWP], %o0 ;\
brz,a,pn %o0, label1/**/f ;\
ldsb [THREAD_REG + T_PREEMPT], %o1 ;\
call thread_allowmigrate ;\
nop ;\
ba label2/**/f ;\
nop ;\
label1: ;\
dec %o1 ;\
brnz,pn %o1, label2/**/f ;\
stb %o1, [THREAD_REG + T_PREEMPT] ;\
ldn [THREAD_REG + T_CPU], %o0 ;\
ldub [%o0 + CPU_KPRUNRUN], %o0 ;\
brz,pt %o0, label2/**/f ;\
nop ;\
call kpreempt ;\
rdpr %pil, %o0 ;\
label2:
/*
* Copy a block of storage, returning an error code if `from' or
* `to' takes a kernel pagefault which cannot be resolved.
* Returns errno value on pagefault error, 0 if all ok
*/
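/*
 * The entry sequence below (bcopy and copyout share the same shape)
 * picks a strategy from the mutual alignment of the buffers. Roughly,
 * as C pseudocode (a sketch of the logic, not literal code; the
 * hw_copy_limit_N tunables are set at boot):
 *
 *	diff = from ^ to;
 *	if (count <= VIS_COPY_THRESHOLD)
 *		goto small;
 *	if ((diff & 7) == 0)	limit = hw_copy_limit_8;
 *	else if (diff & 1)	limit = hw_copy_limit_1;
 *	else if (diff & 3)	limit = hw_copy_limit_2;
 *	else			limit = hw_copy_limit_4;
 *	if (limit == 0 || count <= limit)
 *		goto small;		// limit of 0 disables HW copy
 *	goto more;			// FP/VIS block copy
 */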
#if defined(lint)
/* ARGSUSED */
int
kcopy(const void *from, void *to, size_t count)
{ return(0); }
#else /* lint */
.seg ".text"
.align 4
ENTRY(kcopy)
cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
bleu,pt %ncc, .kcopy_small ! go to small copy cases
xor %o0, %o1, %o3 ! are src, dst alignable?
btst 7, %o3 !
bz,pt %ncc, .kcopy_8 ! check for longword alignment
nop
btst 1, %o3 !
bz,pt %ncc, .kcopy_2 ! check for half-word
nop
sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_1)], %o3
tst %o3
bz,pn %icc, .kcopy_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .kcopy_small ! go to small copy
nop
ba,pt %ncc, .kcopy_more ! otherwise go to large copy
nop
.kcopy_2:
btst 3, %o3 !
bz,pt %ncc, .kcopy_4 ! check for word alignment
nop
sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_2)], %o3
tst %o3
bz,pn %icc, .kcopy_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .kcopy_small ! go to small copy
nop
ba,pt %ncc, .kcopy_more ! otherwise go to large copy
nop
.kcopy_4:
! already checked longword, must be word aligned
sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_4)], %o3
tst %o3
bz,pn %icc, .kcopy_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .kcopy_small ! go to small copy
nop
ba,pt %ncc, .kcopy_more ! otherwise go to large copy
nop
.kcopy_8:
sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_8)], %o3
tst %o3
bz,pn %icc, .kcopy_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .kcopy_small ! go to small copy
nop
ba,pt %ncc, .kcopy_more ! otherwise go to large copy
nop
.kcopy_small:
sethi %hi(.sm_copyerr), %o5 ! sm_copyerr is lofault value
or %o5, %lo(.sm_copyerr), %o5
ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler
membar #Sync ! sync error barrier
ba,pt %ncc, .sm_do_copy ! common code
stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault
.kcopy_more:
save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
sethi %hi(.copyerr), %l7 ! copyerr is lofault value
or %l7, %lo(.copyerr), %l7
ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
membar #Sync ! sync error barrier
ba,pt %ncc, .do_copy ! common code
stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
/*
* We got here because of a fault during bcopy_more, called from kcopy or bcopy.
* Errno value is in %g1. bcopy_more uses fp quadrants 3 and 4.
*/
.copyerr:
set .copyerr2, %l0
membar #Sync ! sync error barrier
stn %l0, [THREAD_REG + T_LOFAULT] ! set t_lofault
btst FPUSED_FLAG, %l6
bz %ncc, 1f
and %l6, TRAMP_FLAG, %l0 ! copy trampoline flag to %l0
ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
wr %o2, 0, %gsr
ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
! Regs were saved only if both FEF and DU were set on entry
and %o3, FPRS_FEF|FPRS_DU, %o2
xorcc %o2, FPRS_FEF|FPRS_DU, %g0
bnz,pt %icc, 4f
nop
BLD_FPQ3Q4_FROMSTACK(%o2)
ba,pt %ncc, 1f
wr %o3, 0, %fprs ! restore fprs
4:
FZEROQ3Q4
wr %o3, 0, %fprs ! restore fprs
!
! Need to cater for the different expectations of kcopy
! and bcopy. kcopy will *always* set a t_lofault handler.
! If it fires, we're expected to just return the error code
! and *not* to invoke any existing error handler. As far as
! bcopy is concerned, we only set t_lofault if there was an
! existing lofault handler. In that case we're expected to
! invoke the previously existing handler after resetting the
! t_lofault value.
!
1:
andn %l6, MASK_FLAGS, %l6 ! turn trampoline flag off
membar #Sync ! sync error barrier
stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
FP_ALLOWMIGRATE(5, 6)
btst TRAMP_FLAG, %l0
bnz,pn %ncc, 3f
nop
ret
restore %g1, 0, %o0
3:
!
! We're here via bcopy. There *must* have been an error handler
! in place otherwise we would have died a nasty death already.
!
jmp %l6 ! goto real handler
restore %g0, 0, %o0 ! dispose of copy window
/*
* We got here because of a fault in .copyerr. We can't safely restore fp
* state, so we panic.
*/
fp_panic_msg:
.asciz "Unable to restore fp state after copy operation"
.align 4
.copyerr2:
set fp_panic_msg, %o0
call panic
nop
/*
* We got here because of a fault during a small kcopy or bcopy.
* No floating point registers are used by the small copies.
* Errno value is in %g1.
*/
.sm_copyerr:
1:
btst TRAMP_FLAG, %o4
membar #Sync
andn %o4, TRAMP_FLAG, %o4
bnz,pn %ncc, 3f
stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g1, %o0
3:
jmp %o4 ! goto real handler
mov %g0, %o0 !
SET_SIZE(kcopy)
#endif /* lint */
/*
* Copy a block of storage - must not overlap (from + len <= to).
* Registers: l6 - saved t_lofault
* (for short copies, o4 - saved t_lofault)
*
* Copy a page of memory.
* Assumes double word alignment and a count >= 256.
*/
#if defined(lint)
/* ARGSUSED */
void
bcopy(const void *from, void *to, size_t count)
{}
#else /* lint */
ENTRY(bcopy)
cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
bleu,pt %ncc, .bcopy_small ! go to small copy cases
xor %o0, %o1, %o3 ! are src, dst alignable?
btst 7, %o3 !
bz,pt %ncc, .bcopy_8 ! check for longword alignment
nop
btst 1, %o3 !
bz,pt %ncc, .bcopy_2 ! check for half-word
nop
sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_1)], %o3
tst %o3
bz,pn %icc, .bcopy_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .bcopy_small ! go to small copy
nop
ba,pt %ncc, .bcopy_more ! otherwise go to large copy
nop
.bcopy_2:
btst 3, %o3 !
bz,pt %ncc, .bcopy_4 ! check for word alignment
nop
sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_2)], %o3
tst %o3
bz,pn %icc, .bcopy_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .bcopy_small ! go to small copy
nop
ba,pt %ncc, .bcopy_more ! otherwise go to large copy
nop
.bcopy_4:
! already checked longword, must be word aligned
sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_4)], %o3
tst %o3
bz,pn %icc, .bcopy_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .bcopy_small ! go to small copy
nop
ba,pt %ncc, .bcopy_more ! otherwise go to large copy
nop
.bcopy_8:
sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_8)], %o3
tst %o3
bz,pn %icc, .bcopy_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .bcopy_small ! go to small copy
nop
ba,pt %ncc, .bcopy_more ! otherwise go to large copy
nop
.align 16
.bcopy_small:
ldn [THREAD_REG + T_LOFAULT], %o4 ! save t_lofault
tst %o4
bz,pt %icc, .sm_do_copy
nop
sethi %hi(.sm_copyerr), %o5
or %o5, %lo(.sm_copyerr), %o5
membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! install new vector
or %o4, TRAMP_FLAG, %o4 ! error should trampoline
.sm_do_copy:
cmp %o2, SHORTCOPY ! check for really short case
bleu,pt %ncc, .bc_sm_left !
cmp %o2, CHKSIZE ! check for medium length cases
bgu,pn %ncc, .bc_med !
or %o0, %o1, %o3 ! prepare alignment check
andcc %o3, 0x3, %g0 ! test for alignment
bz,pt %ncc, .bc_sm_word ! branch to word aligned case
.bc_sm_movebytes:
sub %o2, 3, %o2 ! adjust count to allow cc zero test
.bc_sm_notalign4:
ldub [%o0], %o3 ! read byte
stb %o3, [%o1] ! write byte
subcc %o2, 4, %o2 ! reduce count by 4
ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes
add %o0, 4, %o0 ! advance SRC by 4
stb %o3, [%o1 + 1]
ldub [%o0 - 2], %o3
add %o1, 4, %o1 ! advance DST by 4
stb %o3, [%o1 - 2]
ldub [%o0 - 1], %o3
bgt,pt %ncc, .bc_sm_notalign4 ! loop til 3 or fewer bytes remain
stb %o3, [%o1 - 1]
add %o2, 3, %o2 ! restore count
.bc_sm_left:
tst %o2
bz,pt %ncc, .bc_sm_exit ! check for zero length
deccc %o2 ! reduce count for cc test
ldub [%o0], %o3 ! move one byte
bz,pt %ncc, .bc_sm_exit
stb %o3, [%o1]
ldub [%o0 + 1], %o3 ! move another byte
deccc %o2 ! check for more
bz,pt %ncc, .bc_sm_exit
stb %o3, [%o1 + 1]
ldub [%o0 + 2], %o3 ! move final byte
stb %o3, [%o1 + 2]
membar #Sync ! sync error barrier
andn %o4, TRAMP_FLAG, %o4
stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g0, %o0 ! return 0
.align 16
nop ! instruction alignment
! see discussion at start of file
.bc_sm_words:
lduw [%o0], %o3 ! read word
.bc_sm_wordx:
subcc %o2, 8, %o2 ! update count
stw %o3, [%o1] ! write word
add %o0, 8, %o0 ! update SRC
lduw [%o0 - 4], %o3 ! read word
add %o1, 8, %o1 ! update DST
bgt,pt %ncc, .bc_sm_words ! loop til done
stw %o3, [%o1 - 4] ! write word
addcc %o2, 7, %o2 ! restore count
bz,pt %ncc, .bc_sm_exit
deccc %o2
bz,pt %ncc, .bc_sm_byte
.bc_sm_half:
subcc %o2, 2, %o2 ! reduce count by 2
add %o0, 2, %o0 ! advance SRC by 2
lduh [%o0 - 2], %o3 ! read half word
add %o1, 2, %o1 ! advance DST by 2
bgt,pt %ncc, .bc_sm_half ! loop til done
sth %o3, [%o1 - 2] ! write half word
addcc %o2, 1, %o2 ! restore count
bz,pt %ncc, .bc_sm_exit
nop
.bc_sm_byte:
ldub [%o0], %o3
stb %o3, [%o1]
membar #Sync ! sync error barrier
andn %o4, TRAMP_FLAG, %o4
stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g0, %o0 ! return 0
.bc_sm_word:
subcc %o2, 4, %o2 ! update count
bgt,pt %ncc, .bc_sm_wordx
lduw [%o0], %o3 ! read word
addcc %o2, 3, %o2 ! restore count
bz,pt %ncc, .bc_sm_exit
stw %o3, [%o1] ! write word
deccc %o2 ! reduce count for cc test
ldub [%o0 + 4], %o3 ! load one byte
bz,pt %ncc, .bc_sm_exit
stb %o3, [%o1 + 4] ! store one byte
ldub [%o0 + 5], %o3 ! load second byte
deccc %o2
bz,pt %ncc, .bc_sm_exit
stb %o3, [%o1 + 5] ! store second byte
ldub [%o0 + 6], %o3 ! load third byte
stb %o3, [%o1 + 6] ! store third byte
.bc_sm_exit:
membar #Sync ! sync error barrier
andn %o4, TRAMP_FLAG, %o4
stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g0, %o0 ! return 0
.align 16
.bc_med:
xor %o0, %o1, %o3 ! setup alignment check
btst 1, %o3
bnz,pt %ncc, .bc_sm_movebytes ! unaligned
nop
btst 3, %o3
bnz,pt %ncc, .bc_med_half ! halfword aligned
nop
btst 7, %o3
bnz,pt %ncc, .bc_med_word ! word aligned
nop
.bc_med_long:
btst 3, %o0 ! check for
bz,pt %ncc, .bc_med_long1 ! word alignment
nop
.bc_med_long0:
ldub [%o0], %o3 ! load one byte
inc %o0
stb %o3,[%o1] ! store byte
inc %o1
btst 3, %o0
bnz,pt %ncc, .bc_med_long0
dec %o2
.bc_med_long1: ! word aligned
btst 7, %o0 ! check for long word
bz,pt %ncc, .bc_med_long2
nop
lduw [%o0], %o3 ! load word
add %o0, 4, %o0 ! advance SRC by 4
stw %o3, [%o1] ! store word
add %o1, 4, %o1 ! advance DST by 4
sub %o2, 4, %o2 ! reduce count by 4
!
! Now long word aligned and have at least 32 bytes to move
!
.bc_med_long2:
sub %o2, 31, %o2 ! adjust count to allow cc zero test
.bc_med_lmove:
ldx [%o0], %o3 ! read long word
stx %o3, [%o1] ! write long word
subcc %o2, 32, %o2 ! reduce count by 32
ldx [%o0 + 8], %o3 ! repeat for a total of 4 long words
add %o0, 32, %o0 ! advance SRC by 32
stx %o3, [%o1 + 8]
ldx [%o0 - 16], %o3
add %o1, 32, %o1 ! advance DST by 32
stx %o3, [%o1 - 16]
ldx [%o0 - 8], %o3
bgt,pt %ncc, .bc_med_lmove ! loop til 31 or fewer bytes left
stx %o3, [%o1 - 8]
addcc %o2, 24, %o2 ! restore count to long word offset
ble,pt %ncc, .bc_med_lextra ! check for more long words to move
nop
.bc_med_lword:
ldx [%o0], %o3 ! read long word
subcc %o2, 8, %o2 ! reduce count by 8
stx %o3, [%o1] ! write long word
add %o0, 8, %o0 ! advance SRC by 8
bgt,pt %ncc, .bc_med_lword ! loop til 7 or fewer bytes left
add %o1, 8, %o1 ! advance DST by 8
.bc_med_lextra:
addcc %o2, 7, %o2 ! restore rest of count
bz,pt %ncc, .bc_sm_exit ! if zero, then done
deccc %o2
bz,pt %ncc, .bc_sm_byte
nop
ba,pt %ncc, .bc_sm_half
nop
.align 16
.bc_med_word:
btst 3, %o0 ! check for
bz,pt %ncc, .bc_med_word1 ! word alignment
nop
.bc_med_word0:
ldub [%o0], %o3 ! load one byte
inc %o0
stb %o3,[%o1] ! store byte
inc %o1
btst 3, %o0
bnz,pt %ncc, .bc_med_word0
dec %o2
!
! Now word aligned and have at least 36 bytes to move
!
.bc_med_word1:
sub %o2, 15, %o2 ! adjust count to allow cc zero test
.bc_med_wmove:
lduw [%o0], %o3 ! read word
stw %o3, [%o1] ! write word
subcc %o2, 16, %o2 ! reduce count by 16
lduw [%o0 + 4], %o3 ! repeat for a total of 4 words
add %o0, 16, %o0 ! advance SRC by 16
stw %o3, [%o1 + 4]
lduw [%o0 - 8], %o3
add %o1, 16, %o1 ! advance DST by 16
stw %o3, [%o1 - 8]
lduw [%o0 - 4], %o3
bgt,pt %ncc, .bc_med_wmove ! loop til 15 or fewer bytes left
stw %o3, [%o1 - 4]
addcc %o2, 12, %o2 ! restore count to word offset
ble,pt %ncc, .bc_med_wextra ! check for more words to move
nop
.bc_med_word2:
lduw [%o0], %o3 ! read word
subcc %o2, 4, %o2 ! reduce count by 4
stw %o3, [%o1] ! write word
add %o0, 4, %o0 ! advance SRC by 4
bgt,pt %ncc, .bc_med_word2 ! loop til 3 or fewer bytes left
add %o1, 4, %o1 ! advance DST by 4
.bc_med_wextra:
addcc %o2, 3, %o2 ! restore rest of count
bz,pt %ncc, .bc_sm_exit ! if zero, then done
deccc %o2
bz,pt %ncc, .bc_sm_byte
nop
ba,pt %ncc, .bc_sm_half
nop
.align 16
.bc_med_half:
btst 1, %o0 ! check for
bz,pt %ncc, .bc_med_half1 ! half word alignment
nop
ldub [%o0], %o3 ! load one byte
inc %o0
stb %o3,[%o1] ! store byte
inc %o1
dec %o2
!
! Now half word aligned and have at least 38 bytes to move
!
.bc_med_half1:
sub %o2, 7, %o2 ! adjust count to allow cc zero test
.bc_med_hmove:
lduh [%o0], %o3 ! read half word
sth %o3, [%o1] ! write half word
subcc %o2, 8, %o2 ! reduce count by 8
lduh [%o0 + 2], %o3 ! repeat for a total of 4 halfwords
add %o0, 8, %o0 ! advance SRC by 8
sth %o3, [%o1 + 2]
lduh [%o0 - 4], %o3
add %o1, 8, %o1 ! advance DST by 8
sth %o3, [%o1 - 4]
lduh [%o0 - 2], %o3
bgt,pt %ncc, .bc_med_hmove ! loop til 7 or fewer bytes left
sth %o3, [%o1 - 2]
addcc %o2, 7, %o2 ! restore count
bz,pt %ncc, .bc_sm_exit
deccc %o2
bz,pt %ncc, .bc_sm_byte
nop
ba,pt %ncc, .bc_sm_half
nop
SET_SIZE(bcopy)
/*
* The _more entry points are not intended to be used directly by
* any caller from outside this file. They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
* the floating point registers.
* This entry is particularly important as DTRACE (at least as of
* 4/2004) does not support leaf functions.
*/
ENTRY(bcopy_more)
.bcopy_more:
save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
ldn [THREAD_REG + T_LOFAULT], %l6 ! save t_lofault
tst %l6
bz,pt %ncc, .do_copy
nop
sethi %hi(.copyerr), %o2
or %o2, %lo(.copyerr), %o2
membar #Sync ! sync error barrier
stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector
!
! We've already captured whether t_lofault was zero on entry.
! We need to mark ourselves as being from bcopy since both
! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
! and the saved lofault was zero, we won't reset lofault on
! returning.
!
or %l6, TRAMP_FLAG, %l6
/*
* Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
* Also, use of FP registers has been tested to be enabled
*/
.do_copy:
FP_NOMIGRATE(6, 7)
mov %i1, %g5 ! save dest addr start
mov %i2, %g2 ! save size
rd %fprs, %o2 ! check for unused fp
st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
! FPU enabled ? If not, enable it.
btst FPRS_FEF, %o2
bz,a,pt %icc, .do_blockcopy
wr %g0, FPRS_FEF, %fprs
! FPU enabled, but is Q3Q4 dirty ? If yes, save them.
btst FPRS_DU, %o2
bz,pn %icc, .do_blockcopy
nop
BST_FPQ3Q4_TOSTACK(%o2)
.do_blockcopy:
rd %gsr, %o2
stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
or %l6, FPUSED_FLAG, %l6
ba,a,pt %xcc, .medium
nop
#define REALSRC %i0
#define DST %i1
#define CNT %i2
#define SRC %i3
#define TMP %i5
.align 16
.medium:
neg DST, TMP
neg REALSRC, SRC
andcc TMP, 7, TMP ! bytes till DST 8 byte aligned
and SRC, 7, SRC ! bytes till REALSRC 8 byte aligned
bz %ncc, .med2
sub TMP, SRC, SRC
! -(bytes till REALSRC aligned after DST aligned)
! SRC = {-7, -6, ... 7} SRC > 0 => REALSRC overaligned
sub CNT, TMP, CNT ! update count
.med1:
ldub [REALSRC], %i4
deccc TMP
inc REALSRC
stb %i4, [DST]
bgu,pt %ncc, .med1
inc DST
! Now DST is 8-byte aligned. DST, REALSRC, CNT are current.
.med2:
andcc REALSRC, 0x3, %g0 ! test alignment
bnz,pt %ncc, .mediumsetup ! branch to skip aligned cases
! if src, dst not aligned
prefetch [REALSRC + (1 * VIS_BLOCKSIZE)], #n_reads
/*
* Handle all cases where src and dest are aligned on word
* or long word boundaries. Use unrolled loops for better
 * performance. This option wins over the standard large data
 * move when source and destination are in cache for medium
 * to short data moves.
*/
andcc REALSRC, 0x7, %g0 ! test word alignment
bz,pt %ncc, .medlword ! branch to long word aligned case
prefetch [REALSRC + (2 * VIS_BLOCKSIZE)], #n_reads
cmp CNT, MED_WMAX ! limit to store buffer size
bgu,pt %ncc, .mediumrejoin ! if above limit, rejoin main loop
nop
subcc CNT, 15, CNT ! adjust length to allow cc test
! for end of loop
ble,pt %ncc, .medw15 ! skip big loop if less than 16
prefetch [REALSRC + (3 * VIS_BLOCKSIZE)], #n_reads
/*
* no need to put prefetch in loop as prefetches have
* already been issued for maximum loop size
*/
.medw16:
ld [REALSRC], %i4 ! load
subcc CNT, 16, CNT ! decrement length count
stw %i4, [DST] ! and store
ld [REALSRC+4], SRC ! a block of 16 bytes
add REALSRC, 16, REALSRC ! increase src ptr by 16
stw SRC, [DST+4]
ld [REALSRC-8], %i4
add DST, 16, DST ! increase dst ptr by 16
stw %i4, [DST-8]
ld [REALSRC-4], SRC
bgu,pt %ncc, .medw16 ! repeat if at least 16 bytes left
stw SRC, [DST-4]
.medw15:
addcc CNT, 15, CNT ! restore count
bz,pt %ncc, .bcb_exit ! exit if finished
nop
cmp CNT, 8
blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left
nop !
ld [REALSRC], %i4 ! load 4 bytes
subcc CNT, 8, CNT ! decrease count by 8
stw %i4, [DST] ! and store 4 bytes
add REALSRC, 8, REALSRC ! increase src ptr by 8
ld [REALSRC-4], SRC ! load 4 bytes
add DST, 8, DST ! increase dst ptr by 8
stw SRC, [DST-4] ! and store 4 bytes
bz %ncc, .bcb_exit ! exit if finished
nop
.medw7: ! count is ge 1, less than 8
cmp CNT, 3 ! check for 4 bytes left
ble,pt %ncc, .medw3 ! skip if 3 or fewer bytes left
nop !
ld [REALSRC], %i4 ! load 4 bytes
sub CNT, 4, CNT ! decrease count by 4
add REALSRC, 4, REALSRC ! increase src ptr by 4
stw %i4, [DST] ! and store 4 bytes
add DST, 4, DST ! increase dst ptr by 4
tst CNT ! check for zero bytes left
bz %ncc, .bcb_exit ! exit if finished
nop
.medw3: ! count is known to be 1, 2, or 3
deccc CNT ! reduce count by one
ldub [REALSRC], SRC ! load one byte
bz,pt %ncc, .bcb_exit ! exit if last byte
stb SRC, [DST] ! store one byte
ldub [REALSRC+1], SRC ! load second byte
deccc CNT ! reduce count by one
bz,pt %ncc, .bcb_exit ! exit if last byte
stb SRC, [DST+1] ! store second byte
ldub [REALSRC+2], SRC ! load third byte
stb SRC, [DST+2] ! store third byte
ba,a,pt %ncc, .bcb_exit ! exit
nop
/*
 * Special case for handling when src and dest are both long word aligned
 * and the total data to move is between the small-copy cutoff and MED_MAX bytes
*/
.align 16
nop
.medlword: ! long word aligned
! length > SMALL_MAX
cmp CNT, MED_MAX ! limit to store buffer size
bgu,pt %ncc, .mediumrejoin ! if above limit, rejoin main loop
nop
subcc CNT, 31, CNT ! adjust length to allow cc test
! for end of loop
ble,pt %ncc, .medl31 ! skip big loop if less than 32
prefetch [REALSRC + (3 * VIS_BLOCKSIZE)], #n_reads
/*
* no need to put prefetch in loop as prefetches have
* already been issued for maximum loop size
*/
.medl32:
ldx [REALSRC], %i4 ! load
subcc CNT, 32, CNT ! decrement length count
stx %i4, [DST] ! and store
ldx [REALSRC+8], SRC ! a block of 32 bytes
add REALSRC, 32, REALSRC ! increase src ptr by 32
stx SRC, [DST+8]
ldx [REALSRC-16], %i4
add DST, 32, DST ! increase dst ptr by 32
stx %i4, [DST-16]
ldx [REALSRC-8], SRC
bgu,pt %ncc, .medl32 ! repeat if at least 32 bytes left
stx SRC, [DST-8]
.medl31:
addcc CNT, 16, CNT ! adjust remaining count
ble,pt %ncc, .medl15 ! skip if 15 or fewer bytes left
nop !
ldx [REALSRC], %i4 ! load and store 16 bytes
add REALSRC, 16, REALSRC ! increase src ptr by 16
stx %i4, [DST] !
sub CNT, 16, CNT ! decrease count by 16
ldx [REALSRC-8], SRC !
add DST, 16, DST ! increase dst ptr by 16
stx SRC, [DST-8]
.medl15:
addcc CNT, 15, CNT ! restore count
bz,pt %ncc, .bcb_exit ! exit if finished
nop
cmp CNT, 8
blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left
nop
ldx [REALSRC], %i4 ! load 8 bytes
add REALSRC, 8, REALSRC ! increase src ptr by 8
stx %i4, [DST] ! and store 8 bytes
subcc CNT, 8, CNT ! decrease count by 8
bz %ncc, .bcb_exit ! exit if finished
add DST, 8, DST ! increase dst ptr by 8
ba .medw7
nop
.align 16
nop
nop
nop
.mediumsetup:
prefetch [REALSRC + (2 * VIS_BLOCKSIZE)], #one_read
.mediumrejoin:
add REALSRC, 8, REALSRC ! prepare to round REALSRC upward
sethi %hi(0x1234567f), TMP ! For GSR.MASK
or TMP, 0x67f, TMP
cmp CNT, MEDIUM_MAX
bmask TMP, %g0, %g0
! Compute TMP (no of bytes that need copying using the main loop).
! First, compute for the medium case.
! Then, if large case, TMP is replaced by count for block alignment.
! Be careful not to read past end of REALSRC
! Currently, CNT is the actual count remaining
! SRC is how much sooner we'll cross the alignment
! boundary in REALSRC compared to in DST
!
! Examples: Let # denote bytes that should not be accessed
! Let x denote a byte already copied to align DST
! Let . and - denote bytes not yet copied
! Let | denote double alignment boundaries
!
! DST: ######xx|........|--------|..###### CNT = 18
! i1
!
! SRC = -3: REALSRC: ###xx...|.....---|-----..#|######## TMP = 8
! i0
!
! SRC = 0: REALSRC: ######xx|........|--------|..###### TMP = 16-8 = 8
! i0
!
! SRC = +1: REALSRC: #######x|x.......|.-------|-..##### TMP = 16-8 = 8
! i0
wr %g0, ASI_CACHE_SPARING_P, %asi
or %g0, -8, TMP
alignaddr REALSRC, %g0, REALSRC ! set GSR.ALIGN and align REALSRC
movrlz SRC, %g0, TMP ! subtract 8 from i2+i3 only if i3>=0
add TMP, CNT, TMP
add TMP, SRC, TMP
bleu %ncc, .med4
andn TMP, 7, TMP ! 8 byte aligned count
neg DST, TMP ! 'large' case
and TMP, VIS_BLOCKSIZE-1, TMP ! bytes till DST block aligned
.med4:
brgez,a SRC, .beginmedloop
ldda [REALSRC-8]%asi, %d32
add REALSRC, SRC, REALSRC ! back up REALSRC
.med5:
ldda [REALSRC]ASI_FL8_P, %d34
inc REALSRC
andcc REALSRC, 7, %g0
bnz %ncc, .med5
bshuffle %d32, %d34, %d32 ! shifts d32 left 1 byte and or's in d34
.beginmedloop:
tst TMP
bz %ncc, .endmedloop
sub CNT, TMP, CNT ! update count for later
! Main loop to write out doubles. Note: TMP & 7 == 0
ldd [REALSRC], %d34
subcc TMP, 8, TMP ! update local count
bz,pn %ncc, .medloop1
add REALSRC, 8, REALSRC ! update REALSRC
.medloop:
faligndata %d32, %d34, %d36
ldda [REALSRC]%asi, %d32
subcc TMP, 8, TMP ! update local count
add REALSRC, 16, REALSRC ! update REALSRC
std %d36, [DST]
bz,pn %ncc, .medloop2
faligndata %d34, %d32, %d38
ldda [REALSRC - 8]%asi, %d34
subcc TMP, 8, TMP ! update local count
std %d38, [DST + 8]
bnz,pt %ncc, .medloop
add DST, 16, DST ! update DST
.medloop1:
faligndata %d32, %d34, %d36
fsrc1 %d34, %d32
std %d36, [DST]
ba .endmedloop
add DST, 8, DST
.medloop2:
std %d38, [DST + 8]
sub REALSRC, 8, REALSRC
add DST, 16, DST
.endmedloop:
! Currently, i0 is pointing to the next double-aligned byte in REALSRC
! The 8 bytes starting at [REALSRC-8] are available in d32
! At least one, and possibly all, of these need to be written.
cmp CNT, VIS_BLOCKSIZE
bgu %ncc, .large ! otherwise, at most VIS_BLOCKSIZE bytes left
andcc SRC, 7, TMP ! Number of bytes needed to completely
! fill %d32 with good (unwritten) data.
bz %ncc, .medpst2
sub TMP, 8, SRC ! -(number of good bytes in %d32)
cmp CNT, 8
bl,a %ncc, .medpst3 ! Not enough bytes to fill %d32
add REALSRC, SRC, REALSRC ! Back up REALSRC
.medpst1:
deccc TMP
ldda [REALSRC]ASI_FL8_P, %d34
inc REALSRC
bgu %ncc, .medpst1
bshuffle %d32, %d34, %d32 ! shifts d32 left 1 byte and or's in d34
.medpst2:
subcc CNT, 8, CNT
std %d32, [DST]
bz %ncc, .mediumexit
add DST, 8, DST
.medpst3:
ldub [REALSRC], SRC
deccc CNT
inc REALSRC
stb SRC, [DST]
bgu %ncc, .medpst3
inc DST
.mediumexit:
ba,pt %ncc, .bcb_exit
nop
.align ICACHE_LINE_SIZE
.large:
! The following test for BSTORE_SIZE is used to decide whether
! to store data with a block store or with individual stores.
! The block store wins when the amount of data is so large
! that it causes other application data to be moved out
! of the L1 or L2 cache.
! On a Panther, block store can lose more often because block
! store forces the stored data to be removed from the L3 cache.
!
sethi %hi(BSTORE_SIZE), TMP
or TMP, %lo(BSTORE_SIZE), TMP
cmp CNT, TMP
bgu %ncc, .xlarge
! DST I/O Destination is 64-byte aligned
! REALSRC I/O Source is 8-byte aligned (and we've set GSR.ALIGN)
! %d32 I/O Already loaded with Source data from [REALSRC-8]
! CNT I/O Count (number of bytes that need to be written)
! SRC I Not written. If zero, then REALSRC is double aligned.
! TMP O The number of doubles that remain to be written.
! Load the rest of the current block
! Recall that REALSRC is further into source buffer
! than DST is into the destination buffer.
prefetch [DST + (0 * VIS_BLOCKSIZE)], #n_writes
prefetch [DST + (1 * VIS_BLOCKSIZE)], #n_writes
prefetch [DST + (2 * VIS_BLOCKSIZE)], #n_writes
ldda [REALSRC]%asi, %d34
prefetch [REALSRC + (3 * VIS_BLOCKSIZE)], #one_read
ldda [REALSRC + 0x08]%asi, %d36
faligndata %d32, %d34, %d48
ldda [REALSRC + 0x10]%asi, %d38
faligndata %d34, %d36, %d50
ldda [REALSRC + 0x18]%asi, %d40
faligndata %d36, %d38, %d52
ldda [REALSRC + 0x20]%asi, %d42
or %g0, -8, TMP ! if SRC >=0, TMP = -8
prefetch [REALSRC + (4 * VIS_BLOCKSIZE)], #one_read
faligndata %d38, %d40, %d54
ldda [REALSRC + 0x28]%asi, %d44
movrlz SRC, %g0, TMP ! if SRC < 0, TMP = 0 (needed later)
faligndata %d40, %d42, %d56
ldda [REALSRC + 0x30]%asi, %d46
faligndata %d42, %d44, %d58
ldda [REALSRC + 0x38]%asi, %d32
sub CNT, VIS_BLOCKSIZE, CNT
prefetch [REALSRC + (5 * VIS_BLOCKSIZE)], #one_read
add REALSRC, VIS_BLOCKSIZE, REALSRC ! update REALSRC
! Main loop. Write previous block. Load rest of current block.
! Some bytes will be loaded that won't yet be written.
.large1:
ldda [REALSRC]%asi, %d34
faligndata %d44, %d46, %d60
ldda [REALSRC + 0x08]%asi, %d36
faligndata %d46, %d32, %d62
std %d48, [DST]
std %d50, [DST+8]
std %d52, [DST+16]
std %d54, [DST+24]
std %d56, [DST+32]
std %d58, [DST+40]
std %d60, [DST+48]
std %d62, [DST+56]
sub CNT, VIS_BLOCKSIZE, CNT ! update count
prefetch [DST + (6 * VIS_BLOCKSIZE)], #n_writes
prefetch [DST + (3 * VIS_BLOCKSIZE)], #n_writes
add DST, VIS_BLOCKSIZE, DST ! update DST
ldda [REALSRC + 0x10]%asi, %d38
faligndata %d32, %d34, %d48
ldda [REALSRC + 0x18]%asi, %d40
faligndata %d34, %d36, %d50
ldda [REALSRC + 0x20]%asi, %d42
faligndata %d36, %d38, %d52
ldda [REALSRC + 0x28]%asi, %d44
faligndata %d38, %d40, %d54
ldda [REALSRC + 0x30]%asi, %d46
faligndata %d40, %d42, %d56
ldda [REALSRC + 0x38]%asi, %d32
faligndata %d42, %d44, %d58
cmp CNT, VIS_BLOCKSIZE + 8
prefetch [REALSRC + (5 * VIS_BLOCKSIZE)], #one_read
bgu,pt %ncc, .large1
add REALSRC, VIS_BLOCKSIZE, REALSRC
faligndata %d44, %d46, %d60
faligndata %d46, %d32, %d62
stda %d48, [DST]ASI_BLK_P ! store 64 bytes, bypass cache
cmp CNT, VIS_BLOCKSIZE
bne %ncc, .large2 ! exactly 1 blk remaining?
add DST, VIS_BLOCKSIZE, DST ! update DST
brz,a SRC, .large3 ! is SRC double aligned ?
ldd [REALSRC], %d34
.large2:
add TMP, CNT, TMP ! TMP was already set to 0 or -8
add TMP, SRC, TMP
ba .beginmedloop
andn TMP, 7, TMP
.large3:
ldd [REALSRC + 0x08], %d36
ldd [REALSRC + 0x10], %d38
ldd [REALSRC + 0x18], %d40
ldd [REALSRC + 0x20], %d42
ldd [REALSRC + 0x28], %d44
ldd [REALSRC + 0x30], %d46
! Don't need to load [REALSRC + 0x38] since we were
! already 8 bytes ahead in REALSRC to start with.
stda %d32, [DST]ASI_BLK_P
ba,a,pt %ncc, .bcb_exit
nop
.align 16
! two nops here cause the loop starting at .xlarge1 below to be
! on a cache line boundary, improving performance
nop
nop
.xlarge:
! DST I/O Destination is 64-byte aligned
! REALSRC I/O Source is 8-byte aligned (and we've set GSR.ALIGN)
! %d32 I/O Already loaded with Source data from [REALSRC-8]
! CNT I/O Count (number of bytes that need to be written)
! SRC I Not written. If zero, then REALSRC is double aligned.
! TMP O The number of doubles that remain to be written.
! Load the rest of the current block
! Recall that REALSRC is further into source buffer
! than DST is into the destination buffer.
! prefetch [REALSRC + (3 * VIS_BLOCKSIZE)], #one_read
! executed in delay slot for branch to .xlarge
prefetch [REALSRC + (4 * VIS_BLOCKSIZE)], #one_read
prefetch [REALSRC + (5 * VIS_BLOCKSIZE)], #one_read
ldda [REALSRC]%asi, %d34
prefetch [REALSRC + (6 * VIS_BLOCKSIZE)], #one_read
ldda [REALSRC + 0x8]%asi, %d36
faligndata %d32, %d34, %d48
ldda [REALSRC + 0x10]%asi, %d38
faligndata %d34, %d36, %d50
ldda [REALSRC + 0x18]%asi, %d40
faligndata %d36, %d38, %d52
ldda [REALSRC + 0x20]%asi, %d42
or %g0, -8, TMP ! if SRC >= 0, TMP = -8
faligndata %d38, %d40, %d54
ldda [REALSRC + 0x28]%asi, %d44
movrlz SRC, %g0, TMP ! if SRC < 0, TMP = 0 (needed later)
faligndata %d40, %d42, %d56
ldda [REALSRC + 0x30]%asi, %d46
faligndata %d42, %d44, %d58
ldda [REALSRC + 0x38]%asi, %d32
sub CNT, VIS_BLOCKSIZE, CNT ! update count
prefetch [REALSRC + (7 * VIS_BLOCKSIZE)], #one_read
add REALSRC, VIS_BLOCKSIZE, REALSRC ! update REALSRC
! This point is 32-byte aligned since 24 instructions appear since
! the previous alignment directive.
! Main loop. Write previous block. Load rest of current block.
! Some bytes will be loaded that won't yet be written.
.xlarge1:
ldda [REALSRC]%asi, %d34
faligndata %d44, %d46, %d60
ldda [REALSRC + 0x8]%asi, %d36
faligndata %d46, %d32, %d62
stda %d48, [DST]ASI_BLK_P
sub CNT, VIS_BLOCKSIZE, CNT ! update count
ldda [REALSRC + 0x10]%asi, %d38
faligndata %d32, %d34, %d48
ldda [REALSRC + 0x18]%asi, %d40
faligndata %d34, %d36, %d50
ldda [REALSRC + 0x20]%asi, %d42
faligndata %d36, %d38, %d52
ldda [REALSRC + 0x28]%asi, %d44
faligndata %d38, %d40, %d54
ldda [REALSRC + 0x30]%asi, %d46
faligndata %d40, %d42, %d56
ldda [REALSRC + 0x38]%asi, %d32
faligndata %d42, %d44, %d58
! offset of 8*BLK+8 bytes works best over range of (src-dst) mod 1K
prefetch [REALSRC + (8 * VIS_BLOCKSIZE) + 8], #one_read
add DST, VIS_BLOCKSIZE, DST ! update DST
cmp CNT, VIS_BLOCKSIZE + 8
! second prefetch important to correct for occasional dropped
! initial prefetches, 5*BLK works best over range of (src-dst) mod 1K
! strong prefetch prevents drops on Panther, but Jaguar and earlier
! US-III models treat strong prefetches as weak prefetches
! to avoid regressions on customer hardware, we retain the prefetch
prefetch [REALSRC + (5 * VIS_BLOCKSIZE)], #one_read
bgu,pt %ncc, .xlarge1
add REALSRC, VIS_BLOCKSIZE, REALSRC ! update REALSRC
faligndata %d44, %d46, %d60
faligndata %d46, %d32, %d62
stda %d48, [DST]ASI_BLK_P ! store 64 bytes, bypass cache
cmp CNT, VIS_BLOCKSIZE
bne %ncc, .xlarge2 ! exactly 1 block remaining?
add DST, VIS_BLOCKSIZE, DST ! update DST
brz,a SRC, .xlarge3 ! is SRC double aligned?
ldd [REALSRC], %d34
.xlarge2:
add TMP, CNT, TMP ! TMP was already set to 0 or -8
add TMP, SRC, TMP
ba .beginmedloop
andn TMP, 7, TMP ! 8 byte aligned count
! This is when there is exactly 1 block remaining
! and Source is aligned
.xlarge3:
ldd [REALSRC + 0x08], %d36
ldd [REALSRC + 0x10], %d38
ldd [REALSRC + 0x18], %d40
ldd [REALSRC + 0x20], %d42
ldd [REALSRC + 0x28], %d44
ldd [REALSRC + 0x30], %d46
! Don't need to load [REALSRC + 0x38] since we were
! already 8 bytes ahead in REALSRC to start with.
stda %d32, [DST]ASI_BLK_P
.bcb_exit:
membar #Sync
ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
wr %o2, 0, %gsr
ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
! Regs were saved only if both FEF and DU were set on entry
and %o3, FPRS_FEF|FPRS_DU, %o2
xorcc %o2, FPRS_FEF|FPRS_DU, %g0
bnz,pt %icc, 4f
nop
BLD_FPQ3Q4_FROMSTACK(%o2)
ba,pt %ncc, 5f
wr %o3, 0, %fprs ! restore fprs
4:
FZEROQ3Q4
wr %o3, 0, %fprs ! restore fprs
5:
mov %g5, %o0 ! copy dest address
call rock_sync_icache
mov %g2, %o1 ! saved size
membar #Sync ! sync error barrier
andn %l6, MASK_FLAGS, %l6
stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
FP_ALLOWMIGRATE(5, 6)
ret
restore %g0, 0, %o0
SET_SIZE(bcopy_more)
#endif /* lint */
/*
* Block copy with possibly overlapped operands.
*/
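/*
 * The overlap handling below reduces to the following sketch
 * (C pseudocode, illustrative only):
 *
 *	if (count == 0)
 *		return;
 *	if (count <= abs(from - to))
 *		bcopy(from, to, count);	// regions cannot collide
 *	else if (from < to)
 *		copy one byte at a time, high addresses first;
 *	else
 *		copy one byte at a time, low addresses first;
 */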
#if defined(lint)
/*ARGSUSED*/
void
ovbcopy(const void *from, void *to, size_t count)
{}
#else /* lint */
ENTRY(ovbcopy)
tst %o2 ! check count
bgu,a %ncc, 1f ! branch if count > 0
subcc %o0, %o1, %o3 ! difference of from and to address
retl ! return; nothing to do (count == 0)
nop
1:
bneg,a %ncc, 2f
neg %o3 ! if < 0, make it positive
2: cmp %o2, %o3 ! cmp size and abs(from - to)
bleu %ncc, bcopy ! if size <= abs(diff): use bcopy,
.empty ! no overlap
cmp %o0, %o1 ! compare from and to addresses
blu %ncc, .ov_bkwd ! if from < to, copy backwards
nop
!
! Copy forwards.
!
.ov_fwd:
ldub [%o0], %o3 ! read from address
inc %o0 ! inc from address
stb %o3, [%o1] ! write to address
deccc %o2 ! dec count
bgu %ncc, .ov_fwd ! loop till done
inc %o1 ! inc to address
retl ! return
nop
!
! Copy backwards.
!
.ov_bkwd:
deccc %o2 ! dec count
ldub [%o0 + %o2], %o3 ! get byte at end of src
bgu %ncc, .ov_bkwd ! loop till done
stb %o3, [%o1 + %o2] ! delay slot, store at end of dst
retl ! return
nop
SET_SIZE(ovbcopy)
#endif /* lint */
/*
* hwblkpagecopy()
*
* Copies exactly one page. This routine assumes the caller (ppcopy)
* has already disabled kernel preemption and has checked
* use_hw_bcopy. Preventing preemption also prevents cpu migration.
*/
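/*
 * A sketch of the expected calling convention (illustrative pseudocode
 * only; ppcopy is the real caller and does its own checks):
 *
 *	kpreempt_disable();	// also prevents cpu migration
 *	if (use_hw_bcopy)
 *		hwblkpagecopy(src_va, dst_va);
 *	else
 *		bcopy(src_va, dst_va, PAGESIZE);
 *	kpreempt_enable();
 */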
#ifdef lint
/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{ }
#else /* lint */
ENTRY(hwblkpagecopy)
! get another window w/space for three aligned blocks of saved fpregs
save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
! %i0 - source address (arg)
! %i1 - destination address (arg)
! %i2 - length of region (not arg)
! %l0 - saved fprs
! %l1 - pointer to saved fpregs
rd %fprs, %l0 ! check for unused fp
! FPU enabled ? If not, enable it.
btst FPRS_FEF, %l0
bz,a,pt %icc, 1f
wr %g0, FPRS_FEF, %fprs
! FPU enabled, but is Q3Q4 dirty ? If yes, save them.
btst FPRS_DU, %l0
bz,pn %icc, 1f
nop
BST_FPQ3Q4_TOSTACK(%l1)
1: set PAGESIZE, CNT
mov %i1, %o0 ! store destination address for flushing
mov REALSRC, SRC
prefetch [SRC], #one_read
prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
ldd [SRC], %d32
#if FIRST_PREFETCH > 4
prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
#endif
ldd [SRC + 0x08], %d34
#if FIRST_PREFETCH > 5
prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
#endif
ldd [SRC + 0x10], %d36
#if FIRST_PREFETCH > 6
prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
#endif
faligndata %d32, %d34, %d48
ldd [SRC + 0x18], %d38
#if FIRST_PREFETCH > 7
prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
#endif
faligndata %d34, %d36, %d50
ldd [SRC + 0x20], %d40
faligndata %d36, %d38, %d52
ldd [SRC + 0x28], %d42
faligndata %d38, %d40, %d54
ldd [SRC + 0x30], %d44
faligndata %d40, %d42, %d56
ldd [SRC + 0x38], %d46
faligndata %d42, %d44, %d58
ldd [SRC + VIS_BLOCKSIZE], %d32
sub CNT, VIS_BLOCKSIZE, CNT
add SRC, VIS_BLOCKSIZE, SRC
ba,a,pt %ncc, 2f
nop
.align ICACHE_LINE_SIZE
2:
ldd [SRC + 0x08], %d34
faligndata %d44, %d46, %d60
ldd [SRC + 0x10], %d36
faligndata %d46, %d32, %d62
stda %d48, [DST]ASI_BLK_P
ldd [SRC + 0x18], %d38
faligndata %d32, %d34, %d48
ldd [SRC + 0x20], %d40
faligndata %d34, %d36, %d50
ldd [SRC + 0x28], %d42
faligndata %d36, %d38, %d52
ldd [SRC + 0x30], %d44
faligndata %d38, %d40, %d54
ldd [SRC + 0x38], %d46
faligndata %d40, %d42, %d56
ldd [SRC + VIS_BLOCKSIZE], %d32
faligndata %d42, %d44, %d58
prefetch [SRC + ((FIRST_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
sub CNT, VIS_BLOCKSIZE, CNT
add DST, VIS_BLOCKSIZE, DST
cmp CNT, VIS_BLOCKSIZE + 8
prefetch [SRC + ((SECOND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
bgu,pt %ncc, 2b
add SRC, VIS_BLOCKSIZE, SRC
! trailing block
ldd [SRC + 0x08], %d34
faligndata %d44, %d46, %d60
ldd [SRC + 0x10], %d36
faligndata %d46, %d32, %d62
stda %d48, [DST]ASI_BLK_P
ldd [SRC + 0x18], %d38
ldd [SRC + 0x20], %d40
ldd [SRC + 0x28], %d42
ldd [SRC + 0x30], %d44
ldd [SRC + 0x38], %d46
sub CNT, VIS_BLOCKSIZE, CNT
add DST, VIS_BLOCKSIZE, DST
add SRC, VIS_BLOCKSIZE, SRC
stda %d32, [DST]ASI_BLK_P
set PAGESIZE, %o1
call rock_sync_icache
nop
membar #Sync
btst FPRS_DU, %l0
bz,pt %icc, 2f
nop
BLD_FPQ3Q4_FROMSTACK(%l3)
ba 3f
nop
2: FZEROQ3Q4
3: wr %l0, 0, %fprs ! restore fprs
ret
restore %g0, 0, %o0
SET_SIZE(hwblkpagecopy)
#endif /* lint */
/*
* Transfer data to and from user space -
* Note that these routines can cause faults
* It is assumed that the kernel has nothing at
* less than KERNELBASE in the virtual address space.
*
* Note that copyin(9F) and copyout(9F) are part of the
* DDI/DKI which specifies that they return '-1' on "errors."
*
* Sigh.
*
 * So there are two extremely similar routines - xcopyin() and xcopyout()
* which return the errno that we've faithfully computed. This
* allows other callers (e.g. uiomove(9F)) to work correctly.
* Given that these are used pretty heavily, we expand the calling
* sequences inline for all flavours (rather than making wrappers).
*
* There are also stub routines for xcopyout_little and xcopyin_little,
* which currently are intended to handle requests of <= 16 bytes from
* do_unaligned. Future enhancement to make them handle 8k pages efficiently
* is left as an exercise...
*/
/*
* Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
*
* General theory of operation:
*
* The only difference between copy{in,out} and
* xcopy{in,out} is in the error handling routine they invoke
* when a memory access error occurs. xcopyOP returns the errno
* while copyOP returns -1 (see above). copy{in,out}_noerr set
 * a special flag (by OR-ing the TRAMP_FLAG into the fault handler address)
* if they are called with a fault handler already in place. That flag
* causes the default handlers to trampoline to the previous handler
* upon an error.
*
 * None of the copyops routines grabs a window until it's decided that
* we need to do a HW block copy operation. This saves a window
* spill/fill when we're called during socket ops. The typical IO
* path won't cause spill/fill traps.
*
* This code uses a set of 4 limits for the maximum size that will
* be copied given a particular input/output address alignment.
* If the value for a particular limit is zero, the copy will be performed
* by the plain copy loops rather than FPBLK.
*
* See the description of bcopy above for more details of the
* data copying algorithm and the default limits.
*
*/
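/*
 * The resulting error paths, in outline (a C pseudocode sketch of the
 * handlers below, not literal code):
 *
 *	fault during copyout:
 *		restore original (kaddr, uaddr, count);
 *		if (curthread->t_copyops != NULL)
 *			return (curthread->t_copyops->cp_copyout(...));
 *		return (-1);		// DDI/DKI contract
 *
 *	fault during xcopyout:
 *		same, except the computed errno is returned instead of -1;
 */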
/*
* Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
*/
#if defined(lint)
#else /* lint */
/*
* We save the arguments in the following registers in case of a fault:
* kaddr - %l1
* uaddr - %l2
* count - %l3
*/
#define SAVE_SRC %l1
#define SAVE_DST %l2
#define SAVE_COUNT %l3
#define SM_SAVE_SRC %g4
#define SM_SAVE_DST %g5
#define SM_SAVE_COUNT %o5
#define ERRNO %l5
#define REAL_LOFAULT %l4
/*
* Generic copyio fault handler. This is the first line of defense when a
* fault occurs in (x)copyin/(x)copyout. In order for this to function
* properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
* This allows us to share common code for all the flavors of the copy
* operations, including the _noerr versions.
*
* Note that this function will restore the original input parameters before
* calling REAL_LOFAULT. So the real handler can vector to the appropriate
* member of the t_copyop structure, if needed.
*/
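/*
 * In outline (a sketch of the code below):
 *
 *	copyio_fault:
 *		errno = %g1;
 *		if (saved_lofault & FPUSED_FLAG)
 *			restore (or zero) fp quadrants 3/4, %gsr, %fprs;
 *		curthread->t_lofault = saved_lofault & ~FPUSED_FLAG;
 *		(%i0, %i1, %i2) = (SAVE_SRC, SAVE_DST, SAVE_COUNT);
 *		goto REAL_LOFAULT;
 */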
ENTRY(copyio_fault)
membar #Sync
mov %g1,ERRNO ! save errno in ERRNO
btst FPUSED_FLAG, %l6
bz %ncc, 1f
nop
ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
wr %o2, 0, %gsr ! restore gsr
ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
btst FPRS_DU, %o3
bz,pt %icc, 4f
nop
BLD_FPQ3Q4_FROMSTACK(%o2)
ba,pt %ncc, 1f
wr %o3, 0, %fprs ! restore fprs
4:
FZEROQ3Q4
wr %o3, 0, %fprs ! restore fprs
1:
andn %l6, FPUSED_FLAG, %l6
membar #Sync
stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
FP_ALLOWMIGRATE(5, 6)
mov SAVE_SRC, %i0
mov SAVE_DST, %i1
jmp REAL_LOFAULT
mov SAVE_COUNT, %i2
SET_SIZE(copyio_fault)
#endif
#if defined(lint)
/*ARGSUSED*/
int
copyout(const void *kaddr, void *uaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(copyout)
cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
bleu,pt %ncc, .copyout_small ! go to small copy cases
xor %o0, %o1, %o3 ! are src, dst alignable?
btst 7, %o3 !
bz,pt %ncc, .copyout_8 ! check for longword alignment
nop
btst 1, %o3 !
bz,pt %ncc, .copyout_2 ! check for half-word
nop
sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_1)], %o3
tst %o3
bz,pn %icc, .copyout_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .copyout_small ! go to small copy
nop
ba,pt %ncc, .copyout_more ! otherwise go to large copy
nop
.copyout_2:
btst 3, %o3 !
bz,pt %ncc, .copyout_4 ! check for word alignment
nop
sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_2)], %o3
tst %o3
bz,pn %icc, .copyout_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .copyout_small ! go to small copy
nop
ba,pt %ncc, .copyout_more ! otherwise go to large copy
nop
.copyout_4:
! already checked longword, must be word aligned
sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_4)], %o3
tst %o3
bz,pn %icc, .copyout_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .copyout_small ! go to small copy
nop
ba,pt %ncc, .copyout_more ! otherwise go to large copy
nop
.copyout_8:
sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_8)], %o3
tst %o3
bz,pn %icc, .copyout_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .copyout_small ! go to small copy
nop
ba,pt %ncc, .copyout_more ! otherwise go to large copy
nop
.align 16
nop ! instruction alignment
! see discussion at start of file
.copyout_small:
sethi %hi(.sm_copyout_err), %o5 ! .sm_copyout_err is lofault
or %o5, %lo(.sm_copyout_err), %o5
ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler
membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault
.sm_do_copyout:
mov %o0, SM_SAVE_SRC
mov %o1, SM_SAVE_DST
cmp %o2, SHORTCOPY ! check for really short case
bleu,pt %ncc, .co_sm_left !
mov %o2, SM_SAVE_COUNT
cmp %o2, CHKSIZE ! check for medium length cases
bgu,pn %ncc, .co_med !
or %o0, %o1, %o3 ! prepare alignment check
andcc %o3, 0x3, %g0 ! test for alignment
bz,pt %ncc, .co_sm_word ! branch to word aligned case
.co_sm_movebytes:
sub %o2, 3, %o2 ! adjust count to allow cc zero test
.co_sm_notalign4:
ldub [%o0], %o3 ! read byte
subcc %o2, 4, %o2 ! reduce count by 4
stba %o3, [%o1]ASI_USER ! write byte
inc %o1 ! advance DST by 1
ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes
add %o0, 4, %o0 ! advance SRC by 4
stba %o3, [%o1]ASI_USER
inc %o1 ! advance DST by 1
ldub [%o0 - 2], %o3
stba %o3, [%o1]ASI_USER
inc %o1 ! advance DST by 1
ldub [%o0 - 1], %o3
stba %o3, [%o1]ASI_USER
bgt,pt %ncc, .co_sm_notalign4 ! loop til 3 or fewer bytes remain
inc %o1 ! advance DST by 1
add %o2, 3, %o2 ! restore count
.co_sm_left:
tst %o2
bz,pt %ncc, .co_sm_exit ! check for zero length
nop
ldub [%o0], %o3 ! load one byte
deccc %o2 ! reduce count for cc test
bz,pt %ncc, .co_sm_exit
stba %o3,[%o1]ASI_USER ! store one byte
ldub [%o0 + 1], %o3 ! load second byte
deccc %o2
inc %o1
bz,pt %ncc, .co_sm_exit
stba %o3,[%o1]ASI_USER ! store second byte
ldub [%o0 + 2], %o3 ! load third byte
inc %o1
stba %o3,[%o1]ASI_USER ! store third byte
membar #Sync ! sync error barrier
stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g0, %o0 ! return 0
.align 16
.co_sm_words:
lduw [%o0], %o3 ! read word
.co_sm_wordx:
subcc %o2, 8, %o2 ! update count
stwa %o3, [%o1]ASI_USER ! write word
add %o0, 8, %o0 ! update SRC
lduw [%o0 - 4], %o3 ! read word
add %o1, 4, %o1 ! update DST
stwa %o3, [%o1]ASI_USER ! write word
bgt,pt %ncc, .co_sm_words ! loop til done
add %o1, 4, %o1 ! update DST
addcc %o2, 7, %o2 ! restore count
bz,pt %ncc, .co_sm_exit
nop
deccc %o2
bz,pt %ncc, .co_sm_byte
.co_sm_half:
subcc %o2, 2, %o2 ! reduce count by 2
lduh [%o0], %o3 ! read half word
add %o0, 2, %o0 ! advance SRC by 2
stha %o3, [%o1]ASI_USER ! write half word
bgt,pt %ncc, .co_sm_half ! loop til done
add %o1, 2, %o1 ! advance DST by 2
addcc %o2, 1, %o2 ! restore count
bz,pt %ncc, .co_sm_exit
nop
.co_sm_byte:
ldub [%o0], %o3
stba %o3, [%o1]ASI_USER
membar #Sync ! sync error barrier
stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g0, %o0 ! return 0
.align 16
.co_sm_word:
subcc %o2, 4, %o2 ! update count
bgt,pt %ncc, .co_sm_wordx
lduw [%o0], %o3 ! read word
addcc %o2, 3, %o2 ! restore count
bz,pt %ncc, .co_sm_exit
stwa %o3, [%o1]ASI_USER ! write word
deccc %o2 ! reduce count for cc test
ldub [%o0 + 4], %o3 ! load one byte
add %o1, 4, %o1
bz,pt %ncc, .co_sm_exit
stba %o3, [%o1]ASI_USER ! store one byte
ldub [%o0 + 5], %o3 ! load second byte
deccc %o2
inc %o1
bz,pt %ncc, .co_sm_exit
stba %o3, [%o1]ASI_USER ! store second byte
ldub [%o0 + 6], %o3 ! load third byte
inc %o1
stba %o3, [%o1]ASI_USER ! store third byte
.co_sm_exit:
membar #Sync ! sync error barrier
stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g0, %o0 ! return 0
.align 16
.co_med:
xor %o0, %o1, %o3 ! setup alignment check
btst 1, %o3
bnz,pt %ncc, .co_sm_movebytes ! unaligned
nop
btst 3, %o3
bnz,pt %ncc, .co_med_half ! halfword aligned
nop
btst 7, %o3
bnz,pt %ncc, .co_med_word ! word aligned
nop
.co_med_long:
btst 3, %o0 ! check for
bz,pt %ncc, .co_med_long1 ! word alignment
nop
.co_med_long0:
ldub [%o0], %o3 ! load one byte
inc %o0
stba %o3,[%o1]ASI_USER ! store byte
inc %o1
btst 3, %o0
bnz,pt %ncc, .co_med_long0
dec %o2
.co_med_long1: ! word aligned
btst 7, %o0 ! check for long word
bz,pt %ncc, .co_med_long2
nop
lduw [%o0], %o3 ! load word
add %o0, 4, %o0 ! advance SRC by 4
stwa %o3, [%o1]ASI_USER ! store word
add %o1, 4, %o1 ! advance DST by 4
sub %o2, 4, %o2 ! reduce count by 4
!
! Now long word aligned and have at least 32 bytes to move
!
.co_med_long2:
sub %o2, 31, %o2 ! adjust count to allow cc zero test
sub %o1, 8, %o1 ! adjust pointer to allow store in
! branch delay slot instead of add
.co_med_lmove:
add %o1, 8, %o1 ! advance DST by 8
ldx [%o0], %o3 ! read long word
subcc %o2, 32, %o2 ! reduce count by 32
stxa %o3, [%o1]ASI_USER ! write long word
add %o1, 8, %o1 ! advance DST by 8
ldx [%o0 + 8], %o3 ! repeat for a total of 4 long words
add %o0, 32, %o0 ! advance SRC by 32
stxa %o3, [%o1]ASI_USER
ldx [%o0 - 16], %o3
add %o1, 8, %o1 ! advance DST by 8
stxa %o3, [%o1]ASI_USER
ldx [%o0 - 8], %o3
add %o1, 8, %o1 ! advance DST by 8
bgt,pt %ncc, .co_med_lmove ! loop til 31 or fewer bytes left
stxa %o3, [%o1]ASI_USER
add %o1, 8, %o1 ! advance DST by 8
addcc %o2, 24, %o2 ! restore count to long word offset
ble,pt %ncc, .co_med_lextra ! check for more long words to move
nop
.co_med_lword:
ldx [%o0], %o3 ! read long word
subcc %o2, 8, %o2 ! reduce count by 8
stxa %o3, [%o1]ASI_USER ! write long word
add %o0, 8, %o0 ! advance SRC by 8
bgt,pt %ncc, .co_med_lword ! loop til 7 or fewer bytes left
add %o1, 8, %o1 ! advance DST by 8
.co_med_lextra:
addcc %o2, 7, %o2 ! restore rest of count
bz,pt %ncc, .co_sm_exit ! if zero, then done
deccc %o2
bz,pt %ncc, .co_sm_byte
nop
ba,pt %ncc, .co_sm_half
nop
.align 16
nop ! instruction alignment
! see discussion at start of file
.co_med_word:
btst 3, %o0 ! check for
bz,pt %ncc, .co_med_word1 ! word alignment
nop
.co_med_word0:
ldub [%o0], %o3 ! load one byte
inc %o0
stba %o3,[%o1]ASI_USER ! store byte
inc %o1
btst 3, %o0
bnz,pt %ncc, .co_med_word0
dec %o2
!
! Now word aligned and have at least 36 bytes to move
!
.co_med_word1:
sub %o2, 15, %o2 ! adjust count to allow cc zero test
.co_med_wmove:
lduw [%o0], %o3 ! read word
subcc %o2, 16, %o2 ! reduce count by 16
stwa %o3, [%o1]ASI_USER ! write word
add %o1, 4, %o1 ! advance DST by 4
lduw [%o0 + 4], %o3 ! repeat for a total of 4 words
add %o0, 16, %o0 ! advance SRC by 16
stwa %o3, [%o1]ASI_USER
add %o1, 4, %o1 ! advance DST by 4
lduw [%o0 - 8], %o3
stwa %o3, [%o1]ASI_USER
add %o1, 4, %o1 ! advance DST by 4
lduw [%o0 - 4], %o3
stwa %o3, [%o1]ASI_USER
bgt,pt %ncc, .co_med_wmove ! loop til 15 or fewer bytes left
add %o1, 4, %o1 ! advance DST by 4
addcc %o2, 12, %o2 ! restore count to word offset
ble,pt %ncc, .co_med_wextra ! check for more words to move
nop
.co_med_word2:
lduw [%o0], %o3 ! read word
subcc %o2, 4, %o2 ! reduce count by 4
stwa %o3, [%o1]ASI_USER ! write word
add %o0, 4, %o0 ! advance SRC by 4
bgt,pt %ncc, .co_med_word2 ! loop til 3 or fewer bytes left
add %o1, 4, %o1 ! advance DST by 4
.co_med_wextra:
addcc %o2, 3, %o2 ! restore rest of count
bz,pt %ncc, .co_sm_exit ! if zero, then done
deccc %o2
bz,pt %ncc, .co_sm_byte
nop
ba,pt %ncc, .co_sm_half
nop
.align 16
nop ! instruction alignment
nop ! see discussion at start of file
nop
.co_med_half:
btst 1, %o0 ! check for
bz,pt %ncc, .co_med_half1 ! half word alignment
nop
ldub [%o0], %o3 ! load one byte
inc %o0
stba %o3,[%o1]ASI_USER ! store byte
inc %o1
dec %o2
!
! Now half word aligned and have at least 38 bytes to move
!
.co_med_half1:
sub %o2, 7, %o2 ! adjust count to allow cc zero test
.co_med_hmove:
lduh [%o0], %o3 ! read half word
subcc %o2, 8, %o2 ! reduce count by 8
stha %o3, [%o1]ASI_USER ! write half word
add %o1, 2, %o1 ! advance DST by 2
lduh [%o0 + 2], %o3 ! repeat for a total of 4 halfwords
add %o0, 8, %o0 ! advance SRC by 8
stha %o3, [%o1]ASI_USER
add %o1, 2, %o1 ! advance DST by 2
lduh [%o0 - 4], %o3
stha %o3, [%o1]ASI_USER
add %o1, 2, %o1 ! advance DST by 2
lduh [%o0 - 2], %o3
stha %o3, [%o1]ASI_USER
bgt,pt %ncc, .co_med_hmove ! loop til 7 or fewer bytes left
add %o1, 2, %o1 ! advance DST by 2
addcc %o2, 7, %o2 ! restore count
bz,pt %ncc, .co_sm_exit
deccc %o2
bz,pt %ncc, .co_sm_byte
nop
ba,pt %ncc, .co_sm_half
nop
/*
* We got here because of a fault during short copyout.
* Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
*/
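/*
 * In rough C terms the recovery below is (a sketch only; the copyops
 * structure lives in <sys/copyops.h>):
 *
 *	if (curthread->t_copyops != NULL)
 *		return (curthread->t_copyops->cp_copyout(src, dst, cnt));
 *	return (-1);
 */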
.sm_copyout_err:
membar #Sync
stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
mov SM_SAVE_SRC, %o0
mov SM_SAVE_DST, %o1
mov SM_SAVE_COUNT, %o2
ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
tst %o3
bz,pt %ncc, 3f ! if not, return error
nop
ldn [%o3 + CP_COPYOUT], %o5 ! if handler, invoke it with
jmp %o5 ! original arguments
nop
3:
retl
or %g0, -1, %o0 ! return error value
SET_SIZE(copyout)
/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file. They are provided to allow
 * profiling and DTrace of the portions of the copy code that use
 * the floating point registers.
 * This entry is particularly important as DTrace (at least as of
 * 4/2004) does not support leaf functions.
 */
ENTRY(copyout_more)
.copyout_more:
save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
set .copyout_err, REAL_LOFAULT
/*
* Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
*/
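/*
 * FP register management below, as a C-like sketch (names are
 * illustrative; the macros are defined earlier in this file):
 *
 *	fprs = getfprs();			// saved on the stack
 *	if (!(fprs & FPRS_FEF))
 *		setfprs(FPRS_FEF);		// FPU off: just enable it
 *	else if (fprs & FPRS_DU)
 *		BST_FPQ3Q4_TOSTACK();		// Q3/Q4 dirty: save them
 */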
.do_copyout:
set copyio_fault, %l7 ! copyio_fault is lofault val
ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
membar #Sync ! sync error barrier
stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
mov %i0, SAVE_SRC
mov %i1, SAVE_DST
mov %i2, SAVE_COUNT
FP_NOMIGRATE(6, 7)
rd %fprs, %o2 ! check for unused fp
st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
! FPU enabled? If not, enable it.
btst FPRS_FEF, %o2
bz,a,pt %icc, .do_blockcopyout
wr %g0, FPRS_FEF, %fprs
! FPU enabled, but is Q3Q4 dirty? If yes, save them.
btst FPRS_DU, %o2
bz,pn %icc, .do_blockcopyout
nop
BST_FPQ3Q4_TOSTACK(%o2)
.do_blockcopyout:
rd %gsr, %o2
stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
or %l6, FPUSED_FLAG, %l6
andcc DST, VIS_BLOCKSIZE - 1, TMP
mov ASI_USER, %asi
bz,pt %ncc, 2f
neg TMP
add TMP, VIS_BLOCKSIZE, TMP
! TMP = bytes required to align DST on FP_BLOCK boundary
! Using SRC as a tmp here
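! (i.e., TMP = VIS_BLOCKSIZE - (DST & (VIS_BLOCKSIZE - 1)) at this point)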
cmp TMP, 3
bleu,pt %ncc, 1f
sub CNT,TMP,CNT ! adjust main count
sub TMP, 3, TMP ! adjust for end of loop test
.co_blkalign:
ldub [REALSRC], SRC ! move 4 bytes per loop iteration
stba SRC, [DST]%asi
subcc TMP, 4, TMP
ldub [REALSRC + 1], SRC
add REALSRC, 4, REALSRC
stba SRC, [DST + 1]%asi
ldub [REALSRC - 2], SRC
add DST, 4, DST
stba SRC, [DST - 2]%asi
ldub [REALSRC - 1], SRC
bgu,pt %ncc, .co_blkalign
stba SRC, [DST - 1]%asi
addcc TMP, 3, TMP ! restore count adjustment
bz,pt %ncc, 2f ! no bytes left?
nop
1: ldub [REALSRC], SRC
inc REALSRC
inc DST
deccc TMP
bgu %ncc, 1b
stba SRC, [DST - 1]%asi
2:
andn REALSRC, 0x7, SRC
alignaddr REALSRC, %g0, %g0
! SRC - 8-byte aligned
! DST - 64-byte aligned
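! The main loop below is a software-pipelined faligndata copy;
! per iteration, roughly:
!	load the next 64 bytes of SRC into %d34-%d46/%d32
!	faligndata merges adjacent pairs into aligned data %d48-%d62
!	block-store %d48-%d62 to the user DST (ASI_BLK_AIUS)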
prefetch [SRC], #one_read
prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
ldd [SRC], %d32
#if FIRST_PREFETCH > 4
prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
#endif
ldd [SRC + 0x08], %d34
#if FIRST_PREFETCH > 5
prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
#endif
ldd [SRC + 0x10], %d36
#if FIRST_PREFETCH > 6
prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
#endif
faligndata %d32, %d34, %d48
ldd [SRC + 0x18], %d38
#if FIRST_PREFETCH > 7
prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
#endif
faligndata %d34, %d36, %d50
ldd [SRC + 0x20], %d40
faligndata %d36, %d38, %d52
ldd [SRC + 0x28], %d42
faligndata %d38, %d40, %d54
ldd [SRC + 0x30], %d44
faligndata %d40, %d42, %d56
ldd [SRC + 0x38], %d46
faligndata %d42, %d44, %d58
ldd [SRC + VIS_BLOCKSIZE], %d32
sub CNT, VIS_BLOCKSIZE, CNT
add SRC, VIS_BLOCKSIZE, SRC
add REALSRC, VIS_BLOCKSIZE, REALSRC
ba,a,pt %ncc, 1f
nop
.align ICACHE_LINE_SIZE
1:
ldd [SRC + 0x08], %d34
faligndata %d44, %d46, %d60
ldd [SRC + 0x10], %d36
faligndata %d46, %d32, %d62
stda %d48, [DST]ASI_BLK_AIUS
ldd [SRC + 0x18], %d38
faligndata %d32, %d34, %d48
ldd [SRC + 0x20], %d40
faligndata %d34, %d36, %d50
ldd [SRC + 0x28], %d42
faligndata %d36, %d38, %d52
ldd [SRC + 0x30], %d44
faligndata %d38, %d40, %d54
ldd [SRC + 0x38], %d46
faligndata %d40, %d42, %d56
sub CNT, VIS_BLOCKSIZE, CNT
ldd [SRC + VIS_BLOCKSIZE], %d32
faligndata %d42, %d44, %d58
prefetch [SRC + ((FIRST_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
add DST, VIS_BLOCKSIZE, DST
prefetch [SRC + ((SECOND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
add REALSRC, VIS_BLOCKSIZE, REALSRC
cmp CNT, VIS_BLOCKSIZE + 8
bgu,pt %ncc, 1b
add SRC, VIS_BLOCKSIZE, SRC
! the 2: epilogue below is used only if CNT == VIS_BLOCKSIZE
! and REALSRC & 0x7 is 0
cmp CNT, VIS_BLOCKSIZE
bne %ncc, 3f
andcc REALSRC, 0x7, %g0
bz,pt %ncc, 2f
nop
3:
faligndata %d44, %d46, %d60
faligndata %d46, %d32, %d62
stda %d48, [DST]ASI_BLK_AIUS
add DST, VIS_BLOCKSIZE, DST
ba,pt %ncc, 3f
nop
2:
ldd [SRC + 0x08], %d34
faligndata %d44, %d46, %d60
ldd [SRC + 0x10], %d36
faligndata %d46, %d32, %d62
stda %d48, [DST]ASI_BLK_AIUS
ldd [SRC + 0x18], %d38
ldd [SRC + 0x20], %d40
ldd [SRC + 0x28], %d42
ldd [SRC + 0x30], %d44
ldd [SRC + 0x38], %d46
sub CNT, VIS_BLOCKSIZE, CNT
add DST, VIS_BLOCKSIZE, DST
add SRC, VIS_BLOCKSIZE, SRC
add REALSRC, VIS_BLOCKSIZE, REALSRC
stda %d32, [DST]ASI_BLK_AIUS
add DST, VIS_BLOCKSIZE, DST
ba,a,pt %ncc, 4f
nop
3: tst CNT
bz,a %ncc, 4f
nop
5: ldub [REALSRC], TMP
inc REALSRC
inc DST
deccc CNT
bgu %ncc, 5b
stba TMP, [DST - 1]%asi
4:
.copyout_exit:
membar #Sync
ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
wr %o2, 0, %gsr ! restore gsr
ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
btst FPRS_DU, %o3
bz,pt %icc, 4f
nop
BLD_FPQ3Q4_FROMSTACK(%o2)
ba,pt %ncc, 1f
wr %o3, 0, %fprs ! restore fprs
4:
FZEROQ3Q4
wr %o3, 0, %fprs ! restore fprs
1:
membar #Sync
andn %l6, FPUSED_FLAG, %l6
stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
FP_ALLOWMIGRATE(5, 6)
ret
restore %g0, 0, %o0
/*
* We got here because of a fault during copyout.
* Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
*/
.copyout_err:
ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
tst %o4
bz,pt %ncc, 2f ! if not, return error
nop
ldn [%o4 + CP_COPYOUT], %g2 ! if handler, invoke it with
jmp %g2 ! original arguments
restore %g0, 0, %g0 ! dispose of copy window
2:
ret
restore %g0, -1, %o0 ! return error value
SET_SIZE(copyout_more)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyout(const void *kaddr, void *uaddr, size_t count)
{ return (0); }
#else /* lint */
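/*
 * Dispatch sketch for xcopyout (the other copy entry points follow
 * the same pattern): the relative alignment of src and dst selects
 * one of the tunables hw_copy_limit_{1,2,4,8}; a zero limit, or a
 * length at or below the limit, takes the small leaf (non-FP) path,
 * while anything larger takes the FP block-copy path.
 */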
ENTRY(xcopyout)
cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
bleu,pt %ncc, .xcopyout_small ! go to small copy
xor %o0, %o1, %o3 ! are src, dst alignable?
btst 7, %o3 !
bz,pt %ncc, .xcopyout_8 ! check for longword alignment
nop
btst 1, %o3 !
bz,pt %ncc, .xcopyout_2 ! check for half-word
nop
sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_1)], %o3
tst %o3
bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .xcopyout_small ! go to small copy
nop
ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
nop
.xcopyout_2:
btst 3, %o3 !
bz,pt %ncc, .xcopyout_4 ! check for word alignment
nop
sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_2)], %o3
tst %o3
bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .xcopyout_small ! go to small copy
nop
ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
nop
.xcopyout_4:
! already checked longword, must be word aligned
sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_4)], %o3
tst %o3
bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .xcopyout_small ! go to small copy
nop
ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
nop
.xcopyout_8:
sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_8)], %o3
tst %o3
bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .xcopyout_small ! go to small copy
nop
ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
nop
.xcopyout_small:
sethi %hi(.sm_xcopyout_err), %o5 ! .sm_xcopyout_err is lofault
or %o5, %lo(.sm_xcopyout_err), %o5
ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler
membar #Sync ! sync error barrier
ba,pt %ncc, .sm_do_copyout ! common code
stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault
.xcopyout_more:
save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
sethi %hi(.xcopyout_err), REAL_LOFAULT
ba,pt %ncc, .do_copyout ! common code
or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
/*
 * We got here because of a fault during xcopyout.
 * Errno value is in ERRNO.
 */
.xcopyout_err:
ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
tst %o4
bz,pt %ncc, 2f ! if not, return error
nop
ldn [%o4 + CP_XCOPYOUT], %g2 ! if handler, invoke it with
jmp %g2 ! original arguments
restore %g0, 0, %g0 ! dispose of copy window
2:
ret
restore ERRNO, 0, %o0 ! return errno value
.sm_xcopyout_err:
membar #Sync
stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
mov SM_SAVE_SRC, %o0
mov SM_SAVE_DST, %o1
mov SM_SAVE_COUNT, %o2
ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
tst %o3
bz,pt %ncc, 3f ! if not, return error
nop
ldn [%o3 + CP_XCOPYOUT], %o5 ! if handler, invoke it with
jmp %o5 ! original arguments
nop
3:
retl
or %g1, 0, %o0 ! return errno value
SET_SIZE(xcopyout)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyout_little(const void *kaddr, void *uaddr, size_t count)
{ return (0); }
#else /* lint */
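/*
 * The loop below walks from the last source byte toward the first
 * while the destination advances forward (%o3 is a negative index
 * counting up to zero), so the buffer is copied byte-reversed.
 */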
ENTRY(xcopyout_little)
sethi %hi(.xcopyio_err), %o5
or %o5, %lo(.xcopyio_err), %o5
ldn [THREAD_REG + T_LOFAULT], %o4
membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT]
mov %o4, %o5
subcc %g0, %o2, %o3
add %o0, %o2, %o0
bz,pn %ncc, 2f ! check for zero bytes
sub %o2, 1, %o4
add %o0, %o4, %o0 ! start w/last byte
add %o1, %o2, %o1
ldub [%o0 + %o3], %o4
1: stba %o4, [%o1 + %o3]ASI_AIUSL
inccc %o3
sub %o0, 2, %o0 ! get next byte
bcc,a,pt %ncc, 1b
ldub [%o0 + %o3], %o4
2:
membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g0, %o0 ! return (0)
SET_SIZE(xcopyout_little)
#endif /* lint */
/*
* Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
*/
#if defined(lint)
/*ARGSUSED*/
int
copyin(const void *uaddr, void *kaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(copyin)
cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
bleu,pt %ncc, .copyin_small ! go to small copy
xor %o0, %o1, %o3 ! are src, dst alignable?
btst 7, %o3 !
bz,pt %ncc, .copyin_8 ! check for longword alignment
nop
btst 1, %o3 !
bz,pt %ncc, .copyin_2 ! check for half-word
nop
sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_1)], %o3
tst %o3
bz,pn %icc, .copyin_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .copyin_small ! go to small copy
nop
ba,pt %ncc, .copyin_more ! otherwise go to large copy
nop
.copyin_2:
btst 3, %o3 !
bz,pt %ncc, .copyin_4 ! check for word alignment
nop
sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_2)], %o3
tst %o3
bz,pn %icc, .copyin_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .copyin_small ! go to small copy
nop
ba,pt %ncc, .copyin_more ! otherwise go to large copy
nop
.copyin_4:
! already checked longword, must be word aligned
sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_4)], %o3
tst %o3
bz,pn %icc, .copyin_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .copyin_small ! go to small copy
nop
ba,pt %ncc, .copyin_more ! otherwise go to large copy
nop
.copyin_8:
sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_8)], %o3
tst %o3
bz,pn %icc, .copyin_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .copyin_small ! go to small copy
nop
ba,pt %ncc, .copyin_more ! otherwise go to large copy
nop
.align 16
nop ! instruction alignment
! see discussion at start of file
.copyin_small:
sethi %hi(.sm_copyin_err), %o5 ! .sm_copyin_err is lofault
or %o5, %lo(.sm_copyin_err), %o5
ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofault, no tramp
membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT]
.sm_do_copyin:
mov %o0, SM_SAVE_SRC
mov %o1, SM_SAVE_DST
cmp %o2, SHORTCOPY ! check for really short case
bleu,pt %ncc, .ci_sm_left !
mov %o2, SM_SAVE_COUNT
cmp %o2, CHKSIZE ! check for medium length cases
bgu,pn %ncc, .ci_med !
or %o0, %o1, %o3 ! prepare alignment check
andcc %o3, 0x3, %g0 ! test for alignment
bz,pt %ncc, .ci_sm_word ! branch to word aligned case
.ci_sm_movebytes:
sub %o2, 3, %o2 ! adjust count to allow cc zero test
.ci_sm_notalign4:
lduba [%o0]ASI_USER, %o3 ! read byte
subcc %o2, 4, %o2 ! reduce count by 4
stb %o3, [%o1] ! write byte
add %o0, 1, %o0 ! advance SRC by 1
lduba [%o0]ASI_USER, %o3 ! repeat for a total of 4 bytes
add %o0, 1, %o0 ! advance SRC by 1
stb %o3, [%o1 + 1]
add %o1, 4, %o1 ! advance DST by 4
lduba [%o0]ASI_USER, %o3
add %o0, 1, %o0 ! advance SRC by 1
stb %o3, [%o1 - 2]
lduba [%o0]ASI_USER, %o3
add %o0, 1, %o0 ! advance SRC by 1
bgt,pt %ncc, .ci_sm_notalign4 ! loop til 3 or fewer bytes remain
stb %o3, [%o1 - 1]
add %o2, 3, %o2 ! restore count
.ci_sm_left:
tst %o2
bz,pt %ncc, .ci_sm_exit
nop
lduba [%o0]ASI_USER, %o3 ! load one byte
deccc %o2 ! reduce count for cc test
bz,pt %ncc, .ci_sm_exit
stb %o3,[%o1] ! store one byte
inc %o0
lduba [%o0]ASI_USER, %o3 ! load second byte
deccc %o2
bz,pt %ncc, .ci_sm_exit
stb %o3,[%o1 + 1] ! store second byte
inc %o0
lduba [%o0]ASI_USER, %o3 ! load third byte
stb %o3,[%o1 + 2] ! store third byte
membar #Sync ! sync error barrier
stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g0, %o0 ! return 0
.align 16
.ci_sm_words:
lduwa [%o0]ASI_USER, %o3 ! read word
.ci_sm_wordx:
subcc %o2, 8, %o2 ! update count
stw %o3, [%o1] ! write word
add %o0, 4, %o0 ! update SRC
add %o1, 8, %o1 ! update DST
lduwa [%o0]ASI_USER, %o3 ! read word
add %o0, 4, %o0 ! update SRC
bgt,pt %ncc, .ci_sm_words ! loop til done
stw %o3, [%o1 - 4] ! write word
addcc %o2, 7, %o2 ! restore count
bz,pt %ncc, .ci_sm_exit
nop
deccc %o2
bz,pt %ncc, .ci_sm_byte
.ci_sm_half:
subcc %o2, 2, %o2 ! reduce count by 2
lduha [%o0]ASI_USER, %o3 ! read half word
add %o0, 2, %o0 ! advance SRC by 2
add %o1, 2, %o1 ! advance DST by 2
bgt,pt %ncc, .ci_sm_half ! loop til done
sth %o3, [%o1 - 2] ! write half word
addcc %o2, 1, %o2 ! restore count
bz,pt %ncc, .ci_sm_exit
nop
.ci_sm_byte:
lduba [%o0]ASI_USER, %o3
stb %o3, [%o1]
membar #Sync ! sync error barrier
stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g0, %o0 ! return 0
.align 16
.ci_sm_word:
subcc %o2, 4, %o2 ! update count
bgt,pt %ncc, .ci_sm_wordx
lduwa [%o0]ASI_USER, %o3 ! read word
addcc %o2, 3, %o2 ! restore count
bz,pt %ncc, .ci_sm_exit
stw %o3, [%o1] ! write word
deccc %o2 ! reduce count for cc test
add %o0, 4, %o0
lduba [%o0]ASI_USER, %o3 ! load one byte
bz,pt %ncc, .ci_sm_exit
stb %o3, [%o1 + 4] ! store one byte
inc %o0
lduba [%o0]ASI_USER, %o3 ! load second byte
deccc %o2
bz,pt %ncc, .ci_sm_exit
stb %o3, [%o1 + 5] ! store second byte
inc %o0
lduba [%o0]ASI_USER, %o3 ! load third byte
stb %o3, [%o1 + 6] ! store third byte
.ci_sm_exit:
membar #Sync ! sync error barrier
stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g0, %o0 ! return 0
.align 16
.ci_med:
xor %o0, %o1, %o3 ! setup alignment check
btst 1, %o3
bnz,pt %ncc, .ci_sm_movebytes ! unaligned
nop
btst 3, %o3
bnz,pt %ncc, .ci_med_half ! halfword aligned
nop
btst 7, %o3
bnz,pt %ncc, .ci_med_word ! word aligned
nop
.ci_med_long:
btst 3, %o0 ! check for
bz,pt %ncc, .ci_med_long1 ! word alignment
nop
.ci_med_long0:
lduba [%o0]ASI_USER, %o3 ! load one byte
inc %o0
stb %o3,[%o1] ! store byte
inc %o1
btst 3, %o0
bnz,pt %ncc, .ci_med_long0
dec %o2
.ci_med_long1: ! word aligned
btst 7, %o0 ! check for long word
bz,pt %ncc, .ci_med_long2
nop
lduwa [%o0]ASI_USER, %o3 ! load word
add %o0, 4, %o0 ! advance SRC by 4
stw %o3, [%o1] ! store word
add %o1, 4, %o1 ! advance DST by 4
sub %o2, 4, %o2 ! reduce count by 4
!
! Now long word aligned and have at least 32 bytes to move
!
.ci_med_long2:
sub %o2, 31, %o2 ! adjust count to allow cc zero test
.ci_med_lmove:
ldxa [%o0]ASI_USER, %o3 ! read long word
subcc %o2, 32, %o2 ! reduce count by 32
stx %o3, [%o1] ! write long word
add %o0, 8, %o0 ! advance SRC by 8
ldxa [%o0]ASI_USER, %o3 ! repeat for a total of 4 long words
add %o0, 8, %o0 ! advance SRC by 8
stx %o3, [%o1 + 8]
add %o1, 32, %o1 ! advance DST by 32
ldxa [%o0]ASI_USER, %o3
add %o0, 8, %o0 ! advance SRC by 8
stx %o3, [%o1 - 16]
ldxa [%o0]ASI_USER, %o3
add %o0, 8, %o0 ! advance SRC by 8
bgt,pt %ncc, .ci_med_lmove ! loop til 31 or fewer bytes left
stx %o3, [%o1 - 8]
addcc %o2, 24, %o2 ! restore count to long word offset
ble,pt %ncc, .ci_med_lextra ! check for more long words to move
nop
.ci_med_lword:
ldxa [%o0]ASI_USER, %o3 ! read long word
subcc %o2, 8, %o2 ! reduce count by 8
stx %o3, [%o1] ! write long word
add %o0, 8, %o0 ! advance SRC by 8
bgt,pt %ncc, .ci_med_lword ! loop til 7 or fewer bytes left
add %o1, 8, %o1 ! advance DST by 8
.ci_med_lextra:
addcc %o2, 7, %o2 ! restore rest of count
bz,pt %ncc, .ci_sm_exit ! if zero, then done
deccc %o2
bz,pt %ncc, .ci_sm_byte
nop
ba,pt %ncc, .ci_sm_half
nop
.align 16
nop ! instruction alignment
! see discussion at start of file
.ci_med_word:
btst 3, %o0 ! check for
bz,pt %ncc, .ci_med_word1 ! word alignment
nop
.ci_med_word0:
lduba [%o0]ASI_USER, %o3 ! load one byte
inc %o0
stb %o3,[%o1] ! store byte
inc %o1
btst 3, %o0
bnz,pt %ncc, .ci_med_word0
dec %o2
!
! Now word aligned and have at least 36 bytes to move
!
.ci_med_word1:
sub %o2, 15, %o2 ! adjust count to allow cc zero test
.ci_med_wmove:
lduwa [%o0]ASI_USER, %o3 ! read word
subcc %o2, 16, %o2 ! reduce count by 16
stw %o3, [%o1] ! write word
add %o0, 4, %o0 ! advance SRC by 4
lduwa [%o0]ASI_USER, %o3 ! repeat for a total of 4 words
add %o0, 4, %o0 ! advance SRC by 4
stw %o3, [%o1 + 4]
add %o1, 16, %o1 ! advance DST by 16
lduwa [%o0]ASI_USER, %o3
add %o0, 4, %o0 ! advance SRC by 4
stw %o3, [%o1 - 8]
lduwa [%o0]ASI_USER, %o3
add %o0, 4, %o0 ! advance SRC by 4
bgt,pt %ncc, .ci_med_wmove ! loop til 15 or fewer bytes left
stw %o3, [%o1 - 4]
addcc %o2, 12, %o2 ! restore count to word offset
ble,pt %ncc, .ci_med_wextra ! check for more words to move
nop
.ci_med_word2:
lduwa [%o0]ASI_USER, %o3 ! read word
subcc %o2, 4, %o2 ! reduce count by 4
stw %o3, [%o1] ! write word
add %o0, 4, %o0 ! advance SRC by 4
bgt,pt %ncc, .ci_med_word2 ! loop til 3 or fewer bytes left
add %o1, 4, %o1 ! advance DST by 4
.ci_med_wextra:
addcc %o2, 3, %o2 ! restore rest of count
bz,pt %ncc, .ci_sm_exit ! if zero, then done
deccc %o2
bz,pt %ncc, .ci_sm_byte
nop
ba,pt %ncc, .ci_sm_half
nop
.align 16
nop ! instruction alignment
! see discussion at start of file
.ci_med_half:
btst 1, %o0 ! check for
bz,pt %ncc, .ci_med_half1 ! half word alignment
nop
lduba [%o0]ASI_USER, %o3 ! load one byte
inc %o0
stb %o3,[%o1] ! store byte
inc %o1
dec %o2
!
! Now half word aligned and have at least 38 bytes to move
!
.ci_med_half1:
sub %o2, 7, %o2 ! adjust count to allow cc zero test
.ci_med_hmove:
lduha [%o0]ASI_USER, %o3 ! read half word
subcc %o2, 8, %o2 ! reduce count by 8
sth %o3, [%o1] ! write half word
add %o0, 2, %o0 ! advance SRC by 2
lduha [%o0]ASI_USER, %o3 ! repeat for a total of 4 halfwords
add %o0, 2, %o0 ! advance SRC by 2
sth %o3, [%o1 + 2]
add %o1, 8, %o1 ! advance DST by 8
lduha [%o0]ASI_USER, %o3
add %o0, 2, %o0 ! advance SRC by 2
sth %o3, [%o1 - 4]
lduha [%o0]ASI_USER, %o3
add %o0, 2, %o0 ! advance SRC by 2
bgt,pt %ncc, .ci_med_hmove ! loop til 7 or fewer bytes left
sth %o3, [%o1 - 2]
addcc %o2, 7, %o2 ! restore count
bz,pt %ncc, .ci_sm_exit
deccc %o2
bz,pt %ncc, .ci_sm_byte
nop
ba,pt %ncc, .ci_sm_half
nop
.sm_copyin_err:
membar #Sync
stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
mov SM_SAVE_SRC, %o0
mov SM_SAVE_DST, %o1
mov SM_SAVE_COUNT, %o2
ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
tst %o3
bz,pt %ncc, 3f ! if not, return error
nop
ldn [%o3 + CP_COPYIN], %o5 ! if handler, invoke it with
jmp %o5 ! original arguments
nop
3:
retl
or %g0, -1, %o0 ! return error value
SET_SIZE(copyin)
/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file. They are provided to allow
 * profiling and DTrace of the portions of the copy code that use
 * the floating point registers.
 * This entry is particularly important as DTrace (at least as of
 * 4/2004) does not support leaf functions.
 */
ENTRY(copyin_more)
.copyin_more:
save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
set .copyin_err, REAL_LOFAULT
/*
* Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
*/
.do_copyin:
set copyio_fault, %l7 ! copyio_fault is lofault val
ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
membar #Sync ! sync error barrier
stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
mov %i0, SAVE_SRC
mov %i1, SAVE_DST
mov %i2, SAVE_COUNT
FP_NOMIGRATE(6, 7)
rd %fprs, %o2 ! check for unused fp
st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
! FPU enabled? If not, enable it.
btst FPRS_FEF, %o2
bz,a,pt %icc, .do_blockcopyin
wr %g0, FPRS_FEF, %fprs
! FPU enabled, but is Q3Q4 dirty? If yes, save them.
btst FPRS_DU, %o2
bz,pn %icc, .do_blockcopyin
nop
BST_FPQ3Q4_TOSTACK(%o2)
.do_blockcopyin:
rd %gsr, %o2
stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
or %l6, FPUSED_FLAG, %l6
andcc DST, VIS_BLOCKSIZE - 1, TMP
mov ASI_USER, %asi
bz,pt %ncc, 2f
neg TMP
add TMP, VIS_BLOCKSIZE, TMP
! TMP = bytes required to align DST on FP_BLOCK boundary
! Using SRC as a tmp here
cmp TMP, 3
bleu,pt %ncc, 1f
sub CNT,TMP,CNT ! adjust main count
sub TMP, 3, TMP ! adjust for end of loop test
.ci_blkalign:
lduba [REALSRC]%asi, SRC ! move 4 bytes per loop iteration
stb SRC, [DST]
subcc TMP, 4, TMP
lduba [REALSRC + 1]%asi, SRC
add REALSRC, 4, REALSRC
stb SRC, [DST + 1]
lduba [REALSRC - 2]%asi, SRC
add DST, 4, DST
stb SRC, [DST - 2]
lduba [REALSRC - 1]%asi, SRC
bgu,pt %ncc, .ci_blkalign
stb SRC, [DST - 1]
addcc TMP, 3, TMP ! restore count adjustment
bz,pt %ncc, 2f ! no bytes left?
nop
1: lduba [REALSRC]%asi, SRC
inc REALSRC
inc DST
deccc TMP
bgu %ncc, 1b
stb SRC, [DST - 1]
2:
andn REALSRC, 0x7, SRC
alignaddr REALSRC, %g0, %g0
! SRC - 8-byte aligned
! DST - 64-byte aligned
prefetcha [SRC]%asi, #one_read
prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #one_read
prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #one_read
prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #one_read
ldda [SRC]%asi, %d32
#if FIRST_PREFETCH > 4
prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
#endif
ldda [SRC + 0x08]%asi, %d34
#if FIRST_PREFETCH > 5
prefetcha [SRC + (5 * VIS_BLOCKSIZE)]%asi, #one_read
#endif
ldda [SRC + 0x10]%asi, %d36
#if FIRST_PREFETCH > 6
prefetcha [SRC + (6 * VIS_BLOCKSIZE)]%asi, #one_read
#endif
faligndata %d32, %d34, %d48
ldda [SRC + 0x18]%asi, %d38
#if FIRST_PREFETCH > 7
prefetcha [SRC + (7 * VIS_BLOCKSIZE)]%asi, #one_read
#endif
faligndata %d34, %d36, %d50
ldda [SRC + 0x20]%asi, %d40
faligndata %d36, %d38, %d52
ldda [SRC + 0x28]%asi, %d42
faligndata %d38, %d40, %d54
ldda [SRC + 0x30]%asi, %d44
faligndata %d40, %d42, %d56
ldda [SRC + 0x38]%asi, %d46
faligndata %d42, %d44, %d58
ldda [SRC + VIS_BLOCKSIZE]%asi, %d32
sub CNT, VIS_BLOCKSIZE, CNT
add SRC, VIS_BLOCKSIZE, SRC
add REALSRC, VIS_BLOCKSIZE, REALSRC
ba,a,pt %ncc, 1f
nop
.align ICACHE_LINE_SIZE
1:
ldda [SRC + 0x08]%asi, %d34
faligndata %d44, %d46, %d60
ldda [SRC + 0x10]%asi, %d36
faligndata %d46, %d32, %d62
stda %d48, [DST]ASI_BLK_P
ldda [SRC + 0x18]%asi, %d38
faligndata %d32, %d34, %d48
ldda [SRC + 0x20]%asi, %d40
faligndata %d34, %d36, %d50
ldda [SRC + 0x28]%asi, %d42
faligndata %d36, %d38, %d52
ldda [SRC + 0x30]%asi, %d44
faligndata %d38, %d40, %d54
ldda [SRC + 0x38]%asi, %d46
faligndata %d40, %d42, %d56
sub CNT, VIS_BLOCKSIZE, CNT
ldda [SRC + VIS_BLOCKSIZE]%asi, %d32
faligndata %d42, %d44, %d58
prefetcha [SRC + ((FIRST_PREFETCH) * VIS_BLOCKSIZE) + 8]%asi, #one_read
add DST, VIS_BLOCKSIZE, DST
prefetcha [SRC + ((SECOND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
add REALSRC, VIS_BLOCKSIZE, REALSRC
cmp CNT, VIS_BLOCKSIZE + 8
bgu,pt %ncc, 1b
add SRC, VIS_BLOCKSIZE, SRC
! the 2: epilogue below is used only if CNT == VIS_BLOCKSIZE
! and REALSRC & 0x7 is 0
cmp CNT, VIS_BLOCKSIZE
bne %ncc, 3f
andcc REALSRC, 0x7, %g0
bz,pt %ncc, 2f
nop
3:
faligndata %d44, %d46, %d60
faligndata %d46, %d32, %d62
stda %d48, [DST]ASI_BLK_P
add DST, VIS_BLOCKSIZE, DST
ba,pt %ncc, 3f
nop
2:
ldda [SRC + 0x08]%asi, %d34
faligndata %d44, %d46, %d60
ldda [SRC + 0x10]%asi, %d36
faligndata %d46, %d32, %d62
stda %d48, [DST]ASI_BLK_P
ldda [SRC + 0x18]%asi, %d38
ldda [SRC + 0x20]%asi, %d40
ldda [SRC + 0x28]%asi, %d42
ldda [SRC + 0x30]%asi, %d44
ldda [SRC + 0x38]%asi, %d46
sub CNT, VIS_BLOCKSIZE, CNT
add DST, VIS_BLOCKSIZE, DST
add SRC, VIS_BLOCKSIZE, SRC
add REALSRC, VIS_BLOCKSIZE, REALSRC
stda %d32, [DST]ASI_BLK_P
add DST, VIS_BLOCKSIZE, DST
ba,a,pt %ncc, 4f
nop
3: tst CNT
bz,a %ncc, 4f
nop
5: lduba [REALSRC]ASI_USER, TMP
inc REALSRC
inc DST
deccc CNT
bgu %ncc, 5b
stb TMP, [DST - 1]
4:
.copyin_exit:
membar #Sync
ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
wr %o2, 0, %gsr
ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
btst FPRS_DU, %o3
bz,pt %icc, 4f
nop
BLD_FPQ3Q4_FROMSTACK(%o2)
ba,pt %ncc, 1f
wr %o3, 0, %fprs ! restore fprs
4:
FZEROQ3Q4
wr %o3, 0, %fprs ! restore fprs
1:
membar #Sync ! sync error barrier
andn %l6, FPUSED_FLAG, %l6
stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
FP_ALLOWMIGRATE(5, 6)
ret
restore %g0, 0, %o0
/*
* We got here because of a fault during copyin
* Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
*/
.copyin_err:
ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
tst %o4
bz,pt %ncc, 2f ! if not, return error
nop
ldn [%o4 + CP_COPYIN], %g2 ! if handler, invoke it with
jmp %g2 ! original arguments
restore %g0, 0, %g0 ! dispose of copy window
2:
ret
restore %g0, -1, %o0 ! return error value
SET_SIZE(copyin_more)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyin(const void *uaddr, void *kaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(xcopyin)
cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
bleu,pt %ncc, .xcopyin_small ! go to small copy
xor %o0, %o1, %o3 ! are src, dst alignable?
btst 7, %o3 !
bz,pt %ncc, .xcopyin_8 ! check for longword alignment
nop
btst 1, %o3 !
bz,pt %ncc, .xcopyin_2 ! check for half-word
nop
sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_1)], %o3
tst %o3
bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .xcopyin_small ! go to small copy
nop
ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
nop
.xcopyin_2:
btst 3, %o3 !
bz,pt %ncc, .xcopyin_4 ! check for word alignment
nop
sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_2)], %o3
tst %o3
bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .xcopyin_small ! go to small copy
nop
ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
nop
.xcopyin_4:
! already checked longword, must be word aligned
sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_4)], %o3
tst %o3
bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .xcopyin_small ! go to small copy
nop
ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
nop
.xcopyin_8:
sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_8)], %o3
tst %o3
bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .xcopyin_small ! go to small copy
nop
ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
nop
.xcopyin_small:
sethi %hi(.sm_xcopyin_err), %o5 ! .sm_xcopyin_err is lofault value
or %o5, %lo(.sm_xcopyin_err), %o5
ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofault
membar #Sync ! sync error barrier
ba,pt %ncc, .sm_do_copyin ! common code
stn %o5, [THREAD_REG + T_LOFAULT]
.xcopyin_more:
save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
sethi %hi(.xcopyin_err), REAL_LOFAULT ! .xcopyin_err is lofault value
ba,pt %ncc, .do_copyin
or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
/*
 * We got here because of a fault during xcopyin.
 * Errno value is in ERRNO.
 */
.xcopyin_err:
ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
tst %o4
bz,pt %ncc, 2f ! if not, return error
nop
ldn [%o4 + CP_XCOPYIN], %g2 ! if handler, invoke it with
jmp %g2 ! original arguments
restore %g0, 0, %g0 ! dispose of copy window
2:
ret
restore ERRNO, 0, %o0 ! return errno value
.sm_xcopyin_err:
membar #Sync
stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
mov SM_SAVE_SRC, %o0
mov SM_SAVE_DST, %o1
mov SM_SAVE_COUNT, %o2
ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
tst %o3
bz,pt %ncc, 3f ! if not, return error
nop
ldn [%o3 + CP_XCOPYIN], %o5 ! if handler, invoke it with
jmp %o5 ! original arguments
nop
3:
retl
or %g1, 0, %o0 ! return errno value
SET_SIZE(xcopyin)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyin_little(const void *uaddr, void *kaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(xcopyin_little)
sethi %hi(.xcopyio_err), %o5
or %o5, %lo(.xcopyio_err), %o5
ldn [THREAD_REG + T_LOFAULT], %o4
membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT]
mov %o4, %o5
subcc %g0, %o2, %o3
add %o0, %o2, %o0
bz,pn %ncc, 2f ! check for zero bytes
sub %o2, 1, %o4
add %o0, %o4, %o0 ! start w/last byte
add %o1, %o2, %o1
lduba [%o0 + %o3]ASI_AIUSL, %o4
1: stb %o4, [%o1 + %o3]
inccc %o3
sub %o0, 2, %o0 ! get next byte
bcc,a,pt %ncc, 1b
lduba [%o0 + %o3]ASI_AIUSL, %o4
2:
membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g0, %o0 ! return (0)
.xcopyio_err:
membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g1, %o0
SET_SIZE(xcopyin_little)
#endif /* lint */
/*
* Copy a block of storage - must not overlap (from + len <= to).
* No fault handler installed (to be called under on_fault())
*/
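/*
 * Typical usage, as a hedged sketch (on_fault()/no_fault() per the
 * DDI; "ljb" is a local label_t):
 *
 *	label_t ljb;
 *	if (on_fault(&ljb)) {
 *		no_fault();
 *		return (EFAULT);
 *	}
 *	copyin_noerr(ufrom, kto, count);
 *	no_fault();
 */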
#if defined(lint)
/* ARGSUSED */
void
copyin_noerr(const void *ufrom, void *kto, size_t count)
{}
#else /* lint */
ENTRY(copyin_noerr)
cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
bleu,pt %ncc, .copyin_ne_small ! go to small copy
xor %o0, %o1, %o3 ! are src, dst alignable?
btst 7, %o3 !
bz,pt %ncc, .copyin_ne_8 ! check for longword alignment
nop
btst 1, %o3 !
bz,pt %ncc, .copyin_ne_2 ! check for half-word
nop
sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_1)], %o3
tst %o3
bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .copyin_ne_small ! go to small copy
nop
ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
nop
.copyin_ne_2:
btst 3, %o3 !
bz,pt %ncc, .copyin_ne_4 ! check for word alignment
nop
sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_2)], %o3
tst %o3
bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .copyin_ne_small ! go to small copy
nop
ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
nop
.copyin_ne_4:
! already checked longword, must be word aligned
sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_4)], %o3
tst %o3
bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .copyin_ne_small ! go to small copy
nop
ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
nop
.copyin_ne_8:
sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_8)], %o3
tst %o3
bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .copyin_ne_small ! go to small copy
nop
ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
nop
.copyin_ne_small:
ldn [THREAD_REG + T_LOFAULT], %o4
tst %o4
bz,pn %ncc, .sm_do_copyin
nop
sethi %hi(.sm_copyio_noerr), %o5
or %o5, %lo(.sm_copyio_noerr), %o5
membar #Sync ! sync error barrier
ba,pt %ncc, .sm_do_copyin
stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault
.copyin_noerr_more:
save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
sethi %hi(.copyio_noerr), REAL_LOFAULT
ba,pt %ncc, .do_copyin
or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
.copyio_noerr:
jmp %l6
restore %g0,0,%g0
.sm_copyio_noerr:
membar #Sync
stn %o4, [THREAD_REG + T_LOFAULT] ! restore t_lofault
jmp %o4
nop
SET_SIZE(copyin_noerr)
#endif /* lint */
/*
* Copy a block of storage - must not overlap (from + len <= to).
* No fault handler installed (to be called under on_fault())
*/
#if defined(lint)
/* ARGSUSED */
void
copyout_noerr(const void *kfrom, void *uto, size_t count)
{}
#else /* lint */
ENTRY(copyout_noerr)
cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
bleu,pt %ncc, .copyout_ne_small ! go to small copy
xor %o0, %o1, %o3 ! are src, dst alignable?
btst 7, %o3 !
bz,pt %ncc, .copyout_ne_8 ! check for longword alignment
nop
btst 1, %o3 !
bz,pt %ncc, .copyout_ne_2 ! check for half-word
nop
sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_1)], %o3
tst %o3
bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .copyout_ne_small ! go to small copy
nop
ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
nop
.copyout_ne_2:
btst 3, %o3 !
bz,pt %ncc, .copyout_ne_4 ! check for word alignment
nop
sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_2)], %o3
tst %o3
bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .copyout_ne_small ! go to small copy
nop
ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
nop
.copyout_ne_4:
! already checked longword, must be word aligned
sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_4)], %o3
tst %o3
bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .copyout_ne_small ! go to small copy
nop
ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
nop
.copyout_ne_8:
sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
ld [%o3 + %lo(hw_copy_limit_8)], %o3
tst %o3
bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
cmp %o2, %o3 ! if length <= limit
bleu,pt %ncc, .copyout_ne_small ! go to small copy
nop
ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
nop
.copyout_ne_small:
ldn [THREAD_REG + T_LOFAULT], %o4
tst %o4
bz,pn %ncc, .sm_do_copyout
nop
sethi %hi(.sm_copyio_noerr), %o5
or %o5, %lo(.sm_copyio_noerr), %o5
membar #Sync ! sync error barrier
ba,pt %ncc, .sm_do_copyout
stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault
.copyout_noerr_more:
save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
sethi %hi(.copyio_noerr), REAL_LOFAULT
ba,pt %ncc, .do_copyout
or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
SET_SIZE(copyout_noerr)
#endif /* lint */
/*
 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
 * at least 256 bytes long, using block stores. If the criteria for using
 * this routine are not met then it punts to bzero (syncing the I$ via
 * rock_sync_icache) and returns 0; otherwise 0 is returned indicating
 * success.
 * Caller is responsible for ensuring use_hw_bzero is true and that
 * kpreempt_disable() has been called.
 */
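/*
 * The entry criteria, as a C-like sketch:
 *
 *	if ((addr & (VIS_BLOCKSIZE - 1)) || len < 256 ||
 *	    (len & (VIS_BLOCKSIZE - 1)))
 *		punt to bzero;
 */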
#ifdef lint
/*ARGSUSED*/
int
hwblkclr(void *addr, size_t len)
{
return(0);
}
#else /* lint */
! %i0 - start address
! %i1 - length of region (multiple of 64)
! %l0 - saved fprs
! %l1 - pointer to saved %d32 block
! %l2 - saved curthread->t_lwp
ENTRY(hwblkclr)
! get another window w/space for one aligned block of saved fpregs
save %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
#ifdef ROCK_CR_6654578
! Is the address 128-byte aligned?
andcc %i0, ST_CACHE_ALIGN, %g0
bnz,pn %ncc, .normal_hwblkclr
nop
! If the length is a multiple of 8K, call page_hwblkclr
set PAGE_MASK, %i3
andcc %i1, %i3, %g0
bnz,pn %ncc, .normal_hwblkclr
nop
mov %i0, %o0
call page_hwblkclr
mov %i1, %o1
ret
restore %g0, 0, %o0 ! I$ sync not required
.normal_hwblkclr:
#endif
! Must be block-aligned
andcc %i0, (VIS_BLOCKSIZE-1), %g0
bnz,pn %ncc, 1f
nop
! ... and must be 256 bytes or more
cmp %i1, 256
blu,pn %ncc, 1f
nop
! ... and length must be a multiple of VIS_BLOCKSIZE
andcc %i1, (VIS_BLOCKSIZE-1), %g0
bz,pn %ncc, 2f
nop
1: ! punt, call bzero and sync the I$ before returning
mov %i0, %o0
call bzero
mov %i1, %o1
! call rock_sync_icache
mov %i0, %o0
call rock_sync_icache
mov %i0, %o0
ret
restore %g0, 0, %o0 ! did not use block operations
2: mov %g0, %l3 ! clear flag to say fp regs not saved
rd %fprs, %l0 ! check for unused fp
! FPU enabled? If not, enable it.
btst FPRS_FEF, %l0
bz,a,pt %icc, 1f
wr %g0, FPRS_FEF, %fprs
! FPU enabled, but is Q3Q4 dirty? If yes, save them.
btst FPRS_DU, %l0
bz,pn %icc, 1f
nop
! save in-use fpregs on stack
membar #Sync
add %fp, STACK_BIAS - 65, %l1
and %l1, -VIS_BLOCKSIZE, %l1
stda %d32, [%l1]ASI_BLK_P
! Set a flag saying fp regs are saved.
mov 1, %l3
! Only this path needs to wait here for the above save to complete
membar #StoreStore|#StoreLoad|#LoadStore
1: wr %g0, ASI_BLK_P, %asi
! Clear block
movxtod %g0, %d32
movxtod %g0, %d34
fsrc1 %d32, %d36
fsrc1 %d32, %d38
fsrc1 %d32, %d40
fsrc1 %d32, %d42
fsrc1 %d32, %d44
fsrc1 %d32, %d46
mov 256, %i3
ba,pt %ncc, .pz_doblock
nop
.pz_blkstart:
! stda %d32, [%i0 + 192]%asi ! in dly slot of branch that got us here
#ifdef ROCK_CR_6654578
prefetcha [%i0 + VIS_COPY_THRESHOLD + 128]%asi, #n_writes
#endif
stda %d32, [%i0 + 128]%asi
#ifdef ROCK_CR_6654578
prefetcha [%i0 + VIS_COPY_THRESHOLD + 64]%asi, #n_writes
#endif
stda %d32, [%i0 + 64]%asi
#ifdef ROCK_CR_6654578
prefetcha [%i0 + VIS_COPY_THRESHOLD + 0]%asi, #n_writes
#endif
stda %d32, [%i0]%asi
.pz_zinst:
add %i0, %i3, %i0
sub %i1, %i3, %i1
.pz_doblock:
#ifdef ROCK_CR_6654578
prefetcha [%i0 + VIS_COPY_THRESHOLD + 192]%asi, #n_writes
#endif
cmp %i1, 256
bgeu,a %ncc, .pz_blkstart
stda %d32, [%i0 + 192]%asi
cmp %i1, 64
blu %ncc, .pz_finish
andn %i1, (64-1), %i3
srl %i3, 4, %i2 ! using blocks, 1 instr / 16 words
set .pz_zinst, %i4
sub %i4, %i2, %i4
jmp %i4
nop
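! The computed jmp above enters the unrolled stda sequence partway
! through: each stda is one 4-byte instruction that clears 64 bytes
! (16 words), so backing up (%i3 / 16) bytes from .pz_zinst executes
! exactly the number of block stores needed for the remaining %i3 bytes.
! (Note: this arithmetic assumes the stda instructions are contiguous;
! the ROCK_CR_6654578 prefetcha instructions interleaved above would
! perturb it when that workaround is built in.)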
.pz_finish:
brz,a %l3, .pz_finished
wr %l0, 0, %fprs ! restore fprs
! restore fpregs from stack
ldda [%l1]ASI_BLK_P, %d32
wr %l0, 0, %fprs ! restore fprs
.pz_finished:
membar #Sync
ret
restore %g0, 0, %o0 ! return (bzero or not)
SET_SIZE(hwblkclr)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
void
hw_pa_bcopy32(uint64_t src, uint64_t dst)
{}
#else /*!lint */
/*
* Copy 32 bytes of data from src (%o0) to dst (%o1)
* using physical addresses.
*/
ENTRY_NP(hw_pa_bcopy32)
rdpr %pstate, %g1
andn %g1, PSTATE_IE, %g2
wrpr %g0, %g2, %pstate
rdpr %pstate, %g0
ldxa [%o0]ASI_MEM, %o2
add %o0, 8, %o0
ldxa [%o0]ASI_MEM, %o3
add %o0, 8, %o0
ldxa [%o0]ASI_MEM, %o4
add %o0, 8, %o0
ldxa [%o0]ASI_MEM, %o5
stxa %o2, [%o1]ASI_MEM
add %o1, 8, %o1
stxa %o3, [%o1]ASI_MEM
add %o1, 8, %o1
stxa %o4, [%o1]ASI_MEM
add %o1, 8, %o1
stxa %o5, [%o1]ASI_MEM
retl
wrpr %g0, %g1, %pstate
SET_SIZE(hw_pa_bcopy32)
#endif /* lint */
/*
* Zero a block of storage.
*
* uzero is used by the kernel to zero a block in user address space.
*/
#if defined(lint)
/* ARGSUSED */
int
kzero(void *addr, size_t count)
{ return(0); }
/* ARGSUSED */
void
uzero(void *addr, size_t count)
{}
#else /* lint */
ENTRY(uzero)
!
! Set a new lo_fault handler only if we came in with one
! already specified.
!
wr %g0, ASI_USER, %asi
ldn [THREAD_REG + T_LOFAULT], %o5
tst %o5
bz,pt %ncc, .do_zero
sethi %hi(.zeroerr), %o2
or %o2, %lo(.zeroerr), %o2
membar #Sync
ba,pt %ncc, .do_zero
stn %o2, [THREAD_REG + T_LOFAULT]
ENTRY(kzero)
!
! Always set a lo_fault handler
!
wr %g0, ASI_P, %asi
ldn [THREAD_REG + T_LOFAULT], %o5
sethi %hi(.zeroerr), %o2
or %o5, LOFAULT_SET, %o5
or %o2, %lo(.zeroerr), %o2
membar #Sync
ba,pt %ncc, .do_zero
stn %o2, [THREAD_REG + T_LOFAULT]
/*
* We got here because of a fault during kzero or if
* uzero or bzero was called with t_lofault non-zero.
* Otherwise we've already run screaming from the room.
* Errno value is in %g1. Note that we're here iff
* we did set t_lofault.
*/
.zeroerr:
!
! Undo asi register setting. Just set it to be the
! kernel default without checking.
!
wr %g0, ASI_P, %asi
!
! We did set t_lofault. It may well have been zero coming in.
!
1:
tst %o5
membar #Sync
bne,pn %ncc, 3f
andncc %o5, LOFAULT_SET, %o5
2:
!
! Old handler was zero. Just return the error.
!
retl ! return
mov %g1, %o0 ! error code from %g1
3:
!
! We're here because %o5 was non-zero. It was non-zero
! because either LOFAULT_SET was present, a previous fault
! handler was present or both. In all cases we need to reset
! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
! before we either simply return the error or we invoke the
! previously specified handler.
!
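!
! In C-like terms (a sketch of the logic below):
!	if (o5 == 0)
!		return (errno);		/* 2: above */
!	o5 &= ~LOFAULT_SET;
!	t_lofault = o5;
!	if (o5 == 0)
!		return (errno);
!	goto *o5;			/* chain to saved handler */
!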
be %ncc, 2b
stn %o5, [THREAD_REG + T_LOFAULT]
jmp %o5 ! goto real handler
nop
SET_SIZE(kzero)
SET_SIZE(uzero)
#endif /* lint */
/*
* Zero a block of storage.
*/
#if defined(lint)
/* ARGSUSED */
void
bzero(void *addr, size_t count)
{}
#else /* lint */
ENTRY(bzero)
wr %g0, ASI_P, %asi
ldn [THREAD_REG + T_LOFAULT], %o5 ! save old vector
tst %o5
bz,pt %ncc, .do_zero
sethi %hi(.zeroerr), %o2
or %o2, %lo(.zeroerr), %o2
membar #Sync ! sync error barrier
stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector
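!
! Size dispatch (sketch): counts below 7 take the byte loop, counts
! below 15 the word path; anything larger is first aligned to an
! 8-byte boundary, then cleared 64 bytes at a time when possible.
!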
.do_zero:
cmp %o1, 7
blu,pn %ncc, .byteclr
nop
cmp %o1, 15
blu,pn %ncc, .wdalign
nop
andcc %o0, 7, %o3 ! is addr aligned on an 8-byte boundary
bz,pt %ncc, .blkalign ! already double aligned
sub %o3, 8, %o3 ! -(bytes till double aligned)
add %o1, %o3, %o1 ! update o1 with new count
1:
stba %g0, [%o0]%asi
inccc %o3
bl,pt %ncc, 1b
inc %o0
! Now address is double aligned
.blkalign:
cmp %o1, 0x80 ! check if there are 128 bytes to set
blu,pn %ncc, .bzero_small
mov %o1, %o3
andcc %o0, 0x3f, %o3 ! is block aligned?
bz,pt %ncc, .bzero_blk
sub %o3, 0x40, %o3 ! -(bytes till block aligned)
add %o1, %o3, %o1 ! o1 is the remainder
! Clear -(%o3) bytes till block aligned
1:
stxa %g0, [%o0]%asi
addcc %o3, 8, %o3
bl,pt %ncc, 1b
add %o0, 8, %o0
.bzero_blk:
and %o1, 0x3f, %o3 ! calc bytes left after blk clear
andn %o1, 0x3f, %o4 ! calc size of blocks in bytes
cmp %o4, 0x100 ! 256 bytes or more
blu,pn %ncc, 3f
nop
2:
stxa %g0, [%o0+0x0]%asi
stxa %g0, [%o0+0x40]%asi
stxa %g0, [%o0+0x80]%asi
stxa %g0, [%o0+0xc0]%asi
stxa %g0, [%o0+0x8]%asi
stxa %g0, [%o0+0x10]%asi
stxa %g0, [%o0+0x18]%asi
stxa %g0, [%o0+0x20]%asi
stxa %g0, [%o0+0x28]%asi
stxa %g0, [%o0+0x30]%asi
stxa %g0, [%o0+0x38]%asi
stxa %g0, [%o0+0x48]%asi
stxa %g0, [%o0+0x50]%asi
stxa %g0, [%o0+0x58]%asi
stxa %g0, [%o0+0x60]%asi
stxa %g0, [%o0+0x68]%asi
stxa %g0, [%o0+0x70]%asi
stxa %g0, [%o0+0x78]%asi
stxa %g0, [%o0+0x88]%asi
stxa %g0, [%o0+0x90]%asi
stxa %g0, [%o0+0x98]%asi
stxa %g0, [%o0+0xa0]%asi
stxa %g0, [%o0+0xa8]%asi
stxa %g0, [%o0+0xb0]%asi
stxa %g0, [%o0+0xb8]%asi
stxa %g0, [%o0+0xc8]%asi
stxa %g0, [%o0+0xd0]%asi
stxa %g0, [%o0+0xd8]%asi
stxa %g0, [%o0+0xe0]%asi
stxa %g0, [%o0+0xe8]%asi
stxa %g0, [%o0+0xf0]%asi
stxa %g0, [%o0+0xf8]%asi
sub %o4, 0x100, %o4
cmp %o4, 0x100
bgu,pt %ncc, 2b
add %o0, 0x100, %o0
3:
! ... check if 64 bytes to set
cmp %o4, 0x40
blu %ncc, .bzero_blk_done
nop
4:
stxa %g0, [%o0+0x0]%asi
stxa %g0, [%o0+0x8]%asi
stxa %g0, [%o0+0x10]%asi
stxa %g0, [%o0+0x18]%asi
stxa %g0, [%o0+0x20]%asi
stxa %g0, [%o0+0x28]%asi
stxa %g0, [%o0+0x30]%asi
stxa %g0, [%o0+0x38]%asi
subcc %o4, 0x40, %o4
bgu,pt %ncc, 3b
add %o0, 0x40, %o0
.bzero_blk_done:
membar #Sync
.bzero_small:
! Set the remaining doubles
subcc %o3, 8, %o3 ! Can we store any doubles?
blu,pn %ncc, .byteclr
and %o1, 7, %o1 ! calc bytes left after doubles
.dbclr:
stxa %g0, [%o0]%asi ! Clear the doubles
subcc %o3, 8, %o3
bgeu,pt %ncc, .dbclr
add %o0, 8, %o0
ba .byteclr
nop
.wdalign:
andcc %o0, 3, %o3 ! is addr aligned on a word boundary
bz,pn %ncc, .wdclr
andn %o1, 3, %o3 ! create word sized count in %o3
dec %o1 ! decrement count
stba %g0, [%o0]%asi ! clear a byte
ba .wdalign
inc %o0 ! next byte
.wdclr:
sta %g0, [%o0]%asi ! 4-byte clearing loop
subcc %o3, 4, %o3
bnz,pt %ncc, .wdclr
inc 4, %o0
and %o1, 3, %o1 ! leftover count, if any
.byteclr:
! Set the leftover bytes
brz %o1, .bzero_exit
nop
7:
deccc %o1 ! byte clearing loop
stba %g0, [%o0]%asi
bgu,pt %ncc, 7b
inc %o0
.bzero_exit:
!
! We're just concerned with whether t_lofault was set
! when we came in. We end up here from either kzero()
! or bzero(). kzero() *always* sets a lofault handler.
! It ors LOFAULT_SET into %o5 to indicate it has done
! this even if the value of %o5 is otherwise zero.
! bzero() sets a lofault handler *only* if one was
! previously set. Accordingly we need to examine
! %o5 and if it is non-zero be sure to clear LOFAULT_SET
! before resetting the error handler.
!
tst %o5
bz %ncc, 1f
andn %o5, LOFAULT_SET, %o5
membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1:
retl
clr %o0 ! return (0)
SET_SIZE(bzero)
#endif /* lint */
#ifdef ROCK_CR_6654578
/*
 * This code tries to maximize bandwidth by being clever about accessing
 * the two cache lines that are BUDDY PAIRS in the L3 cache. When line 0
 * of a pair is accessed, it will take hundreds of cycles to get the line
 * from memory, which brings in a 128-byte line to L3. Until the line is
 * installed in L3, any other access to that line (such as buddy line 1)
 * is blocked. For best throughput, we access many lines that are the
 * first of their buddy pairs, and only after many such accesses have
 * been made, we access the sequence of second buddy pair lines. Hopefully
 * the second set of accesses comes after the L3 lines are installed, so
 * the accesses hit in L3 without being delayed. This should yield better
 * throughput.
 * To keep this code simple, we assume the addresses given are aligned at
 * least on a 128-byte boundary, and the length is assumed to be a
 * multiple of 8K bytes.
 */
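/*
 * The access pattern, as a rough C sketch (block_store_64 is an
 * illustrative name for a 64-byte ASI_BLK_P store; one 4K span is
 * cleared per outer pass):
 *
 *	for (passes = len >> 12; passes != 0; passes--, addr += 4096)
 *		for (half = 0; half < 2; half++)	// buddy 0, then 1
 *			for (line = 0; line < 32; line++)
 *				block_store_64(addr + half * 64 + line * 128);
 */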
#ifdef lint
/*ARGSUSED*/
int
page_hwblkclr(void *addr, size_t len)
{
return(0);
}
#else /* lint */
ENTRY(page_hwblkclr)
save %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
! %i0 address
! %i1 len
rd %fprs, %l0
mov %g0, %l2 ! clear flag to say fp regs not saved
! FPU enabled? If not, enable it.
btst FPRS_FEF, %l0
bz,a,pt %icc, 1f
wr %g0, FPRS_FEF, %fprs
! FPU enabled, but is Q3Q4 dirty? If yes, save them.
btst FPRS_DU, %l0
bz,pn %icc, 1f
nop
! save in-use fpregs on stack
add %fp, STACK_BIAS - 65, %l1 ! get stack frame for fp regs
and %l1, -VIS_BLOCKSIZE, %l1 ! block align frame
stda %d32, [%l1]ASI_BLK_P ! %l1 = addr of saved fp regs
! Set a flag saying fp regs are saved.
mov 1, %l2
! enable fp
1: membar #StoreStore|#StoreLoad|#LoadStore
movxtod %g0, %d32
movxtod %g0, %d34
movxtod %g0, %d36
movxtod %g0, %d38
movxtod %g0, %d40
movxtod %g0, %d42
movxtod %g0, %d44
movxtod %g0, %d46
ba myloop2
srl %i1,12,%i1
.align 64
myloop2:
mov 2,%l5
mov %i0, %l3
buddyloop:
set 4096, %l4
add %i0, %l4, %l4
prefetcha [%l4]ASI_BLK_P, #n_writes
mov 32,%l6
innerloop:
subcc %l6,1,%l6
stda %d32,[%i0]ASI_BLK_P
bg,pt %icc,innerloop
add %i0, 128, %i0
subcc %l5,1,%l5
add %l3, 64, %i0
bg,pt %icc,buddyloop
nop
subcc %i1,1,%i1
add %i0, 4032, %i0
bg,pt %icc,myloop2
nop
brz,a %l2, 2f
wr %l0, 0, %fprs ! restore fprs
! restore fpregs from stack
ldda [%l1]ASI_BLK_P, %d32
wr %l0, 0, %fprs ! restore fprs
2:
membar #Sync
ret
restore %g0, 0, %o0
SET_SIZE(page_hwblkclr)
#endif /* lint */
#endif /* ROCK_CR_6654578 */
#if defined(lint)
int use_hw_bcopy = 1;
int use_hw_bzero = 1;
uint_t hw_copy_limit_1 = 0x100;
uint_t hw_copy_limit_2 = 0x200;
uint_t hw_copy_limit_4 = 0x400;
uint_t hw_copy_limit_8 = 0x400;
#else /* !lint */
DGDEF(use_hw_bcopy)
.word 1
DGDEF(use_hw_bzero)
.word 1
DGDEF(hw_copy_limit_1)
.word 0x100
DGDEF(hw_copy_limit_2)
.word 0x200
DGDEF(hw_copy_limit_4)
.word 0x400
DGDEF(hw_copy_limit_8)
.word 0x400
.align 64
.section ".text"
#endif /* !lint */