/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>
#include <sys/machasi.h>
#include <sys/niagaraasi.h>
#if !defined(lint)
#include "assym.h"
#endif /* lint */
/*
* Pseudo-code to aid in understanding the control flow of the
* bcopy/kcopy routine.
*
* ! WARNING : <Register usage convention>
 * ! In kcopy(), %o5 holds the previous error handler and the
 * ! LOFAULT_SET flag (low bits).  In bcopy(), %o5 is null.
 * ! %o5 is not available for any other use.
*
* On entry:
* ! Determine whether to use the FP register version or the
 * ! leaf routine version depending on the size of the copy.
* ! Set up error handling accordingly.
* ! The transition point depends on FP_COPY
* ! For both versions %o5 is reserved
*
* kcopy():
* if(length > FP_COPY)
* go to regular_kcopy
*
* ! Setup_leaf_rtn_error_handler
* %o5 = curthread->t_lofault; ! save existing handler in %o5
* %o5 |= LOFAULT_SET; ! ORed with LOFAULT_SET flag
* curthread->t_lofault = .sm_copyerr;
* goto small_bcopy();
*
* regular_kcopy:
* save_registers()
* %o5 = curthread->t_lofault; ! save existing handler in %o5
* %o5 |= LOFAULT_SET; ! ORed with LOFAULT_SET flag
* curthread->t_lofault = .copyerr;
* goto do_copy();
*
* bcopy():
* if(length > FP_COPY)
* go to regular_bcopy
*
* ! Setup_leaf_rtn_error_handler
* %o5 = curthread->t_lofault; ! save existing handler in %o5
* curthread->t_lofault = .sm_copyerr;
* goto small_bcopy();
*
* regular_bcopy:
* %o5 = curthread->t_lofault; ! save existing handler in %o5
* curthread->t_lofault = .copyerr;
* goto do_copy();
*
* small_bcopy:
* ! handle copies smaller than FP_COPY
* restore t_lofault handler
* exit
*
* do_copy:
* ! handle copies larger than FP_COPY
* save fp_regs
* blockcopy;
* restore fp_regs
* restore t_lofault handler if came from kcopy();
*
*
* In leaf lofault handler:
* curthread->t_lofault = (%o5 & ~LOFAULT_SET); ! restore old t_lofault
* return (errno)
*
* In lofault handler:
* curthread->t_lofault = (%o5 & ~LOFAULT_SET); ! restore old t_lofault
* restore fp_regs
* return (errno)
*
*
*
* For all of bcopy/copyin/copyout the copy logic is specialized according
 * to how the src and dst are aligned and how much data needs to be moved.
* The following comments apply to the N2/RF code (#if !defined(NIAGARA_IMPL))
*
* N2/RF Flow :
*
* if (count < FP_COPY) { (584 bytes)
* set small fault handler (no register window save/restore)
* if count < SHORTCOPY (7 bytes)
* copy bytes; go to short_exit
* else
* determine dst alignment, move minimum bytes/halfwords to
* get dst aligned on long word boundary
* if( src is on long word boundary ) {
* medlong: src/dst aligned on 8 bytes
* copy with ldx/stx in 4-way unrolled loop;
* copy final 0-31 bytes; go to short_exit
* } else { src/dst not aligned on 8 bytes
* if src is word aligned, ld/st words in 32-byte chunks
* if src is half word aligned, ld half, ld word, ld half; pack
* into long word, store long words in 32-byte chunks
* if src is byte aligned, ld byte,half,word parts; pack into long
* word, store long words in 32-byte chunks
* move final 0-31 bytes according to src alignment; go to short_exit
* short_exit:
* restore trap handler if needed, retl
* else { More than FP_COPY bytes
* set fault handler
* disable kernel preemption
* save registers, save FP registers if in use
* move bytes to align destination register on long word boundary
* if(src is on long word boundary) { src/dst aligned on 8 bytes
* align dst on 64 byte boundary; use 8-way test for each of 8 possible
* src alignments relative to a 64 byte boundary to select the
* 16-way unrolled loop (128 bytes) to use for
* block load, fmovd, block-init-store, block-store, fmovd operations
* then go to remain_stuff.
* remain_stuff: move remaining bytes. go to long_exit
* } else {
* setup alignaddr for faligndata instructions
* align dst on 64 byte boundary; use 8-way test for each of 8 possible
* src alignments to nearest long word relative to 64 byte boundary to
* select the 8-way unrolled loop (64 bytes) to use for
* block load, falign, fmovd, block-store loop
* (only use block-init-store when src/dst on 8 byte boundaries.)
* goto unalign_done.
* unalign_done:
* move remaining bytes for unaligned cases. go to long_exit
* long_exit:
* restore %gsr, FP regs (either from stack or set to zero),
* restore trap handler, check for kernel preemption request,
* handle if needed, ret.
* }
*
* Other platforms include hw_bcopy_limit_[1248] to control the exact
* point where the FP register code is used. On those platforms, the
* FP register code did not leave data in L2 cache, potentially affecting
* performance more than the gain/loss from the algorithm difference.
* For N2/RF, block store places data in the L2 cache, so use or non-use
* of the FP registers has no effect on L2 cache behavior.
* The cost for testing hw_bcopy_limit_* according to different
* alignments exceeds 50 cycles for all cases, even when hw_bcopy_limits
* were not used. That cost was judged too high relative to the benefits,
* so the hw_bcopy_limit option is omitted from this code.
*/
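/*
 * A minimal C sketch (illustrative only, not part of the implementation)
 * of how the low bits of the saved t_lofault value in %o5 encode the
 * state that the error/exit paths test.  The flag names mirror the
 * defines below; "saved" stands in for %o5.
 *
 *	#include <stdint.h>
 *
 *	#define FPUSED_FLAG	1	// FP regs in use; restore on exit
 *	#define LOFAULT_SET	2	// entered with a handler already set
 *	#define COPY_FLAGS	(FPUSED_FLAG | LOFAULT_SET)
 *
 *	static uintptr_t saved;		// stands in for %o5
 *
 *	static void
 *	save_handler(uintptr_t old_lofault, int trampoline, int fp_used)
 *	{
 *		// trampoline != 0: bcopy was entered with a handler already
 *		// installed, so a fault must be forwarded to it (LOFAULT_SET)
 *		saved = old_lofault;
 *		if (trampoline)
 *			saved |= LOFAULT_SET;
 *		if (fp_used)
 *			saved |= FPUSED_FLAG;
 *	}
 *
 *	static uintptr_t
 *	old_handler(void)		// value written back to t_lofault
 *	{
 *		return (saved & ~(uintptr_t)COPY_FLAGS);
 *	}
 */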
/*
 * For counts less than or equal to this number of bytes, we always copy
 * byte-for-byte.
*/
#define SMALL_LIMIT 7
/*
* LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault
* handler was set
*/
#define LOFAULT_SET 2
/*
 * This macro aligns data for the unaligned source cases.
 * data1, data2 and data3 are merged into data1 and data2;
 * data3 is preserved for the next merge.
*/
#define ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp) \
sllx data1, lshift, data1 ;\
srlx data2, rshift, tmp ;\
or data1, tmp, data1 ;\
sllx data2, lshift, data2 ;\
srlx data3, rshift, tmp ;\
or data2, tmp, data2
/*
 * This macro aligns the data by merging data1 and data2
 * to form a double word.
*/
#define ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp) \
sllx data1, lshift, data1 ;\
srlx data2, rshift, tmp ;\
or data1, tmp, data1
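/*
 * A hedged C sketch of what the two alignment macros above compute,
 * assuming lshift + rshift == 64 with both shifts nonzero (as in the
 * callers below):
 *
 *	#include <stdint.h>
 *
 *	// ALIGN_DATA: merge three source doublewords into two aligned
 *	// destination doublewords; data3 carries over to the next merge.
 *	static void
 *	align_data(uint64_t *data1, uint64_t *data2, uint64_t data3,
 *	    unsigned lshift, unsigned rshift)
 *	{
 *		*data1 = (*data1 << lshift) | (*data2 >> rshift);
 *		*data2 = (*data2 << lshift) | (data3 >> rshift);
 *	}
 *
 *	// ALIGN_DATA_EW: merge two doublewords into one.
 *	static uint64_t
 *	align_data_ew(uint64_t data1, uint64_t data2,
 *	    unsigned lshift, unsigned rshift)
 *	{
 *		return ((data1 << lshift) | (data2 >> rshift));
 *	}
 */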
#if !defined(NIAGARA_IMPL)
/*
* Flags set in the lower bits of the t_lofault address:
* FPUSED_FLAG: The FP registers were in use and must be restored
* LOFAULT_SET: Set for bcopy calls, cleared for kcopy calls
* COPY_FLAGS: Both of the above
*
* Other flags:
* KPREEMPT_FLAG: kpreempt needs to be called
*/
#define FPUSED_FLAG 1
#define LOFAULT_SET 2
#define COPY_FLAGS (FPUSED_FLAG | LOFAULT_SET)
#define KPREEMPT_FLAG 4
#define ALIGN_OFF_1_7 \
faligndata %d0, %d2, %d48 ;\
faligndata %d2, %d4, %d50 ;\
faligndata %d4, %d6, %d52 ;\
faligndata %d6, %d8, %d54 ;\
faligndata %d8, %d10, %d56 ;\
faligndata %d10, %d12, %d58 ;\
faligndata %d12, %d14, %d60 ;\
faligndata %d14, %d16, %d62
#define ALIGN_OFF_8_15 \
faligndata %d2, %d4, %d48 ;\
faligndata %d4, %d6, %d50 ;\
faligndata %d6, %d8, %d52 ;\
faligndata %d8, %d10, %d54 ;\
faligndata %d10, %d12, %d56 ;\
faligndata %d12, %d14, %d58 ;\
faligndata %d14, %d16, %d60 ;\
faligndata %d16, %d18, %d62
#define ALIGN_OFF_16_23 \
faligndata %d4, %d6, %d48 ;\
faligndata %d6, %d8, %d50 ;\
faligndata %d8, %d10, %d52 ;\
faligndata %d10, %d12, %d54 ;\
faligndata %d12, %d14, %d56 ;\
faligndata %d14, %d16, %d58 ;\
faligndata %d16, %d18, %d60 ;\
faligndata %d18, %d20, %d62
#define ALIGN_OFF_24_31 \
faligndata %d6, %d8, %d48 ;\
faligndata %d8, %d10, %d50 ;\
faligndata %d10, %d12, %d52 ;\
faligndata %d12, %d14, %d54 ;\
faligndata %d14, %d16, %d56 ;\
faligndata %d16, %d18, %d58 ;\
faligndata %d18, %d20, %d60 ;\
faligndata %d20, %d22, %d62
#define ALIGN_OFF_32_39 \
faligndata %d8, %d10, %d48 ;\
faligndata %d10, %d12, %d50 ;\
faligndata %d12, %d14, %d52 ;\
faligndata %d14, %d16, %d54 ;\
faligndata %d16, %d18, %d56 ;\
faligndata %d18, %d20, %d58 ;\
faligndata %d20, %d22, %d60 ;\
faligndata %d22, %d24, %d62
#define ALIGN_OFF_40_47 \
faligndata %d10, %d12, %d48 ;\
faligndata %d12, %d14, %d50 ;\
faligndata %d14, %d16, %d52 ;\
faligndata %d16, %d18, %d54 ;\
faligndata %d18, %d20, %d56 ;\
faligndata %d20, %d22, %d58 ;\
faligndata %d22, %d24, %d60 ;\
faligndata %d24, %d26, %d62
#define ALIGN_OFF_48_55 \
faligndata %d12, %d14, %d48 ;\
faligndata %d14, %d16, %d50 ;\
faligndata %d16, %d18, %d52 ;\
faligndata %d18, %d20, %d54 ;\
faligndata %d20, %d22, %d56 ;\
faligndata %d22, %d24, %d58 ;\
faligndata %d24, %d26, %d60 ;\
faligndata %d26, %d28, %d62
#define ALIGN_OFF_56_63 \
faligndata %d14, %d16, %d48 ;\
faligndata %d16, %d18, %d50 ;\
faligndata %d18, %d20, %d52 ;\
faligndata %d20, %d22, %d54 ;\
faligndata %d22, %d24, %d56 ;\
faligndata %d24, %d26, %d58 ;\
faligndata %d26, %d28, %d60 ;\
faligndata %d28, %d30, %d62
/*
* FP_COPY indicates the minimum number of bytes needed
* to justify using FP/VIS-accelerated memory operations.
* The FPBLK code assumes a minimum number of bytes are available
* to be moved on entry. Check that code carefully before
* reducing FP_COPY below 256.
*/
#define FP_COPY 584
#define SHORTCOPY 7
#define ASI_STBI_P ASI_BLK_INIT_ST_QUAD_LDD_P
#define ASI_STBI_AIUS ASI_BLK_INIT_QUAD_LDD_AIUS
#define CACHE_LINE 64
#define VIS_BLOCKSIZE 64
/*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and two 64-bit temp locations.
 * All copy functions use three quadrants of fp registers; to ensure a
 * block-aligned three-block buffer in which to save them, we must
 * reserve four blocks on the stack.
*
* _______________________________________ <-- %fp + STACK_BIAS
* | We may need to preserve 3 quadrants |
* | of fp regs, but since we do so with |
* | BST/BLD we need room in which to |
* | align to VIS_BLOCKSIZE bytes. So |
* | this area is 4 * VIS_BLOCKSIZE. | <-- - SAVED_FPREGS_OFFSET
* |-------------------------------------|
* | 8 bytes to save %fprs | <-- - SAVED_FPRS_OFFSET
* |-------------------------------------|
* | 8 bytes to save %gsr | <-- - SAVED_GSR_OFFSET
* ---------------------------------------
*/
#define HWCOPYFRAMESIZE ((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8))
#define SAVED_FPREGS_OFFSET (VIS_BLOCKSIZE * 4)
#define SAVED_FPREGS_ADJUST ((VIS_BLOCKSIZE * 3) + 1)
#define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 8)
#define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 8)
/*
 * In FP copies, if we do not have preserved data to restore over
 * the fp regs we used, then we must zero those regs to avoid
 * exposing portions of the data to later threads (data security).
*/
#define FZERO \
fzero %f0 ;\
fzero %f2 ;\
faddd %f0, %f2, %f4 ;\
fmuld %f0, %f2, %f6 ;\
faddd %f0, %f2, %f8 ;\
fmuld %f0, %f2, %f10 ;\
faddd %f0, %f2, %f12 ;\
fmuld %f0, %f2, %f14 ;\
faddd %f0, %f2, %f16 ;\
fmuld %f0, %f2, %f18 ;\
faddd %f0, %f2, %f20 ;\
fmuld %f0, %f2, %f22 ;\
faddd %f0, %f2, %f24 ;\
fmuld %f0, %f2, %f26 ;\
faddd %f0, %f2, %f28 ;\
fmuld %f0, %f2, %f30 ;\
faddd %f0, %f2, %f48 ;\
fmuld %f0, %f2, %f50 ;\
faddd %f0, %f2, %f52 ;\
fmuld %f0, %f2, %f54 ;\
faddd %f0, %f2, %f56 ;\
fmuld %f0, %f2, %f58 ;\
faddd %f0, %f2, %f60 ;\
fmuld %f0, %f2, %f62
#if !defined(lint)
/*
* Macros to save and restore fp registers to/from the stack.
* Used to save and restore in-use fp registers when we want to use FP.
*/
#define BST_FP_TOSTACK(tmp1) \
/* membar #Sync */ ;\
add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
stda %f0, [tmp1]ASI_BLK_P ;\
add tmp1, VIS_BLOCKSIZE, tmp1 ;\
stda %f16, [tmp1]ASI_BLK_P ;\
add tmp1, VIS_BLOCKSIZE, tmp1 ;\
stda %f48, [tmp1]ASI_BLK_P ;\
membar #Sync
#define BLD_FP_FROMSTACK(tmp1) \
/* membar #Sync - provided at copy completion */ ;\
add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
ldda [tmp1]ASI_BLK_P, %f0 ;\
add tmp1, VIS_BLOCKSIZE, tmp1 ;\
ldda [tmp1]ASI_BLK_P, %f16 ;\
add tmp1, VIS_BLOCKSIZE, tmp1 ;\
ldda [tmp1]ASI_BLK_P, %f48 ;\
membar #Sync
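/*
 * A hedged C sketch of the pointer arithmetic the two macros above rely
 * on: carve a VIS_BLOCKSIZE-aligned, three-block save area out of the
 * four blocks reserved in the frame.  "fp" and "bias" stand in for %fp
 * and STACK_BIAS.
 *
 *	#include <stdint.h>
 *
 *	static char *
 *	fp_save_area(char *fp, long bias)
 *	{
 *		uintptr_t p = (uintptr_t)fp + bias - SAVED_FPREGS_ADJUST;
 *
 *		return ((char *)(p & ~(uintptr_t)(VIS_BLOCKSIZE - 1)));
 *	}
 *
 * The three block stores/loads (%d0-%d14, %d16-%d30, %d48-%d62) then use
 * the returned address, address + VIS_BLOCKSIZE and
 * address + 2 * VIS_BLOCKSIZE.
 */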
#endif /* lint */
#endif /* NIAGARA_IMPL */
/*
* Copy a block of storage, returning an error code if `from' or
* `to' takes a kernel pagefault which cannot be resolved.
* Returns errno value on pagefault error, 0 if all ok
*/
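/*
 * A hedged usage sketch: kcopy() is the fault-tolerant flavor, so a
 * caller checks the return value rather than installing its own lofault
 * handler.  Buffer names are illustrative.
 *
 *	int err;
 *
 *	err = kcopy(src, dst, len);
 *	if (err != 0)
 *		return (err);	// errno-style value from the fault path
 */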
#if defined(lint)
/* ARGSUSED */
int
kcopy(const void *from, void *to, size_t count)
{ return(0); }
#else /* lint */
.seg ".text"
.align 4
ENTRY(kcopy)
#if !defined(NIAGARA_IMPL)
cmp %o2, FP_COPY ! check for small copy/leaf case
bgt,pt %ncc, .kcopy_more !
nop
.kcopy_small: ! setup error handler
sethi %hi(.sm_copyerr), %o4
or %o4, %lo(.sm_copyerr), %o4 ! .sm_copyerr is lofault value
ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
! Note that we carefully do *not* flag the setting of
! t_lofault.
membar #Sync ! sync error barrier
b .sm_do_copy ! common code
stn %o4, [THREAD_REG + T_LOFAULT] ! set t_lofault
.kcopy_more:
save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
sethi %hi(.copyerr), %l7 ! copyerr is lofault value
or %l7, %lo(.copyerr), %l7
ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
! Note that we carefully do *not* flag the setting of
! t_lofault.
membar #Sync ! sync error barrier
b .do_copy ! common code
stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
/*
 * We got here because of a fault during a small kcopy, or during a
 * small bcopy if a fault handler existed when bcopy was called.
 * No floating point registers are used by the small copies.
 * Small copies are from a leaf routine.
 * Errno value is in %g1.
*/
.sm_copyerr:
! The kcopy will always set a t_lofault handler. If it fires,
! we're expected to just return the error code and not to
! invoke any existing error handler. As far as bcopy is concerned,
! we only set t_lofault if there was an existing lofault handler.
! In that case we're expected to invoke the previously existing
! handler after resetting the t_lofault value.
btst LOFAULT_SET, %o5
membar #Sync ! sync error barrier
andn %o5, LOFAULT_SET, %o5 ! clear fault flag
bnz,pn %ncc, 3f
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g1, %o0
3:
! We're here via bcopy. There must have been an error handler
! in place otherwise we would have died a nasty death already.
jmp %o5 ! goto real handler
mov %g0, %o0
/*
* end of .sm_copyerr
*/
/*
 * We got here because of a fault during kcopy, or during bcopy if a
 * fault handler existed when bcopy was called.
 * The stack and fp registers need to be restored.
 * Errno value is in %g1.
*/
.copyerr:
sethi %hi(.copyerr2), %l1
or %l1, %lo(.copyerr2), %l1
membar #Sync ! sync error barrier
stn %l1, [THREAD_REG + T_LOFAULT] ! set t_lofault
btst FPUSED_FLAG, %o5
bz,pt %xcc, 1f
and %o5, LOFAULT_SET, %l1 ! copy flag to %l1
membar #Sync ! sync error barrier
wr %l5, 0, %gsr
btst FPRS_FEF, %g5
bz,pt %icc, 4f
nop
! restore fpregs from stack
BLD_FP_FROMSTACK(%o2)
ba,pt %ncc, 2f
wr %g5, 0, %fprs ! restore fprs
4:
FZERO
wr %g5, 0, %fprs ! restore fprs
2:
ldn [THREAD_REG + T_LWP], %o2
brnz,pt %o2, 1f
nop
ldsb [THREAD_REG + T_PREEMPT], %l0
deccc %l0
bnz,pn %ncc, 1f
stb %l0, [THREAD_REG + T_PREEMPT]
! Check for a kernel preemption request
ldn [THREAD_REG + T_CPU], %l0
ldub [%l0 + CPU_KPRUNRUN], %l0
brnz,a,pt %l0, 1f ! Need to call kpreempt?
or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag
! The kcopy will always set a t_lofault handler. If it fires,
! we're expected to just return the error code and not to
! invoke any existing error handler. As far as bcopy is concerned,
! we only set t_lofault if there was an existing lofault handler.
! In that case we're expected to invoke the previously existing
! handler after resetting the t_lofault value.
1:
andn %o5, COPY_FLAGS, %o5 ! remove flags from lofault address
membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
! call kpreempt if necessary
btst KPREEMPT_FLAG, %l1
bz,pt %icc, 2f
nop
call kpreempt
rdpr %pil, %o0 ! pass %pil
2:
btst LOFAULT_SET, %l1
bnz,pn %ncc, 3f
nop
ret
restore %g1, 0, %o0
3:
! We're here via bcopy. There must have been an error handler
! in place otherwise we would have died a nasty death already.
jmp %o5 ! goto real handler
restore %g0, 0, %o0 ! dispose of copy window
/*
* We got here because of a fault in .copyerr. We can't safely restore fp
* state, so we panic.
*/
fp_panic_msg:
.asciz "Unable to restore fp state after copy operation"
.align 4
.copyerr2:
set fp_panic_msg, %o0
call panic
nop
/*
* end of .copyerr
*/
#else /* NIAGARA_IMPL */
save %sp, -SA(MINFRAME), %sp
set .copyerr, %l7 ! copyerr is lofault value
ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
or %o5, LOFAULT_SET, %o5
membar #Sync ! sync error barrier
b .do_copy ! common code
stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
/*
* We got here because of a fault during kcopy.
* Errno value is in %g1.
*/
.copyerr:
! The kcopy() *always* sets a t_lofault handler and it ORs LOFAULT_SET
! into %o5 to indicate it has set t_lofault handler. Need to clear
! LOFAULT_SET flag before restoring the error handler.
andn %o5, LOFAULT_SET, %o5
membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
ret
restore %g1, 0, %o0
#endif /* NIAGARA_IMPL */
SET_SIZE(kcopy)
#endif /* lint */
/*
* Copy a block of storage - must not overlap (from + len <= to).
*/
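/*
 * A hedged usage sketch: bcopy() returns nothing and, per the comment
 * above, the regions must not overlap (callers that need overlap handling
 * typically use ovbcopy() instead).
 *
 *	bcopy(src, dst, len);	// src and dst must not overlap
 */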
#if defined(lint)
/* ARGSUSED */
void
bcopy(const void *from, void *to, size_t count)
{}
#else /* lint */
ENTRY(bcopy)
#if !defined(NIAGARA_IMPL)
cmp %o2, FP_COPY ! check for small copy/leaf case
bgt,pt %ncc, .bcopy_more !
nop
.bcopy_small: ! setup error handler
ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
tst %o5
bz,pt %icc, .sm_do_copy
sethi %hi(.sm_copyerr), %o4
or %o4, %lo(.sm_copyerr), %o4 ! .sm_copyerr is lofault value
membar #Sync ! sync error barrier
stn %o4, [THREAD_REG + T_LOFAULT] ! set t_lofault
or %o5, LOFAULT_SET, %o5 ! Error should trampoline
.sm_do_copy:
mov %o0, %g1 ! save %o0
cmp %o2, SHORTCOPY ! make sure there is enough to align
ble,pt %ncc, .bc_smallest
andcc %o1, 0x7, %o3 ! is dest long aligned
bnz,pn %ncc, .bc_align
andcc %o1, 1, %o3 ! is dest byte aligned
! Destination is long word aligned
.bc_al_src:
andcc %o0, 7, %o3
brnz,pt %o3, .bc_src_dst_unal8
nop
/*
* Special case for handling when src and dest are both long word aligned
 * and total data to move is less than FP_COPY bytes.
 * Also handles the finish-up for large block moves, so the count may be
 * less than 32 bytes.
*/
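/*
 * A hedged C model of the path below (.bc_medl32/.bc_medl8 and the small
 * tail), assuming both pointers are already 8-byte aligned; src, dst and
 * cnt stand in for %o0, %o1 and %o2.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void
 *	medlong_copy(const char *src, char *dst, size_t cnt)
 *	{
 *		while (cnt >= 32) {		// .bc_medl32, 4-way unrolled
 *			*(uint64_t *)dst = *(const uint64_t *)src;
 *			*(uint64_t *)(dst + 8) = *(const uint64_t *)(src + 8);
 *			*(uint64_t *)(dst + 16) = *(const uint64_t *)(src + 16);
 *			*(uint64_t *)(dst + 24) = *(const uint64_t *)(src + 24);
 *			src += 32; dst += 32; cnt -= 32;
 *		}
 *		while (cnt >= 8) {		// .bc_medl8
 *			*(uint64_t *)dst = *(const uint64_t *)src;
 *			src += 8; dst += 8; cnt -= 8;
 *		}
 *		while (cnt > 0) {		// .bc_small4/.bc_small3x
 *			*dst++ = *src++;
 *			cnt--;
 *		}
 *	}
 */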
.bc_medlong:
subcc %o2, 31, %o2 ! adjust length to allow cc test
ble,pt %ncc, .bc_medl31
nop
.bc_medl32:
ldx [%o0], %o4 ! move 32 bytes
subcc %o2, 32, %o2 ! decrement length count by 32
stx %o4, [%o1]
ldx [%o0+8], %o4
stx %o4, [%o1+8]
ldx [%o0+16], %o4
add %o0, 32, %o0 ! increase src ptr by 32
stx %o4, [%o1+16]
ldx [%o0-8], %o4
add %o1, 32, %o1 ! increase dst ptr by 32
bgu,pt %ncc, .bc_medl32 ! repeat if at least 32 bytes left
stx %o4, [%o1-8]
.bc_medl31:
addcc %o2, 24, %o2 ! adjust count to be off by 7
ble,pt %ncc, .bc_medl7 ! skip if 7 or fewer bytes left
nop
.bc_medl8:
ldx [%o0], %o4 ! move 8 bytes
add %o0, 8, %o0 ! increase src ptr by 8
subcc %o2, 8, %o2 ! decrease count by 8
add %o1, 8, %o1 ! increase dst ptr by 8
bgu,pt %ncc, .bc_medl8
stx %o4, [%o1-8]
.bc_medl7:
addcc %o2, 7, %o2 ! finish adjustment of remaining count
bnz,pt %ncc, .bc_small4 ! do final bytes if not finished
.bc_smallx: ! finish up and exit
tst %o5
bz,pt %ncc, .bc_sm_done
andn %o5, COPY_FLAGS, %o5 ! remove flags from lofault address
membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
.bc_sm_done:
retl
mov %g0, %o0
.bc_small4:
cmp %o2, 4
blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left
nop !
ld [%o0], %o4 ! move 4 bytes
add %o0, 4, %o0 ! increase src ptr by 4
add %o1, 4, %o1 ! increase dst ptr by 4
subcc %o2, 4, %o2 ! decrease count by 4
bz,pt %ncc, .bc_smallx
stw %o4, [%o1-4]
.bc_small3x: ! Exactly 1, 2, or 3 bytes remain
subcc %o2, 1, %o2 ! reduce count for cc test
ldub [%o0], %o4 ! load one byte
bz,pt %ncc, .bc_smallx
stb %o4, [%o1] ! store one byte
ldub [%o0+1], %o4 ! load second byte
subcc %o2, 1, %o2
bz,pt %ncc, .bc_smallx
stb %o4, [%o1+1] ! store second byte
ldub [%o0+2], %o4 ! load third byte
ba .bc_smallx
stb %o4, [%o1+2] ! store third byte
.bc_smallest: ! 7 or fewer bytes remain
tst %o2
bz,pt %ncc, .bc_smallx
cmp %o2, 4
blt,pt %ncc, .bc_small3x
nop
ldub [%o0], %o4 ! read byte
subcc %o2, 4, %o2 ! reduce count by 4
stb %o4, [%o1] ! write byte
ldub [%o0+1], %o4 ! repeat for total of 4 bytes
add %o0, 4, %o0 ! advance src by 4
stb %o4, [%o1+1]
ldub [%o0-2], %o4
add %o1, 4, %o1 ! advance dst by 4
stb %o4, [%o1-2]
ldub [%o0-1], %o4
bnz,pt %ncc, .bc_small3x
stb %o4, [%o1-1]
ba .bc_smallx
nop
/*
* Align destination to long word boundary
*/
.bc_align: ! byte align test in prior branch delay
bnz,pt %ncc, .bc_al_d1
.bc_al_d1f: ! dest is now half word aligned
andcc %o1, 2, %o3
bnz,pt %ncc, .bc_al_d2
.bc_al_d2f: ! dest is now word aligned
andcc %o1, 4, %o3 ! is dest longword aligned?
bz,pt %ncc, .bc_al_src
nop
.bc_al_d4: ! dest is word aligned; src is unknown
ldub [%o0], %o4 ! move a word (src align unknown)
ldub [%o0+1], %o3
sll %o4, 24, %o4 ! position
sll %o3, 16, %o3 ! position
or %o4, %o3, %o3 ! merge
ldub [%o0+2], %o4
sll %o4, 8, %o4 ! position
or %o4, %o3, %o3 ! merge
ldub [%o0+3], %o4
or %o4, %o3, %o4 ! merge
stw %o4,[%o1] ! store four bytes
add %o0, 4, %o0 ! adjust src by 4
add %o1, 4, %o1 ! adjust dest by 4
sub %o2, 4, %o2 ! adjust count by 4
andcc %o0, 7, %o3 ! check for src long word alignment
brz,pt %o3, .bc_medlong
.bc_src_dst_unal8:
! dst is 8-byte aligned, src is not
! Size is less than FP_COPY
! Following code is to select for alignment
andcc %o0, 0x3, %o3 ! test word alignment
bz,pt %ncc, .bc_medword
nop
andcc %o0, 0x1, %o3 ! test halfword alignment
bnz,pt %ncc, .bc_med_byte ! go to byte move if not halfword
andcc %o0, 0x2, %o3 ! test which byte alignment
ba .bc_medhalf
nop
.bc_al_d1: ! align dest to half word
ldub [%o0], %o4 ! move a byte
add %o0, 1, %o0
stb %o4, [%o1]
add %o1, 1, %o1
andcc %o1, 2, %o3
bz,pt %ncc, .bc_al_d2f
sub %o2, 1, %o2
.bc_al_d2: ! align dest to word
ldub [%o0], %o4 ! move a half-word (src align unknown)
ldub [%o0+1], %o3
sll %o4, 8, %o4 ! position
or %o4, %o3, %o4 ! merge
sth %o4, [%o1]
add %o0, 2, %o0
add %o1, 2, %o1
andcc %o1, 4, %o3 ! is dest longword aligned?
bz,pt %ncc, .bc_al_src
sub %o2, 2, %o2
ba .bc_al_d4
nop
/*
* Handle all cases where src and dest are aligned on word
* boundaries. Use unrolled loops for better performance.
* This option wins over standard large data move when
 * source and destination are in cache for medium
* to short data moves.
*/
.bc_medword:
subcc %o2, 31, %o2 ! adjust length to allow cc test
ble,pt %ncc, .bc_medw31
nop
.bc_medw32:
ld [%o0], %o4 ! move a block of 32 bytes
stw %o4, [%o1]
ld [%o0+4], %o4
stw %o4, [%o1+4]
ld [%o0+8], %o4
stw %o4, [%o1+8]
ld [%o0+12], %o4
stw %o4, [%o1+12]
ld [%o0+16], %o4
stw %o4, [%o1+16]
ld [%o0+20], %o4
subcc %o2, 32, %o2 ! decrement length count
stw %o4, [%o1+20]
ld [%o0+24], %o4
add %o0, 32, %o0 ! increase src ptr by 32
stw %o4, [%o1+24]
ld [%o0-4], %o4
add %o1, 32, %o1 ! increase dst ptr by 32
bgu,pt %ncc, .bc_medw32 ! repeat if at least 32 bytes left
stw %o4, [%o1-4]
.bc_medw31:
addcc %o2, 24, %o2 ! adjust count to be off by 7
ble,pt %ncc, .bc_medw7 ! skip if 7 or fewer bytes left
nop !
.bc_medw15:
ld [%o0], %o4 ! move a block of 8 bytes
subcc %o2, 8, %o2 ! decrement length count
stw %o4, [%o1]
add %o0, 8, %o0 ! increase src ptr by 8
ld [%o0-4], %o4
add %o1, 8, %o1 ! increase dst ptr by 8
bgu,pt %ncc, .bc_medw15
stw %o4, [%o1-4]
.bc_medw7:
addcc %o2, 7, %o2 ! finish adjustment of remaining count
bz,pt %ncc, .bc_smallx ! exit if finished
cmp %o2, 4
blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left
nop !
ld [%o0], %o4 ! move 4 bytes
add %o0, 4, %o0 ! increase src ptr by 4
add %o1, 4, %o1 ! increase dst ptr by 4
subcc %o2, 4, %o2 ! decrease count by 4
bnz .bc_small3x
stw %o4, [%o1-4]
ba .bc_smallx
nop
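/*
 * The .bc_medhalf and .bc_med_byte paths below read the misaligned source
 * in pieces and pack them into whole doublewords so that every store is
 * aligned.  A hedged C sketch of one 8-byte step of the halfword-aligned
 * case (as in .bc_medh15), relying on SPARC's big-endian byte order:
 *
 *	#include <stdint.h>
 *
 *	// src is 2-byte aligned; the result is stored with one 8-byte stx
 *	static uint64_t
 *	pack_half_aligned(const char *src)
 *	{
 *		uint64_t w;
 *
 *		w  = (uint64_t)*(const uint16_t *)src << 48;
 *		w |= (uint64_t)*(const uint32_t *)(src + 2) << 16;
 *		w |= (uint64_t)*(const uint16_t *)(src + 6);
 *		return (w);
 *	}
 */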
.bc_medhalf:
subcc %o2, 31, %o2 ! adjust length to allow cc test
ble,pt %ncc, .bc_medh31
nop
.bc_medh32: ! load and store block of 32 bytes
subcc %o2, 32, %o2 ! decrement length count
lduh [%o0], %o4 ! move 32 bytes
lduw [%o0+2], %o3
sllx %o4, 48, %o4
sllx %o3, 16, %o3
or %o4, %o3, %o3
lduh [%o0+6], %o4
or %o4, %o3, %o4
stx %o4, [%o1]
lduh [%o0+8], %o4
lduw [%o0+10], %o3
sllx %o4, 48, %o4
sllx %o3, 16, %o3
or %o4, %o3, %o3
lduh [%o0+14], %o4
or %o4, %o3, %o4
stx %o4, [%o1+8]
lduh [%o0+16], %o4
lduw [%o0+18], %o3
sllx %o4, 48, %o4
sllx %o3, 16, %o3
or %o4, %o3, %o3
lduh [%o0+22], %o4
or %o4, %o3, %o4
stx %o4, [%o1+16]
add %o0, 32, %o0 ! increase src ptr by 32
add %o1, 32, %o1 ! increase dst ptr by 32
lduh [%o0-8], %o4
lduw [%o0-6], %o3
sllx %o4, 48, %o4
sllx %o3, 16, %o3
or %o4, %o3, %o3
lduh [%o0-2], %o4
or %o3, %o4, %o4
bgu,pt %ncc, .bc_medh32 ! repeat if at least 32 bytes left
stx %o4, [%o1-8]
.bc_medh31:
addcc %o2, 24, %o2 ! adjust count to be off by 7
ble,pt %ncc, .bc_medh7 ! skip if 7 or fewer bytes left
nop !
.bc_medh15:
	lduh	[%o0], %o4	! move 8 bytes
subcc %o2, 8, %o2 ! decrement length count
lduw [%o0+2], %o3
sllx %o4, 48, %o4
sllx %o3, 16, %o3
or %o4, %o3, %o3
add %o1, 8, %o1 ! increase dst ptr by 8
lduh [%o0+6], %o4
add %o0, 8, %o0 ! increase src ptr by 8
or %o4, %o3, %o4
bgu,pt %ncc, .bc_medh15
stx %o4, [%o1-8]
.bc_medh7:
addcc %o2, 7, %o2 ! finish adjustment of remaining count
bz,pt %ncc, .bc_smallx ! exit if finished
cmp %o2, 4
blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left
nop !
lduh [%o0], %o4
sll %o4, 16, %o4
lduh [%o0+2], %o3
or %o3, %o4, %o4
subcc %o2, 4, %o2
add %o0, 4, %o0
add %o1, 4, %o1
bnz .bc_small3x
stw %o4, [%o1-4]
ba .bc_smallx
nop
.align 16
.bc_med_byte:
bnz,pt %ncc, .bc_medbh32a ! go to correct byte move
subcc %o2, 31, %o2 ! adjust length to allow cc test
ble,pt %ncc, .bc_medb31
nop
.bc_medb32: ! Alignment 1 or 5
subcc %o2, 32, %o2 ! decrement length count
ldub [%o0], %o4 ! load and store a block of 32 bytes
sllx %o4, 56, %o3
lduh [%o0+1], %o4
sllx %o4, 40, %o4
or %o4, %o3, %o3
lduw [%o0+3], %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0+7], %o4
or %o4, %o3, %o4
stx %o4, [%o1]
ldub [%o0+8], %o4
sllx %o4, 56, %o3
lduh [%o0+9], %o4
sllx %o4, 40, %o4
or %o4, %o3, %o3
lduw [%o0+11], %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0+15], %o4
or %o4, %o3, %o4
stx %o4, [%o1+8]
ldub [%o0+16], %o4
sllx %o4, 56, %o3
lduh [%o0+17], %o4
sllx %o4, 40, %o4
or %o4, %o3, %o3
lduw [%o0+19], %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0+23], %o4
or %o4, %o3, %o4
stx %o4, [%o1+16]
add %o0, 32, %o0 ! increase src ptr by 32
add %o1, 32, %o1 ! increase dst ptr by 32
ldub [%o0-8], %o4
sllx %o4, 56, %o3
lduh [%o0-7], %o4
sllx %o4, 40, %o4
or %o4, %o3, %o3
lduw [%o0-5], %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0-1], %o4
or %o4, %o3, %o4
bgu,pt %ncc, .bc_medb32 ! repeat if at least 32 bytes left
stx %o4, [%o1-8]
.bc_medb31: ! 31 or fewer bytes remaining
addcc %o2, 24, %o2 ! adjust count to be off by 7
ble,pt %ncc, .bc_medb7 ! skip if 7 or fewer bytes left
nop !
.bc_medb15:
ldub [%o0], %o4 ! load and store a block of 8 bytes
subcc %o2, 8, %o2 ! decrement length count
sllx %o4, 56, %o3
lduh [%o0+1], %o4
sllx %o4, 40, %o4
or %o4, %o3, %o3
lduw [%o0+3], %o4
	add	%o1, 8, %o1	! increase dst ptr by 8
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0+7], %o4
	add	%o0, 8, %o0	! increase src ptr by 8
or %o4, %o3, %o4
bgu,pt %ncc, .bc_medb15
stx %o4, [%o1-8]
.bc_medb7:
addcc %o2, 7, %o2 ! finish adjustment of remaining count
bz,pt %ncc, .bc_smallx ! exit if finished
cmp %o2, 4
blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left
nop !
ldub [%o0], %o4 ! move 4 bytes
sll %o4, 24, %o3
lduh [%o0+1], %o4
sll %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0+3], %o4
or %o4, %o3, %o4
subcc %o2, 4, %o2
add %o0, 4, %o0
add %o1, 4, %o1
bnz .bc_small3x
stw %o4, [%o1-4]
ba .bc_smallx
nop
.align 16
.bc_medbh32a: ! Alignment 3 or 7
ble,pt %ncc, .bc_medbh31
nop
.bc_medbh32: ! Alignment 3 or 7
subcc %o2, 32, %o2 ! decrement length count
ldub [%o0], %o4 ! load and store a block of 32 bytes
sllx %o4, 56, %o3
lduw [%o0+1], %o4
sllx %o4, 24, %o4
or %o4, %o3, %o3
lduh [%o0+5], %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0+7], %o4
or %o4, %o3, %o4
stx %o4, [%o1]
ldub [%o0+8], %o4
sllx %o4, 56, %o3
lduw [%o0+9], %o4
sllx %o4, 24, %o4
or %o4, %o3, %o3
lduh [%o0+13], %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0+15], %o4
or %o4, %o3, %o4
stx %o4, [%o1+8]
ldub [%o0+16], %o4
sllx %o4, 56, %o3
lduw [%o0+17], %o4
sllx %o4, 24, %o4
or %o4, %o3, %o3
lduh [%o0+21], %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0+23], %o4
or %o4, %o3, %o4
stx %o4, [%o1+16]
add %o0, 32, %o0 ! increase src ptr by 32
add %o1, 32, %o1 ! increase dst ptr by 32
ldub [%o0-8], %o4
sllx %o4, 56, %o3
lduw [%o0-7], %o4
sllx %o4, 24, %o4
or %o4, %o3, %o3
lduh [%o0-3], %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0-1], %o4
or %o4, %o3, %o4
bgu,pt %ncc, .bc_medbh32 ! repeat if at least 32 bytes left
stx %o4, [%o1-8]
.bc_medbh31:
addcc %o2, 24, %o2 ! adjust count to be off by 7
ble,pt %ncc, .bc_medb7 ! skip if 7 or fewer bytes left
nop !
.bc_medbh15:
ldub [%o0], %o4 ! load and store a block of 8 bytes
sllx %o4, 56, %o3
lduw [%o0+1], %o4
sllx %o4, 24, %o4
or %o4, %o3, %o3
lduh [%o0+5], %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0+7], %o4
or %o4, %o3, %o4
stx %o4, [%o1]
subcc %o2, 8, %o2 ! decrement length count
add %o1, 8, %o1 ! increase dst ptr by 8
add %o0, 8, %o0 ! increase src ptr by 8
bgu,pt %ncc, .bc_medbh15
stx %o4, [%o1-8]
ba .bc_medb7
nop
SET_SIZE(bcopy)
/*
* The _more entry points are not intended to be used directly by
* any caller from outside this file. They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
* the floating point registers.
*/
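/*
 * The FP path below also brackets the copy with kernel preemption control
 * for threads that have no lwp (t_lwp == NULL), since live FP state must
 * not be lost to a preemption while the copy runs.  A hedged C sketch of
 * the intent (the real work is in .do_copy, .bc_ex2 and .copyerr):
 *
 *	if (curthread->t_lwp == NULL)
 *		curthread->t_preempt++;		// kpreempt_disable() effect
 *
 *	// ... FP/VIS block copy ...
 *
 *	if (curthread->t_lwp == NULL &&
 *	    --curthread->t_preempt == 0 && CPU->cpu_kprunrun)
 *		kpreempt(pil);	// "pil" stands for the current %pil value
 */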
ENTRY(bcopy_more)
.bcopy_more:
save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
brz,pt %o5, .do_copy
nop
sethi %hi(.copyerr), %l7 ! copyerr is lofault value
or %l7, %lo(.copyerr), %l7
membar #Sync ! sync error barrier
stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
! We've already captured whether t_lofault was zero on entry.
! We need to mark ourselves as being from bcopy since both
! kcopy and bcopy use the same code path. If LOFAULT_SET is
! set and the saved lofault was zero, we won't reset lofault on
! returning.
or %o5, LOFAULT_SET, %o5
.do_copy:
ldn [THREAD_REG + T_LWP], %o3
brnz,pt %o3, 1f
nop
/*
* kpreempt_disable();
*/
ldsb [THREAD_REG +T_PREEMPT], %o3
inc %o3
stb %o3, [THREAD_REG + T_PREEMPT]
1:
/*
 * The following code is for large copies. We know there are at
 * least FP_COPY bytes available. FP regs are used, so
 * we save registers and fp regs before starting.
*/
rd %fprs, %g5 ! check for unused fp
or %o5,FPUSED_FLAG,%o5
! if fprs.fef == 0, set it.
! Setting it when already set costs more than checking
andcc %g5, FPRS_FEF, %g5 ! test FEF, fprs.du = fprs.dl = 0
bz,pt %ncc, .bc_fp_unused
prefetch [%i0 + (1 * CACHE_LINE)], #one_read
BST_FP_TOSTACK(%o3)
ba .bc_fp_ready
.bc_fp_unused:
andcc %i1, 1, %o3 ! is dest byte aligned
wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
.bc_fp_ready:
rd %gsr, %l5 ! save %gsr value
bnz,pt %ncc, .bc_big_d1
.bc_big_d1f: ! dest is now half word aligned
andcc %i1, 2, %o3
bnz,pt %ncc, .bc_big_d2
.bc_big_d2f: ! dest is now word aligned
andcc %i1, 4, %o3
bnz,pt %ncc, .bc_big_d4
.bc_big_d4f: ! dest is now long word aligned
andcc %i0, 7, %o3 ! is src long word aligned
brnz,pt %o3, .bc_big_unal8
prefetch [%i0 + (2 * CACHE_LINE)], #one_read
! Src and dst are long word aligned
! align dst to 64 byte boundary
andcc %i1, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned
brz,pn %o3, .bc_al_to_64
nop
sub %o3, 64, %o3 ! %o3 has negative bytes to move
add %i2, %o3, %i2 ! adjust remaining count
andcc %o3, 8, %o4 ! odd long words to move?
brz,pt %o4, .bc_al_to_16
nop
add %o3, 8, %o3
ldx [%i0], %o4
add %i0, 8, %i0 ! increment src ptr
add %i1, 8, %i1 ! increment dst ptr
stx %o4, [%i1-8]
! Dest is aligned on 16 bytes, src 8 byte aligned
.bc_al_to_16:
andcc %o3, 0x30, %o4 ! pair of long words to move?
brz,pt %o4, .bc_al_to_64
nop
.bc_al_mv_16:
add %o3, 16, %o3
ldx [%i0], %o4
stx %o4, [%i1]
ldx [%i0+8], %o4
add %i0, 16, %i0 ! increment src ptr
stx %o4, [%i1+8]
andcc %o3, 48, %o4
brnz,pt %o4, .bc_al_mv_16
add %i1, 16, %i1 ! increment dst ptr
! Dest is aligned on 64 bytes, src 8 byte aligned
.bc_al_to_64:
! Determine source alignment
! to correct 8 byte offset
andcc %i0, 32, %o3
brnz,pn %o3, .bc_aln_1
andcc %i0, 16, %o3
brnz,pn %o3, .bc_aln_01
andcc %i0, 8, %o3
brz,pn %o3, .bc_aln_000
prefetch [%i0 + (3 * CACHE_LINE)], #one_read
ba .bc_aln_001
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.bc_aln_01:
brnz,pn %o3, .bc_aln_011
prefetch [%i0 + (3 * CACHE_LINE)], #one_read
ba .bc_aln_010
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.bc_aln_1:
andcc %i0, 16, %o3
brnz,pn %o3, .bc_aln_11
andcc %i0, 8, %o3
brnz,pn %o3, .bc_aln_101
prefetch [%i0 + (3 * CACHE_LINE)], #one_read
ba .bc_aln_100
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.bc_aln_11:
brz,pn %o3, .bc_aln_110
prefetch [%i0 + (3 * CACHE_LINE)], #one_read
.bc_aln_111:
! Alignment off by 8 bytes
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
ldd [%i0], %d0
add %i0, 8, %i0
sub %i2, 8, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.bc_aln_111_loop:
ldda [%i0]ASI_BLK_P,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d2
fmovd %d18, %d4
fmovd %d20, %d6
fmovd %d22, %d8
fmovd %d24, %d10
fmovd %d26, %d12
fmovd %d28, %d14
stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_P
add %i0, 64, %i0
fmovd %d30, %d0
bgt,pt %ncc, .bc_aln_111_loop
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
add %i1, %i0, %i1
std %d0, [%i1]
ba .bc_remain_stuff
add %i1, 8, %i1
! END OF aln_111
.bc_aln_110:
! Alignment off by 16 bytes
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
ldd [%i0], %d0
ldd [%i0+8], %d2
add %i0, 16, %i0
sub %i2, 16, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.bc_aln_110_loop:
ldda [%i0]ASI_BLK_P,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d4
fmovd %d18, %d6
fmovd %d20, %d8
fmovd %d22, %d10
fmovd %d24, %d12
fmovd %d26, %d14
stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_P
add %i0, 64, %i0
fmovd %d28, %d0
fmovd %d30, %d2
bgt,pt %ncc, .bc_aln_110_loop
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
add %i1, %i0, %i1
std %d0, [%i1]
std %d2, [%i1+8]
ba .bc_remain_stuff
add %i1, 16, %i1
! END OF aln_110
.bc_aln_101:
! Alignment off by 24 bytes
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
ldd [%i0], %d0
ldd [%i0+8], %d2
ldd [%i0+16], %d4
add %i0, 24, %i0
sub %i2, 24, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.bc_aln_101_loop:
ldda [%i0]ASI_BLK_P,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d6
fmovd %d18, %d8
fmovd %d20, %d10
fmovd %d22, %d12
fmovd %d24, %d14
stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_P
add %i0, 64, %i0
fmovd %d26, %d0
fmovd %d28, %d2
fmovd %d30, %d4
bgt,pt %ncc, .bc_aln_101_loop
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
add %i1, %i0, %i1
std %d0, [%i1]
std %d2, [%i1+8]
std %d4, [%i1+16]
ba .bc_remain_stuff
add %i1, 24, %i1
! END OF aln_101
.bc_aln_100:
! Alignment off by 32 bytes
ldd [%i0], %d0
ldd [%i0+8], %d2
ldd [%i0+16],%d4
ldd [%i0+24],%d6
add %i0, 32, %i0
sub %i2, 32, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.bc_aln_100_loop:
ldda [%i0]ASI_BLK_P,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d8
fmovd %d18, %d10
fmovd %d20, %d12
fmovd %d22, %d14
stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_P
add %i0, 64, %i0
fmovd %d24, %d0
fmovd %d26, %d2
fmovd %d28, %d4
fmovd %d30, %d6
bgt,pt %ncc, .bc_aln_100_loop
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
add %i1, %i0, %i1
std %d0, [%i1]
std %d2, [%i1+8]
std %d4, [%i1+16]
std %d6, [%i1+24]
ba .bc_remain_stuff
add %i1, 32, %i1
! END OF aln_100
.bc_aln_011:
! Alignment off by 40 bytes
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
ldd [%i0], %d0
ldd [%i0+8], %d2
ldd [%i0+16], %d4
ldd [%i0+24], %d6
ldd [%i0+32], %d8
add %i0, 40, %i0
sub %i2, 40, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.bc_aln_011_loop:
ldda [%i0]ASI_BLK_P,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d10
fmovd %d18, %d12
fmovd %d20, %d14
stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_P
add %i0, 64, %i0
fmovd %d22, %d0
fmovd %d24, %d2
fmovd %d26, %d4
fmovd %d28, %d6
fmovd %d30, %d8
bgt,pt %ncc, .bc_aln_011_loop
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
add %i1, %i0, %i1
std %d0, [%i1]
std %d2, [%i1+8]
std %d4, [%i1+16]
std %d6, [%i1+24]
std %d8, [%i1+32]
ba .bc_remain_stuff
add %i1, 40, %i1
! END OF aln_011
.bc_aln_010:
! Alignment off by 48 bytes
ldd [%i0], %d0
ldd [%i0+8], %d2
ldd [%i0+16], %d4
ldd [%i0+24], %d6
ldd [%i0+32], %d8
ldd [%i0+40], %d10
add %i0, 48, %i0
sub %i2, 48, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.bc_aln_010_loop:
ldda [%i0]ASI_BLK_P,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d12
fmovd %d18, %d14
stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_P
add %i0, 64, %i0
fmovd %d20, %d0
fmovd %d22, %d2
fmovd %d24, %d4
fmovd %d26, %d6
fmovd %d28, %d8
fmovd %d30, %d10
bgt,pt %ncc, .bc_aln_010_loop
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
add %i1, %i0, %i1
std %d0, [%i1]
std %d2, [%i1+8]
std %d4, [%i1+16]
std %d6, [%i1+24]
std %d8, [%i1+32]
std %d10, [%i1+40]
ba .bc_remain_stuff
add %i1, 48, %i1
! END OF aln_010
.bc_aln_001:
! Alignment off by 56 bytes
ldd [%i0], %d0
ldd [%i0+8], %d2
ldd [%i0+16], %d4
ldd [%i0+24], %d6
ldd [%i0+32], %d8
ldd [%i0+40], %d10
ldd [%i0+48], %d12
add %i0, 56, %i0
sub %i2, 56, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.bc_aln_001_loop:
ldda [%i0]ASI_BLK_P,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d14
stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_P
add %i0, 64, %i0
fmovd %d18, %d0
fmovd %d20, %d2
fmovd %d22, %d4
fmovd %d24, %d6
fmovd %d26, %d8
fmovd %d28, %d10
fmovd %d30, %d12
bgt,pt %ncc, .bc_aln_001_loop
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
add %i1, %i0, %i1
std %d0, [%i1]
std %d2, [%i1+8]
std %d4, [%i1+16]
std %d6, [%i1+24]
std %d8, [%i1+32]
std %d10, [%i1+40]
std %d12, [%i1+48]
ba .bc_remain_stuff
add %i1, 56, %i1
! END OF aln_001
.bc_aln_000:
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.bc_aln_000_loop:
ldda [%i0]ASI_BLK_P,%d0
subcc %o3, 64, %o3
stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_P
add %i0, 64, %i0
bgt,pt %ncc, .bc_aln_000_loop
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
add %i1, %i0, %i1
! END OF aln_000
.bc_remain_stuff:
subcc %i2, 31, %i2 ! adjust length to allow cc test
ble,pt %ncc, .bc_aln_31
nop
.bc_aln_32:
ldx [%i0], %o4 ! move 32 bytes
subcc %i2, 32, %i2 ! decrement length count by 32
stx %o4, [%i1]
ldx [%i0+8], %o4
stx %o4, [%i1+8]
ldx [%i0+16], %o4
add %i0, 32, %i0 ! increase src ptr by 32
stx %o4, [%i1+16]
ldx [%i0-8], %o4
add %i1, 32, %i1 ! increase dst ptr by 32
bgu,pt %ncc, .bc_aln_32 ! repeat if at least 32 bytes left
stx %o4, [%i1-8]
.bc_aln_31:
addcc %i2, 24, %i2 ! adjust count to be off by 7
ble,pt %ncc, .bc_aln_7 ! skip if 7 or fewer bytes left
nop !
.bc_aln_15:
ldx [%i0], %o4 ! move 8 bytes
add %i0, 8, %i0 ! increase src ptr by 8
subcc %i2, 8, %i2 ! decrease count by 8
add %i1, 8, %i1 ! increase dst ptr by 8
bgu,pt %ncc, .bc_aln_15
stx %o4, [%i1-8] !
.bc_aln_7:
addcc %i2, 7, %i2 ! finish adjustment of remaining count
bz,pt %ncc, .bc_exit ! exit if finished
cmp %i2, 4
blt,pt %ncc, .bc_unaln3x ! skip if less than 4 bytes left
nop !
ld [%i0], %o4 ! move 4 bytes
add %i0, 4, %i0 ! increase src ptr by 4
add %i1, 4, %i1 ! increase dst ptr by 4
subcc %i2, 4, %i2 ! decrease count by 4
bnz .bc_unaln3x
stw %o4, [%i1-4]
ba .bc_exit
nop
! destination alignment code
.bc_big_d1:
ldub [%i0], %o4 ! move a byte
add %i0, 1, %i0
stb %o4, [%i1]
add %i1, 1, %i1
andcc %i1, 2, %o3
bz,pt %ncc, .bc_big_d2f
sub %i2, 1, %i2
.bc_big_d2:
ldub [%i0], %o4 ! move a half-word (src align unknown)
ldub [%i0+1], %o3
add %i0, 2, %i0
sll %o4, 8, %o4 ! position
or %o4, %o3, %o4 ! merge
sth %o4, [%i1]
add %i1, 2, %i1
andcc %i1, 4, %o3
bz,pt %ncc, .bc_big_d4f
sub %i2, 2, %i2
.bc_big_d4:
ldub [%i0], %o4 ! move a word (src align unknown)
ldub [%i0+1], %o3
sll %o4, 24, %o4 ! position
sll %o3, 16, %o3 ! position
or %o4, %o3, %o3 ! merge
ldub [%i0+2], %o4
sll %o4, 8, %o4 ! position
or %o4, %o3, %o3 ! merge
ldub [%i0+3], %o4
or %o4, %o3, %o4 ! merge
stw %o4,[%i1] ! store four bytes
add %i0, 4, %i0 ! adjust src by 4
add %i1, 4, %i1 ! adjust dest by 4
ba .bc_big_d4f
sub %i2, 4, %i2 ! adjust count by 4
! Dst is on 8 byte boundary; src is not;
.bc_big_unal8:
andcc %i1, 0x3f, %o3 ! is dst 64-byte block aligned?
bz %ncc, .bc_unalnsrc
sub %o3, 64, %o3 ! %o3 will be multiple of 8
neg %o3 ! bytes until dest is 64 byte aligned
sub %i2, %o3, %i2 ! update cnt with bytes to be moved
! Move bytes according to source alignment
andcc %i0, 0x1, %o4
bnz %ncc, .bc_unalnbyte ! check for byte alignment
nop
andcc %i0, 2, %o4 ! check for half word alignment
bnz %ncc, .bc_unalnhalf
nop
! Src is word aligned, move bytes until dest 64 byte aligned
.bc_unalnword:
ld [%i0], %o4 ! load 4 bytes
stw %o4, [%i1] ! and store 4 bytes
ld [%i0+4], %o4 ! load 4 bytes
add %i0, 8, %i0 ! increase src ptr by 8
stw %o4, [%i1+4] ! and store 4 bytes
subcc %o3, 8, %o3 ! decrease count by 8
bnz %ncc, .bc_unalnword
add %i1, 8, %i1 ! increase dst ptr by 8
ba .bc_unalnsrc
nop
! Src is half-word aligned, move bytes until dest 64 byte aligned
.bc_unalnhalf:
lduh [%i0], %o4 ! load 2 bytes
sllx %o4, 32, %i3 ! shift left
lduw [%i0+2], %o4
or %o4, %i3, %i3
sllx %i3, 16, %i3
lduh [%i0+6], %o4
or %o4, %i3, %i3
stx %i3, [%i1]
add %i0, 8, %i0
subcc %o3, 8, %o3
bnz %ncc, .bc_unalnhalf
add %i1, 8, %i1
ba .bc_unalnsrc
nop
! Src is Byte aligned, move bytes until dest 64 byte aligned
.bc_unalnbyte:
sub %i1, %i0, %i1 ! share pointer advance
.bc_unalnbyte_loop:
ldub [%i0], %o4
sllx %o4, 56, %i3
lduh [%i0+1], %o4
sllx %o4, 40, %o4
or %o4, %i3, %i3
lduh [%i0+3], %o4
sllx %o4, 24, %o4
or %o4, %i3, %i3
lduh [%i0+5], %o4
sllx %o4, 8, %o4
or %o4, %i3, %i3
ldub [%i0+7], %o4
or %o4, %i3, %i3
stx %i3, [%i1+%i0]
subcc %o3, 8, %o3
bnz %ncc, .bc_unalnbyte_loop
add %i0, 8, %i0
add %i1,%i0, %i1 ! restore pointer
! Destination is now block (64 byte aligned), src is not 8 byte aligned
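/*
 * The loops reached from .bc_unalnsrc use alignaddr/faligndata: the %gsr
 * alignment is set from the low source address bits, the source is then
 * read in aligned 64-byte blocks, and each output doubleword is assembled
 * from two adjacent input doublewords.  A hedged C model of one
 * faligndata step on big-endian data (off is the byte offset, 0-7):
 *
 *	#include <stdint.h>
 *
 *	static uint64_t
 *	falign(uint64_t prev, uint64_t next, unsigned off)
 *	{
 *		unsigned s = off * 8;
 *
 *		return (s == 0 ? prev : (prev << s) | (next >> (64 - s)));
 *	}
 */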
.bc_unalnsrc:
andn %i2, 0x3f, %i3 ! %i3 is multiple of block size
and %i2, 0x3f, %i2 ! residue bytes in %i2
	add	%i2, 64, %i2	! Ensure we don't load beyond
sub %i3, 64, %i3 ! end of source buffer
andn %i0, 0x3f, %o4 ! %o4 has block aligned src address
prefetch [%o4 + (3 * CACHE_LINE)], #one_read
alignaddr %i0, %g0, %g0 ! generate %gsr
add %i0, %i3, %i0 ! advance %i0 to after blocks
!
! Determine source alignment to correct 8 byte offset
andcc %i0, 0x20, %o3
brnz,pn %o3, .bc_unaln_1
andcc %i0, 0x10, %o3
brnz,pn %o3, .bc_unaln_01
andcc %i0, 0x08, %o3
brz,a %o3, .bc_unaln_000
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .bc_unaln_001
nop
.bc_unaln_01:
brnz,a %o3, .bc_unaln_011
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .bc_unaln_010
nop
.bc_unaln_1:
brnz,pn %o3, .bc_unaln_11
andcc %i0, 0x08, %o3
brnz,a %o3, .bc_unaln_101
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .bc_unaln_100
nop
.bc_unaln_11:
brz,pn %o3, .bc_unaln_110
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.bc_unaln_111:
ldd [%o4+56], %d14
.bc_unaln_111_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d14, %d16, %d48
faligndata %d16, %d18, %d50
faligndata %d18, %d20, %d52
faligndata %d20, %d22, %d54
faligndata %d22, %d24, %d56
faligndata %d24, %d26, %d58
faligndata %d26, %d28, %d60
faligndata %d28, %d30, %d62
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_P
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .bc_unaln_111_loop
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .bc_unaln_done
nop
.bc_unaln_110:
ldd [%o4+48], %d12
ldd [%o4+56], %d14
.bc_unaln_110_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d12, %d14, %d48
faligndata %d14, %d16, %d50
faligndata %d16, %d18, %d52
faligndata %d18, %d20, %d54
faligndata %d20, %d22, %d56
faligndata %d22, %d24, %d58
faligndata %d24, %d26, %d60
faligndata %d26, %d28, %d62
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_P
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .bc_unaln_110_loop
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .bc_unaln_done
nop
.bc_unaln_101:
ldd [%o4+40], %d10
ldd [%o4+48], %d12
ldd [%o4+56], %d14
.bc_unaln_101_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d10, %d12, %d48
faligndata %d12, %d14, %d50
faligndata %d14, %d16, %d52
faligndata %d16, %d18, %d54
faligndata %d18, %d20, %d56
faligndata %d20, %d22, %d58
faligndata %d22, %d24, %d60
faligndata %d24, %d26, %d62
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_P
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .bc_unaln_101_loop
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .bc_unaln_done
nop
.bc_unaln_100:
ldd [%o4+32], %d8
ldd [%o4+40], %d10
ldd [%o4+48], %d12
ldd [%o4+56], %d14
.bc_unaln_100_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d8, %d10, %d48
faligndata %d10, %d12, %d50
faligndata %d12, %d14, %d52
faligndata %d14, %d16, %d54
faligndata %d16, %d18, %d56
faligndata %d18, %d20, %d58
faligndata %d20, %d22, %d60
faligndata %d22, %d24, %d62
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_P
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .bc_unaln_100_loop
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .bc_unaln_done
nop
.bc_unaln_011:
ldd [%o4+24], %d6
ldd [%o4+32], %d8
ldd [%o4+40], %d10
ldd [%o4+48], %d12
ldd [%o4+56], %d14
.bc_unaln_011_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d6, %d8, %d48
faligndata %d8, %d10, %d50
faligndata %d10, %d12, %d52
faligndata %d12, %d14, %d54
faligndata %d14, %d16, %d56
faligndata %d16, %d18, %d58
faligndata %d18, %d20, %d60
faligndata %d20, %d22, %d62
fmovd %d22, %d6
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_P
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .bc_unaln_011_loop
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .bc_unaln_done
nop
.bc_unaln_010:
ldd [%o4+16], %d4
ldd [%o4+24], %d6
ldd [%o4+32], %d8
ldd [%o4+40], %d10
ldd [%o4+48], %d12
ldd [%o4+56], %d14
.bc_unaln_010_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d4, %d6, %d48
faligndata %d6, %d8, %d50
faligndata %d8, %d10, %d52
faligndata %d10, %d12, %d54
faligndata %d12, %d14, %d56
faligndata %d14, %d16, %d58
faligndata %d16, %d18, %d60
faligndata %d18, %d20, %d62
fmovd %d20, %d4
fmovd %d22, %d6
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_P
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .bc_unaln_010_loop
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .bc_unaln_done
nop
.bc_unaln_001:
ldd [%o4+8], %d2
ldd [%o4+16], %d4
ldd [%o4+24], %d6
ldd [%o4+32], %d8
ldd [%o4+40], %d10
ldd [%o4+48], %d12
ldd [%o4+56], %d14
.bc_unaln_001_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d2, %d4, %d48
faligndata %d4, %d6, %d50
faligndata %d6, %d8, %d52
faligndata %d8, %d10, %d54
faligndata %d10, %d12, %d56
faligndata %d12, %d14, %d58
faligndata %d14, %d16, %d60
faligndata %d16, %d18, %d62
fmovd %d18, %d2
fmovd %d20, %d4
fmovd %d22, %d6
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_P
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .bc_unaln_001_loop
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .bc_unaln_done
nop
.bc_unaln_000:
ldda [%o4]ASI_BLK_P, %d0
.bc_unaln_000_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d0, %d2, %d48
faligndata %d2, %d4, %d50
faligndata %d4, %d6, %d52
faligndata %d6, %d8, %d54
faligndata %d8, %d10, %d56
faligndata %d10, %d12, %d58
faligndata %d12, %d14, %d60
faligndata %d14, %d16, %d62
fmovd %d16, %d0
fmovd %d18, %d2
fmovd %d20, %d4
fmovd %d22, %d6
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_P
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .bc_unaln_000_loop
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
.bc_unaln_done:
! Handle trailing bytes, 64 to 127
! Dest long word aligned, Src not long word aligned
cmp %i2, 15
bleu %ncc, .bc_unaln_short
andn %i2, 0x7, %i3 ! %i3 is multiple of 8
and %i2, 0x7, %i2 ! residue bytes in %i2
add %i2, 8, %i2
	sub	%i3, 8, %i3	! ensure we don't load past end of src
andn %i0, 0x7, %o4 ! %o4 has long word aligned src address
add %i0, %i3, %i0 ! advance %i0 to after multiple of 8
ldd [%o4], %d0 ! fetch partial word
.bc_unaln_by8:
ldd [%o4+8], %d2
add %o4, 8, %o4
faligndata %d0, %d2, %d16
subcc %i3, 8, %i3
std %d16, [%i1]
fmovd %d2, %d0
bgu,pt %ncc, .bc_unaln_by8
add %i1, 8, %i1
.bc_unaln_short:
cmp %i2, 8
blt,pt %ncc, .bc_unalnfin
nop
ldub [%i0], %o4
sll %o4, 24, %o3
ldub [%i0+1], %o4
sll %o4, 16, %o4
or %o4, %o3, %o3
ldub [%i0+2], %o4
sll %o4, 8, %o4
or %o4, %o3, %o3
ldub [%i0+3], %o4
or %o4, %o3, %o3
stw %o3, [%i1]
ldub [%i0+4], %o4
sll %o4, 24, %o3
ldub [%i0+5], %o4
sll %o4, 16, %o4
or %o4, %o3, %o3
ldub [%i0+6], %o4
sll %o4, 8, %o4
or %o4, %o3, %o3
ldub [%i0+7], %o4
or %o4, %o3, %o3
stw %o3, [%i1+4]
add %i0, 8, %i0
add %i1, 8, %i1
sub %i2, 8, %i2
.bc_unalnfin:
cmp %i2, 4
blt,pt %ncc, .bc_unalnz
tst %i2
ldub [%i0], %o3 ! read byte
subcc %i2, 4, %i2 ! reduce count by 4
sll %o3, 24, %o3 ! position
ldub [%i0+1], %o4
sll %o4, 16, %o4 ! position
or %o4, %o3, %o3 ! merge
ldub [%i0+2], %o4
sll %o4, 8, %o4 ! position
or %o4, %o3, %o3 ! merge
add %i1, 4, %i1 ! advance dst by 4
ldub [%i0+3], %o4
add %i0, 4, %i0 ! advance src by 4
or %o4, %o3, %o4 ! merge
bnz,pt %ncc, .bc_unaln3x
stw %o4, [%i1-4]
ba .bc_exit
nop
.bc_unalnz:
bz,pt %ncc, .bc_exit
.bc_unaln3x: ! Exactly 1, 2, or 3 bytes remain
subcc %i2, 1, %i2 ! reduce count for cc test
ldub [%i0], %o4 ! load one byte
bz,pt %ncc, .bc_exit
stb %o4, [%i1] ! store one byte
ldub [%i0+1], %o4 ! load second byte
subcc %i2, 1, %i2
bz,pt %ncc, .bc_exit
stb %o4, [%i1+1] ! store second byte
ldub [%i0+2], %o4 ! load third byte
stb %o4, [%i1+2] ! store third byte
.bc_exit:
wr %l5, %g0, %gsr ! restore %gsr
brnz %g5, .bc_fp_restore
and %o5, COPY_FLAGS, %l1 ! save flags in %l1
FZERO
wr %g5, %g0, %fprs
ba,pt %ncc, .bc_ex2
nop
.bc_fp_restore:
BLD_FP_FROMSTACK(%o4)
.bc_ex2:
ldn [THREAD_REG + T_LWP], %o2
brnz,pt %o2, 1f
nop
ldsb [THREAD_REG + T_PREEMPT], %l0
deccc %l0
bnz,pn %ncc, 1f
stb %l0, [THREAD_REG + T_PREEMPT]
! Check for a kernel preemption request
ldn [THREAD_REG + T_CPU], %l0
ldub [%l0 + CPU_KPRUNRUN], %l0
brnz,a,pt %l0, 1f ! Need to call kpreempt?
or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag
1:
btst LOFAULT_SET, %l1
bz,pn %icc, 3f
andncc %o5, COPY_FLAGS, %o5
! Here via bcopy. Check to see if the handler was NULL.
! If so, just return quietly. Otherwise, reset the
! handler and return.
bz,pn %ncc, 2f
nop
membar #Sync
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2:
btst KPREEMPT_FLAG, %l1
bz,pt %icc, 3f
nop
call kpreempt
rdpr %pil, %o0 ! pass %pil
3:
ret
restore %g0, 0, %o0
SET_SIZE(bcopy_more)
#else /* NIAGARA_IMPL */
save %sp, -SA(MINFRAME), %sp
clr %o5 ! flag LOFAULT_SET is not set for bcopy
.do_copy:
cmp %i2, 12 ! for small counts
blu %ncc, .bytecp ! just copy bytes
.empty
cmp %i2, 128 ! for less than 128 bytes
blu,pn %ncc, .bcb_punt ! no block st/quad ld
nop
set use_hw_bcopy, %o2
ld [%o2], %o2
brz,pn %o2, .bcb_punt
nop
subcc %i1, %i0, %i3
bneg,a,pn %ncc, 1f
neg %i3
1:
/*
* Compare against 256 since we should be checking block addresses
* and (dest & ~63) - (src & ~63) can be 3 blocks even if
* src = dest + (64 * 3) + 63.
*/
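/*
 * For example, with dest block aligned and src = dest + 255
 * (= dest + 64*3 + 63), the block aligned addresses differ by only
 * 192 bytes (3 blocks).  Requiring a raw distance of at least 256
 * guarantees the block aligned addresses are at least 4 blocks apart.
 */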
cmp %i3, 256
blu,pn %ncc, .bcb_punt
nop
/*
 * Copies that reach here have at least 2 blocks of data to copy.
*/
.do_blockcopy:
! Swap src/dst since the code below is memcpy code
! and memcpy/bcopy have different calling sequences
mov %i1, %i5
mov %i0, %i1
mov %i5, %i0
! Block (64 bytes) align the destination.
	andcc	%i0, 0x3f, %i3	! is dst aligned on a 64 byte boundary
	bz	%xcc, .chksrc	! dst is already block aligned
sub %i3, 0x40, %i3
neg %i3 ! bytes till dst 64 bytes aligned
sub %i2, %i3, %i2 ! update i2 with new count
! Based on source and destination alignment do
! either 8 bytes, 4 bytes, 2 bytes or byte copy.
! Is dst & src 8B aligned
or %i0, %i1, %o2
andcc %o2, 0x7, %g0
bz %ncc, .alewdcp
nop
! Is dst & src 4B aligned
andcc %o2, 0x3, %g0
bz %ncc, .alwdcp
nop
! Is dst & src 2B aligned
andcc %o2, 0x1, %g0
bz %ncc, .alhlfwdcp
nop
! 1B aligned
1: ldub [%i1], %o2
stb %o2, [%i0]
inc %i1
deccc %i3
bgu,pt %ncc, 1b
inc %i0
ba .chksrc
nop
! dst & src 4B aligned
.alwdcp:
ld [%i1], %o2
st %o2, [%i0]
add %i1, 0x4, %i1
subcc %i3, 0x4, %i3
bgu,pt %ncc, .alwdcp
add %i0, 0x4, %i0
ba .chksrc
nop
! dst & src 2B aligned
.alhlfwdcp:
lduh [%i1], %o2
stuh %o2, [%i0]
add %i1, 0x2, %i1
subcc %i3, 0x2, %i3
bgu,pt %ncc, .alhlfwdcp
add %i0, 0x2, %i0
ba .chksrc
nop
! dst & src 8B aligned
.alewdcp:
ldx [%i1], %o2
stx %o2, [%i0]
add %i1, 0x8, %i1
subcc %i3, 0x8, %i3
bgu,pt %ncc, .alewdcp
add %i0, 0x8, %i0
! Now Destination is block (64 bytes) aligned
.chksrc:
andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
sub %i2, %i3, %i2 ! Residue bytes in %i2
mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
andcc %i1, 0xf, %o2 ! is src quadword aligned
bz,pn %xcc, .blkcpy ! src offset in %o2
nop
cmp %o2, 0x8
bg .cpy_upper_double
nop
bl .cpy_lower_double
nop
! Falls through when source offset is equal to 8 i.e.
! source is double word aligned.
! In this case no shift/merge of data is required
sub %i1, %o2, %i1 ! align the src at 16 bytes.
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetch [%l0+0x0], #one_read
ldda [%i1+0x0]%asi, %l2
loop0:
ldda [%i1+0x10]%asi, %l4
prefetch [%l0+0x40], #one_read
stxa %l3, [%i0+0x0]%asi
stxa %l4, [%i0+0x8]%asi
ldda [%i1+0x20]%asi, %l2
stxa %l5, [%i0+0x10]%asi
stxa %l2, [%i0+0x18]%asi
ldda [%i1+0x30]%asi, %l4
stxa %l3, [%i0+0x20]%asi
stxa %l4, [%i0+0x28]%asi
ldda [%i1+0x40]%asi, %l2
stxa %l5, [%i0+0x30]%asi
stxa %l2, [%i0+0x38]%asi
add %l0, 0x40, %l0
add %i1, 0x40, %i1
subcc %i3, 0x40, %i3
bgu,pt %xcc, loop0
add %i0, 0x40, %i0
ba .blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
.cpy_lower_double:
sub %i1, %o2, %i1 ! align the src at 16 bytes.
sll %o2, 3, %o0 ! %o0 left shift
mov 0x40, %o1
sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetch [%l0+0x0], #one_read
ldda [%i1+0x0]%asi, %l2 ! partial data in %l2 and %l3 has
! complete data
loop1:
ldda [%i1+0x10]%asi, %l4 ! %l4 has partial data for this read.
ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4
! into %l2 and %l3
prefetch [%l0+0x40], #one_read
stxa %l2, [%i0+0x0]%asi
stxa %l3, [%i0+0x8]%asi
ldda [%i1+0x20]%asi, %l2
ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and
stxa %l4, [%i0+0x10]%asi ! %l4 from previous read
stxa %l5, [%i0+0x18]%asi ! into %l4 and %l5
! Repeat the same for next 32 bytes.
ldda [%i1+0x30]%asi, %l4
ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
stxa %l2, [%i0+0x20]%asi
stxa %l3, [%i0+0x28]%asi
ldda [%i1+0x40]%asi, %l2
ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
stxa %l4, [%i0+0x30]%asi
stxa %l5, [%i0+0x38]%asi
add %l0, 0x40, %l0
add %i1, 0x40, %i1
subcc %i3, 0x40, %i3
bgu,pt %xcc, loop1
add %i0, 0x40, %i0
ba .blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
.cpy_upper_double:
sub %i1, %o2, %i1 ! align the src at 16 bytes.
mov 0x8, %o0
sub %o2, %o0, %o0
sll %o0, 3, %o0 ! %o0 left shift
mov 0x40, %o1
sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetch [%l0+0x0], #one_read
ldda [%i1+0x0]%asi, %l2 ! partial data in %l3 for this read and
! no data in %l2
loop2:
ldda [%i1+0x10]%asi, %l4 ! %l4 has complete data and %l5 has
! partial
ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5
! into %l3 and %l4
prefetch [%l0+0x40], #one_read
stxa %l3, [%i0+0x0]%asi
stxa %l4, [%i0+0x8]%asi
ldda [%i1+0x20]%asi, %l2
ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with
stxa %l5, [%i0+0x10]%asi ! %l5 from previous read
stxa %l2, [%i0+0x18]%asi ! into %l5 and %l2
! Repeat the same for next 32 bytes.
ldda [%i1+0x30]%asi, %l4
ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
stxa %l3, [%i0+0x20]%asi
stxa %l4, [%i0+0x28]%asi
ldda [%i1+0x40]%asi, %l2
ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
stxa %l5, [%i0+0x30]%asi
stxa %l2, [%i0+0x38]%asi
add %l0, 0x40, %l0
add %i1, 0x40, %i1
subcc %i3, 0x40, %i3
bgu,pt %xcc, loop2
add %i0, 0x40, %i0
ba .blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
! Both Source and Destination are block aligned.
! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
.blkcpy:
prefetch [%i1+0x0], #one_read
1:
ldda [%i1+0x0]%asi, %l0
ldda [%i1+0x10]%asi, %l2
prefetch [%i1+0x40], #one_read
stxa %l0, [%i0+0x0]%asi
ldda [%i1+0x20]%asi, %l4
ldda [%i1+0x30]%asi, %l6
stxa %l1, [%i0+0x8]%asi
stxa %l2, [%i0+0x10]%asi
stxa %l3, [%i0+0x18]%asi
stxa %l4, [%i0+0x20]%asi
stxa %l5, [%i0+0x28]%asi
stxa %l6, [%i0+0x30]%asi
stxa %l7, [%i0+0x38]%asi
add %i1, 0x40, %i1
subcc %i3, 0x40, %i3
bgu,pt %xcc, 1b
add %i0, 0x40, %i0
.blkdone:
membar #Sync
brz,pt %i2, .blkexit
nop
! Handle trailing bytes
cmp %i2, 0x8
blu,pt %ncc, .residue
nop
! Can we do some 8B ops
or %i1, %i0, %o2
andcc %o2, 0x7, %g0
bnz %ncc, .last4
nop
! Do 8byte ops as long as possible
.last8:
ldx [%i1], %o2
stx %o2, [%i0]
add %i1, 0x8, %i1
sub %i2, 0x8, %i2
cmp %i2, 0x8
bgu,pt %ncc, .last8
add %i0, 0x8, %i0
brz,pt %i2, .blkexit
nop
ba .residue
nop
.last4:
! Can we do 4B ops
andcc %o2, 0x3, %g0
bnz %ncc, .last2
nop
1:
ld [%i1], %o2
st %o2, [%i0]
add %i1, 0x4, %i1
sub %i2, 0x4, %i2
cmp %i2, 0x4
bgu,pt %ncc, 1b
add %i0, 0x4, %i0
brz,pt %i2, .blkexit
nop
ba .residue
nop
.last2:
! Can we do 2B ops
andcc %o2, 0x1, %g0
bnz %ncc, .residue
nop
1:
lduh [%i1], %o2
stuh %o2, [%i0]
add %i1, 0x2, %i1
sub %i2, 0x2, %i2
cmp %i2, 0x2
bgu,pt %ncc, 1b
add %i0, 0x2, %i0
brz,pt %i2, .blkexit
nop
.residue:
ldub [%i1], %o2
stb %o2, [%i0]
inc %i1
deccc %i2
bgu,pt %ncc, .residue
inc %i0
.blkexit:
membar #Sync ! sync error barrier
! Restore t_lofault handler if we came here from kcopy().

tst %o5
bz %ncc, 1f
andn %o5, LOFAULT_SET, %o5
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1:
ret
restore %g0, 0, %o0
.bcb_punt:
!
! use aligned transfers where possible
!
xor %i0, %i1, %o4 ! xor from and to address
btst 7, %o4 ! if lower three bits zero
bz .aldoubcp ! can align on double boundary
.empty ! assembler complaints about label
xor %i0, %i1, %o4 ! xor from and to address
btst 3, %o4 ! if lower two bits zero
bz .alwordcp ! can align on word boundary
btst 3, %i0 ! delay slot, from address unaligned?
!
! use aligned reads and writes where possible
! this differs from wordcp in that it copes
! with odd alignment between source and destination
! using word reads and writes with the proper shifts
! in between to align transfers to and from memory
! i0 - src address, i1 - dest address, i2 - count
! i3, i4 - tmps used for generating complete word
! i5 (word to write)
! l0 size in bits of upper part of source word (US)
! l1 size in bits of lower part of source word (LS = 32 - US)
! l2 size in bits of upper part of destination word (UD)
! l3 size in bits of lower part of destination word (LD = 32 - UD)
! l4 number of bytes leftover after aligned transfers complete
! l5 the number 32
!
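/*
 * A rough C-style sketch of the merge loop that follows, illustrative
 * only (us/ls stand for %l0/%l1 described above; src/dst walk the
 * buffers a word at a time once both have been aligned):
 *
 *	prev = first partial word, already shifted into position;
 *	while (aligned bytes remain) {
 *		w = *src++;			! aligned 4-byte read
 *		*dst++ = prev | (w >> us);	! upper src bits fill dst
 *		prev = w << ls;			! leftover for next word
 *	}
 */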
mov 32, %l5 ! load an oft-needed constant
bz .align_dst_only
btst %ncc, 3, %i1 ! is destination address aligned?
clr %i4 ! clear registers used in either case
bz .align_src_only
clr %l0
!
! both source and destination addresses are unaligned
!
1: ! align source
ldub [%i0], %i3 ! read a byte from source address
add %i0, 1, %i0 ! increment source address
or %i4, %i3, %i4 ! or in with previous bytes (if any)
btst 3, %i0 ! is source aligned?
add %l0, 8, %l0 ! increment size of upper source (US)
bnz,a 1b
sll %i4, 8, %i4 ! make room for next byte
sub %l5, %l0, %l1 ! generate shift left count (LS)
sll %i4, %l1, %i4 ! prepare to get rest
ld [%i0], %i3 ! read a word
add %i0, 4, %i0 ! increment source address
srl %i3, %l0, %i5 ! upper src bits into lower dst bits
or %i4, %i5, %i5 ! merge
mov 24, %l3 ! align destination
1:
srl %i5, %l3, %i4 ! prepare to write a single byte
stb %i4, [%i1] ! write a byte
add %i1, 1, %i1 ! increment destination address
sub %i2, 1, %i2 ! decrement count
btst 3, %i1 ! is destination aligned?
bnz,a 1b
sub %l3, 8, %l3 ! delay slot, decrement shift count (LD)
sub %l5, %l3, %l2 ! generate shift left count (UD)
sll %i5, %l2, %i5 ! move leftover into upper bytes
cmp %l2, %l0 ! cmp # reqd to fill dst w old src left
bgu %ncc, .more_needed ! need more to fill than we have
nop
sll %i3, %l1, %i3 ! clear upper used byte(s)
srl %i3, %l1, %i3
! get the odd bytes between alignments
sub %l0, %l2, %l0 ! regenerate shift count
sub %l5, %l0, %l1 ! generate new shift left count (LS)
and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0
andn %i2, 3, %i2 ! # of aligned bytes that can be moved
srl %i3, %l0, %i4
or %i5, %i4, %i5
st %i5, [%i1] ! write a word
subcc %i2, 4, %i2 ! decrement count
bz %ncc, .unalign_out
add %i1, 4, %i1 ! increment destination address
b 2f
sll %i3, %l1, %i5 ! get leftover into upper bits
.more_needed:
sll %i3, %l0, %i3 ! save remaining byte(s)
srl %i3, %l0, %i3
sub %l2, %l0, %l1 ! regenerate shift count
sub %l5, %l1, %l0 ! generate new shift left count
sll %i3, %l1, %i4 ! move to fill empty space
b 3f
or %i5, %i4, %i5 ! merge to complete word
!
! the source address is aligned and destination is not
!
.align_dst_only:
ld [%i0], %i4 ! read a word
add %i0, 4, %i0 ! increment source address
mov 24, %l0 ! initial shift alignment count
1:
srl %i4, %l0, %i3 ! prepare to write a single byte
stb %i3, [%i1] ! write a byte
add %i1, 1, %i1 ! increment destination address
sub %i2, 1, %i2 ! decrement count
btst 3, %i1 ! is destination aligned?
bnz,a 1b
sub %l0, 8, %l0 ! delay slot, decrement shift count
.xfer:
sub %l5, %l0, %l1 ! generate shift left count
sll %i4, %l1, %i5 ! get leftover
3:
and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0
andn %i2, 3, %i2 ! # of aligned bytes that can be moved
2:
ld [%i0], %i3 ! read a source word
add %i0, 4, %i0 ! increment source address
srl %i3, %l0, %i4 ! upper src bits into lower dst bits
or %i5, %i4, %i5 ! merge with upper dest bits (leftover)
st %i5, [%i1] ! write a destination word
subcc %i2, 4, %i2 ! decrement count
bz %ncc, .unalign_out ! check if done
add %i1, 4, %i1 ! increment destination address
b 2b ! loop
sll %i3, %l1, %i5 ! get leftover
.unalign_out:
tst %l4 ! any bytes leftover?
bz %ncc, .cpdone
.empty ! allow next instruction in delay slot
1:
sub %l0, 8, %l0 ! decrement shift
srl %i3, %l0, %i4 ! upper src byte into lower dst byte
stb %i4, [%i1] ! write a byte
subcc %l4, 1, %l4 ! decrement count
bz %ncc, .cpdone ! done?
add %i1, 1, %i1 ! increment destination
tst %l0 ! any more previously read bytes
bnz %ncc, 1b ! we have leftover bytes
mov %l4, %i2 ! delay slot, mv cnt where dbytecp wants
b .dbytecp ! let dbytecp do the rest
sub %i0, %i1, %i0 ! i0 gets the difference of src and dst
!
! the destination address is aligned and the source is not
!
.align_src_only:
ldub [%i0], %i3 ! read a byte from source address
add %i0, 1, %i0 ! increment source address
or %i4, %i3, %i4 ! or in with previous bytes (if any)
btst 3, %i0 ! is source aligned?
add %l0, 8, %l0 ! increment shift count (US)
bnz,a .align_src_only
sll %i4, 8, %i4 ! make room for next byte
b,a .xfer
!
! if from address unaligned for double-word moves,
! move bytes till it is; if count is < 56 it could take
! longer to align the thing than to do the transfer
! in word size chunks right away
!
.aldoubcp:
cmp %i2, 56 ! if count < 56, use wordcp, it takes
blu,a %ncc, .alwordcp ! longer to align doubles than words
mov 3, %o0 ! mask for word alignment
call .alignit ! copy bytes until aligned
mov 7, %o0 ! mask for double alignment
!
! source and destination are now double-word aligned
! i3 has aligned count returned by alignit
!
and %i2, 7, %i2 ! unaligned leftover count
sub %i0, %i1, %i0 ! i0 gets the difference of src and dst
5:
ldx [%i0+%i1], %o4 ! read from address
stx %o4, [%i1] ! write at destination address
subcc %i3, 8, %i3 ! dec count
bgu %ncc, 5b
add %i1, 8, %i1 ! delay slot, inc to address
cmp %i2, 4 ! see if we can copy a word
blu %ncc, .dbytecp ! if 3 or less bytes use bytecp
.empty
!
! for leftover bytes we fall into wordcp, if needed
!
.wordcp:
and %i2, 3, %i2 ! unaligned leftover count
5:
ld [%i0+%i1], %o4 ! read from address
st %o4, [%i1] ! write at destination address
subcc %i3, 4, %i3 ! dec count
bgu %ncc, 5b
add %i1, 4, %i1 ! delay slot, inc to address
b,a .dbytecp
! we come here to align copies on word boundaries
.alwordcp:
call .alignit ! go word-align it
mov 3, %o0 ! bits that must be zero to be aligned
b .wordcp
sub %i0, %i1, %i0 ! i0 gets the difference of src and dst
!
! byte copy, works with any alignment
!
.bytecp:
b .dbytecp
sub %i0, %i1, %i0 ! i0 gets difference of src and dst
!
! differenced byte copy, works with any alignment
! assumes dest in %i1 and (source - dest) in %i0
!
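/*
 * A rough C-style sketch of the differenced loop below, illustrative
 * only; keeping (source - dest) in %i0 lets one index walk both buffers:
 *
 *	diff = src - dst;			! %i0
 *	while (count-- > 0)
 *		*dst = *(dst + diff), dst++;	! ldub [%i0+%i1]; stb [%i1]
 */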
1:
stb %o4, [%i1] ! write to address
inc %i1 ! inc to address
.dbytecp:
deccc %i2 ! dec count
bgeu,a %ncc, 1b ! loop till done
ldub [%i0+%i1], %o4 ! read from address
.cpdone:
membar #Sync ! sync error barrier
! Restore t_lofault handler if we came here from kcopy().
tst %o5
bz %ncc, 1f
andn %o5, LOFAULT_SET, %o5
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1:
ret
restore %g0, 0, %o0 ! return (0)
/*
* Common code used to align transfers on word and doubleword
* boundaries. Aligns source and destination and returns a count
* of aligned bytes to transfer in %i3
*/
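/*
 * A rough C-style sketch of .alignit, illustrative only; "mask" stands
 * for the alignment mask passed in %o0 (3 for word, 7 for doubleword):
 *
 *	while (from & mask) {			! copy bytes until aligned
 *		*to++ = *from++;
 *		count--;
 *	}
 *	return (count & ~mask);			! aligned count, in %i3
 */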
1:
inc %i0 ! inc from
stb %o4, [%i1] ! write a byte
inc %i1 ! inc to
dec %i2 ! dec count
.alignit:
btst %o0, %i0 ! %o0 is bit mask to check for alignment
bnz,a 1b
ldub [%i0], %o4 ! read next byte
retl
andn %i2, %o0, %i3 ! return size of aligned bytes
SET_SIZE(bcopy)
#endif /* NIAGARA_IMPL */
#endif /* lint */
/*
* Block copy with possibly overlapped operands.
*/
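/*
 * A rough C-style sketch of the decision below, illustrative only:
 *
 *	if (count == 0)
 *		return;
 *	if (count <= labs(from - to))
 *		bcopy(from, to, count);		! regions cannot overlap
 *	else if (from >= to)
 *		copy forward, a byte at a time;
 *	else
 *		copy backward, a byte at a time;
 */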
#if defined(lint)
/*ARGSUSED*/
void
ovbcopy(const void *from, void *to, size_t count)
{}
#else /* lint */
ENTRY(ovbcopy)
tst %o2 ! check count
bgu,a %ncc, 1f ! nothing to do or bad arguments
subcc %o0, %o1, %o3 ! difference of from and to address
retl ! return
nop
1:
bneg,a %ncc, 2f
neg %o3 ! if < 0, make it positive
2: cmp %o2, %o3 ! cmp size and abs(from - to)
bleu %ncc, bcopy ! if size <= abs(diff): use bcopy,
.empty ! no overlap
cmp %o0, %o1 ! compare from and to addresses
blu %ncc, .ov_bkwd ! if from < to, copy backwards
nop
!
! Copy forwards.
!
.ov_fwd:
ldub [%o0], %o3 ! read from address
inc %o0 ! inc from address
stb %o3, [%o1] ! write to address
deccc %o2 ! dec count
bgu %ncc, .ov_fwd ! loop till done
inc %o1 ! inc to address
retl ! return
nop
!
! Copy backwards.
!
.ov_bkwd:
deccc %o2 ! dec count
ldub [%o0 + %o2], %o3 ! get byte at end of src
bgu %ncc, .ov_bkwd ! loop till done
stb %o3, [%o1 + %o2] ! delay slot, store at end of dst
retl ! return
nop
SET_SIZE(ovbcopy)
#endif /* lint */
/*
* hwblkpagecopy()
*
* Copies exactly one page. This routine assumes the caller (ppcopy)
* has already disabled kernel preemption and has checked
* use_hw_bcopy.
*/
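/*
 * A rough C-level sketch of the loop below, illustrative only.  PAGESIZE
 * is a multiple of 0x80, so the page moves in 128-byte chunks using
 * block-initializing quad loads and stores:
 *
 *	for (off = 0; off < PAGESIZE; off += 0x80) {
 *		prefetch(src + off + 0x80);
 *		copy 128 bytes from (src + off) to (dst + off);
 *	}
 *	membar #Sync;
 */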
#ifdef lint
/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{ }
#else /* lint */
ENTRY(hwblkpagecopy)
save %sp, -SA(MINFRAME), %sp
! %i0 - source address (arg)
! %i1 - destination address (arg)
! %i2 - length of region (not arg)
set PAGESIZE, %i2
/*
* Copying exactly one page; PAGESIZE is a multiple of 0x80.
*/
mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
prefetch [%i0+0x0], #one_read
prefetch [%i0+0x40], #one_read
1:
prefetch [%i0+0x80], #one_read
prefetch [%i0+0xc0], #one_read
ldda [%i0+0x0]%asi, %l0
ldda [%i0+0x10]%asi, %l2
ldda [%i0+0x20]%asi, %l4
ldda [%i0+0x30]%asi, %l6
stxa %l0, [%i1+0x0]%asi
stxa %l1, [%i1+0x8]%asi
stxa %l2, [%i1+0x10]%asi
stxa %l3, [%i1+0x18]%asi
stxa %l4, [%i1+0x20]%asi
stxa %l5, [%i1+0x28]%asi
stxa %l6, [%i1+0x30]%asi
stxa %l7, [%i1+0x38]%asi
ldda [%i0+0x40]%asi, %l0
ldda [%i0+0x50]%asi, %l2
ldda [%i0+0x60]%asi, %l4
ldda [%i0+0x70]%asi, %l6
stxa %l0, [%i1+0x40]%asi
stxa %l1, [%i1+0x48]%asi
stxa %l2, [%i1+0x50]%asi
stxa %l3, [%i1+0x58]%asi
stxa %l4, [%i1+0x60]%asi
stxa %l5, [%i1+0x68]%asi
stxa %l6, [%i1+0x70]%asi
stxa %l7, [%i1+0x78]%asi
add %i0, 0x80, %i0
subcc %i2, 0x80, %i2
bgu,pt %xcc, 1b
add %i1, 0x80, %i1
membar #Sync
ret
restore %g0, 0, %o0
SET_SIZE(hwblkpagecopy)
#endif /* lint */
/*
* Transfer data to and from user space -
* Note that these routines can cause faults
* It is assumed that the kernel has nothing at
* less than KERNELBASE in the virtual address space.
*
* Note that copyin(9F) and copyout(9F) are part of the
* DDI/DKI which specifies that they return '-1' on "errors."
*
* Sigh.
*
* So there are two extremely similar routines - xcopyin() and xcopyout()
* which return the errno that we've faithfully computed. This
* allows other callers (e.g. uiomove(9F)) to work correctly.
* Given that these are used pretty heavily, we expand the calling
* sequences inline for all flavours (rather than making wrappers).
*
* There are also stub routines for xcopyout_little and xcopyin_little,
* which currently are intended to handle requests of <= 16 bytes from
* do_unaligned. Future enhancement to make them handle 8k pages efficiently
* is left as an exercise...
*/
/*
* Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
*
* General theory of operation:
*
* None of the copyops routines grab a window until it's decided that
* we need to do a HW block copy operation. This saves a window
* spill/fill when we're called during socket ops. The typical IO
* path won't cause spill/fill traps.
*
* This code uses a set of 4 limits for the maximum size that will
* be copied given a particular input/output address alignment.
* The default limits are:
*
* single byte aligned - 256 (hw_copy_limit_1)
* two byte aligned - 512 (hw_copy_limit_2)
* four byte aligned - 1024 (hw_copy_limit_4)
* eight byte aligned - 1024 (hw_copy_limit_8)
*
* If the value for a particular limit is zero, the copy will be done
* via the copy loops rather than block store/quad load instructions.
*
* Flow:
*
* If count == zero return zero.
*
* Store the previous lo_fault handler into %g6.
* Place our secondary lofault handler into %g5.
* Place the address of our nowindow fault handler into %o3.
* Place the address of the windowed fault handler into %o4.
* --> We'll use this handler if we end up grabbing a window
* --> before we use block initializing store and quad load ASIs
*
* If count is less than or equal to SMALL_LIMIT (7) we
* always do a byte for byte copy.
*
* If count is > SMALL_LIMIT, we check the alignment of the input
* and output pointers. Based on the alignment we check count
* against a limit based on detected alignment. If we exceed the
* alignment value we copy via block initializing store and quad
* load instructions.
*
* If we don't exceed one of the limits, we store -count in %o3,
* we store the number of chunks (8, 4, 2 or 1 byte) operated
* on in our basic copy loop in %o2. Following this we branch
* to the appropriate copy loop and copy that many chunks.
* Since we've been adding the chunk size to %o3 each time through
* as well as decrementing %o2, we can tell if any data is
* left to be copied by examining %o3. If that is zero, we're
* done and can go home. If not, we figure out what the largest
* chunk size left to be copied is and branch to that copy loop
* unless there's only one byte left. We load that as we're
* branching to code that stores it just before we return.
*
* Fault handlers are invoked if we reference memory that has no
* current mapping. All forms share the same copyio_fault handler.
* This routine handles fixing up the stack and general housecleaning.
* Each copy operation has a simple fault handler that is then called
* to do the work specific to the individual operation. The handlers
* for copyOP and xcopyOP are found at the end of the individual functions.
* The handlers for xcopyOP_little are found at the end of xcopyin_little.
* The handlers for copyOP_noerr are found at the end of copyin_noerr.
*/
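/*
 * Condensed pseudo-code for the flow described above (illustrative only;
 * the per-alignment limits and handler setup are as documented above):
 *
 *	if (count == 0)
 *		return (0);
 *	install lofault handlers;
 *	if (count <= SMALL_LIMIT)
 *		byte for byte copy;
 *	else if (limit for detected alignment == 0 || count <= that limit)
 *		copy in 8/4/2/1 byte chunks via the copy loops;
 *	else
 *		block initializing store / quad load copy;
 *	restore t_lofault;
 *	return (0);
 */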
/*
* Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
*/
#if defined(lint)
/*ARGSUSED*/
int
copyout(const void *kaddr, void *uaddr, size_t count)
{ return (0); }
#else /* lint */
/*
* We save the arguments in the following registers in case of a fault:
* kaddr - %g2
* uaddr - %g3
* count - %g4
*/
#define SAVE_SRC %g2
#define SAVE_DST %g3
#define SAVE_COUNT %g4
#define REAL_LOFAULT %g5
#define SAVED_LOFAULT %g6
/*
* Generic copyio fault handler. This is the first line of defense when a
* fault occurs in (x)copyin/(x)copyout. In order for this to function
* properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
* This allows us to share common code for all the flavors of the copy
* operations, including the _noerr versions.
*
* Note that this function will restore the original input parameters before
* calling REAL_LOFAULT. So the real handler can vector to the appropriate
* member of the t_copyop structure, if needed.
*/
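/*
 * In outline (illustrative only):
 *
 *	copyio_fault:
 *		if FPUSED_FLAG was set, restore %gsr and the FP regs;
 *		%o0 = SAVE_SRC; %o1 = SAVE_DST; %o2 = SAVE_COUNT;
 *		jmp REAL_LOFAULT	! per-operation handler finishes up
 */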
ENTRY(copyio_fault)
#if !defined(NIAGARA_IMPL)
btst FPUSED_FLAG, SAVED_LOFAULT
bz 1f
andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
wr %l5, 0, %gsr ! restore gsr
btst FPRS_FEF, %g1
bz %icc, 4f
nop
! restore fpregs from stack
BLD_FP_FROMSTACK(%o2)
ba,pt %ncc, 1f
nop
4:
FZERO ! zero all of the fpregs
wr %g1, %g0, %fprs ! restore fprs
1:
restore
mov SAVE_SRC, %o0
mov SAVE_DST, %o1
jmp REAL_LOFAULT
mov SAVE_COUNT, %o2
#else /* NIAGARA_IMPL */
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
restore
mov SAVE_SRC, %o0
mov SAVE_DST, %o1
jmp REAL_LOFAULT
mov SAVE_COUNT, %o2
#endif /* NIAGARA_IMPL */
SET_SIZE(copyio_fault)
ENTRY(copyio_fault_nowindow)
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
mov SAVE_SRC, %o0
mov SAVE_DST, %o1
jmp REAL_LOFAULT
mov SAVE_COUNT, %o2
SET_SIZE(copyio_fault_nowindow)
ENTRY(copyout)
sethi %hi(.copyout_err), REAL_LOFAULT
or REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
#if !defined(NIAGARA_IMPL)
.do_copyout:
tst %o2 ! check for zero count; quick exit
bz,pt %ncc, .co_smallqx
mov %o0, SAVE_SRC
mov %o1, SAVE_DST
mov %o2, SAVE_COUNT
cmp %o2, FP_COPY ! check for small copy/leaf case
bgt,pt %ncc, .co_copy_more
ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
/*
* Small copy out code
*
*/
sethi %hi(copyio_fault_nowindow), %o3
or %o3, %lo(copyio_fault_nowindow), %o3
membar #Sync
stn %o3, [THREAD_REG + T_LOFAULT]
mov ASI_USER, %asi
cmp %o2, SHORTCOPY ! make sure there is enough to align
ble,pt %ncc, .co_smallest
andcc %o1, 0x7, %o3 ! is dest long word aligned
bnz,pn %ncc, .co_align
andcc %o1, 1, %o3 ! is dest byte aligned
! Destination is long word aligned
! 8 cases for src alignment; load parts, store long words
.co_al_src:
andcc %o0, 7, %o3
brnz,pt %o3, .co_src_dst_unal8
nop
/*
* Special case for handling when src and dest are both long word aligned
* and total data to move is less than FP_COPY bytes
* Also handles the finish-up for large block moves, so the count may be less than 32 bytes
*/
.co_medlong:
subcc %o2, 31, %o2 ! adjust length to allow cc test
ble,pt %ncc, .co_medl31
nop
.co_medl32:
ldx [%o0], %o4 ! move 32 bytes
subcc %o2, 32, %o2 ! decrement length count by 32
stxa %o4, [%o1]%asi
ldx [%o0+8], %o4
stxa %o4, [%o1+8]%asi
ldx [%o0+16], %o4
add %o0, 32, %o0 ! increase src ptr by 32
stxa %o4, [%o1+16]%asi
ldx [%o0-8], %o4
add %o1, 32, %o1 ! increase dst ptr by 32
bgu,pt %ncc, .co_medl32 ! repeat if at least 32 bytes left
stxa %o4, [%o1-8]%asi
.co_medl31:
addcc %o2, 24, %o2 ! adjust count to be off by 7
ble,pt %ncc, .co_medl7 ! skip if 7 or fewer bytes left
nop
.co_medl8:
ldx [%o0], %o4 ! move 8 bytes
add %o0, 8, %o0 ! increase src ptr by 8
subcc %o2, 8, %o2 ! decrease count by 8
add %o1, 8, %o1 ! increase dst ptr by 8
bgu,pt %ncc, .co_medl8
stxa %o4, [%o1-8]%asi
.co_medl7:
addcc %o2, 7, %o2 ! finish adjustment of remaining count
bnz,pt %ncc, .co_small4 ! do final bytes if not finished
.co_smallx: ! finish up and exit
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
.co_smallqx:
retl
mov %g0, %o0
.co_small4:
cmp %o2, 4
blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left
nop !
ld [%o0], %o4 ! move 4 bytes
add %o0, 4, %o0 ! increase src ptr by 4
add %o1, 4, %o1 ! increase dst ptr by 4
subcc %o2, 4, %o2 ! decrease count by 4
bz,pt %ncc, .co_smallx
stwa %o4, [%o1-4]%asi
.co_small3x: ! Exactly 1, 2, or 3 bytes remain
subcc %o2, 1, %o2 ! reduce count for cc test
ldub [%o0], %o4 ! load one byte
bz,pt %ncc, .co_smallx
stba %o4, [%o1]%asi ! store one byte
ldub [%o0+1], %o4 ! load second byte
subcc %o2, 1, %o2
bz,pt %ncc, .co_smallx
stba %o4, [%o1+1]%asi ! store second byte
ldub [%o0+2], %o4 ! load third byte
ba .co_smallx
stba %o4, [%o1+2]%asi ! store third byte
.co_smallest: ! 7 or fewer bytes remain
cmp %o2, 4
blt,pt %ncc, .co_small3x
nop
ldub [%o0], %o4 ! read byte
subcc %o2, 4, %o2 ! reduce count by 4
stba %o4, [%o1]%asi ! write byte
ldub [%o0+1], %o4 ! repeat for total of 4 bytes
add %o0, 4, %o0 ! advance src by 4
stba %o4, [%o1+1]%asi
ldub [%o0-2], %o4
add %o1, 4, %o1 ! advance dst by 4
stba %o4, [%o1-2]%asi
ldub [%o0-1], %o4
bnz,pt %ncc, .co_small3x
stba %o4, [%o1-1]%asi
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
retl
mov %g0, %o0
.co_align: ! byte align test in prior branch delay
bnz,pt %ncc, .co_al_d1
.co_al_d1f: ! dest is now half word aligned
andcc %o1, 2, %o3
bnz,pt %ncc, .co_al_d2
.co_al_d2f: ! dest is now word aligned
andcc %o1, 4, %o3 ! is dest longword aligned?
bz,pt %ncc, .co_al_src
nop
.co_al_d4: ! dest is word aligned; src is unknown
ldub [%o0], %o4 ! move a word (src align unknown)
ldub [%o0+1], %o3
sll %o4, 24, %o4 ! position
sll %o3, 16, %o3 ! position
or %o4, %o3, %o3 ! merge
ldub [%o0+2], %o4
sll %o4, 8, %o4 ! position
or %o4, %o3, %o3 ! merge
ldub [%o0+3], %o4
or %o4, %o3, %o4 ! merge
stwa %o4,[%o1]%asi ! store four bytes
add %o0, 4, %o0 ! adjust src by 4
add %o1, 4, %o1 ! adjust dest by 4
sub %o2, 4, %o2 ! adjust count by 4
andcc %o0, 7, %o3 ! check for src long word alignment
brz,pt %o3, .co_medlong
.co_src_dst_unal8:
! dst is 8-byte aligned, src is not
! Size is less than FP_COPY
! Following code is to select for alignment
andcc %o0, 0x3, %o3 ! test word alignment
bz,pt %ncc, .co_medword
nop
andcc %o0, 0x1, %o3 ! test halfword alignment
bnz,pt %ncc, .co_med_byte ! go to byte move if not halfword
andcc %o0, 0x2, %o3 ! test which byte alignment
ba .co_medhalf
nop
.co_al_d1: ! align dest to half word
ldub [%o0], %o4 ! move a byte
add %o0, 1, %o0
stba %o4, [%o1]%asi
add %o1, 1, %o1
andcc %o1, 2, %o3
bz,pt %ncc, .co_al_d2f
sub %o2, 1, %o2
.co_al_d2: ! align dest to word
ldub [%o0], %o4 ! move a half-word (src align unknown)
ldub [%o0+1], %o3
sll %o4, 8, %o4 ! position
or %o4, %o3, %o4 ! merge
stha %o4, [%o1]%asi
add %o0, 2, %o0
add %o1, 2, %o1
andcc %o1, 4, %o3 ! is dest longword aligned?
bz,pt %ncc, .co_al_src
sub %o2, 2, %o2
ba .co_al_d4
nop
/*
* Handle all cases where src and dest are aligned on word
* boundaries. Use unrolled loops for better performance.
* This option wins over standard large data move when
* source and destination are in cache for medium
* to short data moves.
*/
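/*
 * A rough C-style sketch of the unrolled loop below, illustrative only:
 *
 *	while (count >= 32) {			! 8 words per iteration
 *		dst[0] = src[0]; dst[1] = src[1];
 *		dst[2] = src[2]; dst[3] = src[3];
 *		dst[4] = src[4]; dst[5] = src[5];
 *		dst[6] = src[6]; dst[7] = src[7];
 *		src += 8; dst += 8; count -= 32;
 *	}
 *	! then 8 bytes at a time, then the final 0-7 bytes
 */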
.co_medword:
subcc %o2, 31, %o2 ! adjust length to allow cc test
ble,pt %ncc, .co_medw31
nop
.co_medw32:
ld [%o0], %o4 ! move a block of 32 bytes
stwa %o4, [%o1]%asi
ld [%o0+4], %o4
stwa %o4, [%o1+4]%asi
ld [%o0+8], %o4
stwa %o4, [%o1+8]%asi
ld [%o0+12], %o4
stwa %o4, [%o1+12]%asi
ld [%o0+16], %o4
stwa %o4, [%o1+16]%asi
ld [%o0+20], %o4
subcc %o2, 32, %o2 ! decrement length count
stwa %o4, [%o1+20]%asi
ld [%o0+24], %o4
add %o0, 32, %o0 ! increase src ptr by 32
stwa %o4, [%o1+24]%asi
ld [%o0-4], %o4
add %o1, 32, %o1 ! increase dst ptr by 32
bgu,pt %ncc, .co_medw32 ! repeat if at least 32 bytes left
stwa %o4, [%o1-4]%asi
.co_medw31:
addcc %o2, 24, %o2 ! adjust count to be off by 7
ble,pt %ncc, .co_medw7 ! skip if 7 or fewer bytes left
nop !
.co_medw15:
ld [%o0], %o4 ! move a block of 8 bytes
subcc %o2, 8, %o2 ! decrement length count
stwa %o4, [%o1]%asi
add %o0, 8, %o0 ! increase src ptr by 8
ld [%o0-4], %o4
add %o1, 8, %o1 ! increase dst ptr by 8
bgu,pt %ncc, .co_medw15
stwa %o4, [%o1-4]%asi
.co_medw7:
addcc %o2, 7, %o2 ! finish adjustment of remaining count
bz,pt %ncc, .co_smallx ! exit if finished
cmp %o2, 4
blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left
nop !
ld [%o0], %o4 ! move 4 bytes
add %o0, 4, %o0 ! increase src ptr by 4
add %o1, 4, %o1 ! increase dst ptr by 4
subcc %o2, 4, %o2 ! decrease count by 4
bnz .co_small3x
stwa %o4, [%o1-4]%asi
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
retl
mov %g0, %o0
.co_medhalf:
subcc %o2, 31, %o2 ! adjust length to allow cc test
ble,pt %ncc, .co_medh31
nop
.co_medh32: ! load and store block of 32 bytes
lduh [%o0], %o4 ! move 32 bytes
subcc %o2, 32, %o2 ! decrement length count
lduw [%o0+2], %o3
sllx %o4, 48, %o4
sllx %o3, 16, %o3
or %o4, %o3, %o3
lduh [%o0+6], %o4
or %o4, %o3, %o4
stxa %o4, [%o1]%asi
lduh [%o0+8], %o4
lduw [%o0+10], %o3
sllx %o4, 48, %o4
sllx %o3, 16, %o3
or %o4, %o3, %o3
lduh [%o0+14], %o4
or %o4, %o3, %o4
stxa %o4, [%o1+8]%asi
lduh [%o0+16], %o4
lduw [%o0+18], %o3
sllx %o4, 48, %o4
sllx %o3, 16, %o3
or %o4, %o3, %o3
lduh [%o0+22], %o4
or %o4, %o3, %o4
stxa %o4, [%o1+16]%asi
add %o0, 32, %o0 ! increase src ptr by 32
add %o1, 32, %o1 ! increase dst ptr by 32
lduh [%o0-8], %o4
lduw [%o0-6], %o3
sllx %o4, 48, %o4
sllx %o3, 16, %o3
or %o4, %o3, %o3
lduh [%o0-2], %o4
or %o3, %o4, %o4
bgu,pt %ncc, .co_medh32 ! repeat if at least 32 bytes left
stxa %o4, [%o1-8]%asi
.co_medh31:
addcc %o2, 24, %o2 ! adjust count to be off by 7
ble,pt %ncc, .co_medh7 ! skip if 7 or fewer bytes left
nop !
.co_medh15:
lduh [%o0], %o4 ! move 8 bytes
subcc %o2, 8, %o2 ! decrement length count
lduw [%o0+2], %o3
sllx %o4, 48, %o4
sllx %o3, 16, %o3
or %o4, %o3, %o3
add %o1, 8, %o1 ! increase dst ptr by 8
lduh [%o0+6], %o4
add %o0, 8, %o0 ! increase src ptr by 8
or %o4, %o3, %o4
bgu,pt %ncc, .co_medh15
stxa %o4, [%o1-8]%asi
.co_medh7:
addcc %o2, 7, %o2 ! finish adjustment of remaining count
bz,pt %ncc, .co_smallx ! exit if finished
cmp %o2, 4
blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left
nop !
lduh [%o0], %o4
sll %o4, 16, %o4
lduh [%o0+2], %o3
or %o3, %o4, %o4
subcc %o2, 4, %o2
add %o0, 4, %o0
add %o1, 4, %o1
bnz .co_small3x
stwa %o4, [%o1-4]%asi
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
retl
mov %g0, %o0
.align 16
.co_med_byte:
bnz,pt %ncc, .co_medbh32a ! go to correct byte move
subcc %o2, 31, %o2 ! adjust length to allow cc test
ble,pt %ncc, .co_medb31
nop
.co_medb32: ! Alignment 1 or 5
subcc %o2, 32, %o2 ! decrement length count
ldub [%o0], %o4 ! load and store a block of 32 bytes
sllx %o4, 56, %o3
lduh [%o0+1], %o4
sllx %o4, 40, %o4
or %o4, %o3, %o3
lduw [%o0+3], %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0+7], %o4
or %o4, %o3, %o4
stxa %o4, [%o1]%asi
ldub [%o0+8], %o4
sllx %o4, 56, %o3
lduh [%o0+9], %o4
sllx %o4, 40, %o4
or %o4, %o3, %o3
lduw [%o0+11], %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0+15], %o4
or %o4, %o3, %o4
stxa %o4, [%o1+8]%asi
ldub [%o0+16], %o4
sllx %o4, 56, %o3
lduh [%o0+17], %o4
sllx %o4, 40, %o4
or %o4, %o3, %o3
lduw [%o0+19], %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0+23], %o4
or %o4, %o3, %o4
stxa %o4, [%o1+16]%asi
add %o0, 32, %o0 ! increase src ptr by 32
add %o1, 32, %o1 ! increase dst ptr by 32
ldub [%o0-8], %o4
sllx %o4, 56, %o3
lduh [%o0-7], %o4
sllx %o4, 40, %o4
or %o4, %o3, %o3
lduw [%o0-5], %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0-1], %o4
or %o4, %o3, %o4
bgu,pt %ncc, .co_medb32 ! repeat if at least 32 bytes left
stxa %o4, [%o1-8]%asi
.co_medb31: ! 31 or fewer bytes remaining
addcc %o2, 24, %o2 ! adjust count to be off by 7
ble,pt %ncc, .co_medb7 ! skip if 7 or fewer bytes left
nop !
.co_medb15:
ldub [%o0], %o4 ! load and store a block of 8 bytes
subcc %o2, 8, %o2 ! decrement length count
sllx %o4, 56, %o3
lduh [%o0+1], %o4
sllx %o4, 40, %o4
or %o4, %o3, %o3
lduw [%o0+3], %o4
add %o1, 8, %o1 ! increase dst ptr by 8
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0+7], %o4
add %o0, 8, %o0 ! increase src ptr by 8
or %o4, %o3, %o4
bgu,pt %ncc, .co_medb15
stxa %o4, [%o1-8]%asi
.co_medb7:
addcc %o2, 7, %o2 ! finish adjustment of remaining count
bz,pt %ncc, .co_smallx ! exit if finished
cmp %o2, 4
blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left
nop !
ldub [%o0], %o4 ! move 4 bytes
sll %o4, 24, %o3
lduh [%o0+1], %o4
sll %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0+3], %o4
or %o4, %o3, %o4
subcc %o2, 4, %o2
add %o0, 4, %o0
add %o1, 4, %o1
bnz .co_small3x
stwa %o4, [%o1-4]%asi
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
retl
mov %g0, %o0
.align 16
.co_medbh32a:
ble,pt %ncc, .co_medbh31
nop
.co_medbh32: ! Alignment 3 or 7
subcc %o2, 32, %o2 ! decrement length count
ldub [%o0], %o4 ! load and store a block of 32 bytes
sllx %o4, 56, %o3
lduw [%o0+1], %o4
sllx %o4, 24, %o4
or %o4, %o3, %o3
lduh [%o0+5], %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0+7], %o4
or %o4, %o3, %o4
stxa %o4, [%o1]%asi
ldub [%o0+8], %o4
sllx %o4, 56, %o3
lduw [%o0+9], %o4
sllx %o4, 24, %o4
or %o4, %o3, %o3
lduh [%o0+13], %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0+15], %o4
or %o4, %o3, %o4
stxa %o4, [%o1+8]%asi
ldub [%o0+16], %o4
sllx %o4, 56, %o3
lduw [%o0+17], %o4
sllx %o4, 24, %o4
or %o4, %o3, %o3
lduh [%o0+21], %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0+23], %o4
or %o4, %o3, %o4
stxa %o4, [%o1+16]%asi
add %o0, 32, %o0 ! increase src ptr by 32
add %o1, 32, %o1 ! increase dst ptr by 32
ldub [%o0-8], %o4
sllx %o4, 56, %o3
lduw [%o0-7], %o4
sllx %o4, 24, %o4
or %o4, %o3, %o3
lduh [%o0-3], %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0-1], %o4
or %o4, %o3, %o4
bgu,pt %ncc, .co_medbh32 ! repeat if at least 32 bytes left
stxa %o4, [%o1-8]%asi
.co_medbh31:
addcc %o2, 24, %o2 ! adjust count to be off by 7
ble,pt %ncc, .co_medb7 ! skip if 7 or fewer bytes left
nop !
.co_medbh15:
ldub [%o0], %o4 ! load and store a block of 8 bytes
sllx %o4, 56, %o3
lduw [%o0+1], %o4
sllx %o4, 24, %o4
or %o4, %o3, %o3
lduh [%o0+5], %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
ldub [%o0+7], %o4
or %o4, %o3, %o4
stxa %o4, [%o1]%asi
subcc %o2, 8, %o2 ! decrement length count
add %o1, 8, %o1 ! increase dst ptr by 8
add %o0, 8, %o0 ! increase src ptr by 8
bgu,pt %ncc, .co_medbh15
stxa %o4, [%o1-8]%asi
ba .co_medb7
nop
/*
* End of small copy (no window) code
*/
/*
* Long copy code
*/
.co_copy_more:
sethi %hi(copyio_fault), %o3
or %o3, %lo(copyio_fault), %o3
membar #Sync
stn %o3, [THREAD_REG + T_LOFAULT]
/*
* The following code is for large copies. We know there are at
* least FP_COPY bytes available. FP regs are used, so
* we save registers and fp regs before starting
*/
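/*
 * In outline, the long-copy path below does (illustrative only):
 *
 *	save a register window;
 *	if (fprs.fef was already set)
 *		save the live FP regs to the stack (BST_FP_TOSTACK);
 *	else
 *		set fprs.fef;
 *	save %gsr;
 *	align dst to 8 bytes, then to a 64-byte block boundary;
 *	copy 64-byte blocks with FP regs and block initializing stores;
 *	copy the residue, restore FP state/%gsr, restore t_lofault.
 */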
save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
or SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
rd %fprs, %g1 ! check for unused fp
! if fprs.fef == 0, set it.
! Setting it when already set costs more than checking
andcc %g1, FPRS_FEF, %g1 ! test FEF, fprs.du = fprs.dl = 0
bz,pt %ncc, .co_fp_unused
mov ASI_USER, %asi
BST_FP_TOSTACK(%o3)
ba .co_fp_ready
.co_fp_unused:
prefetch [%i0 + (1 * CACHE_LINE)], #one_read
wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
.co_fp_ready:
rd %gsr, %l5 ! save %gsr value
andcc %i1, 1, %o3 ! is dest byte aligned
bnz,pt %ncc, .co_big_d1
.co_big_d1f: ! dest is now half word aligned
andcc %i1, 2, %o3
bnz,pt %ncc, .co_big_d2
.co_big_d2f: ! dest is now word aligned
andcc %i1, 4, %o3 ! is dest longword aligned
bnz,pt %ncc, .co_big_d4
.co_big_d4f: ! dest is now long word aligned
andcc %i0, 7, %o3 ! is src long word aligned
brnz,pt %o3, .co_big_unal8
prefetch [%i0 + (2 * CACHE_LINE)], #one_read
! Src and dst are long word aligned
! align dst to 64 byte boundary
andcc %i1, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned
brz,pn %o3, .co_al_to_64
nop
sub %o3, 64, %o3 ! %o3 has negative bytes to move
add %i2, %o3, %i2 ! adjust remaining count
andcc %o3, 8, %o4 ! odd long words to move?
brz,pt %o4, .co_al_to_16
nop
add %o3, 8, %o3
ldx [%i0], %o4
add %i0, 8, %i0 ! increment src ptr
stxa %o4, [%i1]ASI_USER
add %i1, 8, %i1 ! increment dst ptr
! Dest is aligned on 16 bytes, src 8 byte aligned
.co_al_to_16:
andcc %o3, 0x30, %o4 ! 16 byte chunks to move?
brz,pt %o4, .co_al_to_64
nop
.co_al_mv_16:
add %o3, 16, %o3
ldx [%i0], %o4
stxa %o4, [%i1]ASI_USER
add %i0, 16, %i0 ! increment src ptr
ldx [%i0-8], %o4
add %i1, 8, %i1 ! increment dst ptr
stxa %o4, [%i1]ASI_USER
andcc %o3, 0x30, %o4
brnz,pt %o4, .co_al_mv_16
add %i1, 8, %i1 ! increment dst ptr
! Dest is aligned on 64 bytes, src 8 byte aligned
.co_al_to_64:
! Determine source alignment
! to correct 8 byte offset
andcc %i0, 32, %o3
brnz,pn %o3, .co_aln_1
andcc %i0, 16, %o3
brnz,pn %o3, .co_aln_01
andcc %i0, 8, %o3
brz,pn %o3, .co_aln_000
prefetch [%i0 + (3 * CACHE_LINE)], #one_read
ba .co_aln_001
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.co_aln_01:
brnz,pn %o3, .co_aln_011
prefetch [%i0 + (3 * CACHE_LINE)], #one_read
ba .co_aln_010
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.co_aln_1:
andcc %i0, 16, %o3
brnz,pn %o3, .co_aln_11
andcc %i0, 8, %o3
brnz,pn %o3, .co_aln_101
prefetch [%i0 + (3 * CACHE_LINE)], #one_read
ba .co_aln_100
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.co_aln_11:
brz,pn %o3, .co_aln_110
prefetch [%i0 + (3 * CACHE_LINE)], #one_read
.co_aln_111:
! Alignment off by 8 bytes
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
ldd [%i0], %d0
add %i0, 8, %i0
sub %i2, 8, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.co_aln_111_loop:
ldda [%i0]ASI_BLK_P,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d2
fmovd %d18, %d4
fmovd %d20, %d6
fmovd %d22, %d8
fmovd %d24, %d10
fmovd %d26, %d12
fmovd %d28, %d14
stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_AIUS
add %i0, 64, %i0
fmovd %d30, %d0
bgt,pt %ncc, .co_aln_111_loop
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
add %i1, %i0, %i1
stda %d0, [%i1]ASI_USER
ba .co_remain_stuff
add %i1, 8, %i1
! END OF aln_111
.co_aln_110:
! Alignment off by 16 bytes
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
ldd [%i0], %d0
ldd [%i0+8], %d2
add %i0, 16, %i0
sub %i2, 16, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.co_aln_110_loop:
ldda [%i0]ASI_BLK_P,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d4
fmovd %d18, %d6
fmovd %d20, %d8
fmovd %d22, %d10
fmovd %d24, %d12
fmovd %d26, %d14
stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_AIUS
add %i0, 64, %i0
fmovd %d28, %d0
fmovd %d30, %d2
bgt,pt %ncc, .co_aln_110_loop
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
add %i1, %i0, %i1
stda %d0, [%i1]%asi
stda %d2, [%i1+8]%asi
ba .co_remain_stuff
add %i1, 16, %i1
! END OF aln_110
.co_aln_101:
! Alignment off by 24 bytes
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
ldd [%i0], %d0
ldd [%i0+8], %d2
ldd [%i0+16], %d4
add %i0, 24, %i0
sub %i2, 24, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.co_aln_101_loop:
ldda [%i0]ASI_BLK_P,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d6
fmovd %d18, %d8
fmovd %d20, %d10
fmovd %d22, %d12
fmovd %d24, %d14
stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_AIUS
add %i0, 64, %i0
fmovd %d26, %d0
fmovd %d28, %d2
fmovd %d30, %d4
bgt,pt %ncc, .co_aln_101_loop
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
add %i1, %i0, %i1
stda %d0, [%i1]%asi
stda %d2, [%i1+8]%asi
stda %d4, [%i1+16]%asi
ba .co_remain_stuff
add %i1, 24, %i1
! END OF aln_101
.co_aln_100:
! Alignment off by 32 bytes
ldd [%i0], %d0
ldd [%i0+8], %d2
ldd [%i0+16],%d4
ldd [%i0+24],%d6
add %i0, 32, %i0
sub %i2, 32, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.co_aln_100_loop:
ldda [%i0]ASI_BLK_P,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d8
fmovd %d18, %d10
fmovd %d20, %d12
fmovd %d22, %d14
stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_AIUS
add %i0, 64, %i0
fmovd %d24, %d0
fmovd %d26, %d2
fmovd %d28, %d4
fmovd %d30, %d6
bgt,pt %ncc, .co_aln_100_loop
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
add %i1, %i0, %i1
stda %d0, [%i1]%asi
stda %d2, [%i1+8]%asi
stda %d4, [%i1+16]%asi
stda %d6, [%i1+24]%asi
ba .co_remain_stuff
add %i1, 32, %i1
! END OF aln_100
.co_aln_011:
! Alignment off by 40 bytes
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
ldd [%i0], %d0
ldd [%i0+8], %d2
ldd [%i0+16], %d4
ldd [%i0+24], %d6
ldd [%i0+32], %d8
add %i0, 40, %i0
sub %i2, 40, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.co_aln_011_loop:
ldda [%i0]ASI_BLK_P,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d10
fmovd %d18, %d12
fmovd %d20, %d14
stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_AIUS
add %i0, 64, %i0
fmovd %d22, %d0
fmovd %d24, %d2
fmovd %d26, %d4
fmovd %d28, %d6
fmovd %d30, %d8
bgt,pt %ncc, .co_aln_011_loop
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
add %i1, %i0, %i1
stda %d0, [%i1]%asi
stda %d2, [%i1+8]%asi
stda %d4, [%i1+16]%asi
stda %d6, [%i1+24]%asi
stda %d8, [%i1+32]%asi
ba .co_remain_stuff
add %i1, 40, %i1
! END OF aln_011
.co_aln_010:
! Alignment off by 48 bytes
ldd [%i0], %d0
ldd [%i0+8], %d2
ldd [%i0+16], %d4
ldd [%i0+24], %d6
ldd [%i0+32], %d8
ldd [%i0+40], %d10
add %i0, 48, %i0
sub %i2, 48, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.co_aln_010_loop:
ldda [%i0]ASI_BLK_P,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d12
fmovd %d18, %d14
stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_AIUS
add %i0, 64, %i0
fmovd %d20, %d0
fmovd %d22, %d2
fmovd %d24, %d4
fmovd %d26, %d6
fmovd %d28, %d8
fmovd %d30, %d10
bgt,pt %ncc, .co_aln_010_loop
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
add %i1, %i0, %i1
stda %d0, [%i1]%asi
stda %d2, [%i1+8]%asi
stda %d4, [%i1+16]%asi
stda %d6, [%i1+24]%asi
stda %d8, [%i1+32]%asi
stda %d10, [%i1+40]%asi
ba .co_remain_stuff
add %i1, 48, %i1
! END OF aln_010
.co_aln_001:
! Alignment off by 56 bytes
ldd [%i0], %d0
ldd [%i0+8], %d2
ldd [%i0+16], %d4
ldd [%i0+24], %d6
ldd [%i0+32], %d8
ldd [%i0+40], %d10
ldd [%i0+48], %d12
add %i0, 56, %i0
sub %i2, 56, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.co_aln_001_loop:
ldda [%i0]ASI_BLK_P,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d14
stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_AIUS
add %i0, 64, %i0
fmovd %d18, %d0
fmovd %d20, %d2
fmovd %d22, %d4
fmovd %d24, %d6
fmovd %d26, %d8
fmovd %d28, %d10
fmovd %d30, %d12
bgt,pt %ncc, .co_aln_001_loop
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
add %i1, %i0, %i1
stda %d0, [%i1]%asi
stda %d2, [%i1+8]%asi
stda %d4, [%i1+16]%asi
stda %d6, [%i1+24]%asi
stda %d8, [%i1+32]%asi
stda %d10, [%i1+40]%asi
stda %d12, [%i1+48]%asi
ba .co_remain_stuff
add %i1, 56, %i1
! END OF aln_001
.co_aln_000:
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.co_aln_000_loop:
ldda [%i0]ASI_BLK_P,%d0
subcc %o3, 64, %o3
stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_AIUS
add %i0, 64, %i0
bgt,pt %ncc, .co_aln_000_loop
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
add %i1, %i0, %i1
! END OF aln_000
.co_remain_stuff:
subcc %i2, 31, %i2 ! adjust length to allow cc test
ble,pt %ncc, .co_aln_31
nop
.co_aln_32:
ldx [%i0], %o4 ! move 32 bytes
subcc %i2, 32, %i2 ! decrement length count by 32
stxa %o4, [%i1]%asi
ldx [%i0+8], %o4
stxa %o4, [%i1+8]%asi
ldx [%i0+16], %o4
add %i0, 32, %i0 ! increase src ptr by 32
stxa %o4, [%i1+16]%asi
ldx [%i0-8], %o4
add %i1, 32, %i1 ! increase dst ptr by 32
bgu,pt %ncc, .co_aln_32 ! repeat if at least 32 bytes left
stxa %o4, [%i1-8]%asi
.co_aln_31:
addcc %i2, 24, %i2 ! adjust count to be off by 7
ble,pt %ncc, .co_aln_7 ! skip if 7 or fewer bytes left
nop !
.co_aln_15:
ldx [%i0], %o4 ! move 8 bytes
add %i0, 8, %i0 ! increase src ptr by 8
subcc %i2, 8, %i2 ! decrease count by 8
add %i1, 8, %i1 ! increase dst ptr by 8
bgu,pt %ncc, .co_aln_15
stxa %o4, [%i1-8]%asi
.co_aln_7:
addcc %i2, 7, %i2 ! finish adjustment of remaining count
bz,pt %ncc, .co_exit ! exit if finished
cmp %i2, 4
blt,pt %ncc, .co_unaln3x ! skip if less than 4 bytes left
nop !
ld [%i0], %o4 ! move 4 bytes
add %i0, 4, %i0 ! increase src ptr by 4
add %i1, 4, %i1 ! increase dst ptr by 4
subcc %i2, 4, %i2 ! decrease count by 4
bnz .co_unaln3x
stwa %o4, [%i1-4]%asi
ba .co_exit
nop
! destination alignment code
.co_big_d1:
ldub [%i0], %o4 ! move a byte
add %i0, 1, %i0
stba %o4, [%i1]ASI_USER
add %i1, 1, %i1
andcc %i1, 2, %o3
bz,pt %ncc, .co_big_d2f
sub %i2, 1, %i2
.co_big_d2:
ldub [%i0], %o4 ! move a half-word (src align unknown)
ldub [%i0+1], %o3
add %i0, 2, %i0
sll %o4, 8, %o4 ! position
or %o4, %o3, %o4 ! merge
stha %o4, [%i1]ASI_USER
add %i1, 2, %i1
andcc %i1, 4, %o3 ! is dest longword aligned
bz,pt %ncc, .co_big_d4f
sub %i2, 2, %i2
.co_big_d4: ! dest is at least word aligned
nop
ldub [%i0], %o4 ! move a word (src align unknown)
ldub [%i0+1], %o3
sll %o4, 24, %o4 ! position
sll %o3, 16, %o3 ! position
or %o4, %o3, %o3 ! merge
ldub [%i0+2], %o4
sll %o4, 8, %o4 ! position
or %o4, %o3, %o3 ! merge
ldub [%i0+3], %o4
or %o4, %o3, %o4 ! merge
stwa %o4,[%i1]ASI_USER ! store four bytes
add %i0, 4, %i0 ! adjust src by 4
add %i1, 4, %i1 ! adjust dest by 4
ba .co_big_d4f
sub %i2, 4, %i2 ! adjust count by 4
! Dst is on 8 byte boundary; src is not;
.co_big_unal8:
andcc %i1, 0x3f, %o3 ! is dst 64-byte block aligned?
bz %ncc, .co_unalnsrc
sub %o3, 64, %o3 ! %o3 will be multiple of 8
neg %o3 ! bytes until dest is 64 byte aligned
sub %i2, %o3, %i2 ! update cnt with bytes to be moved
! Move bytes according to source alignment
andcc %i0, 0x1, %o4
bnz %ncc, .co_unalnbyte ! check for byte alignment
nop
andcc %i0, 2, %o4 ! check for half word alignment
bnz %ncc, .co_unalnhalf
nop
! Src is word aligned, move bytes until dest 64 byte aligned
.co_unalnword:
ld [%i0], %o4 ! load 4 bytes
stwa %o4, [%i1]%asi ! and store 4 bytes
ld [%i0+4], %o4 ! load 4 bytes
add %i0, 8, %i0 ! increase src ptr by 8
stwa %o4, [%i1+4]%asi ! and store 4 bytes
subcc %o3, 8, %o3 ! decrease count by 8
bnz %ncc, .co_unalnword
add %i1, 8, %i1 ! increase dst ptr by 8
ba .co_unalnsrc
nop
! Src is half-word aligned, move bytes until dest 64 byte aligned
.co_unalnhalf:
lduh [%i0], %o4 ! load 2 bytes
sllx %o4, 32, %i3 ! shift left
lduw [%i0+2], %o4
or %o4, %i3, %i3
sllx %i3, 16, %i3
lduh [%i0+6], %o4
or %o4, %i3, %i3
stxa %i3, [%i1]ASI_USER
add %i0, 8, %i0
subcc %o3, 8, %o3
bnz %ncc, .co_unalnhalf
add %i1, 8, %i1
ba .co_unalnsrc
nop
! Src is Byte aligned, move bytes until dest 64 byte aligned
.co_unalnbyte:
sub %i1, %i0, %i1 ! share pointer advance
.co_unalnbyte_loop:
ldub [%i0], %o4
sllx %o4, 56, %i3
lduh [%i0+1], %o4
sllx %o4, 40, %o4
or %o4, %i3, %i3
lduh [%i0+3], %o4
sllx %o4, 24, %o4
or %o4, %i3, %i3
lduh [%i0+5], %o4
sllx %o4, 8, %o4
or %o4, %i3, %i3
ldub [%i0+7], %o4
or %o4, %i3, %i3
stxa %i3, [%i1+%i0]ASI_USER
subcc %o3, 8, %o3
bnz %ncc, .co_unalnbyte_loop
add %i0, 8, %i0
add %i1,%i0, %i1 ! restore pointer
! Destination is now block (64 byte aligned), src is not 8 byte aligned
.co_unalnsrc:
andn %i2, 0x3f, %i3 ! %i3 is multiple of block size
and %i2, 0x3f, %i2 ! residue bytes in %i2
add %i2, 64, %i2 ! Ensure we don't load beyond
sub %i3, 64, %i3 ! end of source buffer
andn %i0, 0x3f, %o4 ! %o4 has block aligned src address
prefetch [%o4 + (3 * CACHE_LINE)], #one_read
alignaddr %i0, %g0, %g0 ! generate %gsr
add %i0, %i3, %i0 ! advance %i0 to after blocks
!
! Determine source alignment to correct 8 byte offset
andcc %i0, 0x20, %o3
brnz,pn %o3, .co_unaln_1
andcc %i0, 0x10, %o3
brnz,pn %o3, .co_unaln_01
andcc %i0, 0x08, %o3
brz,a %o3, .co_unaln_000
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .co_unaln_001
nop
.co_unaln_01:
brnz,a %o3, .co_unaln_011
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .co_unaln_010
nop
.co_unaln_1:
brnz,pn %o3, .co_unaln_11
andcc %i0, 0x08, %o3
brnz,a %o3, .co_unaln_101
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .co_unaln_100
nop
.co_unaln_11:
brz,pn %o3, .co_unaln_110
prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.co_unaln_111:
ldd [%o4+56], %d14
.co_unaln_111_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d14, %d16, %d48
faligndata %d16, %d18, %d50
faligndata %d18, %d20, %d52
faligndata %d20, %d22, %d54
faligndata %d22, %d24, %d56
faligndata %d24, %d26, %d58
faligndata %d26, %d28, %d60
faligndata %d28, %d30, %d62
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_AIUS
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .co_unaln_111_loop
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .co_unaln_done
nop
.co_unaln_110:
ldd [%o4+48], %d12
ldd [%o4+56], %d14
.co_unaln_110_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d12, %d14, %d48
faligndata %d14, %d16, %d50
faligndata %d16, %d18, %d52
faligndata %d18, %d20, %d54
faligndata %d20, %d22, %d56
faligndata %d22, %d24, %d58
faligndata %d24, %d26, %d60
faligndata %d26, %d28, %d62
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_AIUS
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .co_unaln_110_loop
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .co_unaln_done
nop
.co_unaln_101:
ldd [%o4+40], %d10
ldd [%o4+48], %d12
ldd [%o4+56], %d14
.co_unaln_101_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d10, %d12, %d48
faligndata %d12, %d14, %d50
faligndata %d14, %d16, %d52
faligndata %d16, %d18, %d54
faligndata %d18, %d20, %d56
faligndata %d20, %d22, %d58
faligndata %d22, %d24, %d60
faligndata %d24, %d26, %d62
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_AIUS
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .co_unaln_101_loop
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .co_unaln_done
nop
.co_unaln_100:
ldd [%o4+32], %d8
ldd [%o4+40], %d10
ldd [%o4+48], %d12
ldd [%o4+56], %d14
.co_unaln_100_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d8, %d10, %d48
faligndata %d10, %d12, %d50
faligndata %d12, %d14, %d52
faligndata %d14, %d16, %d54
faligndata %d16, %d18, %d56
faligndata %d18, %d20, %d58
faligndata %d20, %d22, %d60
faligndata %d22, %d24, %d62
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_AIUS
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .co_unaln_100_loop
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .co_unaln_done
nop
.co_unaln_011:
ldd [%o4+24], %d6
ldd [%o4+32], %d8
ldd [%o4+40], %d10
ldd [%o4+48], %d12
ldd [%o4+56], %d14
.co_unaln_011_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d6, %d8, %d48
faligndata %d8, %d10, %d50
faligndata %d10, %d12, %d52
faligndata %d12, %d14, %d54
faligndata %d14, %d16, %d56
faligndata %d16, %d18, %d58
faligndata %d18, %d20, %d60
faligndata %d20, %d22, %d62
fmovd %d22, %d6
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_AIUS
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .co_unaln_011_loop
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .co_unaln_done
nop
.co_unaln_010:
ldd [%o4+16], %d4
ldd [%o4+24], %d6
ldd [%o4+32], %d8
ldd [%o4+40], %d10
ldd [%o4+48], %d12
ldd [%o4+56], %d14
.co_unaln_010_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d4, %d6, %d48
faligndata %d6, %d8, %d50
faligndata %d8, %d10, %d52
faligndata %d10, %d12, %d54
faligndata %d12, %d14, %d56
faligndata %d14, %d16, %d58
faligndata %d16, %d18, %d60
faligndata %d18, %d20, %d62
fmovd %d20, %d4
fmovd %d22, %d6
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_AIUS
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .co_unaln_010_loop
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .co_unaln_done
nop
.co_unaln_001:
ldd [%o4+8], %d2
ldd [%o4+16], %d4
ldd [%o4+24], %d6
ldd [%o4+32], %d8
ldd [%o4+40], %d10
ldd [%o4+48], %d12
ldd [%o4+56], %d14
.co_unaln_001_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d2, %d4, %d48
faligndata %d4, %d6, %d50
faligndata %d6, %d8, %d52
faligndata %d8, %d10, %d54
faligndata %d10, %d12, %d56
faligndata %d12, %d14, %d58
faligndata %d14, %d16, %d60
faligndata %d16, %d18, %d62
fmovd %d18, %d2
fmovd %d20, %d4
fmovd %d22, %d6
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_AIUS
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .co_unaln_001_loop
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
ba .co_unaln_done
nop
.co_unaln_000:
ldda [%o4]ASI_BLK_P, %d0
.co_unaln_000_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d0, %d2, %d48
faligndata %d2, %d4, %d50
faligndata %d4, %d6, %d52
faligndata %d6, %d8, %d54
faligndata %d8, %d10, %d56
faligndata %d10, %d12, %d58
faligndata %d12, %d14, %d60
faligndata %d14, %d16, %d62
fmovd %d16, %d0
fmovd %d18, %d2
fmovd %d20, %d4
fmovd %d22, %d6
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_AIUS
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .co_unaln_000_loop
prefetch [%o4 + (4 * CACHE_LINE)], #one_read
.co_unaln_done:
! Handle trailing bytes, 64 to 127
! Dest long word aligned, Src not long word aligned
cmp %i2, 15
bleu %ncc, .co_unaln_short
andn %i2, 0x7, %i3 ! %i3 is multiple of 8
and %i2, 0x7, %i2 ! residue bytes in %i2
add %i2, 8, %i2
sub %i3, 8, %i3 ! ensure we don't load past end of src
andn %i0, 0x7, %o4 ! %o4 has long word aligned src address
add %i0, %i3, %i0 ! advance %i0 to after multiple of 8
ldd [%o4], %d0 ! fetch partial word
.co_unaln_by8:
ldd [%o4+8], %d2
add %o4, 8, %o4
faligndata %d0, %d2, %d16
subcc %i3, 8, %i3
stda %d16, [%i1]%asi
fmovd %d2, %d0
bgu,pt %ncc, .co_unaln_by8
add %i1, 8, %i1
.co_unaln_short:
cmp %i2, 8
blt,pt %ncc, .co_unalnfin
nop
ldub [%i0], %o4
sll %o4, 24, %o3
ldub [%i0+1], %o4
sll %o4, 16, %o4
or %o4, %o3, %o3
ldub [%i0+2], %o4
sll %o4, 8, %o4
or %o4, %o3, %o3
ldub [%i0+3], %o4
or %o4, %o3, %o3
stwa %o3, [%i1]%asi
ldub [%i0+4], %o4
sll %o4, 24, %o3
ldub [%i0+5], %o4
sll %o4, 16, %o4
or %o4, %o3, %o3
ldub [%i0+6], %o4
sll %o4, 8, %o4
or %o4, %o3, %o3
ldub [%i0+7], %o4
or %o4, %o3, %o3
stwa %o3, [%i1+4]%asi
add %i0, 8, %i0
add %i1, 8, %i1
sub %i2, 8, %i2
.co_unalnfin:
cmp %i2, 4
blt,pt %ncc, .co_unalnz
tst %i2
ldub [%i0], %o3 ! read byte
subcc %i2, 4, %i2 ! reduce count by 4
sll %o3, 24, %o3 ! position
ldub [%i0+1], %o4
sll %o4, 16, %o4 ! position
or %o4, %o3, %o3 ! merge
ldub [%i0+2], %o4
sll %o4, 8, %o4 ! position
or %o4, %o3, %o3 ! merge
add %i1, 4, %i1 ! advance dst by 4
ldub [%i0+3], %o4
add %i0, 4, %i0 ! advance src by 4
or %o4, %o3, %o4 ! merge
bnz,pt %ncc, .co_unaln3x
stwa %o4, [%i1-4]%asi
ba .co_exit
nop
.co_unalnz:
bz,pt %ncc, .co_exit
wr %l5, %g0, %gsr ! restore %gsr
.co_unaln3x: ! Exactly 1, 2, or 3 bytes remain
subcc %i2, 1, %i2 ! reduce count for cc test
ldub [%i0], %o4 ! load one byte
bz,pt %ncc, .co_exit
stba %o4, [%i1]%asi ! store one byte
ldub [%i0+1], %o4 ! load second byte
subcc %i2, 1, %i2
bz,pt %ncc, .co_exit
stba %o4, [%i1+1]%asi ! store second byte
ldub [%i0+2], %o4 ! load third byte
stba %o4, [%i1+2]%asi ! store third byte
.co_exit:
brnz %g1, .co_fp_restore
nop
FZERO
wr %g1, %g0, %fprs
ba,pt %ncc, .co_ex2
membar #Sync
.co_fp_restore:
BLD_FP_FROMSTACK(%o4)
.co_ex2:
andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
ret
restore %g0, 0, %o0
.copyout_err:
ldn [THREAD_REG + T_COPYOPS], %o4
brz %o4, 2f
nop
ldn [%o4 + CP_COPYOUT], %g2
jmp %g2
nop
2:
retl
mov -1, %o0
#else /* NIAGARA_IMPL */
.do_copyout:
!
! Check the length and bail if zero.
!
tst %o2
bnz,pt %ncc, 1f
nop
retl
clr %o0
1:
sethi %hi(copyio_fault), %o4
or %o4, %lo(copyio_fault), %o4
sethi %hi(copyio_fault_nowindow), %o3
ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
or %o3, %lo(copyio_fault_nowindow), %o3
membar #Sync
stn %o3, [THREAD_REG + T_LOFAULT]
mov %o0, SAVE_SRC
mov %o1, SAVE_DST
mov %o2, SAVE_COUNT
!
! Check to see if we're more than SMALL_LIMIT (7 bytes).
! Run in leaf mode, using the %o regs as our input regs.
!
subcc %o2, SMALL_LIMIT, %o3
bgu,a,pt %ncc, .dco_ns
or %o0, %o1, %o3
!
! What was previously ".small_copyout"
! Do full differenced copy.
!
.dcobcp:
sub %g0, %o2, %o3 ! negate count
add %o0, %o2, %o0 ! make %o0 point at the end
add %o1, %o2, %o1 ! make %o1 point at the end
ba,pt %ncc, .dcocl
ldub [%o0 + %o3], %o4 ! load first byte
!
! %o0 and %o1 point at the end and remain pointing at the end
! of their buffers. We pull things out by adding %o3 (which is
! the negation of the length) to the buffer end which gives us
! the current location in the buffers. By incrementing %o3 we walk
! through both buffers without having to bump each buffer's
! pointer. A very fast 4 instruction loop.
!
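/*
 * Equivalent C-style sketch, illustrative only:
 *
 *	off = -count;				! %o3
 *	src += count; dst += count;		! both point past the end
 *	do {
 *		dst[off] = src[off];		! one index walks both
 *	} while (++off < 0);
 */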
.align 16
.dcocl:
stba %o4, [%o1 + %o3]ASI_USER
inccc %o3
bl,a,pt %ncc, .dcocl
ldub [%o0 + %o3], %o4
!
! We're done. Go home.
!
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
retl
clr %o0
!
! Try aligned copies from here.
!
.dco_ns:
! %o0 = kernel addr (to be copied from)
! %o1 = user addr (to be copied to)
! %o2 = length
! %o3 = %o1 | %o2 (used for alignment checking)
! %o4 is alternate lo_fault
! %o5 is original lo_fault
!
! See if we're single byte aligned. If we are, check the
! limit for single byte copies. If we're smaller or equal,
! bounce to the byte for byte copy loop. Otherwise do it in
! HW (if enabled).
!
btst 1, %o3
bz,pt %icc, .dcoh8
btst 7, %o3
!
! Single byte aligned. Do we do it via HW or via
! byte for byte? Do a quick no memory reference
! check to pick up small copies.
!
sethi %hi(hw_copy_limit_1), %o3
!
! Big enough that we need to check the HW limit for
! this size copy.
!
ld [%o3 + %lo(hw_copy_limit_1)], %o3
!
! Is HW copy on? If not, do everything byte for byte.
!
tst %o3
bz,pn %icc, .dcobcp
subcc %o3, %o2, %o3
!
! If we're less than or equal to the single byte copy limit,
! bop to the copy loop.
!
bge,pt %ncc, .dcobcp
nop
!
! We're big enough and copy is on. Do it with HW.
!
ba,pt %ncc, .big_copyout
nop
.dcoh8:
!
! 8 byte aligned?
!
bnz,a %ncc, .dcoh4
btst 3, %o3
!
! See if we're in the "small range".
! If so, go off and do the copy.
! If not, load the hard limit. %o3 is
! available for reuse.
!
sethi %hi(hw_copy_limit_8), %o3
ld [%o3 + %lo(hw_copy_limit_8)], %o3
!
! If it's zero, there's no HW bcopy.
! Bop off to the aligned copy.
!
tst %o3
bz,pn %icc, .dcos8
subcc %o3, %o2, %o3
!
! We're negative if our size is larger than hw_copy_limit_8.
!
bge,pt %ncc, .dcos8
nop
!
! HW assist is on and we're large enough. Do it.
!
ba,pt %ncc, .big_copyout
nop
.dcos8:
!
! Housekeeping for copy loops. Uses same idea as in the byte for
! byte copy loop above.
!
add %o0, %o2, %o0
add %o1, %o2, %o1
sub %g0, %o2, %o3
ba,pt %ncc, .dodebc
srl %o2, 3, %o2 ! Number of 8 byte chunks to copy
!
! 4 byte aligned?
!
.dcoh4:
bnz,pn %ncc, .dcoh2
!
! See if we're in the "small range".
! If so, go off and do the copy.
! If not, load the hard limit. %o3 is
! available for reuse.
!
sethi %hi(hw_copy_limit_4), %o3
ld [%o3 + %lo(hw_copy_limit_4)], %o3
!
! If it's zero, there's no HW bcopy.
! Bop off to the aligned copy.
!
tst %o3
bz,pn %icc, .dcos4
subcc %o3, %o2, %o3
!
! We're negative if our size is larger than hw_copy_limit_4.
!
bge,pt %ncc, .dcos4
nop
!
! HW assist is on and we're large enough. Do it.
!
ba,pt %ncc, .big_copyout
nop
.dcos4:
add %o0, %o2, %o0
add %o1, %o2, %o1
sub %g0, %o2, %o3
ba,pt %ncc, .dodfbc
srl %o2, 2, %o2 ! Number of 4 byte chunks to copy
!
! We must be 2 byte aligned. Off we go.
! The check for small copies was done in the
! delay at .dcoh4
!
.dcoh2:
ble %ncc, .dcos2
sethi %hi(hw_copy_limit_2), %o3
ld [%o3 + %lo(hw_copy_limit_2)], %o3
tst %o3
bz,pn %icc, .dcos2
subcc %o3, %o2, %o3
bge,pt %ncc, .dcos2
nop
!
! HW is on and we're big enough. Do it.
!
ba,pt %ncc, .big_copyout
nop
.dcos2:
add %o0, %o2, %o0
add %o1, %o2, %o1
sub %g0, %o2, %o3
ba,pt %ncc, .dodtbc
srl %o2, 1, %o2 ! Number of 2 byte chunks to copy
.small_copyout:
!
! Why are we doing this AGAIN? There are certain conditions in
! big_copyout that will cause us to forego the HW assisted copies
! and bounce back to a non-HW assisted copy. This dispatches those
! copies. Note that we branch around this in the main line code.
!
! We make no check for limits or HW enablement here. We've
! already been told that we're a poster child so just go off
! and do it.
!
or %o0, %o1, %o3
btst 1, %o3
bnz %icc, .dcobcp ! Most likely
btst 7, %o3
bz %icc, .dcos8
btst 3, %o3
bz %icc, .dcos4
nop
ba,pt %ncc, .dcos2
nop
.align 32
.dodebc:
ldx [%o0 + %o3], %o4
deccc %o2
stxa %o4, [%o1 + %o3]ASI_USER
bg,pt %ncc, .dodebc
addcc %o3, 8, %o3
!
! End of copy loop. Check to see if we're done. Most
! eight byte aligned copies end here.
!
bz,pt %ncc, .dcofh
nop
!
! Something is left - do it byte for byte.
!
ba,pt %ncc, .dcocl
ldub [%o0 + %o3], %o4 ! load next byte
!
! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
!
.align 32
.dodfbc:
lduw [%o0 + %o3], %o4
deccc %o2
sta %o4, [%o1 + %o3]ASI_USER
bg,pt %ncc, .dodfbc
addcc %o3, 4, %o3
!
! End of copy loop. Check to see if we're done. Most
! four byte aligned copies end here.
!
bz,pt %ncc, .dcofh
nop
!
! Something is left. Do it byte for byte.
!
ba,pt %ncc, .dcocl
ldub [%o0 + %o3], %o4 ! load next byte
!
! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
! copy.
!
.align 32
.dodtbc:
lduh [%o0 + %o3], %o4
deccc %o2
stha %o4, [%o1 + %o3]ASI_USER
bg,pt %ncc, .dodtbc
addcc %o3, 2, %o3
!
! End of copy loop. Anything left?
!
bz,pt %ncc, .dcofh
nop
!
! Deal with the last byte
!
ldub [%o0 + %o3], %o4
stba %o4, [%o1 + %o3]ASI_USER
.dcofh:
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
clr %o0
.big_copyout:
! We're going to go off and do a block copy.
! Switch fault handlers and grab a window. We
! don't do a membar #Sync since we've done only
! kernel data to this point.
stn %o4, [THREAD_REG + T_LOFAULT]
! Copyouts that reach here are larger than 256 bytes. The
! hw_copy_limit_1 is set to 256. Never set this limit to less
! than 128 bytes.
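!
! Rough outline of the block copyout path that follows (illustrative
! pseudo-code only, not assembled):
!
!	swap src/dst (the code below is written memcpy-style)
!	copy 1/2/4/8 byte units until dst is 64 byte (block) aligned
!	blocks = count & ~0x3f; residue = count & 0x3f
!	if (src is 16 byte aligned)
!		straight block loop (.co_blkcpy)
!	else if (src is 8 byte aligned)
!		block loop with no shift/merge (fall-through case below)
!	else
!		shift/merge block loop (.co_lower_double / .co_upper_double)
!	copy the residue with 8/4/2/1 byte loops (.co_blkdone)
!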
save %sp, -SA(MINFRAME), %sp
.do_block_copyout:
! Swap src/dst since the code below is memcpy code
! and memcpy/bcopy have different calling sequences
mov %i1, %i5
mov %i0, %i1
mov %i5, %i0
! Block (64 bytes) align the destination.
andcc %i0, 0x3f, %i3 ! is dst block aligned
bz %ncc, copyout_blalign ! dst already block aligned
sub %i3, 0x40, %i3
neg %i3 ! bytes till dst 64 bytes aligned
sub %i2, %i3, %i2 ! update i2 with new count
! Based on source and destination alignment do
! either 8 bytes, 4 bytes, 2 bytes or byte copy.
! Is dst & src 8B aligned
or %i0, %i1, %o2
andcc %o2, 0x7, %g0
bz %ncc, .co_alewdcp
nop
! Is dst & src 4B aligned
andcc %o2, 0x3, %g0
bz %ncc, .co_alwdcp
nop
! Is dst & src 2B aligned
andcc %o2, 0x1, %g0
bz %ncc, .co_alhlfwdcp
nop
! 1B aligned
1: ldub [%i1], %o2
stba %o2, [%i0]ASI_USER
inc %i1
deccc %i3
bgu,pt %ncc, 1b
inc %i0
ba copyout_blalign
nop
! dst & src 4B aligned
.co_alwdcp:
ld [%i1], %o2
sta %o2, [%i0]ASI_USER
add %i1, 0x4, %i1
subcc %i3, 0x4, %i3
bgu,pt %ncc, .co_alwdcp
add %i0, 0x4, %i0
ba copyout_blalign
nop
! dst & src 2B aligned
.co_alhlfwdcp:
lduh [%i1], %o2
stuha %o2, [%i0]ASI_USER
add %i1, 0x2, %i1
subcc %i3, 0x2, %i3
bgu,pt %ncc, .co_alhlfwdcp
add %i0, 0x2, %i0
ba copyout_blalign
nop
! dst & src 8B aligned
.co_alewdcp:
ldx [%i1], %o2
stxa %o2, [%i0]ASI_USER
add %i1, 0x8, %i1
subcc %i3, 0x8, %i3
bgu,pt %ncc, .co_alewdcp
add %i0, 0x8, %i0
! Now Destination is block (64 bytes) aligned
copyout_blalign:
andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
sub %i2, %i3, %i2 ! Residue bytes in %i2
mov ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
andcc %i1, 0xf, %o2 ! is src quadword aligned
bz,pn %xcc, .co_blkcpy ! src offset in %o2 (last 4-bits)
nop
cmp %o2, 0x8
bg .co_upper_double
nop
bl .co_lower_double
nop
! Falls through when source offset is equal to 8 i.e.
! source is double word aligned.
! In this case no shift/merge of data is required
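!
! Illustrative sketch of the three cases keyed off the low four bits
! of the source address (held in %o2); pseudo-code only, not assembled:
!
!	off = src & 0xf;
!	if (off == 0)	goto .co_blkcpy;	/* quadword aligned	*/
!	if (off <  8)	goto .co_lower_double;	/* shift by 8*off bits	*/
!	if (off >  8)	goto .co_upper_double;	/* shift by 8*(off-8)	*/
!	/* off == 8 falls through: doubleword aligned, no merge needed */
!
! The lower/upper loops use ALIGN_DATA() to merge each pair of 16 byte
! quad loads into aligned 8 byte stores.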
sub %i1, %o2, %i1 ! align the src at 16 bytes.
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetch [%l0+0x0], #one_read
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
.co_loop0:
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
prefetch [%l0+0x40], #one_read
stxa %l3, [%i0+0x0]%asi
stxa %l4, [%i0+0x8]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
stxa %l5, [%i0+0x10]%asi
stxa %l2, [%i0+0x18]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
stxa %l3, [%i0+0x20]%asi
stxa %l4, [%i0+0x28]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
stxa %l5, [%i0+0x30]%asi
stxa %l2, [%i0+0x38]%asi
add %l0, 0x40, %l0
subcc %i3, 0x40, %i3
bgu,pt %xcc, .co_loop0
add %i0, 0x40, %i0
ba .co_blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
.co_lower_double:
sub %i1, %o2, %i1 ! align the src at 16 bytes.
sll %o2, 3, %o0 ! %o0 left shift
mov 0x40, %o1
sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetch [%l0+0x0], #one_read
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! partial data in %l2 and %l3 has
! complete data
.co_loop1:
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has partial data
! for this read.
ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4
! into %l2 and %l3
prefetch [%l0+0x40], #one_read
stxa %l2, [%i0+0x0]%asi
stxa %l3, [%i0+0x8]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and
! %l4 from previous read
! into %l4 and %l5
stxa %l4, [%i0+0x10]%asi
stxa %l5, [%i0+0x18]%asi
! Repeat the same for next 32 bytes.
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
stxa %l2, [%i0+0x20]%asi
stxa %l3, [%i0+0x28]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
stxa %l4, [%i0+0x30]%asi
stxa %l5, [%i0+0x38]%asi
add %l0, 0x40, %l0
subcc %i3, 0x40, %i3
bgu,pt %xcc, .co_loop1
add %i0, 0x40, %i0
ba .co_blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
.co_upper_double:
sub %i1, %o2, %i1 ! align the src at 16 bytes.
sub %o2, 0x8, %o0
sll %o0, 3, %o0 ! %o0 left shift
mov 0x40, %o1
sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetch [%l0+0x0], #one_read
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! partial data in %l3
! for this read and
! no data in %l2
.co_loop2:
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has complete data
! and %l5 has partial
ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5
! into %l3 and %l4
prefetch [%l0+0x40], #one_read
stxa %l3, [%i0+0x0]%asi
stxa %l4, [%i0+0x8]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with
! %l5 from previous read
! into %l5 and %l2
stxa %l5, [%i0+0x10]%asi
stxa %l2, [%i0+0x18]%asi
! Repeat the same for next 32 bytes.
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
stxa %l3, [%i0+0x20]%asi
stxa %l4, [%i0+0x28]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
stxa %l5, [%i0+0x30]%asi
stxa %l2, [%i0+0x38]%asi
add %l0, 0x40, %l0
subcc %i3, 0x40, %i3
bgu,pt %xcc, .co_loop2
add %i0, 0x40, %i0
ba .co_blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
.co_blkcpy:
andn %i1, 0x3f, %o0 ! %o0 has block aligned source
prefetch [%o0+0x0], #one_read
1:
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
add %i1, 0x10, %i1
prefetch [%o0+0x40], #one_read
stxa %l0, [%i0+0x0]%asi
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6
add %i1, 0x10, %i1
stxa %l1, [%i0+0x8]%asi
stxa %l2, [%i0+0x10]%asi
stxa %l3, [%i0+0x18]%asi
stxa %l4, [%i0+0x20]%asi
stxa %l5, [%i0+0x28]%asi
stxa %l6, [%i0+0x30]%asi
stxa %l7, [%i0+0x38]%asi
add %o0, 0x40, %o0
subcc %i3, 0x40, %i3
bgu,pt %xcc, 1b
add %i0, 0x40, %i0
.co_blkdone:
membar #Sync
brz,pt %i2, .copyout_exit
nop
! Handle trailing bytes
cmp %i2, 0x8
blu,pt %ncc, .co_residue
nop
! Can we do some 8B ops
or %i1, %i0, %o2
andcc %o2, 0x7, %g0
bnz %ncc, .co_last4
nop
! Do 8byte ops as long as possible
.co_last8:
ldx [%i1], %o2
stxa %o2, [%i0]ASI_USER
add %i1, 0x8, %i1
sub %i2, 0x8, %i2
cmp %i2, 0x8
bgu,pt %ncc, .co_last8
add %i0, 0x8, %i0
brz,pt %i2, .copyout_exit
nop
ba .co_residue
nop
.co_last4:
! Can we do 4B ops
andcc %o2, 0x3, %g0
bnz %ncc, .co_last2
nop
1:
ld [%i1], %o2
sta %o2, [%i0]ASI_USER
add %i1, 0x4, %i1
sub %i2, 0x4, %i2
cmp %i2, 0x4
bgu,pt %ncc, 1b
add %i0, 0x4, %i0
brz,pt %i2, .copyout_exit
nop
ba .co_residue
nop
.co_last2:
! Can we do 2B ops
andcc %o2, 0x1, %g0
bnz %ncc, .co_residue
nop
1:
lduh [%i1], %o2
stuha %o2, [%i0]ASI_USER
add %i1, 0x2, %i1
sub %i2, 0x2, %i2
cmp %i2, 0x2
bgu,pt %ncc, 1b
add %i0, 0x2, %i0
brz,pt %i2, .copyout_exit
nop
! Copy the residue as byte copy
.co_residue:
ldub [%i1], %i4
stba %i4, [%i0]ASI_USER
inc %i1
deccc %i2
bgu,pt %xcc, .co_residue
inc %i0
.copyout_exit:
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
ret
restore %g0, 0, %o0
.copyout_err:
ldn [THREAD_REG + T_COPYOPS], %o4
brz %o4, 2f
nop
ldn [%o4 + CP_COPYOUT], %g2
jmp %g2
nop
2:
retl
mov -1, %o0
#endif /* NIAGARA_IMPL */
SET_SIZE(copyout)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyout(const void *kaddr, void *uaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(xcopyout)
sethi %hi(.xcopyout_err), REAL_LOFAULT
b .do_copyout
or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
.xcopyout_err:
ldn [THREAD_REG + T_COPYOPS], %o4
brz %o4, 2f
nop
ldn [%o4 + CP_XCOPYOUT], %g2
jmp %g2
nop
2:
retl
mov %g1, %o0
SET_SIZE(xcopyout)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyout_little(const void *kaddr, void *uaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(xcopyout_little)
sethi %hi(.little_err), %o4
ldn [THREAD_REG + T_LOFAULT], %o5
or %o4, %lo(.little_err), %o4
membar #Sync ! sync error barrier
stn %o4, [THREAD_REG + T_LOFAULT]
subcc %g0, %o2, %o3
add %o0, %o2, %o0
bz,pn %ncc, 2f ! check for zero bytes
sub %o2, 1, %o4
add %o0, %o4, %o0 ! start w/last byte
add %o1, %o2, %o1
ldub [%o0+%o3], %o4
1: stba %o4, [%o1+%o3]ASI_AIUSL
inccc %o3
sub %o0, 2, %o0 ! get next byte
bcc,a,pt %ncc, 1b
ldub [%o0+%o3], %o4
2: membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g0, %o0 ! return (0)
SET_SIZE(xcopyout_little)
#endif /* lint */
/*
* Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
*/
#if defined(lint)
/*ARGSUSED*/
int
copyin(const void *uaddr, void *kaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(copyin)
sethi %hi(.copyin_err), REAL_LOFAULT
or REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
#if !defined(NIAGARA_IMPL)
.do_copyin:
tst %o2 ! check for zero count; quick exit
bz,pt %ncc, .ci_smallqx
mov %o0, SAVE_SRC
mov %o1, SAVE_DST
mov %o2, SAVE_COUNT
cmp %o2, FP_COPY ! check for small copy/leaf case
bgt,pt %ncc, .ci_copy_more
ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
/*
* Small copy in code
*
*/
sethi %hi(copyio_fault_nowindow), %o3
or %o3, %lo(copyio_fault_nowindow), %o3
membar #Sync
stn %o3, [THREAD_REG + T_LOFAULT]
mov ASI_USER, %asi
cmp %o2, SHORTCOPY ! make sure there is enough to align
ble,pt %ncc, .ci_smallest
andcc %o1, 0x7, %o3 ! is dest long word aligned
bnz,pn %ncc, .ci_align
andcc %o1, 1, %o3 ! is dest byte aligned
! Destination is long word aligned
.ci_al_src:
andcc %o0, 7, %o3
brnz,pt %o3, .ci_src_dst_unal8
nop
/*
* Special case for handling when src and dest are both long word aligned
* and total data to move is less than FP_COPY bytes
* Also handles the finish-up for large block moves, so the remaining count may be less than 32 bytes
*/
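!
! The loops below bias the remaining count so the loop test is a simple
! "greater than zero" check.  Roughly (pseudo-code, not assembled):
!
!	count -= 31;		/* count > 0 now means >= 32 bytes left	*/
!	while (count > 0) { copy 32 bytes; count -= 32; }
!	count += 24;		/* count > 0 now means >= 8 bytes left	*/
!	while (count > 0) { copy 8 bytes; count -= 8; }
!	count += 7;		/* true residue, 0 - 7 bytes		*/
!	copy the final 0 - 7 bytes
!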
.ci_medlong:
subcc %o2, 31, %o2 ! adjust length to allow cc test
ble,pt %ncc, .ci_medl31
nop
.ci_medl32:
ldxa [%o0]%asi, %o4 ! move 32 bytes
subcc %o2, 32, %o2 ! decrement length count by 32
stx %o4, [%o1]
ldxa [%o0+8]%asi, %o4
stx %o4, [%o1+8]
ldxa [%o0+16]%asi, %o4
add %o0, 32, %o0 ! increase src ptr by 32
stx %o4, [%o1+16]
ldxa [%o0-8]%asi, %o4
add %o1, 32, %o1 ! increase dst ptr by 32
bgu,pt %ncc, .ci_medl32 ! repeat if at least 32 bytes left
stx %o4, [%o1-8]
.ci_medl31:
addcc %o2, 24, %o2 ! adjust count to be off by 7
ble,pt %ncc, .ci_medl7 ! skip if 7 or fewer bytes left
nop
.ci_medl8:
ldxa [%o0]%asi, %o4 ! move 8 bytes
add %o0, 8, %o0 ! increase src ptr by 8
subcc %o2, 8, %o2 ! decrease count by 8
add %o1, 8, %o1 ! increase dst ptr by 8
bgu,pt %ncc, .ci_medl8
stx %o4, [%o1-8]
.ci_medl7:
addcc %o2, 7, %o2 ! finish adjustment of remaining count
bnz,pt %ncc, .ci_small4 ! do final bytes if not finished
nop
.ci_smallx: ! finish up and exit
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
.ci_smallqx:
retl
mov %g0, %o0
.ci_small4:
cmp %o2, 4
blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left
nop !
lda [%o0]%asi, %o4 ! move 4 bytes
add %o0, 4, %o0 ! increase src ptr by 4
add %o1, 4, %o1 ! increase dst ptr by 4
subcc %o2, 4, %o2 ! decrease count by 4
bz %ncc, .ci_smallx
stw %o4, [%o1-4]
.ci_small3x: ! Exactly 1, 2, or 3 bytes remain
subcc %o2, 1, %o2 ! reduce count for cc test
lduba [%o0]%asi, %o4 ! load one byte
bz,pt %ncc, .ci_smallx
stb %o4, [%o1] ! store one byte
lduba [%o0+1]%asi, %o4 ! load second byte
subcc %o2, 1, %o2
bz,pt %ncc, .ci_smallx
stb %o4, [%o1+1] ! store second byte
lduba [%o0+2]%asi, %o4 ! load third byte
ba .ci_smallx
stb %o4, [%o1+2] ! store third byte
.ci_smallest: ! 7 or fewer bytes remain
cmp %o2, 4
blt,pt %ncc, .ci_small3x
nop
lduba [%o0]%asi, %o4 ! read byte
subcc %o2, 4, %o2 ! reduce count by 4
stb %o4, [%o1] ! write byte
lduba [%o0+1]%asi, %o4 ! repeat for total of 4 bytes
add %o0, 4, %o0 ! advance src by 4
stb %o4, [%o1+1]
lduba [%o0-2]%asi, %o4
add %o1, 4, %o1 ! advance dst by 4
stb %o4, [%o1-2]
lduba [%o0-1]%asi, %o4
bnz,pt %ncc, .ci_small3x
stb %o4, [%o1-1]
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
retl
mov %g0, %o0
.ci_align:
bnz,pt %ncc, .ci_al_d1
.ci_al_d1f: ! dest is now half word aligned
andcc %o1, 2, %o3 ! is dest word aligned
bnz,pt %ncc, .ci_al_d2
.ci_al_d2f: ! dest is now word aligned
andcc %o1, 4, %o3 ! is dest longword aligned?
bz,pt %ncc, .ci_al_src
nop
.ci_al_d4: ! dest is word aligned; src is unknown
lduba [%o0]%asi, %o4 ! move a word (src align unknown)
lduba [%o0+1]%asi, %o3
sll %o4, 24, %o4 ! position
sll %o3, 16, %o3 ! position
or %o4, %o3, %o3 ! merge
lduba [%o0+2]%asi, %o4
sll %o4, 8, %o4 ! position
or %o4, %o3, %o3 ! merge
lduba [%o0+3]%asi, %o4
or %o4, %o3, %o4 ! merge
stw %o4,[%o1] ! store four bytes
add %o0, 4, %o0 ! adjust src by 4
add %o1, 4, %o1 ! adjust dest by 4
sub %o2, 4, %o2 ! adjust count by 4
andcc %o0, 7, %o3 ! check for src long word alignment
brz,pt %o3, .ci_medlong
.ci_src_dst_unal8:
! dst is 8-byte aligned, src is not
! Size is less than FP_COPY
! Following code is to select for alignment
andcc %o0, 0x3, %o3 ! test word alignment
bz,pt %ncc, .ci_medword
nop
andcc %o0, 0x1, %o3 ! test halfword alignment
bnz,pt %ncc, .ci_med_byte ! go to byte move if not halfword
andcc %o0, 0x2, %o3 ! test which byte alignment
ba .ci_medhalf
nop
.ci_al_d1: ! align dest to half word
lduba [%o0]%asi, %o4 ! move a byte
add %o0, 1, %o0
stb %o4, [%o1]
add %o1, 1, %o1
andcc %o1, 2, %o3 ! is dest word aligned
bz,pt %ncc, .ci_al_d2f
sub %o2, 1, %o2
.ci_al_d2: ! align dest to word
lduba [%o0]%asi, %o4 ! move a half-word (src align unknown)
lduba [%o0+1]%asi, %o3
sll %o4, 8, %o4 ! position
or %o4, %o3, %o4 ! merge
sth %o4, [%o1]
add %o0, 2, %o0
add %o1, 2, %o1
andcc %o1, 4, %o3 ! is dest longword aligned?
bz,pt %ncc, .ci_al_src
sub %o2, 2, %o2
ba .ci_al_d4
nop
/*
* Handle all cases where src and dest are aligned on word
* boundaries. Use unrolled loops for better performance.
* This option wins over standard large data move when
* source and destination is in cache for medium
* to short data moves.
*/
.ci_medword:
subcc %o2, 31, %o2 ! adjust length to allow cc test
ble,pt %ncc, .ci_medw31
nop
.ci_medw32:
lda [%o0]%asi, %o4 ! move a block of 32 bytes
stw %o4, [%o1]
lda [%o0+4]%asi, %o4
stw %o4, [%o1+4]
lda [%o0+8]%asi, %o4
stw %o4, [%o1+8]
lda [%o0+12]%asi, %o4
stw %o4, [%o1+12]
lda [%o0+16]%asi, %o4
stw %o4, [%o1+16]
lda [%o0+20]%asi, %o4
subcc %o2, 32, %o2 ! decrement length count
stw %o4, [%o1+20]
lda [%o0+24]%asi, %o4
add %o0, 32, %o0 ! increase src ptr by 32
stw %o4, [%o1+24]
lda [%o0-4]%asi, %o4
add %o1, 32, %o1 ! increase dst ptr by 32
bgu,pt %ncc, .ci_medw32 ! repeat if at least 32 bytes left
stw %o4, [%o1-4]
.ci_medw31:
addcc %o2, 24, %o2 ! adjust count to be off by 7
ble,pt %ncc, .ci_medw7 ! skip if 7 or fewer bytes left
nop !
.ci_medw15:
lda [%o0]%asi, %o4 ! move a block of 8 bytes
subcc %o2, 8, %o2 ! decrement length count
stw %o4, [%o1]
add %o0, 8, %o0 ! increase src ptr by 8
lda [%o0-4]%asi, %o4
add %o1, 8, %o1 ! increase dst ptr by 8
bgu,pt %ncc, .ci_medw15
stw %o4, [%o1-4]
.ci_medw7:
addcc %o2, 7, %o2 ! finish adjustment of remaining count
bz,pt %ncc, .ci_smallx ! exit if finished
cmp %o2, 4
blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left
nop !
lda [%o0]%asi, %o4 ! move 4 bytes
add %o0, 4, %o0 ! increase src ptr by 4
add %o1, 4, %o1 ! increase dst ptr by 4
subcc %o2, 4, %o2 ! decrease count by 4
bnz .ci_small3x
stw %o4, [%o1-4]
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
retl
mov %g0, %o0
.ci_medhalf:
subcc %o2, 31, %o2 ! adjust length to allow cc test
ble,pt %ncc, .ci_medh31
nop
.ci_medh32: ! load and store block of 32 bytes
subcc %o2, 32, %o2 ! decrement length count
lduha [%o0]%asi, %o4 ! move 32 bytes
lduwa [%o0+2]%asi, %o3
sllx %o4, 48, %o4
sllx %o3, 16, %o3
or %o4, %o3, %o3
lduha [%o0+6]%asi, %o4
or %o4, %o3, %o4
stx %o4, [%o1]
lduha [%o0+8]%asi, %o4
lduwa [%o0+10]%asi, %o3
sllx %o4, 48, %o4
sllx %o3, 16, %o3
or %o4, %o3, %o3
lduha [%o0+14]%asi, %o4
or %o4, %o3, %o4
stx %o4, [%o1+8]
lduha [%o0+16]%asi, %o4
lduwa [%o0+18]%asi, %o3
sllx %o4, 48, %o4
sllx %o3, 16, %o3
or %o4, %o3, %o3
lduha [%o0+22]%asi, %o4
or %o4, %o3, %o4
stx %o4, [%o1+16]
add %o0, 32, %o0 ! increase src ptr by 32
add %o1, 32, %o1 ! increase dst ptr by 32
lduha [%o0-8]%asi, %o4
lduwa [%o0-6]%asi, %o3
sllx %o4, 48, %o4
sllx %o3, 16, %o3
or %o4, %o3, %o3
lduha [%o0-2]%asi, %o4
or %o3, %o4, %o4
bgu,pt %ncc, .ci_medh32 ! repeat if at least 32 bytes left
stx %o4, [%o1-8]
.ci_medh31:
addcc %o2, 24, %o2 ! adjust count to be off by 7
ble,pt %ncc, .ci_medh7 ! skip if 7 or fewer bytes left
nop !
.ci_medh15:
lduha [%o0]%asi, %o4 ! move 8 bytes
subcc %o2, 8, %o2 ! decrement length count
lduwa [%o0+2]%asi, %o3
sllx %o4, 48, %o4
sllx %o3, 16, %o3
or %o4, %o3, %o3
add %o1, 8, %o1 ! increase dst ptr by 8
lduha [%o0+6]%asi, %o4
add %o0, 8, %o0 ! increase src ptr by 8
or %o4, %o3, %o4
bgu,pt %ncc, .ci_medh15
stx %o4, [%o1-8]
.ci_medh7:
addcc %o2, 7, %o2 ! finish adjustment of remaining count
bz,pt %ncc, .ci_smallx ! exit if finished
cmp %o2, 4
blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left
nop !
lduha [%o0]%asi, %o4
sll %o4, 16, %o4
lduha [%o0+2]%asi, %o3
or %o3, %o4, %o4
subcc %o2, 4, %o2
add %o0, 4, %o0
add %o1, 4, %o1
bnz .ci_small3x
stw %o4, [%o1-4]
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
retl
mov %g0, %o0
.align 16
.ci_med_byte:
bnz,pt %ncc, .ci_medbh32a ! go to correct byte move
subcc %o2, 31, %o2 ! adjust length to allow cc test
ble,pt %ncc, .ci_medb31
nop
.ci_medb32: ! Alignment 1 or 5
subcc %o2, 32, %o2 ! decrement length count
lduba [%o0]%asi, %o4 ! load and store a block of 32 bytes
sllx %o4, 56, %o3
lduha [%o0+1]%asi, %o4
sllx %o4, 40, %o4
or %o4, %o3, %o3
lduwa [%o0+3]%asi, %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
lduba [%o0+7]%asi, %o4
or %o4, %o3, %o4
stx %o4, [%o1]
lduba [%o0+8]%asi, %o4
sllx %o4, 56, %o3
lduha [%o0+9]%asi, %o4
sllx %o4, 40, %o4
or %o4, %o3, %o3
lduwa [%o0+11]%asi, %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
lduba [%o0+15]%asi, %o4
or %o4, %o3, %o4
stx %o4, [%o1+8]
lduba [%o0+16]%asi, %o4
sllx %o4, 56, %o3
lduha [%o0+17]%asi, %o4
sllx %o4, 40, %o4
or %o4, %o3, %o3
lduwa [%o0+19]%asi, %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
lduba [%o0+23]%asi, %o4
or %o4, %o3, %o4
stx %o4, [%o1+16]
add %o0, 32, %o0 ! increase src ptr by 32
add %o1, 32, %o1 ! increase dst ptr by 32
lduba [%o0-8]%asi, %o4
sllx %o4, 56, %o3
lduha [%o0-7]%asi, %o4
sllx %o4, 40, %o4
or %o4, %o3, %o3
lduwa [%o0-5]%asi, %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
lduba [%o0-1]%asi, %o4
or %o4, %o3, %o4
bgu,pt %ncc, .ci_medb32 ! repeat if at least 32 bytes left
stx %o4, [%o1-8]
.ci_medb31: ! 31 or fewer bytes remaining
addcc %o2, 24, %o2 ! adjust count to be off by 7
ble,pt %ncc, .ci_medb7 ! skip if 7 or fewer bytes left
nop !
.ci_medb15:
lduba [%o0]%asi, %o4 ! load and store a block of 8 bytes
subcc %o2, 8, %o2 ! decrement length count
sllx %o4, 56, %o3
lduha [%o0+1]%asi, %o4
sllx %o4, 40, %o4
or %o4, %o3, %o3
lduwa [%o0+3]%asi, %o4
add %o1, 8, %o1 ! increase dst ptr by 8
sllx %o4, 8, %o4
or %o4, %o3, %o3
lduba [%o0+7]%asi, %o4
add %o0, 8, %o0 ! increase src ptr by 8
or %o4, %o3, %o4
bgu,pt %ncc, .ci_medb15
stx %o4, [%o1-8]
.ci_medb7:
addcc %o2, 7, %o2 ! finish adjustment of remaining count
bz,pt %ncc, .ci_smallx ! exit if finished
cmp %o2, 4
blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left
nop !
lduba [%o0]%asi, %o4 ! move 4 bytes
sll %o4, 24, %o3
lduha [%o0+1]%asi, %o4
sll %o4, 8, %o4
or %o4, %o3, %o3
lduba [%o0+3]%asi, %o4
or %o4, %o3, %o4
subcc %o2, 4, %o2
add %o0, 4, %o0
add %o1, 4, %o1
bnz .ci_small3x
stw %o4, [%o1-4]
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
retl
mov %g0, %o0
.align 16
.ci_medbh32a: ! Alignment 3 or 7
ble,pt %ncc, .ci_medbh31
nop
.ci_medbh32: ! Alignment 3 or 7
subcc %o2, 32, %o2 ! decrement length count
lduba [%o0]%asi, %o4 ! load and store a block of 32 bytes
sllx %o4, 56, %o3
lduwa [%o0+1]%asi, %o4
sllx %o4, 24, %o4
or %o4, %o3, %o3
lduha [%o0+5]%asi, %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
lduba [%o0+7]%asi, %o4
or %o4, %o3, %o4
stx %o4, [%o1]
lduba [%o0+8]%asi, %o4
sllx %o4, 56, %o3
lduwa [%o0+9]%asi, %o4
sllx %o4, 24, %o4
or %o4, %o3, %o3
lduha [%o0+13]%asi, %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
lduba [%o0+15]%asi, %o4
or %o4, %o3, %o4
stx %o4, [%o1+8]
lduba [%o0+16]%asi, %o4
sllx %o4, 56, %o3
lduwa [%o0+17]%asi, %o4
sllx %o4, 24, %o4
or %o4, %o3, %o3
lduha [%o0+21]%asi, %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
lduba [%o0+23]%asi, %o4
or %o4, %o3, %o4
stx %o4, [%o1+16]
add %o0, 32, %o0 ! increase src ptr by 32
add %o1, 32, %o1 ! increase dst ptr by 32
lduba [%o0-8]%asi, %o4
sllx %o4, 56, %o3
lduwa [%o0-7]%asi, %o4
sllx %o4, 24, %o4
or %o4, %o3, %o3
lduha [%o0-3]%asi, %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
lduba [%o0-1]%asi, %o4
or %o4, %o3, %o4
bgu,pt %ncc, .ci_medbh32 ! repeat if at least 32 bytes left
stx %o4, [%o1-8]
.ci_medbh31:
addcc %o2, 24, %o2 ! adjust count to be off by 7
ble,pt %ncc, .ci_medb7 ! skip if 7 or fewer bytes left
nop !
.ci_medbh15:
lduba [%o0]%asi, %o4 ! load and store a block of 8 bytes
sllx %o4, 56, %o3
lduwa [%o0+1]%asi, %o4
sllx %o4, 24, %o4
or %o4, %o3, %o3
lduha [%o0+5]%asi, %o4
sllx %o4, 8, %o4
or %o4, %o3, %o3
lduba [%o0+7]%asi, %o4
or %o4, %o3, %o4
stx %o4, [%o1]
subcc %o2, 8, %o2 ! decrement length count
add %o1, 8, %o1 ! increase dst ptr by 8
add %o0, 8, %o0 ! increase src ptr by 8
bgu,pt %ncc, .ci_medbh15
stx %o4, [%o1-8]
ba .ci_medb7
nop
/*
* End of small copy in code (no window)
*
*/
/*
* Long copy in code (using register window and fp regs)
*
*/
.ci_copy_more:
sethi %hi(copyio_fault), %o3
or %o3, %lo(copyio_fault), %o3
membar #Sync
stn %o3, [THREAD_REG + T_LOFAULT]
/*
* Following code is for large copies. We know there is at
* least FP_COPY bytes available. FP regs are used, so
* we save registers and fp regs before starting
*/
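!
! Illustrative pseudo-code for the FP setup below (not assembled):
!
!	save a register window (with room for an FP save area)
!	SAVED_LOFAULT |= FPUSED_FLAG;
!	if (fprs.fef)		/* FP registers already live	*/
!		spill the fp regs to the stack (BST_FP_TOSTACK)
!	else
!		fprs.fef = 1;	/* just enable the FPU		*/
!	save %gsr so it can be restored on exit
!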
save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
or SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
rd %fprs, %g1 ! check for unused fp
! if fprs.fef == 0, set it.
! Setting it when already set costs more than checking
andcc %g1, FPRS_FEF, %g1 ! test FEF, fprs.du = fprs.dl = 0
bz,pt %ncc, .ci_fp_unused
mov ASI_USER, %asi
BST_FP_TOSTACK(%o3)
ba .ci_fp_ready
.ci_fp_unused:
prefetcha [%i0 + (1 * CACHE_LINE)]%asi, #one_read
wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
.ci_fp_ready:
rd %gsr, %l5 ! save %gsr value
andcc %i1, 1, %o3 ! is dest byte aligned
bnz,pt %ncc, .ci_big_d1
.ci_big_d1f: ! dest is now half word aligned
andcc %i1, 2, %o3
bnz,pt %ncc, .ci_big_d2
.ci_big_d2f: ! dest is now word aligned
andcc %i1, 4, %o3
bnz,pt %ncc, .ci_big_d4
.ci_big_d4f: ! dest is long word aligned
andcc %i0, 7, %o3 ! is src long word aligned
brnz,pt %o3, .ci_big_unal8
prefetcha [%i0 + (2 * CACHE_LINE)]%asi, #one_read
! Src and dst are long word aligned
! align dst to 64 byte boundary
andcc %i1, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned
brz,pn %o3, .ci_al_to_64
nop
sub %o3, 64, %o3 ! %o3 has negative bytes to move
add %i2, %o3, %i2 ! adjust remaining count
andcc %o3, 8, %o4 ! odd long words to move?
brz,pt %o4, .ci_al_to_16
nop
add %o3, 8, %o3
ldxa [%i0]%asi, %o4
add %i0, 8, %i0 ! increment src ptr
add %i1, 8, %i1 ! increment dst ptr
stx %o4, [%i1-8]
! Dest is aligned on 16 bytes, src 8 byte aligned
.ci_al_to_16:
andcc %o3, 0x30, %o4 ! pair of long words to move?
brz,pt %o4, .ci_al_to_64
nop
.ci_al_mv_16:
add %o3, 16, %o3
ldxa [%i0]%asi, %o4
stx %o4, [%i1]
add %i0, 16, %i0 ! increment src ptr
ldxa [%i0-8]%asi, %o4
stx %o4, [%i1+8]
andcc %o3, 0x30, %o4
brnz,pt %o4, .ci_al_mv_16
add %i1, 16, %i1 ! increment dst ptr
! Dest is aligned on 64 bytes, src 8 byte aligned
.ci_al_to_64:
! Determine source alignment
! to correct 8 byte offset
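!
! The eight .ci_aln_* variants below differ only in how many leading
! doublewords must be preloaded so that every loop pass can issue one
! full 64 byte block load and one full 64 byte block store.  Roughly
! (illustrative pseudo-code, not assembled):
!
!	preload src doublewords up to the next 64 byte boundary
!	while (full blocks remain) {
!		block load 64 user bytes into %d16-%d30
!		shuffle into %d0-%d14 so a whole output block is ready
!		block-initializing store 64 bytes to the kernel dst
!	}
!	store the doublewords still held over in the %d0-%d14 range
!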
andcc %i0, 32, %o3
brnz,pn %o3, .ci_aln_1
andcc %i0, 16, %o3
brnz,pn %o3, .ci_aln_01
andcc %i0, 8, %o3
brz,pn %o3, .ci_aln_000
prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
ba .ci_aln_001
prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
.ci_aln_01:
brnz,pn %o3, .ci_aln_011
prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
ba .ci_aln_010
prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
.ci_aln_1:
andcc %i0, 16, %o3
brnz,pn %o3, .ci_aln_11
andcc %i0, 8, %o3
brnz,pn %o3, .ci_aln_101
prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
ba .ci_aln_100
prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
.ci_aln_11:
brz,pn %o3, .ci_aln_110
prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
.ci_aln_111:
! Alignment off by 8 bytes
prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
ldda [%i0]%asi, %d0
add %i0, 8, %i0
sub %i2, 8, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.ci_aln_111_loop:
ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d2
fmovd %d18, %d4
fmovd %d20, %d6
fmovd %d22, %d8
fmovd %d24, %d10
fmovd %d26, %d12
fmovd %d28, %d14
stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_P
add %i0, 64, %i0
fmovd %d30, %d0
bgt,pt %ncc, .ci_aln_111_loop
prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
add %i1, %i0, %i1
std %d0, [%i1]
ba .ci_remain_stuff
add %i1, 8, %i1
! END OF aln_111
.ci_aln_110:
! Alignment off by 16 bytes
prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
ldda [%i0]%asi, %d0
ldda [%i0+8]%asi, %d2
add %i0, 16, %i0
sub %i2, 16, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.ci_aln_110_loop:
ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d4
fmovd %d18, %d6
fmovd %d20, %d8
fmovd %d22, %d10
fmovd %d24, %d12
fmovd %d26, %d14
stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_P
add %i0, 64, %i0
fmovd %d28, %d0
fmovd %d30, %d2
bgt,pt %ncc, .ci_aln_110_loop
prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
add %i1, %i0, %i1
std %d0, [%i1]
std %d2, [%i1+8]
ba .ci_remain_stuff
add %i1, 16, %i1
! END OF aln_110
.ci_aln_101:
! Alignment off by 24 bytes
prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
ldda [%i0]%asi, %d0
ldda [%i0+8]%asi, %d2
ldda [%i0+16]%asi, %d4
add %i0, 24, %i0
sub %i2, 24, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.ci_aln_101_loop:
ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d6
fmovd %d18, %d8
fmovd %d20, %d10
fmovd %d22, %d12
fmovd %d24, %d14
stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_P
add %i0, 64, %i0
fmovd %d26, %d0
fmovd %d28, %d2
fmovd %d30, %d4
bgt,pt %ncc, .ci_aln_101_loop
prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
add %i1, %i0, %i1
std %d0, [%i1]
std %d2, [%i1+8]
std %d4, [%i1+16]
ba .ci_remain_stuff
add %i1, 24, %i1
! END OF aln_101
.ci_aln_100:
! Alignment off by 32 bytes
ldda [%i0]%asi, %d0
ldda [%i0+8]%asi, %d2
ldda [%i0+16]%asi,%d4
ldda [%i0+24]%asi,%d6
add %i0, 32, %i0
sub %i2, 32, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.ci_aln_100_loop:
ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d8
fmovd %d18, %d10
fmovd %d20, %d12
fmovd %d22, %d14
stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_P
add %i0, 64, %i0
fmovd %d24, %d0
fmovd %d26, %d2
fmovd %d28, %d4
fmovd %d30, %d6
bgt,pt %ncc, .ci_aln_100_loop
prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
add %i1, %i0, %i1
std %d0, [%i1]
std %d2, [%i1+8]
std %d4, [%i1+16]
std %d6, [%i1+24]
ba .ci_remain_stuff
add %i1, 32, %i1
! END OF aln_100
.ci_aln_011:
! Alignment off by 40 bytes
prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
ldda [%i0]%asi, %d0
ldda [%i0+8]%asi, %d2
ldda [%i0+16]%asi, %d4
ldda [%i0+24]%asi, %d6
ldda [%i0+32]%asi, %d8
add %i0, 40, %i0
sub %i2, 40, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.ci_aln_011_loop:
ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d10
fmovd %d18, %d12
fmovd %d20, %d14
stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_P
add %i0, 64, %i0
fmovd %d22, %d0
fmovd %d24, %d2
fmovd %d26, %d4
fmovd %d28, %d6
fmovd %d30, %d8
bgt,pt %ncc, .ci_aln_011_loop
prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
add %i1, %i0, %i1
std %d0, [%i1]
std %d2, [%i1+8]
std %d4, [%i1+16]
std %d6, [%i1+24]
std %d8, [%i1+32]
ba .ci_remain_stuff
add %i1, 40, %i1
! END OF aln_011
.ci_aln_010:
! Alignment off by 48 bytes
ldda [%i0]%asi, %d0
ldda [%i0+8]%asi, %d2
ldda [%i0+16]%asi, %d4
ldda [%i0+24]%asi, %d6
ldda [%i0+32]%asi, %d8
ldda [%i0+40]%asi, %d10
add %i0, 48, %i0
sub %i2, 48, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.ci_aln_010_loop:
ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d12
fmovd %d18, %d14
stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_P
add %i0, 64, %i0
fmovd %d20, %d0
fmovd %d22, %d2
fmovd %d24, %d4
fmovd %d26, %d6
fmovd %d28, %d8
fmovd %d30, %d10
bgt,pt %ncc, .ci_aln_010_loop
prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
add %i1, %i0, %i1
std %d0, [%i1]
std %d2, [%i1+8]
std %d4, [%i1+16]
std %d6, [%i1+24]
std %d8, [%i1+32]
std %d10, [%i1+40]
ba .ci_remain_stuff
add %i1, 48, %i1
! END OF aln_010
.ci_aln_001:
! Alignment off by 56 bytes
ldda [%i0]%asi, %d0
ldda [%i0+8]%asi, %d2
ldda [%i0+16]%asi, %d4
ldda [%i0+24]%asi, %d6
ldda [%i0+32]%asi, %d8
ldda [%i0+40]%asi, %d10
ldda [%i0+48]%asi, %d12
add %i0, 56, %i0
sub %i2, 56, %i2
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.ci_aln_001_loop:
ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
subcc %o3, 64, %o3
fmovd %d16, %d14
stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_P
add %i0, 64, %i0
fmovd %d18, %d0
fmovd %d20, %d2
fmovd %d22, %d4
fmovd %d24, %d6
fmovd %d26, %d8
fmovd %d28, %d10
fmovd %d30, %d12
bgt,pt %ncc, .ci_aln_001_loop
prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
add %i1, %i0, %i1
std %d0, [%i1]
std %d2, [%i1+8]
std %d4, [%i1+16]
std %d6, [%i1+24]
std %d8, [%i1+32]
std %d10, [%i1+40]
std %d12, [%i1+48]
ba .ci_remain_stuff
add %i1, 56, %i1
! END OF aln_001
.ci_aln_000:
prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
and %i2, 0x7f, %i2 ! residue bytes in %i2
sub %i1, %i0, %i1
.ci_aln_000_loop:
ldda [%i0]ASI_BLK_AIUS,%d0
subcc %o3, 64, %o3
stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
stda %d0,[%i0+%i1]ASI_BLK_P
add %i0, 64, %i0
bgt,pt %ncc, .ci_aln_000_loop
prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
add %i1, %i0, %i1
! END OF aln_000
.ci_remain_stuff:
subcc %i2, 31, %i2 ! adjust length to allow cc test
ble,pt %ncc, .ci_aln_31
nop
.ci_aln_32:
ldxa [%i0]%asi, %o4 ! move 32 bytes
subcc %i2, 32, %i2 ! decrement length count by 32
stx %o4, [%i1]
ldxa [%i0+8]%asi, %o4
stx %o4, [%i1+8]
ldxa [%i0+16]%asi, %o4
add %i0, 32, %i0 ! increase src ptr by 32
stx %o4, [%i1+16]
ldxa [%i0-8]%asi, %o4
add %i1, 32, %i1 ! increase dst ptr by 32
bgu,pt %ncc, .ci_aln_32 ! repeat if at least 32 bytes left
stx %o4, [%i1-8]
.ci_aln_31:
addcc %i2, 24, %i2 ! adjust count to be off by 7
ble,pt %ncc, .ci_aln_7 ! skip if 7 or fewer bytes left
nop !
.ci_aln_15:
ldxa [%i0]%asi, %o4 ! move 8 bytes
add %i0, 8, %i0 ! increase src ptr by 8
subcc %i2, 8, %i2 ! decrease count by 8
add %i1, 8, %i1 ! increase dst ptr by 8
bgu,pt %ncc, .ci_aln_15
stx %o4, [%i1-8] !
.ci_aln_7:
addcc %i2, 7, %i2 ! finish adjustment of remaining count
bz,pt %ncc, .ci_exit ! exit if finished
cmp %i2, 4
blt,pt %ncc, .ci_unaln3x ! skip if less than 4 bytes left
nop !
lda [%i0]%asi, %o4 ! move 4 bytes
add %i0, 4, %i0 ! increase src ptr by 4
add %i1, 4, %i1 ! increase dst ptr by 4
subcc %i2, 4, %i2 ! decrease count by 4
bnz .ci_unaln3x
stw %o4, [%i1-4]
ba .ci_exit
nop
! destination alignment code
.ci_big_d1:
lduba [%i0]%asi, %o4 ! move a byte
add %i0, 1, %i0
stb %o4, [%i1]
add %i1, 1, %i1
andcc %i1, 2, %o3
bz,pt %ncc, .ci_big_d2f
sub %i2, 1, %i2
.ci_big_d2: ! dest is now at least half word aligned
lduba [%i0]%asi, %o4 ! move a half-word (src align unknown)
lduba [%i0+1]%asi, %o3
add %i0, 2, %i0
sll %o4, 8, %o4 ! position
or %o4, %o3, %o4 ! merge
sth %o4, [%i1]
add %i1, 2, %i1
andcc %i1, 4, %o3
bz,pt %ncc, .ci_big_d4f
sub %i2, 2, %i2
.ci_big_d4: ! dest is at least word aligned
nop
lduba [%i0]%asi, %o4 ! move a word (src align unknown)
lduba [%i0+1]%asi, %o3
sll %o4, 24, %o4 ! position
sll %o3, 16, %o3 ! position
or %o4, %o3, %o3 ! merge
lduba [%i0+2]%asi, %o4
sll %o4, 8, %o4 ! position
or %o4, %o3, %o3 ! merge
lduba [%i0+3]%asi, %o4
or %o4, %o3, %o4 ! merge
stw %o4,[%i1] ! store four bytes
add %i0, 4, %i0 ! adjust src by 4
add %i1, 4, %i1 ! adjust dest by 4
ba .ci_big_d4f
sub %i2, 4, %i2 ! adjust count by 4
! Dst is on 8 byte boundary; src is not;
.ci_big_unal8:
andcc %i1, 0x3f, %o3 ! is dst 64-byte block aligned?
bz %ncc, .ci_unalnsrc
sub %o3, 64, %o3 ! %o3 will be multiple of 8
neg %o3 ! bytes until dest is 64 byte aligned
sub %i2, %o3, %i2 ! update cnt with bytes to be moved
! Move bytes according to source alignment
andcc %i0, 0x1, %o4
bnz %ncc, .ci_unalnbyte ! check for byte alignment
nop
andcc %i0, 2, %o4 ! check for half word alignment
bnz %ncc, .ci_unalnhalf
nop
! Src is word aligned, move bytes until dest 64 byte aligned
.ci_unalnword:
lda [%i0]%asi, %o4 ! load 4 bytes
stw %o4, [%i1] ! and store 4 bytes
lda [%i0+4]%asi, %o4 ! load 4 bytes
add %i0, 8, %i0 ! increase src ptr by 8
stw %o4, [%i1+4] ! and store 4 bytes
subcc %o3, 8, %o3 ! decrease count by 8
bnz %ncc, .ci_unalnword
add %i1, 8, %i1 ! increase dst ptr by 8
ba .ci_unalnsrc
nop
! Src is half-word aligned, move bytes until dest 64 byte aligned
.ci_unalnhalf:
lduha [%i0]%asi, %o4 ! load 2 bytes
sllx %o4, 32, %i3 ! shift left
lduwa [%i0+2]%asi, %o4
or %o4, %i3, %i3
sllx %i3, 16, %i3
lduha [%i0+6]%asi, %o4
or %o4, %i3, %i3
stx %i3, [%i1]
add %i0, 8, %i0
subcc %o3, 8, %o3
bnz %ncc, .ci_unalnhalf
add %i1, 8, %i1
ba .ci_unalnsrc
nop
! Src is Byte aligned, move bytes until dest 64 byte aligned
.ci_unalnbyte:
sub %i1, %i0, %i1 ! share pointer advance
.ci_unalnbyte_loop:
lduba [%i0]%asi, %o4
sllx %o4, 56, %i3
lduha [%i0+1]%asi, %o4
sllx %o4, 40, %o4
or %o4, %i3, %i3
lduha [%i0+3]%asi, %o4
sllx %o4, 24, %o4
or %o4, %i3, %i3
lduha [%i0+5]%asi, %o4
sllx %o4, 8, %o4
or %o4, %i3, %i3
lduba [%i0+7]%asi, %o4
or %o4, %i3, %i3
stx %i3, [%i1+%i0]
subcc %o3, 8, %o3
bnz %ncc, .ci_unalnbyte_loop
add %i0, 8, %i0
add %i1,%i0, %i1 ! restore pointer
! Destination is now block (64 byte aligned), src is not 8 byte aligned
.ci_unalnsrc:
andn %i2, 0x3f, %i3 ! %i3 is multiple of block size
and %i2, 0x3f, %i2 ! residue bytes in %i2
add %i2, 64, %i2 ! Ensure we don't load beyond
sub %i3, 64, %i3 ! end of source buffer
andn %i0, 0x3f, %o4 ! %o4 has block aligned src address
prefetcha [%o4 + (3 * CACHE_LINE)]%asi, #one_read
alignaddr %i0, %g0, %g0 ! generate %gsr
add %i0, %i3, %i0 ! advance %i0 to after blocks
!
! Determine source alignment to correct 8 byte offset
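!
! Illustrative pseudo-code for the faligndata block loop that follows
! (not assembled).  alignaddr has already loaded %gsr with the byte
! offset of the source within a doubleword:
!
!	preload the partial leading block into %d2-%d14 (as needed)
!	while (full blocks remain) {
!		block load the next 64 aligned user bytes into %d16-%d30
!		faligndata each adjacent pair into %d48-%d62
!		block store %d48-%d62 to the 64 byte aligned kernel dst
!		carry the trailing doublewords over to the next pass
!	}
!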
andcc %i0, 0x20, %o3
brnz,pn %o3, .ci_unaln_1
andcc %i0, 0x10, %o3
brnz,pn %o3, .ci_unaln_01
andcc %i0, 0x08, %o3
brz,a %o3, .ci_unaln_000
prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
ba .ci_unaln_001
nop
.ci_unaln_01:
brnz,a %o3, .ci_unaln_011
prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
ba .ci_unaln_010
nop
.ci_unaln_1:
brnz,pn %o3, .ci_unaln_11
andcc %i0, 0x08, %o3
brnz,a %o3, .ci_unaln_101
prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
ba .ci_unaln_100
nop
.ci_unaln_11:
brz,pn %o3, .ci_unaln_110
prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
.ci_unaln_111:
ldda [%o4+56]%asi, %d14
.ci_unaln_111_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_AIUS, %d16
faligndata %d14, %d16, %d48
faligndata %d16, %d18, %d50
faligndata %d18, %d20, %d52
faligndata %d20, %d22, %d54
faligndata %d22, %d24, %d56
faligndata %d24, %d26, %d58
faligndata %d26, %d28, %d60
faligndata %d28, %d30, %d62
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_P
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .ci_unaln_111_loop
prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
ba .ci_unaln_done
nop
.ci_unaln_110:
ldda [%o4+48]%asi, %d12
ldda [%o4+56]%asi, %d14
.ci_unaln_110_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_AIUS, %d16
faligndata %d12, %d14, %d48
faligndata %d14, %d16, %d50
faligndata %d16, %d18, %d52
faligndata %d18, %d20, %d54
faligndata %d20, %d22, %d56
faligndata %d22, %d24, %d58
faligndata %d24, %d26, %d60
faligndata %d26, %d28, %d62
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_P
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .ci_unaln_110_loop
prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
ba .ci_unaln_done
nop
.ci_unaln_101:
ldda [%o4+40]%asi, %d10
ldda [%o4+48]%asi, %d12
ldda [%o4+56]%asi, %d14
.ci_unaln_101_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_AIUS, %d16
faligndata %d10, %d12, %d48
faligndata %d12, %d14, %d50
faligndata %d14, %d16, %d52
faligndata %d16, %d18, %d54
faligndata %d18, %d20, %d56
faligndata %d20, %d22, %d58
faligndata %d22, %d24, %d60
faligndata %d24, %d26, %d62
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_P
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .ci_unaln_101_loop
prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
ba .ci_unaln_done
nop
.ci_unaln_100:
ldda [%o4+32]%asi, %d8
ldda [%o4+40]%asi, %d10
ldda [%o4+48]%asi, %d12
ldda [%o4+56]%asi, %d14
.ci_unaln_100_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_AIUS, %d16
faligndata %d8, %d10, %d48
faligndata %d10, %d12, %d50
faligndata %d12, %d14, %d52
faligndata %d14, %d16, %d54
faligndata %d16, %d18, %d56
faligndata %d18, %d20, %d58
faligndata %d20, %d22, %d60
faligndata %d22, %d24, %d62
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_P
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .ci_unaln_100_loop
prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
ba .ci_unaln_done
nop
.ci_unaln_011:
ldda [%o4+24]%asi, %d6
ldda [%o4+32]%asi, %d8
ldda [%o4+40]%asi, %d10
ldda [%o4+48]%asi, %d12
ldda [%o4+56]%asi, %d14
.ci_unaln_011_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_AIUS, %d16
faligndata %d6, %d8, %d48
faligndata %d8, %d10, %d50
faligndata %d10, %d12, %d52
faligndata %d12, %d14, %d54
faligndata %d14, %d16, %d56
faligndata %d16, %d18, %d58
faligndata %d18, %d20, %d60
faligndata %d20, %d22, %d62
fmovd %d22, %d6
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_P
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .ci_unaln_011_loop
prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
ba .ci_unaln_done
nop
.ci_unaln_010:
ldda [%o4+16]%asi, %d4
ldda [%o4+24]%asi, %d6
ldda [%o4+32]%asi, %d8
ldda [%o4+40]%asi, %d10
ldda [%o4+48]%asi, %d12
ldda [%o4+56]%asi, %d14
.ci_unaln_010_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_AIUS, %d16
faligndata %d4, %d6, %d48
faligndata %d6, %d8, %d50
faligndata %d8, %d10, %d52
faligndata %d10, %d12, %d54
faligndata %d12, %d14, %d56
faligndata %d14, %d16, %d58
faligndata %d16, %d18, %d60
faligndata %d18, %d20, %d62
fmovd %d20, %d4
fmovd %d22, %d6
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_P
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .ci_unaln_010_loop
prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
ba .ci_unaln_done
nop
.ci_unaln_001:
ldda [%o4+8]%asi, %d2
ldda [%o4+16]%asi, %d4
ldda [%o4+24]%asi, %d6
ldda [%o4+32]%asi, %d8
ldda [%o4+40]%asi, %d10
ldda [%o4+48]%asi, %d12
ldda [%o4+56]%asi, %d14
.ci_unaln_001_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_AIUS, %d16
faligndata %d2, %d4, %d48
faligndata %d4, %d6, %d50
faligndata %d6, %d8, %d52
faligndata %d8, %d10, %d54
faligndata %d10, %d12, %d56
faligndata %d12, %d14, %d58
faligndata %d14, %d16, %d60
faligndata %d16, %d18, %d62
fmovd %d18, %d2
fmovd %d20, %d4
fmovd %d22, %d6
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_P
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .ci_unaln_001_loop
prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
ba .ci_unaln_done
nop
.ci_unaln_000:
ldda [%o4]ASI_BLK_AIUS, %d0
.ci_unaln_000_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_AIUS, %d16
faligndata %d0, %d2, %d48
faligndata %d2, %d4, %d50
faligndata %d4, %d6, %d52
faligndata %d6, %d8, %d54
faligndata %d8, %d10, %d56
faligndata %d10, %d12, %d58
faligndata %d12, %d14, %d60
faligndata %d14, %d16, %d62
fmovd %d16, %d0
fmovd %d18, %d2
fmovd %d20, %d4
fmovd %d22, %d6
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%i1]ASI_BLK_P
subcc %i3, 64, %i3
add %i1, 64, %i1
bgu,pt %ncc, .ci_unaln_000_loop
prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
.ci_unaln_done:
! Handle trailing bytes, 64 to 127
! Dest long word aligned, Src not long word aligned
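!
! Rough pseudo-code for the trailing byte handling below (illustrative
! only, not assembled):
!
!	if (resid > 15) {
!		move (resid & ~7) - 8 bytes with an 8 byte faligndata
!		loop, holding 8 bytes back so we never load past the
!		end of the user buffer
!	}
!	if (resid >= 8)	assemble and store 8 bytes, a byte at a time
!	if (resid >= 4)	assemble and store 4 bytes
!	store the final 0 - 3 bytes
!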
cmp %i2, 15
bleu %ncc, .ci_unaln_short
andn %i2, 0x7, %i3 ! %i3 is multiple of 8
and %i2, 0x7, %i2 ! residue bytes in %i2
add %i2, 8, %i2
sub %i3, 8, %i3 ! ensure we don't load past end of src
andn %i0, 0x7, %o4 ! %o4 has long word aligned src address
add %i0, %i3, %i0 ! advance %i0 to after multiple of 8
ldda [%o4]%asi, %d0 ! fetch partial word
.ci_unaln_by8:
ldda [%o4+8]%asi, %d2
add %o4, 8, %o4
faligndata %d0, %d2, %d16
subcc %i3, 8, %i3
std %d16, [%i1]
fmovd %d2, %d0
bgu,pt %ncc, .ci_unaln_by8
add %i1, 8, %i1
.ci_unaln_short:
cmp %i2, 8
blt,pt %ncc, .ci_unalnfin
nop
lduba [%i0]%asi, %o4
sll %o4, 24, %o3
lduba [%i0+1]%asi, %o4
sll %o4, 16, %o4
or %o4, %o3, %o3
lduba [%i0+2]%asi, %o4
sll %o4, 8, %o4
or %o4, %o3, %o3
lduba [%i0+3]%asi, %o4
or %o4, %o3, %o3
stw %o3, [%i1]
lduba [%i0+4]%asi, %o4
sll %o4, 24, %o3
lduba [%i0+5]%asi, %o4
sll %o4, 16, %o4
or %o4, %o3, %o3
lduba [%i0+6]%asi, %o4
sll %o4, 8, %o4
or %o4, %o3, %o3
lduba [%i0+7]%asi, %o4
or %o4, %o3, %o3
stw %o3, [%i1+4]
add %i0, 8, %i0
add %i1, 8, %i1
sub %i2, 8, %i2
.ci_unalnfin:
cmp %i2, 4
blt,pt %ncc, .ci_unalnz
tst %i2
lduba [%i0]%asi, %o3 ! read byte
subcc %i2, 4, %i2 ! reduce count by 4
sll %o3, 24, %o3 ! position
lduba [%i0+1]%asi, %o4
sll %o4, 16, %o4 ! position
or %o4, %o3, %o3 ! merge
lduba [%i0+2]%asi, %o4
sll %o4, 8, %o4 ! position
or %o4, %o3, %o3 ! merge
add %i1, 4, %i1 ! advance dst by 4
lduba [%i0+3]%asi, %o4
add %i0, 4, %i0 ! advance src by 4
or %o4, %o3, %o4 ! merge
bnz,pt %ncc, .ci_unaln3x
stw %o4, [%i1-4]
ba .ci_exit
nop
.ci_unalnz:
bz,pt %ncc, .ci_exit
wr %l5, %g0, %gsr ! restore %gsr
.ci_unaln3x: ! Exactly 1, 2, or 3 bytes remain
subcc %i2, 1, %i2 ! reduce count for cc test
lduba [%i0]%asi, %o4 ! load one byte
bz,pt %ncc, .ci_exit
stb %o4, [%i1] ! store one byte
lduba [%i0+1]%asi, %o4 ! load second byte
subcc %i2, 1, %i2
bz,pt %ncc, .ci_exit
stb %o4, [%i1+1] ! store second byte
lduba [%i0+2]%asi, %o4 ! load third byte
stb %o4, [%i1+2] ! store third byte
.ci_exit:
brnz %g1, .ci_fp_restore
nop
FZERO
wr %g1, %g0, %fprs
ba,pt %ncc, .ci_ex2
membar #Sync
.ci_fp_restore:
BLD_FP_FROMSTACK(%o4)
.ci_ex2:
andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
ret
restore %g0, 0, %o0
.copyin_err:
ldn [THREAD_REG + T_COPYOPS], %o4
brz %o4, 2f
nop
ldn [%o4 + CP_COPYIN], %g2
jmp %g2
nop
2:
retl
mov -1, %o0
#else /* NIAGARA_IMPL */
.do_copyin:
!
! Check the length and bail if zero.
!
tst %o2
bnz,pt %ncc, 1f
nop
retl
clr %o0
1:
sethi %hi(copyio_fault), %o4
or %o4, %lo(copyio_fault), %o4
sethi %hi(copyio_fault_nowindow), %o3
ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
or %o3, %lo(copyio_fault_nowindow), %o3
membar #Sync
stn %o3, [THREAD_REG + T_LOFAULT]
mov %o0, SAVE_SRC
mov %o1, SAVE_DST
mov %o2, SAVE_COUNT
!
! Check to see if we're more than SMALL_LIMIT.
!
subcc %o2, SMALL_LIMIT, %o3
bgu,a,pt %ncc, .dci_ns
or %o0, %o1, %o3
!
! What was previously ".small_copyin"
!
.dcibcp:
sub %g0, %o2, %o3 ! setup for copy loop
add %o0, %o2, %o0
add %o1, %o2, %o1
ba,pt %ncc, .dcicl
lduba [%o0 + %o3]ASI_USER, %o4
!
! %o0 and %o1 point at the end and remain pointing at the end
! of their buffers. We pull things out by adding %o3 (which is
! the negation of the length) to the buffer end which gives us
! the current location in the buffers. By incrementing %o3 we walk
! through both buffers without having to bump each buffer's
! pointer. A very fast 4 instruction loop.
!
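!
! Equivalent C-style sketch of the loop below (illustrative only,
! not compiled):
!
!	src += len; dst += len; i = -len;
!	do {
!		dst[i] = fetch_user_byte(src + i);
!	} while (++i < 0);
!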
.align 16
.dcicl:
stb %o4, [%o1 + %o3]
inccc %o3
bl,a,pt %ncc, .dcicl
lduba [%o0 + %o3]ASI_USER, %o4
!
! We're done. Go home.
!
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
retl
clr %o0
!
! Try aligned copies from here.
!
.dci_ns:
!
! See if we're single byte aligned. If we are, check the
! limit for single byte copies. If we're smaller, or equal,
! bounce to the byte for byte copy loop. Otherwise do it in
! HW (if enabled).
!
btst 1, %o3
bz,a,pt %icc, .dcih8
btst 7, %o3
!
! We're single byte aligned.
!
sethi %hi(hw_copy_limit_1), %o3
ld [%o3 + %lo(hw_copy_limit_1)], %o3
!
! Is HW copy on? If not, do everything byte for byte.
!
tst %o3
bz,pn %icc, .dcibcp
subcc %o3, %o2, %o3
!
! Are we bigger than the HW limit? If not
! go to byte for byte.
!
bge,pt %ncc, .dcibcp
nop
!
! We're big enough and copy is on. Do it with HW.
!
ba,pt %ncc, .big_copyin
nop
.dcih8:
!
! 8 byte aligned?
!
bnz,a %ncc, .dcih4
btst 3, %o3
!
! We're eight byte aligned.
!
sethi %hi(hw_copy_limit_8), %o3
ld [%o3 + %lo(hw_copy_limit_8)], %o3
!
! Is HW assist on? If not, do it with the aligned copy.
!
tst %o3
bz,pn %icc, .dcis8
subcc %o3, %o2, %o3
bge %ncc, .dcis8
nop
ba,pt %ncc, .big_copyin
nop
.dcis8:
!
! Housekeeping for copy loops. Uses same idea as in the byte for
! byte copy loop above.
!
add %o0, %o2, %o0
add %o1, %o2, %o1
sub %g0, %o2, %o3
ba,pt %ncc, .didebc
srl %o2, 3, %o2 ! Number of 8 byte chunks to copy
!
! 4 byte aligned?
!
.dcih4:
bnz %ncc, .dcih2
sethi %hi(hw_copy_limit_4), %o3
ld [%o3 + %lo(hw_copy_limit_4)], %o3
!
! Is HW assist on? If not, do it with the aligned copy.
!
tst %o3
bz,pn %icc, .dcis4
subcc %o3, %o2, %o3
!
! We're negative if our size is larger than hw_copy_limit_4.
!
bge %ncc, .dcis4
nop
ba,pt %ncc, .big_copyin
nop
.dcis4:
!
! Housekeeping for copy loops. Uses same idea as in the byte
! for byte copy loop above.
!
add %o0, %o2, %o0
add %o1, %o2, %o1
sub %g0, %o2, %o3
ba,pt %ncc, .didfbc
srl %o2, 2, %o2 ! Number of 4 byte chunks to copy
.dcih2:
!
! We're two byte aligned. Check for "smallness"
! done in delay at .dcih4
!
bleu,pt %ncc, .dcis2
sethi %hi(hw_copy_limit_2), %o3
ld [%o3 + %lo(hw_copy_limit_2)], %o3
!
! Is HW assist on? If not, do it with the aligned copy.
!
tst %o3
bz,pn %icc, .dcis2
subcc %o3, %o2, %o3
!
! Are we larger than the HW limit?
!
bge %ncc, .dcis2
nop
!
! HW assist is on and we're large enough to use it.
!
ba,pt %ncc, .big_copyin
nop
!
! Housekeeping for copy loops. Uses same idea as in the byte
! for byte copy loop above.
!
.dcis2:
add %o0, %o2, %o0
add %o1, %o2, %o1
sub %g0, %o2, %o3
ba,pt %ncc, .didtbc
srl %o2, 1, %o2 ! Number of 2 byte chunks to copy
!
.small_copyin:
!
! Why are we doing this AGAIN? There are certain conditions in
! big copyin that will cause us to forgo the HW assisted copies
! and bounce back to a non-hw assisted copy. This dispatches
! those copies. Note that we branch around this in the main line
! code.
!
! We make no check for limits or HW enablement here. We've
! already been told that we're a poster child so just go off
! and do it.
!
or %o0, %o1, %o3
btst 1, %o3
bnz %icc, .dcibcp ! Most likely
btst 7, %o3
bz %icc, .dcis8
btst 3, %o3
bz %icc, .dcis4
nop
ba,pt %ncc, .dcis2
nop
!
! Eight byte aligned copies. A steal from the original .small_copyin
! with modifications. %o2 is number of 8 byte chunks to copy. When
! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
! to copy.
!
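!
! Pseudo-code sketch of the chunk loop below (illustrative only,
! not assembled):
!
!	i = -len; chunks = len >> 3;
!	do {
!		store 8 bytes at (dst + i) fetched from user (src + i);
!		i += 8;
!	} while (--chunks > 0);
!	if (i < 0)			/* 1 - 7 bytes left over */
!		finish with the byte loop (.dcicl)
!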
.align 32
.didebc:
ldxa [%o0 + %o3]ASI_USER, %o4
deccc %o2
stx %o4, [%o1 + %o3]
bg,pt %ncc, .didebc
addcc %o3, 8, %o3
!
! End of copy loop. Most 8 byte aligned copies end here.
!
bz,pt %ncc, .dcifh
nop
!
! Something is left. Do it byte for byte.
!
ba,pt %ncc, .dcicl
lduba [%o0 + %o3]ASI_USER, %o4
!
! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
!
.align 32
.didfbc:
lduwa [%o0 + %o3]ASI_USER, %o4
deccc %o2
st %o4, [%o1 + %o3]
bg,pt %ncc, .didfbc
addcc %o3, 4, %o3
!
! End of copy loop. Most 4 byte aligned copies end here.
!
bz,pt %ncc, .dcifh
nop
!
! Something is left. Do it byte for byte.
!
ba,pt %ncc, .dcicl
lduba [%o0 + %o3]ASI_USER, %o4
!
! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
! copy.
!
.align 32
.didtbc:
lduha [%o0 + %o3]ASI_USER, %o4
deccc %o2
sth %o4, [%o1 + %o3]
bg,pt %ncc, .didtbc
addcc %o3, 2, %o3
!
! End of copy loop. Most 2 byte aligned copies end here.
!
bz,pt %ncc, .dcifh
nop
!
! Deal with the last byte
!
lduba [%o0 + %o3]ASI_USER, %o4
stb %o4, [%o1 + %o3]
.dcifh:
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
clr %o0
.big_copyin:
! We're going off to do a block copy.
! Switch fault handlers and grab a window. We
! don't do a membar #Sync since we've done only
! kernel data to this point.
stn %o4, [THREAD_REG + T_LOFAULT]
! Copyins that reach here are larger than 256 bytes. The
! hw_copy_limit_1 is set to 256. Never set this limit to less
! than 128 bytes.
save %sp, -SA(MINFRAME), %sp
.do_blockcopyin:
! Swap src/dst since the code below is memcpy code
! and memcpy/bcopy have different calling sequences
mov %i1, %i5
mov %i0, %i1
mov %i5, %i0
! Block (64 bytes) align the destination.
andcc %i0, 0x3f, %i3 ! is dst block aligned
bz %ncc, copyin_blalign ! dst already block aligned
sub %i3, 0x40, %i3
neg %i3 ! bytes till dst 64 bytes aligned
sub %i2, %i3, %i2 ! update i2 with new count
! Based on source and destination alignment do
! either 8 bytes, 4 bytes, 2 bytes or byte copy.
! Is dst & src 8B aligned
or %i0, %i1, %o2
andcc %o2, 0x7, %g0
bz %ncc, .ci_alewdcp
nop
! Is dst & src 4B aligned
andcc %o2, 0x3, %g0
bz %ncc, .ci_alwdcp
nop
! Is dst & src 2B aligned
andcc %o2, 0x1, %g0
bz %ncc, .ci_alhlfwdcp
nop
! 1B aligned
1: lduba [%i1]ASI_USER, %o2
stb %o2, [%i0]
inc %i1
deccc %i3
bgu,pt %ncc, 1b
inc %i0
ba copyin_blalign
nop
! dst & src 4B aligned
.ci_alwdcp:
lda [%i1]ASI_USER, %o2
st %o2, [%i0]
add %i1, 0x4, %i1
subcc %i3, 0x4, %i3
bgu,pt %ncc, .ci_alwdcp
add %i0, 0x4, %i0
ba copyin_blalign
nop
! dst & src 2B aligned
.ci_alhlfwdcp:
lduha [%i1]ASI_USER, %o2
stuh %o2, [%i0]
add %i1, 0x2, %i1
subcc %i3, 0x2, %i3
bgu,pt %ncc, .ci_alhlfwdcp
add %i0, 0x2, %i0
ba copyin_blalign
nop
! dst & src 8B aligned
.ci_alewdcp:
ldxa [%i1]ASI_USER, %o2
stx %o2, [%i0]
add %i1, 0x8, %i1
subcc %i3, 0x8, %i3
bgu,pt %ncc, .ci_alewdcp
add %i0, 0x8, %i0
copyin_blalign:
andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
sub %i2, %i3, %i2 ! Residue bytes in %i2
mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
andcc %i1, 0xf, %o2 ! is src quadword aligned
bz,pn %xcc, .ci_blkcpy ! src offset in %o2 (last 4-bits)
nop
cmp %o2, 0x8
bg .ci_upper_double
nop
bl .ci_lower_double
nop
! Falls through when source offset is equal to 8 i.e.
! source is double word aligned.
! In this case no shift/merge of data is required
sub %i1, %o2, %i1 ! align the src at 16 bytes.
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetcha [%l0]ASI_USER, #one_read
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
add %l0, 0x40, %l0
.ci_loop0:
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
prefetcha [%l0]ASI_USER, #one_read
stxa %l3, [%i0+0x0]%asi
stxa %l4, [%i0+0x8]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
stxa %l5, [%i0+0x10]%asi
stxa %l2, [%i0+0x18]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
stxa %l3, [%i0+0x20]%asi
stxa %l4, [%i0+0x28]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
stxa %l5, [%i0+0x30]%asi
stxa %l2, [%i0+0x38]%asi
add %l0, 0x40, %l0
subcc %i3, 0x40, %i3
bgu,pt %xcc, .ci_loop0
add %i0, 0x40, %i0
ba .ci_blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
.ci_lower_double:
sub %i1, %o2, %i1 ! align the src at 16 bytes.
sll %o2, 3, %o0 ! %o0 left shift
mov 0x40, %o1
sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetcha [%l0]ASI_USER, #one_read
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 ! partial data in %l2
! and %l3 has complete
! data
add %l0, 0x40, %l0
.ci_loop1:
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 ! %l4 has partial data
! for this read.
ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4
! into %l2 and %l3
prefetcha [%l0]ASI_USER, #one_read
stxa %l2, [%i0+0x0]%asi
stxa %l3, [%i0+0x8]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and
! %l4 from previous read
! into %l4 and %l5
stxa %l4, [%i0+0x10]%asi
stxa %l5, [%i0+0x18]%asi
! Repeat the same for next 32 bytes.
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
stxa %l2, [%i0+0x20]%asi
stxa %l3, [%i0+0x28]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
stxa %l4, [%i0+0x30]%asi
stxa %l5, [%i0+0x38]%asi
add %l0, 0x40, %l0
subcc %i3, 0x40, %i3
bgu,pt %xcc, .ci_loop1
add %i0, 0x40, %i0
ba .ci_blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
.ci_upper_double:
sub %i1, %o2, %i1 ! align the src at 16 bytes.
sub %o2, 0x8, %o0
sll %o0, 3, %o0 ! %o0 left shift
mov 0x40, %o1
sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetcha [%l0]ASI_USER, #one_read
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 ! partial data in %l3
! for this read and
! no data in %l2
add %l0, 0x40, %l0
.ci_loop2:
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 ! %l4 has complete data
! and %l5 has partial
ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5
! into %l3 and %l4
prefetcha [%l0]ASI_USER, #one_read
stxa %l3, [%i0+0x0]%asi
stxa %l4, [%i0+0x8]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with
! %l5 from previous read
! into %l5 and %l2
stxa %l5, [%i0+0x10]%asi
stxa %l2, [%i0+0x18]%asi
! Repeat the same for next 32 bytes.
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
stxa %l3, [%i0+0x20]%asi
stxa %l4, [%i0+0x28]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
stxa %l5, [%i0+0x30]%asi
stxa %l2, [%i0+0x38]%asi
add %l0, 0x40, %l0
subcc %i3, 0x40, %i3
bgu,pt %xcc, .ci_loop2
add %i0, 0x40, %i0
ba .ci_blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
.ci_blkcpy:
andn %i1, 0x3f, %o0 ! %o0 has block aligned source
prefetcha [%o0]ASI_USER, #one_read
add %o0, 0x40, %o0
1:
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
add %i1, 0x10, %i1
prefetcha [%o0]ASI_USER, #one_read
stxa %l0, [%i0+0x0]%asi
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6
add %i1, 0x10, %i1
stxa %l1, [%i0+0x8]%asi
stxa %l2, [%i0+0x10]%asi
stxa %l3, [%i0+0x18]%asi
stxa %l4, [%i0+0x20]%asi
stxa %l5, [%i0+0x28]%asi
stxa %l6, [%i0+0x30]%asi
stxa %l7, [%i0+0x38]%asi
add %o0, 0x40, %o0
subcc %i3, 0x40, %i3
bgu,pt %xcc, 1b
add %i0, 0x40, %i0
.ci_blkdone:
membar #Sync
brz,pt %i2, .copyin_exit
nop
! Handle trailing bytes
cmp %i2, 0x8
blu,pt %ncc, .ci_residue
nop
! Can we do some 8B ops
or %i1, %i0, %o2
andcc %o2, 0x7, %g0
bnz %ncc, .ci_last4
nop
! Do 8byte ops as long as possible
.ci_last8:
ldxa [%i1]ASI_USER, %o2
stx %o2, [%i0]
add %i1, 0x8, %i1
sub %i2, 0x8, %i2
cmp %i2, 0x8
bgu,pt %ncc, .ci_last8
add %i0, 0x8, %i0
brz,pt %i2, .copyin_exit
nop
ba .ci_residue
nop
.ci_last4:
! Can we do 4B ops
andcc %o2, 0x3, %g0
bnz %ncc, .ci_last2
nop
1:
lda [%i1]ASI_USER, %o2
st %o2, [%i0]
add %i1, 0x4, %i1
sub %i2, 0x4, %i2
cmp %i2, 0x4
bgu,pt %ncc, 1b
add %i0, 0x4, %i0
brz,pt %i2, .copyin_exit
nop
ba .ci_residue
nop
.ci_last2:
! Can we do 2B ops
andcc %o2, 0x1, %g0
bnz %ncc, .ci_residue
nop
1:
lduha [%i1]ASI_USER, %o2
stuh %o2, [%i0]
add %i1, 0x2, %i1
sub %i2, 0x2, %i2
cmp %i2, 0x2
bgu,pt %ncc, 1b
add %i0, 0x2, %i0
brz,pt %i2, .copyin_exit
nop
! Copy the residue as byte copy
.ci_residue:
lduba [%i1]ASI_USER, %i4
stb %i4, [%i0]
inc %i1
deccc %i2
bgu,pt %xcc, .ci_residue
inc %i0
.copyin_exit:
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
ret
restore %g0, 0, %o0
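	! A sketch of the fault path below; the member name is assumed from
	! the CP_COPYIN offset used here:
	!	if (curthread->t_copyops != NULL)
	!		return (curthread->t_copyops->cp_copyin(uaddr, kaddr, count));
	!	return (-1);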
.copyin_err:
ldn [THREAD_REG + T_COPYOPS], %o4
brz %o4, 2f
nop
ldn [%o4 + CP_COPYIN], %g2
jmp %g2
nop
2:
retl
mov -1, %o0
#endif /* NIAGARA_IMPL */
SET_SIZE(copyin)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyin(const void *uaddr, void *kaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(xcopyin)
sethi %hi(.xcopyin_err), REAL_LOFAULT
b .do_copyin
or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
.xcopyin_err:
ldn [THREAD_REG + T_COPYOPS], %o4
brz %o4, 2f
nop
ldn [%o4 + CP_XCOPYIN], %g2
jmp %g2
nop
2:
retl
mov %g1, %o0
SET_SIZE(xcopyin)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyin_little(const void *uaddr, void *kaddr, size_t count)
{ return (0); }
#else /* lint */
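/*
 * A hedged sketch of the loop below, inferred from its negative-index
 * addressing: the source is read a byte at a time through the little-endian
 * user ASI while the source pointer walks backwards and the destination
 * pointer walks forwards, i.e. roughly
 *
 *	for (i = 0; i < count; i++)
 *		kaddr[i] = uaddr[count - 1 - i];
 *
 * returning 0 on success or the errno passed in %g1 on fault.
 */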
ENTRY(xcopyin_little)
sethi %hi(.little_err), %o4
ldn [THREAD_REG + T_LOFAULT], %o5
or %o4, %lo(.little_err), %o4
membar #Sync ! sync error barrier
stn %o4, [THREAD_REG + T_LOFAULT]
subcc %g0, %o2, %o3
add %o0, %o2, %o0
bz,pn %ncc, 2f ! check for zero bytes
sub %o2, 1, %o4
add %o0, %o4, %o0 ! start w/last byte
add %o1, %o2, %o1
lduba [%o0+%o3]ASI_AIUSL, %o4
1: stb %o4, [%o1+%o3]
inccc %o3
sub %o0, 2, %o0 ! get next byte
bcc,a,pt %ncc, 1b
lduba [%o0+%o3]ASI_AIUSL, %o4
2: membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g0, %o0 ! return (0)
.little_err:
membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g1, %o0
SET_SIZE(xcopyin_little)
#endif /* lint */
/*
* Copy a block of storage - must not overlap (from + len <= to).
* No fault handler installed (to be called under on_fault())
*/
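/*
 * A hedged usage sketch (not taken from this file): because no fault
 * handler is installed here, callers are expected to bracket the call
 * with on_fault()/no_fault() and handle the fault themselves, e.g.
 *
 *	label_t ljb;
 *
 *	if (on_fault(&ljb)) {
 *		no_fault();
 *		return (EFAULT);	! faulted during the copy
 *	}
 *	copyin_noerr(ufrom, kto, count);
 *	no_fault();
 *
 * The same applies to copyout_noerr() below.
 */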
#if defined(lint)
/* ARGSUSED */
void
copyin_noerr(const void *ufrom, void *kto, size_t count)
{}
#else /* lint */
ENTRY(copyin_noerr)
sethi %hi(.copyio_noerr), REAL_LOFAULT
b .do_copyin
or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
.copyio_noerr:
jmp SAVED_LOFAULT
nop
SET_SIZE(copyin_noerr)
#endif /* lint */
/*
* Copy a block of storage - must not overlap (from + len <= to).
* No fault handler installed (to be called under on_fault())
*/
#if defined(lint)
/* ARGSUSED */
void
copyout_noerr(const void *kfrom, void *uto, size_t count)
{}
#else /* lint */
ENTRY(copyout_noerr)
sethi %hi(.copyio_noerr), REAL_LOFAULT
b .do_copyout
or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
SET_SIZE(copyout_noerr)
#endif /* lint */
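/*
 * Tunables for the hardware copy/clear paths (also declared below for lint).
 * use_hw_bcopy and use_hw_bzero gate the block-copy and block-clear routines;
 * the hw_copy_limit_N values are, by convention, the size thresholds in bytes
 * above which the hardware copy path is preferred for data with N-byte
 * relative alignment.
 */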
#if defined(lint)
int use_hw_bcopy = 1;
int use_hw_bzero = 1;
uint_t hw_copy_limit_1 = 0x100;
uint_t hw_copy_limit_2 = 0x200;
uint_t hw_copy_limit_4 = 0x400;
uint_t hw_copy_limit_8 = 0x400;
#else /* !lint */
.align 4
DGDEF(use_hw_bcopy)
.word 1
DGDEF(use_hw_bzero)
.word 1
DGDEF(hw_copy_limit_1)
.word 0x100
DGDEF(hw_copy_limit_2)
.word 0x200
DGDEF(hw_copy_limit_4)
.word 0x400
DGDEF(hw_copy_limit_8)
.word 0x400
.align 64
.section ".text"
#endif /* !lint */
/*
 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
 * at least 256 bytes long using Niagara's block stores/quad store.
* If the criteria for using this routine are not met then it calls bzero
* and returns 1. Otherwise 0 is returned indicating success.
* Caller is responsible for ensuring use_hw_bzero is true and that
* kpreempt_disable() has been called.
*/
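/*
 * A sketch of the dispatch below, restating the checks in the code:
 *
 *	if (((uintptr_t)addr & 0x3f) != 0 ||	! not block aligned
 *	    len < 0x100 ||			! fewer than 256 bytes
 *	    (len & 0x3f) != 0) {		! not a multiple of 64
 *		bzero(addr, len);
 *		return (1);
 *	}
 *	... clear with block-initializing stores ...
 *	return (0);
 */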
#ifdef lint
/*ARGSUSED*/
int
hwblkclr(void *addr, size_t len)
{
return(0);
}
#else /* lint */
! %i0 - start address
! %i1 - length of region (multiple of 64)
ENTRY(hwblkclr)
save %sp, -SA(MINFRAME), %sp
! Must be block-aligned
andcc %i0, 0x3f, %g0
bnz,pn %ncc, 1f
nop
! ... and must be 256 bytes or more
cmp %i1, 0x100
blu,pn %ncc, 1f
nop
! ... and length must be a multiple of 64
andcc %i1, 0x3f, %g0
bz,pn %ncc, .pz_doblock
mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
1: ! punt, call bzero but notify the caller that bzero was used
mov %i0, %o0
call bzero
mov %i1, %o1
ret
restore %g0, 1, %o0 ! return (1) - did not use block operations
! Already verified that there are at least 256 bytes to set
.pz_doblock:
stxa %g0, [%i0+0x0]%asi
stxa %g0, [%i0+0x40]%asi
stxa %g0, [%i0+0x80]%asi
stxa %g0, [%i0+0xc0]%asi
stxa %g0, [%i0+0x8]%asi
stxa %g0, [%i0+0x10]%asi
stxa %g0, [%i0+0x18]%asi
stxa %g0, [%i0+0x20]%asi
stxa %g0, [%i0+0x28]%asi
stxa %g0, [%i0+0x30]%asi
stxa %g0, [%i0+0x38]%asi
stxa %g0, [%i0+0x48]%asi
stxa %g0, [%i0+0x50]%asi
stxa %g0, [%i0+0x58]%asi
stxa %g0, [%i0+0x60]%asi
stxa %g0, [%i0+0x68]%asi
stxa %g0, [%i0+0x70]%asi
stxa %g0, [%i0+0x78]%asi
stxa %g0, [%i0+0x88]%asi
stxa %g0, [%i0+0x90]%asi
stxa %g0, [%i0+0x98]%asi
stxa %g0, [%i0+0xa0]%asi
stxa %g0, [%i0+0xa8]%asi
stxa %g0, [%i0+0xb0]%asi
stxa %g0, [%i0+0xb8]%asi
stxa %g0, [%i0+0xc8]%asi
stxa %g0, [%i0+0xd0]%asi
stxa %g0, [%i0+0xd8]%asi
stxa %g0, [%i0+0xe0]%asi
stxa %g0, [%i0+0xe8]%asi
stxa %g0, [%i0+0xf0]%asi
stxa %g0, [%i0+0xf8]%asi
sub %i1, 0x100, %i1
cmp %i1, 0x100
bgu,pt %ncc, .pz_doblock
add %i0, 0x100, %i0
2:
	! Check if at least 64 bytes remain to set
	cmp	%i1, 0x40
blu %ncc, .pz_finish
nop
3:
stxa %g0, [%i0+0x0]%asi
stxa %g0, [%i0+0x8]%asi
stxa %g0, [%i0+0x10]%asi
stxa %g0, [%i0+0x18]%asi
stxa %g0, [%i0+0x20]%asi
stxa %g0, [%i0+0x28]%asi
stxa %g0, [%i0+0x30]%asi
stxa %g0, [%i0+0x38]%asi
subcc %i1, 0x40, %i1
bgu,pt %ncc, 3b
add %i0, 0x40, %i0
.pz_finish:
membar #Sync
ret
	restore	%g0, 0, %o0		! return (0) - block stores were used
SET_SIZE(hwblkclr)
#endif /* lint */
#ifdef lint
/* Copy 32 bytes of data from src to dst using physical addresses */
/*ARGSUSED*/
void
hw_pa_bcopy32(uint64_t src, uint64_t dst)
{}
#else /*!lint */
/*
* Copy 32 bytes of data from src (%o0) to dst (%o1)
* using physical addresses.
*/
ENTRY_NP(hw_pa_bcopy32)
	rdpr	%pstate, %g1		! save current %pstate
	andn	%g1, PSTATE_IE, %g2
	wrpr	%g0, %g2, %pstate	! disable interrupts while using ASI_MEM
ldxa [%o0]ASI_MEM, %o2
add %o0, 8, %o0
ldxa [%o0]ASI_MEM, %o3
add %o0, 8, %o0
ldxa [%o0]ASI_MEM, %o4
add %o0, 8, %o0
ldxa [%o0]ASI_MEM, %o5
stxa %o2, [%o1]ASI_MEM
add %o1, 8, %o1
stxa %o3, [%o1]ASI_MEM
add %o1, 8, %o1
stxa %o4, [%o1]ASI_MEM
add %o1, 8, %o1
stxa %o5, [%o1]ASI_MEM
	membar	#Sync
	retl
	wrpr	%g0, %g1, %pstate	! restore the saved %pstate
SET_SIZE(hw_pa_bcopy32)
#endif /* lint */
/*
* Zero a block of storage.
*
* uzero is used by the kernel to zero a block in user address space.
*/
/*
* Control flow of the bzero/kzero/uzero routine.
*
 * For stores of fewer than 7 bytes, the bytes are zeroed one at a time.
 *
 * For stores of fewer than 15 bytes, align the address on a 4-byte boundary,
 * then store as many 4-byte chunks as possible, followed by trailing bytes.
 *
 * For sizes of 15 bytes or more, align the address on an 8-byte boundary.
 * if (count >= 128) {
 *	store as many 8-byte chunks as needed to block-align the address
 *	store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR
 *	store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero)
 * }
 * Store as many 8-byte chunks as possible, followed by trailing bytes.
*/
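/*
 * A sketch of the entry-point setup, restating the code below:
 *
 * uzero():
 *	%asi = ASI_USER;
 *	if (curthread->t_lofault != 0)		! only if one was already set
 *		curthread->t_lofault = .zeroerr;
 *
 * kzero():
 *	%asi = ASI_P;
 *	%o5 = curthread->t_lofault | LOFAULT_SET;	! always installs
 *	curthread->t_lofault = .zeroerr;
 *
 * bzero():
 *	%asi = ASI_P;
 *	if (curthread->t_lofault != 0)		! only if one was already set
 *		curthread->t_lofault = .zeroerr;
 */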
#if defined(lint)
/* ARGSUSED */
int
kzero(void *addr, size_t count)
{ return(0); }
/* ARGSUSED */
void
uzero(void *addr, size_t count)
{}
#else /* lint */
ENTRY(uzero)
!
! Set a new lo_fault handler only if we came in with one
! already specified.
!
wr %g0, ASI_USER, %asi
ldn [THREAD_REG + T_LOFAULT], %o5
tst %o5
bz,pt %ncc, .do_zero
sethi %hi(.zeroerr), %o2
or %o2, %lo(.zeroerr), %o2
membar #Sync
ba,pt %ncc, .do_zero
stn %o2, [THREAD_REG + T_LOFAULT]
ENTRY(kzero)
!
! Always set a lo_fault handler
!
wr %g0, ASI_P, %asi
ldn [THREAD_REG + T_LOFAULT], %o5
sethi %hi(.zeroerr), %o2
or %o5, LOFAULT_SET, %o5
or %o2, %lo(.zeroerr), %o2
membar #Sync
ba,pt %ncc, .do_zero
stn %o2, [THREAD_REG + T_LOFAULT]
/*
* We got here because of a fault during kzero or if
* uzero or bzero was called with t_lofault non-zero.
* Otherwise we've already run screaming from the room.
* Errno value is in %g1. Note that we're here iff
* we did set t_lofault.
*/
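	! A sketch of the recovery below (the errno arrives in %g1):
	!	if (%o5 == 0)			! nothing was installed
	!		return (%g1);
	!	%o5 &= ~LOFAULT_SET;
	!	curthread->t_lofault = %o5;	! restore the old handler
	!	if (%o5 != 0)
	!		goto *%o5;		! chain to the old handler
	!	return (%g1);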
.zeroerr:
!
! Undo asi register setting. Just set it to be the
! kernel default without checking.
!
wr %g0, ASI_P, %asi
!
! We did set t_lofault. It may well have been zero coming in.
!
1:
tst %o5
membar #Sync
bne,pn %ncc, 3f
andncc %o5, LOFAULT_SET, %o5
2:
!
! Old handler was zero. Just return the error.
!
retl ! return
mov %g1, %o0 ! error code from %g1
3:
!
! We're here because %o5 was non-zero. It was non-zero
! because either LOFAULT_SET was present, a previous fault
! handler was present or both. In all cases we need to reset
! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
! before we either simply return the error or we invoke the
! previously specified handler.
!
be %ncc, 2b
stn %o5, [THREAD_REG + T_LOFAULT]
jmp %o5 ! goto real handler
nop
SET_SIZE(kzero)
SET_SIZE(uzero)
#endif /* lint */
/*
* Zero a block of storage.
*/
#if defined(lint)
/* ARGSUSED */
void
bzero(void *addr, size_t count)
{}
#else /* lint */
ENTRY(bzero)
wr %g0, ASI_P, %asi
ldn [THREAD_REG + T_LOFAULT], %o5 ! save old vector
tst %o5
bz,pt %ncc, .do_zero
sethi %hi(.zeroerr), %o2
or %o2, %lo(.zeroerr), %o2
membar #Sync ! sync error barrier
stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector
.do_zero:
cmp %o1, 7
blu,pn %ncc, .byteclr
nop
cmp %o1, 15
blu,pn %ncc, .wdalign
nop
	andcc	%o0, 7, %o3		! is addr aligned on an 8-byte boundary?
bz,pt %ncc, .blkalign ! already double aligned
sub %o3, 8, %o3 ! -(bytes till double aligned)
add %o1, %o3, %o1 ! update o1 with new count
1:
stba %g0, [%o0]%asi
inccc %o3
bl,pt %ncc, 1b
inc %o0
! Now address is double aligned
.blkalign:
	cmp	%o1, 0x80		! check if there are at least 128 bytes to set
blu,pn %ncc, .bzero_small
mov %o1, %o3
sethi %hi(use_hw_bzero), %o2
ld [%o2 + %lo(use_hw_bzero)], %o2
tst %o2
bz %ncc, .bzero_small
mov %o1, %o3
	! Select the block-initializing ASI: primary for bzero/kzero, or the
	! AIUS (user secondary) variant if we entered via uzero, in which
	! case %asi was ASI_USER.
	rd	%asi, %o3
	wr	%g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
	cmp	%o3, ASI_P
	bne,a	%ncc, .algnblk
	wr	%g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
.algnblk:
andcc %o0, 0x3f, %o3 ! is block aligned?
bz,pt %ncc, .bzero_blk
sub %o3, 0x40, %o3 ! -(bytes till block aligned)
add %o1, %o3, %o1 ! o1 is the remainder
! Clear -(%o3) bytes till block aligned
1:
stxa %g0, [%o0]%asi
addcc %o3, 8, %o3
bl,pt %ncc, 1b
add %o0, 8, %o0
.bzero_blk:
and %o1, 0x3f, %o3 ! calc bytes left after blk clear
andn %o1, 0x3f, %o4 ! calc size of blocks in bytes
cmp %o4, 0x100 ! 256 bytes or more
blu,pn %ncc, 3f
nop
2:
stxa %g0, [%o0+0x0]%asi
stxa %g0, [%o0+0x40]%asi
stxa %g0, [%o0+0x80]%asi
stxa %g0, [%o0+0xc0]%asi
stxa %g0, [%o0+0x8]%asi
stxa %g0, [%o0+0x10]%asi
stxa %g0, [%o0+0x18]%asi
stxa %g0, [%o0+0x20]%asi
stxa %g0, [%o0+0x28]%asi
stxa %g0, [%o0+0x30]%asi
stxa %g0, [%o0+0x38]%asi
stxa %g0, [%o0+0x48]%asi
stxa %g0, [%o0+0x50]%asi
stxa %g0, [%o0+0x58]%asi
stxa %g0, [%o0+0x60]%asi
stxa %g0, [%o0+0x68]%asi
stxa %g0, [%o0+0x70]%asi
stxa %g0, [%o0+0x78]%asi
stxa %g0, [%o0+0x88]%asi
stxa %g0, [%o0+0x90]%asi
stxa %g0, [%o0+0x98]%asi
stxa %g0, [%o0+0xa0]%asi
stxa %g0, [%o0+0xa8]%asi
stxa %g0, [%o0+0xb0]%asi
stxa %g0, [%o0+0xb8]%asi
stxa %g0, [%o0+0xc8]%asi
stxa %g0, [%o0+0xd0]%asi
stxa %g0, [%o0+0xd8]%asi
stxa %g0, [%o0+0xe0]%asi
stxa %g0, [%o0+0xe8]%asi
stxa %g0, [%o0+0xf0]%asi
stxa %g0, [%o0+0xf8]%asi
sub %o4, 0x100, %o4
cmp %o4, 0x100
bgu,pt %ncc, 2b
add %o0, 0x100, %o0
3:
	! ... check if at least 64 bytes remain to set
cmp %o4, 0x40
blu %ncc, .bzero_blk_done
nop
4:
stxa %g0, [%o0+0x0]%asi
stxa %g0, [%o0+0x8]%asi
stxa %g0, [%o0+0x10]%asi
stxa %g0, [%o0+0x18]%asi
stxa %g0, [%o0+0x20]%asi
stxa %g0, [%o0+0x28]%asi
stxa %g0, [%o0+0x30]%asi
stxa %g0, [%o0+0x38]%asi
subcc %o4, 0x40, %o4
bgu,pt %ncc, 3b
add %o0, 0x40, %o0
.bzero_blk_done:
membar #Sync
!
! Undo asi register setting.
!
rd %asi, %o4
wr %g0, ASI_P, %asi
cmp %o4, ASI_BLK_INIT_ST_QUAD_LDD_P
bne,a %ncc, .bzero_small
wr %g0, ASI_USER, %asi
.bzero_small:
! Set the remaining doubles
subcc %o3, 8, %o3 ! Can we store any doubles?
blu,pn %ncc, .byteclr
and %o1, 7, %o1 ! calc bytes left after doubles
.dbclr:
stxa %g0, [%o0]%asi ! Clear the doubles
subcc %o3, 8, %o3
bgeu,pt %ncc, .dbclr
add %o0, 8, %o0
ba .byteclr
nop
.wdalign:
	andcc	%o0, 3, %o3		! is addr aligned on a word boundary?
bz,pn %ncc, .wdclr
andn %o1, 3, %o3 ! create word sized count in %o3
dec %o1 ! decrement count
stba %g0, [%o0]%asi ! clear a byte
ba .wdalign
inc %o0 ! next byte
.wdclr:
sta %g0, [%o0]%asi ! 4-byte clearing loop
subcc %o3, 4, %o3
bnz,pt %ncc, .wdclr
inc 4, %o0
and %o1, 3, %o1 ! leftover count, if any
.byteclr:
! Set the leftover bytes
brz %o1, .bzero_exit
nop
7:
deccc %o1 ! byte clearing loop
stba %g0, [%o0]%asi
bgu,pt %ncc, 7b
inc %o0
.bzero_exit:
!
! We're just concerned with whether t_lofault was set
! when we came in. We end up here from either kzero()
! or bzero(). kzero() *always* sets a lofault handler.
! It ors LOFAULT_SET into %o5 to indicate it has done
! this even if the value of %o5 is otherwise zero.
! bzero() sets a lofault handler *only* if one was
! previously set. Accordingly we need to examine
! %o5 and if it is non-zero be sure to clear LOFAULT_SET
! before resetting the error handler.
!
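	! Equivalently, a sketch:
	!	if (%o5 != 0)
	!		curthread->t_lofault = %o5 & ~LOFAULT_SET;
	!	return (0);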
tst %o5
bz %ncc, 1f
andn %o5, LOFAULT_SET, %o5
membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1:
retl
clr %o0 ! return (0)
SET_SIZE(bzero)
#endif /* lint */