niagara_copy.s revision 7c478bd95313f5f23a4c958a745db2134aa03244
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/asm_linkage.h>
#include <sys/machthread.h>
#include <sys/privregs.h>
#include <sys/niagaraasi.h>
#if !defined(lint)
#include "assym.h"
#endif /* lint */
/*
* Pseudo-code to aid in understanding the control flow of the
* bcopy/kcopy routines below.
*
* On entry to kcopy:
* %l7 = curthread->t_lofault;
* curthread->t_lofault = .copyerr;
* %o5 = %l7; ! save existing handler in %o5
* Call bcopy();
*
* On entry to bcopy:
*
* if (length < 128)
* goto_regular_copy;
*
* if (!use_vis)
* goto_regular_copy;
*
* do_blockcopy_here;
*
* In lofault handler:
* curthread->t_lofault = %o5; ! restore old t_lofault
* return (errno)
*
*/
/*
* Less than or equal to this number of bytes we will always copy byte-for-byte
*/
#define SMALL_LIMIT 7
/*
* Size of stack frame in order to accommodate a 64-byte aligned
* floating-point register save area and 2 32-bit temp locations.
*/
/*
* LOFAULT_SET : Flag set by kzero to indicate that lo_fault handler was set
*/
#define LOFAULT_SET 2
/*
* This define is to align data for the unaligned source cases.
* The data1, data2 and data3 is merged into data1 and data2.
* The data3 is preserved for next merge.
*/
/*
* This macro is to align the data. Basically it merges
* data1 and data2 to form double word.
*/
/*
* Copy a block of storage, returning an error code if `from' or
* `to' takes a kernel pagefault which cannot be resolved.
* Returns errno value on pagefault error, 0 if all ok
*/
#if defined(lint)
/* ARGSUSED */
int
{ return(0); }
#else /* lint */
.seg ".text"
.align 4
/*
* We got here because of a fault during kcopy.
* Errno value is in %g1.
*/
.copyerr:
#endif /* lint */
/*
* Copy a block of storage - must not overlap (from + len <= to).
*
* Copy a page of memory.
* Assumes double word alignment and a count >= 256.
*/
#if defined(lint)
/* ARGSUSED */
void
{}
#else /* lint */
.do_copy:
1:
/*
* Compare against 256 since we should be checking block addresses
* and (dest & ~63) - (src & ~63) can be 3 blocks even if
* src = dest + (64 * 3) + 63.
*/
/*
* Copy that reach here have at least 2 blocks of data to copy.
*/
.chksrc:
! partial
.blkcpy:
1:
.blkdone:
.residue:
.blkexit:
.bcb_punt:
!
!
!
!
!
!
bnz,a 1b
1:
bnz,a 1b
b 2f
b 3f
!
!
1:
bnz,a 1b
.xfer:
3:
2:
b 2b ! loop
1:
!
!
b,a .xfer
!
!
.aldoubcp:
!
!
5:
!
!
.wordcp:
5:
b,a .dbytecp
.alwordcp:
b .wordcp
!
!
.bytecp:
b .dbytecp
!
!
1:
.dbytecp:
.cpdone:
/*
* Common code used to align transfers on word and doubleword
* boundaries. Aligns source and destination and returns a count
* of aligned bytes to transfer in %i3
*/
1:
.alignit:
bnz,a 1b
#endif /* lint */
/*
* Block copy with possibly overlapped operands.
*/
#if defined(lint)
/*ARGSUSED*/
void
{}
#else /* lint */
retl ! return
1:
!
!
.ov_fwd:
retl ! return
!
!
.ov_bkwd:
retl ! return
#endif /* lint */
/*
* hwblkpagecopy()
*
* Copies exactly one page. This routine assumes the caller (ppcopy)
* has already disabled kernel preemption and has checked
* use_hw_bcopy.
*/
#ifdef lint
/*ARGSUSED*/
void
{ }
#else /* lint */
/*
* Copying exactly one page and PAGESIZE is a multiple of 0x80.
*/
1:
#endif /* lint */
/*
* Transfer data to and from user space -
* Note that these routines can cause faults
* It is assumed that the kernel has nothing at
* less than KERNELBASE in the virtual address space.
*
* Note that copyin(9F) and copyout(9F) are part of the
*
* Sigh.
*
* So there's two extremely similar routines - xcopyin() and xcopyout()
* which return the errno that we've faithfully computed. This
* allows other callers (e.g. uiomove(9F)) to work correctly.
* Given that these are used pretty heavily, we expand the calling
* sequences inline for all flavours (rather than making wrappers).
*
* There are also stub routines for xcopyout_little and xcopyin_little,
* which currently are intended to handle requests of <= 16 bytes from
* do_unaligned. Future enhancement to make them handle 8k pages efficiently
* is left as an exercise...
*/
/*
* Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
*
* General theory of operation:
*
* None of the copyops routines grab a window until it's decided that
* we need to do a HW block copy operation. This saves a window
*
* This code uses a set of 4 limits for the maximum size that will
* the default limits are:
*
* single byte aligned - 256 (hw_copy_limit_1)
* two byte aligned - 512 (hw_copy_limit_2)
* four byte aligned - 1024 (hw_copy_limit_4)
* eight byte aligned - 1024 (hw_copy_limit_8)
*
* If the value for a particular limit is zero, the copy will be done
*
* Flow:
*
* If count == zero return zero.
*
* Store the previous lo_fault handler into %g6.
* Place our secondary lofault handler into %g5.
* Place the address of our nowindow fault handler into %o3.
* Place the address of the windowed fault handler into %o4.
* --> We'll use this handler if we end up grabbing a window
* --> before we use block initializing store and quad load ASIs
*
* If count is less than or equal to SMALL_LIMIT (7) we
* always do a byte for byte copy.
*
* If count is > SMALL_LIMIT, we check the alignment of the input
* and output pointers. Based on the alignment we check count
* against a limit based on detected alignment. If we exceed the
* alignment value we copy via block initializing store and quad
* load instructions.
*
* If we don't exceed one of the limits, we store -count in %o3,
* we store the number of chunks (8, 4, 2 or 1 byte) operated
* on in our basic copy loop in %o2. Following this we branch
* to the appropriate copy loop and copy that many chunks.
* Since we've been adding the chunk size to %o3 each time through
* as well as decrementing %o2, we can tell if any data is
* is left to be copied by examining %o3. If that is zero, we're
* done and can go home. If not, we figure out what the largest
* chunk size left to be copied is and branch to that copy loop
* unless there's only one byte left. We load that as we're
* branching to code that stores it just before we return.
*
* Fault handlers are invoked if we reference memory that has no
* current mapping. All forms share the same copyio_fault handler.
* This routine handles fixing up the stack and general housecleaning.
* Each copy operation has a simple fault handler that is then called
* to do the work specific to the individual operation. The handler
* for copyOP and xcopyOP are found at the end of individual function.
* The handlers for xcopyOP_little are found at the end of xcopyin_little.
* The handlers for copyOP_noerr are found at the end of copyin_noerr.
*/
/*
* Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
*/
#if defined(lint)
/*ARGSUSED*/
int
{ return (0); }
#else /* lint */
/*
* We save the arguments in the following registers in case of a fault:
* kaddr - %g2
* uaddr - %g3
* count - %g4
*/
#define SAVE_COUNT %g4
#define REAL_LOFAULT %g5
#define SAVED_LOFAULT %g6
/*
* Generic copyio fault handler. This is the first line of defense when a
* fault occurs in (x)copyin/(x)copyout. In order for this to function
* properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
* This allows us to share common code for all the flavors of the copy
* operations, including the _noerr versions.
*
* Note that this function will restore the original input parameters before
* calling REAL_LOFAULT. So the real handler can vector to the appropriate
* member of the t_copyop structure, if needed.
*/
!
!
1:
!
! Run in leaf mode, using the %o regs as our input regs.
!
subcc %o2, SMALL_LIMIT, %o3
bgu,a,pt %ncc, .dco_ns
or %o0, %o1, %o3
!
! What was previously ".small_copyout"
! Do full differenced copy.
!
.dcobcp:
sub %g0, %o2, %o3 ! negate count
add %o0, %o2, %o0 ! make %o0 point at the end
add %o1, %o2, %o1 ! make %o1 point at the end
ba,pt %ncc, .dcocl
ldub [%o0 + %o3], %o4 ! load first byte
!
! %o0 and %o2 point at the end and remain pointing at the end
! of their buffers. We pull things out by adding %o3 (which is
! the negation of the length) to the buffer end which gives us
! the current location in the buffers. By incrementing %o3 we walk
! through both buffers without having to bump each buffer's address
! pointer.
!
.align 16
.dcocl:
!
! We're done. Go home.
!
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
retl
clr %o0
!
! Try aligned copies from here.
!
.dco_ns:
! %o0 = kernel addr (to be copied from)
! %o1 = user addr (to be copied to)
! %o2 = length
! %o3 = %o1 | %o2 (used for alignment checking)
! %o4 is alternate lo_fault
! %o5 is original lo_fault
!
! bounce to the byte for byte copy loop. Otherwise do it in
! HW (if enabled).
!
btst 1, %o3
bz,pt %icc, .dcoh8
btst 7, %o3
!
! Single byte aligned. Do we do it via HW or via
! byte for byte? Do a quick no memory reference
! check to pick up small copies.
!
sethi %hi(hw_copy_limit_1), %o3
!
! Big enough that we need to check the HW limit for
! this size copy.
!
ld [%o3 + %lo(hw_copy_limit_1)], %o3
!
! Is HW copy on? If not, do everything byte for byte.
!
tst %o3
bz,pn %icc, .dcobcp
subcc %o3, %o2, %o3
!
!
!
! We're big enough and copy is on. Do it with HW.
!
ba,pt %ncc, .big_copyout
nop
.dcoh8:
!
! 8 byte aligned?
!
bnz,a %ncc, .dcoh4
btst 3, %o3
!
!
!
!
!
! We're negative if our size is larger than hw_copy_limit_8.
!
bge,pt %ncc, .dcos8
nop
!
!
.dcos8:
!
!
!
!
.dcoh4:
!
! If so, go off an do the copy.
! If not, load the hard limit. %o3 is
! available for reuse.
!
sethi %hi(hw_copy_limit_4), %o3
ld [%o3 + %lo(hw_copy_limit_4)], %o3
!
! Bop off to the aligned copy.
!
tst %o3
bz,pn %icc, .dcos4
subcc %o3, %o2, %o3
!
!
!
!
ba,pt %ncc, .big_copyout
nop
.dcos4:
add %o0, %o2, %o0
add %o1, %o2, %o1
sub %g0, %o2, %o3
ba,pt %ncc, .dodfbc
srl %o2, 2, %o2 ! Number of 4 byte chunks to copy
!
! We must be 2 byte aligned. Off we go.
! The check for small copies was done in the
! delay at .dcoh4
!
.dcoh2:
ble %ncc, .dcos2
sethi %hi(hw_copy_limit_2), %o3
ld [%o3 + %lo(hw_copy_limit_2)], %o3
tst %o3
bz,pn %icc, .dcos2
subcc %o3, %o2, %o3
bge,pt %ncc, .dcos2
nop
!
!
.dcos2:
!
!
!
.align 32
.dodebc:
!
! eight byte aligned copies end here.
!
bz,pt %ncc, .dcofh
nop
!
! Something is left - do it byte for byte.
!
ba,pt %ncc, .dcocl
ldub [%o0 + %o3], %o4 ! load next byte
!
! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
!
.align 32
.dodfbc:
lduw [%o0 + %o3], %o4
deccc %o2
sta %o4, [%o1 + %o3]ASI_USER
bg,pt %ncc, .dodfbc
addcc %o3, 4, %o3
!
!
!
!
!
! copy.
!
.align 32
.dodtbc:
!
!
!
!
.dcofh:
!
! We're going to go off and do a block copy.
! Switch fault handlers and grab a window. We
! kernel data to this point.
!
stn %o4, [THREAD_REG + T_LOFAULT]
save %sp, -SA(MINFRAME), %sp
! Copyout requests that reach here are larger than 256 bytes. The
! hw_copy_limit_1 is set to 256. Never set this limit to less than
! 128 bytes.
.do_block_copyout:
mov %i1, %i5
mov %i0, %i1
mov %i5, %i0
andcc %i0, 7, %i3 ! is dst double aligned
bz %ncc, copyout_blkcpy
sub %i3, 8, %i3
neg %i3 ! bytes till double aligned
sub %i2, %i3, %i2 ! update %i2 with new count
! Align Destination on double-word boundary
1: ldub [%i1], %i4
inc %i1
stba %i4, [%i0]ASI_USER
deccc %i3
bgu %ncc, 1b
inc %i0
copyout_blkcpy:
andcc %i0, 63, %i3
bz,pn %ncc, copyout_blalign ! now block aligned
sub %i3, 64, %i3
neg %i3 ! bytes till block aligned
sub %i2, %i3, %i2 ! update %i2 with new count
! Copy %i3 bytes till dst is block (64 byte) aligned. use
! double word copies.
andcc %i1, 7, %g1 ! is src aligned on a 8 bytes
bz %ncc, .co_dbcopy ! %g1 has source offset (last 3-bits)
sll %g1, 3, %l1 ! left shift
mov 0x40, %l2
sub %l2, %l1, %l2 ! right shift = (64 - left shift)
! Now use double word copies to align destination.
.co_double:
sub %i1, %g1, %i1 ! align the src at 8 bytes.
ldx [%i1], %o2
2:
ldx [%i1+8], %o4
ALIGN_DATA_EW(%o2, %o4, %l1, %l2, %o3)
stxa %o2, [%i0]ASI_USER
mov %o4, %o2
add %i1, 0x8, %i1
subcc %i3, 0x8, %i3
bgu,pt %ncc, 2b
add %i0, 0x8, %i0
ba copyout_blalign
add %i1, %g1, %i1
! Both source and destination are double aligned.
! No shift and merge of data required in this case.
.co_dbcopy:
ldx [%i1], %o2
stxa %o2, [%i0]ASI_USER
add %i1, 0x8, %i1
subcc %i3, 0x8, %i3
bgu,pt %ncc, .co_dbcopy
add %i0, 0x8, %i0
copyout_blalign:
andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
sub %i2, %i3, %i2 ! Residue bytes in %i2
mov ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
andcc %i1, 0xf, %o2 ! is src quadword aligned
bz,pn %xcc, .co_blkcpy ! src offset in %o2 (last 4-bits)
nop
cmp %o2, 0x8
bg .co_upper_double
nop
bl .co_lower_double
nop
! Falls through when source offset is equal to 8 i.e.
! source is double word aligned.
sub %i1, %o2, %i1 ! align the src at 16 bytes.
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetch [%l0+0x0], #one_read
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
.co_loop0:
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
prefetch [%l0+0x40], #one_read
stxa %l3, [%i0+0x0]%asi
stxa %l4, [%i0+0x8]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
stxa %l5, [%i0+0x10]%asi
stxa %l2, [%i0+0x18]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
stxa %l3, [%i0+0x20]%asi
stxa %l4, [%i0+0x28]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
stxa %l5, [%i0+0x30]%asi
stxa %l2, [%i0+0x38]%asi
add %l0, 0x40, %l0
subcc %i3, 0x40, %i3
bgu,pt %xcc, .co_loop0
add %i0, 0x40, %i0
ba .co_blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
.co_lower_double:
sub %i1, %o2, %i1 ! align the src at 16 bytes.
sll %o2, 3, %o0 ! %o0 left shift
mov 0x40, %o1
sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetch [%l0+0x0], #one_read
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! partial data in %l2 and %l3 has
! complete data
.co_loop1:
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has partial data
! for this read.
ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4
! into %l2 and %l3
prefetch [%l0+0x40], #one_read
stxa %l2, [%i0+0x0]%asi
stxa %l3, [%i0+0x8]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and
! %l4 from previous read
! into %l4 and %l5
stxa %l4, [%i0+0x10]%asi
stxa %l5, [%i0+0x18]%asi
! Repeat the same for next 32 bytes.
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
stxa %l2, [%i0+0x20]%asi
stxa %l3, [%i0+0x28]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
stxa %l4, [%i0+0x30]%asi
stxa %l5, [%i0+0x38]%asi
add %l0, 0x40, %l0
subcc %i3, 0x40, %i3
bgu,pt %xcc, .co_loop1
add %i0, 0x40, %i0
ba .co_blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
.co_upper_double:
sub %i1, %o2, %i1 ! align the src at 16 bytes.
sub %o2, 0x8, %o0
sll %o0, 3, %o0 ! %o0 left shift
mov 0x40, %o1
sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetch [%l0+0x0], #one_read
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! partial data in %l3
! for this read and
! no data in %l2
.co_loop2:
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has complete data
! and %l5 has partial
ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5
! into %l3 and %l4
prefetch [%l0+0x40], #one_read
stxa %l3, [%i0+0x0]%asi
stxa %l4, [%i0+0x8]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with
! %l5 from previous read
! into %l5 and %l2
stxa %l5, [%i0+0x10]%asi
stxa %l2, [%i0+0x18]%asi
! Repeat the same for next 32 bytes.
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
stxa %l3, [%i0+0x20]%asi
stxa %l4, [%i0+0x28]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
stxa %l5, [%i0+0x30]%asi
stxa %l2, [%i0+0x38]%asi
add %l0, 0x40, %l0
subcc %i3, 0x40, %i3
bgu,pt %xcc, .co_loop2
add %i0, 0x40, %i0
ba .co_blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
.co_blkcpy:
andn %i1, 0x3f, %o0 ! %o0 has block aligned source
prefetch [%o0+0x0], #one_read
1:
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
add %i1, 0x10, %i1
prefetch [%o0+0x40], #one_read
stxa %l0, [%i0+0x0]%asi
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6
add %i1, 0x10, %i1
stxa %l1, [%i0+0x8]%asi
stxa %l2, [%i0+0x10]%asi
stxa %l3, [%i0+0x18]%asi
stxa %l4, [%i0+0x20]%asi
stxa %l5, [%i0+0x28]%asi
stxa %l6, [%i0+0x30]%asi
stxa %l7, [%i0+0x38]%asi
add %o0, 0x40, %o0
subcc %i3, 0x40, %i3
bgu,pt %xcc, 1b
add %i0, 0x40, %i0
.co_blkdone:
membar #Sync
! Copy as much rest of the data as double word copy.
.co_dwcp:
cmp %i2, 0x8 ! Not enough bytes to copy as double
blu %ncc, .co_dbdone
nop
andn %i2, 0x7, %i3 ! %i3 count is multiple of 8 bytes size
sub %i2, %i3, %i2 ! Residue bytes in %i2
andcc %i1, 7, %g1 ! is src aligned on a 8 bytes
bz %ncc, .co_cpy_db
nop
sll %g1, 3, %l0 ! left shift
mov 0x40, %l1
sub %l1, %l0, %l1 ! right shift = (64 - left shift)
.co_cpy_wd:
sub %i1, %g1, %i1 ! align the src at 8 bytes.
ldx [%i1], %o2
3:
ldx [%i1+8], %o4
ALIGN_DATA_EW(%o2, %o4, %l0, %l1, %o3)
stxa %o2, [%i0]ASI_USER
mov %o4, %o2
add %i1, 0x8, %i1
subcc %i3, 0x8, %i3
bgu,pt %ncc, 3b
add %i0, 0x8, %i0
ba .co_dbdone
add %i1, %g1, %i1
.co_cpy_db:
ldx [%i1], %o2
stxa %o2, [%i0]ASI_USER
add %i1, 0x8, %i1
subcc %i3, 0x8, %i3
bgu,pt %ncc, .co_cpy_db
add %i0, 0x8, %i0
.co_dbdone:
tst %i2
bz,pt %xcc, .copyout_exit
nop
! Copy the residue as byte copy
.co_residue:
ldub [%i1], %i4
stba %i4, [%i0]ASI_USER
inc %i1
deccc %i2
bgu %xcc, .co_residue
inc %i0
.copyout_exit:
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
ret
restore %g0, 0, %o0
.copyout_err:
ldn [THREAD_REG + T_COPYOPS], %o4
brz %o4, 2f
nop
ldn [%o4 + CP_COPYOUT], %g2
jmp %g2
nop
2:
retl
mov -1, %o0
SET_SIZE(copyout)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyout(const void *kaddr, void *uaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(xcopyout)
sethi %hi(.xcopyout_err), REAL_LOFAULT
b .do_copyout
or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
.xcopyout_err:
ldn [THREAD_REG + T_COPYOPS], %o4
brz %o4, 2f
nop
ldn [%o4 + CP_XCOPYOUT], %g2
jmp %g2
nop
2:
retl
mov %g1, %o0
SET_SIZE(xcopyout)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyout_little(const void *kaddr, void *uaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(xcopyout_little)
sethi %hi(.little_err), %o4
ldn [THREAD_REG + T_LOFAULT], %o5
or %o4, %lo(.little_err), %o4
membar #Sync ! sync error barrier
stn %o4, [THREAD_REG + T_LOFAULT]
subcc %g0, %o2, %o3
add %o0, %o2, %o0
bz,pn %ncc, 2f ! check for zero bytes
sub %o2, 1, %o4
add %o1, %o2, %o1
ldub [%o0+%o3], %o4
1: stba %o4, [%o1+%o3]ASI_AIUSL
inccc %o3
sub %o0, 2, %o0 ! get next byte
bcc,a,pt %ncc, 1b
ldub [%o0+%o3], %o4
2: membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g0, %o0 ! return (0)
SET_SIZE(xcopyout_little)
#endif /* lint */
/*
* Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
*/
#if defined(lint)
/*ARGSUSED*/
int
copyin(const void *uaddr, void *kaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(copyin)
sethi %hi(.copyin_err), REAL_LOFAULT
or REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
.do_copyin:
!
! Check the length and bail if zero.
!
tst %o2
bnz,pt %ncc, 1f
nop
retl
clr %o0
1:
sethi %hi(copyio_fault), %o4
or %o4, %lo(copyio_fault), %o4
sethi %hi(copyio_fault_nowindow), %o3
ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
or %o3, %lo(copyio_fault_nowindow), %o3
membar #Sync
stn %o3, [THREAD_REG + T_LOFAULT]
mov %o0, SAVE_SRC
mov %o1, SAVE_DST
mov %o2, SAVE_COUNT
!
!
!
!
.dcibcp:
!
! pointer. A very fast 4 instruction loop.
!
.align 16
.dcicl:
stb %o4, [%o1 + %o3]
inccc %o3
bl,a,pt %ncc, .dcicl
lduba [%o0 + %o3]ASI_USER, %o4
!
!
!
!
.dci_ns:
!
!
!
! We're single byte aligned.
!
sethi %hi(hw_copy_limit_1), %o3
ld [%o3 + %lo(hw_copy_limit_1)], %o3
!
! Is HW copy on? If not do everything byte for byte.
!
tst %o3
bz,pn %icc, .dcibcp
subcc %o3, %o2, %o3
!
! Are we bigger than the HW limit? If not
! go to byte for byte.
!
bge,pt %ncc, .dcibcp
nop
!
!
.dcih8:
!
!
!
! We're eight byte aligned.
!
sethi %hi(hw_copy_limit_8), %o3
ld [%o3 + %lo(hw_copy_limit_8)], %o3
!
! Is HW assist on? If not, do it with the aligned copy.
!
tst %o3
bz,pn %icc, .dcis8
subcc %o3, %o2, %o3
bge %ncc, .dcis8
nop
ba,pt %ncc, .big_copyin
nop
.dcis8:
!
! Housekeeping for copy loops. Uses same idea as in the byte for
! byte copy loop above.
!
add %o0, %o2, %o0
add %o1, %o2, %o1
sub %g0, %o2, %o3
ba,pt %ncc, .didebc
srl %o2, 3, %o2 ! Number of 8 byte chunks to copy
!
! 4 byte aligned?
!
.dcih4:
bnz %ncc, .dcih2
sethi %hi(hw_copy_limit_4), %o3
ld [%o3 + %lo(hw_copy_limit_4)], %o3
!
! Is HW assist on? If not, do it with the aligned copy.
!
tst %o3
bz,pn %icc, .dcis4
subcc %o3, %o2, %o3
!
!
.dcis4:
!
!
.dcih2:
!
! We're two byte aligned. Check for "smallness"
! done in delay at .dcih4
!
bleu,pt %ncc, .dcis2
sethi %hi(hw_copy_limit_2), %o3
ld [%o3 + %lo(hw_copy_limit_2)], %o3
!
! Is HW assist on? If not, do it with the aligned copy.
!
tst %o3
bz,pn %icc, .dcis2
subcc %o3, %o2, %o3
!
! Are we larger than the HW limit?
!
bge %ncc, .dcis2
nop
!
!
!
!
.dcis2:
!
!
! code.
!
!
!
!
.align 32
.didebc:
!
!
!
!
!
!
.align 32
.didfbc:
!
!
!
!
!
! copy.
!
.align 32
.didtbc:
!
!
!
!
.dcifh:
!
! We're going off to do a block copy.
! Switch fault handlers and grab a window. We
! kernel data to this point.
!
stn %o4, [THREAD_REG + T_LOFAULT]
save %sp, -SA(MINFRAME), %sp
! Copyin requests that reach here are larger than 256 bytes. The
! hw_copy_limit_1 is set to 256. Never set this limit to less than
! 128 bytes.
.do_blockcopyin:
mov %i1, %i5
mov %i0, %i1
mov %i5, %i0
andcc %i0, 7, %i3 ! is dst double aligned
bz %ncc, copyin_blkcpy
sub %i3, 8, %i3
neg %i3 ! bytes till double aligned
sub %i2, %i3, %i2 ! update %i2 with new count
! Align Destination on double-word boundary
1: lduba [%i1]ASI_USER, %i4
inc %i1
stb %i4, [%i0]
deccc %i3
bgu %ncc, 1b
inc %i0
copyin_blkcpy:
andcc %i0, 63, %i3
bz,pn %ncc, copyin_blalign ! now block aligned
sub %i3, 64, %i3
neg %i3 ! bytes till block aligned
sub %i2, %i3, %i2 ! update %i2 with new count
! Copy %i3 bytes till dst is block (64 byte) aligned. use
! double word copies.
andcc %i1, 7, %g1 ! is src aligned on a 8 bytes
bz %ncc, .ci_dbcopy ! %g1 has source offset (last 3-bits)
sll %g1, 3, %l1 ! left shift
mov 0x40, %l2
sub %l2, %l1, %l2 ! right shift = (64 - left shift)
! Now use double word copies to align destination.
.ci_double:
sub %i1, %g1, %i1 ! align the src at 8 bytes.
ldxa [%i1]ASI_USER, %o2
2:
add %i1, 0x8, %i1
ldxa [%i1]ASI_USER, %o4
ALIGN_DATA_EW(%o2, %o4, %l1, %l2, %o3)
stx %o2, [%i0]
mov %o4, %o2
subcc %i3, 0x8, %i3
bgu,pt %ncc, 2b
add %i0, 0x8, %i0
ba copyin_blalign
add %i1, %g1, %i1
! Both source and destination are double aligned.
! No shift and merge of data required in this case.
.ci_dbcopy:
ldxa [%i1]ASI_USER, %o2
stx %o2, [%i0]
add %i1, 0x8, %i1
subcc %i3, 0x8, %i3
bgu,pt %ncc, .ci_dbcopy
add %i0, 0x8, %i0
copyin_blalign:
andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
sub %i2, %i3, %i2 ! Residue bytes in %i2
mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
andcc %i1, 0xf, %o2 ! is src quadword aligned
bz,pn %xcc, .ci_blkcpy ! src offset in %o2 (last 4-bits)
nop
cmp %o2, 0x8
bg .ci_upper_double
nop
bl .ci_lower_double
nop
! Falls through when source offset is equal to 8 i.e.
! source is double word aligned.
sub %i1, %o2, %i1 ! align the src at 16 bytes.
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetch [%l0+0x0], #one_read
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
.ci_loop0:
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
prefetch [%l0+0x40], #one_read
stxa %l3, [%i0+0x0]%asi
stxa %l4, [%i0+0x8]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
stxa %l5, [%i0+0x10]%asi
stxa %l2, [%i0+0x18]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
stxa %l3, [%i0+0x20]%asi
stxa %l4, [%i0+0x28]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
stxa %l5, [%i0+0x30]%asi
stxa %l2, [%i0+0x38]%asi
add %l0, 0x40, %l0
subcc %i3, 0x40, %i3
bgu,pt %xcc, .ci_loop0
add %i0, 0x40, %i0
ba .ci_blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
.ci_lower_double:
sub %i1, %o2, %i1 ! align the src at 16 bytes.
sll %o2, 3, %o0 ! %o0 left shift
mov 0x40, %o1
sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetch [%l0+0x0], #one_read
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 ! partial data in %l2
! and %l3 has complete
! data
.ci_loop1:
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 ! %l4 has partial data
! for this read.
ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4
! into %l2 and %l3
prefetch [%l0+0x40], #one_read
stxa %l2, [%i0+0x0]%asi
stxa %l3, [%i0+0x8]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and
! %l4 from previous read
! into %l4 and %l5
stxa %l4, [%i0+0x10]%asi
stxa %l5, [%i0+0x18]%asi
! Repeat the same for next 32 bytes.
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
stxa %l2, [%i0+0x20]%asi
stxa %l3, [%i0+0x28]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
stxa %l4, [%i0+0x30]%asi
stxa %l5, [%i0+0x38]%asi
add %l0, 0x40, %l0
subcc %i3, 0x40, %i3
bgu,pt %xcc, .ci_loop1
add %i0, 0x40, %i0
ba .ci_blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
.ci_upper_double:
sub %i1, %o2, %i1 ! align the src at 16 bytes.
sub %o2, 0x8, %o0
sll %o0, 3, %o0 ! %o0 left shift
mov 0x40, %o1
sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetch [%l0+0x0], #one_read
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 ! partial data in %l3
! for this read and
! no data in %l2
.ci_loop2:
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 ! %l4 has complete data
! and %l5 has partial
ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5
! into %l3 and %l4
prefetch [%l0+0x40], #one_read
stxa %l3, [%i0+0x0]%asi
stxa %l4, [%i0+0x8]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with
! %l5 from previous read
! into %l5 and %l2
stxa %l5, [%i0+0x10]%asi
stxa %l2, [%i0+0x18]%asi
! Repeat the same for next 32 bytes.
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
stxa %l3, [%i0+0x20]%asi
stxa %l4, [%i0+0x28]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
stxa %l5, [%i0+0x30]%asi
stxa %l2, [%i0+0x38]%asi
add %l0, 0x40, %l0
subcc %i3, 0x40, %i3
bgu,pt %xcc, .ci_loop2
add %i0, 0x40, %i0
ba .ci_blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
.ci_blkcpy:
andn %i1, 0x3f, %o0 ! %o0 has block aligned source
prefetch [%o0+0x0], #one_read
1:
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
add %i1, 0x10, %i1
prefetch [%o0+0x40], #one_read
stxa %l0, [%i0+0x0]%asi
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6
add %i1, 0x10, %i1
stxa %l1, [%i0+0x8]%asi
stxa %l2, [%i0+0x10]%asi
stxa %l3, [%i0+0x18]%asi
stxa %l4, [%i0+0x20]%asi
stxa %l5, [%i0+0x28]%asi
stxa %l6, [%i0+0x30]%asi
stxa %l7, [%i0+0x38]%asi
add %o0, 0x40, %o0
subcc %i3, 0x40, %i3
bgu,pt %xcc, 1b
add %i0, 0x40, %i0
.ci_blkdone:
membar #Sync
! Copy as much rest of the data as double word copy.
.ci_dwcp:
cmp %i2, 0x8 ! Not enough bytes to copy as double
blu %ncc, .ci_dbdone
nop
andn %i2, 0x7, %i3 ! %i3 count is multiple of 8 bytes size
sub %i2, %i3, %i2 ! Residue bytes in %i2
andcc %i1, 7, %g1 ! is src aligned on a 8 bytes
bz %ncc, .ci_cpy_db
nop
sll %g1, 3, %l0 ! left shift
mov 0x40, %l1
sub %l1, %l0, %l1 ! right shift = (64 - left shift)
.ci_cpy_dbwd:
sub %i1, %g1, %i1 ! align the src at 8 bytes.
ldxa [%i1]ASI_USER, %o2
3:
add %i1, 0x8, %i1
ldxa [%i1]ASI_USER, %o4
ALIGN_DATA_EW(%o2, %o4, %l0, %l1, %o3)
stx %o2, [%i0]
mov %o4, %o2
subcc %i3, 0x8, %i3
bgu,pt %ncc, 3b
add %i0, 0x8, %i0
ba .ci_dbdone
add %i1, %g1, %i1
.ci_cpy_db:
ldxa [%i1]ASI_USER, %o2
stx %o2, [%i0]
add %i1, 0x8, %i1
subcc %i3, 0x8, %i3
bgu,pt %ncc, .ci_cpy_db
add %i0, 0x8, %i0
.ci_dbdone:
tst %i2
bz,pt %xcc, .copyin_exit
nop
! Copy the residue as byte copy
.ci_residue:
lduba [%i1]ASI_USER, %i4
stb %i4, [%i0]
inc %i1
deccc %i2
bgu %xcc, .ci_residue
inc %i0
.copyin_exit:
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
ret
restore %g0, 0, %o0
.copyin_err:
ldn [THREAD_REG + T_COPYOPS], %o4
brz %o4, 2f
nop
ldn [%o4 + CP_COPYIN], %g2
jmp %g2
nop
2:
retl
mov -1, %o0
SET_SIZE(copyin)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyin(const void *uaddr, void *kaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(xcopyin)
sethi %hi(.xcopyin_err), REAL_LOFAULT
b .do_copyin
or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
.xcopyin_err:
ldn [THREAD_REG + T_COPYOPS], %o4
brz %o4, 2f
nop
ldn [%o4 + CP_XCOPYIN], %g2
jmp %g2
nop
2:
retl
mov %g1, %o0
SET_SIZE(xcopyin)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyin_little(const void *uaddr, void *kaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(xcopyin_little)
sethi %hi(.little_err), %o4
ldn [THREAD_REG + T_LOFAULT], %o5
or %o4, %lo(.little_err), %o4
membar #Sync ! sync error barrier
stn %o4, [THREAD_REG + T_LOFAULT]
subcc %g0, %o2, %o3
add %o0, %o2, %o0
bz,pn %ncc, 2f ! check for zero bytes
sub %o2, 1, %o4
add %o1, %o2, %o1
lduba [%o0+%o3]ASI_AIUSL, %o4
1: stb %o4, [%o1+%o3]
inccc %o3
sub %o0, 2, %o0 ! get next byte
bcc,a,pt %ncc, 1b
lduba [%o0+%o3]ASI_AIUSL, %o4
2: membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g0, %o0 ! return (0)
.little_err:
membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g1, %o0
SET_SIZE(xcopyin_little)
#endif /* lint */
/*
* Copy a block of storage - must not overlap (from + len <= to).
* No fault handler installed (to be called under on_fault())
*/
#if defined(lint)
/* ARGSUSED */
void
copyin_noerr(const void *ufrom, void *kto, size_t count)
{}
#else /* lint */
ENTRY(copyin_noerr)
sethi %hi(.copyio_noerr), REAL_LOFAULT
b .do_copyin
or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
.copyio_noerr:
jmp SAVED_LOFAULT
nop
SET_SIZE(copyin_noerr)
#endif /* lint */
/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */
#if defined(lint)
/* ARGSUSED */
void
copyout_noerr(const void *kfrom, void *uto, size_t count)
{}
#else /* lint */
/*
 * copyout_noerr - kernel-to-user copy with no private fault handling:
 * REAL_LOFAULT is pointed at .copyio_noerr (defined with copyin_noerr
 * above, which jumps to SAVED_LOFAULT -- the caller's on_fault()
 * handler).  Joins the common .do_copyout path.
 */
ENTRY(copyout_noerr)
sethi %hi(.copyio_noerr), REAL_LOFAULT
b .do_copyout
or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT ! delay slot
SET_SIZE(copyout_noerr)
#endif /* lint */
#if defined(lint)
int use_hw_bcopy = 1;
int use_hw_bzero = 1;
uint_t hw_copy_limit_1 = 0x100;
uint_t hw_copy_limit_2 = 0x200;
uint_t hw_copy_limit_4 = 0x400;
uint_t hw_copy_limit_8 = 0x400;
#else /* !lint */
/*
 * Patchable tunables for the hardware-assisted copy/zero paths.
 *
 * use_hw_bcopy / use_hw_bzero: non-zero enables the block-init
 * (hardware) path; bzero below tests use_hw_bzero before taking its
 * block-clearing path.
 *
 * hw_copy_limit_1/2/4/8: size thresholds keyed by operand alignment
 * (1-, 2-, 4-, 8-byte) -- presumably the byte count above which the
 * hardware copy path is chosen; the code that reads them is not in
 * this fragment, so confirm against the bcopy implementation.
 */
.align 4
DGDEF(use_hw_bcopy)
.word 1
DGDEF(use_hw_bzero)
.word 1
DGDEF(hw_copy_limit_1)
.word 0x100
DGDEF(hw_copy_limit_2)
.word 0x200
DGDEF(hw_copy_limit_4)
.word 0x400
DGDEF(hw_copy_limit_8)
.word 0x400
.align 64
.section ".text"
#endif /* !lint */
/*
 * hwblkclr - clears block-aligned, block-multiple-sized regions
 * (implementation elided from this fragment).
 */
#ifdef lint
/*ARGSUSED*/
int
{
return(0);
}
#else /* lint */
2:
3:
#endif /* lint */
#ifdef lint
/* Copy 32 bytes of data from src to dst using physical addresses */
/*ARGSUSED*/
void
{}
#else /*!lint */
/*
* Copy 32 bytes of data from src (%o0) to dst (%o1)
* using physical addresses.
*/
#endif /* lint */
/*
* Zero a block of storage.
*
* uzero is used by the kernel to zero a block in user address space.
*/
/*
*
* Stores of fewer than 7 bytes are done byte by byte.
*
* For less than 15 bytes stores, align the address on 4 byte boundary.
* Then store as many 4-byte chunks, followed by trailing bytes.
*
* For sizes greater than 15 bytes, align the address on 8 byte boundary.
* if (count > 128) {
* store as many 8-bytes chunks to block align the address
* store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero)
* }
* Store as many 8-byte chunks, followed by trailing bytes.
*/
#if defined(lint)
/* ARGSUSED */
int
{ return(0); }
/* ARGSUSED */
void
{}
#else /* lint */
!
!
!
!
/*
* We got here because of a fault during kzero or if
* uzero or bzero was called with t_lofault non-zero.
* Otherwise we've already run screaming from the room.
* Errno value is in %g1. Note that we're here iff
* we did set t_lofault.
*/
.zeroerr:
!
!
!
!
1:
2:
!
!
retl ! return
3:
!
! We're here because %o5 was non-zero. It was non-zero
! because either LOFAULT_SET was present, a previous fault
! handler was present or both. In all cases we need to reset
! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
! before we either simply return the error or we invoke the
! previously specified handler.
!
be %ncc, 2b
stn %o5, [THREAD_REG + T_LOFAULT]
jmp %o5 ! goto real handler
nop
SET_SIZE(kzero)
SET_SIZE(uzero)
#endif /* lint */
/*
* Zero a block of storage.
*/
#if defined(lint)
/* ARGSUSED */
void
bzero(void *addr, size_t count)
{}
#else /* lint */
/*
 * bzero(addr %o0, count %o1) - zero count bytes of memory.
 *
 * If the thread already had a t_lofault handler (saved in %o5), the
 * private handler .zeroerr is installed for the duration; if t_lofault
 * was zero, control goes straight to .do_zero with it untouched.
 * (The sethi in the branch delay slot executes on both paths.)
 *
 * Size dispatch:
 *   count < 7  : byte loop (.byteclr)
 *   count < 15 : word-align, then 4-byte loop (.wdalign/.wdclr)
 *   otherwise  : 8-byte align; if count >= 0x80 and use_hw_bzero is
 *                set, 64-byte-align and clear with block-init stxa
 *                stores through %asi, then finish the remainder via
 *                .bzero_small/.byteclr.
 *
 * NOTE(review): the epilogue that restores t_lofault lies past the end
 * of this fragment.
 */
ENTRY(bzero)
wr %g0, ASI_P, %asi ! default to primary address space
ldn [THREAD_REG + T_LOFAULT], %o5 ! save old vector
tst %o5
bz,pt %ncc, .do_zero ! t_lofault was clear: install nothing
sethi %hi(.zeroerr), %o2
or %o2, %lo(.zeroerr), %o2
membar #Sync ! sync error barrier
stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector
.do_zero:
cmp %o1, 7
blu,pn %ncc, .byteclr ! < 7 bytes: clear byte by byte
nop
cmp %o1, 15
blu,pn %ncc, .wdalign ! < 15 bytes: take 4-byte path
nop
andcc %o0, 7, %o3 ! is add aligned on a 8 byte bound
bz,pt %ncc, .blkalign ! already double aligned
sub %o3, 8, %o3 ! -(bytes till double aligned)
add %o1, %o3, %o1 ! update o1 with new count
! Clear the -(%o3) leading bytes to reach 8-byte alignment.
1:
stba %g0, [%o0]%asi
inccc %o3
bl,pt %ncc, 1b
inc %o0
! Now address is double aligned
.blkalign:
cmp %o1, 0x80 ! check if there are 128 bytes to set
blu,pn %ncc, .bzero_small
mov %o1, %o3
! Honor the use_hw_bzero tunable: zero means skip the block path.
sethi %hi(use_hw_bzero), %o2
ld [%o2 + %lo(use_hw_bzero)], %o2
tst %o2
bz %ncc, .bzero_small
mov %o1, %o3
! Select the block-init ASI: if the incoming %asi was ASI_P (kernel),
! keep ASI_BLK_INIT_ST_QUAD_LDD_P; otherwise the annulled delay slot
! switches to the user variant ASI_BLK_INIT_QUAD_LDD_AIUS.
rd %asi, %o3
wr %g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
cmp %o3, ASI_P
bne,a %ncc, .algnblk
wr %g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
.algnblk:
andcc %o0, 0x3f, %o3 ! is block aligned?
bz,pt %ncc, .bzero_blk
sub %o3, 0x40, %o3 ! -(bytes till block aligned)
add %o1, %o3, %o1 ! o1 is the remainder
! Clear -(%o3) bytes till block aligned
1:
stxa %g0, [%o0]%asi
addcc %o3, 8, %o3
bl,pt %ncc, 1b
add %o0, 8, %o0
.bzero_blk:
and %o1, 0x3f, %o3 ! calc bytes left after blk clear
andn %o1, 0x3f, %o4 ! calc size of blocks in bytes
cmp %o4, 0x100 ! 256 bytes or more
blu,pn %ncc, 3f
nop
! 256-byte unrolled loop.  The first store to each 64-byte line
! (+0x00/+0x40/+0x80/+0xc0) is issued before the line's remaining
! stores -- NOTE(review): presumably to let the block-init ASI's
! initializing store touch each cache line first; confirm against the
! UltraSPARC-T1 block-init store documentation.
2:
stxa %g0, [%o0+0x0]%asi
stxa %g0, [%o0+0x40]%asi
stxa %g0, [%o0+0x80]%asi
stxa %g0, [%o0+0xc0]%asi
stxa %g0, [%o0+0x8]%asi
stxa %g0, [%o0+0x10]%asi
stxa %g0, [%o0+0x18]%asi
stxa %g0, [%o0+0x20]%asi
stxa %g0, [%o0+0x28]%asi
stxa %g0, [%o0+0x30]%asi
stxa %g0, [%o0+0x38]%asi
stxa %g0, [%o0+0x48]%asi
stxa %g0, [%o0+0x50]%asi
stxa %g0, [%o0+0x58]%asi
stxa %g0, [%o0+0x60]%asi
stxa %g0, [%o0+0x68]%asi
stxa %g0, [%o0+0x70]%asi
stxa %g0, [%o0+0x78]%asi
stxa %g0, [%o0+0x88]%asi
stxa %g0, [%o0+0x90]%asi
stxa %g0, [%o0+0x98]%asi
stxa %g0, [%o0+0xa0]%asi
stxa %g0, [%o0+0xa8]%asi
stxa %g0, [%o0+0xb0]%asi
stxa %g0, [%o0+0xb8]%asi
stxa %g0, [%o0+0xc8]%asi
stxa %g0, [%o0+0xd0]%asi
stxa %g0, [%o0+0xd8]%asi
stxa %g0, [%o0+0xe0]%asi
stxa %g0, [%o0+0xe8]%asi
stxa %g0, [%o0+0xf0]%asi
stxa %g0, [%o0+0xf8]%asi
sub %o4, 0x100, %o4
cmp %o4, 0x100
bgu,pt %ncc, 2b ! loop while more than 256 bytes remain
add %o0, 0x100, %o0
3:
! ... check if 64 bytes to set
cmp %o4, 0x40
blu %ncc, .bzero_blk_done
nop
! 64-byte clearing loop; re-enters at 3: to recheck the remainder.
4:
stxa %g0, [%o0+0x0]%asi
stxa %g0, [%o0+0x8]%asi
stxa %g0, [%o0+0x10]%asi
stxa %g0, [%o0+0x18]%asi
stxa %g0, [%o0+0x20]%asi
stxa %g0, [%o0+0x28]%asi
stxa %g0, [%o0+0x30]%asi
stxa %g0, [%o0+0x38]%asi
subcc %o4, 0x40, %o4
bgu,pt %ncc, 3b
add %o0, 0x40, %o0
.bzero_blk_done:
membar #Sync
!
! Undo asi register setting.
!
rd %asi, %o4
wr %g0, ASI_P, %asi
cmp %o4, ASI_BLK_INIT_ST_QUAD_LDD_P
bne,a %ncc, .bzero_small ! user case: finish with ASI_USER stores
wr %g0, ASI_USER, %asi
.bzero_small:
! Set the remaining doubles
subcc %o3, 8, %o3 ! Can we store any doubles?
blu,pn %ncc, .byteclr
and %o1, 7, %o1 ! calc bytes left after doubles
.dbclr:
stxa %g0, [%o0]%asi ! Clear the doubles
subcc %o3, 8, %o3
bgeu,pt %ncc, .dbclr
add %o0, 8, %o0
ba .byteclr
nop
.wdalign:
andcc %o0, 3, %o3 ! is add aligned on a word boundary
bz,pn %ncc, .wdclr
andn %o1, 3, %o3 ! create word sized count in %o3
dec %o1 ! decrement count
stba %g0, [%o0]%asi ! clear a byte
ba .wdalign
inc %o0 ! next byte
.wdclr:
sta %g0, [%o0]%asi ! 4-byte clearing loop
subcc %o3, 4, %o3
bnz,pt %ncc, .wdclr
inc 4, %o0
and %o1, 3, %o1 ! leftover count, if any
.byteclr:
! Set the leftover bytes
brz %o1, .bzero_exit
nop
7:
deccc %o1 ! byte clearing loop
stba %g0, [%o0]%asi
bgu,pt %ncc, 7b
inc %o0
.bzero_exit:
!
!
1:
#endif /* lint */