/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2004 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>
#if !defined(lint)
#include "assym.h"
#endif /* lint */
/*
* Pseudo-code to aid in understanding the control flow of the
* bcopy routine.
*
* On entry to bcopy:
*
* %l6 = curthread->t_lofault;
* used_block_copy = FALSE; ! %l6 |= 1
* if (%l6 != NULL) {
* curthread->t_lofault = .copyerr;
* caller_error_handler = TRUE ! %l6 |= 2
* }
*
 * if (length < VIS_COPY_THRESHOLD)
 *	goto regular_copy;
 *
 * if (!use_vis)
 *	goto regular_copy;
*
* if (curthread->t_lwp == NULL) {
* ! Kernel threads do not have pcb's in which to store
* ! the floating point state, disallow preemption during
* ! the copy.
* kpreempt_disable(curthread);
* }
*
* old_fprs = %fprs;
* old_gsr = %gsr;
* if (%fprs.fef) {
* ! If we need to save 4 blocks of fpregs then make sure
* ! the length is still appropriate for that extra overhead.
* if (length < (large_length + (64 * 4))) {
* if (curthread->t_lwp == NULL)
* kpreempt_enable(curthread);
* goto regular_copy;
* }
* %fprs.fef = 1;
* save current fpregs on stack using blockstore
* } else {
* %fprs.fef = 1;
* }
*
* used_block_copy = 1; ! %l6 |= 1
* do_blockcopy_here;
*
* In lofault handler:
* curthread->t_lofault = .copyerr2;
* Continue on with the normal exit handler
*
* On exit:
* call_kpreempt = 0;
* if (used_block_copy) { ! %l6 & 1
* %gsr = old_gsr;
* if (old_fprs & FPRS_FEF)
* restore fpregs from stack using blockload
* else
* zero fpregs
* %fprs = old_fprs;
* if (curthread->t_lwp == NULL) {
* kpreempt_enable(curthread);
* call_kpreempt = 1;
* }
* }
* curthread->t_lofault = (%l6 & ~3);
* if (call_kpreempt)
* kpreempt(%pil);
* return (0)
*
* In second lofault handler (.copyerr2):
 * We've tried to restore fp state from the stack and failed. To
 * prevent returning with a corrupted fp state, we panic.
*/
/*
* Notes on preserving existing fp state:
*
* When a copyOP decides to use fp we may have to preserve existing
* floating point state. It is not the caller's state that we need to
* preserve - the rest of the kernel does not use fp and, anyway, fp
* registers are volatile across a call. Some examples:
*
* - userland has fp state and is interrupted (device interrupt
* or trap) and within the interrupt/trap handling we use
* bcopy()
* - another (higher level) interrupt or trap handler uses bcopy
* while a bcopy from an earlier interrupt is still active
* - an asynchronous error trap occurs while fp state exists (in
* userland or in kernel copy) and the tl0 component of the handling
* uses bcopy
* - a user process with fp state incurs a copy-on-write fault and
* hwblkpagecopy always uses fp
*
* We therefore need a per-call place in which to preserve fp state -
* using our stack is ideal (and since fp copy cannot be leaf optimized
* because of calls it makes, this is no hardship).
*
* To make sure that floating point state is always saved and restored
* correctly, the following "big rules" must be followed when the floating
* point registers will be used:
*
* 1. %l6 always holds the caller's lofault handler. Also in this register,
* Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
* use. Bit 2 (BCOPY_FLAG) indicates that the call was to bcopy.
*
* 2. The FPUSED flag indicates that all FP state has been successfully stored
* on the stack. It should not be set until this save has been completed.
*
* 3. The FPUSED flag should not be cleared on exit until all FP state has
* been restored from the stack. If an error occurs while restoring
* data from the stack, the error handler can check this flag to see if
* a restore is necessary.
*
* 4. Code run under the new lofault handler must be kept to a minimum. In
* particular, any calls to kpreempt() should not be made until after the
* lofault handler has been restored.
*/
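/*
 * Illustrative sketch (not part of the build) of the ordering the big
 * rules above impose. The helper names are hypothetical; the real code
 * keeps the flag word in %l6 and the FP state on the stack frame:
 *
 *	l6 = curthread->t_lofault;		! rule 1
 *	save_fp_state_on_stack();		! may fault; flag not yet set
 *	l6 |= FPUSED_FLAG;			! rule 2: set only after save
 *	... do the copy ...
 *	restore_fp_state_from_stack();		! may fault; flag still set
 *	curthread->t_lofault = l6 & ~COPY_FLAGS; ! rule 3 satisfied, untag
 *	if (need_kpreempt)
 *		kpreempt(%pil);			! rule 4: after handler restore
 */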
/*
* This shadows sys/machsystm.h which can't be included due to the lack of
* _ASM guards in include files it references. Change it here, change it there.
*/
#define VIS_COPY_THRESHOLD 900
/*
 * Less than or equal to this number of bytes we will always copy byte-for-byte
*/
#define SMALL_LIMIT 7
/*
* Flags set in the lower bits of the t_lofault address:
* FPUSED_FLAG: The FP registers were in use and must be restored
* BCOPY_FLAG: Set for bcopy calls, cleared for kcopy calls
* COPY_FLAGS: Both of the above
*
* Other flags:
* KPREEMPT_FLAG: kpreempt needs to be called
*/
#define FPUSED_FLAG 1
#define BCOPY_FLAG 2
#define COPY_FLAGS (FPUSED_FLAG | BCOPY_FLAG)
#define KPREEMPT_FLAG 4
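/*
 * Illustrative sketch (not part of the build): t_lofault holds a code
 * address that is at least 4-byte aligned, so its two low bits are free
 * to carry the flags above. Tagging and untagging look like:
 *
 *	l6 = (uintptr_t)curthread->t_lofault | BCOPY_FLAG;	! tag
 *	l6 |= FPUSED_FLAG;			! once FP state is saved
 *	handler = (void *)(l6 & ~COPY_FLAGS);	! untag on the way out
 *
 * KPREEMPT_FLAG is never stored in t_lofault itself; it only travels in
 * a scratch register (%l1) on the exit paths.
 */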
/*
 * Size of stack frame in order to accommodate a 64-byte aligned
* floating-point register save area and 2 32-bit temp locations.
*/
#define HWCOPYFRAMESIZE ((64 * 5) + (2 * 4))
#define SAVED_FPREGS_OFFSET (64 * 5)
#define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 4)
#define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 4)
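/*
 * Illustrative arithmetic (not part of the build): the four blocks of
 * fpregs need 256 bytes at 64-byte alignment, but the frame itself is
 * not 64-byte aligned, so a fifth block of slack is reserved. The save
 * code below derives the aligned base as:
 *
 *	p = %fp + STACK_BIAS - 257;	! lowest possible start, minus one
 *	p &= ~63;			! round down to a block boundary
 *
 * which always yields 256 contiguous, 64-byte aligned bytes inside the
 * (64 * 5)-byte save area.
 */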
/*
* Common macros used by the various versions of the block copy
* routines in this file.
*/
#define FZERO \
fzero %f0 ;\
fzero %f2 ;\
faddd %f0, %f2, %f4 ;\
fmuld %f0, %f2, %f6 ;\
faddd %f0, %f2, %f8 ;\
fmuld %f0, %f2, %f10 ;\
faddd %f0, %f2, %f12 ;\
fmuld %f0, %f2, %f14 ;\
faddd %f0, %f2, %f16 ;\
fmuld %f0, %f2, %f18 ;\
faddd %f0, %f2, %f20 ;\
fmuld %f0, %f2, %f22 ;\
faddd %f0, %f2, %f24 ;\
fmuld %f0, %f2, %f26 ;\
faddd %f0, %f2, %f28 ;\
fmuld %f0, %f2, %f30 ;\
faddd %f0, %f2, %f32 ;\
fmuld %f0, %f2, %f34 ;\
faddd %f0, %f2, %f36 ;\
fmuld %f0, %f2, %f38 ;\
faddd %f0, %f2, %f40 ;\
fmuld %f0, %f2, %f42 ;\
faddd %f0, %f2, %f44 ;\
fmuld %f0, %f2, %f46 ;\
faddd %f0, %f2, %f48 ;\
fmuld %f0, %f2, %f50 ;\
faddd %f0, %f2, %f52 ;\
fmuld %f0, %f2, %f54 ;\
faddd %f0, %f2, %f56 ;\
fmuld %f0, %f2, %f58 ;\
faddd %f0, %f2, %f60 ;\
fmuld %f0, %f2, %f62
#define FALIGN_D0 \
faligndata %d0, %d2, %d48 ;\
faligndata %d2, %d4, %d50 ;\
faligndata %d4, %d6, %d52 ;\
faligndata %d6, %d8, %d54 ;\
faligndata %d8, %d10, %d56 ;\
faligndata %d10, %d12, %d58 ;\
faligndata %d12, %d14, %d60 ;\
faligndata %d14, %d16, %d62
#define FALIGN_D16 \
faligndata %d16, %d18, %d48 ;\
faligndata %d18, %d20, %d50 ;\
faligndata %d20, %d22, %d52 ;\
faligndata %d22, %d24, %d54 ;\
faligndata %d24, %d26, %d56 ;\
faligndata %d26, %d28, %d58 ;\
faligndata %d28, %d30, %d60 ;\
faligndata %d30, %d32, %d62
#define FALIGN_D32 \
faligndata %d32, %d34, %d48 ;\
faligndata %d34, %d36, %d50 ;\
faligndata %d36, %d38, %d52 ;\
faligndata %d38, %d40, %d54 ;\
faligndata %d40, %d42, %d56 ;\
faligndata %d42, %d44, %d58 ;\
faligndata %d44, %d46, %d60 ;\
faligndata %d46, %d0, %d62
#define FALIGN_D2 \
faligndata %d2, %d4, %d48 ;\
faligndata %d4, %d6, %d50 ;\
faligndata %d6, %d8, %d52 ;\
faligndata %d8, %d10, %d54 ;\
faligndata %d10, %d12, %d56 ;\
faligndata %d12, %d14, %d58 ;\
faligndata %d14, %d16, %d60 ;\
faligndata %d16, %d18, %d62
#define FALIGN_D18 \
faligndata %d18, %d20, %d48 ;\
faligndata %d20, %d22, %d50 ;\
faligndata %d22, %d24, %d52 ;\
faligndata %d24, %d26, %d54 ;\
faligndata %d26, %d28, %d56 ;\
faligndata %d28, %d30, %d58 ;\
faligndata %d30, %d32, %d60 ;\
faligndata %d32, %d34, %d62
#define FALIGN_D34 \
faligndata %d34, %d36, %d48 ;\
faligndata %d36, %d38, %d50 ;\
faligndata %d38, %d40, %d52 ;\
faligndata %d40, %d42, %d54 ;\
faligndata %d42, %d44, %d56 ;\
faligndata %d44, %d46, %d58 ;\
faligndata %d46, %d0, %d60 ;\
faligndata %d0, %d2, %d62
#define FALIGN_D4 \
faligndata %d4, %d6, %d48 ;\
faligndata %d6, %d8, %d50 ;\
faligndata %d8, %d10, %d52 ;\
faligndata %d10, %d12, %d54 ;\
faligndata %d12, %d14, %d56 ;\
faligndata %d14, %d16, %d58 ;\
faligndata %d16, %d18, %d60 ;\
faligndata %d18, %d20, %d62
#define FALIGN_D20 \
faligndata %d20, %d22, %d48 ;\
faligndata %d22, %d24, %d50 ;\
faligndata %d24, %d26, %d52 ;\
faligndata %d26, %d28, %d54 ;\
faligndata %d28, %d30, %d56 ;\
faligndata %d30, %d32, %d58 ;\
faligndata %d32, %d34, %d60 ;\
faligndata %d34, %d36, %d62
#define FALIGN_D36 \
faligndata %d36, %d38, %d48 ;\
faligndata %d38, %d40, %d50 ;\
faligndata %d40, %d42, %d52 ;\
faligndata %d42, %d44, %d54 ;\
faligndata %d44, %d46, %d56 ;\
faligndata %d46, %d0, %d58 ;\
faligndata %d0, %d2, %d60 ;\
faligndata %d2, %d4, %d62
#define FALIGN_D6 \
faligndata %d6, %d8, %d48 ;\
faligndata %d8, %d10, %d50 ;\
faligndata %d10, %d12, %d52 ;\
faligndata %d12, %d14, %d54 ;\
faligndata %d14, %d16, %d56 ;\
faligndata %d16, %d18, %d58 ;\
faligndata %d18, %d20, %d60 ;\
faligndata %d20, %d22, %d62
#define FALIGN_D22 \
faligndata %d22, %d24, %d48 ;\
faligndata %d24, %d26, %d50 ;\
faligndata %d26, %d28, %d52 ;\
faligndata %d28, %d30, %d54 ;\
faligndata %d30, %d32, %d56 ;\
faligndata %d32, %d34, %d58 ;\
faligndata %d34, %d36, %d60 ;\
faligndata %d36, %d38, %d62
#define FALIGN_D38 \
faligndata %d38, %d40, %d48 ;\
faligndata %d40, %d42, %d50 ;\
faligndata %d42, %d44, %d52 ;\
faligndata %d44, %d46, %d54 ;\
faligndata %d46, %d0, %d56 ;\
faligndata %d0, %d2, %d58 ;\
faligndata %d2, %d4, %d60 ;\
faligndata %d4, %d6, %d62
#define FALIGN_D8 \
faligndata %d8, %d10, %d48 ;\
faligndata %d10, %d12, %d50 ;\
faligndata %d12, %d14, %d52 ;\
faligndata %d14, %d16, %d54 ;\
faligndata %d16, %d18, %d56 ;\
faligndata %d18, %d20, %d58 ;\
faligndata %d20, %d22, %d60 ;\
faligndata %d22, %d24, %d62
#define FALIGN_D24 \
faligndata %d24, %d26, %d48 ;\
faligndata %d26, %d28, %d50 ;\
faligndata %d28, %d30, %d52 ;\
faligndata %d30, %d32, %d54 ;\
faligndata %d32, %d34, %d56 ;\
faligndata %d34, %d36, %d58 ;\
faligndata %d36, %d38, %d60 ;\
faligndata %d38, %d40, %d62
#define FALIGN_D40 \
faligndata %d40, %d42, %d48 ;\
faligndata %d42, %d44, %d50 ;\
faligndata %d44, %d46, %d52 ;\
faligndata %d46, %d0, %d54 ;\
faligndata %d0, %d2, %d56 ;\
faligndata %d2, %d4, %d58 ;\
faligndata %d4, %d6, %d60 ;\
faligndata %d6, %d8, %d62
#define FALIGN_D10 \
faligndata %d10, %d12, %d48 ;\
faligndata %d12, %d14, %d50 ;\
faligndata %d14, %d16, %d52 ;\
faligndata %d16, %d18, %d54 ;\
faligndata %d18, %d20, %d56 ;\
faligndata %d20, %d22, %d58 ;\
faligndata %d22, %d24, %d60 ;\
faligndata %d24, %d26, %d62
#define FALIGN_D26 \
faligndata %d26, %d28, %d48 ;\
faligndata %d28, %d30, %d50 ;\
faligndata %d30, %d32, %d52 ;\
faligndata %d32, %d34, %d54 ;\
faligndata %d34, %d36, %d56 ;\
faligndata %d36, %d38, %d58 ;\
faligndata %d38, %d40, %d60 ;\
faligndata %d40, %d42, %d62
#define FALIGN_D42 \
faligndata %d42, %d44, %d48 ;\
faligndata %d44, %d46, %d50 ;\
faligndata %d46, %d0, %d52 ;\
faligndata %d0, %d2, %d54 ;\
faligndata %d2, %d4, %d56 ;\
faligndata %d4, %d6, %d58 ;\
faligndata %d6, %d8, %d60 ;\
faligndata %d8, %d10, %d62
#define FALIGN_D12 \
faligndata %d12, %d14, %d48 ;\
faligndata %d14, %d16, %d50 ;\
faligndata %d16, %d18, %d52 ;\
faligndata %d18, %d20, %d54 ;\
faligndata %d20, %d22, %d56 ;\
faligndata %d22, %d24, %d58 ;\
faligndata %d24, %d26, %d60 ;\
faligndata %d26, %d28, %d62
#define FALIGN_D28 \
faligndata %d28, %d30, %d48 ;\
faligndata %d30, %d32, %d50 ;\
faligndata %d32, %d34, %d52 ;\
faligndata %d34, %d36, %d54 ;\
faligndata %d36, %d38, %d56 ;\
faligndata %d38, %d40, %d58 ;\
faligndata %d40, %d42, %d60 ;\
faligndata %d42, %d44, %d62
#define FALIGN_D44 \
faligndata %d44, %d46, %d48 ;\
faligndata %d46, %d0, %d50 ;\
faligndata %d0, %d2, %d52 ;\
faligndata %d2, %d4, %d54 ;\
faligndata %d4, %d6, %d56 ;\
faligndata %d6, %d8, %d58 ;\
faligndata %d8, %d10, %d60 ;\
faligndata %d10, %d12, %d62
#define FALIGN_D14 \
faligndata %d14, %d16, %d48 ;\
faligndata %d16, %d18, %d50 ;\
faligndata %d18, %d20, %d52 ;\
faligndata %d20, %d22, %d54 ;\
faligndata %d22, %d24, %d56 ;\
faligndata %d24, %d26, %d58 ;\
faligndata %d26, %d28, %d60 ;\
faligndata %d28, %d30, %d62
#define FALIGN_D30 \
faligndata %d30, %d32, %d48 ;\
faligndata %d32, %d34, %d50 ;\
faligndata %d34, %d36, %d52 ;\
faligndata %d36, %d38, %d54 ;\
faligndata %d38, %d40, %d56 ;\
faligndata %d40, %d42, %d58 ;\
faligndata %d42, %d44, %d60 ;\
faligndata %d44, %d46, %d62
#define FALIGN_D46 \
faligndata %d46, %d0, %d48 ;\
faligndata %d0, %d2, %d50 ;\
faligndata %d2, %d4, %d52 ;\
faligndata %d4, %d6, %d54 ;\
faligndata %d6, %d8, %d56 ;\
faligndata %d8, %d10, %d58 ;\
faligndata %d10, %d12, %d60 ;\
faligndata %d12, %d14, %d62
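/*
 * Illustrative model (not part of the build) of one FALIGN_Dn group
 * above. With %gsr programmed by alignaddr, each faligndata extracts
 * the misaligned 8 bytes spanning two adjacent source doubles, so a
 * single macro fills the aligned output block in %d48-%d62:
 *
 *	for (i = 0; i < 8; i++)
 *		out[i] = extract8(src[n + i], src[n + i + 1], gsr_align);
 *
 * (extract8 is hypothetical shorthand for faligndata.) The three input
 * register groups (%d0-%d14, %d16-%d30, %d32-%d46) rotate so one group
 * is consumed while the next is loaded - a simple software pipeline.
 */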
/*
* Copy a block of storage, returning an error code if `from' or
* `to' takes a kernel pagefault which cannot be resolved.
* Returns errno value on pagefault error, 0 if all ok
*/
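/*
 * Hypothetical caller sketch of the contract just described:
 *
 *	if ((error = kcopy(from, to, count)) != 0)
 *		return (error);		! errno, e.g. EFAULT on a pagefault
 */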
#if defined(lint)
/* ARGSUSED */
int
kcopy(const void *from, void *to, size_t count)
{ return(0); }
#else /* lint */
.seg ".text"
.align 4
ENTRY(kcopy)
save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
set .copyerr, %l6 ! copyerr is lofault value
ldn [THREAD_REG + T_LOFAULT], %l7 ! save existing handler
membar #Sync ! sync error barrier (see copy.s)
stn %l6, [THREAD_REG + T_LOFAULT] ! set t_lofault
!
! Note that we carefully do *not* flag the setting of
! t_lofault.
!
ba,pt %ncc, .do_copy ! common code
mov %l7, %l6
/*
 * We got here because of a fault during kcopy, or during bcopy if a
 * fault handler existed when bcopy was called.
* Errno value is in %g1.
*/
.copyerr:
set .copyerr2, %l1
membar #Sync ! sync error barrier
stn %l1, [THREAD_REG + T_LOFAULT] ! set t_lofault
btst FPUSED_FLAG, %l6
bz %icc, 1f
and %l6, BCOPY_FLAG, %l1 ! copy flag to %l1
membar #Sync
ld [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
wr %o2, 0, %gsr
ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
btst FPRS_FEF, %o3
bz %icc, 4f
nop
! restore fpregs from stack
membar #Sync
add %fp, STACK_BIAS - 257, %o2
and %o2, -64, %o2
ldda [%o2]ASI_BLK_P, %d0
add %o2, 64, %o2
ldda [%o2]ASI_BLK_P, %d16
add %o2, 64, %o2
ldda [%o2]ASI_BLK_P, %d32
add %o2, 64, %o2
ldda [%o2]ASI_BLK_P, %d48
membar #Sync
ba,pt %ncc, 2f
wr %o3, 0, %fprs ! restore fprs
4:
FZERO ! zero all of the fpregs
wr %o3, 0, %fprs ! restore fprs
2: ldn [THREAD_REG + T_LWP], %o2
tst %o2
bnz,pt %ncc, 1f
nop
ldsb [THREAD_REG + T_PREEMPT], %l0
deccc %l0
bnz,pn %ncc, 1f
stb %l0, [THREAD_REG + T_PREEMPT]
! Check for a kernel preemption request
ldn [THREAD_REG + T_CPU], %l0
ldub [%l0 + CPU_KPRUNRUN], %l0
tst %l0
bnz,a,pt %ncc, 1f ! Need to call kpreempt?
or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag
!
! Need to cater for the different expectations of kcopy
! and bcopy. kcopy will *always* set a t_lofault handler.
! If it fires, we're expected to just return the error code
! and *not* to invoke any existing error handler. As far as
! bcopy is concerned, we only set t_lofault if there was an
! existing lofault handler. In that case we're expected to
! invoke the previously existing handler after resetting the
! t_lofault value.
!
1:
andn %l6, COPY_FLAGS, %l6 ! remove flags from lofault address
membar #Sync ! sync error barrier
stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
! call kpreempt if necessary
btst KPREEMPT_FLAG, %l1
bz,pt %icc, 2f
nop
call kpreempt
rdpr %pil, %o0 ! pass %pil
2:
btst BCOPY_FLAG, %l1
bnz,pn %ncc, 3f
nop
ret
restore %g1, 0, %o0
3:
!
! We're here via bcopy. There *must* have been an error handler
! in place otherwise we would have died a nasty death already.
!
jmp %l6 ! goto real handler
restore %g0, 0, %o0 ! dispose of copy window
/*
* We got here because of a fault in .copyerr. We can't safely restore fp
* state, so we panic.
*/
fp_panic_msg:
.asciz "Unable to restore fp state after copy operation"
.align 4
.copyerr2:
set fp_panic_msg, %o0
call panic
nop
SET_SIZE(kcopy)
#endif /* lint */
/*
* Copy a block of storage - must not overlap (from + len <= to).
* Registers: l6 - saved t_lofault
*
* Copy a page of memory.
* Assumes double word alignment and a count >= 256.
*/
#if defined(lint)
/* ARGSUSED */
void
bcopy(const void *from, void *to, size_t count)
{}
#else /* lint */
ENTRY(bcopy)
save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
ldn [THREAD_REG + T_LOFAULT], %l6 ! save t_lofault
tst %l6
!
! We've already captured whether t_lofault was zero on entry.
! We need to mark ourselves as being from bcopy since both
! kcopy and bcopy use the same code path. If BCOPY_FLAG is set
! and the saved lofault was zero, we won't reset lofault on
! returning.
!
or %l6, BCOPY_FLAG, %l6
bz,pt %ncc, .do_copy
sethi %hi(.copyerr), %o2
or %o2, %lo(.copyerr), %o2
membar #Sync ! sync error barrier
stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector
.do_copy:
cmp %i2, 12 ! for small counts
blu %ncc, .bytecp ! just copy bytes
.empty
cmp %i2, VIS_COPY_THRESHOLD ! for large counts
blu,pt %ncc, .bcb_punt
.empty
!
! Check to see if VIS acceleration is enabled
!
sethi %hi(use_hw_bcopy), %o2
ld [%o2 + %lo(use_hw_bcopy)], %o2
tst %o2
bz,pn %icc, .bcb_punt
nop
subcc %i1, %i0, %i3
bneg,a,pn %ncc, 1f
neg %i3
1:
/*
* Compare against 256 since we should be checking block addresses
* and (dest & ~63) - (src & ~63) can be 3 blocks even if
* src = dest + (64 * 3) + 63.
*/
cmp %i3, 256
blu,pn %ncc, .bcb_punt
nop
ldn [THREAD_REG + T_LWP], %o3
tst %o3
bnz,pt %ncc, 1f
nop
! kpreempt_disable();
ldsb [THREAD_REG + T_PREEMPT], %o2
inc %o2
stb %o2, [THREAD_REG + T_PREEMPT]
1:
rd %fprs, %o2 ! check for unused fp
st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
btst FPRS_FEF, %o2
bz,a %icc, .do_blockcopy
wr %g0, FPRS_FEF, %fprs
.bcb_fpregs_inuse:
cmp %i2, VIS_COPY_THRESHOLD+(64*4) ! for large counts (larger
bgeu %ncc, 1f ! if we have to save the fpregs)
nop
tst %o3
bnz,pt %ncc, .bcb_punt
nop
ldsb [THREAD_REG + T_PREEMPT], %l0
deccc %l0
bnz,pn %icc, .bcb_punt
stb %l0, [THREAD_REG + T_PREEMPT]
! Check for a kernel preemption request
ldn [THREAD_REG + T_CPU], %l0
ldub [%l0 + CPU_KPRUNRUN], %l0
tst %l0
bz,pt %icc, .bcb_punt
nop
! Attempt to preempt
call kpreempt
rdpr %pil, %o0 ! pass %pil
ba,pt %ncc, .bcb_punt
nop
1:
wr %g0, FPRS_FEF, %fprs
! save in-use fpregs on stack
membar #Sync
add %fp, STACK_BIAS - 257, %o2
and %o2, -64, %o2
stda %d0, [%o2]ASI_BLK_P
add %o2, 64, %o2
stda %d16, [%o2]ASI_BLK_P
add %o2, 64, %o2
stda %d32, [%o2]ASI_BLK_P
add %o2, 64, %o2
stda %d48, [%o2]ASI_BLK_P
membar #Sync
.do_blockcopy:
membar #StoreStore|#StoreLoad|#LoadStore
rd %gsr, %o2
st %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
! Set the lower bit in the saved t_lofault to indicate
! that we need to clear the %fprs register on the way
! out
or %l6, FPUSED_FLAG, %l6
! Swap src/dst since the code below is memcpy code
! and memcpy/bcopy have different calling sequences
mov %i1, %i5
mov %i0, %i1
mov %i5, %i0
!!! This code is nearly identical to the version in the sun4u
!!! libc_psr. Most bugfixes made to that file should be
!!! merged into this routine.
andcc %i0, 7, %o3
bz,pt %ncc, blkcpy
sub %o3, 8, %o3
neg %o3
sub %i2, %o3, %i2
! Align Destination on double-word boundary
2: ldub [%i1], %o4
inc %i1
inc %i0
deccc %o3
bgu %ncc, 2b
stb %o4, [%i0 - 1]
blkcpy:
andcc %i0, 63, %i3
bz,pn %ncc, blalign ! now block aligned
sub %i3, 64, %i3
neg %i3 ! bytes till block aligned
sub %i2, %i3, %i2 ! update %i2 with new count
! Copy %i3 bytes till dst is block (64 byte) aligned. Use
! double word copies.
alignaddr %i1, %g0, %g1
ldd [%g1], %d0
add %g1, 8, %g1
6:
ldd [%g1], %d2
add %g1, 8, %g1
subcc %i3, 8, %i3
faligndata %d0, %d2, %d8
std %d8, [%i0]
add %i1, 8, %i1
bz,pn %ncc, blalign
add %i0, 8, %i0
ldd [%g1], %d0
add %g1, 8, %g1
subcc %i3, 8, %i3
faligndata %d2, %d0, %d8
std %d8, [%i0]
add %i1, 8, %i1
bgu,pn %ncc, 6b
add %i0, 8, %i0
blalign:
membar #StoreLoad
! %i2 = total length
! %i3 = blocks (length - 64) / 64
! %i4 = doubles remaining (length - blocks)
sub %i2, 64, %i3
andn %i3, 63, %i3
sub %i2, %i3, %i4
andn %i4, 7, %i4
sub %i4, 16, %i4
sub %i2, %i4, %i2
sub %i2, %i3, %i2
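!
! Worked example (illustrative): with %i2 = 300 at this point the
! split above gives %i3 = (300 - 64) & ~63 = 192 block bytes,
! %i4 = ((300 - 192) & ~7) - 16 = 88 doubleword bytes, and
! %i2 = 300 - 88 - 192 = 20 trailing bytes copied byte-for-byte.
!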
andn %i1, 0x3f, %l7 ! blk aligned address
alignaddr %i1, %g0, %g0 ! gen %gsr
srl %i1, 3, %l5 ! bits 3,4,5 are now least sig in %l5
andcc %l5, 7, %i5 ! mask everything except bits 1, 2 and 3
add %i1, %i4, %i1
add %i1, %i3, %i1
ldda [%l7]ASI_BLK_P, %d0
add %l7, 64, %l7
ldda [%l7]ASI_BLK_P, %d16
add %l7, 64, %l7
ldda [%l7]ASI_BLK_P, %d32
add %l7, 64, %l7
sub %i3, 128, %i3
! switch statement to get us to the right 8 byte blk within a
! 64 byte block
cmp %i5, 4
bgeu,a hlf
cmp %i5, 6
cmp %i5, 2
bgeu,a sqtr
nop
cmp %i5, 1
be,a seg1
nop
ba,pt %ncc, seg0
nop
sqtr:
be,a seg2
nop
ba,pt %ncc, seg3
nop
hlf:
bgeu,a fqtr
nop
cmp %i5, 5
be,a seg5
nop
ba,pt %ncc, seg4
nop
fqtr:
be,a seg6
nop
ba,pt %ncc, seg7
nop
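!
! Dispatch sketch (illustrative): the compares above form a
! three-level binary search on %i5, equivalent to
! "switch (%i5) { case 0: goto seg0; ... case 7: goto seg7; }".
!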
seg0:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D0
ldda [%l7]ASI_BLK_P, %d0
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D16
ldda [%l7]ASI_BLK_P, %d16
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D32
ldda [%l7]ASI_BLK_P, %d32
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, seg0
0:
FALIGN_D16
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D32
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd0
add %i0, 64, %i0
1:
FALIGN_D32
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D0
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd16
add %i0, 64, %i0
2:
FALIGN_D0
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D16
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd32
add %i0, 64, %i0
seg1:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D2
ldda [%l7]ASI_BLK_P, %d0
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D18
ldda [%l7]ASI_BLK_P, %d16
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D34
ldda [%l7]ASI_BLK_P, %d32
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, seg1
0:
FALIGN_D18
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D34
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd2
add %i0, 64, %i0
1:
FALIGN_D34
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D2
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd18
add %i0, 64, %i0
2:
FALIGN_D2
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D18
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd34
add %i0, 64, %i0
seg2:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D4
ldda [%l7]ASI_BLK_P, %d0
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D20
ldda [%l7]ASI_BLK_P, %d16
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D36
ldda [%l7]ASI_BLK_P, %d32
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, seg2
0:
FALIGN_D20
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D36
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd4
add %i0, 64, %i0
1:
FALIGN_D36
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D4
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd20
add %i0, 64, %i0
2:
FALIGN_D4
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D20
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd36
add %i0, 64, %i0
seg3:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D6
ldda [%l7]ASI_BLK_P, %d0
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D22
ldda [%l7]ASI_BLK_P, %d16
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D38
ldda [%l7]ASI_BLK_P, %d32
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, seg3
0:
FALIGN_D22
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D38
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd6
add %i0, 64, %i0
1:
FALIGN_D38
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D6
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd22
add %i0, 64, %i0
2:
FALIGN_D6
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D22
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd38
add %i0, 64, %i0
seg4:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D8
ldda [%l7]ASI_BLK_P, %d0
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D24
ldda [%l7]ASI_BLK_P, %d16
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D40
ldda [%l7]ASI_BLK_P, %d32
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, seg4
0:
FALIGN_D24
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D40
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd8
add %i0, 64, %i0
1:
FALIGN_D40
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D8
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd24
add %i0, 64, %i0
2:
FALIGN_D8
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D24
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd40
add %i0, 64, %i0
seg5:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D10
ldda [%l7]ASI_BLK_P, %d0
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D26
ldda [%l7]ASI_BLK_P, %d16
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D42
ldda [%l7]ASI_BLK_P, %d32
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, seg5
0:
FALIGN_D26
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D42
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd10
add %i0, 64, %i0
1:
FALIGN_D42
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D10
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd26
add %i0, 64, %i0
2:
FALIGN_D10
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D26
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd42
add %i0, 64, %i0
seg6:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D12
ldda [%l7]ASI_BLK_P, %d0
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D28
ldda [%l7]ASI_BLK_P, %d16
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D44
ldda [%l7]ASI_BLK_P, %d32
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, seg6
0:
FALIGN_D28
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D44
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd12
add %i0, 64, %i0
1:
FALIGN_D44
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D12
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd28
add %i0, 64, %i0
2:
FALIGN_D12
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D28
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd44
add %i0, 64, %i0
seg7:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D14
ldda [%l7]ASI_BLK_P, %d0
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D30
ldda [%l7]ASI_BLK_P, %d16
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D46
ldda [%l7]ASI_BLK_P, %d32
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, seg7
0:
FALIGN_D30
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D46
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd14
add %i0, 64, %i0
1:
FALIGN_D46
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D14
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd30
add %i0, 64, %i0
2:
FALIGN_D14
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D30
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, blkd46
add %i0, 64, %i0
!
! dribble out the last partial block
!
blkd0:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d0, %d2, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd2:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d2, %d4, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd4:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d4, %d6, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd6:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d6, %d8, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd8:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d8, %d10, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd10:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d10, %d12, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd12:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d12, %d14, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd14:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
fsrc1 %d14, %d0
ba,a,pt %ncc, blkleft
blkd16:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d16, %d18, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd18:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d18, %d20, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd20:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d20, %d22, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd22:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d22, %d24, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd24:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d24, %d26, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd26:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d26, %d28, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd28:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d28, %d30, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd30:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
fsrc1 %d30, %d0
ba,a,pt %ncc, blkleft
blkd32:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d32, %d34, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd34:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d34, %d36, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd36:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d36, %d38, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd38:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d38, %d40, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd40:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d40, %d42, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd42:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d42, %d44, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd44:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
faligndata %d44, %d46, %d48
std %d48, [%i0]
add %i0, 8, %i0
blkd46:
subcc %i4, 8, %i4
blu,pn %ncc, blkdone
fsrc1 %d46, %d0
blkleft:
1:
ldd [%l7], %d2
add %l7, 8, %l7
subcc %i4, 8, %i4
faligndata %d0, %d2, %d8
std %d8, [%i0]
blu,pn %ncc, blkdone
add %i0, 8, %i0
ldd [%l7], %d0
add %l7, 8, %l7
subcc %i4, 8, %i4
faligndata %d2, %d0, %d8
std %d8, [%i0]
bgeu,pt %ncc, 1b
add %i0, 8, %i0
blkdone:
tst %i2
bz,pt %ncc, .bcb_exit
and %l3, 0x4, %l3 ! fprs.du = fprs.dl = 0
7: ldub [%i1], %i4
inc %i1
inc %i0
deccc %i2
bgu,pt %ncc, 7b
stb %i4, [%i0 - 1]
.bcb_exit:
membar #StoreLoad|#StoreStore
btst FPUSED_FLAG, %l6
bz %icc, 1f
and %l6, COPY_FLAGS, %l1 ! Store flags in %l1
! We can't clear the flags from %l6 yet.
! If there's an error, .copyerr will
! need them
ld [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
wr %o2, 0, %gsr
ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
btst FPRS_FEF, %o3
bz %icc, 4f
nop
! restore fpregs from stack
membar #Sync
add %fp, STACK_BIAS - 257, %o2
and %o2, -64, %o2
ldda [%o2]ASI_BLK_P, %d0
add %o2, 64, %o2
ldda [%o2]ASI_BLK_P, %d16
add %o2, 64, %o2
ldda [%o2]ASI_BLK_P, %d32
add %o2, 64, %o2
ldda [%o2]ASI_BLK_P, %d48
membar #Sync
ba,pt %ncc, 2f
wr %o3, 0, %fprs ! restore fprs
4:
FZERO ! zero all of the fpregs
wr %o3, 0, %fprs ! restore fprs
2: ldn [THREAD_REG + T_LWP], %o2
tst %o2
bnz,pt %ncc, 1f
nop
ldsb [THREAD_REG + T_PREEMPT], %l0
deccc %l0
bnz,pn %ncc, 1f
stb %l0, [THREAD_REG + T_PREEMPT]
! Check for a kernel preemption request
ldn [THREAD_REG + T_CPU], %l0
ldub [%l0 + CPU_KPRUNRUN], %l0
tst %l0
bnz,a,pt %ncc, 1f ! Need to call kpreempt?
or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag
1:
btst BCOPY_FLAG, %l1
bz,pn %icc, 3f
andncc %l6, COPY_FLAGS, %l6
!
! Here via bcopy. Check to see if the handler was NULL.
! If so, just return quietly. Otherwise, reset the
! handler and go home.
!
bnz,pn %ncc, 3f
nop
!
! Null handler. Check for kpreempt flag, call if necessary,
! then return.
!
btst KPREEMPT_FLAG, %l1
bz,pt %icc, 2f
nop
call kpreempt
rdpr %pil, %o0 ! pass %pil
2:
ret
restore %g0, 0, %o0
!
! Here via kcopy or bcopy with a handler. Reset the
! fault handler.
!
3:
membar #Sync
stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
! call kpreempt if necessary
btst KPREEMPT_FLAG, %l1
bz,pt %icc, 4f
nop
call kpreempt
rdpr %pil, %o0
4:
ret
restore %g0, 0, %o0
.bcb_punt:
!
! use aligned transfers where possible
!
xor %i0, %i1, %o4 ! xor from and to address
btst 7, %o4 ! if lower three bits zero
bz %icc, .aldoubcp ! can align on double boundary
.empty ! assembler complains about label
xor %i0, %i1, %o4 ! xor from and to address
btst 3, %o4 ! if lower two bits zero
bz %icc, .alwordcp ! can align on word boundary
btst 3, %i0 ! delay slot, from address unaligned?
!
! use aligned reads and writes where possible
! this differs from wordcp in that it copes
! with odd alignment between source and destination
! using word reads and writes with the proper shifts
! in between to align transfers to and from memory
! i0 - src address, i1 - dest address, i2 - count
! i3, i4 - tmps used for generating complete word
! i5 (word to write)
! l0 size in bits of upper part of source word (US)
! l1 size in bits of lower part of source word (LS = 32 - US)
! l2 size in bits of upper part of destination word (UD)
! l3 size in bits of lower part of destination word (LD = 32 - UD)
! l4 number of bytes leftover after aligned transfers complete
! l5 the number 32
!
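!
! Illustrative model (not part of the build) of the shift/merge
! technique the register roster above supports:
!
!	dst_word = (prev_src << LS) | (next_src >> US);
!
! i.e. each aligned destination word is assembled from the tail
! of the previous source word and the head of the next one.
!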
mov 32, %l5 ! load an oft-needed constant
bz .align_dst_only
btst 3, %i1 ! is destination address aligned?
clr %i4 ! clear registers used in either case
bz %icc, .align_src_only
clr %l0
!
! both source and destination addresses are unaligned
!
1: ! align source
ldub [%i0], %i3 ! read a byte from source address
add %i0, 1, %i0 ! increment source address
or %i4, %i3, %i4 ! or in with previous bytes (if any)
btst 3, %i0 ! is source aligned?
add %l0, 8, %l0 ! increment size of upper source (US)
bnz,a 1b
sll %i4, 8, %i4 ! make room for next byte
sub %l5, %l0, %l1 ! generate shift left count (LS)
sll %i4, %l1, %i4 ! prepare to get rest
ld [%i0], %i3 ! read a word
add %i0, 4, %i0 ! increment source address
srl %i3, %l0, %i5 ! upper src bits into lower dst bits
or %i4, %i5, %i5 ! merge
mov 24, %l3 ! align destination
1:
srl %i5, %l3, %i4 ! prepare to write a single byte
stb %i4, [%i1] ! write a byte
add %i1, 1, %i1 ! increment destination address
sub %i2, 1, %i2 ! decrement count
btst 3, %i1 ! is destination aligned?
bnz,a 1b
sub %l3, 8, %l3 ! delay slot, decrement shift count (LD)
sub %l5, %l3, %l2 ! generate shift left count (UD)
sll %i5, %l2, %i5 ! move leftover into upper bytes
cmp %l2, %l0 ! cmp # reqd to fill dst w old src left
bgu %ncc, .more_needed ! need more to fill than we have
nop
sll %i3, %l1, %i3 ! clear upper used byte(s)
srl %i3, %l1, %i3
! get the odd bytes between alignments
sub %l0, %l2, %l0 ! regenerate shift count
sub %l5, %l0, %l1 ! generate new shift left count (LS)
and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0
andn %i2, 3, %i2 ! # of aligned bytes that can be moved
srl %i3, %l0, %i4
or %i5, %i4, %i5
st %i5, [%i1] ! write a word
subcc %i2, 4, %i2 ! decrement count
bz %ncc, .unalign_out
add %i1, 4, %i1 ! increment destination address
b 2f
sll %i3, %l1, %i5 ! get leftover into upper bits
.more_needed:
sll %i3, %l0, %i3 ! save remaining byte(s)
srl %i3, %l0, %i3
sub %l2, %l0, %l1 ! regenerate shift count
sub %l5, %l1, %l0 ! generate new shift left count
sll %i3, %l1, %i4 ! move to fill empty space
b 3f
or %i5, %i4, %i5 ! merge to complete word
!
! the source address is aligned and destination is not
!
.align_dst_only:
ld [%i0], %i4 ! read a word
add %i0, 4, %i0 ! increment source address
mov 24, %l0 ! initial shift alignment count
1:
srl %i4, %l0, %i3 ! prepare to write a single byte
stb %i3, [%i1] ! write a byte
add %i1, 1, %i1 ! increment destination address
sub %i2, 1, %i2 ! decrement count
btst 3, %i1 ! is destination aligned?
bnz,a 1b
sub %l0, 8, %l0 ! delay slot, decrement shift count
.xfer:
sub %l5, %l0, %l1 ! generate shift left count
sll %i4, %l1, %i5 ! get leftover
3:
and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0
andn %i2, 3, %i2 ! # of aligned bytes that can be moved
2:
ld [%i0], %i3 ! read a source word
add %i0, 4, %i0 ! increment source address
srl %i3, %l0, %i4 ! upper src bits into lower dst bits
or %i5, %i4, %i5 ! merge with upper dest bits (leftover)
st %i5, [%i1] ! write a destination word
subcc %i2, 4, %i2 ! decrement count
bz %ncc, .unalign_out ! check if done
add %i1, 4, %i1 ! increment destination address
b 2b ! loop
sll %i3, %l1, %i5 ! get leftover
.unalign_out:
tst %l4 ! any bytes leftover?
bz %ncc, .cpdone
.empty ! allow next instruction in delay slot
1:
sub %l0, 8, %l0 ! decrement shift
srl %i3, %l0, %i4 ! upper src byte into lower dst byte
stb %i4, [%i1] ! write a byte
subcc %l4, 1, %l4 ! decrement count
bz %ncc, .cpdone ! done?
add %i1, 1, %i1 ! increment destination
tst %l0 ! any more previously read bytes
bnz %ncc, 1b ! we have leftover bytes
mov %l4, %i2 ! delay slot, mv cnt where dbytecp wants
b .dbytecp ! let dbytecp do the rest
sub %i0, %i1, %i0 ! i0 gets the difference of src and dst
!
! the destination address is aligned and the source is not
!
.align_src_only:
ldub [%i0], %i3 ! read a byte from source address
add %i0, 1, %i0 ! increment source address
or %i4, %i3, %i4 ! or in with previous bytes (if any)
btst 3, %i0 ! is source aligned?
add %l0, 8, %l0 ! increment shift count (US)
bnz,a .align_src_only
sll %i4, 8, %i4 ! make room for next byte
b,a .xfer
!
! if from address unaligned for double-word moves,
! move bytes till it is; if count is < 56 it could take
! longer to align the thing than to do the transfer
! in word size chunks right away
!
.aldoubcp:
cmp %i2, 56 ! if count < 56, use wordcp, it takes
blu,a %ncc, .alwordcp ! longer to align doubles than words
mov 3, %o0 ! mask for word alignment
call .alignit ! copy bytes until aligned
mov 7, %o0 ! mask for double alignment
!
! source and destination are now double-word aligned
! i3 has aligned count returned by alignit
!
and %i2, 7, %i2 ! unaligned leftover count
sub %i0, %i1, %i0 ! i0 gets the difference of src and dst
5:
ldx [%i0+%i1], %o4 ! read from address
stx %o4, [%i1] ! write at destination address
subcc %i3, 8, %i3 ! dec count
bgu %ncc, 5b
add %i1, 8, %i1 ! delay slot, inc to address
cmp %i2, 4 ! see if we can copy a word
blu %ncc, .dbytecp ! if 3 or less bytes use bytecp
.empty
!
! for leftover bytes we fall into wordcp, if needed
!
.wordcp:
and %i2, 3, %i2 ! unaligned leftover count
5:
ld [%i0+%i1], %o4 ! read from address
st %o4, [%i1] ! write at destination address
subcc %i3, 4, %i3 ! dec count
bgu %ncc, 5b
add %i1, 4, %i1 ! delay slot, inc to address
b,a .dbytecp
! we come here to align copies on word boundaries
.alwordcp:
call .alignit ! go word-align it
mov 3, %o0 ! bits that must be zero to be aligned
b .wordcp
sub %i0, %i1, %i0 ! i0 gets the difference of src and dst
!
! byte copy, works with any alignment
!
.bytecp:
b .dbytecp
sub %i0, %i1, %i0 ! i0 gets difference of src and dst
!
! differenced byte copy, works with any alignment
! assumes dest in %i1 and (source - dest) in %i0
!
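!
! Illustrative C model (not part of the build) of the loop below:
!
!	diff = src - dst;		! precomputed in %i0
!	while (count-- > 0) {
!		*dst = *(dst + diff);
!		dst++;
!	}
!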
1:
stb %o4, [%i1] ! write to address
inc %i1 ! inc to address
.dbytecp:
deccc %i2 ! dec count
bgeu,a %ncc, 1b ! loop till done
ldub [%i0+%i1], %o4 ! read from address
!
! FPUSED_FLAG will not have been set in any path leading to
! this point. No need to deal with it.
!
.cpdone:
btst BCOPY_FLAG, %l6
bz,pn %icc, 2f
andncc %l6, BCOPY_FLAG, %l6
!
! Here via bcopy. Check to see if the handler was NULL.
! If so, just return quietly. Otherwise, reset the
! handler and go home.
!
bnz,pn %ncc, 2f
nop
!
! Null handler.
!
ret
restore %g0, 0, %o0
!
! Here via kcopy or bcopy with a handler. Reset the
! fault handler.
!
2:
membar #Sync
stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
ret
restore %g0, 0, %o0 ! return (0)
/*
* Common code used to align transfers on word and doubleword
 * boundaries. Aligns source and destination and returns a count
* of aligned bytes to transfer in %i3
*/
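/*
 * Illustrative model (not part of the build); %o0 holds the alignment
 * mask on entry and %i3 returns the aligned byte count:
 *
 *	while (from & mask) {
 *		*to++ = *from++;
 *		count--;
 *	}
 *	aligned = count & ~mask;	! returned in %i3
 */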
1:
inc %i0 ! inc from
stb %o4, [%i1] ! write a byte
inc %i1 ! inc to
dec %i2 ! dec count
.alignit:
btst %o0, %i0 ! %o0 is bit mask to check for alignment
bnz,a 1b
ldub [%i0], %o4 ! read next byte
retl
andn %i2, %o0, %i3 ! return size of aligned bytes
SET_SIZE(bcopy)
#endif /* lint */
/*
* Block copy with possibly overlapped operands.
*/
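/*
 * Illustrative decision logic (not part of the build):
 *
 *	if (count <= labs(from - to))
 *		bcopy(from, to, count);	! regions cannot collide
 *	else if (from >= to)
 *		copy forwards;		! .ov_fwd
 *	else
 *		copy backwards;		! .ov_bkwd
 */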
#if defined(lint)
/*ARGSUSED*/
void
ovbcopy(const void *from, void *to, size_t count)
{}
#else /* lint */
ENTRY(ovbcopy)
tst %o2 ! check count
bgu,a %ncc, 1f ! nothing to do or bad arguments
subcc %o0, %o1, %o3 ! difference of from and to address
retl ! return
nop
1:
bneg,a %ncc, 2f
neg %o3 ! if < 0, make it positive
2: cmp %o2, %o3 ! cmp size and abs(from - to)
bleu %ncc, bcopy ! if size <= abs(diff): use bcopy,
.empty ! no overlap
cmp %o0, %o1 ! compare from and to addresses
blu %ncc, .ov_bkwd ! if from < to, copy backwards
nop
!
! Copy forwards.
!
.ov_fwd:
ldub [%o0], %o3 ! read from address
inc %o0 ! inc from address
stb %o3, [%o1] ! write to address
deccc %o2 ! dec count
bgu %ncc, .ov_fwd ! loop till done
inc %o1 ! inc to address
retl ! return
nop
!
! Copy backwards.
!
.ov_bkwd:
deccc %o2 ! dec count
ldub [%o0 + %o2], %o3 ! get byte at end of src
bgu %ncc, .ov_bkwd ! loop till done
stb %o3, [%o1 + %o2] ! delay slot, store at end of dst
retl ! return
nop
SET_SIZE(ovbcopy)
#endif /* lint */
/*
* hwblkpagecopy()
*
* Copies exactly one page. This routine assumes the caller (ppcopy)
* has already disabled kernel preemption and has checked
* use_hw_bcopy.
*/
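/*
 * Hypothetical caller sketch of the contract above (ppcopy is the real
 * caller; the guards shown are illustrative):
 *
 *	kpreempt_disable(curthread);
 *	if (use_hw_bcopy)
 *		hwblkpagecopy(src, dst);	! copies exactly one page
 *	kpreempt_enable(curthread);
 */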
#ifdef lint
/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{ }
#else /* lint */
ENTRY(hwblkpagecopy)
! get another window w/space for three aligned blocks of saved fpregs
save %sp, -SA(MINFRAME + 4*64), %sp
! %i0 - source address (arg)
! %i1 - destination address (arg)
! %i2 - length of region (not arg)
! %l0 - saved fprs
! %l1 - pointer to saved fpregs
rd %fprs, %l0 ! check for unused fp
btst FPRS_FEF, %l0
bz 1f
membar #Sync
! save in-use fpregs on stack
add %fp, STACK_BIAS - 193, %l1
and %l1, -64, %l1
stda %d0, [%l1]ASI_BLK_P
add %l1, 64, %l3
stda %d16, [%l3]ASI_BLK_P
add %l3, 64, %l3
stda %d32, [%l3]ASI_BLK_P
membar #Sync
1: wr %g0, FPRS_FEF, %fprs
ldda [%i0]ASI_BLK_P, %d0
add %i0, 64, %i0
set PAGESIZE - 64, %i2
2: ldda [%i0]ASI_BLK_P, %d16
fsrc1 %d0, %d32
fsrc1 %d2, %d34
fsrc1 %d4, %d36
fsrc1 %d6, %d38
fsrc1 %d8, %d40
fsrc1 %d10, %d42
fsrc1 %d12, %d44
fsrc1 %d14, %d46
stda %d32, [%i1]ASI_BLK_P
add %i0, 64, %i0
subcc %i2, 64, %i2
bz,pn %ncc, 3f
add %i1, 64, %i1
ldda [%i0]ASI_BLK_P, %d0
fsrc1 %d16, %d32
fsrc1 %d18, %d34
fsrc1 %d20, %d36
fsrc1 %d22, %d38
fsrc1 %d24, %d40
fsrc1 %d26, %d42
fsrc1 %d28, %d44
fsrc1 %d30, %d46
stda %d32, [%i1]ASI_BLK_P
add %i0, 64, %i0
sub %i2, 64, %i2
ba,pt %ncc, 2b
add %i1, 64, %i1
3: membar #Sync
btst FPRS_FEF, %l0
bz 4f
stda %d16, [%i1]ASI_BLK_P
! restore fpregs from stack
membar #Sync
ldda [%l1]ASI_BLK_P, %d0
add %l1, 64, %l3
ldda [%l3]ASI_BLK_P, %d16
add %l3, 64, %l3
ldda [%l3]ASI_BLK_P, %d32
4: wr %l0, 0, %fprs ! restore fprs
membar #Sync
ret
restore %g0, 0, %o0
SET_SIZE(hwblkpagecopy)
#endif /* lint */
/*
* Transfer data to and from user space -
 * Note that these routines can cause faults.
 * It is assumed that the kernel has nothing mapped
 * below KERNELBASE in the virtual address space.
*
* Note that copyin(9F) and copyout(9F) are part of the
* DDI/DKI which specifies that they return '-1' on "errors."
*
* Sigh.
*
 * So there are two extremely similar routines - xcopyin() and xcopyout()
* which return the errno that we've faithfully computed. This
* allows other callers (e.g. uiomove(9F)) to work correctly.
* Given that these are used pretty heavily, we expand the calling
* sequences inline for all flavours (rather than making wrappers).
*
* There are also stub routines for xcopyout_little and xcopyin_little,
* which currently are intended to handle requests of <= 16 bytes from
* do_unaligned. Future enhancement to make them handle 8k pages efficiently
* is left as an exercise...
*/
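/*
 * Hypothetical caller sketch of the two return conventions described
 * above:
 *
 *	if (copyout(kaddr, uaddr, len) != 0)
 *		return (EFAULT);	! DDI flavor: -1 means failure
 *	if ((error = xcopyout(kaddr, uaddr, len)) != 0)
 *		return (error);		! x flavor: real errno preserved
 */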
/*
* Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
*
* General theory of operation:
*
* The only difference between default_copy{in,out} and
* default_xcopy{in,out} is in the error handling routine they invoke
* when a memory access error is seen. default_xcopyOP returns the errno
* while default_copyOP returns -1 (see above). copy{in,out}_noerr set
* a special flag (by oring the value 2 into the fault handler address)
* if they are called with a fault handler already in place. That flag
* causes the default handlers to trampoline to the previous handler
* upon an error.
*
* None of the copyops routines grab a window until it's decided that
* we need to do a HW block copy operation. This saves a window
* spill/fill when we're called during socket ops. The typical IO
* path won't cause spill/fill traps.
*
* This code uses a set of 4 limits for the maximum size that will
* be copied given a particular input/output address alignment.
 * The default limits are:
*
* single byte aligned - 900 (hw_copy_limit_1)
* two byte aligned - 1800 (hw_copy_limit_2)
* four byte aligned - 3600 (hw_copy_limit_4)
* eight byte aligned - 7200 (hw_copy_limit_8)
*
* If the value for a particular limit is zero, the copy will be done
* via the copy loops rather than VIS.
*
* Flow:
*
* If count == zero return zero.
*
 * Store the previous lofault handler into %g6.
* Place our secondary lofault handler into %g5.
* Place the address of our nowindow fault handler into %o3.
* Place the address of the windowed fault handler into %o4.
* --> We'll use this handler if we end up grabbing a window
* --> before we use VIS instructions.
*
* If count is less than or equal to SMALL_LIMIT (7) we
* always do a byte for byte copy.
*
* If count is > SMALL_LIMIT, we check the alignment of the input
* and output pointers. Based on the alignment we check count
* against a soft limit of VIS_COPY_THRESHOLD (900 on spitfire). If
* we're larger than VIS_COPY_THRESHOLD, we check against a limit based
* on detected alignment. If we exceed the alignment value we copy
* via VIS instructions.
*
 * If we don't exceed one of the limits, we store -count in %o3
 * and the number of chunks (8, 4, 2 or 1 byte) operated
 * on in our basic copy loop in %o2. Following this we branch
 * to the appropriate copy loop and copy that many chunks.
 * Since we've been adding the chunk size to %o3 each time through
 * as well as decrementing %o2, we can tell if any data
 * is left to be copied by examining %o3. If that is zero, we're
* done and can go home. If not, we figure out what the largest
* chunk size left to be copied is and branch to that copy loop
* unless there's only one byte left. We load that as we're
* branching to code that stores it just before we return.
*
* There is one potential situation in which we start to do a VIS
* copy but decide to punt and return to the copy loops. There is
* (in the default configuration) a window of 256 bytes between
* the single byte aligned copy limit and what VIS treats as its
* minimum if floating point is in use in the calling app. We need
* to be prepared to handle this. See the .small_copyOP label for
* details.
*
* Fault handlers are invoked if we reference memory that has no
* current mapping. All forms share the same copyio_fault handler.
* This routine handles fixing up the stack and general housecleaning.
* Each copy operation has a simple fault handler that is then called
 * to do the work specific to the individual operation. The handlers
* for default_copyOP and copyOP_noerr are found at the end of
* default_copyout. The handlers for default_xcopyOP are found at the
* end of xdefault_copyin.
*/
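/*
 * Illustrative summary (not part of the build) of the limit selection
 * described above; hw_copy_limit_N stands for the limit matching the
 * detected alignment:
 *
 *	if (count <= SMALL_LIMIT)
 *		byte_for_byte_copy();
 *	else if (count <= VIS_COPY_THRESHOLD ||
 *	    hw_copy_limit_N == 0 || count <= hw_copy_limit_N)
 *		aligned_copy_loop();
 *	else
 *		vis_block_copy();
 */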
/*
* Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
*/
#if defined(lint)
/*ARGSUSED*/
int
copyout(const void *kaddr, void *uaddr, size_t count)
{ return (0); }
#else /* lint */
/*
* We save the arguments in the following registers in case of a fault:
* kaddr - %g2
* uaddr - %g3
* count - %g4
*/
#define SAVE_SRC %g2
#define SAVE_DST %g3
#define SAVE_COUNT %g4
#define REAL_LOFAULT %g5
#define SAVED_LOFAULT %g6
/*
* Generic copyio fault handler. This is the first line of defense when a
* fault occurs in (x)copyin/(x)copyout. In order for this to function
* properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
* This allows us to share common code for all the flavors of the copy
* operations, including the _noerr versions.
*
* Note that this function will restore the original input parameters before
* calling REAL_LOFAULT. So the real handler can vector to the appropriate
* member of the t_copyop structure, if needed.
*/
ENTRY(copyio_fault)
btst FPUSED_FLAG, SAVED_LOFAULT
bz 1f
andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
membar #Sync
ld [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
wr %o2, 0, %gsr ! restore gsr
ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
btst FPRS_FEF, %o3
bz 4f
nop
! restore fpregs from stack
membar #Sync
add %fp, STACK_BIAS - 257, %o2
and %o2, -64, %o2
ldda [%o2]ASI_BLK_P, %d0
add %o2, 64, %o2
ldda [%o2]ASI_BLK_P, %d16
add %o2, 64, %o2
ldda [%o2]ASI_BLK_P, %d32
add %o2, 64, %o2
ldda [%o2]ASI_BLK_P, %d48
membar #Sync
ba,pt %ncc, 1f
wr %o3, 0, %fprs ! restore fprs
4:
FZERO ! zero all of the fpregs
wr %o3, 0, %fprs ! restore fprs
1:
restore
mov SAVE_SRC, %o0
mov SAVE_DST, %o1
jmp REAL_LOFAULT
mov SAVE_COUNT, %o2
SET_SIZE(copyio_fault)
ENTRY(copyio_fault_nowindow)
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
mov SAVE_SRC, %o0
mov SAVE_DST, %o1
jmp REAL_LOFAULT
mov SAVE_COUNT, %o2
SET_SIZE(copyio_fault_nowindow)
ENTRY(copyout)
sethi %hi(.copyout_err), REAL_LOFAULT
or REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
.do_copyout:
!
! Check the length and bail if zero.
!
tst %o2
bnz,pt %ncc, 1f
nop
retl
clr %o0
1:
sethi %hi(copyio_fault), %o4
or %o4, %lo(copyio_fault), %o4
sethi %hi(copyio_fault_nowindow), %o3
ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
or %o3, %lo(copyio_fault_nowindow), %o3
membar #Sync
stn %o3, [THREAD_REG + T_LOFAULT]
mov %o0, SAVE_SRC
mov %o1, SAVE_DST
mov %o2, SAVE_COUNT
!
! Check to see if we're more than SMALL_LIMIT (7 bytes).
! Run in leaf mode, using the %o regs as our input regs.
!
subcc %o2, SMALL_LIMIT, %o3
bgu,a,pt %ncc, .dco_ns
or %o0, %o1, %o3
!
! What was previously ".small_copyout"
! Do full differenced copy.
!
.dcobcp:
sub %g0, %o2, %o3 ! negate count
add %o0, %o2, %o0 ! make %o0 point at the end
add %o1, %o2, %o1 ! make %o1 point at the end
ba,pt %ncc, .dcocl
ldub [%o0 + %o3], %o4 ! load first byte
!
! %o0 and %o1 point at the end and remain pointing at the end
! of their buffers. We pull things out by adding %o3 (which is
! the negation of the length) to the buffer end which gives us
! the current location in the buffers. By incrementing %o3 we walk
! through both buffers without having to bump each buffer's
! pointer. A very fast 4 instruction loop.
!
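!
! Illustrative C model (not part of the build) of the loop below:
!
!	o3 = -count; o0 += count; o1 += count;
!	do {
!		o1[o3] = o0[o3];	! store is via ASI_USER
!	} while (++o3 < 0);
!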
.align 16
.dcocl:
stba %o4, [%o1 + %o3]ASI_USER
inccc %o3
bl,a,pt %ncc, .dcocl
ldub [%o0 + %o3], %o4
!
! We're done. Go home.
!
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
retl
clr %o0
!
! Try aligned copies from here.
!
.dco_ns:
! %o0 = kernel addr (to be copied from)
! %o1 = user addr (to be copied to)
! %o2 = length
! %o3 = %o1 | %o2 (used for alignment checking)
! %o4 is alternate lofault handler (copyio_fault)
! SAVED_LOFAULT (%g6) is the original lofault handler
!
! See if we're single byte aligned. If we are, check the
! limit for single byte copies. If we're smaller or equal,
! bounce to the byte for byte copy loop. Otherwise do it in
! HW (if enabled).
!
btst 1, %o3
bz,pt %icc, .dcoh8
btst 7, %o3
!
! Single byte aligned. Do we do it via HW or via
! byte for byte? Do a quick no memory reference
! check to pick up small copies.
!
subcc %o2, VIS_COPY_THRESHOLD, %o3
bleu,pt %ncc, .dcobcp
sethi %hi(hw_copy_limit_1), %o3
!
! Big enough that we need to check the HW limit for
! this size copy.
!
ld [%o3 + %lo(hw_copy_limit_1)], %o3
!
! Is HW copy on? If not, do everything byte for byte.
!
tst %o3
bz,pn %icc, .dcobcp
subcc %o3, %o2, %o3
!
! If we're less than or equal to the single byte copy limit,
! bop to the copy loop.
!
bge,pt %ncc, .dcobcp
nop
!
! We're big enough and copy is on. Do it with HW.
!
ba,pt %ncc, .big_copyout
nop
.dcoh8:
!
! 8 byte aligned?
!
bnz,a %ncc, .dcoh4
btst 3, %o3
!
! See if we're in the "small range".
! If so, go off and do the copy.
! If not, load the hard limit. %o3 is
! available for reuse.
!
subcc %o2, VIS_COPY_THRESHOLD, %o3
bleu,pt %ncc, .dcos8
sethi %hi(hw_copy_limit_8), %o3
ld [%o3 + %lo(hw_copy_limit_8)], %o3
!
! If it's zero, there's no HW bcopy.
! Bop off to the aligned copy.
!
tst %o3
bz,pn %icc, .dcos8
subcc %o3, %o2, %o3
!
! We're negative if our size is larger than hw_copy_limit_8.
!
bge,pt %ncc, .dcos8
nop
!
! HW assist is on and we're large enough. Do it.
!
ba,pt %ncc, .big_copyout
nop
.dcos8:
!
! Housekeeping for copy loops. Uses same idea as in the byte for
! byte copy loop above.
!
add %o0, %o2, %o0
add %o1, %o2, %o1
sub %g0, %o2, %o3
ba,pt %ncc, .dodebc
srl %o2, 3, %o2 ! Number of 8 byte chunks to copy
!
! 4 byte aligned?
!
.dcoh4:
bnz,pn %ncc, .dcoh2
!
! See if we're in the "small range".
! If so, go off and do the copy.
! If not, load the hard limit. %o3 is
! available for reuse.
!
subcc %o2, VIS_COPY_THRESHOLD, %o3
bleu,pt %ncc, .dcos4
sethi %hi(hw_copy_limit_4), %o3
ld [%o3 + %lo(hw_copy_limit_4)], %o3
!
! If it's zero, there's no HW bcopy.
! Bop off to the aligned copy.
!
tst %o3
bz,pn %icc, .dcos4
subcc %o3, %o2, %o3
!
! We're negative if our size is larger than hw_copy_limit_4.
!
bge,pt %ncc, .dcos4
nop
!
! HW assist is on and we're large enough. Do it.
!
ba,pt %ncc, .big_copyout
nop
.dcos4:
add %o0, %o2, %o0
add %o1, %o2, %o1
sub %g0, %o2, %o3
ba,pt %ncc, .dodfbc
srl %o2, 2, %o2 ! Number of 4 byte chunks to copy
!
! We must be 2 byte aligned. Off we go.
! The check for small copies was done in the
! delay at .dcoh4
!
.dcoh2:
ble %ncc, .dcos2
sethi %hi(hw_copy_limit_2), %o3
ld [%o3 + %lo(hw_copy_limit_2)], %o3
tst %o3
bz,pn %icc, .dcos2
subcc %o3, %o2, %o3
bge,pt %ncc, .dcos2
nop
!
! HW is on and we're big enough. Do it.
!
ba,pt %ncc, .big_copyout
nop
.dcos2:
add %o0, %o2, %o0
add %o1, %o2, %o1
sub %g0, %o2, %o3
ba,pt %ncc, .dodtbc
srl %o2, 1, %o2 ! Number of 2 byte chunks to copy
.small_copyout:
!
! Why are we doing this AGAIN? There are certain conditions in
! big_copyout that will cause us to forego the HW assisted copies
! and bounce back to a non-HW assisted copy. This dispatches those
! copies. Note that we branch around this in the main line code.
!
! We make no check for limits or HW enablement here. We've
! already been told that we're a poster child so just go off
! and do it.
!
or %o0, %o1, %o3
btst 1, %o3
bnz %icc, .dcobcp ! Most likely
btst 7, %o3
bz %icc, .dcos8
btst 3, %o3
bz %icc, .dcos4
nop
ba,pt %ncc, .dcos2
nop
.align 32
.dodebc:
ldx [%o0 + %o3], %o4
deccc %o2
stxa %o4, [%o1 + %o3]ASI_USER
bg,pt %ncc, .dodebc
addcc %o3, 8, %o3
!
! End of copy loop. Check to see if we're done. Most
! eight byte aligned copies end here.
!
bz,pt %ncc, .dcofh
nop
!
! Something is left - do it byte for byte.
!
ba,pt %ncc, .dcocl
ldub [%o0 + %o3], %o4 ! load next byte
!
! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
!
.align 32
.dodfbc:
lduw [%o0 + %o3], %o4
deccc %o2
sta %o4, [%o1 + %o3]ASI_USER
bg,pt %ncc, .dodfbc
addcc %o3, 4, %o3
!
! End of copy loop. Check to see if we're done. Most
! four byte aligned copies end here.
!
bz,pt %ncc, .dcofh
nop
!
! Something is left. Do it byte for byte.
!
ba,pt %ncc, .dcocl
ldub [%o0 + %o3], %o4 ! load next byte
!
! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
! copy.
!
.align 32
.dodtbc:
lduh [%o0 + %o3], %o4
deccc %o2
stha %o4, [%o1 + %o3]ASI_USER
bg,pt %ncc, .dodtbc
addcc %o3, 2, %o3
!
! End of copy loop. Anything left?
!
bz,pt %ncc, .dcofh
nop
!
! Deal with the last byte
!
ldub [%o0 + %o3], %o4
stba %o4, [%o1 + %o3]ASI_USER
.dcofh:
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
clr %o0
.big_copyout:
!
! Are we using the FP registers?
!
rd %fprs, %o3 ! check for unused fp
btst FPRS_FEF, %o3
bnz %icc, .copyout_fpregs_inuse
nop
!
! We're going to go off and do a block copy.
! Switch fault handlers and grab a window. We
! don't do a membar #Sync since we've done only
! kernel data to this point.
!
stn %o4, [THREAD_REG + T_LOFAULT]
save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
!
! %o3 is now %i3. Save original %fprs.
!
st %i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
ba,pt %ncc, .do_block_copyout ! Not in use. Go off and do it.
wr %g0, FPRS_FEF, %fprs ! enable fp: fef = 1, du = dl = 0
!
.copyout_fpregs_inuse:
!
! We're here if the FP regs are in use. Need to see if the request
! exceeds our suddenly larger minimum.
!
cmp %i2, VIS_COPY_THRESHOLD+(64*4) ! for large counts (larger minimum when in-use fpregs must also be saved)
bl %ncc, .small_copyout
nop
!
! We're going to go off and do a block copy.
! Change to the heavy duty fault handler and grab a window first.
!
stn %o4, [THREAD_REG + T_LOFAULT]
save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
st %i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
!
! save in-use fpregs on stack
!
wr %g0, FPRS_FEF, %fprs
membar #Sync
add %fp, STACK_BIAS - 257, %o2
and %o2, -64, %o2
stda %d0, [%o2]ASI_BLK_P
add %o2, 64, %o2
stda %d16, [%o2]ASI_BLK_P
add %o2, 64, %o2
stda %d32, [%o2]ASI_BLK_P
add %o2, 64, %o2
stda %d48, [%o2]ASI_BLK_P
membar #Sync
.do_block_copyout:
membar #StoreStore|#StoreLoad|#LoadStore
rd %gsr, %o2
st %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
! Set the lower bit in the saved t_lofault to indicate
! that we need to clear the %fprs register on the way
! out
or SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
! Swap src/dst since the code below is memcpy code
! and memcpy/bcopy have different calling sequences
mov %i1, %i5
mov %i0, %i1
mov %i5, %i0
!!! This code is nearly identical to the version in the sun4u
!!! libc_psr. Most bugfixes made to that file should be
!!! merged into this routine.
andcc %i0, 7, %o3
bz %ncc, copyout_blkcpy
sub %o3, 8, %o3
neg %o3
sub %i2, %o3, %i2
! Align Destination on double-word boundary
2: ldub [%i1], %o4
inc %i1
stba %o4, [%i0]ASI_USER
deccc %o3
bgu %ncc, 2b
inc %i0
copyout_blkcpy:
andcc %i0, 63, %i3
bz,pn %ncc, copyout_blalign ! now block aligned
sub %i3, 64, %i3
neg %i3 ! bytes till block aligned
sub %i2, %i3, %i2 ! update %i2 with new count
! Copy %i3 bytes till dst is block (64 byte) aligned. use
! double word copies.
alignaddr %i1, %g0, %g1
ldd [%g1], %d0
add %g1, 8, %g1
6:
ldd [%g1], %d2
add %g1, 8, %g1
subcc %i3, 8, %i3
faligndata %d0, %d2, %d8
stda %d8, [%i0]ASI_USER
add %i1, 8, %i1
bz,pn %ncc, copyout_blalign
add %i0, 8, %i0
ldd [%g1], %d0
add %g1, 8, %g1
subcc %i3, 8, %i3
faligndata %d2, %d0, %d8
stda %d8, [%i0]ASI_USER
add %i1, 8, %i1
bgu,pn %ncc, 6b
add %i0, 8, %i0
copyout_blalign:
membar #StoreLoad
! %i2 = total length
! %i3 = blocks (length - 64) / 64
! %i4 = doubles remaining (length - blocks)
sub %i2, 64, %i3
andn %i3, 63, %i3
sub %i2, %i3, %i4
andn %i4, 7, %i4
sub %i4, 16, %i4
sub %i2, %i4, %i2
sub %i2, %i3, %i2
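!
! The split just computed, as C (illustrative only):
!
!	blocks = (len - 64) & ~63;	   /* %i3: bytes done as blocks  */
!	dbls = ((len - blocks) & ~7) - 16; /* %i4: bytes done as doubles */
!	rest = len - dbls - blocks;	   /* %i2: trailing byte count   */
!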
andn %i1, 0x3f, %l7 ! blk aligned address
alignaddr %i1, %g0, %g0 ! gen %gsr
srl %i1, 3, %l5 ! bits 3,4,5 are now least sig in %l5
andcc %l5, 7, %i5 ! mask everything except bits 0,1,2
add %i1, %i4, %i1
add %i1, %i3, %i1
ldda [%l7]ASI_BLK_P, %d0
add %l7, 64, %l7
ldda [%l7]ASI_BLK_P, %d16
add %l7, 64, %l7
ldda [%l7]ASI_BLK_P, %d32
add %l7, 64, %l7
sub %i3, 128, %i3
! switch statement to get us to the right 8 byte blk within a
! 64 byte block
cmp %i5, 4
bgeu,a copyout_hlf
cmp %i5, 6
cmp %i5, 2
bgeu,a copyout_sqtr
nop
cmp %i5, 1
be,a copyout_seg1
nop
ba,pt %ncc, copyout_seg0
nop
copyout_sqtr:
be,a copyout_seg2
nop
ba,pt %ncc, copyout_seg3
nop
copyout_hlf:
bgeu,a copyout_fqtr
nop
cmp %i5, 5
be,a copyout_seg5
nop
ba,pt %ncc, copyout_seg4
nop
copyout_fqtr:
be,a copyout_seg6
nop
ba,pt %ncc, copyout_seg7
nop
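!
! The segment code below is a three-deep software pipeline (an
! illustrative reading, not from the source): the 64 byte groups
! %d0, %d16 and %d32 rotate through the roles of "low" (being
! drained), "high" (next data) and "pre" (being refilled), while
! the FALIGN_Dn macros assemble each aligned 64 bytes into %d48:
!
!	for (;;) {
!		d48 = falign(low, high);  /* FALIGN_Dn         */
!		low = next_block();	  /* ldda ...ASI_BLK_P */
!		store_block(d48);	  /* stda ...BLK_AIUS  */
!		rotate(low, high, pre);
!	}
!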
copyout_seg0:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D0
ldda [%l7]ASI_BLK_P, %d0
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D16
ldda [%l7]ASI_BLK_P, %d16
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D32
ldda [%l7]ASI_BLK_P, %d32
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, copyout_seg0
0:
FALIGN_D16
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D32
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd0
add %i0, 64, %i0
1:
FALIGN_D32
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D0
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd16
add %i0, 64, %i0
2:
FALIGN_D0
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D16
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd32
add %i0, 64, %i0
copyout_seg1:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D2
ldda [%l7]ASI_BLK_P, %d0
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D18
ldda [%l7]ASI_BLK_P, %d16
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D34
ldda [%l7]ASI_BLK_P, %d32
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, copyout_seg1
0:
FALIGN_D18
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D34
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd2
add %i0, 64, %i0
1:
FALIGN_D34
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D2
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd18
add %i0, 64, %i0
2:
FALIGN_D2
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D18
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd34
add %i0, 64, %i0
copyout_seg2:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D4
ldda [%l7]ASI_BLK_P, %d0
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D20
ldda [%l7]ASI_BLK_P, %d16
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D36
ldda [%l7]ASI_BLK_P, %d32
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, copyout_seg2
0:
FALIGN_D20
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D36
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd4
add %i0, 64, %i0
1:
FALIGN_D36
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D4
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd20
add %i0, 64, %i0
2:
FALIGN_D4
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D20
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd36
add %i0, 64, %i0
copyout_seg3:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D6
ldda [%l7]ASI_BLK_P, %d0
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D22
ldda [%l7]ASI_BLK_P, %d16
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D38
ldda [%l7]ASI_BLK_P, %d32
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, copyout_seg3
0:
FALIGN_D22
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D38
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd6
add %i0, 64, %i0
1:
FALIGN_D38
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D6
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd22
add %i0, 64, %i0
2:
FALIGN_D6
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D22
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd38
add %i0, 64, %i0
copyout_seg4:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D8
ldda [%l7]ASI_BLK_P, %d0
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D24
ldda [%l7]ASI_BLK_P, %d16
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D40
ldda [%l7]ASI_BLK_P, %d32
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, copyout_seg4
0:
FALIGN_D24
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D40
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd8
add %i0, 64, %i0
1:
FALIGN_D40
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D8
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd24
add %i0, 64, %i0
2:
FALIGN_D8
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D24
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd40
add %i0, 64, %i0
copyout_seg5:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D10
ldda [%l7]ASI_BLK_P, %d0
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D26
ldda [%l7]ASI_BLK_P, %d16
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D42
ldda [%l7]ASI_BLK_P, %d32
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, copyout_seg5
0:
FALIGN_D26
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D42
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd10
add %i0, 64, %i0
1:
FALIGN_D42
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D10
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd26
add %i0, 64, %i0
2:
FALIGN_D10
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D26
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd42
add %i0, 64, %i0
copyout_seg6:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D12
ldda [%l7]ASI_BLK_P, %d0
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D28
ldda [%l7]ASI_BLK_P, %d16
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D44
ldda [%l7]ASI_BLK_P, %d32
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, copyout_seg6
0:
FALIGN_D28
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D44
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd12
add %i0, 64, %i0
1:
FALIGN_D44
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D12
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd28
add %i0, 64, %i0
2:
FALIGN_D12
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D28
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd44
add %i0, 64, %i0
copyout_seg7:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D14
ldda [%l7]ASI_BLK_P, %d0
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D30
ldda [%l7]ASI_BLK_P, %d16
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D46
ldda [%l7]ASI_BLK_P, %d32
stda %d48, [%i0]ASI_BLK_AIUS
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, copyout_seg7
0:
FALIGN_D30
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D46
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd14
add %i0, 64, %i0
1:
FALIGN_D46
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D14
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd30
add %i0, 64, %i0
2:
FALIGN_D14
stda %d48, [%i0]ASI_BLK_AIUS
add %i0, 64, %i0
membar #Sync
FALIGN_D30
stda %d48, [%i0]ASI_BLK_AIUS
ba,pt %ncc, copyout_blkd46
add %i0, 64, %i0
!
! dribble out the last partial block
!
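! The chain of copyout_blkdN labels is an unrolled form of this
! loop (illustrative sketch), ending by falling into
! copyout_blkleft once the pipelined register groups run out:
!
!	while ((dbls -= 8) >= 0) {
!		d48 = falign(dN, dN_next);
!		store_user_double(d48);	/* stda ...ASI_USER */
!	}
!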
copyout_blkd0:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d0, %d2, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd2:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d2, %d4, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd4:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d4, %d6, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd6:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d6, %d8, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd8:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d8, %d10, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd10:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d10, %d12, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd12:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d12, %d14, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd14:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
fsrc1 %d14, %d0
ba,a,pt %ncc, copyout_blkleft
copyout_blkd16:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d16, %d18, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd18:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d18, %d20, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd20:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d20, %d22, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd22:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d22, %d24, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd24:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d24, %d26, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd26:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d26, %d28, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd28:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d28, %d30, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd30:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
fsrc1 %d30, %d0
ba,a,pt %ncc, copyout_blkleft
copyout_blkd32:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d32, %d34, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd34:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d34, %d36, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd36:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d36, %d38, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd38:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d38, %d40, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd40:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d40, %d42, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd42:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d42, %d44, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd44:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
faligndata %d44, %d46, %d48
stda %d48, [%i0]ASI_USER
add %i0, 8, %i0
copyout_blkd46:
subcc %i4, 8, %i4
blu,pn %ncc, copyout_blkdone
fsrc1 %d46, %d0
copyout_blkleft:
1:
ldd [%l7], %d2
add %l7, 8, %l7
subcc %i4, 8, %i4
faligndata %d0, %d2, %d8
stda %d8, [%i0]ASI_USER
blu,pn %ncc, copyout_blkdone
add %i0, 8, %i0
ldd [%l7], %d0
add %l7, 8, %l7
subcc %i4, 8, %i4
faligndata %d2, %d0, %d8
stda %d8, [%i0]ASI_USER
bgeu,pt %ncc, 1b
add %i0, 8, %i0
copyout_blkdone:
tst %i2
bz,pt %ncc, .copyout_exit
and %l3, 0x4, %l3 ! fprs.du = fprs.dl = 0
7: ldub [%i1], %i4
inc %i1
stba %i4, [%i0]ASI_USER
inc %i0
deccc %i2
bgu %ncc, 7b
nop
.copyout_exit:
membar #StoreLoad|#StoreStore
btst FPUSED_FLAG, SAVED_LOFAULT
bz 1f
nop
ld [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
wr %o2, 0, %gsr ! restore gsr
ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
btst FPRS_FEF, %o3
bz 4f
nop
! restore fpregs from stack
membar #Sync
add %fp, STACK_BIAS - 257, %o2
and %o2, -64, %o2
ldda [%o2]ASI_BLK_P, %d0
add %o2, 64, %o2
ldda [%o2]ASI_BLK_P, %d16
add %o2, 64, %o2
ldda [%o2]ASI_BLK_P, %d32
add %o2, 64, %o2
ldda [%o2]ASI_BLK_P, %d48
membar #Sync
ba,pt %ncc, 1f
wr %o3, 0, %fprs ! restore fprs
4:
FZERO ! zero all of the fpregs
wr %o3, 0, %fprs ! restore fprs
1:
andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
membar #Sync ! sync error barrier
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
ret
restore %g0, 0, %o0
.copyout_err:
ldn [THREAD_REG + T_COPYOPS], %o4
brz %o4, 2f
nop
ldn [%o4 + CP_COPYOUT], %g2
jmp %g2
nop
2:
retl
mov -1, %o0
SET_SIZE(copyout)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyout(const void *kaddr, void *uaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(xcopyout)
sethi %hi(.xcopyout_err), REAL_LOFAULT
b .do_copyout
or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
.xcopyout_err:
ldn [THREAD_REG + T_COPYOPS], %o4
brz %o4, 2f
nop
ldn [%o4 + CP_XCOPYOUT], %g2
jmp %g2
nop
2:
retl
mov %g1, %o0
SET_SIZE(xcopyout)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyout_little(const void *kaddr, void *uaddr, size_t count)
{ return (0); }
#else /* lint */
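/*
 * Sketch of the pointer arithmetic below (illustrative C): the
 * buffer is copied byte-reversed, starting with the last source
 * byte, each user access going through the little-endian
 * secondary ASI:
 *
 *	for (i = 0; i < count; i++)
 *		uaddr[i] = kaddr[count - 1 - i];
 */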
ENTRY(xcopyout_little)
sethi %hi(.little_err), %o4
ldn [THREAD_REG + T_LOFAULT], %o5
or %o4, %lo(.little_err), %o4
membar #Sync ! sync error barrier
stn %o4, [THREAD_REG + T_LOFAULT]
subcc %g0, %o2, %o3
add %o0, %o2, %o0
bz,pn %ncc, 2f ! check for zero bytes
sub %o2, 1, %o4
add %o0, %o4, %o0 ! start w/last byte
add %o1, %o2, %o1
ldub [%o0+%o3], %o4
1: stba %o4, [%o1+%o3]ASI_AIUSL
inccc %o3
sub %o0, 2, %o0 ! get next byte
bcc,a,pt %ncc, 1b
ldub [%o0+%o3], %o4
2: membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g0, %o0 ! return (0)
SET_SIZE(xcopyout_little)
#endif /* lint */
/*
* Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
*/
#if defined(lint)
/*ARGSUSED*/
int
copyin(const void *uaddr, void *kaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(copyin)
sethi %hi(.copyin_err), REAL_LOFAULT
or REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
.do_copyin:
!
! Check the length and bail if zero.
!
tst %o2
bnz,pt %ncc, 1f
nop
retl
clr %o0
1:
sethi %hi(copyio_fault), %o4
or %o4, %lo(copyio_fault), %o4
sethi %hi(copyio_fault_nowindow), %o3
ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
or %o3, %lo(copyio_fault_nowindow), %o3
membar #Sync
stn %o3, [THREAD_REG + T_LOFAULT]
mov %o0, SAVE_SRC
mov %o1, SAVE_DST
mov %o2, SAVE_COUNT
!
! Check to see if we're more than SMALL_LIMIT.
!
subcc %o2, SMALL_LIMIT, %o3
bgu,a,pt %ncc, .dci_ns
or %o0, %o1, %o3
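!
! As C (illustrative sketch): the annulled delay slot above ORs the
! two addresses so one value carries the alignment of both buffers:
!
!	if (count <= SMALL_LIMIT)
!		goto byte_copy;		/* .dcibcp           */
!	align = uaddr | kaddr;		/* %o3, tested below */
!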
!
! What was previously ".small_copyin"
!
.dcibcp:
sub %g0, %o2, %o3 ! setup for copy loop
add %o0, %o2, %o0
add %o1, %o2, %o1
ba,pt %ncc, .dcicl
lduba [%o0 + %o3]ASI_USER, %o4
!
! %o0 and %o1 point at the end and remain pointing at the end
! of their buffers. We pull things out by adding %o3 (which is
! the negation of the length) to the buffer end which gives us
! the current location in the buffers. By incrementing %o3 we walk
! through both buffers without having to bump each buffer's
! pointer. A very fast 4 instruction loop.
!
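! As C (illustrative sketch; "i" is the negative offset in %o3):
!
!	i = -count;
!	do {
!		dst_end[i] = src_end[i];
!	} while (++i < 0);
!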
.align 16
.dcicl:
stb %o4, [%o1 + %o3]
inccc %o3
bl,a,pt %ncc, .dcicl
lduba [%o0 + %o3]ASI_USER, %o4
!
! We're done. Go home.
!
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
retl
clr %o0
!
! Try aligned copies from here.
!
.dci_ns:
!
! See if we're single byte aligned. If we are, check the
! limit for single byte copies. If we're smaller, or equal,
! bounce to the byte for byte copy loop. Otherwise do it in
! HW (if enabled).
!
btst 1, %o3
bz,a,pt %icc, .dcih8
btst 7, %o3
!
! We're single byte aligned.
!
subcc %o2, VIS_COPY_THRESHOLD, %o3
bleu,pt %ncc, .dcibcp
sethi %hi(hw_copy_limit_1), %o3
ld [%o3 + %lo(hw_copy_limit_1)], %o3
!
! Is HW copy on? If not, do everything byte for byte.
!
tst %o3
bz,pn %icc, .dcibcp
subcc %o3, %o2, %o3
!
! Are we bigger than the HW limit? If not,
! go to byte for byte.
!
bge,pt %ncc, .dcibcp
nop
!
! We're big enough and copy is on. Do it with HW.
!
ba,pt %ncc, .big_copyin
nop
.dcih8:
!
! 8 byte aligned?
!
bnz,a %ncc, .dcih4
btst 3, %o3
!
! We're eight byte aligned.
!
subcc %o2, VIS_COPY_THRESHOLD, %o3
bleu,pt %ncc, .dcis8
sethi %hi(hw_copy_limit_8), %o3
ld [%o3 + %lo(hw_copy_limit_8)], %o3
!
! Is HW assist on? If not, do it with the aligned copy.
!
tst %o3
bz,pn %icc, .dcis8
subcc %o3, %o2, %o3
bge %ncc, .dcis8
nop
ba,pt %ncc, .big_copyin
nop
.dcis8:
!
! Housekeeping for copy loops. Uses same idea as in the byte for
! byte copy loop above.
!
add %o0, %o2, %o0
add %o1, %o2, %o1
sub %g0, %o2, %o3
ba,pt %ncc, .didebc
srl %o2, 3, %o2 ! Number of 8 byte chunks to copy
!
! 4 byte aligned?
!
.dcih4:
bnz %ncc, .dcih2
subcc %o2, VIS_COPY_THRESHOLD, %o3
bleu,pt %ncc, .dcis4
sethi %hi(hw_copy_limit_4), %o3
ld [%o3 + %lo(hw_copy_limit_4)], %o3
!
! Is HW assist on? If not, do it with the aligned copy.
!
tst %o3
bz,pn %icc, .dcis4
subcc %o3, %o2, %o3
!
! We're negative if our size is larger than hw_copy_limit_4.
!
bge %ncc, .dcis4
nop
ba,pt %ncc, .big_copyin
nop
.dcis4:
!
! Housekeeping for copy loops. Uses same idea as in the byte
! for byte copy loop above.
!
add %o0, %o2, %o0
add %o1, %o2, %o1
sub %g0, %o2, %o3
ba,pt %ncc, .didfbc
srl %o2, 2, %o2 ! Number of 4 byte chunks to copy
.dcih2:
!
! We're two byte aligned. Check for "smallness"
! done in delay at .dcih4
!
bleu,pt %ncc, .dcis2
sethi %hi(hw_copy_limit_2), %o3
ld [%o3 + %lo(hw_copy_limit_2)], %o3
!
! Is HW assist on? If not, do it with the aligned copy.
!
tst %o3
bz,pn %icc, .dcis2
subcc %o3, %o2, %o3
!
! Are we larger than the HW limit?
!
bge %ncc, .dcis2
nop
!
! HW assist is on and we're large enough to use it.
!
ba,pt %ncc, .big_copyin
nop
!
! Housekeeping for copy loops. Uses same idea as in the byte
! for byte copy loop above.
!
.dcis2:
add %o0, %o2, %o0
add %o1, %o2, %o1
sub %g0, %o2, %o3
ba,pt %ncc, .didtbc
srl %o2, 1, %o2 ! Number of 2 byte chunks to copy
!
.small_copyin:
!
! Why are we doing this AGAIN? There are certain conditions in
! big copyin that will cause us to forgo the HW assisted copies
! and bounce back to a non-hw assisted copy. This dispatches
! those copies. Note that we branch around this in the main line
! code.
!
! We make no check for limits or HW enablement here. We've
! already been told that we're a poster child so just go off
! and do it.
!
or %o0, %o1, %o3
btst 1, %o3
bnz %icc, .dcibcp ! Most likely
btst 7, %o3
bz %icc, .dcis8
btst 3, %o3
bz %icc, .dcis4
nop
ba,pt %ncc, .dcis2
nop
!
! Eight byte aligned copies. A steal from the original .small_copyin
! with modifications. %o2 is number of 8 byte chunks to copy. When
! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
! to copy.
!
.align 32
.didebc:
ldxa [%o0 + %o3]ASI_USER, %o4
deccc %o2
stx %o4, [%o1 + %o3]
bg,pt %ncc, .didebc
addcc %o3, 8, %o3
!
! End of copy loop. Most 8 byte aligned copies end here.
!
bz,pt %ncc, .dcifh
nop
!
! Something is left. Do it byte for byte.
!
ba,pt %ncc, .dcicl
lduba [%o0 + %o3]ASI_USER, %o4
!
! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
!
.align 32
.didfbc:
lduwa [%o0 + %o3]ASI_USER, %o4
deccc %o2
st %o4, [%o1 + %o3]
bg,pt %ncc, .didfbc
addcc %o3, 4, %o3
!
! End of copy loop. Most 4 byte aligned copies end here.
!
bz,pt %ncc, .dcifh
nop
!
! Something is left. Do it byte for byte.
!
ba,pt %ncc, .dcicl
lduba [%o0 + %o3]ASI_USER, %o4
!
! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
! copy.
!
.align 32
.didtbc:
lduha [%o0 + %o3]ASI_USER, %o4
deccc %o2
sth %o4, [%o1 + %o3]
bg,pt %ncc, .didtbc
addcc %o3, 2, %o3
!
! End of copy loop. Most 2 byte aligned copies end here.
!
bz,pt %ncc, .dcifh
nop
!
! Deal with the last byte
!
lduba [%o0 + %o3]ASI_USER, %o4
stb %o4, [%o1 + %o3]
.dcifh:
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
clr %o0
.big_copyin:
!
! Are we using the FP registers?
!
rd %fprs, %o3 ! check for unused fp
btst FPRS_FEF, %o3
bnz %ncc, .copyin_fpregs_inuse
nop
!
! We're going off to do a block copy.
! Switch fault handlers and grab a window. We
! don't do a membar #Sync since we've done only
! kernel data to this point.
!
stn %o4, [THREAD_REG + T_LOFAULT]
save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
!
! %o3 is %i3 after the save...
!
st %i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
ba,pt %ncc, .do_blockcopyin
wr %g0, FPRS_FEF, %fprs
.copyin_fpregs_inuse:
!
! We're here if the FP regs are in use. Need to see if the request
! exceeds our suddenly larger minimum.
!
cmp %i2, VIS_COPY_THRESHOLD+(64*4)
bl %ncc, .small_copyin
nop
!
! We're going to go off and do a block copy.
! Change to the heavy duty fault handler and grab a window first.
! The new fault handler was passed in via %o4.
!
stn %o4, [THREAD_REG + T_LOFAULT]
save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
!
! %o3 is now %i3
!
st %i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
! save in-use fpregs on stack
wr %g0, FPRS_FEF, %fprs
membar #Sync
add %fp, STACK_BIAS - 257, %o2
and %o2, -64, %o2
stda %d0, [%o2]ASI_BLK_P
add %o2, 64, %o2
stda %d16, [%o2]ASI_BLK_P
add %o2, 64, %o2
stda %d32, [%o2]ASI_BLK_P
add %o2, 64, %o2
stda %d48, [%o2]ASI_BLK_P
membar #Sync
.do_blockcopyin:
membar #StoreStore|#StoreLoad|#LoadStore
rd %gsr, %o2
st %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
! Set the lower bit in the saved t_lofault to indicate
! that we need to clear the %fprs register on the way
! out
or SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
! Swap src/dst since the code below is memcpy code
! and memcpy/bcopy have different calling sequences
mov %i1, %i5
mov %i0, %i1
mov %i5, %i0
!!! This code is nearly identical to the version in the sun4u
!!! libc_psr. Most bugfixes made to that file should be
!!! merged into this routine.
andcc %i0, 7, %o3
bz copyin_blkcpy
sub %o3, 8, %o3
neg %o3
sub %i2, %o3, %i2
! Align Destination on double-word boundary
2: lduba [%i1]ASI_USER, %o4
inc %i1
inc %i0
deccc %o3
bgu %ncc, 2b
stb %o4, [%i0-1]
copyin_blkcpy:
andcc %i0, 63, %i3
bz,pn %ncc, copyin_blalign ! now block aligned
sub %i3, 64, %i3
neg %i3 ! bytes till block aligned
sub %i2, %i3, %i2 ! update %i2 with new count
! Copy %i3 bytes till dst is block (64 byte) aligned. use
! double word copies.
alignaddr %i1, %g0, %g1
ldda [%g1]ASI_USER, %d0
add %g1, 8, %g1
6:
ldda [%g1]ASI_USER, %d2
add %g1, 8, %g1
subcc %i3, 8, %i3
faligndata %d0, %d2, %d8
std %d8, [%i0]
add %i1, 8, %i1
bz,pn %ncc, copyin_blalign
add %i0, 8, %i0
ldda [%g1]ASI_USER, %d0
add %g1, 8, %g1
subcc %i3, 8, %i3
faligndata %d2, %d0, %d8
std %d8, [%i0]
add %i1, 8, %i1
bgu,pn %ncc, 6b
add %i0, 8, %i0
copyin_blalign:
membar #StoreLoad
! %i2 = total length
! %i3 = blocks (length - 64) / 64
! %i4 = doubles remaining (length - blocks)
sub %i2, 64, %i3
andn %i3, 63, %i3
sub %i2, %i3, %i4
andn %i4, 7, %i4
sub %i4, 16, %i4
sub %i2, %i4, %i2
sub %i2, %i3, %i2
andn %i1, 0x3f, %l7 ! blk aligned address
alignaddr %i1, %g0, %g0 ! gen %gsr
srl %i1, 3, %l5 ! bits 3,4,5 are now least sig in %l5
andcc %l5, 7, %i5 ! mask everything except bits 0,1,2
add %i1, %i4, %i1
add %i1, %i3, %i1
ldda [%l7]ASI_BLK_AIUS, %d0
add %l7, 64, %l7
ldda [%l7]ASI_BLK_AIUS, %d16
add %l7, 64, %l7
ldda [%l7]ASI_BLK_AIUS, %d32
add %l7, 64, %l7
sub %i3, 128, %i3
! switch statement to get us to the right 8 byte blk within a
! 64 byte block
cmp %i5, 4
bgeu,a copyin_hlf
cmp %i5, 6
cmp %i5, 2
bgeu,a copyin_sqtr
nop
cmp %i5, 1
be,a copyin_seg1
nop
ba,pt %ncc, copyin_seg0
nop
copyin_sqtr:
be,a copyin_seg2
nop
ba,pt %ncc, copyin_seg3
nop
copyin_hlf:
bgeu,a copyin_fqtr
nop
cmp %i5, 5
be,a copyin_seg5
nop
ba,pt %ncc, copyin_seg4
nop
copyin_fqtr:
be,a copyin_seg6
nop
ba,pt %ncc, copyin_seg7
nop
copyin_seg0:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D0
ldda [%l7]ASI_BLK_AIUS, %d0
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D16
ldda [%l7]ASI_BLK_AIUS, %d16
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D32
ldda [%l7]ASI_BLK_AIUS, %d32
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, copyin_seg0
0:
FALIGN_D16
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D32
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd0
add %i0, 64, %i0
1:
FALIGN_D32
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D0
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd16
add %i0, 64, %i0
2:
FALIGN_D0
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D16
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd32
add %i0, 64, %i0
copyin_seg1:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D2
ldda [%l7]ASI_BLK_AIUS, %d0
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D18
ldda [%l7]ASI_BLK_AIUS, %d16
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D34
ldda [%l7]ASI_BLK_AIUS, %d32
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, copyin_seg1
0:
FALIGN_D18
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D34
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd2
add %i0, 64, %i0
1:
FALIGN_D34
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D2
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd18
add %i0, 64, %i0
2:
FALIGN_D2
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D18
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd34
add %i0, 64, %i0
copyin_seg2:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D4
ldda [%l7]ASI_BLK_AIUS, %d0
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D20
ldda [%l7]ASI_BLK_AIUS, %d16
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D36
ldda [%l7]ASI_BLK_AIUS, %d32
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, copyin_seg2
0:
FALIGN_D20
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D36
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd4
add %i0, 64, %i0
1:
FALIGN_D36
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D4
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd20
add %i0, 64, %i0
2:
FALIGN_D4
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D20
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd36
add %i0, 64, %i0
copyin_seg3:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D6
ldda [%l7]ASI_BLK_AIUS, %d0
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D22
ldda [%l7]ASI_BLK_AIUS, %d16
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D38
ldda [%l7]ASI_BLK_AIUS, %d32
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, copyin_seg3
0:
FALIGN_D22
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D38
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd6
add %i0, 64, %i0
1:
FALIGN_D38
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D6
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd22
add %i0, 64, %i0
2:
FALIGN_D6
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D22
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd38
add %i0, 64, %i0
copyin_seg4:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D8
ldda [%l7]ASI_BLK_AIUS, %d0
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D24
ldda [%l7]ASI_BLK_AIUS, %d16
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D40
ldda [%l7]ASI_BLK_AIUS, %d32
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, copyin_seg4
0:
FALIGN_D24
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D40
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd8
add %i0, 64, %i0
1:
FALIGN_D40
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D8
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd24
add %i0, 64, %i0
2:
FALIGN_D8
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D24
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd40
add %i0, 64, %i0
copyin_seg5:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D10
ldda [%l7]ASI_BLK_AIUS, %d0
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D26
ldda [%l7]ASI_BLK_AIUS, %d16
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D42
ldda [%l7]ASI_BLK_AIUS, %d32
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, copyin_seg5
0:
FALIGN_D26
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D42
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd10
add %i0, 64, %i0
1:
FALIGN_D42
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D10
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd26
add %i0, 64, %i0
2:
FALIGN_D10
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D26
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd42
add %i0, 64, %i0
copyin_seg6:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D12
ldda [%l7]ASI_BLK_AIUS, %d0
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D28
ldda [%l7]ASI_BLK_AIUS, %d16
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D44
ldda [%l7]ASI_BLK_AIUS, %d32
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, copyin_seg6
0:
FALIGN_D28
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D44
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd12
add %i0, 64, %i0
1:
FALIGN_D44
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D12
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd28
add %i0, 64, %i0
2:
FALIGN_D12
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D28
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd44
add %i0, 64, %i0
copyin_seg7:
! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
FALIGN_D14
ldda [%l7]ASI_BLK_AIUS, %d0
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 0f
add %i0, 64, %i0
! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
FALIGN_D30
ldda [%l7]ASI_BLK_AIUS, %d16
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 1f
add %i0, 64, %i0
! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst
FALIGN_D46
ldda [%l7]ASI_BLK_AIUS, %d32
stda %d48, [%i0]ASI_BLK_P
add %l7, 64, %l7
subcc %i3, 64, %i3
bz,pn %ncc, 2f
add %i0, 64, %i0
ba,a,pt %ncc, copyin_seg7
0:
FALIGN_D30
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D46
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd14
add %i0, 64, %i0
1:
FALIGN_D46
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D14
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd30
add %i0, 64, %i0
2:
FALIGN_D14
stda %d48, [%i0]ASI_BLK_P
add %i0, 64, %i0
membar #Sync
FALIGN_D30
stda %d48, [%i0]ASI_BLK_P
ba,pt %ncc, copyin_blkd46
add %i0, 64, %i0
!
! dribble out the last partial block
!
copyin_blkd0:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d0, %d2, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd2:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d2, %d4, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd4:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d4, %d6, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd6:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d6, %d8, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd8:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d8, %d10, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd10:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d10, %d12, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd12:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d12, %d14, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd14:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
fsrc1 %d14, %d0
ba,a,pt %ncc, copyin_blkleft
copyin_blkd16:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d16, %d18, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd18:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d18, %d20, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd20:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d20, %d22, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd22:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d22, %d24, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd24:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d24, %d26, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd26:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d26, %d28, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd28:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d28, %d30, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd30:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
fsrc1 %d30, %d0
ba,a,pt %ncc, copyin_blkleft
copyin_blkd32:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d32, %d34, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd34:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d34, %d36, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd36:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d36, %d38, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd38:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d38, %d40, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd40:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d40, %d42, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd42:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d42, %d44, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd44:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
faligndata %d44, %d46, %d48
std %d48, [%i0]
add %i0, 8, %i0
copyin_blkd46:
subcc %i4, 8, %i4
blu,pn %ncc, copyin_blkdone
fsrc1 %d46, %d0
copyin_blkleft:
1:
ldda [%l7]ASI_USER, %d2
add %l7, 8, %l7
subcc %i4, 8, %i4
faligndata %d0, %d2, %d8
std %d8, [%i0]
blu,pn %ncc, copyin_blkdone
add %i0, 8, %i0
ldda [%l7]ASI_USER, %d0
add %l7, 8, %l7
subcc %i4, 8, %i4
faligndata %d2, %d0, %d8
std %d8, [%i0]
bgeu,pt %ncc, 1b
add %i0, 8, %i0
copyin_blkdone:
tst %i2
bz,pt %ncc, .copyin_exit
and %l3, 0x4, %l3 ! fprs.du = fprs.dl = 0
7: lduba [%i1]ASI_USER, %i4
inc %i1
inc %i0
deccc %i2
bgu %ncc, 7b
stb %i4, [%i0 - 1]
.copyin_exit:
membar #StoreLoad|#StoreStore
btst FPUSED_FLAG, SAVED_LOFAULT
bz %icc, 1f
nop
ld [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
wr %o2, 0, %gsr
ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
btst FPRS_FEF, %o3
bz %icc, 4f
nop
! restore fpregs from stack
membar #Sync
add %fp, STACK_BIAS - 257, %o2
and %o2, -64, %o2
ldda [%o2]ASI_BLK_P, %d0
add %o2, 64, %o2
ldda [%o2]ASI_BLK_P, %d16
add %o2, 64, %o2
ldda [%o2]ASI_BLK_P, %d32
add %o2, 64, %o2
ldda [%o2]ASI_BLK_P, %d48
membar #Sync
ba,pt %ncc, 1f
wr %o3, 0, %fprs ! restore fprs
4:
FZERO ! zero all of the fpregs
wr %o3, 0, %fprs ! restore fprs
1:
andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
membar #Sync ! sync error barrier
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
ret
restore %g0, 0, %o0
.copyin_err:
ldn [THREAD_REG + T_COPYOPS], %o4
brz %o4, 2f
nop
ldn [%o4 + CP_COPYIN], %g2
jmp %g2
nop
2:
retl
mov -1, %o0
SET_SIZE(copyin)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyin(const void *uaddr, void *kaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(xcopyin)
sethi %hi(.xcopyin_err), REAL_LOFAULT
b .do_copyin
or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
.xcopyin_err:
ldn [THREAD_REG + T_COPYOPS], %o4
brz %o4, 2f
nop
ldn [%o4 + CP_XCOPYIN], %g2
jmp %g2
nop
2:
retl
mov %g1, %o0
SET_SIZE(xcopyin)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyin_little(const void *uaddr, void *kaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(xcopyin_little)
sethi %hi(.little_err), %o4
ldn [THREAD_REG + T_LOFAULT], %o5
or %o4, %lo(.little_err), %o4
membar #Sync ! sync error barrier
stn %o4, [THREAD_REG + T_LOFAULT]
subcc %g0, %o2, %o3
add %o0, %o2, %o0
bz,pn %ncc, 2f ! check for zero bytes
sub %o2, 1, %o4
add %o0, %o4, %o0 ! start w/last byte
add %o1, %o2, %o1
lduba [%o0+%o3]ASI_AIUSL, %o4
1: stb %o4, [%o1+%o3]
inccc %o3
sub %o0, 2, %o0 ! get next byte
bcc,a,pt %ncc, 1b
lduba [%o0+%o3]ASI_AIUSL, %o4
2: membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g0, %o0 ! return (0)
.little_err:
membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g1, %o0
SET_SIZE(xcopyin_little)
#endif /* lint */
/*
* Copy a block of storage - must not overlap (from + len <= to).
* No fault handler installed (to be called under on_fault())
*/
#if defined(lint)
/* ARGSUSED */
void
copyin_noerr(const void *ufrom, void *kto, size_t count)
{}
#else /* lint */
ENTRY(copyin_noerr)
sethi %hi(.copyio_noerr), REAL_LOFAULT
b .do_copyin
or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
.copyio_noerr:
jmp SAVED_LOFAULT
nop
SET_SIZE(copyin_noerr)
#endif /* lint */
/*
* Copy a block of storage - must not overlap (from + len <= to).
* No fault handler installed (to be called under on_fault())
*/
#if defined(lint)
/* ARGSUSED */
void
copyout_noerr(const void *kfrom, void *uto, size_t count)
{}
#else /* lint */
ENTRY(copyout_noerr)
sethi %hi(.copyio_noerr), REAL_LOFAULT
b .do_copyout
or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
SET_SIZE(copyout_noerr)
#endif /* lint */
#if defined(lint)
int use_hw_bcopy = 1;
int use_hw_copyio = 1;
int use_hw_bzero = 1;
uint_t hw_copy_limit_1 = 0;
uint_t hw_copy_limit_2 = 0;
uint_t hw_copy_limit_4 = 0;
uint_t hw_copy_limit_8 = 0;
#else /* !lint */
.align 4
DGDEF(use_hw_bcopy)
.word 1
DGDEF(use_hw_copyio)
.word 1
DGDEF(use_hw_bzero)
.word 1
DGDEF(hw_copy_limit_1)
.word 0
DGDEF(hw_copy_limit_2)
.word 0
DGDEF(hw_copy_limit_4)
.word 0
DGDEF(hw_copy_limit_8)
.word 0
.align 64
.section ".text"
#endif /* !lint */
/*
* hwblkclr - clears block-aligned, block-multiple-sized regions that
* are 256 bytes or longer, using spitfire's block stores. If
* the criteria for using this routine are not met then it calls bzero
* and returns 1. Otherwise 0 is returned indicating success.
* Caller is responsible for ensuring use_hw_bzero is true and that
* kpreempt_disable() has been called.
*/
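/*
 * Illustrative caller sketch (hypothetical wrapper, not from this
 * file):
 *
 *	kpreempt_disable(curthread);
 *	if (use_hw_bzero)
 *		rc = hwblkclr(addr, len); /* rc == 1: fell back to bzero */
 *	kpreempt_enable(curthread);
 */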
#ifdef lint
/*ARGSUSED*/
int
hwblkclr(void *addr, size_t len)
{
return(0);
}
#else /* lint */
! %i0 - start address
! %i1 - length of region (multiple of 64)
! %l0 - saved fprs
! %l1 - pointer to saved %d0 block
! %l2 - saved curthread->t_lwp
ENTRY(hwblkclr)
! get another window w/space for one aligned block of saved fpregs
save %sp, -SA(MINFRAME + 2*64), %sp
! Must be block-aligned
andcc %i0, (64-1), %g0
bnz,pn %ncc, 1f
nop
! ... and must be 256 bytes or more
cmp %i1, 256
blu,pn %ncc, 1f
nop
! ... and length must be a multiple of 64
andcc %i1, (64-1), %g0
bz,pn %ncc, 2f
nop
1: ! punt, call bzero but notify the caller that bzero was used
mov %i0, %o0
call bzero
mov %i1, %o1
ret
restore %g0, 1, %o0 ! return (1) - did not use block operations
2: rd %fprs, %l0 ! check for unused fp
btst FPRS_FEF, %l0
bz 1f
nop
! save in-use fpregs on stack
membar #Sync
add %fp, STACK_BIAS - 65, %l1
and %l1, -64, %l1
stda %d0, [%l1]ASI_BLK_P
1: membar #StoreStore|#StoreLoad|#LoadStore
wr %g0, FPRS_FEF, %fprs
wr %g0, ASI_BLK_P, %asi
! Clear block
fzero %d0
fzero %d2
fzero %d4
fzero %d6
fzero %d8
fzero %d10
fzero %d12
fzero %d14
mov 256, %i3
ba .pz_doblock
nop
.pz_blkstart:
! stda %d0, [%i0+192]%asi ! in dly slot of branch that got us here
stda %d0, [%i0+128]%asi
stda %d0, [%i0+64]%asi
stda %d0, [%i0]%asi
.pz_zinst:
add %i0, %i3, %i0
sub %i1, %i3, %i1
.pz_doblock:
cmp %i1, 256
bgeu,a %ncc, .pz_blkstart
stda %d0, [%i0+192]%asi
cmp %i1, 64
blu %ncc, .pz_finish
andn %i1, (64-1), %i3
srl %i3, 4, %i2 ! using blocks, 1 instr / 16 words
set .pz_zinst, %i4
sub %i4, %i2, %i4
jmp %i4
nop
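!
! The computed jump above enters the store sequence Duff's-device
! style: each stda is one 4 byte instruction clearing 64 bytes, so
! backing up %i3/64 instructions (%i3 >> 4 bytes) from .pz_zinst
! executes exactly enough block stores for the %i3 bytes left.
!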
.pz_finish:
membar #Sync
btst FPRS_FEF, %l0
bz,a .pz_finished
wr %l0, 0, %fprs ! restore fprs
! restore fpregs from stack
ldda [%l1]ASI_BLK_P, %d0
membar #Sync
wr %l0, 0, %fprs ! restore fprs
.pz_finished:
ret
restore %g0, 0, %o0 ! return (0) - used block operations
SET_SIZE(hwblkclr)
#endif /* lint */
#ifdef lint
/* Copy 32 bytes of data from src to dst using physical addresses */
/*ARGSUSED*/
void
hw_pa_bcopy32(uint64_t src, uint64_t dst)
{}
#else /*!lint */
/*
* Copy 32 bytes of data from src (%o0) to dst (%o1)
* using physical addresses.
*/
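/*
 * Equivalent C sketch (illustrative; the interrupt helpers are
 * placeholders for the PSTATE_IE manipulation, and ASI_MEM makes
 * each access use the physical address):
 *
 *	uint64_t *s = (uint64_t *)src, *d = (uint64_t *)dst;
 *	int i;
 *	disable_interrupts();		/* hypothetical helper */
 *	for (i = 0; i < 4; i++)
 *		d[i] = s[i];
 *	enable_interrupts();		/* hypothetical helper */
 */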
ENTRY_NP(hw_pa_bcopy32)
rdpr %pstate, %g1
andn %g1, PSTATE_IE, %g2
wrpr %g0, %g2, %pstate
ldxa [%o0]ASI_MEM, %o2
add %o0, 8, %o0
ldxa [%o0]ASI_MEM, %o3
add %o0, 8, %o0
ldxa [%o0]ASI_MEM, %o4
add %o0, 8, %o0
ldxa [%o0]ASI_MEM, %o5
stxa %o2, [%o1]ASI_MEM
add %o1, 8, %o1
stxa %o3, [%o1]ASI_MEM
add %o1, 8, %o1
stxa %o4, [%o1]ASI_MEM
add %o1, 8, %o1
stxa %o5, [%o1]ASI_MEM
membar #Sync
retl
wrpr %g0, %g1, %pstate
SET_SIZE(hw_pa_bcopy32)
#endif /* lint */