niagara_copy.s revision 7c478bd95313f5f23a4c958a745db2134aa03244
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/asm_linkage.h>
#include <sys/machthread.h>
#include <sys/privregs.h>
#include <sys/niagaraasi.h>
#if !defined(lint)
#include "assym.h"
#endif /* lint */
/*
* Pseudo-code to aid in understanding the control flow of the
* bcopy/kcopy routines below.
*
* On entry to kcopy:
* %l7 = curthread->t_lofault;
* curthread->t_lofault = .copyerr;
* %o5 = %l7; ! save existing handler in %o5
* Call bcopy();
*
* On entry to bcopy:
*
* if (length < 128)
* goto_regular_copy;
*
* if (!use_vis)
* goto_regular_copy;
*
* do_blockcopy_here;
*
* In lofault handler:
* curthread->t_lofault = %o5; ! restore old t_lofault
* return (errno)
*
*/
/*
* Less than or equal to this number of bytes we will always copy byte-for-byte
*/
#define SMALL_LIMIT 7
/*
* Size of stack frame in order to accommodate a 64-byte aligned
* floating-point register save area and 2 32-bit temp locations.
*/
/*
* LOFAULT_SET : Flag set by kzero to indicate that lo_fault handler was set
*/
#define LOFAULT_SET 2
/*
* This define is to align data for the unaligned source cases.
* The data1, data2 and data3 is merged into data1 and data2.
* The data3 is preserved for next merge.
*/
/*
* This macro is to align the data. Basically it merges
* data1 and data2 to form double word.
*/
/*
* Copy a block of storage, returning an error code if `from' or
* `to' takes a kernel pagefault which cannot be resolved.
* Returns errno value on pagefault error, 0 if all ok
*/
#if defined(lint)
/* ARGSUSED */
int
{ return(0); }
#else /* lint */
.seg ".text"
.align 4
/*
* We got here because of a fault during kcopy.
* Errno value is in %g1.
*/
.copyerr:
#endif /* lint */
/*
* Copy a block of storage - must not overlap (from + len <= to).
*
* Copy a page of memory.
* Assumes double word alignment and a count >= 256.
*/
#if defined(lint)
/* ARGSUSED */
void
{}
#else /* lint */
.do_copy:
1:
/*
* Compare against 256 since we should be checking block addresses
* and (dest & ~63) - (src & ~63) can be 3 blocks even if
* src = dest + (64 * 3) + 63.
*/
/*
* Copy that reach here have at least 2 blocks of data to copy.
*/
.chksrc:
! partial
.blkcpy:
1:
.blkdone:
.residue:
.blkexit:
.bcb_punt:
!
!
!
!
!
!
bnz,a 1b
1:
bnz,a 1b
b 2f
b 3f
!
!
1:
bnz,a 1b
.xfer:
3:
2:
b 2b ! loop
1:
!
!
b,a .xfer
!
!
.aldoubcp:
!
!
5:
!
!
.wordcp:
5:
b,a .dbytecp
.alwordcp:
b .wordcp
!
!
.bytecp:
b .dbytecp
!
!
1:
.dbytecp:
.cpdone:
/*
* Common code used to align transfers on word and doubleword
* boundaries. Aligns source and destination and returns a count
* of aligned bytes to transfer in %i3
*/
1:
.alignit:
bnz,a 1b
#endif /* lint */
/*
* Block copy with possibly overlapped operands.
*/
#if defined(lint)
/*ARGSUSED*/
void
{}
#else /* lint */
retl ! return
1:
!
!
.ov_fwd:
retl ! return
!
!
.ov_bkwd:
retl ! return
#endif /* lint */
/*
* hwblkpagecopy()
*
* Copies exactly one page. This routine assumes the caller (ppcopy)
* has already disabled kernel preemption and has checked
* use_hw_bcopy.
*/
#ifdef lint
/*ARGSUSED*/
void
{ }
#else /* lint */
/*
* Copying exactly one page and PAGESIZE is a multiple of 0x80.
*/
1:
#endif /* lint */
/*
* Transfer data to and from user space -
* Note that these routines can cause faults
* It is assumed that the kernel has nothing at
* less than KERNELBASE in the virtual address space.
*
* Note that copyin(9F) and copyout(9F) are part of the
*
* Sigh.
*
* So there's two extremely similar routines - xcopyin() and xcopyout()
* which return the errno that we've faithfully computed. This
* allows other callers (e.g. uiomove(9F)) to work correctly.
* Given that these are used pretty heavily, we expand the calling
* sequences inline for all flavours (rather than making wrappers).
*
* There are also stub routines for xcopyout_little and xcopyin_little,
* which currently are intended to handle requests of <= 16 bytes from
* do_unaligned. Future enhancement to make them handle 8k pages efficiently
* is left as an exercise...
*/
/*
* Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
*
* General theory of operation:
*
* None of the copyops routines grab a window until it's decided that
* we need to do a HW block copy operation. This saves a window
*
* This code uses a set of 4 limits for the maximum size that will
* the default limits are:
*
* single byte aligned - 256 (hw_copy_limit_1)
* two byte aligned - 512 (hw_copy_limit_2)
* four byte aligned - 1024 (hw_copy_limit_4)
* eight byte aligned - 1024 (hw_copy_limit_8)
*
* If the value for a particular limit is zero, the copy will be done
*
* Flow:
*
* If count == zero return zero.
*
* Store the previous lo_fault handler into %g6.
* Place our secondary lofault handler into %g5.
* Place the address of our nowindow fault handler into %o3.
* Place the address of the windowed fault handler into %o4.
* --> We'll use this handler if we end up grabbing a window
* --> before we use block initializing store and quad load ASIs
*
* If count is less than or equal to SMALL_LIMIT (7) we
* always do a byte for byte copy.
*
* If count is > SMALL_LIMIT, we check the alignment of the input
* and output pointers. Based on the alignment we check count
* against a limit based on detected alignment. If we exceed the
* alignment value we copy via block initializing store and quad
* load instructions.
*
* If we don't exceed one of the limits, we store -count in %o3,
* we store the number of chunks (8, 4, 2 or 1 byte) operated
* on in our basic copy loop in %o2. Following this we branch
* to the appropriate copy loop and copy that many chunks.
* Since we've been adding the chunk size to %o3 each time through
* as well as decrementing %o2, we can tell if any data is
* is left to be copied by examining %o3. If that is zero, we're
* done and can go home. If not, we figure out what the largest
* chunk size left to be copied is and branch to that copy loop
* unless there's only one byte left. We load that as we're
* branching to code that stores it just before we return.
*
* Fault handlers are invoked if we reference memory that has no
* current mapping. All forms share the same copyio_fault handler.
* This routine handles fixing up the stack and general housecleaning.
* Each copy operation has a simple fault handler that is then called
* to do the work specific to the individual operation. The handler
* for copyOP and xcopyOP are found at the end of individual function.
* The handlers for xcopyOP_little are found at the end of xcopyin_little.
* The handlers for copyOP_noerr are found at the end of copyin_noerr.
*/
/*
* Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
*/
#if defined(lint)
/*ARGSUSED*/
int
{ return (0); }
#else /* lint */
/*
* We save the arguments in the following registers in case of a fault:
* kaddr - %g2
* uaddr - %g3
* count - %g4
*/
#define SAVE_COUNT %g4
#define REAL_LOFAULT %g5
#define SAVED_LOFAULT %g6
/*
* Generic copyio fault handler. This is the first line of defense when a
* fault occurs in (x)copyin/(x)copyout. In order for this to function
* properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
* This allows us to share common code for all the flavors of the copy
* operations, including the _noerr versions.
*
* Note that this function will restore the original input parameters before
* calling REAL_LOFAULT. So the real handler can vector to the appropriate
* member of the t_copyop structure, if needed.
*/
!
!
1:
!
! Run in leaf mode, using the %o regs as our input regs.
!
subcc %o2, SMALL_LIMIT, %o3
bgu,a,pt %ncc, .dco_ns
or %o0, %o1, %o3
!
! What was previously ".small_copyout"
! Do full differenced copy.
!
.dcobcp:
sub %g0, %o2, %o3 ! negate count
add %o0, %o2, %o0 ! make %o0 point at the end
add %o1, %o2, %o1 ! make %o1 point at the end
ba,pt %ncc, .dcocl
ldub [%o0 + %o3], %o4 ! load first byte
!
! %o0 and %o2 point at the end and remain pointing at the end
! of their buffers. We pull things out by adding %o3 (which is
! the negation of the length) to the buffer end which gives us
! the current location in the buffers. By incrementing %o3 we walk
! through both buffers without having to bump each buffer's address
! pointer.
!
.align 16
.dcocl:
!
! We're done. Go home.
!
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
retl
clr %o0
!
! Try aligned copies from here.
!
.dco_ns:
! %o0 = kernel addr (to be copied from)
! %o1 = user addr (to be copied to)
! %o2 = length
! %o3 = %o1 | %o2 (used for alignment checking)
! %o4 is alternate lo_fault
! %o5 is original lo_fault
!
! bounce to the byte for byte copy loop. Otherwise do it in
! HW (if enabled).
!
btst 1, %o3
bz,pt %icc, .dcoh8
btst 7, %o3
!
! Single byte aligned. Do we do it via HW or via
! byte for byte? Do a quick no memory reference
! check to pick up small copies.
!
sethi %hi(hw_copy_limit_1), %o3
!
! Big enough that we need to check the HW limit for
! this size copy.
!
ld [%o3 + %lo(hw_copy_limit_1)], %o3
!
! Is HW copy on? If not, do everything byte for byte.
!
tst %o3
bz,pn %icc, .dcobcp
subcc %o3, %o2, %o3
!
!
!
! We're big enough and copy is on. Do it with HW.
!
ba,pt %ncc, .big_copyout
nop
.dcoh8:
!
! 8 byte aligned?
!
bnz,a %ncc, .dcoh4
btst 3, %o3
!
!
!
!
!
! We're negative if our size is larger than hw_copy_limit_8.
!
bge,pt %ncc, .dcos8
nop
!
!
.dcos8:
!
!
!
!
.dcoh4:
!
! If so, go off an do the copy.
! If not, load the hard limit. %o3 is
! available for reuse.
!
sethi %hi(hw_copy_limit_4), %o3
ld [%o3 + %lo(hw_copy_limit_4)], %o3
!
! Bop off to the aligned copy.
!
tst %o3
bz,pn %icc, .dcos4
subcc %o3, %o2, %o3
!
!
!
!
ba,pt %ncc, .big_copyout
nop
.dcos4:
add %o0, %o2, %o0
add %o1, %o2, %o1
sub %g0, %o2, %o3
ba,pt %ncc, .dodfbc
srl %o2, 2, %o2 ! Number of 4 byte chunks to copy
!
! We must be 2 byte aligned. Off we go.
! The check for small copies was done in the
! delay at .dcoh4
!
.dcoh2:
ble %ncc, .dcos2
sethi %hi(hw_copy_limit_2), %o3
ld [%o3 + %lo(hw_copy_limit_2)], %o3
tst %o3
bz,pn %icc, .dcos2
subcc %o3, %o2, %o3
bge,pt %ncc, .dcos2
nop
!
!
.dcos2:
!
!
!
.align 32
.dodebc:
!
! eight byte aligned copies end here.
!
bz,pt %ncc, .dcofh
nop
!
! Something is left - do it byte for byte.
!
ba,pt %ncc, .dcocl
ldub [%o0 + %o3], %o4 ! load next byte
!
! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
!
.align 32
.dodfbc:
lduw [%o0 + %o3], %o4
deccc %o2
sta %o4, [%o1 + %o3]ASI_USER
bg,pt %ncc, .dodfbc
addcc %o3, 4, %o3
!
!
!
!
!
! copy.
!
.align 32
.dodtbc:
!
!
!
!
.dcofh:
!
! We're going to go off and do a block copy.
! Switch fault handlers and grab a window. We
! kernel data to this point.
!
stn %o4, [THREAD_REG + T_LOFAULT]
save %sp, -SA(MINFRAME), %sp
! Copyout requests that reach here are larger than 256 bytes. The
! hw_copy_limit_1 is set to 256. Never set this limit to less than
! 128 bytes.
.do_block_copyout:
mov %i1, %i5
mov %i0, %i1
mov %i5, %i0
andcc %i0, 7, %i3 ! is dst double aligned
bz %ncc, copyout_blkcpy
sub %i3, 8, %i3
neg %i3 ! bytes till double aligned
sub %i2, %i3, %i2 ! update %i2 with new count
! Align Destination on double-word boundary
1: ldub [%i1], %i4
inc %i1
stba %i4, [%i0]ASI_USER
deccc %i3
bgu %ncc, 1b
inc %i0
copyout_blkcpy:
andcc %i0, 63, %i3
bz,pn %ncc, copyout_blalign ! now block aligned
sub %i3, 64, %i3
neg %i3 ! bytes till block aligned
sub %i2, %i3, %i2 ! update %i2 with new count
! Copy %i3 bytes till dst is block (64 byte) aligned. use
! double word copies.
andcc %i1, 7, %g1 ! is src aligned on a 8 bytes
bz %ncc, .co_dbcopy ! %g1 has source offset (last 3-bits)
sll %g1, 3, %l1 ! left shift
mov 0x40, %l2
sub %l2, %l1, %l2 ! right shift = (64 - left shift)
! Now use double word copies to align destination.
.co_double:
sub %i1, %g1, %i1 ! align the src at 8 bytes.
ldx [%i1], %o2
2:
ldx [%i1+8], %o4
ALIGN_DATA_EW(%o2, %o4, %l1, %l2, %o3)
stxa %o2, [%i0]ASI_USER
mov %o4, %o2
add %i1, 0x8, %i1
subcc %i3, 0x8, %i3
bgu,pt %ncc, 2b
add %i0, 0x8, %i0
ba copyout_blalign
add %i1, %g1, %i1
! Both source and destination are double aligned.
! No shift and merge of data required in this case.
.co_dbcopy:
ldx [%i1], %o2
stxa %o2, [%i0]ASI_USER
add %i1, 0x8, %i1
subcc %i3, 0x8, %i3
bgu,pt %ncc, .co_dbcopy
add %i0, 0x8, %i0
copyout_blalign:
andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
sub %i2, %i3, %i2 ! Residue bytes in %i2
mov ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
andcc %i1, 0xf, %o2 ! is src quadword aligned
bz,pn %xcc, .co_blkcpy ! src offset in %o2 (last 4-bits)
nop
cmp %o2, 0x8
bg .co_upper_double
nop
bl .co_lower_double
nop
! Falls through when source offset is equal to 8 i.e.
! source is double word aligned.
sub %i1, %o2, %i1 ! align the src at 16 bytes.
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetch [%l0+0x0], #one_read
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
.co_loop0:
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
prefetch [%l0+0x40], #one_read
stxa %l3, [%i0+0x0]%asi
stxa %l4, [%i0+0x8]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
stxa %l5, [%i0+0x10]%asi
stxa %l2, [%i0+0x18]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
stxa %l3, [%i0+0x20]%asi
stxa %l4, [%i0+0x28]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
stxa %l5, [%i0+0x30]%asi
stxa %l2, [%i0+0x38]%asi
add %l0, 0x40, %l0
subcc %i3, 0x40, %i3
bgu,pt %xcc, .co_loop0
add %i0, 0x40, %i0
ba .co_blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
.co_lower_double:
sub %i1, %o2, %i1 ! align the src at 16 bytes.
sll %o2, 3, %o0 ! %o0 left shift
mov 0x40, %o1
sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetch [%l0+0x0], #one_read
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! partial data in %l2 and %l3 has
! complete data
.co_loop1:
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has partial data
! for this read.
ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4
! into %l2 and %l3
prefetch [%l0+0x40], #one_read
stxa %l2, [%i0+0x0]%asi
stxa %l3, [%i0+0x8]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and
! %l4 from previous read
! into %l4 and %l5
stxa %l4, [%i0+0x10]%asi
stxa %l5, [%i0+0x18]%asi
! Repeat the same for next 32 bytes.
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
stxa %l2, [%i0+0x20]%asi
stxa %l3, [%i0+0x28]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
stxa %l4, [%i0+0x30]%asi
stxa %l5, [%i0+0x38]%asi
add %l0, 0x40, %l0
subcc %i3, 0x40, %i3
bgu,pt %xcc, .co_loop1
add %i0, 0x40, %i0
ba .co_blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
.co_upper_double:
sub %i1, %o2, %i1 ! align the src at 16 bytes.
sub %o2, 0x8, %o0
sll %o0, 3, %o0 ! %o0 left shift
mov 0x40, %o1
sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetch [%l0+0x0], #one_read
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! partial data in %l3
! for this read and
! no data in %l2
.co_loop2:
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has complete data
! and %l5 has partial
ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5
! into %l3 and %l4
prefetch [%l0+0x40], #one_read
stxa %l3, [%i0+0x0]%asi
stxa %l4, [%i0+0x8]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with
! %l5 from previous read
! into %l5 and %l2
stxa %l5, [%i0+0x10]%asi
stxa %l2, [%i0+0x18]%asi
! Repeat the same for next 32 bytes.
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
stxa %l3, [%i0+0x20]%asi
stxa %l4, [%i0+0x28]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
stxa %l5, [%i0+0x30]%asi
stxa %l2, [%i0+0x38]%asi
add %l0, 0x40, %l0
subcc %i3, 0x40, %i3
bgu,pt %xcc, .co_loop2
add %i0, 0x40, %i0
ba .co_blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
.co_blkcpy:
andn %i1, 0x3f, %o0 ! %o0 has block aligned source
prefetch [%o0+0x0], #one_read
1:
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
add %i1, 0x10, %i1
prefetch [%o0+0x40], #one_read
stxa %l0, [%i0+0x0]%asi
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6
add %i1, 0x10, %i1
stxa %l1, [%i0+0x8]%asi
stxa %l2, [%i0+0x10]%asi
stxa %l3, [%i0+0x18]%asi
stxa %l4, [%i0+0x20]%asi
stxa %l5, [%i0+0x28]%asi
stxa %l6, [%i0+0x30]%asi
stxa %l7, [%i0+0x38]%asi
add %o0, 0x40, %o0
subcc %i3, 0x40, %i3
bgu,pt %xcc, 1b
add %i0, 0x40, %i0
.co_blkdone:
membar #Sync
! Copy as much rest of the data as double word copy.
.co_dwcp:
cmp %i2, 0x8 ! Not enough bytes to copy as double
blu %ncc, .co_dbdone
nop
andn %i2, 0x7, %i3 ! %i3 count is multiple of 8 bytes size
sub %i2, %i3, %i2 ! Residue bytes in %i2
andcc %i1, 7, %g1 ! is src aligned on a 8 bytes
bz %ncc, .co_cpy_db
nop
sll %g1, 3, %l0 ! left shift
mov 0x40, %l1
sub %l1, %l0, %l1 ! right shift = (64 - left shift)
.co_cpy_wd:
sub %i1, %g1, %i1 ! align the src at 8 bytes.
ldx [%i1], %o2
3:
ldx [%i1+8], %o4
ALIGN_DATA_EW(%o2, %o4, %l0, %l1, %o3)
stxa %o2, [%i0]ASI_USER
mov %o4, %o2
add %i1, 0x8, %i1
subcc %i3, 0x8, %i3
bgu,pt %ncc, 3b
add %i0, 0x8, %i0
ba .co_dbdone
add %i1, %g1, %i1
.co_cpy_db:
ldx [%i1], %o2
stxa %o2, [%i0]ASI_USER
add %i1, 0x8, %i1
subcc %i3, 0x8, %i3
bgu,pt %ncc, .co_cpy_db
add %i0, 0x8, %i0
.co_dbdone:
tst %i2
bz,pt %xcc, .copyout_exit
nop
! Copy the residue as byte copy
.co_residue:
ldub [%i1], %i4
stba %i4, [%i0]ASI_USER
inc %i1
deccc %i2
bgu %xcc, .co_residue
inc %i0
.copyout_exit:
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
ret
restore %g0, 0, %o0
.copyout_err:
ldn [THREAD_REG + T_COPYOPS], %o4
brz %o4, 2f
nop
ldn [%o4 + CP_COPYOUT], %g2
jmp %g2
nop
2:
retl
mov -1, %o0
SET_SIZE(copyout)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyout(const void *kaddr, void *uaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(xcopyout)
sethi %hi(.xcopyout_err), REAL_LOFAULT
b .do_copyout
or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
.xcopyout_err:
ldn [THREAD_REG + T_COPYOPS], %o4
brz %o4, 2f
nop
ldn [%o4 + CP_XCOPYOUT], %g2
jmp %g2
nop
2:
retl
mov %g1, %o0
SET_SIZE(xcopyout)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyout_little(const void *kaddr, void *uaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(xcopyout_little)
sethi %hi(.little_err), %o4
ldn [THREAD_REG + T_LOFAULT], %o5
or %o4, %lo(.little_err), %o4
membar #Sync ! sync error barrier
stn %o4, [THREAD_REG + T_LOFAULT]
subcc %g0, %o2, %o3
add %o0, %o2, %o0
bz,pn %ncc, 2f ! check for zero bytes
sub %o2, 1, %o4
add %o1, %o2, %o1
ldub [%o0+%o3], %o4
1: stba %o4, [%o1+%o3]ASI_AIUSL
inccc %o3
sub %o0, 2, %o0 ! get next byte
bcc,a,pt %ncc, 1b
ldub [%o0+%o3], %o4
2: membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g0, %o0 ! return (0)
SET_SIZE(xcopyout_little)
#endif /* lint */
/*
* Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
*/
#if defined(lint)
/*ARGSUSED*/
int
copyin(const void *uaddr, void *kaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(copyin)
sethi %hi(.copyin_err), REAL_LOFAULT
or REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
.do_copyin:
!
! Check the length and bail if zero.
!
tst %o2
bnz,pt %ncc, 1f
nop
retl
clr %o0
1:
sethi %hi(copyio_fault), %o4
or %o4, %lo(copyio_fault), %o4
sethi %hi(copyio_fault_nowindow), %o3
ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
or %o3, %lo(copyio_fault_nowindow), %o3
membar #Sync
stn %o3, [THREAD_REG + T_LOFAULT]
mov %o0, SAVE_SRC
mov %o1, SAVE_DST
mov %o2, SAVE_COUNT
!
!
!
!
.dcibcp:
!
! pointer. A very fast 4 instruction loop.
!
.align 16
.dcicl:
stb %o4, [%o1 + %o3]
inccc %o3
bl,a,pt %ncc, .dcicl
lduba [%o0 + %o3]ASI_USER, %o4
!
!
!
!
.dci_ns:
!
!
!
! We're single byte aligned.
!
sethi %hi(hw_copy_limit_1), %o3
ld [%o3 + %lo(hw_copy_limit_1)], %o3
!
! Is HW copy on? If not do everything byte for byte.
!
tst %o3
bz,pn %icc, .dcibcp
subcc %o3, %o2, %o3
!
! Are we bigger than the HW limit? If not
! go to byte for byte.
!
bge,pt %ncc, .dcibcp
nop
!
!
.dcih8:
!
!
!
! We're eight byte aligned.
!
sethi %hi(hw_copy_limit_8), %o3
ld [%o3 + %lo(hw_copy_limit_8)], %o3
!
! Is HW assist on? If not, do it with the aligned copy.
!
tst %o3
bz,pn %icc, .dcis8
subcc %o3, %o2, %o3
bge %ncc, .dcis8
nop
ba,pt %ncc, .big_copyin
nop
.dcis8:
!
! Housekeeping for copy loops. Uses same idea as in the byte for
! byte copy loop above.
!
add %o0, %o2, %o0
add %o1, %o2, %o1
sub %g0, %o2, %o3
ba,pt %ncc, .didebc
srl %o2, 3, %o2 ! Number of 8 byte chunks to copy
!
! 4 byte aligned?
!
.dcih4:
bnz %ncc, .dcih2
sethi %hi(hw_copy_limit_4), %o3
ld [%o3 + %lo(hw_copy_limit_4)], %o3
!
! Is HW assist on? If not, do it with the aligned copy.
!
tst %o3
bz,pn %icc, .dcis4
subcc %o3, %o2, %o3
!
!
.dcis4:
!
!
.dcih2:
!
! We're two byte aligned. Check for "smallness"
! done in delay at .dcih4
!
bleu,pt %ncc, .dcis2
sethi %hi(hw_copy_limit_2), %o3
ld [%o3 + %lo(hw_copy_limit_2)], %o3
!
! Is HW assist on? If not, do it with the aligned copy.
!
tst %o3
bz,pn %icc, .dcis2
subcc %o3, %o2, %o3
!
! Are we larger than the HW limit?
!
bge %ncc, .dcis2
nop
!
!
!
!
.dcis2:
!
!
! code.
!
!
!
!
.align 32
.didebc:
!
!
!
!
!
!
.align 32
.didfbc:
!
!
!
!
!
! copy.
!
.align 32
.didtbc:
!
!
!
!
.dcifh:
!
! We're going off to do a block copy.
! Switch fault handlers and grab a window. We
! kernel data to this point.
!
stn %o4, [THREAD_REG + T_LOFAULT]
save %sp, -SA(MINFRAME), %sp
! Copyin requests that reach here are larger than 256 bytes. The
! hw_copy_limit_1 is set to 256. Never set this limit to less than
! 128 bytes.
.do_blockcopyin:
mov %i1, %i5
mov %i0, %i1
mov %i5, %i0
andcc %i0, 7, %i3 ! is dst double aligned
bz %ncc, copyin_blkcpy
sub %i3, 8, %i3
neg %i3 ! bytes till double aligned
sub %i2, %i3, %i2 ! update %i2 with new count
! Align Destination on double-word boundary
1: lduba [%i1]ASI_USER, %i4
inc %i1
stb %i4, [%i0]
deccc %i3
bgu %ncc, 1b
inc %i0
copyin_blkcpy:
andcc %i0, 63, %i3
bz,pn %ncc, copyin_blalign ! now block aligned
sub %i3, 64, %i3
neg %i3 ! bytes till block aligned
sub %i2, %i3, %i2 ! update %i2 with new count
! Copy %i3 bytes till dst is block (64 byte) aligned. use
! double word copies.
andcc %i1, 7, %g1 ! is src aligned on a 8 bytes
bz %ncc, .ci_dbcopy ! %g1 has source offset (last 3-bits)
sll %g1, 3, %l1 ! left shift
mov 0x40, %l2
sub %l2, %l1, %l2 ! right shift = (64 - left shift)
! Now use double word copies to align destination.
.ci_double:
sub %i1, %g1, %i1 ! align the src at 8 bytes.
ldxa [%i1]ASI_USER, %o2
2:
add %i1, 0x8, %i1
ldxa [%i1]ASI_USER, %o4
ALIGN_DATA_EW(%o2, %o4, %l1, %l2, %o3)
stx %o2, [%i0]
mov %o4, %o2
subcc %i3, 0x8, %i3
bgu,pt %ncc, 2b
add %i0, 0x8, %i0
ba copyin_blalign
add %i1, %g1, %i1
! Both source and destination are double aligned.
! No shift and merge of data required in this case.
.ci_dbcopy:
ldxa [%i1]ASI_USER, %o2
stx %o2, [%i0]
add %i1, 0x8, %i1
subcc %i3, 0x8, %i3
bgu,pt %ncc, .ci_dbcopy
add %i0, 0x8, %i0
copyin_blalign:
andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
sub %i2, %i3, %i2 ! Residue bytes in %i2
mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
andcc %i1, 0xf, %o2 ! is src quadword aligned
bz,pn %xcc, .ci_blkcpy ! src offset in %o2 (last 4-bits)
nop
cmp %o2, 0x8
bg .ci_upper_double
nop
bl .ci_lower_double
nop
! Falls through when source offset is equal to 8 i.e.
! source is double word aligned.
sub %i1, %o2, %i1 ! align the src at 16 bytes.
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetch [%l0+0x0], #one_read
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
.ci_loop0:
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
prefetch [%l0+0x40], #one_read
stxa %l3, [%i0+0x0]%asi
stxa %l4, [%i0+0x8]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
stxa %l5, [%i0+0x10]%asi
stxa %l2, [%i0+0x18]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
stxa %l3, [%i0+0x20]%asi
stxa %l4, [%i0+0x28]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
stxa %l5, [%i0+0x30]%asi
stxa %l2, [%i0+0x38]%asi
add %l0, 0x40, %l0
subcc %i3, 0x40, %i3
bgu,pt %xcc, .ci_loop0
add %i0, 0x40, %i0
ba .ci_blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
.ci_lower_double:
sub %i1, %o2, %i1 ! align the src at 16 bytes.
sll %o2, 3, %o0 ! %o0 left shift
mov 0x40, %o1
sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetch [%l0+0x0], #one_read
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 ! partial data in %l2
! and %l3 has complete
! data
.ci_loop1:
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 ! %l4 has partial data
! for this read.
ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4
! into %l2 and %l3
prefetch [%l0+0x40], #one_read
stxa %l2, [%i0+0x0]%asi
stxa %l3, [%i0+0x8]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and
! %l4 from previous read
! into %l4 and %l5
stxa %l4, [%i0+0x10]%asi
stxa %l5, [%i0+0x18]%asi
! Repeat the same for next 32 bytes.
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
stxa %l2, [%i0+0x20]%asi
stxa %l3, [%i0+0x28]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
stxa %l4, [%i0+0x30]%asi
stxa %l5, [%i0+0x38]%asi
add %l0, 0x40, %l0
subcc %i3, 0x40, %i3
bgu,pt %xcc, .ci_loop1
add %i0, 0x40, %i0
ba .ci_blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
.ci_upper_double:
sub %i1, %o2, %i1 ! align the src at 16 bytes.
sub %o2, 0x8, %o0
sll %o0, 3, %o0 ! %o0 left shift
mov 0x40, %o1
sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
andn %i1, 0x3f, %l0 ! %l0 has block aligned source
prefetch [%l0+0x0], #one_read
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 ! partial data in %l3
! for this read and
! no data in %l2
.ci_loop2:
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 ! %l4 has complete data
! and %l5 has partial
ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5
! into %l3 and %l4
prefetch [%l0+0x40], #one_read
stxa %l3, [%i0+0x0]%asi
stxa %l4, [%i0+0x8]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with
! %l5 from previous read
! into %l5 and %l2
stxa %l5, [%i0+0x10]%asi
stxa %l2, [%i0+0x18]%asi
! Repeat the same for next 32 bytes.
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
stxa %l3, [%i0+0x20]%asi
stxa %l4, [%i0+0x28]%asi
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
stxa %l5, [%i0+0x30]%asi
stxa %l2, [%i0+0x38]%asi
add %l0, 0x40, %l0
subcc %i3, 0x40, %i3
bgu,pt %xcc, .ci_loop2
add %i0, 0x40, %i0
ba .ci_blkdone
add %i1, %o2, %i1 ! increment the source by src offset
! the src offset was stored in %o2
! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
.ci_blkcpy:
andn %i1, 0x3f, %o0 ! %o0 has block aligned source
prefetch [%o0+0x0], #one_read
1:
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
add %i1, 0x10, %i1
prefetch [%o0+0x40], #one_read
stxa %l0, [%i0+0x0]%asi
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
add %i1, 0x10, %i1
ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6
add %i1, 0x10, %i1
stxa %l1, [%i0+0x8]%asi
stxa %l2, [%i0+0x10]%asi
stxa %l3, [%i0+0x18]%asi
stxa %l4, [%i0+0x20]%asi
stxa %l5, [%i0+0x28]%asi
stxa %l6, [%i0+0x30]%asi
stxa %l7, [%i0+0x38]%asi
add %o0, 0x40, %o0
subcc %i3, 0x40, %i3
bgu,pt %xcc, 1b
add %i0, 0x40, %i0
.ci_blkdone:
membar #Sync
! Copy as much rest of the data as double word copy.
.ci_dwcp:
cmp %i2, 0x8 ! Not enough bytes to copy as double
blu %ncc, .ci_dbdone
nop
andn %i2, 0x7, %i3 ! %i3 count is multiple of 8 bytes size
sub %i2, %i3, %i2 ! Residue bytes in %i2
andcc %i1, 7, %g1 ! is src aligned on a 8 bytes
bz %ncc, .ci_cpy_db
nop
sll %g1, 3, %l0 ! left shift
mov 0x40, %l1
sub %l1, %l0, %l1 ! right shift = (64 - left shift)
.ci_cpy_dbwd:
sub %i1, %g1, %i1 ! align the src at 8 bytes.
ldxa [%i1]ASI_USER, %o2
3:
add %i1, 0x8, %i1
ldxa [%i1]ASI_USER, %o4
ALIGN_DATA_EW(%o2, %o4, %l0, %l1, %o3)
stx %o2, [%i0]
mov %o4, %o2
subcc %i3, 0x8, %i3
bgu,pt %ncc, 3b
add %i0, 0x8, %i0
ba .ci_dbdone
add %i1, %g1, %i1
.ci_cpy_db:
ldxa [%i1]ASI_USER, %o2
stx %o2, [%i0]
add %i1, 0x8, %i1
subcc %i3, 0x8, %i3
bgu,pt %ncc, .ci_cpy_db
add %i0, 0x8, %i0
.ci_dbdone:
tst %i2
bz,pt %xcc, .copyin_exit
nop
! Copy the residue as byte copy
.ci_residue:
lduba [%i1]ASI_USER, %i4
stb %i4, [%i0]
inc %i1
deccc %i2
bgu %xcc, .ci_residue
inc %i0
.copyin_exit:
membar #Sync
stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
ret
restore %g0, 0, %o0
.copyin_err:
ldn [THREAD_REG + T_COPYOPS], %o4
brz %o4, 2f
nop
ldn [%o4 + CP_COPYIN], %g2
jmp %g2
nop
2:
retl
mov -1, %o0
SET_SIZE(copyin)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyin(const void *uaddr, void *kaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(xcopyin)
sethi %hi(.xcopyin_err), REAL_LOFAULT
b .do_copyin
or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
.xcopyin_err:
ldn [THREAD_REG + T_COPYOPS], %o4
brz %o4, 2f
nop
ldn [%o4 + CP_XCOPYIN], %g2
jmp %g2
nop
2:
retl
mov %g1, %o0
SET_SIZE(xcopyin)
#endif /* lint */
#ifdef lint
/*ARGSUSED*/
int
xcopyin_little(const void *uaddr, void *kaddr, size_t count)
{ return (0); }
#else /* lint */
ENTRY(xcopyin_little)
sethi %hi(.little_err), %o4
ldn [THREAD_REG + T_LOFAULT], %o5
or %o4, %lo(.little_err), %o4
membar #Sync ! sync error barrier
stn %o4, [THREAD_REG + T_LOFAULT]
subcc %g0, %o2, %o3
add %o0, %o2, %o0
bz,pn %ncc, 2f ! check for zero bytes
sub %o2, 1, %o4
add %o1, %o2, %o1
lduba [%o0+%o3]ASI_AIUSL, %o4
1: stb %o4, [%o1+%o3]
inccc %o3
sub %o0, 2, %o0 ! get next byte
bcc,a,pt %ncc, 1b
lduba [%o0+%o3]ASI_AIUSL, %o4
2: membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g0, %o0 ! return (0)
.little_err:
membar #Sync ! sync error barrier
stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
retl
mov %g1, %o0
SET_SIZE(xcopyin_little)
#endif /* lint */
/*
* Copy a block of storage - must not overlap (from + len <= to).
* No fault handler installed (to be called under on_fault())
*/
#if defined(lint)
/* ARGSUSED */
void
copyin_noerr(const void *ufrom, void *kto, size_t count)
{}
#else /* lint */
ENTRY(copyin_noerr)
sethi %hi(.copyio_noerr), REAL_LOFAULT
b .do_copyin
or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
.copyio_noerr:
jmp SAVED_LOFAULT
nop
SET_SIZE(copyin_noerr)
#endif /* lint */
/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */
#if defined(lint)
/* ARGSUSED */
void
copyout_noerr(const void *kfrom, void *uto, size_t count)
{}
#else /* lint */
/*
 * copyout_noerr - kernel-to-user copy with no private fault handling:
 * REAL_LOFAULT is pointed at .copyio_noerr (defined with copyin_noerr
 * above, which jumps to SAVED_LOFAULT -- the caller's on_fault()
 * handler).  Joins the common .do_copyout path.
 */
ENTRY(copyout_noerr)
sethi %hi(.copyio_noerr), REAL_LOFAULT
b .do_copyout
or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT ! delay slot
SET_SIZE(copyout_noerr)
#endif /* lint */
#if defined(lint)
int use_hw_bcopy = 1;
int use_hw_bzero = 1;
uint_t hw_copy_limit_1 = 0x100;
uint_t hw_copy_limit_2 = 0x200;
uint_t hw_copy_limit_4 = 0x400;
uint_t hw_copy_limit_8 = 0x400;
#else /* !lint */
/*
 * Patchable tunables for the hardware-assisted copy/zero paths.
 *
 * use_hw_bcopy / use_hw_bzero: non-zero enables the block-init
 * (hardware) path; bzero below tests use_hw_bzero before taking its
 * block-clearing path.
 *
 * hw_copy_limit_1/2/4/8: size thresholds keyed by operand alignment
 * (1-, 2-, 4-, 8-byte) -- presumably the byte count above which the
 * hardware copy path is chosen; the code that reads them is not in
 * this fragment, so confirm against the bcopy implementation.
 */
.align 4
DGDEF(use_hw_bcopy)
.word 1
DGDEF(use_hw_bzero)
.word 1
DGDEF(hw_copy_limit_1)
.word 0x100
DGDEF(hw_copy_limit_2)
.word 0x200
DGDEF(hw_copy_limit_4)
.word 0x400
DGDEF(hw_copy_limit_8)
.word 0x400
.align 64
.section ".text"
#endif /* !lint */
/*
 * hwblkclr - clears block-aligned, block-multiple-sized regions
 * (implementation elided from this fragment).
 */
#ifdef lint
/*ARGSUSED*/
int
{
return(0);
}
#else /* lint */
2:
3:
#endif /* lint */
#ifdef lint
/* Copy 32 bytes of data from src to dst using physical addresses */
/*ARGSUSED*/
void
{}
#else /*!lint */
/*
* Copy 32 bytes of data from src (%o0) to dst (%o1)
* using physical addresses.
*/
#endif /* lint */
/*
* Zero a block of storage.
*
* uzero is used by the kernel to zero a block in user address space.
*/
/*
*
* Stores of fewer than 7 bytes are done byte by byte.
*
* For less than 15 bytes stores, align the address on 4 byte boundary.
* Then store as many 4-byte chunks, followed by trailing bytes.
*
* For sizes greater than 15 bytes, align the address on 8 byte boundary.
* if (count > 128) {
* store as many 8-bytes chunks to block align the address
* store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero)
* }
* Store as many 8-byte chunks, followed by trailing bytes.
*/
#if defined(lint)
/* ARGSUSED */
int
{ return(0); }
/* ARGSUSED */
void
{}
#else /* lint */
!
!
!
!
/*
* We got here because of a fault during kzero or if
* uzero or bzero was called with t_lofault non-zero.
* Otherwise we've already run screaming from the room.
* Errno value is in %g1. Note that we're here iff
* we did set t_lofault.
*/
.zeroerr:
!
!
!
!
1:
2:
!
!
retl ! return
3:
!
! We're here because %o5 was non-zero. It was non-zero
! because either LOFAULT_SET was present, a previous fault
! handler was present or both. In all cases we need to reset
! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
! before we either simply return the error or we invoke the
! previously specified handler.
!
be %ncc, 2b
stn %o5, [THREAD_REG + T_LOFAULT]
jmp %o5 ! goto real handler
nop
SET_SIZE(kzero)
SET_SIZE(uzero)
#endif /* lint */
/*
* Zero a block of storage.
*/
#if defined(lint)
/* ARGSUSED */
void
bzero(void *addr, size_t count)
{}
#else /* lint */
/*
 * bzero(addr %o0, count %o1) - zero count bytes of memory.
 *
 * If the thread already had a t_lofault handler (saved in %o5), the
 * private handler .zeroerr is installed for the duration; if t_lofault
 * was zero, control goes straight to .do_zero with it untouched.
 * (The sethi in the branch delay slot executes on both paths.)
 *
 * Size dispatch:
 *   count < 7  : byte loop (.byteclr)
 *   count < 15 : word-align, then 4-byte loop (.wdalign/.wdclr)
 *   otherwise  : 8-byte align; if count >= 0x80 and use_hw_bzero is
 *                set, 64-byte-align and clear with block-init stxa
 *                stores through %asi, then finish the remainder via
 *                .bzero_small/.byteclr.
 *
 * NOTE(review): the epilogue that restores t_lofault lies past the end
 * of this fragment.
 */
ENTRY(bzero)
wr %g0, ASI_P, %asi ! default to primary address space
ldn [THREAD_REG + T_LOFAULT], %o5 ! save old vector
tst %o5
bz,pt %ncc, .do_zero ! t_lofault was clear: install nothing
sethi %hi(.zeroerr), %o2
or %o2, %lo(.zeroerr), %o2
membar #Sync ! sync error barrier
stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector
.do_zero:
cmp %o1, 7
blu,pn %ncc, .byteclr ! < 7 bytes: clear byte by byte
nop
cmp %o1, 15
blu,pn %ncc, .wdalign ! < 15 bytes: take 4-byte path
nop
andcc %o0, 7, %o3 ! is add aligned on a 8 byte bound
bz,pt %ncc, .blkalign ! already double aligned
sub %o3, 8, %o3 ! -(bytes till double aligned)
add %o1, %o3, %o1 ! update o1 with new count
! Clear the -(%o3) leading bytes to reach 8-byte alignment.
1:
stba %g0, [%o0]%asi
inccc %o3
bl,pt %ncc, 1b
inc %o0
! Now address is double aligned
.blkalign:
cmp %o1, 0x80 ! check if there are 128 bytes to set
blu,pn %ncc, .bzero_small
mov %o1, %o3
! Honor the use_hw_bzero tunable: zero means skip the block path.
sethi %hi(use_hw_bzero), %o2
ld [%o2 + %lo(use_hw_bzero)], %o2
tst %o2
bz %ncc, .bzero_small
mov %o1, %o3
! Select the block-init ASI: if the incoming %asi was ASI_P (kernel),
! keep ASI_BLK_INIT_ST_QUAD_LDD_P; otherwise the annulled delay slot
! switches to the user variant ASI_BLK_INIT_QUAD_LDD_AIUS.
rd %asi, %o3
wr %g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
cmp %o3, ASI_P
bne,a %ncc, .algnblk
wr %g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
.algnblk:
andcc %o0, 0x3f, %o3 ! is block aligned?
bz,pt %ncc, .bzero_blk
sub %o3, 0x40, %o3 ! -(bytes till block aligned)
add %o1, %o3, %o1 ! o1 is the remainder
! Clear -(%o3) bytes till block aligned
1:
stxa %g0, [%o0]%asi
addcc %o3, 8, %o3
bl,pt %ncc, 1b
add %o0, 8, %o0
.bzero_blk:
and %o1, 0x3f, %o3 ! calc bytes left after blk clear
andn %o1, 0x3f, %o4 ! calc size of blocks in bytes
cmp %o4, 0x100 ! 256 bytes or more
blu,pn %ncc, 3f
nop
! 256-byte unrolled loop.  The first store to each 64-byte line
! (+0x00/+0x40/+0x80/+0xc0) is issued before the line's remaining
! stores -- NOTE(review): presumably to let the block-init ASI's
! initializing store touch each cache line first; confirm against the
! UltraSPARC-T1 block-init store documentation.
2:
stxa %g0, [%o0+0x0]%asi
stxa %g0, [%o0+0x40]%asi
stxa %g0, [%o0+0x80]%asi
stxa %g0, [%o0+0xc0]%asi
stxa %g0, [%o0+0x8]%asi
stxa %g0, [%o0+0x10]%asi
stxa %g0, [%o0+0x18]%asi
stxa %g0, [%o0+0x20]%asi
stxa %g0, [%o0+0x28]%asi
stxa %g0, [%o0+0x30]%asi
stxa %g0, [%o0+0x38]%asi
stxa %g0, [%o0+0x48]%asi
stxa %g0, [%o0+0x50]%asi
stxa %g0, [%o0+0x58]%asi
stxa %g0, [%o0+0x60]%asi
stxa %g0, [%o0+0x68]%asi
stxa %g0, [%o0+0x70]%asi
stxa %g0, [%o0+0x78]%asi
stxa %g0, [%o0+0x88]%asi
stxa %g0, [%o0+0x90]%asi
stxa %g0, [%o0+0x98]%asi
stxa %g0, [%o0+0xa0]%asi
stxa %g0, [%o0+0xa8]%asi
stxa %g0, [%o0+0xb0]%asi
stxa %g0, [%o0+0xb8]%asi
stxa %g0, [%o0+0xc8]%asi
stxa %g0, [%o0+0xd0]%asi
stxa %g0, [%o0+0xd8]%asi
stxa %g0, [%o0+0xe0]%asi
stxa %g0, [%o0+0xe8]%asi
stxa %g0, [%o0+0xf0]%asi
stxa %g0, [%o0+0xf8]%asi
sub %o4, 0x100, %o4
cmp %o4, 0x100
bgu,pt %ncc, 2b ! loop while more than 256 bytes remain
add %o0, 0x100, %o0
3:
! ... check if 64 bytes to set
cmp %o4, 0x40
blu %ncc, .bzero_blk_done
nop
! 64-byte clearing loop; re-enters at 3: to recheck the remainder.
4:
stxa %g0, [%o0+0x0]%asi
stxa %g0, [%o0+0x8]%asi
stxa %g0, [%o0+0x10]%asi
stxa %g0, [%o0+0x18]%asi
stxa %g0, [%o0+0x20]%asi
stxa %g0, [%o0+0x28]%asi
stxa %g0, [%o0+0x30]%asi
stxa %g0, [%o0+0x38]%asi
subcc %o4, 0x40, %o4
bgu,pt %ncc, 3b
add %o0, 0x40, %o0
.bzero_blk_done:
membar #Sync
!
! Undo asi register setting.
!
rd %asi, %o4
wr %g0, ASI_P, %asi
cmp %o4, ASI_BLK_INIT_ST_QUAD_LDD_P
bne,a %ncc, .bzero_small ! user case: finish with ASI_USER stores
wr %g0, ASI_USER, %asi
.bzero_small:
! Set the remaining doubles
subcc %o3, 8, %o3 ! Can we store any doubles?
blu,pn %ncc, .byteclr
and %o1, 7, %o1 ! calc bytes left after doubles
.dbclr:
stxa %g0, [%o0]%asi ! Clear the doubles
subcc %o3, 8, %o3
bgeu,pt %ncc, .dbclr
add %o0, 8, %o0
ba .byteclr
nop
.wdalign:
andcc %o0, 3, %o3 ! is add aligned on a word boundary
bz,pn %ncc, .wdclr
andn %o1, 3, %o3 ! create word sized count in %o3
dec %o1 ! decrement count
stba %g0, [%o0]%asi ! clear a byte
ba .wdalign
inc %o0 ! next byte
.wdclr:
sta %g0, [%o0]%asi ! 4-byte clearing loop
subcc %o3, 4, %o3
bnz,pt %ncc, .wdclr
inc 4, %o0
and %o1, 3, %o1 ! leftover count, if any
.byteclr:
! Set the leftover bytes
brz %o1, .bzero_exit
nop
7:
deccc %o1 ! byte clearing loop
stba %g0, [%o0]%asi
bgu,pt %ncc, 7b
inc %o0
.bzero_exit:
!
!
1:
#endif /* lint */