/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
.file "memcpy.s"
/*
* memcpy(s1, s2, len)
*
* Copy s2 to s1, always copying n bytes.
* Note: this code does not work for overlapped copies;
* memmove() and bcopy() do.
*
* The added entry point __align_cpy_1 is provided primarily for use by compilers.
*
* Fast assembly-language version of the following C program for memcpy,
* which represents the `standard' for the C library.
*
* void *
* memcpy(void *s, const void *s0, size_t n)
* {
* if (n != 0) {
* char *s1 = s;
* const char *s2 = s0;
* do {
* *s1++ = *s2++;
* } while (--n != 0);
* }
* return (s);
* }
*
*
* N1 Flow :
*
* if (count < 17) {
* Do the byte copy
* Return destination address
* }
* if (count < 128) {
* Is the source aligned on a word boundary?
* If not, align the source on a word boundary, then goto .ald
* If yes, goto .ald
* .ald:
* Is destination aligned on word boundary
* Depending on destination offset (last 2 bits of destination)
* copy data by shifting and merging.
* Copy residue bytes as byte copy
* Return destination address
* } else {
* Align destination on block boundary
* Depending on the source offset (last 4 bits of source address) align
* the data and store to destination. Both the load and store are done
* using ASI_BLK_INIT_ST_QUAD_LDD_P.
* For remaining count copy as much data in 8-byte chunk from source to
* destination.
* Followed by trailing copy using byte copy.
* Return saved destination address
* }
*
*
* N2 Flow :
*
* if (count < 128) {
* if count < 3
* copy bytes; exit with dst addr
* if src & dst aligned on word boundary but not long word boundary,
* copy with ldw/stw; branch to finish_up
* if src & dst aligned on long word boundary
* copy with ldx/stx; branch to finish_up
* if src & dst not aligned and length <= 14
* copy bytes; exit with dst addr
* move enough bytes to get src to word boundary
* if dst now on word boundary
* move_words:
* copy words; branch to finish_up
* if dst now on half word boundary
* load words, shift half words, store words; branch to finish_up
* if dst on byte 1
* load words, shift 3 bytes, store words; branch to finish_up
* if dst on byte 3
* load words, shift 1 byte, store words; branch to finish_up
* finish_up:
* copy bytes; exit with dst addr
* } else { 128 or more bytes
* move bytes until dst is on long word boundary
* if( src is on long word boundary ) {
* if (count < 512) {
* finish_long: src/dst aligned on 8 bytes
* copy with ldx/stx in 8-way unrolled loop;
* copy final 0-63 bytes; exit with dst addr
* } else { src/dst aligned; count > 512
* align dst on 64 byte boundary; use 8-way test for each of 8 possible
* src alignments relative to a 64 byte boundary to select the
* 16-way unrolled loop to use for
* block load, fmovd, block-init-store, block-store, fmovd operations
* then go to finish_long.
* }
* } else { src/dst not aligned on 8 bytes
* if src is word aligned and count < 512
* move words in 8-way unrolled loop
* move final 0-31 bytes; exit with dst addr
* if count < 512
* use alignaddr/faligndata combined with ldd/std in 8-way
* unrolled loop to move data.
* go to unalign_done
* else
* setup alignaddr for faligndata instructions
* align dst on 64 byte boundary; use 8-way test for each of 8 possible
* src alignments to nearest long word relative to 64 byte boundary to
* select the 8-way unrolled loop to use for
* block load, falign, fmovd, block-init-store, block-store loop
* (only use block-init-store when src/dst on 8 byte boundaries.)
* unalign_done:
* move remaining bytes for unaligned cases. exit with dst addr.
* }
*
* Comment on N2 memmove and memcpy common code and block-store-init:
* The man page for memmove specifies that copying takes place correctly
* between objects that overlap. For memcpy, the behavior is undefined
* for objects that overlap.
*
* In rare cases, some multi-threaded applications may attempt to examine
* the copy destination buffer during the copy. Using the block-store-init
* instruction can cause those applications to observe zeros in some
* cache lines of the destination buffer for narrow windows. However,
* block-store-init provides memory throughput advantages for many
* common applications. To meet both needs, applications which need the
* destination buffer to retain meaning during the copy should use
* memmove instead of memcpy. The memmove version duplicates the memcpy
* algorithms except that it does not use block-store-init in those
* cases where memcpy does. Otherwise, when memmove can determine that
* the source and destination do not overlap, memmove shares the memcpy
* code. (A rough C sketch of that dispatch follows this comment block.)
*/
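/*
 * Illustration only: a rough C sketch of the overlap test that the memmove
 * entry point below performs before sharing the memcpy code. The helper
 * name copy_backwards() is hypothetical; the real backward copy is the
 * assembly that follows the memmove entry.
 *
 *	void *
 *	memmove(void *dst, const void *src, size_t len)
 *	{
 *		uintptr_t d = (uintptr_t)dst, s = (uintptr_t)src;
 *
 *		if (s >= d || len <= (size_t)(d - s))
 *			return (memcpy(dst, src, len));
 *		copy_backwards(dst, src, len);
 *		return (dst);
 *	}
 */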
#include <sys/asm_linkage.h>
#include <sys/niagaraasi.h>
#include <sys/asi.h>
#include <sys/trap.h>
/* documented name for primary block initializing store */
#define ASI_STBI_P ASI_BLK_INIT_ST_QUAD_LDD_P
#define BLOCK_SIZE 64
#define FPRS_FEF 0x4
#define SHORTCOPY 3
#define SHORTCHECK 14
#define SHORT_LONG 64 /* max copy for short longword-aligned case */
/* must be at least 32 */
#define SMALL_MAX 128
#define MED_UMAX 512 /* max copy for medium un-aligned case */
#define MED_WMAX 512 /* max copy for medium word-aligned case */
#define MED_MAX 512 /* max copy for medium longword-aligned case */
#ifdef NIAGARA2_IMPL
#include <sys/sun4asi.h>
#else /* NIAGARA2_IMPL */
/*
* This define is used to align data for the unaligned source cases.
* data1, data2 and data3 are merged into data1 and data2; data3 is
* preserved for the next merge. (A C sketch of the merge follows the
* macro definitions below.)
*/
#define ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp) \
sllx data1, lshift, data1 ;\
srlx data2, rshift, tmp ;\
or data1, tmp, data1 ;\
sllx data2, lshift, data2 ;\
srlx data3, rshift, tmp ;\
or data2, tmp, data2
/*
* Align the data. Merge data1 and data2 into data1.
*/
#define ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp) \
sllx data1, lshift, data1 ;\
srlx data2, rshift, tmp ;\
or data1, tmp, data1
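/*
 * Illustration only: a rough C sketch of the shift-and-merge performed by
 * the ALIGN_DATA macro on 64-bit values, assuming the callers set
 * rshift == 64 - lshift. The names mirror the macro arguments.
 *
 *	d1 = (d1 << lshift) | (d2 >> rshift);
 *	d2 = (d2 << lshift) | (d3 >> rshift);
 *
 * ALIGN_DATA_EW performs only the first of the two statements.
 */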
#endif /* NIAGARA2_IMPL */
ANSI_PRAGMA_WEAK(memmove,function)
ANSI_PRAGMA_WEAK(memcpy,function)
ENTRY(memmove)
cmp %o1, %o0 ! if from (src) address >= to (dst) address,
bgeu,pn %ncc, .forcpy ! a forward copy is safe
sub %o0, %o1, %o4 ! get difference of two addresses
cmp %o2, %o4 ! compare size and difference of addresses
bleu,pn %ncc, .forcpy ! if size fits in the gap, no overlap; copy forward
add %o1, %o2, %o5 ! get to end of source space
!
! an overlapped copy that must be done "backwards"
!
.chksize:
cmp %o2, 8 ! if size is less than 8 bytes,
blu,pt %ncc, 2f ! do the byte copy
! Now size is at least 8
.dbalign:
add %o0, %o2, %g1 ! get to end of dest space
andcc %g1, 7, %o3 ! %o3 has bytes till dst 8 bytes aligned
bz,a,pn %ncc, .dbbck ! if dst is already 8 byte aligned, skip the align loop
andn %o2, 7, %o3 ! %o3 count is multiple of 8 bytes size
sub %o2, %o3, %o2 ! update o2 with new count
1: dec %o5 ! decrement source
ldub [%o5], %g1 ! load one byte
deccc %o3 ! decrement count
bgu,pt %ncc, 1b ! if not done keep copying
stb %g1, [%o5+%o4] ! store one byte into dest
andncc %o2, 7, %o3 ! %o3 count is multiple of 8 bytes size
bz,pn %ncc, 2f ! if size < 8, move to byte copy
! Now Destination is 8 byte aligned
.dbbck:
andcc %o5, 7, %o0 ! %o0 has src offset
bz,a,pn %ncc, .dbcopybc ! if src is 8 byte aligned, use the fast move
sub %o2, %o3, %o2 ! Residue bytes in %o2
.cpy_dbwdbc: ! alignment of src is needed
sub %o2, 8, %o2 ! set size one loop ahead
sll %o0, 3, %g1 ! %g1 is left shift
mov 64, %g5 ! init %g5 to be 64
sub %g5, %g1, %g5 ! %g5 right shift = (64 - left shift)
sub %o5, %o0, %o5 ! align the src at 8 bytes.
add %o4, %o0, %o4 ! increase difference between src & dst
ldx [%o5], %o1 ! load first 8 bytes
srlx %o1, %g5, %o1
1: sub %o5, 8, %o5 ! subtract 8 from src
ldx [%o5], %o0 ! load 8 byte
sllx %o0, %g1, %o3 ! shift loaded 8 bytes left into tmp reg
or %o1, %o3, %o3 ! align data
stx %o3, [%o5+%o4] ! store 8 byte
subcc %o2, 8, %o2 ! subtract 8 byte from size
bg,pt %ncc, 1b ! if size > 0 continue
srlx %o0, %g5, %o1 ! move extra byte for the next use
srl %g1, 3, %o0 ! restore %o0 value for alignment
add %o5, %o0, %o5 ! restore src alignment
sub %o4, %o0, %o4 ! restore difference between src & dest
ba 2f ! branch to the trailing byte copy
add %o2, 8, %o2 ! restore size value
.dbcopybc: ! alignment of src is not needed
1: sub %o5, 8, %o5 ! subtract from src
ldx [%o5], %g1 ! load 8 bytes
subcc %o3, 8, %o3 ! subtract from size
bgu,pt %ncc, 1b ! if size is bigger than 0, continue
stx %g1, [%o5+%o4] ! store 8 bytes to destination
ba 2f
nop
.bcbyte:
1: ldub [%o5], %g1 ! load one byte
stb %g1, [%o5+%o4] ! store one byte
2: deccc %o2 ! decrement size
bgeu,a,pt %ncc, 1b ! if size is >= 0 continue
dec %o5 ! decrement from address
.exitbc: ! exit from backward copy
retl
add %o5, %o4, %o0 ! restore dest addr
#ifdef NIAGARA2_IMPL
!
! Check to see if this memmove is a large aligned copy.
! If so, use a special version of the copy that avoids
! use of block-store-init.
!
.forcpy:
cmp %o2, SMALL_MAX ! check for not small case
blt,pn %ncc, .mv_short ! merge with memcpy
mov %o0, %g1 ! save %o0
neg %o0, %o5
andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned
brz,pt %o5, .mv_dst_aligned_on_8
! %o5 has the bytes to be written in partial store.
sub %o2, %o5, %o2
sub %o1, %o0, %o1 ! %o1 gets the difference
7: ! dst aligning loop
ldub [%o1+%o0], %o4 ! load one byte
subcc %o5, 1, %o5
stb %o4, [%o0]
bgu,pt %ncc, 7b
add %o0, 1, %o0 ! advance dst
add %o1, %o0, %o1 ! restore %o1
.mv_dst_aligned_on_8:
andcc %o1, 7, %o5
brnz,pt %o5, .src_dst_unaligned_on_8
prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read
.mv_src_dst_aligned_on_8:
! check if we are copying MED_MAX or more bytes
cmp %o2, MED_MAX ! limit to store buffer size
bleu,pt %ncc, .medlong
prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
/*
* The following memmove code mimics the memcpy code for large aligned copies,
* but does not use the ASI_STBI_P (block initializing store) performance
* optimization. See memmove rationale section in documentation
*/
.mv_large_align8_copy: ! Src and dst share 8 byte alignment
rd %fprs, %g5 ! check for unused fp
! if fprs.fef == 0, set it.
! Setting it when already set costs more than checking
andcc %g5, FPRS_FEF, %g5 ! test FEF, fprs.du = fprs.dl = 0
bz,a %ncc, 1f
wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
1:
! align dst to 64 byte boundary
andcc %o0, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned
brz,pn %o3, .mv_aligned_on_64
sub %o3, 64, %o3 ! %o3 has negative bytes to move
add %o2, %o3, %o2 ! adjust remaining count
.mv_align_to_64:
ldx [%o1], %o4
add %o1, 8, %o1 ! increment src ptr
addcc %o3, 8, %o3
stx %o4, [%o0]
brnz,pt %o3, .mv_align_to_64
add %o0, 8, %o0 ! increment dst ptr
.mv_aligned_on_64:
prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
mov %asi,%o4 ! save %asi
! Determine source alignment
! to correct 8 byte offset
andcc %o1, 0x20, %o3
brnz,pn %o3, .mv_align_1
mov ASI_BLK_P, %asi ! setup %asi for block load/store
andcc %o1, 0x10, %o3
brnz,pn %o3, .mv_align_01
nop
andcc %o1, 0x08, %o3
brz,pn %o3, .mv_align_000
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
ba .mv_align_001
nop
.mv_align_01:
andcc %o1, 0x08, %o3
brnz,pn %o3, .mv_align_011
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
ba .mv_align_010
nop
.mv_align_1:
andcc %o1, 0x10, %o3
brnz,pn %o3, .mv_align_11
nop
andcc %o1, 0x08, %o3
brnz,pn %o3, .mv_align_101
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
ba .mv_align_100
nop
.mv_align_11:
andcc %o1, 0x08, %o3
brz,pn %o3, .mv_align_110
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
.mv_align_111:
! Alignment off by 8 bytes
ldd [%o1], %d0
add %o1, 8, %o1
sub %o2, 8, %o2
andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
and %o2, 0x7f, %o2 ! residue bytes in %o2
.mv_align_111_loop:
subcc %o5, 128, %o5
/* ---- copy line 1 of 2. ---- */
ldda [%o1]%asi,%d16 ! block load
fmovd %d16, %d2
fmovd %d18, %d4
fmovd %d20, %d6
fmovd %d22, %d8
fmovd %d24, %d10
fmovd %d26, %d12
fmovd %d28, %d14
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
fmovd %d30, %d0
/* ---- copy line 2 of 2. ---- */
ldda [%o1+64]%asi,%d16
fmovd %d16, %d2
fmovd %d18, %d4
fmovd %d20, %d6
fmovd %d22, %d8
fmovd %d24, %d10
fmovd %d26, %d12
fmovd %d28, %d14
add %o1, 128, %o1 ! increment src
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d30, %d0
bgt,pt %ncc, .mv_align_111_loop
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
std %d0, [%o0]
ba .remain_stuff
add %o0, 8, %o0
! END OF mv_align_111
.mv_align_110:
! Alignment off by 16 bytes
ldd [%o1], %d0
ldd [%o1+8], %d2
add %o1, 16, %o1
sub %o2, 16, %o2
andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
and %o2, 0x7f, %o2 ! residue bytes in %o2
.mv_align_110_loop:
subcc %o5, 128, %o5
/* ---- copy line 1 of 2. ---- */
ldda [%o1]%asi,%d16 ! block load
fmovd %d16, %d4
fmovd %d18, %d6
fmovd %d20, %d8
fmovd %d22, %d10
fmovd %d24, %d12
fmovd %d26, %d14
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d28, %d0
prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
fmovd %d30, %d2
/* ---- copy line 2 of 2. ---- */
ldda [%o1+64]%asi,%d16
fmovd %d16, %d4
fmovd %d18, %d6
fmovd %d20, %d8
fmovd %d22, %d10
fmovd %d24, %d12
fmovd %d26, %d14
add %o1, 128, %o1 ! increment src
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d28, %d0
fmovd %d30, %d2
bgt,pt %ncc, .mv_align_110_loop
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
std %d0, [%o0]
std %d2, [%o0+8]
ba .remain_stuff
add %o0, 16, %o0
! END OF mv_align_110
.mv_align_101:
! Alignment off by 24 bytes
ldd [%o1], %d0
ldd [%o1+8], %d2
ldd [%o1+16], %d4
add %o1, 24, %o1
sub %o2, 24, %o2
andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
and %o2, 0x7f, %o2 ! residue bytes in %o2
.mv_align_101_loop:
subcc %o5, 128, %o5
/* ---- copy line 1 of 2. ---- */
ldda [%o1]%asi,%d16 ! block load
fmovd %d16, %d6
fmovd %d18, %d8
fmovd %d20, %d10
fmovd %d22, %d12
fmovd %d24, %d14
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d26, %d0
fmovd %d28, %d2
prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
fmovd %d30, %d4
/* ---- copy line 2 of 2. ---- */
ldda [%o1+64]%asi,%d16
fmovd %d16, %d6
fmovd %d18, %d8
fmovd %d20, %d10
fmovd %d22, %d12
fmovd %d24, %d14
add %o1, 128, %o1 ! increment src
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d26, %d0
fmovd %d28, %d2
fmovd %d30, %d4
bgt,pt %ncc, .mv_align_101_loop
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
std %d0, [%o0]
std %d2, [%o0+8]
std %d4, [%o0+16]
ba .remain_stuff
add %o0, 24, %o0
! END OF mv_align_101
.mv_align_100:
! Alignment off by 32 bytes
ldd [%o1], %d0
ldd [%o1+8], %d2
ldd [%o1+16],%d4
ldd [%o1+24],%d6
add %o1, 32, %o1
sub %o2, 32, %o2
andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
and %o2, 0x7f, %o2 ! residue bytes in %o2
.mv_align_100_loop:
subcc %o5, 128, %o5
/* ---- copy line 1 of 2. ---- */
ldda [%o1]%asi,%d16 ! block load
fmovd %d16, %d8
fmovd %d18, %d10
fmovd %d20, %d12
fmovd %d22, %d14
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d24, %d0
fmovd %d26, %d2
fmovd %d28, %d4
prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
fmovd %d30, %d6
/* ---- copy line 2 of 2. ---- */
ldda [%o1+64]%asi,%d16
fmovd %d16, %d8
fmovd %d18, %d10
fmovd %d20, %d12
fmovd %d22, %d14
add %o1, 128, %o1 ! increment src
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d24, %d0
fmovd %d26, %d2
fmovd %d28, %d4
fmovd %d30, %d6
bgt,pt %ncc, .mv_align_100_loop
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
std %d0, [%o0]
std %d2, [%o0+8]
std %d4, [%o0+16]
std %d6, [%o0+24]
ba .remain_stuff
add %o0, 32, %o0
! END OF mv_align_100
.mv_align_011:
! Alignment off by 40 bytes
ldd [%o1], %d0
ldd [%o1+8], %d2
ldd [%o1+16], %d4
ldd [%o1+24], %d6
ldd [%o1+32], %d8
add %o1, 40, %o1
sub %o2, 40, %o2
andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
and %o2, 0x7f, %o2 ! residue bytes in %o2
.mv_align_011_loop:
subcc %o5, 128, %o5
/* ---- copy line 1 of 2. ---- */
ldda [%o1]%asi,%d16 ! block load
fmovd %d16, %d10
fmovd %d18, %d12
fmovd %d20, %d14
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d22, %d0
fmovd %d24, %d2
fmovd %d26, %d4
fmovd %d28, %d6
prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
fmovd %d30, %d8
/* ---- copy line 2 of 2. ---- */
ldda [%o1+64]%asi,%d16
fmovd %d16, %d10
fmovd %d18, %d12
fmovd %d20, %d14
add %o1, 128, %o1 ! increment src
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d22, %d0
fmovd %d24, %d2
fmovd %d26, %d4
fmovd %d28, %d6
fmovd %d30, %d8
bgt,pt %ncc, .mv_align_011_loop
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
std %d0, [%o0]
std %d2, [%o0+8]
std %d4, [%o0+16]
std %d6, [%o0+24]
std %d8, [%o0+32]
ba .remain_stuff
add %o0, 40, %o0
! END OF mv_align_011
.mv_align_010:
! Alignment off by 48 bytes
ldd [%o1], %d0
ldd [%o1+8], %d2
ldd [%o1+16], %d4
ldd [%o1+24], %d6
ldd [%o1+32], %d8
ldd [%o1+40], %d10
add %o1, 48, %o1
sub %o2, 48, %o2
andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
and %o2, 0x7f, %o2 ! residue bytes in %o2
.mv_align_010_loop:
subcc %o5, 128, %o5
/* ---- copy line 1 of 2. ---- */
ldda [%o1]%asi,%d16 ! block load
fmovd %d16, %d12
fmovd %d18, %d14
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d20, %d0
fmovd %d22, %d2
fmovd %d24, %d4
fmovd %d26, %d6
fmovd %d28, %d8
prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
fmovd %d30, %d10
/* ---- copy line 2 of 2. ---- */
ldda [%o1+64]%asi,%d16
fmovd %d16, %d12
fmovd %d18, %d14
add %o1, 128, %o1 ! increment src
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d20, %d0
fmovd %d22, %d2
fmovd %d24, %d4
fmovd %d26, %d6
fmovd %d28, %d8
fmovd %d30, %d10
bgt,pt %ncc, .mv_align_010_loop
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
std %d0, [%o0]
std %d2, [%o0+8]
std %d4, [%o0+16]
std %d6, [%o0+24]
std %d8, [%o0+32]
std %d10, [%o0+40]
ba .remain_stuff
add %o0, 48, %o0
! END OF mv_align_010
.mv_align_001:
! Alignment off by 56 bytes
ldd [%o1], %d0
ldd [%o1+8], %d2
ldd [%o1+16], %d4
ldd [%o1+24], %d6
ldd [%o1+32], %d8
ldd [%o1+40], %d10
ldd [%o1+48], %d12
add %o1, 56, %o1
sub %o2, 56, %o2
andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
and %o2, 0x7f, %o2 ! residue bytes in %o2
.mv_align_001_loop:
subcc %o5, 128, %o5
/* ---- copy line 1 of 2. ---- */
ldda [%o1]%asi,%d16 ! block load
fmovd %d16, %d14
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d18, %d0
fmovd %d20, %d2
fmovd %d22, %d4
fmovd %d24, %d6
fmovd %d26, %d8
fmovd %d28, %d10
prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
fmovd %d30, %d12
/* ---- copy line 2 of 2. ---- */
ldda [%o1+64]%asi,%d16
fmovd %d16, %d14
add %o1, 128, %o1 ! increment src
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d18, %d0
fmovd %d20, %d2
fmovd %d22, %d4
fmovd %d24, %d6
fmovd %d26, %d8
fmovd %d28, %d10
fmovd %d30, %d12
bgt,pt %ncc, .mv_align_001_loop
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
std %d0, [%o0]
std %d2, [%o0+8]
std %d4, [%o0+16]
std %d6, [%o0+24]
std %d8, [%o0+32]
std %d10, [%o0+40]
std %d12, [%o0+48]
ba .remain_stuff
add %o0, 56, %o0
! END OF mv_align_001
.mv_align_000:
andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
and %o2, 0x7f, %o2 ! residue bytes in %o2
.mv_align_000_loop:
/* ---- copy line 1 of 2. ---- */
subcc %o5, 128, %o5
ldda [%o1]%asi,%d0
stda %d0,[%o0]%asi
prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
/* ---- copy line 2 of 2. ---- */
add %o0, 64, %o0
ldda [%o1+64]%asi,%d0
add %o1, 128, %o1 ! increment src
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! increment dst
bgt,pt %ncc, .mv_align_000_loop
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
ba .remain_stuff
nop
! END OF mv_align_000
#else /* NIAGARA2_IMPL */
#endif /* NIAGARA2_IMPL */
SET_SIZE(memmove)
ENTRY(memcpy)
ENTRY(__align_cpy_1)
#ifdef NIAGARA2_IMPL
cmp %o2, SMALL_MAX ! check for not small case
bgeu,pn %ncc, .medium ! go to larger cases
mov %o0, %g1 ! save %o0
.mv_short:
cmp %o2, SHORTCOPY ! check for really short case
ble,pt %ncc, .smallfin
or %o0, %o1, %o4 ! prepare alignment check
andcc %o4, 0x3, %o5 ! test for alignment
bz,pt %ncc, .smallword ! branch to word aligned case
cmp %o2, SHORTCHECK
ble,pt %ncc, .smallrest
andcc %o1, 0x3, %o5 ! is src word aligned
bz,pn %ncc, .aldst
cmp %o5, 2 ! is src half-word aligned
be,pt %ncc, .s2algn
cmp %o5, 3 ! src is byte aligned
.s1algn:ldub [%o1], %o3 ! move 1 or 3 bytes to align it
inc 1, %o1
stb %o3, [%o0] ! move a byte to align src
inc 1, %o0
bne,pt %ncc, .s2algn
dec %o2
b .ald ! now go align dest
andcc %o0, 0x3, %o5
.s2algn:lduh [%o1], %o3 ! know src is 2 byte aligned
inc 2, %o1
srl %o3, 8, %o4
stb %o4, [%o0] ! have to do bytes,
stb %o3, [%o0 + 1] ! don't know dst alignment
inc 2, %o0
dec 2, %o2
.aldst: andcc %o0, 0x3, %o5 ! align the destination address
.ald: bz,pn %ncc, .w4cp
cmp %o5, 2
be,pn %ncc, .w2cp
cmp %o5, 3
.w3cp: lduw [%o1], %o4
inc 4, %o1
srl %o4, 24, %o5
stb %o5, [%o0]
bne,pt %ncc, .w1cp
inc %o0
dec 1, %o2
andn %o2, 3, %o3 ! %o3 is aligned word count
dec 4, %o3 ! avoid reading beyond tail of src
sub %o1, %o0, %o1 ! %o1 gets the difference
1: sll %o4, 8, %g5 ! save residual bytes
lduw [%o1+%o0], %o4
deccc 4, %o3
srl %o4, 24, %o5 ! merge with residual
or %o5, %g5, %g5
st %g5, [%o0]
bnz,pt %ncc, 1b
inc 4, %o0
sub %o1, 3, %o1 ! used one byte of last word read
and %o2, 3, %o2
b 7f
inc 4, %o2
.w1cp: srl %o4, 8, %o5
sth %o5, [%o0]
inc 2, %o0
dec 3, %o2
andn %o2, 3, %o3 ! %o3 is aligned word count
dec 4, %o3 ! avoid reading beyond tail of src
sub %o1, %o0, %o1 ! %o1 gets the difference
2: sll %o4, 24, %g5 ! save residual bytes
lduw [%o1+%o0], %o4
deccc 4, %o3
srl %o4, 8, %o5 ! merge with residual
or %o5, %g5, %g5
st %g5, [%o0]
bnz,pt %ncc, 2b
inc 4, %o0
sub %o1, 1, %o1 ! used three bytes of last word read
and %o2, 3, %o2
b 7f
inc 4, %o2
.w2cp: lduw [%o1], %o4
inc 4, %o1
srl %o4, 16, %o5
sth %o5, [%o0]
inc 2, %o0
dec 2, %o2
andn %o2, 3, %o3 ! %o3 is aligned word count
dec 4, %o3 ! avoid reading beyond tail of src
sub %o1, %o0, %o1 ! %o1 gets the difference
3: sll %o4, 16, %g5 ! save residual bytes
lduw [%o1+%o0], %o4
deccc 4, %o3
srl %o4, 16, %o5 ! merge with residual
or %o5, %g5, %g5
st %g5, [%o0]
bnz,pt %ncc, 3b
inc 4, %o0
sub %o1, 2, %o1 ! used two bytes of last word read
and %o2, 3, %o2
b 7f
inc 4, %o2
.w4cp: andn %o2, 3, %o3 ! %o3 is aligned word count
sub %o1, %o0, %o1 ! %o1 gets the difference
1: lduw [%o1+%o0], %o4 ! read from address
deccc 4, %o3 ! decrement count
st %o4, [%o0] ! write at destination address
bgu,pt %ncc, 1b
inc 4, %o0 ! increment to address
and %o2, 3, %o2 ! number of leftover bytes, if any
! simple finish up byte copy, works with any alignment
7:
add %o1, %o0, %o1 ! restore %o1
.smallrest:
tst %o2
bz,pt %ncc, .smallx
cmp %o2, 4
blt,pt %ncc, .smallleft3
nop
sub %o2, 3, %o2
.smallnotalign4:
ldub [%o1], %o3 ! read byte
subcc %o2, 4, %o2 ! reduce count by 4
stb %o3, [%o0] ! write byte
ldub [%o1+1], %o3 ! repeat for total of 4 bytes
add %o1, 4, %o1 ! advance SRC by 4
stb %o3, [%o0+1]
ldub [%o1-2], %o3
add %o0, 4, %o0 ! advance DST by 4
stb %o3, [%o0-2]
ldub [%o1-1], %o3
bgu,pt %ncc, .smallnotalign4 ! loop til 3 or fewer bytes remain
stb %o3, [%o0-1]
addcc %o2, 3, %o2 ! restore count
bz,pt %ncc, .smallx
.smallleft3: ! 1, 2, or 3 bytes remain
subcc %o2, 1, %o2
ldub [%o1], %o3 ! load one byte
bz,pt %ncc, .smallx
stb %o3, [%o0] ! store one byte
ldub [%o1+1], %o3 ! load second byte
subcc %o2, 1, %o2
bz,pt %ncc, .smallx
stb %o3, [%o0+1] ! store second byte
ldub [%o1+2], %o3 ! load third byte
stb %o3, [%o0+2] ! store third byte
.smallx:
retl
mov %g1, %o0 ! restore %o0
.smallfin:
tst %o2
bnz,pt %ncc, .smallleft3
nop
retl
mov %g1, %o0 ! restore %o0
.align 16
.smallwords:
lduw [%o1], %o3 ! read word
.smallwordx:
subcc %o2, 8, %o2 ! update count
stw %o3, [%o0] ! write word
add %o1, 8, %o1 ! update SRC
lduw [%o1-4], %o3 ! read word
add %o0, 8, %o0 ! update DST
bgu,pt %ncc, .smallwords ! loop until done
stw %o3, [%o0-4] ! write word
addcc %o2, 7, %o2 ! restore count
bz,pt %ncc, .smallexit ! check for completion
cmp %o2, 4 ! check for 4 or more bytes left
blt %ncc, .smallleft3 ! if not, go to finish up
nop
lduw [%o1], %o3
add %o1, 4, %o1
subcc %o2, 4, %o2
add %o0, 4, %o0
bnz,pt %ncc, .smallleft3
stw %o3, [%o0-4]
retl
mov %g1, %o0 ! restore %o0
! 8 or more bytes, src and dest start on word boundary
! %o4 contains or %o0, %o1; %o3 contains first four bytes of src
.smalllong:
andcc %o4, 0x7, %o5 ! test for long alignment
bnz,pt %ncc, .smallwordx ! branch to word aligned case
cmp %o2, SHORT_LONG-7
bge,a %ncc, .medl64 ! if we branch
sub %o2,56,%o2 ! adjust %o2 to -31 off count
sub %o1, %o0, %o1 ! %o1 gets the difference
.small_long_l:
ldx [%o1+%o0], %o3
subcc %o2, 8, %o2
add %o0, 8, %o0
bgu,pt %ncc, .small_long_l ! loop until done
stx %o3, [%o0-8] ! write word
add %o1, %o0, %o1 ! restore %o1
addcc %o2, 7, %o2 ! restore %o2 to correct count
bz,pt %ncc, .smallexit ! check for completion
cmp %o2, 4 ! check for 4 or more bytes left
blt,pt %ncc, .smallleft3 ! if not, go to finish up
nop
lduw [%o1], %o3
add %o1, 4, %o1
subcc %o2, 4, %o2
stw %o3, [%o0]
add %o0, 4, %o0
bnz,pt %ncc, .smallleft3
nop
retl
mov %g1, %o0 ! restore %o0
.align 16
! src and dest start on word boundary
.smallword:
subcc %o2, 7, %o2 ! adjust count
bgu,pt %ncc, .smalllong
lduw [%o1], %o3 ! read word
addcc %o2, 3, %o2 ! restore count
bz,pt %ncc, .smallexit
stw %o3, [%o0] ! write word
deccc %o2 ! reduce count for cc test
ldub [%o1+4], %o3 ! load one byte
bz,pt %ncc, .smallexit
stb %o3, [%o0+4] ! store one byte
ldub [%o1+5], %o3 ! load second byte
deccc %o2
bz,pt %ncc, .smallexit
stb %o3, [%o0+5] ! store second byte
ldub [%o1+6], %o3 ! load third byte
stb %o3, [%o0+6] ! store third byte
.smallexit:
retl
mov %g1, %o0 ! restore %o0
.align 16
.medium:
neg %o0, %o5
andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned
brz,pt %o5, .dst_aligned_on_8
! %o5 has the bytes to be written in partial store.
sub %o2, %o5, %o2
sub %o1, %o0, %o1 ! %o1 gets the difference
7: ! dst aligning loop
ldub [%o1+%o0], %o4 ! load one byte
subcc %o5, 1, %o5
stb %o4, [%o0]
bgu,pt %ncc, 7b
add %o0, 1, %o0 ! advance dst
add %o1, %o0, %o1 ! restore %o1
.dst_aligned_on_8:
andcc %o1, 7, %o5
brnz,pt %o5, .src_dst_unaligned_on_8
prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read
.src_dst_aligned_on_8:
! check if we are copying MED_MAX or more bytes
cmp %o2, MED_MAX ! limit to store buffer size
bgu,pt %ncc, .large_align8_copy
prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
/*
* Special case for handling when src and dest are both long word aligned
* and total data to move is less than MED_MAX bytes
*/
.medlong:
subcc %o2, 63, %o2 ! adjust length to allow cc test
ble,pt %ncc, .medl63 ! skip big loop if less than 64 bytes
.medl64:
prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read ! into the l2 cache
ldx [%o1], %o4 ! load
subcc %o2, 64, %o2 ! decrement length count
stx %o4, [%o0] ! and store
ldx [%o1+8], %o3 ! a block of 64 bytes
stx %o3, [%o0+8]
ldx [%o1+16], %o4
stx %o4, [%o0+16]
ldx [%o1+24], %o3
stx %o3, [%o0+24]
ldx [%o1+32], %o4 ! load
stx %o4, [%o0+32] ! and store
ldx [%o1+40], %o3 ! a block of 64 bytes
add %o1, 64, %o1 ! increase src ptr by 64
stx %o3, [%o0+40]
ldx [%o1-16], %o4
add %o0, 64, %o0 ! increase dst ptr by 64
stx %o4, [%o0-16]
ldx [%o1-8], %o3
bgu,pt %ncc, .medl64 ! repeat if at least 64 bytes left
stx %o3, [%o0-8]
.medl63:
addcc %o2, 32, %o2 ! adjust remaining count
ble,pt %ncc, .medl31 ! to skip if 31 or fewer bytes left
nop
ldx [%o1], %o4 ! load
sub %o2, 32, %o2 ! decrement length count
stx %o4, [%o0] ! and store
ldx [%o1+8], %o3 ! a block of 32 bytes
add %o1, 32, %o1 ! increase src ptr by 32
stx %o3, [%o0+8]
ldx [%o1-16], %o4
add %o0, 32, %o0 ! increase dst ptr by 32
stx %o4, [%o0-16]
ldx [%o1-8], %o3
stx %o3, [%o0-8]
.medl31:
addcc %o2, 16, %o2 ! adjust remaining count
ble,pt %ncc, .medl15 ! skip if 15 or fewer bytes left
nop !
ldx [%o1], %o4 ! load and store 16 bytes
add %o1, 16, %o1 ! increase src ptr by 16
stx %o4, [%o0] !
sub %o2, 16, %o2 ! decrease count by 16
ldx [%o1-8], %o3 !
add %o0, 16, %o0 ! increase dst ptr by 16
stx %o3, [%o0-8]
.medl15:
addcc %o2, 15, %o2 ! restore count
bz,pt %ncc, .smallexit ! exit if finished
cmp %o2, 8
blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left
tst %o2
ldx [%o1], %o4 ! load 8 bytes
add %o1, 8, %o1 ! increase src ptr by 8
add %o0, 8, %o0 ! increase dst ptr by 8
subcc %o2, 8, %o2 ! decrease count by 8
bnz,pt %ncc, .medw7
stx %o4, [%o0-8] ! and store 8 bytes
retl
mov %g1, %o0 ! restore %o0
.align 16
.src_dst_unaligned_on_8:
! DST is 8-byte aligned, src is not
2:
andcc %o1, 0x3, %o5 ! test word alignment
bnz,pt %ncc, .unalignsetup ! branch to skip if not word aligned
prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
/*
* Handle all cases where src and dest are aligned on word
* boundaries. Use unrolled loops for better performance.
* This option wins over the standard large data move when
* the source and destination are in cache, for short to
* medium data moves.
*/
cmp %o2, MED_WMAX ! limit to store buffer size
bge,pt %ncc, .unalignrejoin ! if at or above the limit, rejoin unaligned path
prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
subcc %o2, 31, %o2 ! adjust length to allow cc test
! for end of loop
ble,pt %ncc, .medw31 ! skip big loop if less than 32
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
.medw32:
ld [%o1], %o4 ! move a block of 32 bytes
stw %o4, [%o0]
ld [%o1+4], %o3
stw %o3, [%o0+4]
ld [%o1+8], %o4
stw %o4, [%o0+8]
ld [%o1+12], %o3
stw %o3, [%o0+12]
ld [%o1+16], %o4
subcc %o2, 32, %o2 ! decrement length count
stw %o4, [%o0+16]
ld [%o1+20], %o3
add %o1, 32, %o1 ! increase src ptr by 32
stw %o3, [%o0+20]
ld [%o1-8], %o4
add %o0, 32, %o0 ! increase dst ptr by 32
stw %o4, [%o0-8]
ld [%o1-4], %o3
bgu,pt %ncc, .medw32 ! repeat if at least 32 bytes left
stw %o3, [%o0-4]
.medw31:
addcc %o2, 31, %o2 ! restore count
bz,pt %ncc, .smallexit ! exit if finished
nop
cmp %o2, 16
blt,pt %ncc, .medw15
nop
ld [%o1], %o4 ! move a block of 16 bytes
subcc %o2, 16, %o2 ! decrement length count
stw %o4, [%o0]
ld [%o1+4], %o3
add %o1, 16, %o1 ! increase src ptr by 16
stw %o3, [%o0+4]
ld [%o1-8], %o4
add %o0, 16, %o0 ! increase dst ptr by 16
stw %o4, [%o0-8]
ld [%o1-4], %o3
stw %o3, [%o0-4]
.medw15:
bz,pt %ncc, .smallexit ! exit if finished
cmp %o2, 8
blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left
tst %o2
ld [%o1], %o4 ! load 4 bytes
subcc %o2, 8, %o2 ! decrease count by 8
stw %o4, [%o0] ! and store 4 bytes
add %o1, 8, %o1 ! increase src ptr by 8
ld [%o1-4], %o3 ! load 4 bytes
add %o0, 8, %o0 ! increase dst ptr by 8
stw %o3, [%o0-4] ! and store 4 bytes
bz,pt %ncc, .smallexit ! exit if finished
.medw7: ! count is ge 1, less than 8
cmp %o2, 4 ! check for 4 bytes left
blt,pt %ncc, .smallleft3 ! skip if 3 or fewer bytes left
nop !
ld [%o1], %o4 ! load 4 bytes
add %o1, 4, %o1 ! increase src ptr by 4
add %o0, 4, %o0 ! increase dst ptr by 4
subcc %o2, 4, %o2 ! decrease count by 4
bnz .smallleft3
stw %o4, [%o0-4] ! and store 4 bytes
retl
mov %g1, %o0 ! restore %o0
.align 16
.large_align8_copy: ! Src and dst share 8 byte alignment
rd %fprs, %g5 ! check for unused fp
! if fprs.fef == 0, set it.
! Setting it when already set costs more than checking
andcc %g5, FPRS_FEF, %g5 ! test FEF, fprs.du = fprs.dl = 0
bz,a %ncc, 1f
wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
1:
! align dst to 64 byte boundary
andcc %o0, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned
brz,pn %o3, .aligned_to_64
andcc %o0, 8, %o3 ! odd long words to move?
brz,pt %o3, .aligned_to_16
nop
ldx [%o1], %o4
sub %o2, 8, %o2
add %o1, 8, %o1 ! increment src ptr
add %o0, 8, %o0 ! increment dst ptr
stx %o4, [%o0-8]
.aligned_to_16:
andcc %o0, 16, %o3 ! pair of long words to move?
brz,pt %o3, .aligned_to_32
nop
ldx [%o1], %o4
sub %o2, 16, %o2
stx %o4, [%o0]
add %o1, 16, %o1 ! increment src ptr
ldx [%o1-8], %o4
add %o0, 16, %o0 ! increment dst ptr
stx %o4, [%o0-8]
.aligned_to_32:
andcc %o0, 32, %o3 ! four long words to move?
brz,pt %o3, .aligned_to_64
nop
ldx [%o1], %o4
sub %o2, 32, %o2
stx %o4, [%o0]
ldx [%o1+8], %o4
stx %o4, [%o0+8]
ldx [%o1+16], %o4
stx %o4, [%o0+16]
add %o1, 32, %o1 ! increment src ptr
ldx [%o1-8], %o4
add %o0, 32, %o0 ! increment dst ptr
stx %o4, [%o0-8]
.aligned_to_64:
prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
mov %asi,%o4 ! save %asi
! Determine source alignment
! to correct 8 byte offset
andcc %o1, 0x20, %o3
brnz,pn %o3, .align_1
mov ASI_BLK_P, %asi ! setup %asi for block load/store
andcc %o1, 0x10, %o3
brnz,pn %o3, .align_01
nop
andcc %o1, 0x08, %o3
brz,pn %o3, .align_000
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
ba .align_001
nop
.align_01:
andcc %o1, 0x08, %o3
brnz,pn %o3, .align_011
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
ba .align_010
nop
.align_1:
andcc %o1, 0x10, %o3
brnz,pn %o3, .align_11
nop
andcc %o1, 0x08, %o3
brnz,pn %o3, .align_101
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
ba .align_100
nop
.align_11:
andcc %o1, 0x08, %o3
brz,pn %o3, .align_110
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
.align_111:
! Alignment off by 8 bytes
ldd [%o1], %d0
add %o1, 8, %o1
sub %o2, 8, %o2
andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
and %o2, 0x7f, %o2 ! residue bytes in %o2
.align_111_loop:
subcc %o5, 128, %o5
/* ---- copy line 1 of 2. ---- */
ldda [%o1]%asi,%d16 ! block load
fmovd %d16, %d2
fmovd %d18, %d4
fmovd %d20, %d6
fmovd %d22, %d8
fmovd %d24, %d10
fmovd %d26, %d12
fmovd %d28, %d14
stxa %g0,[%o0]ASI_STBI_P ! block initializing store
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
fmovd %d30, %d0
/* ---- copy line 2 of 2. ---- */
ldda [%o1+64]%asi,%d16
fmovd %d16, %d2
fmovd %d18, %d4
fmovd %d20, %d6
fmovd %d22, %d8
fmovd %d24, %d10
fmovd %d26, %d12
fmovd %d28, %d14
add %o1, 128, %o1 ! increment src
stxa %g0,[%o0]ASI_STBI_P ! block initializing store
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d30, %d0
bgt,pt %ncc, .align_111_loop
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
std %d0, [%o0]
ba .remain_stuff
add %o0, 8, %o0
! END OF align_111
.align_110:
! Alignment off by 16 bytes
ldd [%o1], %d0
ldd [%o1+8], %d2
add %o1, 16, %o1
sub %o2, 16, %o2
andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
and %o2, 0x7f, %o2 ! residue bytes in %o2
.align_110_loop:
subcc %o5, 128, %o5
/* ---- copy line 1 of 2. ---- */
ldda [%o1]%asi,%d16 ! block load
fmovd %d16, %d4
fmovd %d18, %d6
fmovd %d20, %d8
fmovd %d22, %d10
fmovd %d24, %d12
fmovd %d26, %d14
stxa %g0,[%o0]ASI_STBI_P ! block initializing store
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d28, %d0
prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
fmovd %d30, %d2
/* ---- copy line 2 of 2. ---- */
ldda [%o1+64]%asi,%d16
fmovd %d16, %d4
fmovd %d18, %d6
fmovd %d20, %d8
fmovd %d22, %d10
fmovd %d24, %d12
fmovd %d26, %d14
add %o1, 128, %o1 ! increment src
stxa %g0,[%o0]ASI_STBI_P ! block initializing store
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d28, %d0
fmovd %d30, %d2
bgt,pt %ncc, .align_110_loop
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
std %d0, [%o0]
std %d2, [%o0+8]
ba .remain_stuff
add %o0, 16, %o0
! END OF align_110
.align_101:
! Alignment off by 24 bytes
ldd [%o1], %d0
ldd [%o1+8], %d2
ldd [%o1+16], %d4
add %o1, 24, %o1
sub %o2, 24, %o2
andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
and %o2, 0x7f, %o2 ! residue bytes in %o2
.align_101_loop:
subcc %o5, 128, %o5
/* ---- copy line 1 of 2. ---- */
ldda [%o1]%asi,%d16 ! block load
fmovd %d16, %d6
fmovd %d18, %d8
fmovd %d20, %d10
fmovd %d22, %d12
fmovd %d24, %d14
stxa %g0,[%o0]ASI_STBI_P ! block initializing store
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d26, %d0
fmovd %d28, %d2
prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
fmovd %d30, %d4
/* ---- copy line 2 of 2. ---- */
ldda [%o1+64]%asi,%d16
fmovd %d16, %d6
fmovd %d18, %d8
fmovd %d20, %d10
fmovd %d22, %d12
fmovd %d24, %d14
add %o1, 128, %o1 ! increment src
stxa %g0,[%o0]ASI_STBI_P ! block initializing store
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d26, %d0
fmovd %d28, %d2
fmovd %d30, %d4
bgt,pt %ncc, .align_101_loop
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
std %d0, [%o0]
std %d2, [%o0+8]
std %d4, [%o0+16]
ba .remain_stuff
add %o0, 24, %o0
! END OF align_101
.align_100:
! Alignment off by 32 bytes
ldd [%o1], %d0
ldd [%o1+8], %d2
ldd [%o1+16],%d4
ldd [%o1+24],%d6
add %o1, 32, %o1
sub %o2, 32, %o2
andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
and %o2, 0x7f, %o2 ! residue bytes in %o2
.align_100_loop:
subcc %o5, 128, %o5
/* ---- copy line 1 of 2. ---- */
ldda [%o1]%asi,%d16 ! block load
fmovd %d16, %d8
fmovd %d18, %d10
fmovd %d20, %d12
fmovd %d22, %d14
stxa %g0,[%o0]ASI_STBI_P ! block initializing store
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d24, %d0
fmovd %d26, %d2
fmovd %d28, %d4
prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
fmovd %d30, %d6
/* ---- copy line 2 of 2. ---- */
ldda [%o1+64]%asi,%d16
fmovd %d16, %d8
fmovd %d18, %d10
fmovd %d20, %d12
fmovd %d22, %d14
add %o1, 128, %o1 ! increment src
stxa %g0,[%o0]ASI_STBI_P ! block initializing store
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d24, %d0
fmovd %d26, %d2
fmovd %d28, %d4
fmovd %d30, %d6
bgt,pt %ncc, .align_100_loop
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
std %d0, [%o0]
std %d2, [%o0+8]
std %d4, [%o0+16]
std %d6, [%o0+24]
ba .remain_stuff
add %o0, 32, %o0
! END OF align_100
.align_011:
! Alignment off by 40 bytes
ldd [%o1], %d0
ldd [%o1+8], %d2
ldd [%o1+16], %d4
ldd [%o1+24], %d6
ldd [%o1+32], %d8
add %o1, 40, %o1
sub %o2, 40, %o2
andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
and %o2, 0x7f, %o2 ! residue bytes in %o2
.align_011_loop:
subcc %o5, 128, %o5
/* ---- copy line 1 of 2. ---- */
ldda [%o1]%asi,%d16 ! block load
fmovd %d16, %d10
fmovd %d18, %d12
fmovd %d20, %d14
stxa %g0,[%o0]ASI_STBI_P ! block initializing store
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d22, %d0
fmovd %d24, %d2
fmovd %d26, %d4
fmovd %d28, %d6
prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
fmovd %d30, %d8
/* ---- copy line 2 of 2. ---- */
ldda [%o1+64]%asi,%d16
fmovd %d16, %d10
fmovd %d18, %d12
fmovd %d20, %d14
add %o1, 128, %o1 ! increment src
stxa %g0,[%o0]ASI_STBI_P ! block initializing store
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d22, %d0
fmovd %d24, %d2
fmovd %d26, %d4
fmovd %d28, %d6
fmovd %d30, %d8
bgt,pt %ncc, .align_011_loop
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
std %d0, [%o0]
std %d2, [%o0+8]
std %d4, [%o0+16]
std %d6, [%o0+24]
std %d8, [%o0+32]
ba .remain_stuff
add %o0, 40, %o0
! END OF align_011
.align_010:
! Alignment off by 48 bytes
ldd [%o1], %d0
ldd [%o1+8], %d2
ldd [%o1+16], %d4
ldd [%o1+24], %d6
ldd [%o1+32], %d8
ldd [%o1+40], %d10
add %o1, 48, %o1
sub %o2, 48, %o2
andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
and %o2, 0x7f, %o2 ! residue bytes in %o2
.align_010_loop:
subcc %o5, 128, %o5
/* ---- copy line 1 of 2. ---- */
ldda [%o1]%asi,%d16 ! block load
fmovd %d16, %d12
fmovd %d18, %d14
stxa %g0,[%o0]ASI_STBI_P ! block initializing store
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d20, %d0
fmovd %d22, %d2
fmovd %d24, %d4
fmovd %d26, %d6
fmovd %d28, %d8
prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
fmovd %d30, %d10
/* ---- copy line 2 of 2. ---- */
ldda [%o1+64]%asi,%d16
fmovd %d16, %d12
fmovd %d18, %d14
add %o1, 128, %o1 ! increment src
stxa %g0,[%o0]ASI_STBI_P ! block initializing store
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d20, %d0
fmovd %d22, %d2
fmovd %d24, %d4
fmovd %d26, %d6
fmovd %d28, %d8
fmovd %d30, %d10
bgt,pt %ncc, .align_010_loop
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
std %d0, [%o0]
std %d2, [%o0+8]
std %d4, [%o0+16]
std %d6, [%o0+24]
std %d8, [%o0+32]
std %d10, [%o0+40]
ba .remain_stuff
add %o0, 48, %o0
! END OF align_010
.align_001:
! Alignment off by 56 bytes
ldd [%o1], %d0
ldd [%o1+8], %d2
ldd [%o1+16], %d4
ldd [%o1+24], %d6
ldd [%o1+32], %d8
ldd [%o1+40], %d10
ldd [%o1+48], %d12
add %o1, 56, %o1
sub %o2, 56, %o2
andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
and %o2, 0x7f, %o2 ! residue bytes in %o2
.align_001_loop:
subcc %o5, 128, %o5
/* ---- copy line 1 of 2. ---- */
ldda [%o1]%asi,%d16 ! block load
fmovd %d16, %d14
stxa %g0,[%o0]ASI_STBI_P ! block initializing store
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d18, %d0
fmovd %d20, %d2
fmovd %d22, %d4
fmovd %d24, %d6
fmovd %d26, %d8
fmovd %d28, %d10
prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
fmovd %d30, %d12
/* ---- copy line 2 of 2. ---- */
ldda [%o1+64]%asi,%d16
fmovd %d16, %d14
add %o1, 128, %o1 ! increment src
stxa %g0,[%o0]ASI_STBI_P ! block initializing store
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! advance dst
fmovd %d18, %d0
fmovd %d20, %d2
fmovd %d22, %d4
fmovd %d24, %d6
fmovd %d26, %d8
fmovd %d28, %d10
fmovd %d30, %d12
bgt,pt %ncc, .align_001_loop
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
std %d0, [%o0]
std %d2, [%o0+8]
std %d4, [%o0+16]
std %d6, [%o0+24]
std %d8, [%o0+32]
std %d10, [%o0+40]
std %d12, [%o0+48]
ba .remain_stuff
add %o0, 56, %o0
! END OF align_001
.align_000:
andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size
and %o2, 0x7f, %o2 ! residue bytes in %o2
.align_000_loop:
/* ---- copy line 1 of 2. ---- */
subcc %o5, 128, %o5
ldda [%o1]%asi,%d0
stxa %g0,[%o0]ASI_STBI_P ! block initializing store
stda %d0,[%o0]%asi
prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
/* ---- copy line 2 of 2. ---- */
add %o0, 64, %o0
ldda [%o1+64]%asi,%d0
add %o1, 128, %o1 ! increment src
stxa %g0,[%o0]ASI_STBI_P ! block initializing store
stda %d0,[%o0]%asi
add %o0, 64, %o0 ! increment dst
bgt,pt %ncc, .align_000_loop
prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
! END OF align_000
.remain_stuff:
mov %o4, %asi ! restore %asi
brnz %g5, .medlong
membar #Sync
ba .medlong
wr %g5, %g0, %fprs
.align 16
! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
.unalignsetup:
prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
.unalignrejoin:
rd %fprs, %g5 ! check for unused fp
! if fprs.fef == 0, set it.
! Setting it when already set costs more than checking
andcc %g5, FPRS_FEF, %g5 ! test FEF, fprs.du = fprs.dl = 0
bz,a %ncc, 1f
wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
1:
cmp %o2, MED_UMAX ! check for medium unaligned limit
bge,pt %ncc,.unalign_large
nop
andn %o2, 0x3f, %o5 ! %o5 is multiple of block size
and %o2, 0x3f, %o2 ! residue bytes in %o2
cmp %o2, 8 ! Ensure we don't load beyond
bgt .unalign_adjust ! end of source buffer
andn %o1, 0x7, %o4 ! %o4 has long word aligned src address
add %o2, 64, %o2 ! adjust to leave loop
sub %o5, 64, %o5 ! early if necessary
.unalign_adjust:
alignaddr %o1, %g0, %g0 ! generate %gsr
add %o1, %o5, %o1 ! advance %o1 to after blocks
ldd [%o4], %d0
.unalign_loop:
ldd [%o4+8], %d2
faligndata %d0, %d2, %d16
ldd [%o4+16], %d4
std %d16, [%o0]
faligndata %d2, %d4, %d18
ldd [%o4+24], %d6
std %d18, [%o0+8]
faligndata %d4, %d6, %d20
ldd [%o4+32], %d8
std %d20, [%o0+16]
faligndata %d6, %d8, %d22
ldd [%o4+40], %d10
std %d22, [%o0+24]
faligndata %d8, %d10, %d24
ldd [%o4+48], %d12
std %d24, [%o0+32]
faligndata %d10, %d12, %d26
ldd [%o4+56], %d14
std %d26, [%o0+40]
faligndata %d12, %d14, %d28
ldd [%o4+64], %d0
std %d28, [%o0+48]
faligndata %d14, %d0, %d30
add %o4, BLOCK_SIZE, %o4
std %d30, [%o0+56]
add %o0, BLOCK_SIZE, %o0
subcc %o5, BLOCK_SIZE, %o5
bgu,pt %ncc, .unalign_loop
prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
ba .unalign_done
nop
.unalign_large:
andcc %o0, 0x3f, %o3 ! is dst 64-byte block aligned?
bz %ncc, .unalignsrc
sub %o3, 64, %o3 ! %o3 will be multiple of 8
neg %o3 ! bytes until dest is 64 byte aligned
sub %o2, %o3, %o2 ! update cnt with bytes to be moved
! Move bytes according to source alignment
andcc %o1, 0x1, %o5
bnz %ncc, .unalignbyte ! check for byte alignment
nop
andcc %o1, 2, %o5 ! check for half word alignment
bnz %ncc, .unalignhalf
nop
! Src is word aligned
.unalignword:
ld [%o1], %o4 ! load 4 bytes
stw %o4, [%o0] ! and store 4 bytes
ld [%o1+4], %o4 ! load 4 bytes
add %o1, 8, %o1 ! increase src ptr by 8
stw %o4, [%o0+4] ! and store 4 bytes
subcc %o3, 8, %o3 ! decrease count by 8
bnz %ncc, .unalignword
add %o0, 8, %o0 ! increase dst ptr by 8
ba .unalignsrc
nop
! Src is half-word aligned
.unalignhalf:
lduh [%o1], %o4 ! load 2 bytes
sllx %o4, 32, %o5 ! shift left
lduw [%o1+2], %o4
or %o4, %o5, %o5
sllx %o5, 16, %o5
lduh [%o1+6], %o4
or %o4, %o5, %o5
stx %o5, [%o0]
add %o1, 8, %o1
subcc %o3, 8, %o3
bnz %ncc, .unalignhalf
add %o0, 8, %o0
ba .unalignsrc
nop
! Src is Byte aligned
.unalignbyte:
sub %o0, %o1, %o0 ! share pointer advance
.unalignbyte_loop:
ldub [%o1], %o4
sllx %o4, 56, %o5
lduh [%o1+1], %o4
sllx %o4, 40, %o4
or %o4, %o5, %o5
lduh [%o1+3], %o4
sllx %o4, 24, %o4
or %o4, %o5, %o5
lduh [%o1+5], %o4
sllx %o4, 8, %o4
or %o4, %o5, %o5
ldub [%o1+7], %o4
or %o4, %o5, %o5
stx %o5, [%o0+%o1]
subcc %o3, 8, %o3
bnz %ncc, .unalignbyte_loop
add %o1, 8, %o1
add %o0,%o1, %o0 ! restore pointer
! Destination is now block (64 byte) aligned
.unalignsrc:
andn %o2, 0x3f, %o5 ! %o5 is multiple of block size
and %o2, 0x3f, %o2 ! residue bytes in %o2
add %o2, 64, %o2 ! Ensure we don't load beyond
sub %o5, 64, %o5 ! end of source buffer
andn %o1, 0x3f, %o4 ! %o4 has block aligned src address
prefetch [%o4 + (3 * BLOCK_SIZE)], #one_read
alignaddr %o1, %g0, %g0 ! generate %gsr
add %o1, %o5, %o1 ! advance %o1 to after blocks
!
! Determine source alignment to correct 8 byte offset
andcc %o1, 0x20, %o3
brnz,pn %o3, .unalign_1
nop
andcc %o1, 0x10, %o3
brnz,pn %o3, .unalign_01
nop
andcc %o1, 0x08, %o3
brz,a %o3, .unalign_000
prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
ba .unalign_001
prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
.unalign_01:
andcc %o1, 0x08, %o3
brnz,a %o3, .unalign_011
prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
ba .unalign_010
prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
.unalign_1:
andcc %o1, 0x10, %o3
brnz,pn %o3, .unalign_11
nop
andcc %o1, 0x08, %o3
brnz,a %o3, .unalign_101
prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
ba .unalign_100
prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
.unalign_11:
andcc %o1, 0x08, %o3
brz,pn %o3, .unalign_110
prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
.unalign_111:
ldd [%o4+56], %d14
.unalign_111_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d14, %d16, %d48
faligndata %d16, %d18, %d50
faligndata %d18, %d20, %d52
faligndata %d20, %d22, %d54
faligndata %d22, %d24, %d56
faligndata %d24, %d26, %d58
faligndata %d26, %d28, %d60
faligndata %d28, %d30, %d62
fmovd %d30, %d14
stda %d48, [%o0]ASI_BLK_P
subcc %o5, 64, %o5
add %o0, 64, %o0
bgu,pt %ncc, .unalign_111_loop
prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
ba .unalign_done
membar #Sync
.unalign_110:
ldd [%o4+48], %d12
ldd [%o4+56], %d14
.unalign_110_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d12, %d14, %d48
faligndata %d14, %d16, %d50
faligndata %d16, %d18, %d52
faligndata %d18, %d20, %d54
faligndata %d20, %d22, %d56
faligndata %d22, %d24, %d58
faligndata %d24, %d26, %d60
faligndata %d26, %d28, %d62
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%o0]ASI_BLK_P
subcc %o5, 64, %o5
add %o0, 64, %o0
bgu,pt %ncc, .unalign_110_loop
prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
ba .unalign_done
membar #Sync
.unalign_101:
ldd [%o4+40], %d10
ldd [%o4+48], %d12
ldd [%o4+56], %d14
.unalign_101_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d10, %d12, %d48
faligndata %d12, %d14, %d50
faligndata %d14, %d16, %d52
faligndata %d16, %d18, %d54
faligndata %d18, %d20, %d56
faligndata %d20, %d22, %d58
faligndata %d22, %d24, %d60
faligndata %d24, %d26, %d62
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%o0]ASI_BLK_P
subcc %o5, 64, %o5
add %o0, 64, %o0
bgu,pt %ncc, .unalign_101_loop
prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
ba .unalign_done
membar #Sync
.unalign_100:
ldd [%o4+32], %d8
ldd [%o4+40], %d10
ldd [%o4+48], %d12
ldd [%o4+56], %d14
.unalign_100_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d8, %d10, %d48
faligndata %d10, %d12, %d50
faligndata %d12, %d14, %d52
faligndata %d14, %d16, %d54
faligndata %d16, %d18, %d56
faligndata %d18, %d20, %d58
faligndata %d20, %d22, %d60
faligndata %d22, %d24, %d62
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%o0]ASI_BLK_P
subcc %o5, 64, %o5
add %o0, 64, %o0
bgu,pt %ncc, .unalign_100_loop
prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
ba .unalign_done
membar #Sync
.unalign_011:
ldd [%o4+24], %d6
ldd [%o4+32], %d8
ldd [%o4+40], %d10
ldd [%o4+48], %d12
ldd [%o4+56], %d14
.unalign_011_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d6, %d8, %d48
faligndata %d8, %d10, %d50
faligndata %d10, %d12, %d52
faligndata %d12, %d14, %d54
faligndata %d14, %d16, %d56
faligndata %d16, %d18, %d58
faligndata %d18, %d20, %d60
faligndata %d20, %d22, %d62
fmovd %d22, %d6
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%o0]ASI_BLK_P
subcc %o5, 64, %o5
add %o0, 64, %o0
bgu,pt %ncc, .unalign_011_loop
prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
ba .unalign_done
membar #Sync
.unalign_010:
ldd [%o4+16], %d4
ldd [%o4+24], %d6
ldd [%o4+32], %d8
ldd [%o4+40], %d10
ldd [%o4+48], %d12
ldd [%o4+56], %d14
.unalign_010_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d4, %d6, %d48
faligndata %d6, %d8, %d50
faligndata %d8, %d10, %d52
faligndata %d10, %d12, %d54
faligndata %d12, %d14, %d56
faligndata %d14, %d16, %d58
faligndata %d16, %d18, %d60
faligndata %d18, %d20, %d62
fmovd %d20, %d4
fmovd %d22, %d6
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%o0]ASI_BLK_P
subcc %o5, 64, %o5
add %o0, 64, %o0
bgu,pt %ncc, .unalign_010_loop
prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
ba .unalign_done
membar #Sync
.unalign_001:
ldd [%o4+8], %d2
ldd [%o4+16], %d4
ldd [%o4+24], %d6
ldd [%o4+32], %d8
ldd [%o4+40], %d10
ldd [%o4+48], %d12
ldd [%o4+56], %d14
.unalign_001_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d2, %d4, %d48
faligndata %d4, %d6, %d50
faligndata %d6, %d8, %d52
faligndata %d8, %d10, %d54
faligndata %d10, %d12, %d56
faligndata %d12, %d14, %d58
faligndata %d14, %d16, %d60
faligndata %d16, %d18, %d62
fmovd %d18, %d2
fmovd %d20, %d4
fmovd %d22, %d6
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%o0]ASI_BLK_P
subcc %o5, 64, %o5
add %o0, 64, %o0
bgu,pt %ncc, .unalign_001_loop
prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
ba .unalign_done
membar #Sync
.unalign_000:
ldda [%o4]ASI_BLK_P, %d0
.unalign_000_loop:
add %o4, 64, %o4
ldda [%o4]ASI_BLK_P, %d16
faligndata %d0, %d2, %d48
faligndata %d2, %d4, %d50
faligndata %d4, %d6, %d52
faligndata %d6, %d8, %d54
faligndata %d8, %d10, %d56
faligndata %d10, %d12, %d58
faligndata %d12, %d14, %d60
faligndata %d14, %d16, %d62
fmovd %d16, %d0
fmovd %d18, %d2
fmovd %d20, %d4
fmovd %d22, %d6
fmovd %d24, %d8
fmovd %d26, %d10
fmovd %d28, %d12
fmovd %d30, %d14
stda %d48, [%o0]ASI_BLK_P
subcc %o5, 64, %o5
add %o0, 64, %o0
bgu,pt %ncc, .unalign_000_loop
prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
membar #Sync
.unalign_done:
! Handle trailing bytes, 64 to 127
! Dest long word aligned, Src not long word aligned
cmp %o2, 15
bleu %ncc, .unalign_short
andn %o2, 0x7, %o5 ! %o5 is multiple of 8
and %o2, 0x7, %o2 ! residue bytes in %o2
add %o2, 8, %o2
sub %o5, 8, %o5 ! ensure we don't load past end of src
andn %o1, 0x7, %o4 ! %o4 has long word aligned src address
add %o1, %o5, %o1 ! advance %o1 to after multiple of 8
ldd [%o4], %d0 ! fetch partial word
.unalign_by8:
ldd [%o4+8], %d2
add %o4, 8, %o4
faligndata %d0, %d2, %d16
subcc %o5, 8, %o5
std %d16, [%o0]
fmovd %d2, %d0
bgu,pt %ncc, .unalign_by8
add %o0, 8, %o0
.unalign_short:
brnz %g5, .smallrest
nop
ba .smallrest
wr %g5, %g0, %fprs
#else /* NIAGARA2_IMPL */
.forcpy:
mov %o0, %g5 ! save dst address for return val
cmp %o2, 17 ! for small counts copy bytes
bleu,pt %ncc, .dbytecp
nop
cmp %o2, 0x80 ! for lengths of 128 bytes or less, do not
bleu,pn %ncc, .no_blkcpy ! copy using ASI_BLK_INIT_ST_QUAD_LDD_P
/*
* Make sure that the source and destination buffers are at least 64
* bytes apart, or that the source is below the destination. If not,
* do not use the ASI_BLK_INIT_ST_QUAD_LDD_P asi to copy the data.
*/
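/*
 * Illustration only: a rough C sketch of the test performed by the
 * subcc/cmp/branch sequence below. The function name
 * can_use_block_init() is hypothetical.
 *
 *	int
 *	can_use_block_init(const void *src, const void *dst)
 *	{
 *		uintptr_t s = (uintptr_t)src, d = (uintptr_t)dst;
 *
 *		return (s < d || s - d >= 0x40);
 *	}
 */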
subcc %o1, %o0, %o3
blu %ncc, .blkalgndst
cmp %o3, 0x40 ! if src - dst >= 0x40
bgeu,pt %ncc, .blkalgndst ! then use ASI_BLK_INIT_ST_QUAD_LDD_P
.no_blkcpy:
andcc %o1, 3, %o5 ! is src word aligned
bz,pn %ncc, .aldst
cmp %o5, 2 ! is src half-word aligned
be,pt %ncc, .s2algn
cmp %o5, 3 ! src is byte aligned
.s1algn:ldub [%o1], %o3 ! move 1 or 3 bytes to align it
inc 1, %o1
stb %o3, [%g5] ! move a byte to align src
inc 1, %g5
bne,pt %ncc, .s2algn
dec %o2
b .ald ! now go align dest
andcc %g5, 3, %o5
.s2algn:lduh [%o1], %o3 ! know src is 2 byte aligned
inc 2, %o1
srl %o3, 8, %o4
stb %o4, [%g5] ! have to do bytes,
stb %o3, [%g5 + 1] ! don't know dst alignment
inc 2, %g5
dec 2, %o2
.aldst: andcc %g5, 3, %o5 ! align the destination address
.ald: bz,pn %ncc, .w4cp
cmp %o5, 2
bz,pn %ncc, .w2cp
cmp %o5, 3
.w3cp: lduw [%o1], %o4
inc 4, %o1
srl %o4, 24, %o5
stb %o5, [%g5]
bne,pt %ncc, .w1cp
inc %g5
dec 1, %o2
andn %o2, 3, %o3 ! o3 is aligned word count
dec 4, %o3 ! avoid reading beyond tail of src
sub %o1, %g5, %o1 ! o1 gets the difference
1: sll %o4, 8, %g1 ! save residual bytes
lduw [%o1+%g5], %o4
deccc 4, %o3
srl %o4, 24, %o5 ! merge with residual
or %o5, %g1, %g1
st %g1, [%g5]
bnz,pt %ncc, 1b
inc 4, %g5
sub %o1, 3, %o1 ! used one byte of last word read
and %o2, 3, %o2
b 7f
inc 4, %o2
.w1cp: srl %o4, 8, %o5
sth %o5, [%g5]
inc 2, %g5
dec 3, %o2
andn %o2, 3, %o3 ! o3 is aligned word count
dec 4, %o3 ! avoid reading beyond tail of src
sub %o1, %g5, %o1 ! o1 gets the difference
2: sll %o4, 24, %g1 ! save residual bytes
lduw [%o1+%g5], %o4
deccc 4, %o3
srl %o4, 8, %o5 ! merge with residual
or %o5, %g1, %g1
st %g1, [%g5]
bnz,pt %ncc, 2b
inc 4, %g5
sub %o1, 1, %o1 ! used three bytes of last word read
and %o2, 3, %o2
b 7f
inc 4, %o2
.w2cp: lduw [%o1], %o4
inc 4, %o1
srl %o4, 16, %o5
sth %o5, [%g5]
inc 2, %g5
dec 2, %o2
andn %o2, 3, %o3 ! o3 is aligned word count
dec 4, %o3 ! avoid reading beyond tail of src
sub %o1, %g5, %o1 ! o1 gets the difference
3: sll %o4, 16, %g1 ! save residual bytes
lduw [%o1+%g5], %o4
deccc 4, %o3
srl %o4, 16, %o5 ! merge with residual
or %o5, %g1, %g1
st %g1, [%g5]
bnz,pt %ncc, 3b
inc 4, %g5
sub %o1, 2, %o1 ! used two bytes of last word read
and %o2, 3, %o2
b 7f
inc 4, %o2
.w4cp: andn %o2, 3, %o3 ! o3 is aligned word count
sub %o1, %g5, %o1 ! o1 gets the difference
1: lduw [%o1+%g5], %o4 ! read from address
deccc 4, %o3 ! decrement count
st %o4, [%g5] ! write at destination address
bgu,pt %ncc, 1b
inc 4, %g5 ! increment to address
b 7f
and %o2, 3, %o2 ! number of leftover bytes, if any
!
! differenced byte copy, works with any alignment
!
.dbytecp:
b 7f
sub %o1, %g5, %o1 ! o1 gets the difference
4: stb %o4, [%g5] ! write to address
inc %g5 ! inc to address
7: deccc %o2 ! decrement count
bgeu,a,pt %ncc,4b ! loop till done
ldub [%o1+%g5], %o4 ! read from address
retl ! %o0 was preserved
nop
.blkalgndst:
save %sp, -SA(MINFRAME), %sp
! Block (64 bytes) align the destination.
andcc %i0, 0x3f, %i3 ! is dst block aligned
bz %ncc, .chksrc ! dst already block aligned
sub %i3, 0x40, %i3
neg %i3 ! bytes till dst 64 bytes aligned
sub %i2, %i3, %i2 ! update i2 with new count
! Based on source and destination alignment do
! either 8 bytes, 4 bytes, 2 bytes or byte copy.
! Is dst & src 8B aligned
or %i0, %i1, %o2
andcc %o2, 0x7, %g0
bz %ncc, .alewdcp
nop
! Is dst & src 4B aligned
andcc %o2, 0x3, %g0
bz %ncc, .alwdcp
nop
! Is dst & src 2B aligned
andcc %o2, 0x1, %g0
bz %ncc, .alhlfwdcp
nop
! 1B aligned
1: ldub [%i1], %o2
stb %o2, [%i0]
inc %i1
deccc %i3
bgu,pt %ncc, 1b
inc %i0
ba .chksrc
nop
! dst & src 4B aligned
.alwdcp:
ld [%i1], %o2
st %o2, [%i0]
add %i1, 0x4, %i1
subcc %i3, 0x4, %i3
bgu,pt %ncc, .alwdcp
add %i0, 0x4, %i0
ba .chksrc
nop
! dst & src 2B aligned
.alhlfwdcp:
lduh [%i1], %o2
stuh %o2, [%i0]
add %i1, 0x2, %i1
subcc %i3, 0x2, %i3
bgu,pt %ncc, .alhlfwdcp
add %i0, 0x2, %i0
ba .chksrc
nop
! dst & src 8B aligned
.alewdcp:
ldx [%i1], %o2
stx %o2, [%i0]
add %i1, 0x8, %i1
subcc %i3, 0x8, %i3
bgu,pt %ncc, .alewdcp
add %i0, 0x8, %i0
! Now Destination is block (64 bytes) aligned
.chksrc:
andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
sub %i2, %i3, %i2 ! Residue bytes in %i2
mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
andcc %i1, 0xf, %l1 ! is src quadword aligned
bz,pn %ncc, .blkcpy ! src offset in %l1
nop
cmp %l1, 0x8
bgu %ncc, .cpy_upper_double
nop
blu %ncc, .cpy_lower_double
nop
! Falls through when source offset is equal to 8 i.e.
! source is double word aligned.
! In this case no shift/merge of data is required
sub %i1, %l1, %i1 ! align the src at 16 bytes.
andn %i1, 0x3f, %o0 ! %o0 has block aligned source
prefetch [%o0+0x0], #one_read
ldda [%i1+0x0]%asi, %o2
loop0:
ldda [%i1+0x10]%asi, %o4
prefetch [%o0+0x40], #one_read
stxa %o3, [%i0+0x0]%asi
stxa %o4, [%i0+0x8]%asi
ldda [%i1+0x20]%asi, %o2
stxa %o5, [%i0+0x10]%asi
stxa %o2, [%i0+0x18]%asi
ldda [%i1+0x30]%asi, %o4
stxa %o3, [%i0+0x20]%asi
stxa %o4, [%i0+0x28]%asi
ldda [%i1+0x40]%asi, %o2
stxa %o5, [%i0+0x30]%asi
stxa %o2, [%i0+0x38]%asi
add %o0, 0x40, %o0
add %i1, 0x40, %i1
subcc %i3, 0x40, %i3
bgu,pt %ncc, loop0
add %i0, 0x40, %i0
ba .blkdone
add %i1, %l1, %i1 ! increment the source by src offset
.cpy_lower_double:
sub %i1, %l1, %i1 ! align the src at 16 bytes.
sll %l1, 3, %l2 ! %l2 left shift
mov 0x40, %l3
sub %l3, %l2, %l3 ! %l3 right shift = (64 - left shift)
andn %i1, 0x3f, %o0 ! %o0 has block aligned source
prefetch [%o0+0x0], #one_read
ldda [%i1+0x0]%asi, %o2 ! partial data in %o2 and %o3 has
! complete data
loop1:
ldda [%i1+0x10]%asi, %o4 ! %o4 has partial data for this read.
ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1) ! merge %o2, %o3 and %o4
! into %o2 and %o3
prefetch [%o0+0x40], #one_read
stxa %o2, [%i0+0x0]%asi
stxa %o3, [%i0+0x8]%asi
ldda [%i1+0x20]%asi, %o2
ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1) ! merge %o2 with %o5 and
stxa %o4, [%i0+0x10]%asi ! %o4 from previous read
stxa %o5, [%i0+0x18]%asi ! into %o4 and %o5
! Repeat the same for next 32 bytes.
ldda [%i1+0x30]%asi, %o4
ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1)
stxa %o2, [%i0+0x20]%asi
stxa %o3, [%i0+0x28]%asi
ldda [%i1+0x40]%asi, %o2
ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1)
stxa %o4, [%i0+0x30]%asi
stxa %o5, [%i0+0x38]%asi
add %o0, 0x40, %o0
add %i1, 0x40, %i1
subcc %i3, 0x40, %i3
bgu,pt %ncc, loop1
add %i0, 0x40, %i0
ba .blkdone
add %i1, %l1, %i1 ! increment the source by src offset
.cpy_upper_double:
sub %i1, %l1, %i1 ! align the src at 16 bytes.
mov 0x8, %l2
sub %l1, %l2, %l2
sll %l2, 3, %l2 ! %l2 left shift
mov 0x40, %l3
sub %l3, %l2, %l3 ! %l3 right shift = (64 - left shift)
andn %i1, 0x3f, %o0 ! %o0 has block aligned source
prefetch [%o0+0x0], #one_read
ldda [%i1+0x0]%asi, %o2 ! partial data in %o3 for this read and
! no data in %o2
loop2:
ldda [%i1+0x10]%asi, %o4 ! %o4 has complete data and %o5 has
! partial
ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1) ! merge %o3, %o4 and %o5
! into %o3 and %o4
prefetch [%o0+0x40], #one_read
stxa %o3, [%i0+0x0]%asi
stxa %o4, [%i0+0x8]%asi
ldda [%i1+0x20]%asi, %o2
ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1) ! merge %o2 and %o3 with
stxa %o5, [%i0+0x10]%asi ! %o5 from previous read
stxa %o2, [%i0+0x18]%asi ! into %o5 and %o2
! Repeat the same for next 32 bytes.
ldda [%i1+0x30]%asi, %o4
ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1)
stxa %o3, [%i0+0x20]%asi
stxa %o4, [%i0+0x28]%asi
ldda [%i1+0x40]%asi, %o2
ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1)
stxa %o5, [%i0+0x30]%asi
stxa %o2, [%i0+0x38]%asi
add %o0, 0x40, %o0
add %i1, 0x40, %i1
subcc %i3, 0x40, %i3
bgu,pt %ncc, loop2
add %i0, 0x40, %i0
ba .blkdone
add %i1, %l1, %i1 ! increment the source by src offset
! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
.blkcpy:
andn %i1, 0x3f, %o0 ! %o0 has block aligned source
prefetch [%o0+0x0], #one_read
1:
prefetch [%o0+0x40], #one_read
ldda [%i1+0x0]%asi, %o2
ldda [%i1+0x10]%asi, %o4
stxa %o2, [%i0+0x0]%asi
stxa %o3, [%i0+0x8]%asi
stxa %o4, [%i0+0x10]%asi
stxa %o5, [%i0+0x18]%asi
ldda [%i1+0x20]%asi, %o2
ldda [%i1+0x30]%asi, %o4
stxa %o2, [%i0+0x20]%asi
stxa %o3, [%i0+0x28]%asi
stxa %o4, [%i0+0x30]%asi
stxa %o5, [%i0+0x38]%asi
add %o0, 0x40, %o0
add %i1, 0x40, %i1
subcc %i3, 0x40, %i3
bgu,pt %ncc, 1b
add %i0, 0x40, %i0
.blkdone:
membar #Sync
mov ASI_PNF, %asi ! restore %asi to default
! ASI_PRIMARY_NOFAULT value
tst %i2
bz,pt %ncc, .blkexit
nop
! Handle trailing bytes
cmp %i2, 0x8
blu,pt %ncc, .residue
nop
! Can we do some 8B ops
or %i1, %i0, %o2
andcc %o2, 0x7, %g0
bnz %ncc, .last4
nop
! Do 8byte ops as long as possible
.last8:
ldx [%i1], %o2
stx %o2, [%i0]
add %i1, 0x8, %i1
sub %i2, 0x8, %i2
cmp %i2, 0x8
bgu,pt %ncc, .last8
add %i0, 0x8, %i0
tst %i2
bz,pt %ncc, .blkexit
nop
ba .residue
nop
.last4:
! Can we do 4B ops
andcc %o2, 0x3, %g0
bnz %ncc, .last2
nop
1:
ld [%i1], %o2
st %o2, [%i0]
add %i1, 0x4, %i1
sub %i2, 0x4, %i2
cmp %i2, 0x4
bgu,pt %ncc, 1b
add %i0, 0x4, %i0
cmp %i2, 0
bz,pt %ncc, .blkexit
nop
ba .residue
nop
.last2:
! Can we do 2B ops
andcc %o2, 0x1, %g0
bnz %ncc, .residue
nop
1:
lduh [%i1], %o2
stuh %o2, [%i0]
add %i1, 0x2, %i1
sub %i2, 0x2, %i2
cmp %i2, 0x2
bgu,pt %ncc, 1b
add %i0, 0x2, %i0
cmp %i2, 0
bz,pt %ncc, .blkexit
nop
.residue:
ldub [%i1], %o2
stb %o2, [%i0]
inc %i1
deccc %i2
bgu,pt %ncc, .residue
inc %i0
.blkexit:
ret
restore %g5, %g0, %o0
#endif /* NIAGARA2_IMPL */
SET_SIZE(memcpy)
SET_SIZE(__align_cpy_1)