/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
*/
/*
* memcpy(s1, s2, len)
*
* Copy s2 to s1, always copy n bytes.
* Note: this C code does not work for overlapped copies.
* Memmove() and bcopy() do.
*
* Fast assembler language version of the following C-program for memcpy
* which represents the `standard' for the C-library.
*
* void *
* memcpy(void *s, const void *s0, size_t n)
* {
* if (n != 0) {
* char *s1 = s;
* const char *s2 = s0;
* do {
* *s1++ = *s2++;
* } while (--n != 0);
* }
* return (s);
* }
*/
#include <sys/asm_linkage.h>
!
!
! NOTE(review): the labels from here through .medl15 carry no instruction
! bodies in this chunk of the file, yet surviving code further down still
! branches to some of them (.medwexit at L258/L266-equivalents, and .medw7
! which is not even visible).  The instruction sequences appear to have
! been stripped or lost in extraction -- confirm against the complete
! original source before assembling; this file will not work as-is.
!
! Label names suggest the usual Solaris memcpy dispatch structure
! (byte loops, double-aligned medium loops, backward/overlap variants,
! small-copy unrolled cases) -- presumed from naming only, not verifiable
! from what is visible here.
!
.ovbc:
.byte:
.byteloop:
.exit:
.align 16
.dbalign:
.dbalign1:
.dbmed:
!
!
!
.dbmedl32:
.dbmedl31:
nop !
.dbmedl15:
!
!
.align 16
.dbbck:
.dbmv64:
.dbmv32:
.dbmvx:
.dbmv8:
.dbremain:
.dbbyte:
.dbexit:
.forcpy:
.small_4:
.align 16
.smallfin:
.align 16
.smleft7:
.align 16
.align 16
.smby16:
.smlongx: !
.align 16
.medium:
.med_half:
.med_word:
/*
 * Handle all cases where src and dest are aligned on word
 * or long word boundaries. Use unrolled loops for better
 * performance. This option wins over standard large data
 * move when source and destination is in cache for medium
 * to short data moves.
 */
.medw16:
.medw15:
nop !
nop !
.medwexit:
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is between SMALL_MAX and ALIGNED8_FPCOPY_THRESHOLD
 * bytes.
 */
.align 16
.medl32:
.medl31:
nop !
.medl15:
.align 16
3:
! Set up for the faligndata-based medium copy when SRC is not 8-byte
! aligned the same way as DST.  %o3 holds the signed byte offset of SRC
! relative to DST's 8-byte alignment; %o2 is the remaining byte count.
! NOTE(review): the lead-in sentence of the original comment below is
! truncated ("...in SRC compared to in DST") -- the preceding text is
! missing from this chunk.
! in SRC compared to in DST
!
! Examples: Let # denote bytes that should not be accessed
! Let x denote a byte already copied to align DST
! Let . and - denote bytes not yet copied
! Let | denote double alignment boundaries
!
! DST: ######xx|........|--------|..###### o2 = 18
! o0
!
! o3 = -3: SRC: ###xx...|.....---|-----..#|######## o5 = 8
! o1
!
! o3 = 0: SRC: ######xx|........|--------|..###### o5 = 16-8 = 8
! o1
!
! o3 = +1: SRC: #######x|x.......|.-------|-..##### o5 = 16-8 = 8
! o1
or %g0, -8, %o5
alignaddr %o1, %g0, %o1 ! set GSR.ALIGN and align o1
! movrlz: move-on-register-less-than-zero, i.e. %o5 = 0 when %o3 < 0,
! so %o5 keeps its -8 bias only when %o3 >= 0 (matching the note below).
movrlz %o3, %g0, %o5 ! subtract 8 from o2+o3 only if o3>=0
add %o5, %o2, %o5
add %o5, %o3, %o5
! NOTE(review): this bleu tests condition codes that no visible preceding
! instruction sets (add does not set cc) -- a cmp appears to have been
! stripped from this chunk; confirm against the complete source.
bleu,pt %ncc, 4f
andn %o5, 7, %o5 ! 8 byte aligned count
and %o5, BLOCK_SIZE-1, %o5 ! bytes till DST block aligned
4:
! If SRC is already double aligned (%o3 >= 0 case), preload the first
! double into %d0 (annulled delay slot: ldd runs only if branch taken).
! .beginmedloop itself is not visible in this chunk.
brgez,a %o3, .beginmedloop
ldd [%o1-8], %d0
add %o1, %o3, %o1 ! back up o1
5:
! Byte-load through ASI_FL8_P until %o1 reaches an 8-byte boundary.
ldda [%o1]ASI_FL8_P, %d2
inc %o1
andcc %o1, 7, %g0
! NOTE(review): the delay-slot instruction of this branch is missing --
! the next visible line is a label, further evidence of truncation.
bnz,pt %ncc, 5b
.medloop:
! NOTE(review): the bodies of the numeric local labels below are missing
! (1:/2: appear twice with no instructions between them) -- the unrolled
! faligndata medium-copy loop has been stripped from this chunk.
1:
2:
1:
2:
subcc %o2, 8, %o2
std %d0, [%o0] ! store one aligned double
bz,pt %ncc, .mediumexit ! count reached exactly zero
add %o0, 8, %o0
tst %o2
bz,pt %ncc, .mediumexit
nop
3:
! Trailing-byte loop: copy the final %o2 (< 8) bytes one at a time.
ldub [%o1], %o3
deccc %o2
inc %o1
stb %o3, [%o0]
bgu,pt %ncc, 3b
inc %o0
.mediumexit:
! Restore the caller's FP register state saved in %o4 and return the
! original DST pointer, presumably stashed in %g1 by entry code not
! visible in this chunk.
wr %o4, %g0, %fprs ! fprs = o4 restore fprs
retl
mov %g1, %o0
.align ICACHE_LINE_SIZE
.large_long:
! Copy path for copies where SRC and DST are both 8-byte aligned and the
! length is below BST_THRESHOLD.  Moves 64 bytes per iteration with
! integer ldx/stx pairs, prefetching both streams while enough data
! remains, then drains through 32-, 16- and 8-byte tail sections.
! %o0 DST, 8 byte aligned
! %o1 SRC, 8 byte aligned
! %o2 count (number of bytes to be moved)
! %o3, %o4, %o5 available as temps
set BST_THRESHOLD, %o5
cmp %o2, %o5
bgu,pn %ncc, .xlarge_long ! above threshold: use block stores instead
prefetch [%o1 + (16 * BLOCK_SIZE)], #n_reads
subcc %o2, (16 * BLOCK_SIZE) + 63, %o2 ! adjust length to allow
! cc test for end of loop
ble,pn %ncc, .largel_no ! skip big loop if no more prefetches
prefetch [%o0 + (16 * BLOCK_SIZE)], #n_writes
.largel64p:
! 64 bytes per iteration, prefetching 20 blocks ahead on both streams.
! Loads alternate between %o4 and %o3 to overlap load/store latency.
prefetch [%o1 + (20 * BLOCK_SIZE)], #n_reads
prefetch [%o0 + (20 * BLOCK_SIZE)], #n_writes
ldx [%o1], %o4 ! load
subcc %o2, 64, %o2 ! decrement length count
stx %o4, [%o0] ! and store
ldx [%o1+8], %o3 ! a block of 64 bytes
stx %o3, [%o0+8]
ldx [%o1+16], %o4 ! a block of 64 bytes
stx %o4, [%o0+16]
ldx [%o1+24], %o3 ! a block of 64 bytes
stx %o3, [%o0+24]
ldx [%o1+32], %o4 ! a block of 64 bytes
stx %o4, [%o0+32]
ldx [%o1+40], %o3 ! a block of 64 bytes
add %o1, 64, %o1 ! increase src ptr by 64
stx %o3, [%o0+40]
ldx [%o1-16], %o4
add %o0, 64, %o0 ! increase dst ptr by 64
stx %o4, [%o0-16]
ldx [%o1-8], %o3
bgu,pt %ncc, .largel64p ! repeat if at least 64 bytes left
stx %o3, [%o0-8]
.largel_no:
! Past the prefetch window: give back the 16-block bias so %o2 again
! holds (remaining bytes - 63) for the no-prefetch loop's exit test.
add %o2, (16 * BLOCK_SIZE), %o2
.largel64: ! finish with no more prefetches
ldx [%o1], %o4 ! load
subcc %o2, 64, %o2 ! decrement length count
stx %o4, [%o0] ! and store
ldx [%o1+8], %o3 ! a block of 64 bytes
stx %o3, [%o0+8]
ldx [%o1+16], %o4 ! a block of 64 bytes
stx %o4, [%o0+16]
ldx [%o1+24], %o3 ! a block of 64 bytes
stx %o3, [%o0+24]
ldx [%o1+32], %o4 ! a block of 64 bytes
stx %o4, [%o0+32]
ldx [%o1+40], %o3 ! a block of 64 bytes
add %o1, 64, %o1 ! increase src ptr by 64
stx %o3, [%o0+40]
ldx [%o1-16], %o4
add %o0, 64, %o0 ! increase dst ptr by 64
stx %o4, [%o0-16]
ldx [%o1-8], %o3
bgu,pt %ncc, .largel64 ! repeat if at least 64 bytes left
stx %o3, [%o0-8]
.largel32:
! Fewer than 64 (+63 bias) bytes left: try one 32-byte move.
addcc %o2, 32, %o2 ! adjust finish count
ble,pt %ncc, .largel31
nop
ldx [%o1], %o4 ! load
sub %o2, 32, %o2 ! decrement length count
stx %o4, [%o0] ! and store
ldx [%o1+8], %o3 ! a block of 32 bytes
add %o1, 32, %o1 ! increase src ptr by 32
stx %o3, [%o0+8]
ldx [%o1-16], %o4
add %o0, 32, %o0 ! increase dst ptr by 32
stx %o4, [%o0-16]
ldx [%o1-8], %o3
stx %o3, [%o0-8]
.largel31:
addcc %o2, 16, %o2 ! adjust remaining count
ble,pt %ncc, .largel15 ! skip if 15 or fewer bytes left
nop !
ldx [%o1], %o4 ! load and store 16 bytes
add %o1, 16, %o1 ! increase src ptr by 16
stx %o4, [%o0] !
sub %o2, 16, %o2 ! decrease count by 16
ldx [%o1-8], %o3 !
add %o0, 16, %o0 ! increase dst ptr by 16
stx %o3, [%o0-8]
.largel15:
! NOTE(review): .medwexit and .medw7, targeted below, have no visible
! instruction bodies in this chunk -- confirm against the full source.
addcc %o2, 15, %o2 ! restore count
bz,pt %ncc, .medwexit ! exit if finished
nop
cmp %o2, 8
blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left
nop
ldx [%o1], %o4 ! load 8 bytes
add %o1, 8, %o1 ! increase src ptr by 8
subcc %o2, 8, %o2 ! decrease count by 8
bz,pt %ncc, .medwexit ! exit if finished
stx %o4, [%o0] ! and store 8 bytes
ba .medw7
add %o0, 8, %o0 ! increase dst ptr by 8
.align ICACHE_LINE_SIZE
.large:
! Large-copy path for misaligned SRC, below the block-store threshold.
! Software-pipelined VIS loop: each iteration loads 64 source bytes into
! %f0-%f14, merges pairs with faligndata (using GSR.ALIGN set earlier by
! alignaddr) into %f32-%f46, and stores the 64 bytes produced by the
! PREVIOUS iteration.  NOTE(review): the priming code that first fills
! %f0-%f14 and %f32-%f42 before entering the loop is not visible in this
! chunk -- the loop body consumes values it has not set.
! %o0 I/O DST is 64-byte aligned
1:
prefetch [%o1 + (20 * BLOCK_SIZE)], #n_reads
prefetch [%o0 + (20 * BLOCK_SIZE)], #n_writes
faligndata %f12, %f14, %f44
ldd [%o1], %f2
faligndata %f14, %f0, %f46
std %f32, [%o0]
ldd [%o1 + 8], %f4
faligndata %f0, %f2, %f32
std %f34, [%o0 + 8]
ldd [%o1 + 16], %f6
faligndata %f2, %f4, %f34
std %f36, [%o0 + 16]
ldd [%o1 + 24], %f8
faligndata %f4, %f6, %f36
std %f38, [%o0 + 24]
ldd [%o1 + 32], %f10
faligndata %f6, %f8, %f38
std %f40, [%o0 + 32]
ldd [%o1 + 40], %f12
faligndata %f8, %f10, %f40
std %f42, [%o0 + 40]
ldd [%o1 + 48], %f14
sub %o2, BLOCK_SIZE, %o2 ! update count
faligndata %f10, %f12, %f42
std %f44, [%o0 + 48]
add %o0, BLOCK_SIZE, %o0 ! update DST
cmp %o2, BLOCK_SIZE + 8
ldd [%o1 + 56], %f0
add %o1, BLOCK_SIZE, %o1 ! update SRC
bgu,pt %ncc, 1b
std %f46, [%o0 - 8]
! Drain the pipeline: store the final 64 bytes already staged in
! %f32-%f46 (last two produced here).
faligndata %f12, %f14, %f44
faligndata %f14, %f0, %f46
std %f32, [%o0]
std %f34, [%o0 + 8]
std %f36, [%o0 + 16]
std %f38, [%o0 + 24]
std %f40, [%o0 + 32]
std %f42, [%o0 + 40]
std %f44, [%o0 + 48]
std %f46, [%o0 + 56] ! store 64 bytes
cmp %o2, BLOCK_SIZE
bne %ncc, 2f ! exactly 1 block remaining?
add %o0, BLOCK_SIZE, %o0 ! update DST
! Annulled delay slot: the ldd runs only when %o3 == 0 (SRC aligned).
brz,a %o3, 3f ! is SRC double aligned?
ldd [%o1], %f2
2:
! Misaligned tail: recompute the 8-byte-aligned residual count in %o5
! and fall back to the medium loop (not visible in this chunk).
add %o5, %o2, %o5 ! %o5 was already set to 0 or -8
add %o5, %o3, %o5
ba .beginmedloop
andn %o5, 7, %o5 ! 8 byte aligned count
! This is when there is exactly 1 block remaining and SRC is aligned
3:
ldd [%o1 + 0x8], %f4
ldd [%o1 + 0x10], %f6
fsrc1 %f0, %f32
ldd [%o1 + 0x18], %f8
fsrc1 %f2, %f34
ldd [%o1 + 0x20], %f10
fsrc1 %f4, %f36
ldd [%o1 + 0x28], %f12
fsrc1 %f6, %f38
ldd [%o1 + 0x30], %f14
fsrc1 %f8, %f40
fsrc1 %f10, %f42
fsrc1 %f12, %f44
fsrc1 %f14, %f46
std %f32, [%o0]
std %f34, [%o0 + 8]
std %f36, [%o0 + 16]
std %f38, [%o0 + 24]
std %f40, [%o0 + 32]
std %f42, [%o0 + 40]
std %f44, [%o0 + 48]
std %f46, [%o0 + 56] ! store 64 bytes
! Restore caller's %fprs (saved earlier in %o4) and return original DST.
wr %o4, 0, %fprs
retl
mov %g1, %o0
.align 16
.xlarge_long:
! long word aligned, larger than Block store threshold
! %o0 DST, 8 byte aligned
! %o1 SRC, 8 byte aligned
! %o2 count (number of bytes to be moved)
! %o3, %o4, %o5 available as temps
! prefetch through %o1 + (12* BLOCK_SIZE) has been done
! Need to align DST to 64 byte boundary for block stores
andcc %o0, 63, %o5
bz,pt %ncc, .xlarge_aligned
sub %o5, 64, %o5 ! %o5 = (DST & 63) - 64, i.e. -(bytes to alignment)
add %o2,%o5, %o2 ! shrink count by the alignment fixup bytes
.xlarge_a:
! Copy 8 bytes at a time until %o5 counts up to zero (both pointers are
! 8-byte aligned, so the misalignment is a multiple of 8).
addcc %o5, 8, %o5
ldx [%o1], %o4
add %o1, 8, %o1
add %o0, 8, %o0
bnz,pt %ncc, .xlarge_a
stx %o4, [%o0-8]
! DST is now on 64 byte boundary
.xlarge_aligned:
prefetch [%o1 + (20 * BLOCK_SIZE)], #one_read
rd %fprs, %o4 ! check for unused FPU
andcc %o4, FPRS_FEF, %o4 ! test FEF, fprs.du = fprs.dl = 0
! Annulled branch: the wr (fprs.fef = 1, enabling the FPU) executes only
! when FEF was clear; %o4 keeps the old FEF bit for the restore below.
bz,a %ncc, .xlarge_loop
wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
.xlarge_loop:
! Load 64 bytes into %d0-%d14, then store them with a cache-bypassing
! VIS block store (stda ASI_BLK_P); prefetch 4 blocks ahead for reads.
prefetch [%o1 + (4 * BLOCK_SIZE)], #n_reads
ldd [%o1], %d0
sub %o2, 64, %o2
ldd [%o1 + 8], %d2
ldd [%o1 + 16], %d4
ldd [%o1 + 24], %d6
ldd [%o1 + 32], %d8
ldd [%o1 + 40], %d10
ldd [%o1 + 48], %d12
add %o1, 64, %o1
ldd [%o1 - 8], %d14
stda %d0, [%o0]ASI_BLK_P ! store 64 bytes, bypass cache
add %o0, 64, %o0
cmp %o2,64
bgt,pt %ncc, .xlarge_loop
prefetch [%o1 + (20 * BLOCK_SIZE)], #one_read
membar #StoreLoad|#StoreStore ! needed after final blk store
wr %o4, 0, %fprs ! restore caller's fprs.fef state
! Hand the <= 64-byte remainder to the integer tail loops, biasing %o2
! by -63 the way .largel64's exit test expects.
subcc %o2,63,%o2
bgt,pt %ncc, .largel64
nop
ba .largel32
nop
.align 16
.xlarge:
! Block-store path for very large copies with misaligned SRC: the same
! faligndata software pipeline as .large, but each 64-byte group is
! written with a cache-bypassing stda ASI_BLK_P, and read prefetches use
! #one_read since the data will not be revisited.  NOTE(review): as with
! .large, the loop-priming code that first fills %f0-%f14/%f32-%f42 is
! not visible in this chunk.
! %o0 I/O DST is 64-byte aligned
1:
ldd [%o1], %f2
faligndata %f12, %f14, %f44
ldd [%o1 + 0x8], %f4
faligndata %f14, %f0, %f46
stda %f32, [%o0]ASI_BLK_P ! store previous iteration's 64 bytes
sub %o2, BLOCK_SIZE, %o2 ! update count
ldd [%o1 + 0x10], %f6
faligndata %f0, %f2, %f32
ldd [%o1 + 0x18], %f8
faligndata %f2, %f4, %f34
ldd [%o1 + 0x20], %f10
faligndata %f4, %f6, %f36
ldd [%o1 + 0x28], %f12
faligndata %f6, %f8, %f38
ldd [%o1 + 0x30], %f14
prefetch [%o1 + (4 * BLOCK_SIZE)], #n_reads
faligndata %f8, %f10, %f40
ldd [%o1 + 0x38], %f0
faligndata %f10, %f12, %f42
prefetch [%o1 + (20 * BLOCK_SIZE)], #one_read
add %o0, BLOCK_SIZE, %o0 ! update DST
cmp %o2, BLOCK_SIZE + 8
bgu,pt %ncc, 1b
add %o1, BLOCK_SIZE, %o1 ! update SRC
! Drain the pipeline: emit the final staged block.
faligndata %f12, %f14, %f44
faligndata %f14, %f0, %f46
stda %f32, [%o0]ASI_BLK_P ! store 64 bytes, bypass cache
cmp %o2, BLOCK_SIZE
bne %ncc, 2f ! exactly 1 block remaining?
add %o0, BLOCK_SIZE, %o0 ! update DST
! Annulled delay slot: ldd runs only when %o3 == 0 (SRC aligned).
brz,a %o3, 3f ! is SRC double aligned?
ldd [%o1], %f2
2:
! Misaligned tail: recompute the 8-byte-aligned residual count and fall
! back to the medium loop (not visible in this chunk).  The membar
! orders the block stores before any subsequent accesses.
add %o5, %o2, %o5 ! %o5 was already set to 0 or -8
add %o5, %o3, %o5
membar #StoreLoad|#StoreStore ! needed after final blk store
ba .beginmedloop
andn %o5, 7, %o5 ! 8 byte aligned count
! This is when there is exactly 1 block remaining and SRC is aligned
3:
ldd [%o1 + 0x8], %f4
ldd [%o1 + 0x10], %f6
fsrc1 %f0, %f32
ldd [%o1 + 0x18], %f8
fsrc1 %f2, %f34
ldd [%o1 + 0x20], %f10
fsrc1 %f4, %f36
ldd [%o1 + 0x28], %f12
fsrc1 %f6, %f38
ldd [%o1 + 0x30], %f14
fsrc1 %f8, %f40
fsrc1 %f10, %f42
fsrc1 %f12, %f44
fsrc1 %f14, %f46
stda %f32, [%o0]ASI_BLK_P
membar #StoreLoad|#StoreStore ! needed after final blk store
! Restore caller's %fprs (saved earlier in %o4) and return original DST.
wr %o4, 0, %fprs
retl
mov %g1, %o0
SET_SIZE(memcpy)