/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
*/
/*
* memcpy(s1, s2, len)
*
* Copy s2 to s1, always copy n bytes.
* Note: this C code does not work for overlapped copies.
* Memmove() and bcopy() do.
*
* Added entry __align_cpy_1 is generally for use of the compilers.
*
* Fast assembler language version of the following C-program for memcpy
* which represents the `standard' for the C-library.
*
* void *
* memcpy(void *s, const void *s0, size_t n)
* {
* if (n != 0) {
* char *s1 = s;
* const char *s2 = s0;
* do {
* *s1++ = *s2++;
* } while (--n != 0);
* }
* return (s);
* }
*
*
*/
#include <sys/asm_linkage.h>
#include <sys/athena_extreg.h>
/*
 * Macros for modified instructions. An SXAR2_* macro modifies the two instructions
* that directly follow the macro.
*/
!
!
.chksize:
.dbalign:
.dbbck:
ba 2f
.bcbyte:
.align 64
.forcpy:
!! size > 64
.medium:
1:
2:
!!
!!
#if ALIGNED4_FPCOPY_THRESHOLD < 4096
#else /* ALIGNED4_FPCOPY_THRESHOLD < 4096 */
#endif /* ALIGNED4_FPCOPY_THRESHOLD < 4096 */
.exit:
.med_aln8:
!!
!!
#if ALIGNED8_FPCOPY_THRESHOLD < 4096
#else /* ALIGNED8_FPCOPY_THRESHOLD < 4096 */
#endif /* ALIGNED8_FPCOPY_THRESHOLD < 4096 */
.big_aln8:
1:
2:
.med_unal:
!!
!!
.fp_ready:
!
!
!
!
! in SRC compared to in DST
!
! Examples: Let # denote bytes that should not be accessed
! Let x denote a byte already copied to align DST
! Let . and - denote bytes not yet copied
! Let | denote double alignment boundaries
!
! DST: ######xx|........|--------|..###### %o2 = 18
! %o0
!
! %o3 = -3: SRC: ###xx...|.....---|-----..#|######## %o5 = 8
! %o1
!
! %o3 = 0: SRC: ######xx|........|--------|..###### %o5 = 16-8 = 8
! %o1
!                      (this aligned case we handle in .medium)
!
! %o3 = +1: SRC: #######x|x.......|.-------|-..##### %o5 = 16-8 = 8
! %o1
	!! Compute %o5 = byte count for the aligned "medium" copy below.
	!! %o3 < 0 means SRC is under-aligned relative to DST.
	mov	-0x8, %o5
	movrlz	%o3, %g0, %o5		! %o5 = (%o3 < 0) ? 0 : -8
	add	%o5, %o2, %o5		! %o5 += total size
	add	%o5, %o3, %o5		! %o5 += src/dst alignment delta
	!! NOTE(review): %icc was set by a cmp before this chunk —
	!! presumably "cmp size, 256"; confirm against the full file.
	bleu,pt	%icc, 1f		! jmp if size < 256
	andn	%o5, 0x7, %o5		! 8-byte aligned count
	!! size >= 256: only copy up to the next 64-byte DST boundary
	!! here; the bulk is done by the large-copy path.
	neg	%o0, %o5
	and	%o5, 0x3f, %o5		! (-dst&63)
1:
	!! If SRC is already at-or-over-aligned (%o3 >= 0), pre-load %f0
	!! with the preceding source double (delay slot is annulled when
	!! the branch is not taken).
	brgez,a,pt %o3, .not8_medium	! jmp if - (-src&7) >= 0
	ldd	[%o1 - 0x8], %f0	! prep %f0
	!! src underaligned, so prep %f0 some more
	add	%o1, %o3, %o1		! back up src
2:
	!! Byte loop: pull SRC bytes in one at a time (ASI 0xd0 is a
	!! 1-byte load into an FP register) until %o1 is 8-byte aligned,
	!! shifting each new byte into %f0 from the right.
	ldda	[%o1] 0xd0, %f2		! 1byte load
	add	%o1, 0x1, %o1
	btst	0x7, %o1
	bne,pt	%icc, 2b
	bshuffle %f0, %f2, %f0		! shift %f0 left 1byte, merge with %f2
	!! The .not8_med* code below is also used by the epilogue routine.
	!! Do not put prefetch instruction.
.not8_medium:
	!! Copy %o5 bytes (a multiple of 8) with faligndata, two doubles
	!! per iteration.  Invariant: %f0 holds the previous, partially
	!! consumed source double; %o1 points at the next aligned double.
	tst	%o5
	bz,pt	%icc, .not8_medx	! nothing to copy here
	sub	%o2, %o5, %o2		! update size for later
	ldd	[%o1], %f2
	subcc	%o5, 0x8, %o5		! update local count
	be,pn	%icc, 1f		! align and store last 8 bytes in %o5?
	add	%o1, 0x8, %o1		! update src
.not8_med_16:				! copy 16 bytes
	faligndata %f0, %f2, %f4
	ldd	[%o1], %f0
	subcc	%o5, 0x8, %o5		! update local count
	add	%o1, 0x10, %o1		! update src
	std	%f4, [%o0]
	be,pn	%icc, 2f		! store last 8 bytes in %o5?
	faligndata %f2, %f0, %f6	! (executed in the delay slot)
	ldd	[%o1 - 0x8], %f2
	subcc	%o5, 0x8, %o5		! update local count
	std	%f6, [%o0 + 0x8]
	bne,pt	%icc, .not8_med_16	! at least 16 bytes?
	add	%o0, 0x10, %o0		! update dst
1:
	!! Odd trailing double: store it, keep the remainder in %f0.
	faligndata %f0, %f2, %f4
	fmovd	%f2, %f0
	std	%f4, [%o0]
	ba	.not8_medx
	add	%o0, 0x8, %o0
2:
	!! Even count: %f6 was produced by the faligndata in the branch
	!! delay slot above; %o1 overshot by 8, so pull it back.
	std	%f6, [%o0 + 0x8]	! faligndata in branch delay
	sub	%o1, 0x8, %o1
	add	%o0, 0x10, %o0
.not8_medx:
	! Epilogue: finish the last 1-15 bytes, or hand off to the
	! large-copy path if more than 64 bytes remain.
	! Currently, %o1 points to the next double-aligned byte in src.
	! 8 bytes starting at [%o1-8] are available in %f0
	! At least one byte, possibly all, need to be written.
	cmp	%o2, 0x40
	bgu,pt	%icc, .not8_large	! size > 64 after done with %o5?
	! otherwise 1-15 bytes left
	andcc	%o3, 0x7, %o5		! %o5 = src misalignment (0..7)
	be,pt	%icc, 2f		! 8 bytes left in %f0?
	sub	%o5, 0x8, %o3		! %o3 = misalignment - 8 (delay slot)
	cmp	%o2, 0x8
	bl,a,pt	%icc, 3f		! store 1-7 bytes?
	add	%o1, %o3, %o1		! back up %o1 (annulled if not taken)
	! at least 8 bytes but need to prep %f0
1:
	! Shift single bytes into %f0 until it holds the next full
	! 8 source bytes.
	subcc	%o5, 0x1, %o5
	ldda	[%o1] 0xd0, %f2		! 1-byte load into FP register
	add	%o1, 0x1, %o1
	bgu,pt	%icc, 1b
	bshuffle %f0, %f2, %f0		! shift %f0 left 1byte, merge with %f2
2:					! store 8 bytes
	subcc	%o2, 0x8, %o2
	std	%f0, [%o0]
	be,pt	%icc, .not8_exit	! 0 bytes?
	add	%o0, 0x8, %o0
3:					! 1-7 bytes, via integer byte copy
	ldub	[%o1], %o3
	subcc	%o2, 0x1, %o2
	add	%o1, 0x1, %o1
	stb	%o3, [%o0]
	bgu,pt	%icc, 3b		! at least 1 byte?
	add	%o0, 0x1, %o0
.not8_exit:				! done
	! NOTE(review): %o4 presumably holds the FPRS value saved by the
	! prologue (outside this chunk) — confirm against the full file.
	wr	%o4, %g0, %fprs		! restore FPU setting
	retl
	mov	%g1, %o0		! return value; %g1 presumably the
					! original dst saved by the prologue
!! size > 64
!! Since only limited prefetches can be inserted in preamble,
!! we put it for every 256byte. It is sufficient if data comes
!! from memory. We should allow performance degrade on copyback,
!! in which 64byte move is used.
!! Prefetch for dst in preamble is also neglected. Expecting
!! hardware feature (prefetch, writebuffer, etc.) to compensate.
!!
!! We do not actually use any block store instructions.
!! We just copy 64-byte aligned blocks of 64 bytes.
!!
.not8_large:
	!! Large-copy path (size > 64): prefetch several 256-byte-ahead
	!! chunks of the source into L2, then load and align the first
	!! 64-byte source block into the %f32..%f42 staging registers.
	!! (Continues past this chunk.)
	prefetch [%o1 + 0x100], PF_FCN_LD_L2_256
	prefetch [%o1 + 0x200], PF_FCN_LD_L2_256
	prefetch [%o1 + 0x300], PF_FCN_LD_L2_256
	! Load the rest of the current block
	! %o1 is further into src than %o0 is into dst
	ldd	[%o1], %f2
	ldd	[%o1 + 0x8], %f4
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
	mov	-0x8, %o5		! setup for .not8_med_entry
	faligndata %f6, %f8, %f38
	prefetch [%o1 + 0x400], PF_FCN_LD_L2_256
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5		! setup for .not8_med_entry
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0	! %f0 carries into the next block
	sub	%o2, 0x40, %o2		! update size
	add	%o1, 0x40, %o1		! update src
	! Write previous block and load rest of current block.