/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
*/
.file "memcpy.s"
/*
* memcpy(s1, s2, len)
*
* Copy s2 to s1, always copying len bytes.
* Note: this code does not work for overlapped copies;
* memmove() and bcopy() do.
*
* The additional entry __align_cpy_1 is provided primarily for use by compilers.
*
* Fast assembler language version of the following C-program for memcpy
* which represents the `standard' for the C-library.
*
* void *
* memcpy(void *s, const void *s0, size_t n)
* {
* if (n != 0) {
* char *s1 = s;
* const char *s2 = s0;
* do {
* *s1++ = *s2++;
* } while (--n != 0);
* }
* return (s);
* }
*
*
*/
#include <sys/asm_linkage.h>
#include <sys/asi.h>
#include <sys/trap.h>
#include <sys/athena_extreg.h>
#define ALIGNED8_FPCOPY_THRESHOLD 1024
#define ALIGNED4_FPCOPY_THRESHOLD 1024
#define PF_FCN_LD 20
#define PF_FCN_ST 22
#define PF_FCN_LD_L2_256 29
#define PF_FCN_ST_L2_256 31
#define PF_DIST_LD (2048)
#define PF_DIST_DC 0xa90
#define PF_DIST_L1 0x090
#define FPRS_FEF 0x4
/*
* Macros to specify Athena SIMD instructions and/or disable HW prefetch
* for modified instructions. An SXAR2_* macro modifies the two instructions
* that directly follow the macro.
*/
#define SXAR2_DISABLE_PF SXAR2_RAW(0, 0, 0, 2, 0, 0, 0, 0, 2, 0)
#define SXAR2_SIMD SXAR2_RAW(1, 0, 0, 0, 0, 1, 0, 0, 0, 0)
#define SXAR2_SIMD_DIS_PF SXAR2_RAW(1, 0, 0, 2, 0, 1, 0, 0, 2, 0)
ANSI_PRAGMA_WEAK(memmove,function)
ANSI_PRAGMA_WEAK(memcpy,function)
ENTRY(memmove)
cmp %o1, %o0 ! if from address >= to address, a forward copy is safe
bgeu,pn %ncc, .forcpy ! so use the forward copy
sub %o0, %o1, %o4 ! get difference of the two addresses
cmp %o2, %o4 ! compare size with the address difference
bleu,pn %ncc, .forcpy ! if size <= difference, no overlap: forward copy
add %o1, %o2, %o5 ! get to end of source space
!
! an overlapped copy that must be done "backwards"
!
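/*
* For reference, a minimal C sketch of what this backward path computes
* (the helper name is hypothetical; the assembly below additionally
* aligns dst and moves 8 bytes at a time where it can):
*
* #include <stddef.h>
*
* void *
* copy_backward(void *dst, const void *src, size_t n)
* {
*     char *d = (char *)dst + n;
*     const char *s = (const char *)src + n;
*
*     while (n-- != 0)
*         *--d = *--s;    // last byte first, so the overlap is safe
*     return (dst);
* }
*/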
.chksize:
cmp %o2, 8 ! if size is less than 8 bytes, do a byte copy
blu,pt %ncc, 2f ! else continue below
! Now size is bigger than 8
.dbalign:
add %o0, %o2, %g1 ! get to end of dest space
andcc %g1, 7, %o3 ! %o3 has bytes till dst 8 bytes aligned
bz,a,pn %ncc, .dbbck ! if dst is already 8 byte aligned, skip the alignment
andn %o2, 7, %o3 ! %o3 count is multiple of 8 bytes size
sub %o2, %o3, %o2 ! update o2 with new count
1: dec %o5 ! decrement source
ldub [%o5], %g1 ! load one byte
deccc %o3 ! decrement count
bgu,pt %ncc, 1b ! if not done keep copying
stb %g1, [%o5+%o4] ! store one byte into dest
andncc %o2, 7, %o3 ! %o3 count is multiple of 8 bytes size
bz,pn %ncc, 2f ! if size < 8, move to byte copy
! Now Destination is 8 byte aligned
.dbbck:
andcc %o5, 7, %o0 ! %o0 has src offset
bz,a,pn %ncc, .dbcopybc ! if src is aligned, use the fast 8-byte copy
sub %o2, %o3, %o2 ! Residue bytes in %o2
.cpy_dbwdbc: ! alignment of src is needed
sub %o2, 8, %o2 ! set size one loop ahead
sll %o0, 3, %g1 ! %g1 is left shift
mov 64, %g5 ! init %g5 to be 64
sub %g5, %g1, %g5 ! %g5 right shift = (64 - left shift)
sub %o5, %o0, %o5 ! align the src at 8 bytes.
add %o4, %o0, %o4 ! increase difference between src & dst
ldx [%o5], %o1 ! load first 8 bytes
srlx %o1, %g5, %o1
1: sub %o5, 8, %o5 ! subtract 8 from src
ldx [%o5], %o0 ! load 8 byte
sllx %o0, %g1, %o3 ! shift loaded 8 bytes left into tmp reg
or %o1, %o3, %o3 ! align data
stx %o3, [%o5+%o4] ! store 8 byte
subcc %o2, 8, %o2 ! subtract 8 byte from size
bg,pt %ncc, 1b ! if size > 0 continue
srlx %o0, %g5, %o1 ! move extra byte for the next use
srl %g1, 3, %o0 ! restore %o0 value for alignment
add %o5, %o0, %o5 ! restore src alignment
sub %o4, %o0, %o4 ! restore difference between src & dest
ba 2f ! branch to the trailing byte copy
add %o2, 8, %o2 ! restore size value
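/*
* A rough C model of the shift-and-merge done by the loop above
* (hypothetical helper; big-endian as on SPARC): each misaligned
* doubleword is rebuilt from two neighbouring aligned doublewords,
* using the same left/right shift pair kept in %g1/%g5.
*
* #include <stdint.h>
*
* static uint64_t
* merge_dword(uint64_t lo_addr_word, uint64_t hi_addr_word, unsigned off)
* {
*     unsigned lsh = off * 8;        // %g1; off is 1..7 on this path
*     unsigned rsh = 64 - lsh;       // %g5
*
*     return ((lo_addr_word << lsh) | (hi_addr_word >> rsh));
* }
*/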
.dbcopybc: ! alignment of src is not needed
1: sub %o5, 8, %o5 ! subtract from src
ldx [%o5], %g1 ! load 8 bytes
subcc %o3, 8, %o3 ! subtract from size
bgu,pt %ncc, 1b ! if size is bigger than 0, continue
stx %g1, [%o5+%o4] ! store 8 bytes to destination
ba 2f
nop
.bcbyte:
1: ldub [%o5], %g1 ! load one byte
stb %g1, [%o5+%o4] ! store one byte
2: deccc %o2 ! decrement size
bgeu,a,pt %ncc, 1b ! if size is >= 0 continue
dec %o5 ! decrement from address
.exitbc: ! exit from backward copy
retl
add %o5, %o4, %o0 ! restore dest addr
SET_SIZE(memmove)
.align 64
ENTRY(memcpy)
ENTRY(__align_cpy_1)
! Note that sun4v variants actually do something different for
! memmove, but Athena will use all the same code as memcpy.
.forcpy:
prefetch [%o1], PF_FCN_LD
prefetch [%o0], PF_FCN_ST
cmp %o2, 64 !!! subline size
bgu,pn %icc, .medium ! jmp if size > 64
mov %o0, %g1
cmp %o2, 0x3
ble,pt %icc, .smallest ! jmp if size < 4
or %o0, %o1, %o3
btst 0x3, %o3
be,pt %icc, .small_aln4 ! jmp if src and dst are 4b aligned
sub %o2, 0x3, %o2 ! TRICKY: .small_unal, .small_aln4
!! 4-64 bytes, alignment unknown
.small_unal: ! copy 4 bytes
ldub [%o1], %o3
subcc %o2, 0x4, %o2
stb %o3, [%o0]
ldub [%o1 + 0x1], %o3
add %o1, 0x4, %o1
stb %o3, [%o0 + 0x1]
ldub [%o1 - 0x2], %o3
add %o0, 0x4, %o0
stb %o3, [%o0 - 0x2]
ldub [%o1 - 0x1], %o3
bgu,pt %icc, .small_unal ! at least 4 bytes?
stb %o3, [%o0 - 0x1]
add %o2, 0x3, %o2 ! TRICKY: see above comment
.smallest: ! 0-3 bytes
tst %o2
be,pt %icc, .sm_exit ! 0 bytes?
nop
.small_3x: ! 1-3 bytes
ldub [%o1], %o3
subcc %o2, 0x1, %o2
be,pt %icc, .sm_exit ! 0 bytes?
stb %o3, [%o0]
ldub [%o1 + 0x1], %o3
subcc %o2, 0x1, %o2
be,pt %icc, .sm_exit ! 0 bytes?
stb %o3, [%o0 + 0x1]
ldub [%o1 + 0x2], %o3
stb %o3, [%o0 + 0x2]
retl ! done
mov %g1, %o0
.sm_aln4_8: ! copy 2 words (8 bytes)
ld [%o1], %o3
.sm_aln4_8_entry: ! 8-64 bytes, copy 8 bytes
subcc %o2, 0x8, %o2
st %o3, [%o0]
add %o1, 0x8, %o1
ld [%o1 - 0x4], %o3
add %o0, 0x8, %o0
bgu,pt %icc, .sm_aln4_8 ! at least 8 bytes?
st %o3, [%o0 - 0x4]
addcc %o2, 0x7, %o2
be,pt %icc, .sm_exit ! 0 bytes?
nop
cmp %o2, 0x4
bl .small_3x ! 1-3 bytes?
nop
ld [%o1], %o3
add %o1, 0x4, %o1
subcc %o2, 0x4, %o2
st %o3, [%o0]
add %o0, 0x4, %o0
bne,pt %icc, .small_3x ! 1-3 bytes?
nop
retl ! done
mov %g1, %o0
!! src, dst is 4b aligned
.small_aln4: ! 4-64 bytes
subcc %o2, 0x4, %o2
bgu,pt %icc, .sm_aln4_8_entry ! TRICKY: jump if size > 7 bytes
ld [%o1], %o3
addcc %o2, 0x3, %o2
be,pt %icc, .sm_exit ! 0 bytes?
st %o3, [%o0]
subcc %o2, 0x1, %o2
ldub [%o1 + 0x4], %o3
be,pt %icc, .sm_exit ! 0 bytes?
stb %o3, [%o0 + 0x4]
ldub [%o1 + 0x5], %o3
subcc %o2, 0x1, %o2
be,pt %icc, .sm_exit ! 0 bytes?
stb %o3, [%o0 + 0x5]
ldub [%o1 + 0x6], %o3
stb %o3, [%o0 + 0x6]
.sm_exit: ! done
retl
mov %g1, %o0
!! size > 64
!! prefetch the next line (256 bytes) here, but not the next subline
!! (64 bytes), because the whole 256-byte line may already have been
!! prefetched by the very first prefetch when the data comes from memory.
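/*
* The prefetch policy above, restated as a hedged C illustration using
* the GCC/Clang builtin (the helper name and the locality hint 2 are
* illustrative only; the real code uses the Athena prefetch function
* codes PF_FCN_LD_L2_256/PF_FCN_ST_L2_256):
*
* static void
* prefetch_next_line(const void *src, void *dst)
* {
*     __builtin_prefetch((const char *)src + 0x100, 0, 2);  // read side
*     __builtin_prefetch((char *)dst + 0x100, 1, 2);        // write side
* }
*/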
.medium:
prefetch [%o1 + 0x100], PF_FCN_LD_L2_256
prefetch [%o0 + 0x100], PF_FCN_ST_L2_256
neg %o0, %o5
neg %o1, %o3
andcc %o5, 0x7, %o5 ! (-dst&7)
and %o3, 0x7, %o3
be,pt %icc, 2f ! jmp if dst is 8b aligned
sub %o5, %o3, %o3 ! %o3 = (-dst&7) - (-src&7)
sub %o2, %o5, %o2 ! adjust count to align dst
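/*
* The bookkeeping above in C terms (hypothetical helper; the
* negate-and-mask trick yields "bytes until the next 8-byte boundary"):
*
* #include <stdint.h>
* #include <stddef.h>
*
* static void
* align_bookkeeping(const void *src, void *dst, size_t n,
*     size_t *head, long *skew, size_t *rest)
* {
*     size_t d = (size_t)(-(uintptr_t)dst) & 7;   // (-dst & 7), %o5
*     size_t s = (size_t)(-(uintptr_t)src) & 7;   // (-src & 7)
*
*     *head = d;                  // bytes copied by the byte loop below
*     *skew = (long)d - (long)s;  // %o3, used by the unaligned paths
*     *rest = n - d;              // %o2 after the adjustment above
* }
*/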
!! align dst
1:
ldub [%o1], %o4
subcc %o5, 0x1, %o5
add %o1, 0x1, %o1
stb %o4, [%o0]
bgu,pt %icc, 1b ! dst is 8b aligned?
add %o0, 0x1, %o0
!! Now dst is 8b aligned
2:
btst 0x3, %o1
prefetch [%o1 + 0x40], PF_FCN_LD
bne,pt %icc, .med_unal ! jmp if src is not 4b aligned
prefetch [%o0 + 0x40], PF_FCN_ST
btst 0x7, %o1
prefetch [%o1 + 0x80], PF_FCN_LD_L2_256
be,pt %icc, .med_aln8 ! jmp if src is 8b aligned
prefetch [%o0 + 0x80], PF_FCN_ST_L2_256
!!
!! dst is 8b aligned, src is 4b aligned
!!
#if ALIGNED4_FPCOPY_THRESHOLD < 4096
cmp %o2, ALIGNED4_FPCOPY_THRESHOLD
#else /* ALIGNED4_FPCOPY_THRESHOLD < 4096 */
set ALIGNED4_FPCOPY_THRESHOLD, %o4
cmp %o2, %o4
#endif /* ALIGNED4_FPCOPY_THRESHOLD < 4096 */
bgu,pt %icc, .big_aln4 ! jmp if size > threshold
prefetch [%o1 + 0xc0], PF_FCN_LD_L2_256
subcc %o2, 0xf, %o2
prefetch [%o0 + 0xc0], PF_FCN_ST_L2_256
!! this branch is never taken, since size > 57 once dst is aligned
ble,pn %icc, .med_aln4_15 ! jmp if size <= 15
.empty ! prefetch at .med_aln4_16 in delay slot
!! dst is 8b aligned, src is 4b aligned,
!! 15 < size <= threshold
.med_aln4_16: ! copy 16 bytes
prefetch [%o1 + 0x100], PF_FCN_LD_L2_256
ld [%o1], %o4
subcc %o2, 0x10, %o2
prefetch [%o0 + 0x100], PF_FCN_ST_L2_256
st %o4, [%o0]
ld [%o1 + 0x4], %o3
add %o1, 0x10, %o1
st %o3, [%o0 + 0x4]
ld [%o1 - 0x8], %o4
add %o0, 0x10, %o0
st %o4, [%o0 - 0x8]
ld [%o1 - 0x4], %o3
bgu,pt %icc, .med_aln4_16 ! at least 16 bytes?
st %o3, [%o0 - 0x4]
.med_aln4_15: ! 0-15 bytes
addcc %o2, 0xf, %o2
be,pt %icc, .exit ! 0 bytes?
nop
cmp %o2, 0x8
bl,pt %icc, .med_7 ! 1-7 bytes?
nop
ld [%o1], %o4
subcc %o2, 0x8, %o2
st %o4, [%o0]
add %o1, 0x8, %o1
ld [%o1 - 0x4], %o3
add %o0, 0x8, %o0
st %o3, [%o0 - 0x4]
be,pt %icc, .exit ! 0 bytes?
nop
.med_7: ! 1-7 bytes
cmp %o2, 0x3
ble,pt %icc, .med_3 ! 1-3 bytes?
nop
ld [%o1], %o4
sub %o2, 0x4, %o2
add %o1, 0x4, %o1
st %o4, [%o0]
add %o0, 0x4, %o0
tst %o2
be,pt %icc, .exit ! 0 bytes?
nop
.med_3: ! 1-3 bytes
subcc %o2, 0x1, %o2
ldub [%o1], %o3
be,pt %icc, .exit ! 0 bytes?
stb %o3, [%o0]
ldub [%o1 + 0x1], %o3
subcc %o2, 0x1, %o2
be,pt %icc, .exit ! 0 bytes?
stb %o3, [%o0 + 0x1]
ldub [%o1 + 0x2], %o3
stb %o3, [%o0 + 0x2]
.exit:
retl
mov %g1, %o0
.med_aln8:
!!
!! dst is 8b aligned, src is 8b aligned
!!
#if ALIGNED8_FPCOPY_THRESHOLD < 4096
cmp %o2, ALIGNED8_FPCOPY_THRESHOLD
#else /* ALIGNED8_FPCOPY_THRESHOLD < 4096 */
set ALIGNED8_FPCOPY_THRESHOLD, %o4
cmp %o2, %o4
#endif /* ALIGNED8_FPCOPY_THRESHOLD < 4096 */
bgu,pt %icc, .big_aln8 ! jmp if size > threshold
prefetch [%o1 + 0xc0], PF_FCN_LD_L2_256
prefetch [%o0 + 0xc0], PF_FCN_ST_L2_256
L_INTCOPY_SMALL: ! (re-entry for .big_aln8)
subcc %o2, 0x1f, %o2
ble,pn %icc, .med_aln8_31 ! jmp if size <= 31
.empty ! prefetch at .med_aln8_32 in delay slot
.med_aln8_32: ! copy 32 bytes
prefetch [%o1 + 0x100], PF_FCN_LD_L2_256
ldx [%o1], %o4
subcc %o2, 0x20, %o2
prefetch [%o0 + 0x100], PF_FCN_ST_L2_256
stx %o4, [%o0]
ldx [%o1 + 0x8], %o3
add %o1, 0x20, %o1
stx %o3, [%o0 + 0x8]
ldx [%o1 - 0x10], %o4
add %o0, 0x20, %o0
stx %o4, [%o0 - 0x10]
ldx [%o1 - 0x8], %o3
bgu,pt %icc, .med_aln8_32 ! at least 32 bytes?
stx %o3, [%o0 - 0x8]
.med_aln8_31: ! 0-31 bytes
addcc %o2, 0x10, %o2
ble,pt %icc, .med_aln8_15 ! 0-15 bytes?
nop
ldx [%o1], %o4
add %o1, 0x10, %o1
stx %o4, [%o0]
sub %o2, 0x10, %o2
ldx [%o1 - 0x8], %o3
add %o0, 0x10, %o0
stx %o3, [%o0 - 0x8]
.med_aln8_15: ! 0-15 bytes
addcc %o2, 0xf, %o2
be,pt %icc, .exit ! 0 bytes?
nop
cmp %o2, 0x8
bl,pt %icc, .med_7 ! 1-7 bytes?
nop
ldx [%o1], %o4
add %o1, 0x8, %o1
stx %o4, [%o0]
subcc %o2, 0x8, %o2
be,pt %icc, .exit ! 0 bytes?
add %o0, 0x8, %o0
ba .med_7 ! 1-7 bytes
nop
!! dst is 8b aligned, src is 8b aligned
!! size > ALIGNED8_FPCOPY_THRESHOLD
.big_aln8:
rd %fprs, %o4
andcc %o4, FPRS_FEF, %o4 ! preserve %o4
be,a,pt %icc, 1f ! is FPU enabled?
wr %g0, FPRS_FEF, %fprs ! enable FPU
1:
btst 0xf, %o0
be,pt %icc, 2f ! dst 16-byte aligned?
nop
ldd [%o1], %f0
std %f0, [%o0]
add %o0, 8, %o0
add %o1, 8, %o1
sub %o2, 8, %o2
2:
sub %o2, 128*5-1, %o2 ! adjust size for 512-byte epilogue
!! dst is 16b aligned, src is 8b aligned
!! size > ALIGNED8_FPCOPY_THRESHOLD
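/*
* Control-flow outline of the pipelined copy below, as hedged C
* (hypothetical helper; memcpy() of 128 bytes stands in for the SIMD
* ldd/std groups, and the prefetch schedule is omitted):
*
* #include <string.h>
*
* static void
* pipelined_copy(char *dst, const char *src, size_t n)
* {
*     // prologue: prefetch and run the pointers 4 blocks (512B) ahead
*     src += 4 * 128;
*     dst += 4 * 128;
*     // kernel loop: each pass copies the block fetched 4 passes ago
*     while (n > 5 * 128) {
*         memcpy(dst - 4 * 128, src - 4 * 128, 128);
*         src += 128; dst += 128; n -= 128;
*     }
*     // fixed 512-byte epilogue: drain the four in-flight blocks
*     for (int i = 0; i < 4; i++) {
*         memcpy(dst - 4 * 128, src - 4 * 128, 128);
*         src += 128; dst += 128;
*     }
*     // the sub-128-byte tail is finished by L_INTCOPY_SMALL
* }
*/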
prologue:
prefetch [%o1+PF_DIST_LD], PF_FCN_LD_L2_256
prefetch [%o0+PF_DIST_LD], PF_FCN_ST_L2_256
add %o1, 128, %o1
add %o0, 128, %o0
prefetch [%o1+PF_DIST_LD], PF_FCN_LD_L2_256
prefetch [%o0+PF_DIST_LD], PF_FCN_ST_L2_256
add %o1, 128, %o1
add %o0, 128, %o0
prefetch [%o1-128*2], PF_FCN_LD
prefetch [%o0-128*2], PF_FCN_ST
prefetch [%o1+PF_DIST_LD], PF_FCN_LD_L2_256
prefetch [%o0+PF_DIST_LD], PF_FCN_ST_L2_256
add %o1, 128, %o1
add %o0, 128, %o0
prefetch [%o1-128*2], PF_FCN_LD
prefetch [%o0-128*2], PF_FCN_ST
prefetch [%o1+PF_DIST_LD], PF_FCN_LD_L2_256
prefetch [%o0+PF_DIST_LD], PF_FCN_ST_L2_256
add %o1, 128, %o1
add %o0, 128, %o0
kernel_loop: ! copy 128 bytes
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*0], %f0
ldd [%o1-128*4+16*1], %f2
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*2], %f4
ldd [%o1-128*4+16*3], %f6
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*4], %f8
ldd [%o1-128*4+16*5], %f10
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*6], %f12
ldd [%o1-128*4+16*7], %f14
SXAR2_SIMD_DIS_PF
std %f0, [%o0-128*4+16*0]
std %f2, [%o0-128*4+16*1]
SXAR2_SIMD_DIS_PF
std %f4, [%o0-128*4+16*2]
std %f6, [%o0-128*4+16*3]
SXAR2_SIMD_DIS_PF
std %f8, [%o0-128*4+16*4]
std %f10, [%o0-128*4+16*5]
SXAR2_SIMD_DIS_PF
std %f12, [%o0-128*4+16*6]
std %f14, [%o0-128*4+16*7]
subcc %o2, 128, %o2
prefetch [%o1-128*2], PF_FCN_LD
prefetch [%o0-128*2], PF_FCN_ST
prefetch [%o1+PF_DIST_LD], PF_FCN_LD_L2_256
prefetch [%o0+PF_DIST_LD], PF_FCN_ST_L2_256
add %o1, 128, %o1
add %o0, 128, %o0
bgu,pt %icc, kernel_loop ! more than 128 bytes?
nop
epilogue: ! copy 512 bytes
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*0], %f0
ldd [%o1-128*4+16*1], %f2
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*2], %f4
ldd [%o1-128*4+16*3], %f6
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*4], %f8
ldd [%o1-128*4+16*5], %f10
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*6], %f12
ldd [%o1-128*4+16*7], %f14
SXAR2_SIMD_DIS_PF
std %f0, [%o0-128*4+16*0]
std %f2, [%o0-128*4+16*1]
SXAR2_SIMD_DIS_PF
std %f4, [%o0-128*4+16*2]
std %f6, [%o0-128*4+16*3]
SXAR2_SIMD_DIS_PF
std %f8, [%o0-128*4+16*4]
std %f10, [%o0-128*4+16*5]
SXAR2_SIMD_DIS_PF
std %f12, [%o0-128*4+16*6]
std %f14, [%o0-128*4+16*7]
prefetch [%o1-128*2], PF_FCN_LD
prefetch [%o0-128*2], PF_FCN_ST
add %o1, 128, %o1
add %o0, 128, %o0
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*0], %f0
ldd [%o1-128*4+16*1], %f2
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*2], %f4
ldd [%o1-128*4+16*3], %f6
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*4], %f8
ldd [%o1-128*4+16*5], %f10
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*6], %f12
ldd [%o1-128*4+16*7], %f14
SXAR2_SIMD_DIS_PF
std %f0, [%o0-128*4+16*0]
std %f2, [%o0-128*4+16*1]
SXAR2_SIMD_DIS_PF
std %f4, [%o0-128*4+16*2]
std %f6, [%o0-128*4+16*3]
SXAR2_SIMD_DIS_PF
std %f8, [%o0-128*4+16*4]
std %f10, [%o0-128*4+16*5]
SXAR2_SIMD_DIS_PF
std %f12, [%o0-128*4+16*6]
std %f14, [%o0-128*4+16*7]
prefetch [%o1-128*2], PF_FCN_LD
prefetch [%o0-128*2], PF_FCN_ST
add %o1, 128, %o1
add %o0, 128, %o0
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*0], %f0
ldd [%o1-128*4+16*1], %f2
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*2], %f4
ldd [%o1-128*4+16*3], %f6
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*4], %f8
ldd [%o1-128*4+16*5], %f10
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*6], %f12
ldd [%o1-128*4+16*7], %f14
SXAR2_SIMD_DIS_PF
std %f0, [%o0-128*4+16*0]
std %f2, [%o0-128*4+16*1]
SXAR2_SIMD_DIS_PF
std %f4, [%o0-128*4+16*2]
std %f6, [%o0-128*4+16*3]
SXAR2_SIMD_DIS_PF
std %f8, [%o0-128*4+16*4]
std %f10, [%o0-128*4+16*5]
SXAR2_SIMD_DIS_PF
std %f12, [%o0-128*4+16*6]
std %f14, [%o0-128*4+16*7]
add %o1, 128, %o1
add %o0, 128, %o0
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*0], %f0
ldd [%o1-128*4+16*1], %f2
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*2], %f4
ldd [%o1-128*4+16*3], %f6
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*4], %f8
ldd [%o1-128*4+16*5], %f10
SXAR2_SIMD_DIS_PF
ldd [%o1-128*4+16*6], %f12
ldd [%o1-128*4+16*7], %f14
SXAR2_SIMD_DIS_PF
std %f0, [%o0-128*4+16*0]
std %f2, [%o0-128*4+16*1]
SXAR2_SIMD_DIS_PF
std %f4, [%o0-128*4+16*2]
std %f6, [%o0-128*4+16*3]
SXAR2_SIMD_DIS_PF
std %f8, [%o0-128*4+16*4]
std %f10, [%o0-128*4+16*5]
SXAR2_SIMD_DIS_PF
std %f12, [%o0-128*4+16*6]
std %f14, [%o0-128*4+16*7]
add %o1, 128, %o1
add %o0, 128, %o0
sub %o1, 128*4, %o1
sub %o0, 128*4, %o0
add %o2, 128*5-1-128*4, %o2 ! restore size
wr %o4, %g0, %fprs ! restore FPU setting from %o4
ba L_INTCOPY_SMALL
nop
.med_unal:
prefetch [%o1 + 0x80], PF_FCN_LD_L2_256
prefetch [%o1 + 0xc0], PF_FCN_LD_L2_256
!!
!! 4b-aligned/unaligned copy with %fp regs
!! dst is already 8-byte aligned
!!
.big_aln4: ! code shared with .med_unal
rd %fprs, %o4
andcc %o4, FPRS_FEF, %o4 ! preserve %o4
be,a,pt %icc, .fp_ready ! is FPU enabled?
wr %g0, FPRS_FEF, %fprs ! enable FPU
.fp_ready:
cmp %o2, 0xff
sethi %hi(0x1234567f), %o5
or %o5, 0x67f, %o5
bmask %o5, %g0, %g0 ! set gsr mask field
add %o1, 0x8, %o1 ! prepare to round src upward
alignaddr %o1, %g0, %o1 ! align src and set gsr align field
!
! %o5 = number of bytes to copy in .not8_medium
!
! size < 256: 8-byte aligned count
! size >= 256: bytes to block align
!
! %o2 = actual count remaining
!
! %o3 = (-dst&7) - (-src&7) set in .medium
! = - (-src&7) current
! = how much sooner we'll cross the alignment boundary
! in SRC compared to in DST
!
! Examples: Let # denote bytes that should not be accessed
! Let x denote a byte already copied to align DST
! Let . and - denote bytes not yet copied
! Let | denote double alignment boundaries
!
! DST: ######xx|........|--------|..###### %o2 = 18
! %o0
!
! %o3 = -3: SRC: ###xx...|.....---|-----..#|######## %o5 = 8
! %o1
!
! %o3 = 0: SRC: ######xx|........|--------|..###### %o5 = 16-8 = 8
! %o1
! (never happens, since src/dst 8-byte aligned is a special case
! we handle in .medium)
!
! %o3 = +1: SRC: #######x|x.......|.-------|-..##### %o5 = 16-8 = 8
! %o1
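/*
* The %o5 computation just below, restated as hedged C (hypothetical
* helper; "skew" is the %o3 value described above, dst is %o0):
*
* #include <stdint.h>
* #include <stddef.h>
*
* static size_t
* not8_medium_count(size_t n, long skew, const void *dst)
* {
*     if (n >= 0x100)         // large copy: just 64-byte align dst
*         return ((size_t)(-(uintptr_t)dst) & 0x3f);
*     // medium copy: count rounded down to a multiple of 8, with a -8
*     // bias when skew >= 0 (compare the "%o5 = 16-8" examples above)
*     long cnt = (skew < 0 ? 0 : -8) + (long)n + skew;
*     return ((size_t)cnt & ~(size_t)0x7);
* }
*/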
mov -0x8, %o5
movrlz %o3, %g0, %o5
add %o5, %o2, %o5
add %o5, %o3, %o5
bleu,pt %icc, 1f ! jmp if size < 256
andn %o5, 0x7, %o5 ! 8-byte aligned count
neg %o0, %o5
and %o5, 0x3f, %o5 ! (-dst&63)
1:
brgez,a,pt %o3, .not8_medium ! jmp if - (-src&7) >= 0
ldd [%o1 - 0x8], %f0 ! prep %f0
!! src underaligned, so prep %f0 some more
add %o1, %o3, %o1 ! back up src
2:
ldda [%o1] 0xd0, %f2 ! 1byte load
add %o1, 0x1, %o1
btst 0x7, %o1
bne,pt %icc, 2b
bshuffle %f0, %f2, %f0 ! shift %f0 left 1byte, merge with %f2
!! The .not8_med* code below is also used by the epilogue routine.
!! Do not add prefetch instructions here.
.not8_medium:
tst %o5
bz,pt %icc, .not8_medx ! nothing to copy here
sub %o2, %o5, %o2 ! update size for later
ldd [%o1], %f2
subcc %o5, 0x8, %o5 ! update local count
be,pn %icc, 1f ! align and store last 8 bytes in %o5?
add %o1, 0x8, %o1 ! update src
.not8_med_16: ! copy 16 bytes
faligndata %f0, %f2, %f4
ldd [%o1], %f0
subcc %o5, 0x8, %o5 ! update local count
add %o1, 0x10, %o1 ! update src
std %f4, [%o0]
be,pn %icc, 2f ! store last 8 bytes in %o5?
faligndata %f2, %f0, %f6
ldd [%o1 - 0x8], %f2
subcc %o5, 0x8, %o5 ! update local count
std %f6, [%o0 + 0x8]
bne,pt %icc, .not8_med_16 ! at least 16 bytes?
add %o0, 0x10, %o0 ! update dst
1:
faligndata %f0, %f2, %f4
fmovd %f2, %f0
std %f4, [%o0]
ba .not8_medx
add %o0, 0x8, %o0
2:
std %f6, [%o0 + 0x8] ! faligndata in branch delay
sub %o1, 0x8, %o1
add %o0, 0x10, %o0
.not8_medx:
! Currently, %o1 points to the next doubleword-aligned byte in src.
! The 8 bytes starting at [%o1-8] are available in %f0.
! At least one byte, possibly all of them, still needs to be written.
cmp %o2, 0x40
bgu,pt %icc, .not8_large ! size > 64 after done with %o5?
! otherwise 1-15 bytes left
andcc %o3, 0x7, %o5
be,pt %icc, 2f ! 8 bytes left in %f0?
sub %o5, 0x8, %o3
cmp %o2, 0x8
bl,a,pt %icc, 3f ! store 1-7 bytes?
add %o1, %o3, %o1 ! back up %o1
! at least 8 bytes but need to prep %f0
1:
subcc %o5, 0x1, %o5
ldda [%o1] 0xd0, %f2
add %o1, 0x1, %o1
bgu,pt %icc, 1b
bshuffle %f0, %f2, %f0 ! shift %f0 left 1byte, merge with %f2
2: ! store 8 bytes
subcc %o2, 0x8, %o2
std %f0, [%o0]
be,pt %icc, .not8_exit ! 0 bytes?
add %o0, 0x8, %o0
3: ! 1-7 bytes
ldub [%o1], %o3
subcc %o2, 0x1, %o2
add %o1, 0x1, %o1
stb %o3, [%o0]
bgu,pt %icc, 3b ! at least 1 byte?
add %o0, 0x1, %o0
.not8_exit: ! done
wr %o4, %g0, %fprs ! restore FPU setting
retl
mov %g1, %o0
!! size > 64
!! Since only a limited number of prefetches can be issued in the
!! preamble, we issue one every 256 bytes. That is sufficient when the
!! data comes from memory; we accept some performance degradation on
!! copyback, where 64-byte moves are used.
!! Prefetches for dst are also omitted in the preamble; we expect
!! hardware features (prefetch, write buffer, etc.) to compensate.
!!
!! We do not actually use any block store instructions.
!! We just copy 64-byte aligned blocks of 64 bytes.
!!
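/*
* A hedged C model of one 64-byte block of the loop below (hypothetical
* helper): "carry" plays the role of %f0, and faligndata is modelled as
* a big-endian shift/OR pair; lsh is 8 * (original src & 7), which is
* 8..56 on this path since an 8-byte-aligned src is handled elsewhere.
*
* #include <stdint.h>
*
* static uint64_t
* copy_block64(uint64_t *dst8, const uint64_t *src8, uint64_t carry,
*     unsigned lsh)
* {
*     unsigned rsh = 64 - lsh;
*
*     for (int i = 0; i < 8; i++) {
*         uint64_t next = src8[i];
*         dst8[i] = (carry << lsh) | (next >> rsh);  // ~ faligndata
*         carry = next;
*     }
*     return (carry);     // carried into the next block, like %f0
* }
*/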
.not8_large:
prefetch [%o1 + 0x100], PF_FCN_LD_L2_256
prefetch [%o1 + 0x200], PF_FCN_LD_L2_256
prefetch [%o1 + 0x300], PF_FCN_LD_L2_256
! Load the rest of the current block
! %o1 is further into src than %o0 is into dst
ldd [%o1], %f2
ldd [%o1 + 0x8], %f4
faligndata %f0, %f2, %f32
ldd [%o1 + 0x10], %f6
faligndata %f2, %f4, %f34
ldd [%o1 + 0x18], %f8
faligndata %f4, %f6, %f36
ldd [%o1 + 0x20], %f10
mov -0x8, %o5 ! setup for .not8_med_entry
faligndata %f6, %f8, %f38
prefetch [%o1 + 0x400], PF_FCN_LD_L2_256
ldd [%o1 + 0x28], %f12
movrlz %o3, %g0, %o5 ! setup for .not8_med_entry
faligndata %f8, %f10, %f40
ldd [%o1 + 0x30], %f14
faligndata %f10, %f12, %f42
ldd [%o1 + 0x38], %f0
sub %o2, 0x40, %o2 ! update size
add %o1, 0x40, %o1 ! update src
! Write previous block and load rest of current block.
! Some bytes will be loaded that won't yet be written.
.not8_large_blk:
prefetch [%o1 + PF_DIST_DC], PF_FCN_LD_L2_256
prefetch [%o0 + PF_DIST_DC], PF_FCN_ST_L2_256
ldd [%o1], %f2
faligndata %f12, %f14, %f44
ldd [%o1 + 0x8], %f4
faligndata %f14, %f0, %f46
std %f32, [%o0]
std %f34, [%o0 + 0x8]
std %f36, [%o0 + 0x10]
std %f38, [%o0 + 0x18]
std %f40, [%o0 + 0x20]
std %f42, [%o0 + 0x28]
std %f44, [%o0 + 0x30]
std %f46, [%o0 + 0x38]
sub %o2, 0x40, %o2 ! update size
add %o0, 0x40, %o0 ! update dst
ldd [%o1 + 0x10], %f6
faligndata %f0, %f2, %f32
ldd [%o1 + 0x18], %f8
faligndata %f2, %f4, %f34
ldd [%o1 + 0x20], %f10
faligndata %f4, %f6, %f36
ldd [%o1 + 0x28], %f12
faligndata %f6, %f8, %f38
ldd [%o1 + 0x30], %f14
faligndata %f8, %f10, %f40
ldd [%o1 + 0x38], %f0
faligndata %f10, %f12, %f42
prefetch [%o1 + PF_DIST_L1], PF_FCN_LD
prefetch [%o0 + PF_DIST_L1], PF_FCN_ST
cmp %o2, 0x48
bgu,pt %icc, .not8_large_blk ! store curr and load next?
add %o1, 0x40, %o1 ! update src
.not8_large_exit: ! finish curr block
faligndata %f12, %f14, %f44
faligndata %f14, %f0, %f46
std %f32, [%o0]
std %f34, [%o0 + 0x8]
std %f36, [%o0 + 0x10]
std %f38, [%o0 + 0x18]
std %f40, [%o0 + 0x20]
std %f42, [%o0 + 0x28]
std %f44, [%o0 + 0x30]
std %f46, [%o0 + 0x38]
cmp %o2, 0x40
bne,pt %icc, .not8_med_entry ! not exactly one block left?
add %o0, 0x40, %o0
brz,a,pt %o3, .not8_large_done ! block aligned?
ldd [%o1], %f2
.not8_med_entry: ! finish up in .not8_medium
add %o5, %o2, %o5
add %o5, %o3, %o5
ba .not8_medium
andn %o5, 0x7, %o5
.not8_large_done: ! copy last block
ldd [%o1 + 0x8], %f4
ldd [%o1 + 0x10], %f6
fsrc1 %f0, %f32
ldd [%o1 + 0x18], %f8
fsrc1 %f2, %f34
ldd [%o1 + 0x20], %f10
fsrc1 %f4, %f36
ldd [%o1 + 0x28], %f12
fsrc1 %f6, %f38
ldd [%o1 + 0x30], %f14
fsrc1 %f8, %f40
fsrc1 %f10, %f42
fsrc1 %f12, %f44
fsrc1 %f14, %f46
std %f32, [%o0]
std %f34, [%o0 + 0x8]
std %f36, [%o0 + 0x10]
std %f38, [%o0 + 0x18]
std %f40, [%o0 + 0x20]
std %f42, [%o0 + 0x28]
std %f44, [%o0 + 0x30]
std %f46, [%o0 + 0x38]
wr %o4, %g0, %fprs ! restore FPU setting
retl
mov %g1, %o0
SET_SIZE(memcpy)
SET_SIZE(__align_cpy_1)