/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
*/
/*
* memcpy(s1, s2, len)
*
* Copy s2 to s1; exactly len bytes are always copied.
* Note: this C code does not work for overlapped copies.
* Memmove() and bcopy() do.
*
* Fast assembler language version of the following C-program for memcpy
* which represents the `standard' for the C-library.
*
* void *
* memcpy(void *s, const void *s0, size_t n)
* {
* if (n != 0) {
* char *s1 = s;
* const char *s2 = s0;
* do {
* *s1++ = *s2++;
* } while (--n != 0);
* }
* return (s);
* }
*/
#include <sys/asm_linkage.h>
#ifndef BSTORE_SIZE
#endif
!
!
.ovbc:
.byte:
.byteloop:
.exit:
.align 16
.dbalign:
.dbalign1:
.dbmed:
!
!
.dbmedl64:
.dbmedl32:
.dbmedl31:
nop !
.dbmedl15:
!
!
.align 16
.dbbck:
.dbmv64:
.dbmv32:
.dbmvx:
.dbmv8:
.dbremain:
.dbbyte:
.dbexit:
.forcpy:
.align 16
.align 16
.medium:
1:
2:
/*
* Handle all cases where src and dest are aligned on word
* or long word boundaries. Use unrolled loops for better
* performance. This option wins over standard large data
* move when source and destination are in cache for medium
* to short data moves.
*/
/*
* no need to put prefetch in loop as prefetches have
* already been issued for maximum loop size
*/
.medw16:
.medw15:
nop !
nop !
.medwexit:
/*
* Special case for handling when src and dest are both long word aligned
* and total data to move is between SMALL_MAX and MED_MAX bytes
*/
.align 16
/*
* no need to put prefetch in loop as prefetches have
* already been issued for maximum loop size
*/
.medl32:
.medl31:
nop !
.medl15:
.align 16
3:
! Source realignment set-up for the faligndata-based copy that follows.
! Going by the worked examples below: %o0 is the DST pointer, %o1 is the
! SRC pointer, %o2 is the byte count, and %o3 is the signed difference
! between the SRC and DST alignment. %o5 is computed here as an
! 8-byte-aligned byte count.
! in SRC compared to in DST
!
! Examples: Let # denote bytes that should not be accessed
! Let x denote a byte already copied to align DST
! Let . and - denote bytes not yet copied
! Let | denote double alignment boundaries
!
! DST: ######xx|........|--------|..###### o2 = 18
! o0
!
! o3 = -3: SRC: ###xx...|.....---|-----..#|######## o5 = 8
! o1
!
! o3 = 0: SRC: ######xx|........|--------|..###### o5 = 16-8 = 8
! o1
!
! o3 = +1: SRC: #######x|x.......|.-------|-..##### o5 = 16-8 = 8
! o1
or %g0, -8, %o5 ! %o5 = -8; kept only when %o3 >= 0
alignaddr %o1, %g0, %o1 ! set GSR.ALIGN and align o1
movrlz %o3, %g0, %o5 ! subtract 8 from o2+o3 only if o3>=0
add %o5, %o2, %o5
add %o5, %o3, %o5 ! %o5 = %o2 + %o3, minus 8 when %o3 >= 0
! NOTE(review): the instruction that sets the condition codes tested by
! the bleu below is not visible in this chunk; confirm what is compared.
bleu %ncc, 4f
andn %o5, 7, %o5 ! 8 byte aligned count
! (delay slot of bleu, not annulled: runs on both paths)
and %o5, BLOCK_SIZE-1, %o5 ! bytes till DST block aligned
4:
! %o3 >= 0: preload %d0 with the double just below the aligned %o1 and go
! to .beginmedloop (defined outside this chunk). The ldd sits in an
! annulled delay slot, so it only executes when the branch is taken.
brgez,a %o3, .beginmedloop
ldd [%o1-8], %d0
! %o3 < 0 from here on: step %o1 backwards by |%o3| bytes.
add %o1, %o3, %o1 ! back up o1
5:
! Byte-at-a-time loop: load one byte through ASI_FL8_P into %d2 and
! advance %o1 until its low three bits are zero (8-byte aligned).
ldda [%o1]ASI_FL8_P, %d2
inc %o1
andcc %o1, 7, %g0 ! Z set once %o1 is 8-byte aligned
! NOTE(review): the delay-slot instruction of this bnz is not visible in
! this chunk (the next line is the .medloop label); confirm in the full file.
bnz %ncc, 5b
.medloop:
! NOTE(review): the loop body that belongs under .medloop and the local
! labels 1: and 2: is not visible in this chunk.
1:
2:
! Tail handling. Two alternatives are selected at preprocessing time:
! the "#if 0" arm (partial stores) is never assembled; the "#else" arm
! is the active code.
#if 0
/* This code will use partial stores. */
1:
2:
not %o3
faligndata %d0, %d0, %d0 ! shift bytes to the left
and %o3, 7, %o3 ! last byte to be stored in [%o0+%o3]
edge8n %g0, %o3, %o5 ! %o5 = byte mask for the partial store
stda %d0, [%o0]%o5, ASI_PST8_P ! store only the bytes selected by %o5
brlez %o2, .mediumexit ! nothing left to copy
add %o0, %o3, %o0 ! update DST to last stored byte
! Copy the remaining %o2 bytes one at a time. %o0 points at the last
! byte stored, so it is incremented before each store.
3:
inc %o0
deccc %o2
ldub [%o1], %o3
stb %o3, [%o0]
bgu %ncc, 3b ! loop while the decremented count is > 0
inc %o1 ! (delay slot) advance SRC
#else
andcc %o3, 7, %o5 ! Number of bytes needed to completely
! fill %d0 with good (unwritten) data.
bz %ncc, 2f ! %d0 is already full: skip the refill
sub %o5, 8, %o3 ! -(number of good bytes in %d0)
! (delay slot of bz, not annulled: runs on both paths)
cmp %o2, 8
bl,a %ncc, 3f ! Not enough bytes to fill %d0
add %o1, %o3, %o1 ! Back up %o1
! (annulled delay slot: runs only when the bl above is taken)
! Refill loop: executes %o5 times, loading one byte through ASI_FL8_P
! into %d2 and advancing %o1 on each pass.
1:
deccc %o5
ldda [%o1]ASI_FL8_P, %d2
inc %o1
! NOTE(review): the delay-slot instruction of this bgu is not visible in
! this chunk; confirm in the full file.
bgu %ncc, 1b
2:
3:
#endif
.large:
! Prologue of the block-copy path labelled .large: issue prefetches, load
! the first BLOCK_SIZE bytes of SRC, realign them with faligndata into
! %f32 onwards, then step the count and SRC pointer past that block.
!
! %d0 I/O already loaded with SRC data from [%o1-8]
! %o2 I/O count (number of bytes that need to be written)
! %o3 I Not written. If zero, then SRC is double aligned.
! %o4 I Not written. Holds fprs.
! %o5 O The number of doubles that remain to be written.
! Load the rest of the current block
! Recall that %o1 is further into SRC than %o0 is into DST
!
! Prefetches on %o0 (DST) use function code 22 and prefetches on %o1 (SRC)
! use function code 21. NOTE(review): these codes are in the
! implementation-dependent range; confirm their meaning for the target CPU.
! The integer and prefetch instructions are interleaved with the
! ldd/faligndata pairs; keep the order when editing.
prefetch [%o0 + (0 * BLOCK_SIZE)], 22
prefetch [%o0 + (1 * BLOCK_SIZE)], 22
prefetch [%o0 + (2 * BLOCK_SIZE)], 22
ldd [%o1], %f2
prefetch [%o1 + (3 * BLOCK_SIZE)], 21
ldd [%o1 + 0x8], %f4
faligndata %f0, %f2, %f32 ! first realigned double of the block
ldd [%o1 + 0x10], %f6
faligndata %f2, %f4, %f34
ldd [%o1 + 0x18], %f8
faligndata %f4, %f6, %f36
ldd [%o1 + 0x20], %f10
or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8
prefetch [%o1 + (4 * BLOCK_SIZE)], 21
faligndata %f6, %f8, %f38
ldd [%o1 + 0x28], %f12
movrlz %o3, %g0, %o5 ! if %o3 < 0, %o5 = 0 (needed later)
faligndata %f8, %f10, %f40
ldd [%o1 + 0x30], %f14
faligndata %f10, %f12, %f42
! NOTE(review): no faligndata consuming %f14 or the reloaded %f0 is
! visible in this chunk; confirm where the last two doubles are formed.
ldd [%o1 + 0x38], %f0 ! last double of this block; after the add
! below it is again the data from [%o1-8]
sub %o2, BLOCK_SIZE, %o2 ! update count
prefetch [%o1 + (5 * BLOCK_SIZE)], 21
add %o1, BLOCK_SIZE, %o1 ! update SRC
! Main loop. Write previous block. Load rest of current block.
! NOTE(review): the loop body under the local labels below is not visible
! in this chunk.
1:
2:
3:
.align 16
.xlarge:
! Prologue of the block-copy path labelled .xlarge. It has the same shape
! as the .large prologue but prefetches SRC further ahead (4 to 7 blocks)
! and issues no DST prefetches here.
!
! %d0 I/O already loaded with SRC data from [%o1-8]
! %o2 I/O count (number of bytes that need to be written)
! %o3 I Not written. If zero, then SRC is double aligned.
! %o4 I Not written. Holds fprs.
! %o5 O The number of doubles that remain to be written.
! Load the rest of the current block
! Recall that %o1 is further into SRC than %o0 is into DST
! prefetch [%o1 + (3 * BLOCK_SIZE)], 21
! executed in delay slot for branch to .xlarge
!
! NOTE(review): prefetch function code 21 is in the
! implementation-dependent range; confirm its meaning for the target CPU.
prefetch [%o1 + (4 * BLOCK_SIZE)], 21
prefetch [%o1 + (5 * BLOCK_SIZE)], 21
ldd [%o1], %f2
prefetch [%o1 + (6 * BLOCK_SIZE)], 21
ldd [%o1 + 0x8], %f4
faligndata %f0, %f2, %f32 ! first realigned double of the block
ldd [%o1 + 0x10], %f6
faligndata %f2, %f4, %f34
ldd [%o1 + 0x18], %f8
faligndata %f4, %f6, %f36
ldd [%o1 + 0x20], %f10
or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8
faligndata %f6, %f8, %f38
ldd [%o1 + 0x28], %f12
movrlz %o3, %g0, %o5 ! if %o3 < 0, %o5 = 0 (needed later)
faligndata %f8, %f10, %f40
ldd [%o1 + 0x30], %f14
faligndata %f10, %f12, %f42
! NOTE(review): no faligndata consuming %f14 or the reloaded %f0 is
! visible in this chunk; confirm where the last two doubles are formed.
ldd [%o1 + 0x38], %f0 ! last double of this block; after the add
! below it is again the data from [%o1-8]
sub %o2, BLOCK_SIZE, %o2 ! update count
prefetch [%o1 + (7 * BLOCK_SIZE)], 21
add %o1, BLOCK_SIZE, %o1 ! update SRC
! This point is 32-byte aligned since 24 instructions appear since
! the previous alignment directive.
! NOTE(review): only 22 instructions are visible between .xlarge and this
! point in this chunk, and the directive above is .align 16; confirm the
! count and the alignment claim against the full file.
! Main loop. Write previous block. Load rest of current block.
! NOTE(review): the loop body under the local labels below is not visible
! in this chunk.
1:
2:
3: