/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
.ident "%Z%%M% %I% %E% SMI"
.file "%M%"
#include <sys/asm_linkage.h>
ANSI_PRAGMA_WEAK(memmove,function)
ANSI_PRAGMA_WEAK(memcpy,function)
#include "SYS.h"
ENTRY(memmove)
movl 0+12(%esp),%ecx / get number of bytes to move
pushl %esi / save off %esi and %edi, then load
pushl %edi / the move destination and source
movl 8+ 4(%esp),%edi / destination buffer address
movl 8+ 8(%esp),%esi / source buffer address
movl %edi, %eax / return value is the dst address
testl %ecx,%ecx
jz .Return
cmpl %esi,%edi / if (source addr > dest addr)
leal -1(%esi,%ecx),%edx / %edx = src + size - 1
jle .memcpy_post / jump if dst <= src
cmpl %edx,%edi
jle .CopyLeft / jump if dst <= src + size - 1
jmp .memcpy_post
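/
/ memcpy(dst, src, size): forward copy, no overlap handling; returns dst
/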
ENTRY(memcpy)
pushl %esi
pushl %edi
movl 8+4(%esp),%edi / %edi = dest address
movl %edi, %eax / save dst; it is the return value
movl 8+8(%esp),%esi / %esi = source address
movl 8+12(%esp),%ecx / %ecx = number of bytes to copy
/ %edx scratch register
/ %eax scratch register
.memcpy_post:
nop / this really helps, don't know why
/ note: cld is perf death on P4
cmpl $63,%ecx
ja .move_sse / not worth doing sse for less
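/
/ fewer than 64 bytes (or the tail of an SSE copy): move words with
/ rep; smovl, then up to three trailing bytes by hand
/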
.movew:
movl %ecx,%edx / save byte cnt
shrl $2,%ecx / %ecx = number of words to move
rep ; smovl / move the words
andl $0x3,%edx / %edx = number of bytes left to move
jz .Return / %edx <= 3, so just unroll the loop
movb (%esi), %cl
movb %cl, (%edi)
decl %edx
jz .Return
movb 1(%esi), %cl
movb %cl, 1(%edi)
decl %edx
jz .Return
movb 2(%esi), %cl
movb %cl, 2(%edi)
.Return:
popl %edi / restore register variables
popl %esi
ret
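/
/ 64 bytes or more: copy 64 byte blocks with SSE
/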
.move_sse:
/
/ time to 16 byte align destination
/
andl $15, %eax
jnz .sse_unaligned / jmp if dest is unaligned
.sse: / dest is aligned, check source
movl %ecx, %edx / get byte count
shrl $6, %edx / number of 64 byte blocks to move
testl $15, %esi
jnz .sse_da / go to slow loop if source is unaligned
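/ copies of 64K bytes or more go through the non-temporal loop so the
/ destination does not displace useful data from the caches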
cmpl $65535, %ecx
ja .sse_sa_nt_loop
/
/ use aligned load since we're lucky
/
.sse_sa_loop:
prefetcht0 568(%esi) / prefetch source, copy 64 bytes at a time
prefetcht0 568(%edi) / prefetch dest
movaps 0(%esi), %xmm0
movaps %xmm0, 0(%edi)
movaps 16(%esi), %xmm1
movaps %xmm1, 16(%edi)
movaps 32(%esi), %xmm2
movaps %xmm2, 32(%edi)
movaps 48(%esi), %xmm3
movaps %xmm3, 48(%edi)
addl $64, %esi
addl $64, %edi
decl %edx
jnz .sse_sa_loop
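/
/ all 64 byte blocks are done; finish the remaining 0-63 bytes with the
/ small-copy code
/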
.sse_cleanup:
andl $63, %ecx / compute remaining bytes
movl 8+4(%esp), %eax / setup return value (movl leaves flags alone)
jz .Return / zero flag is still from the andl above
jmp .movew
/
/ use aligned loads since we're lucky; non-temporal stores keep this
/ large copy from polluting the caches
/
.align 16
.sse_sa_nt_loop:
prefetchnta 16384(%esi) / prefetch source & copy 64 bytes at a time
movaps (%esi), %xmm0
movntps %xmm0, 0(%edi)
movaps 16(%esi), %xmm1
movntps %xmm1, 16(%edi)
movaps 32(%esi), %xmm2
movntps %xmm2, 32(%edi)
movaps 48(%esi), %xmm3
movntps %xmm3, 48(%edi)
addl $64, %esi
addl $64, %edi
decl %edx
jnz .sse_sa_nt_loop
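/ non-temporal stores are weakly ordered, so fence before returning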
#if defined(_SSE2_INSN)
mfence
#elif defined(_SSE_INSN)
sfence
#else
#error "Must have either SSE or SSE2"
#endif
jmp .sse_cleanup
/
/ Make certain that destination buffer becomes aligned
/
.sse_unaligned:
neg %eax / %eax = 16 - (dst & 15), the number of bytes
andl $15, %eax / needed to reach a 16 byte boundary
movl %ecx, %edx / saved count
subl %eax, %ecx / subtract from byte count
cmpl $64, %ecx / after aligning, will we still have 64 bytes?
cmovb %edx, %ecx / if not, restore original byte count,
cmovb 8+4(%esp), %eax / and restore return value,
jb .movew / and do a non-SSE move.
xchg %ecx, %eax / flip for copy
rep ; smovb / move the bytes
xchg %ecx, %eax / flip back
jmp .sse
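/
/ destination is 16 byte aligned but the source is not: use unaligned
/ loads, with non-temporal stores for copies of 64K bytes or more
/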
.align 16
.sse_da:
cmpl $65535, %ecx
jbe .sse_da_loop
/
/ use unaligned load since source doesn't line up
/
.sse_da_nt_loop:
prefetchnta 16384(%esi) / prefetch source & copy 64 bytes at a time
movups 0(%esi), %xmm0
movntps %xmm0, 0(%edi)
movups 16(%esi), %xmm1
movntps %xmm1, 16(%edi)
movups 32(%esi), %xmm2
movntps %xmm2, 32(%edi)
movups 48(%esi), %xmm3
movntps %xmm3, 48(%edi)
addl $64, %esi
addl $64, %edi
decl %edx
jnz .sse_da_nt_loop
#if defined(_SSE2_INSN)
mfence
#elif defined(_SSE_INSN)
sfence
#else
#error "Must have either SSE or SSE2"
#endif
jmp .sse_cleanup
/
/ use unaligned load since source doesn't line up
/
.align 16
.sse_da_loop:
prefetcht0 568(%esi) / prefetch source, copy 64 bytes at a time
prefetcht0 568(%edi) / prefetch dest
movups 0(%esi), %xmm0
movaps %xmm0, 0(%edi)
movups 16(%esi), %xmm1
movaps %xmm1, 16(%edi)
movups 32(%esi), %xmm2
movaps %xmm2, 32(%edi)
movups 48(%esi), %xmm3
movaps %xmm3, 48(%edi)
addl $64, %esi
addl $64, %edi
decl %edx
jnz .sse_da_loop
jmp .sse_cleanup
SET_SIZE(memcpy)
/ .CopyLeft handles the memmove case where we must perform the copy backwards,
/ because of overlap between src and dst. This is not particularly optimized.
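/ In C terms, roughly:
/     s = src + size - 1;  d = dst + size - 1;
/     while (size--)
/             *d-- = *s--;
/ with the bulk of larger copies done a word at a time once the source
/ has been aligned.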
.CopyLeft:
movl $3,%eax / heavily used constant
std / reverse direction bit (RtoL)
cmpl $12,%ecx / if (size <= 12)
ja .BigCopyLeft / {
movl %edx,%esi / src = src + size - 1
leal -1(%ecx,%edi),%edi / dst = dst + size - 1
rep; smovb / do the byte copy
cld / reset direction flag to LtoR
popl %edi / }
popl %esi / restore registers
movl 4(%esp),%eax / set up return value
ret / return(dba);
.BigCopyLeft: / } else {
xchgl %edx,%ecx
movl %ecx,%esi / align source w/byte copy
leal -1(%edx,%edi),%edi
andl %eax,%ecx
jz .SkipAlignLeft
addl $1, %ecx / we need to ensure that the following
subl %ecx,%edx / copy is done on an aligned boundary
rep; smovb
.SkipAlignLeft:
movl %edx,%ecx
subl %eax,%esi
shrl $2,%ecx / do 4 byte copy RtoL
subl %eax,%edi
rep; smovl
andl %eax,%edx / do 1 byte copy of what's left
jz .CleanupReturnLeft
movl %edx,%ecx
addl %eax,%esi / the rep; smovl above decremented %esi and
addl %eax,%edi / %edi by four after each copy; adding 3
/ moves the pointers back to the byte just
/ before the last double word copied, which
/ is where the single byte copy code
/ expects them
rep; smovb
.CleanupReturnLeft:
cld / reset direction flag to LtoR
popl %edi
popl %esi / restore registers
movl 4(%esp),%eax / set up return value
ret / return(dba);
SET_SIZE(memmove)