/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 2002 Advanced Micro Devices, Inc.
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the
* following conditions are met:
*
* + Redistributions of source code must retain the above
* copyright notice, this list of conditions and the
* following disclaimer.
*
* + Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the
* following disclaimer in the documentation and/or other
* materials provided with the distribution.
*
* + Neither the name of Advanced Micro Devices, Inc. nor the
* names of its contributors may be used to endorse or
* promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL ADVANCED MICRO DEVICES,
* INC. OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* It is licensee's responsibility to comply with any export
* regulations applicable in licensee's jurisdiction.
*/
.ident "%Z%%M% %I% %E% SMI"
.file "%M%"
#include <sys/asm_linkage.h>
ANSI_PRAGMA_WEAK(memmove,function)
ANSI_PRAGMA_WEAK(memcpy,function)
#include "SYS.h"
#include "cache.h"
ANSI_PRAGMA_WEAK2(_private_memcpy,memcpy,function)
#define LABEL(s) .memcpy/**/s
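/*
 * memmove: choose between the forward (.CopyRight) and backward (.CopyLeft)
 * copy.  If the destination starts at or below the source, or past the last
 * source byte (%r9 = src + n - 1), the regions can safely be copied front to
 * back; only when the destination lands inside the source region is the
 * backward copy needed.
 */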
ENTRY(memmove) /* (void *s1, void *s2, size_t n) */
cmpq %rsi,%rdi / if (source addr > dest addr)
leaq -1(%rsi,%rdx),%r9
jle .CopyRight /
cmpq %r9,%rdi
jle .CopyLeft
jmp .CopyRight
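/*
 * memcpy / forward copy (.CopyRight).  The copy is tiered by length: fewer
 * than 16 bytes are handled with single 1/2/4/8-byte moves, 16-31 bytes with
 * an 8-byte loop, and small-to-medium copies with a 32-byte loop.  Larger
 * copies align the destination to 8 bytes and then use rep movsq up to the
 * .amd64cache1half limit, a prefetching 64-byte loop up to the
 * .amd64cache2half limit (these appear to hold half the L1 and L2 cache
 * sizes; see "cache.h"), and non-temporal movnti stores beyond that.
 */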
ENTRY(memcpy) /* (void *, const void*, size_t) */
.CopyRight:
LABEL(1try):
cmp $16, %rdx
mov %rdi, %rax
jae LABEL(1after)
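/*
 * Fewer than 16 bytes remain: copy the 1-, 2-, 4- and 8-byte pieces
 * selected by the low bits of the remaining count.
 */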
.p2align 4
LABEL(1): /* 1-byte */
test $1, %dl
jz LABEL(1a)
mov (%rsi), %cl
mov %cl, (%rdi)
dec %dl
lea 1 (%rsi), %rsi
lea 1 (%rdi), %rdi
jz LABEL(exit)
.p2align 4,, 4
LABEL(1a):
test $2, %dl
jz LABEL(1b)
mov (%rsi), %cx
mov %cx, (%rdi)
sub $2, %dl
lea 2 (%rsi), %rsi
lea 2 (%rdi), %rdi
jz LABEL(exit)
.p2align 4,, 4
LABEL(1b):
test $4, %dl
jz LABEL(1c)
mov (%rsi), %ecx
mov %ecx, (%rdi)
/* sub $4, %dl */
lea 4 (%rsi), %rsi
lea 4 (%rdi), %rdi
/* jz LABEL(exit) */
.p2align 4,, 4
LABEL(1c):
test $8, %dl
jz LABEL(1d)
mov (%rsi), %rcx
mov %rcx, (%rdi)
/* sub $8, %dl */
/* lea 8 (%rsi), %rsi */
/* lea 8 (%rdi), %rdi */
/* jz LABEL(exit) */
.p2align 4
LABEL(1d):
LABEL(exit):
rep
ret
.p2align 4
LABEL(1after):
push %rax
LABEL(8try):
cmp $32, %rdx
jae LABEL(8after)
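/*
 * 16 to 31 bytes (or the tail left over from the 32-byte loop): copy
 * quadwords, then finish any remaining 1-7 bytes through the 1-byte path.
 */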
LABEL(8): /* 8-byte */
mov %edx, %ecx
shr $3, %ecx
jz LABEL(8skip)
.p2align 4
LABEL(8loop):
dec %ecx
mov (%rsi), %rax
mov %rax, (%rdi)
lea 8 (%rsi), %rsi
lea 8 (%rdi), %rdi
jnz LABEL(8loop)
LABEL(8skip):
and $7, %edx
pop %rax
jnz LABEL(1)
rep
ret
.p2align 4
LABEL(8after):
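/*
 * Pick the cutoff for the simple 32-byte loop: 512 bytes for unaligned data,
 * 4096 bytes when either the source or the destination is already 8-byte
 * aligned.  Longer copies take the cache-aware path below.
 */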
LABEL(32try):
mov $512, %r8d /* size for unaligned data */
mov $4096, %r9d /* size for aligned data */
test $7, %esi /* check if either source.. */
cmovz %r9, %r8
test $7, %edi /* .. or destination is aligned */
cmovz %r9, %r8
cmp %r8, %rdx
ja LABEL(32after)
LABEL(32): /* 32-byte */
mov %edx, %ecx
shr $5, %ecx
jz LABEL(32skip)
.p2align 4
LABEL(32loop):
dec %ecx
mov (%rsi), %rax
mov 8 (%rsi), %r8
mov 16 (%rsi), %r9
mov 24 (%rsi), %r10
mov %rax, (%rdi)
mov %r8, 8 (%rdi)
mov %r9, 16 (%rdi)
mov %r10, 24 (%rdi)
lea 32 (%rsi), %rsi
lea 32 (%rdi), %rdi
jz LABEL(32skip)
dec %ecx
mov (%rsi), %rax
mov 8 (%rsi), %r8
mov 16 (%rsi), %r9
mov 24 (%rsi), %r10
mov %rax, (%rdi)
mov %r8, 8 (%rdi)
mov %r9, 16 (%rdi)
mov %r10, 24 (%rdi)
lea 32 (%rsi), %rsi
lea 32 (%rdi), %rdi
jnz LABEL(32loop)
.p2align 4
LABEL(32skip):
and $31, %edx
jnz LABEL(8)
pop %rax
ret
.p2align 4
LABEL(32after):
/* 3DNow: use prefetch */
prefetchnta _sref_(.amd64cache1) /* improves test further ahead on B0 */
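/*
 * Align the destination to an 8-byte boundary with a short byte loop;
 * the count is reduced by the number of alignment bytes copied.
 */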
LABEL(aligntry):
mov %edi, %r8d /* align by destination */
and $7, %r8d
jz LABEL(alignafter) /* not unaligned */
LABEL(align): /* align */
lea -8 (%r8, %rdx), %rdx
sub $8, %r8d
.p2align 4
LABEL(alignloop):
inc %r8d
mov (%rsi), %al
mov %al, (%rdi)
lea 1 (%rsi), %rsi
lea 1 (%rdi), %rdi
jnz LABEL(alignloop)
.p2align 4
LABEL(alignafter):
mov _sref_(.amd64cache1half), %r11
cmp %rdx, %r11
cmova %rdx, %r11
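/*
 * Fast path: copy up to the .amd64cache1half limit with rep movsq.
 * If 8 or more bytes remain afterwards, continue with the prefetching
 * path; otherwise finish through the 1-byte path.
 */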
LABEL(fast):
mov %r11, %rcx
and $-8, %r11
shr $3, %rcx
/* jz LABEL(fastskip) */
rep /* good ol' MOVS */
movsq
LABEL(fastskip):
sub %r11, %rdx
test $-8, %rdx
jnz LABEL(fastafterlater)
and $7, %edx
pop %rax
jnz LABEL(1)
rep
ret
.p2align 4
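/*
 * Plain 64-byte unrolled loop.  Nothing appears to branch to LABEL(64try)
 * or LABEL(64) in this version -- the rep movsq fast path above seems to
 * have replaced it -- but the code and its LABEL(32)/LABEL(64after) exits
 * are retained.
 */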
LABEL(64try):
mov _sref_(.amd64cache1half), %r11
cmp %rdx, %r11
cmova %rdx, %r11
LABEL(64): /* 64-byte */
mov %r11, %rcx
and $-64, %r11
shr $6, %rcx
jz LABEL(64skip)
.p2align 4
LABEL(64loop):
dec %ecx
mov (%rsi), %rax
mov 8 (%rsi), %r8
mov 16 (%rsi), %r9
mov 24 (%rsi), %r10
mov %rax, (%rdi)
mov %r8, 8 (%rdi)
mov %r9, 16 (%rdi)
mov %r10, 24 (%rdi)
mov 32 (%rsi), %rax
mov 40 (%rsi), %r8
mov 48 (%rsi), %r9
mov 56 (%rsi), %r10
mov %rax, 32 (%rdi)
mov %r8, 40 (%rdi)
mov %r9, 48 (%rdi)
mov %r10, 56 (%rdi)
lea 64 (%rsi), %rsi
lea 64 (%rdi), %rdi
jz LABEL(64skip)
dec %ecx
mov (%rsi), %rax
mov 8 (%rsi), %r8
mov 16 (%rsi), %r9
mov 24 (%rsi), %r10
mov %rax, (%rdi)
mov %r8, 8 (%rdi)
mov %r9, 16 (%rdi)
mov %r10, 24 (%rdi)
mov 32 (%rsi), %rax
mov 40 (%rsi), %r8
mov 48 (%rsi), %r9
mov 56 (%rsi), %r10
mov %rax, 32 (%rdi)
mov %r8, 40 (%rdi)
mov %r9, 48 (%rdi)
mov %r10, 56 (%rdi)
lea 64 (%rsi), %rsi
lea 64 (%rdi), %rdi
jnz LABEL(64loop)
.p2align 4
LABEL(64skip):
sub %r11, %rdx
test $-64, %rdx
jnz LABEL(64after)
and $63, %edx
jnz LABEL(32)
pop %rax
ret
.p2align 4
LABEL(64after):
LABEL(fastafterlater):
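/*
 * Copies that did not finish in the rep movsq pass: up to the
 * .amd64cache2half limit, use a 64-byte unrolled loop that issues
 * prefetchnta roughly 896 bytes ahead of both the source and the
 * destination streams.
 */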
LABEL(pretry):
mov _sref_(.amd64cache2half), %r8
cmp %rdx, %r8
cmova %rdx, %r8
LABEL(pre): /* 64-byte prefetching */
mov %r8, %rcx
and $-64, %r8
shr $6, %rcx
jz LABEL(preskip)
push %r14
push %r13
push %r12
push %rbx
.p2align 4
LABEL(preloop):
dec %rcx
mov (%rsi), %rax
mov 8 (%rsi), %rbx
mov 16 (%rsi), %r9
mov 24 (%rsi), %r10
mov 32 (%rsi), %r11
mov 40 (%rsi), %r12
mov 48 (%rsi), %r13
mov 56 (%rsi), %r14
prefetchnta 0 + 896 (%rsi) /* 3DNow: use prefetch */
prefetchnta 64 + 896 (%rsi) /* 3DNow: use prefetch */
mov %rax, (%rdi)
mov %rbx, 8 (%rdi)
mov %r9, 16 (%rdi)
mov %r10, 24 (%rdi)
mov %r11, 32 (%rdi)
mov %r12, 40 (%rdi)
mov %r13, 48 (%rdi)
mov %r14, 56 (%rdi)
lea 64 (%rsi), %rsi
lea 64 (%rdi), %rdi
jz LABEL(preskipa)
dec %rcx
mov (%rsi), %rax
mov 8 (%rsi), %rbx
mov 16 (%rsi), %r9
mov 24 (%rsi), %r10
mov 32 (%rsi), %r11
mov 40 (%rsi), %r12
mov 48 (%rsi), %r13
mov 56 (%rsi), %r14
mov %rax, (%rdi)
mov %rbx, 8 (%rdi)
mov %r9, 16 (%rdi)
mov %r10, 24 (%rdi)
mov %r11, 32 (%rdi)
mov %r12, 40 (%rdi)
mov %r13, 48 (%rdi)
mov %r14, 56 (%rdi)
prefetchnta -64 + 896 (%rdi) /* 3DNow: use prefetchw */
prefetchnta 0 + 896 (%rdi) /* 3DNow: use prefetchw */
lea 64 (%rsi), %rsi
lea 64 (%rdi), %rdi
jnz LABEL(preloop)
LABEL(preskipa):
pop %rbx
pop %r12
pop %r13
pop %r14
LABEL(preskip):
sub %r8, %rdx
test $-64, %rdx
jnz LABEL(preafter)
and $63, %edx
jnz LABEL(32)
pop %rax
ret
.p2align 4
LABEL(preafter):
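/*
 * Very large copies: move 128 bytes per iteration with non-temporal movnti
 * stores so the destination bypasses the caches, prefetch the source ahead
 * of the loop, and fence (mfence) before returning.  The final 0-127 bytes
 * go back through the 32-byte path.
 */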
LABEL(NTtry):
LABEL(NT): /* NT 64-byte */
mov %rdx, %rcx
shr $7, %rcx
jz LABEL(NTskip)
push %r14
push %r13
push %r12
.p2align 4
LABEL(NTloop):
prefetchnta 768 (%rsi) /* prefetching NT here is not so good on B0 and C0 MP systems */
prefetchnta 832 (%rsi)
dec %rcx
mov (%rsi), %rax
mov 8 (%rsi), %r8
mov 16 (%rsi), %r9
mov 24 (%rsi), %r10
mov 32 (%rsi), %r11
mov 40 (%rsi), %r12
mov 48 (%rsi), %r13
mov 56 (%rsi), %r14
movnti %rax, (%rdi)
movnti %r8, 8 (%rdi)
movnti %r9, 16 (%rdi)
movnti %r10, 24 (%rdi)
movnti %r11, 32 (%rdi)
movnti %r12, 40 (%rdi)
movnti %r13, 48 (%rdi)
movnti %r14, 56 (%rdi)
mov 64 (%rsi), %rax
mov 72 (%rsi), %r8
mov 80 (%rsi), %r9
mov 88 (%rsi), %r10
mov 96 (%rsi), %r11
mov 104 (%rsi), %r12
mov 112 (%rsi), %r13
mov 120 (%rsi), %r14
movnti %rax, 64 (%rdi)
movnti %r8, 72 (%rdi)
movnti %r9, 80 (%rdi)
movnti %r10, 88 (%rdi)
movnti %r11, 96 (%rdi)
movnti %r12, 104 (%rdi)
movnti %r13, 112 (%rdi)
movnti %r14, 120 (%rdi)
lea 128 (%rsi), %rsi
lea 128 (%rdi), %rdi
jnz LABEL(NTloop)
mfence
pop %r12
pop %r13
pop %r14
LABEL(NTskip):
and $127, %edx
jnz LABEL(32)
pop %rax
ret
SET_SIZE(memcpy) /* (void *, const void*, size_t) */
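/*
 * .CopyLeft: backward (right-to-left) copy used by memmove when the
 * destination starts inside the source region.  Small moves (24 bytes or
 * fewer) are done as a single reverse byte copy; larger ones copy bytes
 * until the source is 8-byte aligned, then quadwords (rep; smovq), then
 * the remaining bytes, clearing the direction flag before returning the
 * destination address.
 */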
.CopyLeft:
movq %rdi,%rax / set up return value
movq $7,%r8 / heavily used constant
movq %rdx,%rcx / put len into %rcx for rep
std / reverse direction bit (RtoL)
cmpq $24,%rcx / if (size <= 24)
ja .BigCopyLeft / {
movq %r9,%rsi / src = src + size - 1
leaq -1(%rcx,%rdi),%rdi / dst = dst + size - 1
rep; smovb / do the byte copy
cld / reset direction flag to LtoR
ret / return(dba);
.BigCopyLeft: / } else {
xchgq %r9,%rcx
movq %rcx,%rsi / align source w/byte copy
leaq -1(%r9,%rdi),%rdi
andq %r8,%rcx
jz .SkipAlignLeft
addq $1, %rcx / we need to ensure that future
subq %rcx,%r9 / copy is done on aligned boundary
rep; smovb
.SkipAlignLeft:
movq %r9,%rcx
subq %r8,%rsi
shrq $3,%rcx / do 8 byte copy RtoL
subq %r8,%rdi
rep; smovq
andq %r8,%r9 / do 1 byte copy whats left
jz .CleanupReturnLeft
movq %r9,%rcx
addq %r8,%rsi / rep; smovq instruction will decrement
addq %r8,%rdi / %rdi, %rsi by eight after each copy
/ adding 7 will restore pointers to byte
/ before last quad word copied
/ which is where they are expected to
/ be for the single byte copy code
rep; smovb
.CleanupReturnLeft:
cld / reset direction flag to LtoR
ret / return(dba);
SET_SIZE(memmove)