2N/A/*
2N/A * CDDL HEADER START
2N/A *
2N/A * The contents of this file are subject to the terms of the
2N/A * Common Development and Distribution License (the "License").
2N/A * You may not use this file except in compliance with the License.
2N/A *
2N/A * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
2N/A * or http://www.opensolaris.org/os/licensing.
2N/A * See the License for the specific language governing permissions
2N/A * and limitations under the License.
2N/A *
2N/A * When distributing Covered Code, include this CDDL HEADER in each
2N/A * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
2N/A * If applicable, add the following below this CDDL HEADER, with the
2N/A * fields enclosed by brackets "[]" replaced with your own identifying
2N/A * information: Portions Copyright [yyyy] [name of copyright owner]
2N/A *
2N/A * CDDL HEADER END
2N/A */
2N/A
2N/A/*
2N/A * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
2N/A * Use is subject to license terms.
2N/A */
2N/A
2N/A .file "__align_cpy_8.s"
2N/A
2N/A/* __align_cpy_8(s1, s2, n)
2N/A *
2N/A * Copy 8-byte aligned source to 8-byte aligned target in multiples of 8 bytes.
2N/A *
2N/A * Input:
2N/A * o0 address of target
2N/A * o1 address of source
2N/A * o2 number of bytes to copy (must be a multiple of 8)
2N/A * Output:
2N/A * o0 address of target
2N/A * Caller's registers that have been changed by this function:
2N/A * o1-o5
2N/A *
2N/A * Note:
2N/A * This helper routine will not be used by any 32-bit compilations. To do
2N/A * so would break binary compatibility with previous versions of Solaris.
2N/A *
2N/A * Assumptions:
2N/A * Source and target addresses are 8-byte aligned.
2N/A * Bytes to be copied are non-overlapping or _exactly_ overlapping.
2N/A * The number of bytes to be copied is a multiple of 8.
2N/A * Call will _usually_ be made with a byte count of more than 4*8 and
2N/A * less than a few hundred bytes. Legal values are 0 to MAX_SIZE_T.
2N/A *
2N/A * Optimization attempt:
2N/A * Reasonable speed for a generic v9. Going for 32 bytes at a time
2N/A * rather than 16 bytes at a time did not result in a time saving for
2N/A * the number of bytes expected to be copied. No timing runs using other
2N/A * levels of optimization have been tried yet.
2N/A *
2N/A * Even when multiples of 16 bytes were used, the savings by going for 32 bytes
2N/A * at a time were about 2%. Thus, __align_cpy_16 is a second entry point to
2N/A * the same code as __align_cpy_8.
2N/A *
2N/A * Register usage:
2N/A * o1 source address (updated for each read)
2N/A * o2 byte count remaining
2N/A * o3 contents being copied
2N/A * o4 more contents being copied
2N/A * o5 target address
2N/A */
2N/A
2N/A#include <sys/asm_linkage.h>
2N/A
2N/A ENTRY(__align_cpy_8) /* Copy %o2 bytes from [%o1] to [%o0], 8 bytes per ldx/stx. */
2N/A ENTRY(__align_cpy_16) /* Second entry label at the same address; shares all code below. */
2N/A cmp %o0, %o1 ! Target == source: nothing to copy.
2N/A be,pn %xcc, .done
2N/A subcc %o2, 8, %o2 ! Delay slot (always runs): o2 = n - 8, sets xcc for bz/bpos below.
2N/A bz,pn %xcc, .wrdbl2 ! n == 8: single load/store, no loop.
2N/A mov %o0, %o5 ! Delay slot (always runs): o5 = write cursor; o0 stays intact as the return value.
2N/A bpos,a,pt %xcc, .wrdbl1 ! n - 8 > 0 here (zero already handled), i.e. n >= 16.
2N/A ldx [%o1], %o3 ! Delay slot, annulled when not taken: preload first 8 bytes.
2N/A.done:
2N/A retl ! Reached when o0 == o1 or when n - 8 is negative (n == 0).
2N/A nop ! retl delay slot.
2N/A ! NOTE(review): bpos is a signed test; a count >= 2^63+8 falls through to .done uncopied.
2N/A .align 32
2N/A.wrdbl1: ! Loop: o3 already holds [o1]; copies 16 bytes per pass.
2N/A subcc %o2, 16, %o2 ! o2 > 0: >= 16 more; o2 == 0: 8 more; o2 < 0: done after this pass.
2N/A ldx [%o1+8], %o4 ! Second 8 bytes of this pair.
2N/A add %o1, 16, %o1 ! Advance source (add does not touch the condition codes).
2N/A stx %o3, [%o5]
2N/A stx %o4, [%o5+8]
2N/A add %o5, 16, %o5 ! Advance target.
2N/A bg,a,pt %xcc, .wrdbl1 ! Flags still from subcc above: at least 16 more bytes.
2N/A ldx [%o1], %o3 ! Delay slot, annulled when not taken: preload for next pass.
2N/A
2N/A bz,a,pt %xcc, .wrdbl3 ! Same flags again: exactly 8 bytes remain.
2N/A ldx [%o1], %o3 ! Delay slot, annulled when not taken: load the final 8 bytes.
2N/A
2N/A retl ! o2 went negative: the last pass finished the copy.
2N/A nop ! retl delay slot.
2N/A
2N/A.wrdbl2: ! Entered only from the n == 8 test; o5 was set in that branch delay slot.
2N/A ldx [%o1], %o3 ! Load the only/last 8 bytes.
2N/A.wrdbl3: ! o3 = last 8 bytes, o5 = their destination.
2N/A stx %o3, [%o5]
2N/A retl
2N/A nop ! retl delay slot.
2N/A
2N/A SET_SIZE(__align_cpy_8)
2N/A SET_SIZE(__align_cpy_16)