amd64/src/__vsqrtf.S

	__vsqrtf.S revision 25c28e83beb90e7c80452a7c818c5e6f73a07dc8
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

    .file   "__vsqrtf.S"

#include "libm.h"

    /*
     * __vsqrtf: vector single-precision square root.
     *
     * For i = 0 .. n-1 this reads the 4-byte float located i*stridex
     * elements past x, takes its square root, and stores the result
     * i*stridey elements past y.  Strides are signed 32-bit element
     * counts; they are sign-extended and scaled to byte offsets below.
     * Nothing is loaded or stored when n <= 0, and no return register
     * is written.
     *
     * The incoming registers line up with a C prototype of the form
     *   void __vsqrtf(int n, float *x, int stridex, float *y, int stridey)
     * NOTE(review): the prototype is not visible in this file -- confirm
     * against the header that declares it.
     *
     * Structure:
     *   - n < 4: everything is handled by the scalar tail (.finish).
     *   - both strides equal to one element: .loop does four elements
     *     per pass with unaligned packed loads and stores.
     *   - any other stride: .nonunit gathers four elements with scalar
     *     loads, does one packed sqrt, and scatters four scalar stores.
     *   - whatever is left (0 to 3 elements) falls into .finish.
     *
     * Registers overwritten: %edi, %rsi, %rcx, %rdx, %r8, %xmm0-%xmm3
     * and the flags; %r9 and %r10 as well on the unit-stride path.
     * %rbp is saved on entry and restored by leave.
     */
    ENTRY(__vsqrtf)
    /* Set up a frame pointer; no stack locals are allocated. */
    push    %rbp
    movq    %rsp,%rbp

/ on entry:
/   %edi = n
/   %rsi = x
/   %edx = stridex
/   %rcx = y
/   %r8d = stridey

    /* From here on %rdx and %r8 are byte strides (elements * 4). */
    movslq  %edx,%rdx       / sign extend and scale strides
    shlq    $2,%rdx
    movslq  %r8d,%r8
    shlq    $2,%r8

    /* Signed compare: fewer than four elements (including n <= 0)
       skips both 4-wide loops. */
    cmpl    $4,%edi
    jl  .finish

    /* The packed loop is only valid when both byte strides are
       exactly 4, i.e. x and y are each contiguous. */
    cmpq    $4,%rdx
    jne .nonunit
    cmpq    $4,%r8
    jne .nonunit

/ unit-stride case
    /* %r9 and %r10 = byte stride * 4 = 16: the pointer advance for
       one group of four floats. */
    movq    %rdx,%r9
    shlq    $2,%r9
    movq    %r8,%r10
    shlq    $2,%r10

    .align  16
.loop:
    /* movups is the unaligned form, so neither x nor y needs to be
       16-byte aligned. */
    movups  (%rsi),%xmm0
    addq    %r9,%rsi
    sqrtps  %xmm0,%xmm0
    movups  %xmm0,(%rcx)
    addq    %r10,%rcx
    /* Four elements consumed; repeat while at least four remain. */
    subl    $4,%edi
    cmpl    $4,%edi
    jge .loop

    /*
     * Scalar tail, shared by all paths.  %edi holds the number of
     * elements still to do; %rdx and %r8 are still the byte strides.
     */
.finish:
    testl   %edi,%edi
    jle .done

.finish_loop:
    movss   (%rsi),%xmm0
    addq    %rdx,%rsi
    sqrtss  %xmm0,%xmm0
    movss   %xmm0,(%rcx)
    addq    %r8,%rcx
    /* decl sets the flags that jg tests; loop while the count is
       still positive. */
    decl    %edi
    jg  .finish_loop

.done:
    /* leave restores %rsp and %rbp from the frame set up on entry. */
    leave
    ret

    /*
     * General-stride path: one pass handles four elements.  Entered
     * only with %edi >= 4.
     */
    .align  16
.nonunit:
    /* Gather four inputs.  A movss load from memory clears the upper
       three lanes of the destination, which the packing below
       relies on. */
    movss   (%rsi),%xmm0
    addq    %rdx,%rsi
    movss   (%rsi),%xmm1
    addq    %rdx,%rsi
    movss   (%rsi),%xmm2
    addq    %rdx,%rsi
    movss   (%rsi),%xmm3
    addq    %rdx,%rsi

    /* Pack the four scalars into %xmm0.  Lane diagrams list the
       highest lane first.  shufps 0x88 selects lanes 0 and 2 of the
       destination followed by lanes 0 and 2 of the source. */
    movlhps %xmm1,%xmm0     / xmm0:   0  x1   0  x0
    movlhps %xmm3,%xmm2     / xmm2:   0  x3   0  x2
    shufps  $0x88,%xmm2,%xmm0   / xmm0:  x3  x2  x1  x0

    sqrtps  %xmm0,%xmm0     / xmm0:  y3  y2  y1  y0

    /* Move each result into lane 0 of its own register so it can be
       stored with movss.  The upper lanes of %xmm2 and %xmm3 keep
       leftover values; only lane 0 of each register is stored. */
    movaps  %xmm0,%xmm1     / xmm1:  y3  y2  y1  y0
    shufps  $0xf5,%xmm0,%xmm1   / xmm1:  y3  y3  y1  y1
    movhlps %xmm0,%xmm2     / xmm2:   0  x3  y3  y2
    movhlps %xmm1,%xmm3     / xmm3:   0   0  y3  y3

    /* Scatter y0..y3, stepping y by its byte stride after each. */
    movss   %xmm0,(%rcx)
    addq    %r8,%rcx
    movss   %xmm1,(%rcx)
    addq    %r8,%rcx
    movss   %xmm2,(%rcx)
    addq    %r8,%rcx
    movss   %xmm3,(%rcx)
    addq    %r8,%rcx

    /* Four elements consumed; repeat while at least four remain. */
    subl    $4,%edi
    cmpl    $4,%edi
    jge .nonunit

    /* Zero to three elements left: finish them in the scalar tail. */
    jmp .finish

    SET_SIZE(__vsqrtf)