/* __vsqrtf.S revision 25c28e83beb90e7c80452a7c818c5e6f73a07dc8 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
.file "__vsqrtf.S"
#include "libm.h"
/*
 * __vsqrtf: single-precision vector square root (amd64, AT&T syntax).
 *
 * For i = 0 .. n-1, stores sqrt(x[i * stridex]) to y[i * stridey].
 * Strides are counted in 4-byte elements; they are sign-extended
 * before use, so zero and negative strides are accepted.  Nothing is
 * read or written when n <= 0.
 *
 * Registers on entry:
 *	%edi = n
 *	%rsi = x
 *	%edx = stridex
 *	%rcx = y
 *	%r8d = stridey
 * NOTE(review): the C prototype is not visible in this file; presumably
 *	void __vsqrtf(int n, float *x, int stridex, float *y, int stridey)
 * -- confirm against the header.
 *
 * Registers modified: %edi, %rsi, %rdx, %rcx, %r8, %xmm0-%xmm3, flags.
 * %rbp is saved and restored; no other stack storage is used.
 *
 * Structure:
 *   .loop         n >= 4 and both strides equal to 1: four elements per
 *                 pass with unaligned packed loads and stores, so no
 *                 particular alignment of x or y is required.
 *   .nonunit      n >= 4 with any other stride: four scalar loads are
 *                 gathered into one vector, one packed sqrt is done, and
 *                 the results are scattered with four scalar stores.
 *                 All four inputs of a group are read before any of its
 *                 outputs is written.
 *   .finish_loop  the remaining 0-3 elements (or everything when n < 4),
 *                 one scalar sqrt per element.
 */
	ENTRY(__vsqrtf)
	push	%rbp
	movq	%rsp,%rbp

	movslq	%edx,%rdx		/ rdx = stridex in bytes (sign extended)
	shlq	$2,%rdx
	movslq	%r8d,%r8		/ r8 = stridey in bytes (sign extended)
	shlq	$2,%r8

	cmpl	$4,%edi			/ fewer than four elements:
	jl	.finish			/   scalar loop only
	cmpq	$4,%rdx			/ packed path needs contiguous x ...
	jne	.nonunit
	cmpq	$4,%r8			/ ... and contiguous y
	jne	.nonunit

/ unit-stride case: both byte strides are known to be 4 here, so each
/ pass advances x and y by a constant 16 bytes (four floats)
	.align	16
.loop:
	movups	(%rsi),%xmm0		/ xmm0: x3 x2 x1 x0
	addq	$16,%rsi
	sqrtps	%xmm0,%xmm0		/ xmm0: y3 y2 y1 y0
	movups	%xmm0,(%rcx)
	addq	$16,%rcx
	subl	$4,%edi
	cmpl	$4,%edi			/ at least one more full group of four?
	jge	.loop

/ scalar tail: edi = number of elements still to be processed
.finish:
	testl	%edi,%edi
	jle	.done
.finish_loop:
	movss	(%rsi),%xmm0
	addq	%rdx,%rsi
	sqrtss	%xmm0,%xmm0
	movss	%xmm0,(%rcx)
	addq	%r8,%rcx
	decl	%edi
	jg	.finish_loop		/ loop while the count is still positive
.done:
	leave
	ret

/ general-stride case: gather four inputs, one packed sqrt, scatter
	.align	16
.nonunit:
	movss	(%rsi),%xmm0		/ scalar loads clear the upper lanes
	addq	%rdx,%rsi
	movss	(%rsi),%xmm1
	addq	%rdx,%rsi
	movss	(%rsi),%xmm2
	addq	%rdx,%rsi
	movss	(%rsi),%xmm3
	addq	%rdx,%rsi
	movlhps	%xmm1,%xmm0		/ xmm0:  0 x1  0 x0
	movlhps	%xmm3,%xmm2		/ xmm2:  0 x3  0 x2
	shufps	$0x88,%xmm2,%xmm0	/ xmm0: x3 x2 x1 x0
	sqrtps	%xmm0,%xmm0		/ xmm0: y3 y2 y1 y0
	movaps	%xmm0,%xmm1		/ xmm1: y3 y2 y1 y0
	shufps	$0xf5,%xmm0,%xmm1	/ xmm1: y3 y3 y1 y1
	movhlps	%xmm0,%xmm2		/ xmm2:  0 x3 y3 y2
	movhlps	%xmm1,%xmm3		/ xmm3:  0  0 y3 y3
	movss	%xmm0,(%rcx)		/ store y0
	addq	%r8,%rcx
	movss	%xmm1,(%rcx)		/ store y1
	addq	%r8,%rcx
	movss	%xmm2,(%rcx)		/ store y2
	addq	%r8,%rcx
	movss	%xmm3,(%rcx)		/ store y3
	addq	%r8,%rcx
	subl	$4,%edi
	cmpl	$4,%edi			/ at least one more full group of four?
	jge	.nonunit
	jmp	.finish			/ hand the last 0-3 elements to the tail
	SET_SIZE(__vsqrtf)