__vcos.S revision 25c28e83beb90e7c80452a7c818c5e6f73a07dc8
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
.file "__vcos.S"
#include "libm.h"
! Read-only table of double-precision constants. Each entry is an
! 8-byte slot given as two 32-bit words, high-order word first, and is
! addressed by the byte-offset macro named in its trailing comment
! (p4 ... thresh, defined right after the table). The code indexes the
! table by those fixed offsets, so entries must not be reordered or
! resized.
RO_DATA
.align 64
constants:
.word 0x3ec718e3,0xa6972785 ! 0x00 p4, approx. 1/362880
.word 0x3ef9fd39,0x94293940 ! 0x08 q4, approx. 1/40320
.word 0xbf2a019f,0x75ee4be1 ! 0x10 p3, approx. -1/5040
.word 0xbf56c16b,0xba552569 ! 0x18 q3, approx. -1/720
.word 0x3f811111,0x1108c703 ! 0x20 p2, approx. 1/120
.word 0x3fa55555,0x554f5b35 ! 0x28 q2, approx. 1/24
.word 0xbfc55555,0x555554d0 ! 0x30 p1, approx. -1/6
.word 0xbfdfffff,0xffffff85 ! 0x38 q1, approx. -1/2
.word 0x3ff00000,0x00000000 ! 0x40 one, exactly 1.0
.word 0xbfc55555,0x5551fc28 ! 0x48 pp1, approx. -1/6
.word 0x3f811107,0x62eacc9d ! 0x50 pp2, approx. 1/120
.word 0xbfdfffff,0xffff6328 ! 0x58 qq1, approx. -1/2
.word 0x3fa55551,0x5f7acf0c ! 0x60 qq2, approx. 1/24
.word 0x3fe45f30,0x6dc9c883 ! 0x68 invpio2, 2/pi
.word 0x43380000,0x00000000 ! 0x70 round, 1.5 * 2**52
.word 0x3ff921fb,0x54400000 ! 0x78 pio2_1, leading bits of pi/2
.word 0x3dd0b461,0x1a600000 ! 0x80 pio2_2, next piece of pi/2
.word 0x3ba3198a,0x2e000000 ! 0x88 pio2_3, third piece of pi/2
.word 0x397b839a,0x252049c1 ! 0x90 pio2_3t, tail of pi/2
.word 0x80000000,0x00004000 ! 0x98 f30val: MSK_BIT31 (%f30) and MSK_BIT13 (%f31)
! N.B.: the low-order words of the last three entries are used for
! sign bit hacking; see references to "thresh" below (the code reads
! single words at thresh-4, thresh+4 and thresh+12)
.word 0xffff8000,0x00000000 ! 0xa0 mask, loaded into MSK_BITSHI17
.word 0x3fc90000,0x80000000 ! 0xa8 thresh
.word 0x3fc40000,0x00000000 ! 0xb0 thresh+8
! Byte offsets of the 8-byte entries of the "constants" table, listed
! in table order. They are used as displacements from the table base
! address (held in %g1 on entry and in %l5 in the medium range code).
! Do not append comments to the #define lines themselves: the text
! would become part of the macro expansion.
!
! polynomial coefficients; q1-q4 are preloaded into C_q1-C_q4 on entry
#define p4 0x0
#define q4 0x08
#define p3 0x10
#define q3 0x18
#define p2 0x20
#define q2 0x28
#define p1 0x30
#define q1 0x38
! 1.0, preloaded into C_ONE
#define one 0x40
! coefficients preloaded into C_pp1, C_pp2, C_qq1 and C_qq2
#define pp1 0x48
#define pp2 0x50
#define qq1 0x58
#define qq2 0x60
! argument reduction constants, loaded when the medium range code is
! entered (into %f40, %f42, %f46, %f48, %f50 and %f52 respectively)
#define invpio2 0x68
#define round 0x70
#define pio2_1 0x78
#define pio2_2 0x80
#define pio2_3 0x88
#define pio2_3t 0x90
! bit masks (MSK_BIT31/MSK_BIT13 and MSK_BITSHI17) and threshold words
#define f30val 0x98
#define mask 0xa0
#define thresh 0xa8
! local storage indices
! Offsets from %fp of the temporaries kept in the tmps bytes of scratch
! space that the save instruction at the function entry adds to the
! frame.
!
! copies of the incoming arguments, stored right after entry:
! x pointer, y pointer, n, stridex, stridey
#define xsave STACK_BIAS-0x8
#define ysave STACK_BIAS-0x10
#define nsave STACK_BIAS-0x14
#define sxsave STACK_BIAS-0x18
#define sysave STACK_BIAS-0x1c
! NOTE(review): biguns is not referenced as memory in the part of the
! file reviewed here (the flag lives in LIM_l6 there) - confirm its use
! in the remainder of the file
#define biguns STACK_BIAS-0x20
! 32-bit words written by the medium range loop from the low halves of
! x*invpio2+round for pipes 2, 1 and 0, then reloaded as integers
#define n2 STACK_BIAS-0x24
#define n1 STACK_BIAS-0x28
#define n0 STACK_BIAS-0x2c
! 16-byte slots, one per pipe. In the medium range code the first
! 8 bytes receive the absolute value of the reduced argument and the
! second 8 bytes are preset to 1.0. The primary loop also points
! %o3/%o4/%o5 at x0_1 as a harmless target for its first stores.
#define x2_1 STACK_BIAS-0x40
#define x1_1 STACK_BIAS-0x50
#define x0_1 STACK_BIAS-0x60
! 16-byte slots, one per pipe; the second 8 bytes are zeroed when the
! medium range code is entered
#define y2_0 STACK_BIAS-0x70
#define y1_0 STACK_BIAS-0x80
#define y0_0 STACK_BIAS-0x90
! sizeof temp storage - must be a multiple of 16 for V9
#define tmps 0x90
!--------------------------------------------------------------------
! Register aliases. The loops handle three arguments per iteration;
! each argument gets its own group ("pipe") of ten floating point
! registers. The alias spells out both the pipe and the register.
! Do not append comments to the #define lines themselves: the text
! would become part of the macro expansion.
!
! define pipes for easier reading
! pipe 0: %f0-%f9
#define P0_f0 %f0
#define P0_f1 %f1
#define P0_f2 %f2
#define P0_f3 %f3
#define P0_f4 %f4
#define P0_f5 %f5
#define P0_f6 %f6
#define P0_f7 %f7
#define P0_f8 %f8
#define P0_f9 %f9
! pipe 1: %f10-%f19
#define P1_f10 %f10
#define P1_f11 %f11
#define P1_f12 %f12
#define P1_f13 %f13
#define P1_f14 %f14
#define P1_f15 %f15
#define P1_f16 %f16
#define P1_f17 %f17
#define P1_f18 %f18
#define P1_f19 %f19
! pipe 2: %f20-%f29
#define P2_f20 %f20
#define P2_f21 %f21
#define P2_f22 %f22
#define P2_f23 %f23
#define P2_f24 %f24
#define P2_f25 %f25
#define P2_f26 %f26
#define P2_f27 %f27
#define P2_f28 %f28
#define P2_f29 %f29
! define __vlibm_TBL_sincos_hi & lo for easy reading
! (base addresses of the two lookup tables, set up with PIC_SET)
#define SC_HI %l3
#define SC_LO %l4
! define constants for easy reading
! q1-q4 polynomial coefficients in the primary range; the medium range
! code reloads these four registers with pio2_1 ... pio2_3t
#define C_q1 %f46
#define C_q2 %f48
#define C_q3 %f50
#define C_q4 %f52
! the constant 1.0 as a double; C_ONE_LO names the low-order half
#define C_ONE %f54
#define C_ONE_LO %f55
! masks
! MSK_SIGN = 0x80000000 (integer register), MSK_BIT31 = 0x80000000,
! MSK_BIT13 = 0x4000, MSK_BITSHI17 = 0xffff800000000000
#define MSK_SIGN %i5
#define MSK_BIT31 %f30
#define MSK_BIT13 %f31
#define MSK_BITSHI17 %f44
! constants for pp and qq
#define C_pp1 %f56
#define C_pp2 %f58
#define C_qq1 %f60
#define C_qq2 %f62
! sign mask (a second name for %i5, the same register as MSK_SIGN)
#define C_signM %i5
! range limits, compared against the high word of |x|:
! LIM_l5 = 0x3fc40000; in the primary range a smaller high word
! selects the polynomial path, otherwise the table path is used
! LIM_l6 = 0x3e400000, the lower limit of the primary range
#define LIM_l5 %l5
#define LIM_l6 %l6
! LIM_l7 = 0x3fe921fb, the upper limit of the primary range.
! When an argument is too big for the primary range, LIM_l6 is
! overwritten with 1, 2 or 3 (the pipe that hit it) before the branch
! to the medium range code, which resets it to 0 and, according to the
! comments there, reuses %l6 to keep track of biguns.
#define LIM_l7 %l7
!--------------------------------------------------------------------
ENTRY(__vcos)
save %sp,-SA(MINFRAME)-tmps,%sp
PIC_SETUP(g5)
PIC_SET(g5,__vlibm_TBL_sincos_hi,l3)
PIC_SET(g5,__vlibm_TBL_sincos_lo,l4)
PIC_SET(g5,constants,o0)
mov %o0,%g1
wr %g0,0x82,%asi ! set %asi for non-faulting loads
! ========== primary range ==========
! register use
! i0 n
! i1 x
! i2 stridex
! i3 y
! i4 stridey
! i5 0x80000000
! l0 hx0
! l1 hx1
! l2 hx2
! l3 __vlibm_TBL_sincos_hi
! l4 __vlibm_TBL_sincos_lo
! l5 0x3fc40000
! l6 0x3e400000
! l7 0x3fe921fb
! the following are 64-bit registers in both V8+ and V9
! g1 scratch
! g5
! o0 py0
! o1 py1
! o2 py2
! o3 oy0
! o4 oy1
! o5 oy2
! o7 scratch
! f0 x0
! f2
! f4
! f6
! f8 scratch for table base
! f9 signbit0
! f10 x1
! f12
! f14
! f16
! f18 scratch for table base
! f19 signbit1
! f20 x2
! f22
! f24
! f26
! f28 scratch for table base
! f29 signbit2
! f30 0x80000000
! f31 0x4000
! f32
! f34
! f36
! f38
! f40
! f42
! f44 0xffff800000000000
! f46 q1
! f48 q2
! f50 q3
! f52 q4
! f54 one
! f56 pp1
! f58 pp2
! f60 qq1
! f62 qq2
#ifdef __sparcv9
stx %i1,[%fp+xsave] ! save arguments
stx %i3,[%fp+ysave]
#else
st %i1,[%fp+xsave] ! save arguments
st %i3,[%fp+ysave]
#endif
st %i0,[%fp+nsave]
st %i2,[%fp+sxsave]
st %i4,[%fp+sysave]
sethi %hi(0x80000000),MSK_SIGN ! load/set up constants
sethi %hi(0x3fc40000),LIM_l5
sethi %hi(0x3e400000),LIM_l6
sethi %hi(0x3fe921fb),LIM_l7
or LIM_l7,%lo(0x3fe921fb),LIM_l7
ldd [%g1+f30val],MSK_BIT31
ldd [%g1+mask],MSK_BITSHI17
ldd [%g1+q1],C_q1
ldd [%g1+q2],C_q2
ldd [%g1+q3],C_q3
ldd [%g1+q4],C_q4
ldd [%g1+one],C_ONE
ldd [%g1+pp1],C_pp1
ldd [%g1+pp2],C_pp2
ldd [%g1+qq1],C_qq1
ldd [%g1+qq2],C_qq2
sll %i2,3,%i2 ! scale strides
sll %i4,3,%i4
add %fp,x0_1,%o3 ! precondition loop
add %fp,x0_1,%o4
add %fp,x0_1,%o5
ld [%i1],%l0 ! hx = *x
ld [%i1],P0_f0
ld [%i1+4],P0_f1
andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000
add %i1,%i2,%i1 ! x += stridex
ba,pt %icc,.loop0
!delay slot
nop
.align 32
.loop0:
lda [%i1]%asi,%l1 ! preload next argument
sub %l0,LIM_l6,%g1
sub LIM_l7,%l0,%o7
fands P0_f0,MSK_BIT31,P0_f9 ! save signbit
lda [%i1]%asi,P1_f10
orcc %o7,%g1,%g0
mov %i3,%o0 ! py0 = y
bl,pn %icc,.range0 ! if hx < 0x3e400000 or > 0x3fe921fb
! delay slot
lda [%i1+4]%asi,P1_f11
addcc %i0,-1,%i0
add %i3,%i4,%i3 ! y += stridey
ble,pn %icc,.endloop1
! delay slot
andn %l1,MSK_SIGN,%l1
add %i1,%i2,%i1 ! x += stridex
fabsd P0_f0,P0_f0
fmuld C_ONE,C_ONE,C_ONE ! one*one; a nop for alignment only
.loop1:
lda [%i1]%asi,%l2 ! preload next argument
sub %l1,LIM_l6,%g1
sub LIM_l7,%l1,%o7
fands P1_f10,MSK_BIT31,P1_f19 ! save signbit
lda [%i1]%asi,P2_f20
orcc %o7,%g1,%g0
mov %i3,%o1 ! py1 = y
bl,pn %icc,.range1 ! if hx < 0x3e400000 or > 0x3fe921fb
! delay slot
lda [%i1+4]%asi,P2_f21
addcc %i0,-1,%i0
add %i3,%i4,%i3 ! y += stridey
ble,pn %icc,.endloop2
! delay slot
andn %l2,MSK_SIGN,%l2
add %i1,%i2,%i1 ! x += stridex
fabsd P1_f10,P1_f10
fmuld C_ONE,C_ONE,C_ONE ! one*one; a nop for alignment only
.loop2:
st P0_f6,[%o3]
sub %l2,LIM_l6,%g1
sub LIM_l7,%l2,%o7
fands P2_f20,MSK_BIT31,P2_f29 ! save signbit
st P0_f7,[%o3+4]
orcc %g1,%o7,%g0
mov %i3,%o2 ! py2 = y
bl,pn %icc,.range2 ! if hx < 0x3e400000 or > 0x3fe921fb
! delay slot
add %i3,%i4,%i3 ! y += stridey
cmp %l0,LIM_l5
fabsd P2_f20,P2_f20
bl,pn %icc,.case4
! delay slot
st P1_f16,[%o4]
cmp %l1,LIM_l5
fpadd32s P0_f0,MSK_BIT13,P0_f8
bl,pn %icc,.case2
! delay slot
st P1_f17,[%o4+4]
cmp %l2,LIM_l5
fpadd32s P1_f10,MSK_BIT13,P1_f18
bl,pn %icc,.case1
! delay slot
st P2_f26,[%o5]
mov %o0,%o3
sethi %hi(0x3fc3c000),%o7
fpadd32s P2_f20,MSK_BIT13,P2_f28
st P2_f27,[%o5+4]
fand P0_f8,MSK_BITSHI17,P0_f2
mov %o1,%o4
fand P1_f18,MSK_BITSHI17,P1_f12
mov %o2,%o5
sub %l0,%o7,%l0
fand P2_f28,MSK_BITSHI17,P2_f22
sub %l1,%o7,%l1
sub %l2,%o7,%l2
fsubd P0_f0,P0_f2,P0_f0
srl %l0,10,%l0
add SC_HI,8,%g1;add SC_LO,8,%o7
fsubd P1_f10,P1_f12,P1_f10
srl %l1,10,%l1
fsubd P2_f20,P2_f22,P2_f20
srl %l2,10,%l2
fmuld P0_f0,P0_f0,P0_f2
andn %l0,0x1f,%l0
fmuld P1_f10,P1_f10,P1_f12
andn %l1,0x1f,%l1
fmuld P2_f20,P2_f20,P2_f22
andn %l2,0x1f,%l2
fmuld P0_f2,C_pp2,P0_f6
ldd [%g1+%l0],%f32
fmuld P1_f12,C_pp2,P1_f16
ldd [%g1+%l1],%f36
fmuld P2_f22,C_pp2,P2_f26
ldd [%g1+%l2],%f40
faddd P0_f6,C_pp1,P0_f6
fmuld P0_f2,C_qq2,P0_f4
ldd [SC_HI+%l0],%f34
faddd P1_f16,C_pp1,P1_f16
fmuld P1_f12,C_qq2,P1_f14
ldd [SC_HI+%l1],%f38
faddd P2_f26,C_pp1,P2_f26
fmuld P2_f22,C_qq2,P2_f24
ldd [SC_HI+%l2],%f42
fmuld P0_f2,P0_f6,P0_f6
faddd P0_f4,C_qq1,P0_f4
fmuld P1_f12,P1_f16,P1_f16
faddd P1_f14,C_qq1,P1_f14
fmuld P2_f22,P2_f26,P2_f26
faddd P2_f24,C_qq1,P2_f24
faddd P0_f6,C_ONE,P0_f6
fmuld P0_f2,P0_f4,P0_f4
faddd P1_f16,C_ONE,P1_f16
fmuld P1_f12,P1_f14,P1_f14
faddd P2_f26,C_ONE,P2_f26
fmuld P2_f22,P2_f24,P2_f24
fmuld P0_f0,P0_f6,P0_f6
ldd [%o7+%l0],P0_f2
fmuld P1_f10,P1_f16,P1_f16
ldd [%o7+%l1],P1_f12
fmuld P2_f20,P2_f26,P2_f26
ldd [%o7+%l2],P2_f22
fmuld P0_f4,%f32,P0_f4
lda [%i1]%asi,%l0 ! preload next argument
fmuld P1_f14,%f36,P1_f14
lda [%i1]%asi,P0_f0
fmuld P2_f24,%f40,P2_f24
lda [%i1+4]%asi,P0_f1
fmuld P0_f6,%f34,P0_f6
add %i1,%i2,%i1 ! x += stridex
fmuld P1_f16,%f38,P1_f16
fmuld P2_f26,%f42,P2_f26
fsubd P0_f6,P0_f4,P0_f6
fsubd P1_f16,P1_f14,P1_f16
fsubd P2_f26,P2_f24,P2_f26
fsubd P0_f2,P0_f6,P0_f6
fsubd P1_f12,P1_f16,P1_f16
fsubd P2_f22,P2_f26,P2_f26
faddd P0_f6,%f32,P0_f6
faddd P1_f16,%f36,P1_f16
faddd P2_f26,%f40,P2_f26
andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000
nop !!(vsin) fors P0_f6,P0_f9,P0_f6
addcc %i0,-1,%i0
nop !!(vsin) fors P1_f16,P1_f19,P1_f16
bg,pt %icc,.loop0
! delay slot
nop !!(vsin) fors P2_f26,P2_f29,P2_f26
ba,pt %icc,.endloop0
! delay slot
nop
.align 32
.case1:
st P2_f27,[%o5+4]
sethi %hi(0x3fc3c000),%o7
fand P0_f8,MSK_BITSHI17,P0_f2
sub %l0,%o7,%l0
sub %l1,%o7,%l1
add SC_HI,8,%g1;add SC_LO,8,%o7
fand P1_f18,MSK_BITSHI17,P1_f12
fmuld P2_f20,P2_f20,P2_f22
fsubd P0_f0,P0_f2,P0_f0
srl %l0,10,%l0
mov %o0,%o3
fsubd P1_f10,P1_f12,P1_f10
srl %l1,10,%l1
mov %o1,%o4
fmuld P2_f22,C_q4,P2_f24
mov %o2,%o5
fmuld P0_f0,P0_f0,P0_f2
andn %l0,0x1f,%l0
fmuld P1_f10,P1_f10,P1_f12
andn %l1,0x1f,%l1
faddd P2_f24,C_q3,P2_f24
fmuld P0_f2,C_pp2,P0_f6
ldd [%g1+%l0],%f32
fmuld P1_f12,C_pp2,P1_f16
ldd [%g1+%l1],%f36
fmuld P2_f22,P2_f24,P2_f24
faddd P0_f6,C_pp1,P0_f6
fmuld P0_f2,C_qq2,P0_f4
ldd [SC_HI+%l0],%f34
faddd P1_f16,C_pp1,P1_f16
fmuld P1_f12,C_qq2,P1_f14
ldd [SC_HI+%l1],%f38
faddd P2_f24,C_q2,P2_f24
fmuld P0_f2,P0_f6,P0_f6
faddd P0_f4,C_qq1,P0_f4
fmuld P1_f12,P1_f16,P1_f16
faddd P1_f14,C_qq1,P1_f14
fmuld P2_f22,P2_f24,P2_f24
faddd P0_f6,C_ONE,P0_f6
fmuld P0_f2,P0_f4,P0_f4
faddd P1_f16,C_ONE,P1_f16
fmuld P1_f12,P1_f14,P1_f14
faddd P2_f24,C_q1,P2_f24
fmuld P0_f0,P0_f6,P0_f6
ldd [%o7+%l0],P0_f2
fmuld P1_f10,P1_f16,P1_f16
ldd [%o7+%l1],P1_f12
fmuld P0_f4,%f32,P0_f4
lda [%i1]%asi,%l0 ! preload next argument
fmuld P1_f14,%f36,P1_f14
lda [%i1]%asi,P0_f0
fmuld P0_f6,%f34,P0_f6
lda [%i1+4]%asi,P0_f1
fmuld P1_f16,%f38,P1_f16
add %i1,%i2,%i1 ! x += stridex
fmuld P2_f22,P2_f24,P2_f24
fsubd P0_f6,P0_f4,P0_f6
fsubd P1_f16,P1_f14,P1_f16
!!(vsin)fmuld P2_f20,P2_f24,P2_f24
fsubd P0_f2,P0_f6,P0_f6
fsubd P1_f12,P1_f16,P1_f16
faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26
faddd P0_f6,%f32,P0_f6
faddd P1_f16,%f36,P1_f16
andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000
nop !!(vsin) fors P2_f26,P2_f29,P2_f26
addcc %i0,-1,%i0
nop !!(vsin) fors P0_f6,P0_f9,P0_f6
bg,pt %icc,.loop0
! delay slot
nop !!(vsin) fors P1_f16,P1_f19,P1_f16
ba,pt %icc,.endloop0
! delay slot
nop
.align 32
.case2:
st P2_f26,[%o5]
cmp %l2,LIM_l5
fpadd32s P2_f20,MSK_BIT13,P2_f28
bl,pn %icc,.case3
! delay slot
st P2_f27,[%o5+4]
sethi %hi(0x3fc3c000),%o7
fand P0_f8,MSK_BITSHI17,P0_f2
sub %l0,%o7,%l0
sub %l2,%o7,%l2
add SC_HI,8,%g1;add SC_LO,8,%o7
fand P2_f28,MSK_BITSHI17,P2_f22
fmuld P1_f10,P1_f10,P1_f12
fsubd P0_f0,P0_f2,P0_f0
srl %l0,10,%l0
mov %o0,%o3
fsubd P2_f20,P2_f22,P2_f20
srl %l2,10,%l2
mov %o2,%o5
fmuld P1_f12,C_q4,P1_f14
mov %o1,%o4
fmuld P0_f0,P0_f0,P0_f2
andn %l0,0x1f,%l0
fmuld P2_f20,P2_f20,P2_f22
andn %l2,0x1f,%l2
faddd P1_f14,C_q3,P1_f14
fmuld P0_f2,C_pp2,P0_f6
ldd [%g1+%l0],%f32
fmuld P2_f22,C_pp2,P2_f26
ldd [%g1+%l2],%f40
fmuld P1_f12,P1_f14,P1_f14
faddd P0_f6,C_pp1,P0_f6
fmuld P0_f2,C_qq2,P0_f4
ldd [SC_HI+%l0],%f34
faddd P2_f26,C_pp1,P2_f26
fmuld P2_f22,C_qq2,P2_f24
ldd [SC_HI+%l2],%f42
faddd P1_f14,C_q2,P1_f14
fmuld P0_f2,P0_f6,P0_f6
faddd P0_f4,C_qq1,P0_f4
fmuld P2_f22,P2_f26,P2_f26
faddd P2_f24,C_qq1,P2_f24
fmuld P1_f12,P1_f14,P1_f14
faddd P0_f6,C_ONE,P0_f6
fmuld P0_f2,P0_f4,P0_f4
faddd P2_f26,C_ONE,P2_f26
fmuld P2_f22,P2_f24,P2_f24
faddd P1_f14,C_q1,P1_f14
fmuld P0_f0,P0_f6,P0_f6
ldd [%o7+%l0],P0_f2
fmuld P2_f20,P2_f26,P2_f26
ldd [%o7+%l2],P2_f22
fmuld P0_f4,%f32,P0_f4
lda [%i1]%asi,%l0 ! preload next argument
fmuld P2_f24,%f40,P2_f24
lda [%i1]%asi,P0_f0
fmuld P0_f6,%f34,P0_f6
lda [%i1+4]%asi,P0_f1
fmuld P2_f26,%f42,P2_f26
add %i1,%i2,%i1 ! x += stridex
fmuld P1_f12,P1_f14,P1_f14
fsubd P0_f6,P0_f4,P0_f6
fsubd P2_f26,P2_f24,P2_f26
!!(vsin)fmuld P1_f10,P1_f14,P1_f14
fsubd P0_f2,P0_f6,P0_f6
fsubd P2_f22,P2_f26,P2_f26
faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16
faddd P0_f6,%f32,P0_f6
faddd P2_f26,%f40,P2_f26
andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000
nop !!(vsin) fors P1_f16,P1_f19,P1_f16
addcc %i0,-1,%i0
nop !!(vsin) fors P0_f6,P0_f9,P0_f6
bg,pt %icc,.loop0
! delay slot
nop !!(vsin) fors P2_f26,P2_f29,P2_f26
ba,pt %icc,.endloop0
! delay slot
nop
.align 32
.case3:
sethi %hi(0x3fc3c000),%o7
fand P0_f8,MSK_BITSHI17,P0_f2
fmuld P1_f10,P1_f10,P1_f12
sub %l0,%o7,%l0
add SC_HI,8,%g1;add SC_LO,8,%o7
fmuld P2_f20,P2_f20,P2_f22
fsubd P0_f0,P0_f2,P0_f0
srl %l0,10,%l0
mov %o0,%o3
fmuld P1_f12,C_q4,P1_f14
mov %o1,%o4
fmuld P2_f22,C_q4,P2_f24
mov %o2,%o5
fmuld P0_f0,P0_f0,P0_f2
andn %l0,0x1f,%l0
faddd P1_f14,C_q3,P1_f14
faddd P2_f24,C_q3,P2_f24
fmuld P0_f2,C_pp2,P0_f6
ldd [%g1+%l0],%f32
fmuld P1_f12,P1_f14,P1_f14
fmuld P2_f22,P2_f24,P2_f24
faddd P0_f6,C_pp1,P0_f6
fmuld P0_f2,C_qq2,P0_f4
ldd [SC_HI+%l0],%f34
faddd P1_f14,C_q2,P1_f14
faddd P2_f24,C_q2,P2_f24
fmuld P0_f2,P0_f6,P0_f6
faddd P0_f4,C_qq1,P0_f4
fmuld P1_f12,P1_f14,P1_f14
fmuld P2_f22,P2_f24,P2_f24
faddd P0_f6,C_ONE,P0_f6
fmuld P0_f2,P0_f4,P0_f4
faddd P1_f14,C_q1,P1_f14
faddd P2_f24,C_q1,P2_f24
fmuld P0_f0,P0_f6,P0_f6
ldd [%o7+%l0],P0_f2
fmuld P0_f4,%f32,P0_f4
lda [%i1]%asi,%l0 ! preload next argument
fmuld P1_f12,P1_f14,P1_f14
lda [%i1]%asi,P0_f0
fmuld P0_f6,%f34,P0_f6
lda [%i1+4]%asi,P0_f1
fmuld P2_f22,P2_f24,P2_f24
add %i1,%i2,%i1 ! x += stridex
!!(vsin)fmuld P1_f10,P1_f14,P1_f14
fsubd P0_f6,P0_f4,P0_f6
!!(vsin)fmuld P2_f20,P2_f24,P2_f24
faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16
fsubd P0_f2,P0_f6,P0_f6
faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26
nop !!(vsin) fors P1_f16,P1_f19,P1_f16
andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000
faddd P0_f6,%f32,P0_f6
addcc %i0,-1,%i0
nop !!(vsin) fors P2_f26,P2_f29,P2_f26
bg,pt %icc,.loop0
! delay slot
nop !!(vsin) fors P0_f6,P0_f9,P0_f6
ba,pt %icc,.endloop0
! delay slot
nop
.align 32
.case4:
st P1_f17,[%o4+4]
cmp %l1,LIM_l5
fpadd32s P1_f10,MSK_BIT13,P1_f18
bl,pn %icc,.case6
! delay slot
st P2_f26,[%o5]
cmp %l2,LIM_l5
fpadd32s P2_f20,MSK_BIT13,P2_f28
bl,pn %icc,.case5
! delay slot
st P2_f27,[%o5+4]
sethi %hi(0x3fc3c000),%o7
fand P1_f18,MSK_BITSHI17,P1_f12
sub %l1,%o7,%l1
sub %l2,%o7,%l2
add SC_HI,8,%g1;add SC_LO,8,%o7
fand P2_f28,MSK_BITSHI17,P2_f22
fmuld P0_f0,P0_f0,P0_f2
fsubd P1_f10,P1_f12,P1_f10
srl %l1,10,%l1
mov %o1,%o4
fsubd P2_f20,P2_f22,P2_f20
srl %l2,10,%l2
mov %o2,%o5
fmovd P0_f0,P0_f6 !ID for processing
fmuld P0_f2,C_q4,P0_f4
mov %o0,%o3
fmuld P1_f10,P1_f10,P1_f12
andn %l1,0x1f,%l1
fmuld P2_f20,P2_f20,P2_f22
andn %l2,0x1f,%l2
faddd P0_f4,C_q3,P0_f4
fmuld P1_f12,C_pp2,P1_f16
ldd [%g1+%l1],%f36
fmuld P2_f22,C_pp2,P2_f26
ldd [%g1+%l2],%f40
fmuld P0_f2,P0_f4,P0_f4
faddd P1_f16,C_pp1,P1_f16
fmuld P1_f12,C_qq2,P1_f14
ldd [SC_HI+%l1],%f38
faddd P2_f26,C_pp1,P2_f26
fmuld P2_f22,C_qq2,P2_f24
ldd [SC_HI+%l2],%f42
faddd P0_f4,C_q2,P0_f4
fmuld P1_f12,P1_f16,P1_f16
faddd P1_f14,C_qq1,P1_f14
fmuld P2_f22,P2_f26,P2_f26
faddd P2_f24,C_qq1,P2_f24
fmuld P0_f2,P0_f4,P0_f4
faddd P1_f16,C_ONE,P1_f16
fmuld P1_f12,P1_f14,P1_f14
faddd P2_f26,C_ONE,P2_f26
fmuld P2_f22,P2_f24,P2_f24
faddd P0_f4,C_q1,P0_f4
fmuld P1_f10,P1_f16,P1_f16
ldd [%o7+%l1],P1_f12
fmuld P2_f20,P2_f26,P2_f26
ldd [%o7+%l2],P2_f22
fmuld P1_f14,%f36,P1_f14
lda [%i1]%asi,%l0 ! preload next argument
fmuld P2_f24,%f40,P2_f24
lda [%i1]%asi,P0_f0
fmuld P1_f16,%f38,P1_f16
lda [%i1+4]%asi,P0_f1
fmuld P2_f26,%f42,P2_f26
add %i1,%i2,%i1 ! x += stridex
fmuld P0_f2,P0_f4,P0_f4
fsubd P1_f16,P1_f14,P1_f16
fsubd P2_f26,P2_f24,P2_f26
!!(vsin)fmuld P0_f6,P0_f4,P0_f4
fsubd P1_f12,P1_f16,P1_f16
fsubd P2_f22,P2_f26,P2_f26
faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing
faddd P1_f16,%f36,P1_f16
faddd P2_f26,%f40,P2_f26
andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000
nop !!(vsin) fors P0_f6,P0_f9,P0_f6
addcc %i0,-1,%i0
nop !!(vsin) fors P1_f16,P1_f19,P1_f16
bg,pt %icc,.loop0
! delay slot
nop !!(vsin) fors P2_f26,P2_f29,P2_f26
ba,pt %icc,.endloop0
! delay slot
nop
.align 32
.case5:
sethi %hi(0x3fc3c000),%o7
fand P1_f18,MSK_BITSHI17,P1_f12
fmuld P0_f0,P0_f0,P0_f2
sub %l1,%o7,%l1
add SC_HI,8,%g1;add SC_LO,8,%o7
fmuld P2_f20,P2_f20,P2_f22
fsubd P1_f10,P1_f12,P1_f10
srl %l1,10,%l1
mov %o1,%o4
fmovd P0_f0,P0_f6 !ID for processing
fmuld P0_f2,C_q4,P0_f4
mov %o0,%o3
fmuld P2_f22,C_q4,P2_f24
mov %o2,%o5
fmuld P1_f10,P1_f10,P1_f12
andn %l1,0x1f,%l1
faddd P0_f4,C_q3,P0_f4
faddd P2_f24,C_q3,P2_f24
fmuld P1_f12,C_pp2,P1_f16
ldd [%g1+%l1],%f36
fmuld P0_f2,P0_f4,P0_f4
fmuld P2_f22,P2_f24,P2_f24
faddd P1_f16,C_pp1,P1_f16
fmuld P1_f12,C_qq2,P1_f14
ldd [SC_HI+%l1],%f38
faddd P0_f4,C_q2,P0_f4
faddd P2_f24,C_q2,P2_f24
fmuld P1_f12,P1_f16,P1_f16
faddd P1_f14,C_qq1,P1_f14
fmuld P0_f2,P0_f4,P0_f4
fmuld P2_f22,P2_f24,P2_f24
faddd P1_f16,C_ONE,P1_f16
fmuld P1_f12,P1_f14,P1_f14
faddd P0_f4,C_q1,P0_f4
faddd P2_f24,C_q1,P2_f24
fmuld P1_f10,P1_f16,P1_f16
ldd [%o7+%l1],P1_f12
fmuld P1_f14,%f36,P1_f14
lda [%i1]%asi,%l0 ! preload next argument
fmuld P0_f2,P0_f4,P0_f4
lda [%i1]%asi,P0_f0
fmuld P1_f16,%f38,P1_f16
lda [%i1+4]%asi,P0_f1
fmuld P2_f22,P2_f24,P2_f24
add %i1,%i2,%i1 ! x += stridex
!!(vsin)fmuld P0_f6,P0_f4,P0_f4
fsubd P1_f16,P1_f14,P1_f16
!!(vsin)fmuld P2_f20,P2_f24,P2_f24
faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing
fsubd P1_f12,P1_f16,P1_f16
faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26
nop !!(vsin) fors P0_f6,P0_f9,P0_f6
andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000
faddd P1_f16,%f36,P1_f16
addcc %i0,-1,%i0
nop !!(vsin) fors P2_f26,P2_f29,P2_f26
bg,pt %icc,.loop0
! delay slot
nop !!(vsin) fors P1_f16,P1_f19,P1_f16
ba,pt %icc,.endloop0
! delay slot
nop
.align 32
.case6:
st P2_f27,[%o5+4]
cmp %l2,LIM_l5
fpadd32s P2_f20,MSK_BIT13,P2_f28
bl,pn %icc,.case7
! delay slot
sethi %hi(0x3fc3c000),%o7
fand P2_f28,MSK_BITSHI17,P2_f22
fmuld P0_f0,P0_f0,P0_f2
sub %l2,%o7,%l2
add SC_HI,8,%g1;add SC_LO,8,%o7
fmuld P1_f10,P1_f10,P1_f12
fsubd P2_f20,P2_f22,P2_f20
srl %l2,10,%l2
mov %o2,%o5
fmovd P0_f0,P0_f6 !ID for processing
fmuld P0_f2,C_q4,P0_f4
mov %o0,%o3
fmuld P1_f12,C_q4,P1_f14
mov %o1,%o4
fmuld P2_f20,P2_f20,P2_f22
andn %l2,0x1f,%l2
faddd P0_f4,C_q3,P0_f4
faddd P1_f14,C_q3,P1_f14
fmuld P2_f22,C_pp2,P2_f26
ldd [%g1+%l2],%f40
fmuld P0_f2,P0_f4,P0_f4
fmuld P1_f12,P1_f14,P1_f14
faddd P2_f26,C_pp1,P2_f26
fmuld P2_f22,C_qq2,P2_f24
ldd [SC_HI+%l2],%f42
faddd P0_f4,C_q2,P0_f4
faddd P1_f14,C_q2,P1_f14
fmuld P2_f22,P2_f26,P2_f26
faddd P2_f24,C_qq1,P2_f24
fmuld P0_f2,P0_f4,P0_f4
fmuld P1_f12,P1_f14,P1_f14
faddd P2_f26,C_ONE,P2_f26
fmuld P2_f22,P2_f24,P2_f24
faddd P0_f4,C_q1,P0_f4
faddd P1_f14,C_q1,P1_f14
fmuld P2_f20,P2_f26,P2_f26
ldd [%o7+%l2],P2_f22
fmuld P2_f24,%f40,P2_f24
lda [%i1]%asi,%l0 ! preload next argument
fmuld P0_f2,P0_f4,P0_f4
lda [%i1]%asi,P0_f0
fmuld P2_f26,%f42,P2_f26
lda [%i1+4]%asi,P0_f1
fmuld P1_f12,P1_f14,P1_f14
add %i1,%i2,%i1 ! x += stridex
!!(vsin)fmuld P0_f6,P0_f4,P0_f4
fsubd P2_f26,P2_f24,P2_f26
!!(vsin)fmuld P1_f10,P1_f14,P1_f14
faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing
fsubd P2_f22,P2_f26,P2_f26
faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16
nop !!(vsin) fors P0_f6,P0_f9,P0_f6
andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000
faddd P2_f26,%f40,P2_f26
addcc %i0,-1,%i0
nop !!(vsin) fors P1_f16,P1_f19,P1_f16
bg,pt %icc,.loop0
! delay slot
nop !!(vsin) fors P2_f26,P2_f29,P2_f26
ba,pt %icc,.endloop0
! delay slot
nop
.align 32
.case7:
fmuld P0_f0,P0_f0,P0_f2
fmovd P0_f0,P0_f6 !ID for processing
mov %o0,%o3
fmuld P1_f10,P1_f10,P1_f12
mov %o1,%o4
fmuld P2_f20,P2_f20,P2_f22
mov %o2,%o5
fmuld P0_f2,C_q4,P0_f4
lda [%i1]%asi,%l0 ! preload next argument
fmuld P1_f12,C_q4,P1_f14
lda [%i1]%asi,P0_f0
fmuld P2_f22,C_q4,P2_f24
lda [%i1+4]%asi,P0_f1
faddd P0_f4,C_q3,P0_f4
add %i1,%i2,%i1 ! x += stridex
faddd P1_f14,C_q3,P1_f14
faddd P2_f24,C_q3,P2_f24
fmuld P0_f2,P0_f4,P0_f4
fmuld P1_f12,P1_f14,P1_f14
fmuld P2_f22,P2_f24,P2_f24
faddd P0_f4,C_q2,P0_f4
faddd P1_f14,C_q2,P1_f14
faddd P2_f24,C_q2,P2_f24
fmuld P0_f2,P0_f4,P0_f4
fmuld P1_f12,P1_f14,P1_f14
fmuld P2_f22,P2_f24,P2_f24
faddd P0_f4,C_q1,P0_f4
faddd P1_f14,C_q1,P1_f14
faddd P2_f24,C_q1,P2_f24
fmuld P0_f2,P0_f4,P0_f4
fmuld P1_f12,P1_f14,P1_f14
fmuld P2_f22,P2_f24,P2_f24
!!(vsin)fmuld P0_f6,P0_f4,P0_f4
!!(vsin)fmuld P1_f10,P1_f14,P1_f14
!!(vsin)fmuld P2_f20,P2_f24,P2_f24
faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing
faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16
faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26
andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000
nop !!(vsin) fors P0_f6,P0_f9,P0_f6
addcc %i0,-1,%i0
nop !!(vsin) fors P1_f16,P1_f19,P1_f16
bg,pt %icc,.loop0
! delay slot
nop !!(vsin) fors P2_f26,P2_f29,P2_f26
ba,pt %icc,.endloop0
! delay slot
nop
.align 32
.endloop2:
cmp %l1,LIM_l5
bl,pn %icc,1f
! delay slot
fabsd P1_f10,P1_f10
sethi %hi(0x3fc3c000),%o7
fpadd32s P1_f10,MSK_BIT13,P1_f18
fand P1_f18,MSK_BITSHI17,P1_f12
sub %l1,%o7,%l1
add SC_HI,8,%g1;add SC_LO,8,%o7
fsubd P1_f10,P1_f12,P1_f10
srl %l1,10,%l1
fmuld P1_f10,P1_f10,P1_f12
andn %l1,0x1f,%l1
fmuld P1_f12,C_pp2,P2_f20
ldd [%g1+%l1],%f36
faddd P2_f20,C_pp1,P2_f20
fmuld P1_f12,C_qq2,P1_f14
ldd [SC_HI+%l1],%f38
fmuld P1_f12,P2_f20,P2_f20
faddd P1_f14,C_qq1,P1_f14
faddd P2_f20,C_ONE,P2_f20
fmuld P1_f12,P1_f14,P1_f14
fmuld P1_f10,P2_f20,P2_f20
ldd [%o7+%l1],P1_f12
fmuld P1_f14,%f36,P1_f14
fmuld P2_f20,%f38,P2_f20
fsubd P2_f20,P1_f14,P2_f20
fsubd P1_f12,P2_f20,P2_f20
ba,pt %icc,2f
! delay slot
faddd P2_f20,%f36,P2_f20
1:
fmuld P1_f10,P1_f10,P1_f12
fmuld P1_f12,C_q4,P1_f14
faddd P1_f14,C_q3,P1_f14
fmuld P1_f12,P1_f14,P1_f14
faddd P1_f14,C_q2,P1_f14
fmuld P1_f12,P1_f14,P1_f14
faddd P1_f14,C_q1,P1_f14
fmuld P1_f12,P1_f14,P1_f14
!!(vsin)fmuld P1_f10,P1_f14,P1_f14
faddd C_ONE,P1_f14,P2_f20 !!(vsin)faddd P1_f10,P1_f14,P2_f20
2:
nop !!(vsin) fors P2_f20,P1_f19,P2_f20
st P2_f20,[%o1]
st P2_f21,[%o1+4]
.endloop1:
cmp %l0,LIM_l5
bl,pn %icc,1f
! delay slot
fabsd P0_f0,P0_f0
sethi %hi(0x3fc3c000),%o7
fpadd32s P0_f0,MSK_BIT13,P0_f8
fand P0_f8,MSK_BITSHI17,P0_f2
sub %l0,%o7,%l0
add SC_HI,8,%g1;add SC_LO,8,%o7
fsubd P0_f0,P0_f2,P0_f0
srl %l0,10,%l0
fmuld P0_f0,P0_f0,P0_f2
andn %l0,0x1f,%l0
fmuld P0_f2,C_pp2,P2_f20
ldd [%g1+%l0],%f32
faddd P2_f20,C_pp1,P2_f20
fmuld P0_f2,C_qq2,P0_f4
ldd [SC_HI+%l0],%f34
fmuld P0_f2,P2_f20,P2_f20
faddd P0_f4,C_qq1,P0_f4
faddd P2_f20,C_ONE,P2_f20
fmuld P0_f2,P0_f4,P0_f4
fmuld P0_f0,P2_f20,P2_f20
ldd [%o7+%l0],P0_f2
fmuld P0_f4,%f32,P0_f4
fmuld P2_f20,%f34,P2_f20
fsubd P2_f20,P0_f4,P2_f20
fsubd P0_f2,P2_f20,P2_f20
ba,pt %icc,2f
! delay slot
faddd P2_f20,%f32,P2_f20
1:
fmuld P0_f0,P0_f0,P0_f2
fmuld P0_f2,C_q4,P0_f4
faddd P0_f4,C_q3,P0_f4
fmuld P0_f2,P0_f4,P0_f4
faddd P0_f4,C_q2,P0_f4
fmuld P0_f2,P0_f4,P0_f4
faddd P0_f4,C_q1,P0_f4
fmuld P0_f2,P0_f4,P0_f4
!!(vsin)fmuld P0_f0,P0_f4,P0_f4
faddd C_ONE,P0_f4,P2_f20 !!(vsin)faddd P0_f0,P0_f4,P2_f20
2:
nop !!(vsin) fors P2_f20,P0_f9,P2_f20
st P2_f20,[%o0]
st P2_f21,[%o0+4]
.endloop0:
st P0_f6,[%o3]
st P0_f7,[%o3+4]
st P1_f16,[%o4]
st P1_f17,[%o4+4]
st P2_f26,[%o5]
st P2_f27,[%o5+4]
! return. finished off with only primary range arguments
ret
restore
.align 32
.range0:
cmp %l0,LIM_l6
bg,a,pt %icc,.MEDIUM ! branch to Medium range on big arg.
! delay slot, annulled if branch not taken
mov 0x1,LIM_l6 ! set biguns flag or
fdtoi P0_f0,P0_f2; fmovd C_ONE,P0_f0 ; st P0_f0,[%o0] ! *y = 1.0 with inexact if x nonzero
st P0_f1,[%o0+4]
!nop ! (vsin) fdtoi P0_f0,P0_f2
addcc %i0,-1,%i0
ble,pn %icc,.endloop0
! delay slot, harmless if branch taken
add %i3,%i4,%i3 ! y += stridey
andn %l1,MSK_SIGN,%l0 ! hx &= ~0x80000000
fmovd P1_f10,P0_f0
ba,pt %icc,.loop0
! delay slot
add %i1,%i2,%i1 ! x += stridex
.align 32
.range1:
cmp %l1,LIM_l6
bg,a,pt %icc,.MEDIUM ! branch to Medium range on big arg.
! delay slot, annulled if branch not taken
mov 0x2,LIM_l6 ! set biguns flag or
fdtoi P1_f10,P1_f12; fmovd C_ONE,P1_f10 ; st P1_f10,[%o1] ! *y = 1.0 with inexact if x nonzero
st P1_f11,[%o1+4]
!nop ! (vsin) fdtoi P1_f10,P1_f12
addcc %i0,-1,%i0
ble,pn %icc,.endloop1
! delay slot, harmless if branch taken
add %i3,%i4,%i3 ! y += stridey
andn %l2,MSK_SIGN,%l1 ! hx &= ~0x80000000
fmovd P2_f20,P1_f10
ba,pt %icc,.loop1
! delay slot
add %i1,%i2,%i1 ! x += stridex
.align 32
.range2:
cmp %l2,LIM_l6
bg,a,pt %icc,.MEDIUM ! branch to Medium range on big arg.
! delay slot, annulled if branch not taken
mov 0x3,LIM_l6 ! set biguns flag or
fdtoi P2_f20,P2_f22; fmovd C_ONE,P2_f20 ; st P2_f20,[%o2] ! *y = 1.0 with inexact if x nonzero
st P2_f21,[%o2+4]
nop ! (vsin) fdtoi P2_f20,P2_f22
1:
addcc %i0,-1,%i0
ble,pn %icc,.endloop2
! delay slot
nop
ld [%i1],%l2
ld [%i1],P2_f20
ld [%i1+4],P2_f21
andn %l2,MSK_SIGN,%l2 ! hx &= ~0x80000000
ba,pt %icc,.loop2
! delay slot
add %i1,%i2,%i1 ! x += stridex
.align 32
.MEDIUM:
! ========== medium range ==========
! register use
! i0 n
! i1 x
! i2 stridex
! i3 y
! i4 stridey
! i5 0x80000000
! l0 hx0
! l1 hx1
! l2 hx2
! l3 __vlibm_TBL_sincos_hi
! l4 __vlibm_TBL_sincos_lo
! l5 constants
! l6 biguns stored here : still called LIM_l6
! l7 0x413921fb
! the following are 64-bit registers in both V8+ and V9
! g1 scratch
! g5
! o0 py0
! o1 py1
! o2 py2
! o3 n0
! o4 n1
! o5 n2
! o7 scratch
! f0 x0
! f2 n0,y0
! f4
! f6
! f8 scratch for table base
! f9 signbit0
! f10 x1
! f12 n1,y1
! f14
! f16
! f18 scratch for table base
! f19 signbit1
! f20 x2
! f22 n2,y2
! f24
! f26
! f28 scratch for table base
! f29 signbit2
! f30 0x80000000
! f31 0x4000
! f32
! f34
! f36
! f38
! f40 invpio2
! f42 round
! f44 0xffff800000000000
! f46 pio2_1
! f48 pio2_2
! f50 pio2_3
! f52 pio2_3t
! f54 one
! f56 pp1
! f58 pp2
! f60 qq1
! f62 qq2
PIC_SET(g5,constants,l5)
! %o3,%o4,%o5 need to be stored
st P0_f6,[%o3]
sethi %hi(0x413921fb),%l7
st P0_f7,[%o3+4]
or %l7,%lo(0x413921fb),%l7
st P1_f16,[%o4]
st P1_f17,[%o4+4]
st P2_f26,[%o5]
st P2_f27,[%o5+4]
ldd [%l5+invpio2],%f40
ldd [%l5+round],%f42
ldd [%l5+pio2_1],%f46
ldd [%l5+pio2_2],%f48
ldd [%l5+pio2_3],%f50
ldd [%l5+pio2_3t],%f52
std %f54,[%fp+x0_1+8] ! set up stack data
std %f54,[%fp+x1_1+8]
std %f54,[%fp+x2_1+8]
stx %g0,[%fp+y0_0+8]
stx %g0,[%fp+y1_0+8]
stx %g0,[%fp+y2_0+8]
! branched here in the middle of the array. Need to adjust
! for the members of the triple that were selected in the primary
! loop.
! no adjustment since all three selected here
subcc LIM_l6,0x1,%g0 ! continue in LOOP0?
bz,a %icc,.LOOP0
mov 0x0,LIM_l6 ! delay slot set biguns=0
! adjust 1st triple since 2nd and 3rd done here
subcc LIM_l6,0x2,%g0 ! continue in LOOP1?
fmuld %f0,%f40,%f2 ! adj LOOP0
bz,a %icc,.LOOP1
mov 0x0,LIM_l6 ! delay slot set biguns=0
! adjust 1st and 2nd triple since 3rd done here
subcc LIM_l6,0x3,%g0 ! continue in LOOP2?
!done fmuld %f0,%f40,%f2 ! adj LOOP0
sub %i3,%i4,%i3 ! adjust to not double increment
fmuld %f10,%f40,%f12 ! adj LOOP1
faddd %f2,%f42,%f2 ! adj LOOP1
bz,a %icc,.LOOP2
mov 0x0,LIM_l6 ! delay slot set biguns=0
ba .LOOP0
nop
! -- 16 byte aligned
.align 32
.LOOP0:
lda [%i1]%asi,%l1 ! preload next argument
mov %i3,%o0 ! py0 = y
lda [%i1]%asi,%f10
cmp %l0,%l7
add %i3,%i4,%i3 ! y += stridey
bg,pn %icc,.BIG0 ! if hx > 0x413921fb
! delay slot
lda [%i1+4]%asi,%f11
addcc %i0,-1,%i0
add %i1,%i2,%i1 ! x += stridex
ble,pn %icc,.ENDLOOP1
! delay slot
andn %l1,%i5,%l1
nop
fmuld %f0,%f40,%f2
fabsd %f54,%f54 ! a nop for alignment only
.LOOP1:
lda [%i1]%asi,%l2 ! preload next argument
mov %i3,%o1 ! py1 = y
lda [%i1]%asi,%f20
cmp %l1,%l7
add %i3,%i4,%i3 ! y += stridey
bg,pn %icc,.BIG1 ! if hx > 0x413921fb
! delay slot
lda [%i1+4]%asi,%f21
addcc %i0,-1,%i0
add %i1,%i2,%i1 ! x += stridex
ble,pn %icc,.ENDLOOP2
! delay slot
andn %l2,%i5,%l2
nop
fmuld %f10,%f40,%f12
faddd %f2,%f42,%f2
.LOOP2:
st %f3,[%fp+n0]
mov %i3,%o2 ! py2 = y
cmp %l2,%l7
add %i3,%i4,%i3 ! y += stridey
fmuld %f20,%f40,%f22
bg,pn %icc,.BIG2 ! if hx > 0x413921fb
! delay slot
add %l5,thresh+4,%o7
faddd %f12,%f42,%f12
st %f13,[%fp+n1]
! -
add %l5,thresh,%g1
faddd %f22,%f42,%f22
st %f23,[%fp+n2]
fsubd %f2,%f42,%f2 ! n
fsubd %f12,%f42,%f12 ! n
fsubd %f22,%f42,%f22 ! n
fmuld %f2,%f46,%f4
fmuld %f12,%f46,%f14
fmuld %f22,%f46,%f24
fsubd %f0,%f4,%f4
fmuld %f2,%f48,%f6
fsubd %f10,%f14,%f14
fmuld %f12,%f48,%f16
fsubd %f20,%f24,%f24
fmuld %f22,%f48,%f26
fsubd %f4,%f6,%f0
ld [%fp+n0],%o3 ; add %o3,1,%o3
fsubd %f14,%f16,%f10
ld [%fp+n1],%o4 ; add %o4,1,%o4
fsubd %f24,%f26,%f20
ld [%fp+n2],%o5 ; add %o5,1,%o5
fsubd %f4,%f0,%f32
and %o3,1,%o3
fsubd %f14,%f10,%f34
and %o4,1,%o4
fsubd %f24,%f20,%f36
and %o5,1,%o5
fsubd %f32,%f6,%f32
fmuld %f2,%f50,%f8
sll %o3,3,%o3
fsubd %f34,%f16,%f34
fmuld %f12,%f50,%f18
sll %o4,3,%o4
fsubd %f36,%f26,%f36
fmuld %f22,%f50,%f28
sll %o5,3,%o5
fsubd %f8,%f32,%f8
ld [%g1+%o3],%f6
fsubd %f18,%f34,%f18
ld [%g1+%o4],%f16
fsubd %f28,%f36,%f28
ld [%g1+%o5],%f26
fsubd %f0,%f8,%f4
fsubd %f10,%f18,%f14
fsubd %f20,%f28,%f24
fsubd %f0,%f4,%f32
fsubd %f10,%f14,%f34
fsubd %f20,%f24,%f36
fsubd %f32,%f8,%f32
fmuld %f2,%f52,%f2
fsubd %f34,%f18,%f34
fmuld %f12,%f52,%f12
fsubd %f36,%f28,%f36
fmuld %f22,%f52,%f22
fsubd %f2,%f32,%f2
ld [%o7+%o3],%f8
fsubd %f12,%f34,%f12
ld [%o7+%o4],%f18
fsubd %f22,%f36,%f22
ld [%o7+%o5],%f28
fsubd %f4,%f2,%f0 ! x
fsubd %f14,%f12,%f10 ! x
fsubd %f24,%f22,%f20 ! x
fsubd %f4,%f0,%f4
fsubd %f14,%f10,%f14
fsubd %f24,%f20,%f24
fands %f0,%f30,%f9 ! save signbit
fands %f10,%f30,%f19 ! save signbit
fands %f20,%f30,%f29 ! save signbit
fabsd %f0,%f0
std %f0,[%fp+x0_1]
fabsd %f10,%f10
std %f10,[%fp+x1_1]
fabsd %f20,%f20
std %f20,[%fp+x2_1]
fsubd %f4,%f2,%f2 ! y
fsubd %f14,%f12,%f12 ! y
fsubd %f24,%f22,%f22 ! y
fcmpgt32 %f6,%f0,%l0
fcmpgt32 %f16,%f10,%l1
fcmpgt32 %f26,%f20,%l2
! -- 16 byte aligned
fxors %f2,%f9,%f2
fxors %f12,%f19,%f12
fxors %f22,%f29,%f22
fands %f9,%f8,%f9 ! if (n & 1) clear sign bit
andcc %l0,2,%g0
bne,pn %icc,.CASE4
! delay slot
fands %f19,%f18,%f19 ! if (n & 1) clear sign bit
andcc %l1,2,%g0
bne,pn %icc,.CASE2
! delay slot
fands %f29,%f28,%f29 ! if (n & 1) clear sign bit
andcc %l2,2,%g0
bne,pn %icc,.CASE1
! delay slot
fpadd32s %f0,%f31,%f8
sethi %hi(0x3fc3c000),%o7
ld [%fp+x0_1],%l0
fpadd32s %f10,%f31,%f18
add %l3,8,%g1
ld [%fp+x1_1],%l1
fpadd32s %f20,%f31,%f28
ld [%fp+x2_1],%l2
fand %f8,%f44,%f4
sub %l0,%o7,%l0
fand %f18,%f44,%f14
sub %l1,%o7,%l1
fand %f28,%f44,%f24
sub %l2,%o7,%l2
fsubd %f0,%f4,%f0
srl %l0,10,%l0
fsubd %f10,%f14,%f10
srl %l1,10,%l1
fsubd %f20,%f24,%f20
srl %l2,10,%l2
faddd %f0,%f2,%f0
andn %l0,0x1f,%l0
faddd %f10,%f12,%f10
andn %l1,0x1f,%l1
faddd %f20,%f22,%f20
andn %l2,0x1f,%l2
fmuld %f0,%f0,%f2
add %l0,%o3,%l0
fmuld %f10,%f10,%f12
add %l1,%o4,%l1
fmuld %f20,%f20,%f22
add %l2,%o5,%l2
fmuld %f2,%f58,%f6
ldd [%l3+%l0],%f32
fmuld %f12,%f58,%f16
ldd [%l3+%l1],%f34
fmuld %f22,%f58,%f26
ldd [%l3+%l2],%f36
faddd %f6,%f56,%f6
fmuld %f2,%f62,%f4
faddd %f16,%f56,%f16
fmuld %f12,%f62,%f14
faddd %f26,%f56,%f26
fmuld %f22,%f62,%f24
fmuld %f2,%f6,%f6
faddd %f4,%f60,%f4
fmuld %f12,%f16,%f16
faddd %f14,%f60,%f14
fmuld %f22,%f26,%f26
faddd %f24,%f60,%f24
faddd %f6,%f54,%f6
fmuld %f2,%f4,%f4
faddd %f16,%f54,%f16
fmuld %f12,%f14,%f14
faddd %f26,%f54,%f26
fmuld %f22,%f24,%f24
fmuld %f0,%f6,%f6
ldd [%g1+%l0],%f2
fmuld %f10,%f16,%f16
ldd [%g1+%l1],%f12
fmuld %f20,%f26,%f26
ldd [%g1+%l2],%f22
fmuld %f4,%f32,%f4
ldd [%l4+%l0],%f0
fmuld %f14,%f34,%f14
ldd [%l4+%l1],%f10
fmuld %f24,%f36,%f24
ldd [%l4+%l2],%f20
fmuld %f6,%f2,%f6
fmuld %f16,%f12,%f16
fmuld %f26,%f22,%f26
faddd %f6,%f4,%f6
faddd %f16,%f14,%f16
faddd %f26,%f24,%f26
faddd %f6,%f0,%f6
faddd %f16,%f10,%f16
faddd %f26,%f20,%f26
faddd %f6,%f32,%f6
faddd %f16,%f34,%f16
faddd %f26,%f36,%f26
! .FIXSIGN: common tail of the three-at-a-time loop.
!
! On entry the three results are in %f6 (lane 0), %f16 (lane 1) and
! %f26 (lane 2), and the pending sign words are in %f9, %f19 and %f29.
! For each lane this code
!   1. reloads the saved n word from the frame and adds 1,
!   2. isolates bit 1 and shifts it left by 2, giving an offset of 0 or 8,
!   3. loads a 32-bit word from %l5+thresh-4 plus that offset and XORs it
!      into the sign word,
!   4. ORs the sign word into the high half of the result.
! NOTE(review): the two candidate words are presumably 0 and 0x80000000,
! per the N.B. on the last rows of the constants table -- confirm against
! the thresh definition.
!
! The results are then stored through %o0, %o1 and %o2 (high word first),
! the next argument is preloaded into %l0 and %f0:%f1, x is advanced, and
! the loop continues at .LOOP0 while the remaining count in %i0 is > 0.
.FIXSIGN:
! Lane 0: fetch n and form n+1.
ld [%fp+n0],%o3 ; add %o3,1,%o3
add %l5,thresh-4,%g1
! Lanes 1 and 2: same.
ld [%fp+n1],%o4 ; add %o4,1,%o4
ld [%fp+n2],%o5 ; add %o5,1,%o5
and %o3,2,%o3
sll %o3,2,%o3
and %o4,2,%o4
lda [%i1]%asi,%l0 ! preload next argument
sll %o4,2,%o4
and %o5,2,%o5
ld [%g1+%o3],%f8
sll %o5,2,%o5
ld [%g1+%o4],%f18
ld [%g1+%o5],%f28
fxors %f9,%f8,%f9
! Preload both halves of the next x into lane 0.
lda [%i1]%asi,%f0
fxors %f29,%f28,%f29
lda [%i1+4]%asi,%f1
fxors %f19,%f18,%f19
fors %f6,%f9,%f6 ! tack on sign
add %i1,%i2,%i1 ! x += stridex
st %f6,[%o0]
fors %f26,%f29,%f26 ! tack on sign
st %f7,[%o0+4]
fors %f16,%f19,%f16 ! tack on sign
st %f26,[%o2]
st %f27,[%o2+4]
! Decrement the element count; the condition codes feed the bg below.
addcc %i0,-1,%i0
st %f16,[%o1]
andn %l0,%i5,%l0 ! hx &= ~0x80000000
bg,pt %icc,.LOOP0
! delay slot
st %f17,[%o1+4]
! Count exhausted: go check whether any huge arguments were deferred.
ba,pt %icc,.ENDLOOP0
! delay slot
nop
.align 32
! .CASE1: lane 2 takes the polynomial path; lanes 0 and 1 take the
! table-driven path.
!
! Reached from the dispatch code when bit 1 of %l2 (the fcmpgt32 result
! for lane 2) is set.  The delay slot of that branch has already computed
! %f8 = fpadd32s(%f0,%f31) for lane 0.
!
! Polynomial path (lane 2), with z = x2*x2 in %f20:
!   w      = ((c0*z + c1)*z + c2)*z + c3
!   result = X + (X*(z*w) + Y)
! c0..c3 are read from %l5+%o5 at offsets 0, 0x10, 0x20 and 0x30.  X is
! reloaded from %fp+%o5+x2_1 and Y from %fp+%o5+y2_0, so the selector in
! %o5 also chooses between the saved value and the adjacent frame slot.
! NOTE(review): %o5 is presumably 0 or 8 as in the .ENDLOOPn tails, which
! with %l5 at the constants base would pick the p4..p1 or q4..q1 column --
! confirm against the loop head.
!
! Table path (lanes 0 and 1): the rounded high word of |x| is masked with
! %f44 and subtracted from x, the tail y is added back, and the table
! index is ((hx - 0x3fc3c000) >> 10) & ~0x1f plus the lane selector.
! Entries are read from %l3, %l3+8 and %l4.
.CASE1:
fpadd32s %f10,%f31,%f18
sethi %hi(0x3fc3c000),%o7
ld [%fp+x0_1],%l0
fand %f8,%f44,%f4
add %l3,8,%g1
ld [%fp+x1_1],%l1
fand %f18,%f44,%f14
sub %l0,%o7,%l0
fsubd %f0,%f4,%f0
srl %l0,10,%l0
sub %l1,%o7,%l1
fsubd %f10,%f14,%f10
srl %l1,10,%l1
! Lane 2: z = x*x, then start the Horner evaluation.
fmuld %f20,%f20,%f20
ldd [%l5+%o5],%f36
add %l5,%o5,%l2
faddd %f0,%f2,%f0
andn %l0,0x1f,%l0
faddd %f10,%f12,%f10
andn %l1,0x1f,%l1
fmuld %f20,%f36,%f24
ldd [%l2+0x10],%f26
! From here on %o5 is a frame-relative address used to select X and Y.
add %fp,%o5,%o5
fmuld %f0,%f0,%f2
add %l0,%o3,%l0
fmuld %f10,%f10,%f12
add %l1,%o4,%l1
faddd %f24,%f26,%f24
ldd [%l2+0x20],%f36
fmuld %f2,%f58,%f6
ldd [%l3+%l0],%f32
fmuld %f12,%f58,%f16
ldd [%l3+%l1],%f34
fmuld %f20,%f24,%f24
ldd [%l2+0x30],%f26
faddd %f6,%f56,%f6
fmuld %f2,%f62,%f4
faddd %f16,%f56,%f16
fmuld %f12,%f62,%f14
faddd %f24,%f36,%f24
ldd [%o5+x2_1],%f36
fmuld %f2,%f6,%f6
faddd %f4,%f60,%f4
fmuld %f12,%f16,%f16
faddd %f14,%f60,%f14
fmuld %f20,%f24,%f24
faddd %f6,%f54,%f6
fmuld %f2,%f4,%f4
ldd [%g1+%l0],%f2
faddd %f16,%f54,%f16
fmuld %f12,%f14,%f14
ldd [%g1+%l1],%f12
faddd %f24,%f26,%f24
fmuld %f0,%f6,%f6
ldd [%l4+%l0],%f0
fmuld %f10,%f16,%f16
ldd [%l4+%l1],%f10
fmuld %f4,%f32,%f4
! Spill the lane 2 tail y so it can be reloaded through the selector.
std %f22,[%fp+y2_0]
fmuld %f14,%f34,%f14
fmuld %f6,%f2,%f6
fmuld %f16,%f12,%f16
fmuld %f20,%f24,%f24
faddd %f6,%f4,%f6
faddd %f16,%f14,%f16
fmuld %f36,%f24,%f24
ldd [%o5+y2_0],%f22
faddd %f6,%f0,%f6
faddd %f16,%f10,%f16
faddd %f24,%f22,%f24
faddd %f6,%f32,%f6
faddd %f16,%f34,%f16
ba,pt %icc,.FIXSIGN
! delay slot: final add for lane 2, result in %f26
faddd %f36,%f24,%f26
.align 32
! .CASE2: lane 1 takes the polynomial path; lanes 0 and 2 take the
! table-driven path, unless lane 2 is also flagged, in which case control
! moves on to .CASE3.
!
! Reached from the dispatch code when bit 1 of %l1 is set.  The delay slot
! of that branch has already applied the selector mask to the lane 2 sign
! word in %f29.
!
! Polynomial path (lane 1), with z = x1*x1 in %f10:
!   w      = ((c0*z + c1)*z + c2)*z + c3
!   result = X + (X*(z*w) + Y)
! c0..c3 come from %l5+%o4 at offsets 0, 0x10, 0x20 and 0x30.  X is
! reloaded from %fp+%o4+x1_1 and Y from %fp+%o4+y1_0.
! NOTE(review): %o4 is presumably 0 or 8, selecting the p or q coefficient
! column and the saved or adjacent frame slot -- confirm at the loop head.
.CASE2:
fpadd32s %f0,%f31,%f8
ld [%fp+x0_1],%l0
! Is lane 2 flagged as well?
andcc %l2,2,%g0
bne,pn %icc,.CASE3
! delay slot
sethi %hi(0x3fc3c000),%o7
fpadd32s %f20,%f31,%f28
ld [%fp+x2_1],%l2
fand %f8,%f44,%f4
sub %l0,%o7,%l0
add %l3,8,%g1
fand %f28,%f44,%f24
sub %l2,%o7,%l2
fsubd %f0,%f4,%f0
srl %l0,10,%l0
fsubd %f20,%f24,%f20
srl %l2,10,%l2
! Lane 1: z = x*x, then start the Horner evaluation.
fmuld %f10,%f10,%f10
ldd [%l5+%o4],%f34
add %l5,%o4,%l1
faddd %f0,%f2,%f0
andn %l0,0x1f,%l0
faddd %f20,%f22,%f20
andn %l2,0x1f,%l2
fmuld %f10,%f34,%f14
ldd [%l1+0x10],%f16
! From here on %o4 is a frame-relative address used to select X and Y.
add %fp,%o4,%o4
fmuld %f0,%f0,%f2
add %l0,%o3,%l0
fmuld %f20,%f20,%f22
add %l2,%o5,%l2
faddd %f14,%f16,%f14
ldd [%l1+0x20],%f34
fmuld %f2,%f58,%f6
ldd [%l3+%l0],%f32
fmuld %f22,%f58,%f26
ldd [%l3+%l2],%f36
fmuld %f10,%f14,%f14
ldd [%l1+0x30],%f16
faddd %f6,%f56,%f6
fmuld %f2,%f62,%f4
faddd %f26,%f56,%f26
fmuld %f22,%f62,%f24
faddd %f14,%f34,%f14
ldd [%o4+x1_1],%f34
fmuld %f2,%f6,%f6
faddd %f4,%f60,%f4
fmuld %f22,%f26,%f26
faddd %f24,%f60,%f24
fmuld %f10,%f14,%f14
faddd %f6,%f54,%f6
fmuld %f2,%f4,%f4
ldd [%g1+%l0],%f2
faddd %f26,%f54,%f26
fmuld %f22,%f24,%f24
ldd [%g1+%l2],%f22
faddd %f14,%f16,%f14
fmuld %f0,%f6,%f6
ldd [%l4+%l0],%f0
fmuld %f20,%f26,%f26
ldd [%l4+%l2],%f20
fmuld %f4,%f32,%f4
! Spill the lane 1 tail y so it can be reloaded through the selector.
std %f12,[%fp+y1_0]
fmuld %f24,%f36,%f24
fmuld %f6,%f2,%f6
fmuld %f26,%f22,%f26
fmuld %f10,%f14,%f14
faddd %f6,%f4,%f6
faddd %f26,%f24,%f26
fmuld %f34,%f14,%f14
ldd [%o4+y1_0],%f12
faddd %f6,%f0,%f6
faddd %f26,%f20,%f26
faddd %f14,%f12,%f14
faddd %f6,%f32,%f6
faddd %f26,%f36,%f26
ba,pt %icc,.FIXSIGN
! delay slot: final add for lane 1, result in %f16
faddd %f34,%f14,%f16
.align 32
! .CASE3: lanes 1 and 2 take the polynomial path; lane 0 takes the
! table-driven path.
!
! Reached only from .CASE2, which has already computed %f8 (rounded high
! word of lane 0), loaded %l0 from x0_1 and set %o7 to 0x3fc3c000.
!
! For each polynomial lane, with z = x*x:
!   w      = ((c0*z + c1)*z + c2)*z + c3
!   result = X + (X*(z*w) + Y)
! Coefficients come from %l5 plus the lane selector (%o4 or %o5) at
! offsets 0, 0x10, 0x20 and 0x30; X and Y are reloaded from the frame
! through the same selector.
! NOTE(review): each selector is presumably 0 or 8 -- confirm at the loop
! head.
.CASE3:
fand %f8,%f44,%f4
add %l3,8,%g1
sub %l0,%o7,%l0
! Lane 1: z = x*x.
fmuld %f10,%f10,%f10
ldd [%l5+%o4],%f34
add %l5,%o4,%l1
fsubd %f0,%f4,%f0
srl %l0,10,%l0
! Lane 2: z = x*x.
fmuld %f20,%f20,%f20
ldd [%l5+%o5],%f36
add %l5,%o5,%l2
fmuld %f10,%f34,%f14
ldd [%l1+0x10],%f16
add %fp,%o4,%o4
faddd %f0,%f2,%f0
andn %l0,0x1f,%l0
fmuld %f20,%f36,%f24
ldd [%l2+0x10],%f26
add %fp,%o5,%o5
faddd %f14,%f16,%f14
ldd [%l1+0x20],%f34
fmuld %f0,%f0,%f2
add %l0,%o3,%l0
faddd %f24,%f26,%f24
ldd [%l2+0x20],%f36
fmuld %f10,%f14,%f14
ldd [%l1+0x30],%f16
fmuld %f2,%f58,%f6
ldd [%l3+%l0],%f32
fmuld %f20,%f24,%f24
ldd [%l2+0x30],%f26
faddd %f14,%f34,%f14
ldd [%o4+x1_1],%f34
faddd %f6,%f56,%f6
fmuld %f2,%f62,%f4
faddd %f24,%f36,%f24
ldd [%o5+x2_1],%f36
fmuld %f10,%f14,%f14
! Spill the lane 1 and lane 2 tails so they can be reloaded through the
! selectors below.
std %f12,[%fp+y1_0]
fmuld %f2,%f6,%f6
faddd %f4,%f60,%f4
fmuld %f20,%f24,%f24
std %f22,[%fp+y2_0]
faddd %f14,%f16,%f14
faddd %f6,%f54,%f6
fmuld %f2,%f4,%f4
ldd [%g1+%l0],%f2
faddd %f24,%f26,%f24
fmuld %f10,%f14,%f14
fmuld %f0,%f6,%f6
ldd [%l4+%l0],%f0
fmuld %f4,%f32,%f4
fmuld %f20,%f24,%f24
fmuld %f6,%f2,%f6
fmuld %f34,%f14,%f14
ldd [%o4+y1_0],%f12
fmuld %f36,%f24,%f24
ldd [%o5+y2_0],%f22
faddd %f6,%f4,%f6
faddd %f14,%f12,%f14
faddd %f24,%f22,%f24
faddd %f6,%f0,%f6
faddd %f34,%f14,%f16
faddd %f36,%f24,%f26
ba,pt %icc,.FIXSIGN
! delay slot: final add for lane 0, result in %f6
faddd %f6,%f32,%f6
.align 32
! .CASE4: lane 0 takes the polynomial path.  This block first finishes
! the dispatch for the other two lanes:
!   lane 1 flagged          -> .CASE6 (condition codes then reflect lane 2)
!   lane 2 flagged only     -> .CASE5
!   neither flagged         -> fall through: lanes 1 and 2 use the tables
!
! Reached from the dispatch code when bit 1 of %l0 is set.  The delay slot
! of that branch has already masked the lane 1 sign word in %f19; the
! lane 2 sign word in %f29 is masked here.
!
! Polynomial path (lane 0), with z = x0*x0 in %f0:
!   w      = ((c0*z + c1)*z + c2)*z + c3
!   result = X + (X*(z*w) + Y)
! c0..c3 come from %l5+%o3 at offsets 0, 0x10, 0x20 and 0x30.  X is
! reloaded from %fp+%o3+x0_1 and Y from %fp+%o3+y0_0.
! NOTE(review): %o3 is presumably 0 or 8 -- confirm at the loop head.
.CASE4:
fands %f29,%f28,%f29 ! if (n & 1) clear sign bit
sethi %hi(0x3fc3c000),%o7
andcc %l1,2,%g0
bne,pn %icc,.CASE6
! delay slot
! This sets the condition codes from lane 2 on both the taken and the
! fall-through path.
andcc %l2,2,%g0
fpadd32s %f10,%f31,%f18
ld [%fp+x1_1],%l1
bne,pn %icc,.CASE5
! delay slot
add %l3,8,%g1
ld [%fp+x2_1],%l2
fpadd32s %f20,%f31,%f28
fand %f18,%f44,%f14
sub %l1,%o7,%l1
fand %f28,%f44,%f24
sub %l2,%o7,%l2
fsubd %f10,%f14,%f10
srl %l1,10,%l1
fsubd %f20,%f24,%f20
srl %l2,10,%l2
! Lane 0: z = x*x, then start the Horner evaluation.
fmuld %f0,%f0,%f0
ldd [%l5+%o3],%f32
add %l5,%o3,%l0
faddd %f10,%f12,%f10
andn %l1,0x1f,%l1
faddd %f20,%f22,%f20
andn %l2,0x1f,%l2
fmuld %f0,%f32,%f4
ldd [%l0+0x10],%f6
! From here on %o3 is a frame-relative address used to select X and Y.
add %fp,%o3,%o3
fmuld %f10,%f10,%f12
add %l1,%o4,%l1
fmuld %f20,%f20,%f22
add %l2,%o5,%l2
faddd %f4,%f6,%f4
ldd [%l0+0x20],%f32
fmuld %f12,%f58,%f16
ldd [%l3+%l1],%f34
fmuld %f22,%f58,%f26
ldd [%l3+%l2],%f36
fmuld %f0,%f4,%f4
ldd [%l0+0x30],%f6
faddd %f16,%f56,%f16
fmuld %f12,%f62,%f14
faddd %f26,%f56,%f26
fmuld %f22,%f62,%f24
faddd %f4,%f32,%f4
ldd [%o3+x0_1],%f32
fmuld %f12,%f16,%f16
faddd %f14,%f60,%f14
fmuld %f22,%f26,%f26
faddd %f24,%f60,%f24
fmuld %f0,%f4,%f4
faddd %f16,%f54,%f16
fmuld %f12,%f14,%f14
ldd [%g1+%l1],%f12
faddd %f26,%f54,%f26
fmuld %f22,%f24,%f24
ldd [%g1+%l2],%f22
faddd %f4,%f6,%f4
fmuld %f10,%f16,%f16
ldd [%l4+%l1],%f10
fmuld %f20,%f26,%f26
ldd [%l4+%l2],%f20
fmuld %f14,%f34,%f14
! Spill the lane 0 tail y so it can be reloaded through the selector.
std %f2,[%fp+y0_0]
fmuld %f24,%f36,%f24
fmuld %f0,%f4,%f4
fmuld %f16,%f12,%f16
fmuld %f26,%f22,%f26
fmuld %f32,%f4,%f4
ldd [%o3+y0_0],%f2
faddd %f16,%f14,%f16
faddd %f26,%f24,%f26
faddd %f4,%f2,%f4
faddd %f16,%f10,%f16
faddd %f26,%f20,%f26
faddd %f32,%f4,%f6
faddd %f16,%f34,%f16
ba,pt %icc,.FIXSIGN
! delay slot: final add for lane 2, result in %f26
faddd %f26,%f36,%f26
.align 32
! .CASE5: lanes 0 and 2 take the polynomial path; lane 1 takes the
! table-driven path.
!
! Reached only from .CASE4, which has already computed %f18 (rounded high
! word of lane 1), loaded %l1 from x1_1, set %o7 to 0x3fc3c000 and, in the
! branch delay slot, set %g1 = %l3+8.
!
! For each polynomial lane, with z = x*x:
!   w      = ((c0*z + c1)*z + c2)*z + c3
!   result = X + (X*(z*w) + Y)
! Coefficients come from %l5 plus the lane selector (%o3 or %o5) at
! offsets 0, 0x10, 0x20 and 0x30; X and Y are reloaded from the frame
! through the same selector.
! NOTE(review): each selector is presumably 0 or 8 -- confirm at the loop
! head.
.CASE5:
fand %f18,%f44,%f14
sub %l1,%o7,%l1
! Lane 0: z = x*x.
fmuld %f0,%f0,%f0
ldd [%l5+%o3],%f32
add %l5,%o3,%l0
fsubd %f10,%f14,%f10
srl %l1,10,%l1
! Lane 2: z = x*x.
fmuld %f20,%f20,%f20
ldd [%l5+%o5],%f36
add %l5,%o5,%l2
fmuld %f0,%f32,%f4
ldd [%l0+0x10],%f6
add %fp,%o3,%o3
faddd %f10,%f12,%f10
andn %l1,0x1f,%l1
fmuld %f20,%f36,%f24
ldd [%l2+0x10],%f26
add %fp,%o5,%o5
faddd %f4,%f6,%f4
ldd [%l0+0x20],%f32
fmuld %f10,%f10,%f12
add %l1,%o4,%l1
faddd %f24,%f26,%f24
ldd [%l2+0x20],%f36
fmuld %f0,%f4,%f4
ldd [%l0+0x30],%f6
fmuld %f12,%f58,%f16
ldd [%l3+%l1],%f34
fmuld %f20,%f24,%f24
ldd [%l2+0x30],%f26
faddd %f4,%f32,%f4
ldd [%o3+x0_1],%f32
faddd %f16,%f56,%f16
fmuld %f12,%f62,%f14
faddd %f24,%f36,%f24
ldd [%o5+x2_1],%f36
fmuld %f0,%f4,%f4
! Spill the lane 0 and lane 2 tails so they can be reloaded through the
! selectors below.
std %f2,[%fp+y0_0]
fmuld %f12,%f16,%f16
faddd %f14,%f60,%f14
fmuld %f20,%f24,%f24
std %f22,[%fp+y2_0]
faddd %f4,%f6,%f4
faddd %f16,%f54,%f16
fmuld %f12,%f14,%f14
ldd [%g1+%l1],%f12
faddd %f24,%f26,%f24
fmuld %f0,%f4,%f4
fmuld %f10,%f16,%f16
ldd [%l4+%l1],%f10
fmuld %f14,%f34,%f14
fmuld %f20,%f24,%f24
fmuld %f16,%f12,%f16
fmuld %f32,%f4,%f4
ldd [%o3+y0_0],%f2
fmuld %f36,%f24,%f24
ldd [%o5+y2_0],%f22
faddd %f16,%f14,%f16
faddd %f4,%f2,%f4
faddd %f24,%f22,%f24
faddd %f16,%f10,%f16
faddd %f32,%f4,%f6
faddd %f36,%f24,%f26
ba,pt %icc,.FIXSIGN
! delay slot: final add for lane 1, result in %f16
faddd %f16,%f34,%f16
.align 32
! .CASE6: lanes 0 and 1 take the polynomial path; lane 2 takes the
! table-driven path, unless lane 2 is also flagged, in which case control
! moves on to .CASE7.
!
! Reached only from .CASE4.  The delay slot of that branch tested bit 1 of
! %l2, so the condition codes on entry already say whether lane 2 is
! flagged; %o7 holds 0x3fc3c000.
!
! For each polynomial lane, with z = x*x:
!   w      = ((c0*z + c1)*z + c2)*z + c3
!   result = X + (X*(z*w) + Y)
! Coefficients come from %l5 plus the lane selector (%o3 or %o4) at
! offsets 0, 0x10, 0x20 and 0x30; X and Y are reloaded from the frame
! through the same selector.
! NOTE(review): each selector is presumably 0 or 8 -- confirm at the loop
! head.
.CASE6:
ld [%fp+x2_1],%l2
add %l3,8,%g1
bne,pn %icc,.CASE7
! delay slot
fpadd32s %f20,%f31,%f28
fand %f28,%f44,%f24
ldd [%l5+%o3],%f32
add %l5,%o3,%l0
! Lane 0: z = x*x.
fmuld %f0,%f0,%f0
sub %l2,%o7,%l2
fsubd %f20,%f24,%f20
srl %l2,10,%l2
! Lane 1: z = x*x.
fmuld %f10,%f10,%f10
ldd [%l5+%o4],%f34
add %l5,%o4,%l1
fmuld %f0,%f32,%f4
ldd [%l0+0x10],%f6
add %fp,%o3,%o3
faddd %f20,%f22,%f20
andn %l2,0x1f,%l2
fmuld %f10,%f34,%f14
ldd [%l1+0x10],%f16
add %fp,%o4,%o4
faddd %f4,%f6,%f4
ldd [%l0+0x20],%f32
fmuld %f20,%f20,%f22
add %l2,%o5,%l2
faddd %f14,%f16,%f14
ldd [%l1+0x20],%f34
fmuld %f0,%f4,%f4
ldd [%l0+0x30],%f6
fmuld %f22,%f58,%f26
ldd [%l3+%l2],%f36
fmuld %f10,%f14,%f14
ldd [%l1+0x30],%f16
faddd %f4,%f32,%f4
ldd [%o3+x0_1],%f32
faddd %f26,%f56,%f26
fmuld %f22,%f62,%f24
faddd %f14,%f34,%f14
ldd [%o4+x1_1],%f34
fmuld %f0,%f4,%f4
! Spill the lane 0 and lane 1 tails so they can be reloaded through the
! selectors below.
std %f2,[%fp+y0_0]
fmuld %f22,%f26,%f26
faddd %f24,%f60,%f24
fmuld %f10,%f14,%f14
std %f12,[%fp+y1_0]
faddd %f4,%f6,%f4
faddd %f26,%f54,%f26
fmuld %f22,%f24,%f24
ldd [%g1+%l2],%f22
faddd %f14,%f16,%f14
fmuld %f0,%f4,%f4
fmuld %f20,%f26,%f26
ldd [%l4+%l2],%f20
fmuld %f24,%f36,%f24
fmuld %f10,%f14,%f14
fmuld %f26,%f22,%f26
fmuld %f32,%f4,%f4
ldd [%o3+y0_0],%f2
fmuld %f34,%f14,%f14
ldd [%o4+y1_0],%f12
faddd %f26,%f24,%f26
faddd %f4,%f2,%f4
faddd %f14,%f12,%f14
faddd %f26,%f20,%f26
faddd %f32,%f4,%f6
faddd %f34,%f14,%f16
ba,pt %icc,.FIXSIGN
! delay slot: final add for lane 2, result in %f26
faddd %f26,%f36,%f26
.align 32
! .CASE7: all three lanes take the polynomial path; no table lookups.
!
! Reached only from .CASE6.
!
! For each lane, with z = x*x:
!   w      = ((c0*z + c1)*z + c2)*z + c3
!   result = X + (X*(z*w) + Y)
! Coefficients come from %l5 plus the lane selector (%o3, %o4, %o5) at
! offsets 0, 0x10, 0x20 and 0x30.  X is reloaded from the frame at
! selector + x0_1, x1_1, x2_1 and Y at selector + y0_0, y1_0, y2_0.
! Results are left in %f6, %f16 and %f26 for .FIXSIGN.
! NOTE(review): each selector is presumably 0 or 8 -- confirm at the loop
! head.
.CASE7:
! z = x*x for each lane and first coefficient.
fmuld %f0,%f0,%f0
ldd [%l5+%o3],%f32
add %l5,%o3,%l0
fmuld %f10,%f10,%f10
ldd [%l5+%o4],%f34
add %l5,%o4,%l1
fmuld %f20,%f20,%f20
ldd [%l5+%o5],%f36
add %l5,%o5,%l2
fmuld %f0,%f32,%f4
ldd [%l0+0x10],%f6
! The selectors become frame-relative addresses from here on.
add %fp,%o3,%o3
fmuld %f10,%f34,%f14
ldd [%l1+0x10],%f16
add %fp,%o4,%o4
fmuld %f20,%f36,%f24
ldd [%l2+0x10],%f26
add %fp,%o5,%o5
faddd %f4,%f6,%f4
ldd [%l0+0x20],%f32
faddd %f14,%f16,%f14
ldd [%l1+0x20],%f34
faddd %f24,%f26,%f24
ldd [%l2+0x20],%f36
fmuld %f0,%f4,%f4
ldd [%l0+0x30],%f6
fmuld %f10,%f14,%f14
ldd [%l1+0x30],%f16
fmuld %f20,%f24,%f24
ldd [%l2+0x30],%f26
faddd %f4,%f32,%f4
ldd [%o3+x0_1],%f32
faddd %f14,%f34,%f14
ldd [%o4+x1_1],%f34
faddd %f24,%f36,%f24
ldd [%o5+x2_1],%f36
fmuld %f0,%f4,%f4
! Spill the three tails so they can be reloaded through the selectors.
std %f2,[%fp+y0_0]
fmuld %f10,%f14,%f14
std %f12,[%fp+y1_0]
fmuld %f20,%f24,%f24
std %f22,[%fp+y2_0]
faddd %f4,%f6,%f4
faddd %f14,%f16,%f14
faddd %f24,%f26,%f24
fmuld %f0,%f4,%f4
fmuld %f10,%f14,%f14
fmuld %f20,%f24,%f24
fmuld %f32,%f4,%f4
ldd [%o3+y0_0],%f2
fmuld %f34,%f14,%f14
ldd [%o4+y1_0],%f12
fmuld %f36,%f24,%f24
ldd [%o5+y2_0],%f22
faddd %f4,%f2,%f4
faddd %f14,%f12,%f14
faddd %f24,%f22,%f24
faddd %f32,%f4,%f6
faddd %f34,%f14,%f16
ba,pt %icc,.FIXSIGN
! delay slot: final add for lane 2, result in %f26
faddd %f36,%f24,%f26
.align 32
! .ENDLOOP2: scalar clean-up for lane 1.  Processes the argument held in
! %f10:%f11, stores the result through %o1, then falls into .ENDLOOP1,
! which does the same for lane 0.
!
! Steps:
!   1. n = round(x * %f40) using the add-then-subtract of %f42; the low
!      word of the intermediate sum is saved at n1.
!   2. x is reduced by subtracting n * %f46, %f48, %f50 and %f52 in turn,
!      carrying the rounding error of each step, which yields a head x in
!      %f10 and a tail y in %f12.
!   3. The sign of x is saved in %f19, x is replaced by |x| and saved at
!      x1_1, and y has the same sign flip applied.
!   4. %o4 = ((n+1) & 1) << 3 selects the threshold word at thresh and the
!      mask word at thresh+4.
!   5. fcmpgt32 compares the threshold with |x|; if bit 1 of the result is
!      set the polynomial path at local label 1 is used, otherwise the
!      table path.
!   6. At local label 2 the final sign is chosen from (n+1) & 2 and ORed
!      into the result.
! NOTE(review): %f40, %f42 and %f46..%f52 are presumably 2/pi, the
! 1.5*2^52 rounding constant and a four-part pi/2 from the constants
! table -- confirm where those registers are loaded.
.ENDLOOP2:
fmuld %f10,%f40,%f12
add %l5,thresh,%g1
faddd %f12,%f42,%f12
st %f13,[%fp+n1]
fsubd %f12,%f42,%f12 ! n
! Subtract n times the first two parts.
fmuld %f12,%f46,%f14
fsubd %f10,%f14,%f14
fmuld %f12,%f48,%f16
fsubd %f14,%f16,%f10
ld [%fp+n1],%o4 ; add %o4,1,%o4
fsubd %f14,%f10,%f34
and %o4,1,%o4
fsubd %f34,%f16,%f34
! Third part, corrected by the error of the previous subtraction.
fmuld %f12,%f50,%f18
sll %o4,3,%o4
fsubd %f18,%f34,%f18
ld [%g1+%o4],%f16
fsubd %f10,%f18,%f14
fsubd %f10,%f14,%f34
add %l5,thresh+4,%o7
fsubd %f34,%f18,%f34
! Fourth part, corrected likewise.
fmuld %f12,%f52,%f12
fsubd %f12,%f34,%f12
ld [%o7+%o4],%f18
fsubd %f14,%f12,%f10 ! x
fsubd %f14,%f10,%f14
fands %f10,%f30,%f19 ! save signbit
fabsd %f10,%f10
std %f10,[%fp+x1_1]
fsubd %f14,%f12,%f12 ! y
fcmpgt32 %f16,%f10,%l1
fxors %f12,%f19,%f12
fands %f19,%f18,%f19 ! if (n & 1) clear sign bit
andcc %l1,2,%g0
bne,pn %icc,1f
! delay slot
nop
! Table path: split off the rounded high part of |x|, add the tail back,
! and index the tables with ((hx - 0x3fc3c000) >> 10) & ~0x1f plus %o4.
fpadd32s %f10,%f31,%f18
ld [%fp+x1_1],%l1
fand %f18,%f44,%f14
sethi %hi(0x3fc3c000),%o7
add %l3,8,%g1
fsubd %f10,%f14,%f10
sub %l1,%o7,%l1
srl %l1,10,%l1
faddd %f10,%f12,%f10
andn %l1,0x1f,%l1
fmuld %f10,%f10,%f12
add %l1,%o4,%l1
fmuld %f12,%f58,%f16
ldd [%l3+%l1],%f34
faddd %f16,%f56,%f16
fmuld %f12,%f62,%f14
fmuld %f12,%f16,%f16
faddd %f14,%f60,%f14
faddd %f16,%f54,%f16
fmuld %f12,%f14,%f14
ldd [%g1+%l1],%f12
fmuld %f10,%f16,%f16
ldd [%l4+%l1],%f10
fmuld %f14,%f34,%f14
fmuld %f16,%f12,%f16
faddd %f16,%f14,%f16
faddd %f16,%f10,%f16
ba,pt %icc,2f
! delay slot: final add, result in %f16
faddd %f16,%f34,%f16
1:
! Polynomial path: z = x*x, w = ((c0*z + c1)*z + c2)*z + c3 with the
! coefficients at %l5+%o4 + 0, 0x10, 0x20, 0x30; result = X + (X*(z*w) + Y)
! with X and Y reloaded from the frame through the selector in %o4.
fmuld %f10,%f10,%f10
ldd [%l5+%o4],%f34
add %l5,%o4,%l1
fmuld %f10,%f34,%f14
ldd [%l1+0x10],%f16
add %fp,%o4,%o4
faddd %f14,%f16,%f14
ldd [%l1+0x20],%f34
fmuld %f10,%f14,%f14
ldd [%l1+0x30],%f16
faddd %f14,%f34,%f14
ldd [%o4+x1_1],%f34
fmuld %f10,%f14,%f14
std %f12,[%fp+y1_0]
faddd %f14,%f16,%f14
fmuld %f10,%f14,%f14
fmuld %f34,%f14,%f14
ldd [%o4+y1_0],%f12
faddd %f14,%f12,%f14
faddd %f34,%f14,%f16
2:
! Choose the final sign from (n+1) & 2 and store the result through %o1.
add %l5,thresh-4,%g1
ld [%fp+n1],%o4 ; add %o4,1,%o4
and %o4,2,%o4
sll %o4,2,%o4
ld [%g1+%o4],%f18
fxors %f19,%f18,%f19
fors %f16,%f19,%f16 ! tack on sign
st %f16,[%o1]
st %f17,[%o1+4]
! .ENDLOOP1: scalar clean-up for lane 0.  Processes the argument held in
! %f0:%f1, stores the result through %o0, then falls into .ENDLOOP0.
! It is entered either by fall-through from .ENDLOOP2 or by a direct
! branch.
!
! Steps:
!   1. n = round(x * %f40) using the add-then-subtract of %f42; the low
!      word of the intermediate sum is saved at n0.
!   2. x is reduced by subtracting n * %f46, %f48, %f50 and %f52 in turn,
!      carrying the rounding error of each step, which yields a head x in
!      %f0 and a tail y in %f2.
!   3. The sign of x is saved in %f9, x is replaced by |x| and saved at
!      x0_1, and y has the same sign flip applied.
!   4. %o3 = ((n+1) & 1) << 3 selects the threshold word at thresh and the
!      mask word at thresh+4.
!   5. fcmpgt32 compares the threshold with |x|; if bit 1 of the result is
!      set the polynomial path at local label 1 is used, otherwise the
!      table path.
!   6. At local label 2 the final sign is chosen from (n+1) & 2 and ORed
!      into the result.
! NOTE(review): %f40, %f42 and %f46..%f52 are presumably 2/pi, the
! 1.5*2^52 rounding constant and a four-part pi/2 from the constants
! table -- confirm where those registers are loaded.
.ENDLOOP1:
fmuld %f0,%f40,%f2
add %l5,thresh,%g1
faddd %f2,%f42,%f2
st %f3,[%fp+n0]
fsubd %f2,%f42,%f2 ! n
! Subtract n times the first two parts.
fmuld %f2,%f46,%f4
fsubd %f0,%f4,%f4
fmuld %f2,%f48,%f6
fsubd %f4,%f6,%f0
ld [%fp+n0],%o3 ; add %o3,1,%o3
fsubd %f4,%f0,%f32
and %o3,1,%o3
fsubd %f32,%f6,%f32
! Third part, corrected by the error of the previous subtraction.
fmuld %f2,%f50,%f8
sll %o3,3,%o3
fsubd %f8,%f32,%f8
ld [%g1+%o3],%f6
fsubd %f0,%f8,%f4
fsubd %f0,%f4,%f32
add %l5,thresh+4,%o7
fsubd %f32,%f8,%f32
! Fourth part, corrected likewise.
fmuld %f2,%f52,%f2
fsubd %f2,%f32,%f2
ld [%o7+%o3],%f8
fsubd %f4,%f2,%f0 ! x
fsubd %f4,%f0,%f4
fands %f0,%f30,%f9 ! save signbit
fabsd %f0,%f0
std %f0,[%fp+x0_1]
fsubd %f4,%f2,%f2 ! y
fcmpgt32 %f6,%f0,%l0
fxors %f2,%f9,%f2
fands %f9,%f8,%f9 ! if (n & 1) clear sign bit
andcc %l0,2,%g0
bne,pn %icc,1f
! delay slot
nop
! Table path: split off the rounded high part of |x|, add the tail back,
! and index the tables with ((hx - 0x3fc3c000) >> 10) & ~0x1f plus %o3.
fpadd32s %f0,%f31,%f8
ld [%fp+x0_1],%l0
fand %f8,%f44,%f4
sethi %hi(0x3fc3c000),%o7
add %l3,8,%g1
fsubd %f0,%f4,%f0
sub %l0,%o7,%l0
srl %l0,10,%l0
faddd %f0,%f2,%f0
andn %l0,0x1f,%l0
fmuld %f0,%f0,%f2
add %l0,%o3,%l0
fmuld %f2,%f58,%f6
ldd [%l3+%l0],%f32
faddd %f6,%f56,%f6
fmuld %f2,%f62,%f4
fmuld %f2,%f6,%f6
faddd %f4,%f60,%f4
faddd %f6,%f54,%f6
fmuld %f2,%f4,%f4
ldd [%g1+%l0],%f2
fmuld %f0,%f6,%f6
ldd [%l4+%l0],%f0
fmuld %f4,%f32,%f4
fmuld %f6,%f2,%f6
faddd %f6,%f4,%f6
faddd %f6,%f0,%f6
ba,pt %icc,2f
! delay slot: final add, result in %f6
faddd %f6,%f32,%f6
1:
! Polynomial path: z = x*x, w = ((c0*z + c1)*z + c2)*z + c3 with the
! coefficients at %l5+%o3 + 0, 0x10, 0x20, 0x30; result = X + (X*(z*w) + Y)
! with X and Y reloaded from the frame through the selector in %o3.
fmuld %f0,%f0,%f0
ldd [%l5+%o3],%f32
add %l5,%o3,%l0
fmuld %f0,%f32,%f4
ldd [%l0+0x10],%f6
add %fp,%o3,%o3
faddd %f4,%f6,%f4
ldd [%l0+0x20],%f32
fmuld %f0,%f4,%f4
ldd [%l0+0x30],%f6
faddd %f4,%f32,%f4
ldd [%o3+x0_1],%f32
fmuld %f0,%f4,%f4
std %f2,[%fp+y0_0]
faddd %f4,%f6,%f4
fmuld %f0,%f4,%f4
fmuld %f32,%f4,%f4
ldd [%o3+y0_0],%f2
faddd %f4,%f2,%f4
faddd %f32,%f4,%f6
2:
! Choose the final sign from (n+1) & 2 and store the result through %o0.
add %l5,thresh-4,%g1
ld [%fp+n0],%o3 ; add %o3,1,%o3
and %o3,2,%o3
sll %o3,2,%o3
ld [%g1+%o3],%f8
fxors %f9,%f8,%f9
fors %f6,%f9,%f6 ! tack on sign
st %f6,[%o0]
st %f7,[%o0+4]
! .ENDLOOP0: end of the main pass.  If the flag in LIM_l6 is zero there is
! nothing left to do and the routine returns.  Otherwise at least one
! argument was deferred by the .BIGn handlers, so the original call
! arguments are reloaded from the frame and __vlibm_vcos_big is called as
!   __vlibm_vcos_big(nsave, xsave, sxsave, ysave, sysave, %l7)
! with the sixth argument set in the delay slot of the call.  After the
! call returns, execution falls into .exit.
.ENDLOOP0:
! check for huge arguments remaining
tst LIM_l6
be,pt %icc,.exit
! delay slot
nop
! ========== huge range (use C code) ==========
! The two pointers are loaded with ldx (64-bit) on V9 and ld otherwise.
#ifdef __sparcv9
ldx [%fp+xsave],%o1
ldx [%fp+ysave],%o3
#else
ld [%fp+xsave],%o1
ld [%fp+ysave],%o3
#endif
! The count and the two strides were saved as 32-bit words.
ld [%fp+nsave],%o0
ld [%fp+sxsave],%o2
ld [%fp+sysave],%o4
sra %o2,0,%o2 ! sign-extend for V9
sra %o4,0,%o4
call __vlibm_vcos_big
mov %l7,%o5 ! delay slot
.exit:
! Return to the caller; the restore in the delay slot pops the register
! window.
ret
restore
.align 32
! .SKIP0: drop the current lane 0 element and refill lane 0 from the
! values already staged for lane 1.
! NOTE(review): presumably entered when the lane 0 argument needs no
! further work here -- confirm at the branch site in the loop head.
!
! The count is decremented; if nothing remains control goes to .ENDLOOP0.
! Otherwise hx is taken from %l1 with the sign bit cleared, the high word
! of x is copied from %f10, the low word is loaded from [%i1+4], x is
! advanced and the loop restarts at .LOOP0.  y is advanced on both paths
! because the add sits in the non-annulled delay slot.
.SKIP0:
addcc %i0,-1,%i0
ble,pn %icc,.ENDLOOP0
! delay slot, harmless if branch taken
add %i3,%i4,%i3 ! y += stridey
andn %l1,%i5,%l0 ! hx &= ~0x80000000
fmovs %f10,%f0
ld [%i1+4],%f1
ba,pt %icc,.LOOP0
! delay slot
add %i1,%i2,%i1 ! x += stridex
.align 32
! .SKIP1: drop the current lane 1 element and refill lane 1 from the
! values already staged for lane 2.
! NOTE(review): presumably entered when the lane 1 argument needs no
! further work here -- confirm at the branch site in the loop head.
!
! The count is decremented; if nothing remains control goes to .ENDLOOP1
! so the pending lane 0 element is still finished.  Otherwise hx is taken
! from %l2 with the sign bit cleared, the high word of x is copied from
! %f20, the low word is loaded from [%i1+4], x is advanced and the loop
! resumes at .LOOP1.  y is advanced on both paths because the add sits in
! the non-annulled delay slot.
.SKIP1:
addcc %i0,-1,%i0
ble,pn %icc,.ENDLOOP1
! delay slot, harmless if branch taken
add %i3,%i4,%i3 ! y += stridey
andn %l2,%i5,%l1 ! hx &= ~0x80000000
fmovs %f20,%f10
ld [%i1+4],%f11
ba,pt %icc,.LOOP1
! delay slot
add %i1,%i2,%i1 ! x += stridex
.align 32
! .SKIP2: drop the current lane 2 element and load a fresh one.
! NOTE(review): presumably entered when the lane 2 argument needs no
! further work here -- confirm at the branch site in the loop head.
!
! The count is decremented; if nothing remains control goes to .ENDLOOP2
! so the pending lane 1 and lane 0 elements are still finished.  There is
! no later lane to borrow from, so the next x is read from memory: the
! high word into both %l2 and %f20, the low word into %f21.  x is then
! advanced and the loop resumes at .LOOP2.  y is advanced on both paths
! because the add sits in the non-annulled delay slot.
.SKIP2:
addcc %i0,-1,%i0
ble,pn %icc,.ENDLOOP2
! delay slot, harmless if branch taken
add %i3,%i4,%i3 ! y += stridey
ld [%i1],%l2
ld [%i1],%f20
ld [%i1+4],%f21
andn %l2,%i5,%l2 ! hx &= ~0x80000000
ba,pt %icc,.LOOP2
! delay slot
add %i1,%i2,%i1 ! x += stridex
.align 32
! .BIG0: lane 0 argument is too large for the main path.
!
! hx in %l0 is compared with 0x7ff00000:
!   hx <  0x7ff00000 : the annulled delay slot copies %l7 into LIM_l6, which
!                      .ENDLOOP0 later tests to decide whether to call
!                      __vlibm_vcos_big; nothing is stored now.
!   hx >= 0x7ff00000 : the result x - x is computed and stored through %o0.
!
! Afterwards the count is decremented and lane 0 is refilled from the
! values staged for lane 1 (%l1 and %f10:%f11), x is advanced and the loop
! restarts at .LOOP0, or control goes to .ENDLOOP0 if nothing remains.
.BIG0:
sethi %hi(0x7ff00000),%o7
cmp %l0,%o7
bl,a,pt %icc,1f ! if hx < 0x7ff00000
! delay slot, annulled if branch not taken
mov %l7,LIM_l6 ! set biguns flag or
fsubd %f0,%f0,%f0 ! y = x - x
st %f0,[%o0]
st %f1,[%o0+4]
1:
addcc %i0,-1,%i0
ble,pn %icc,.ENDLOOP0
! delay slot, harmless if branch taken
andn %l1,%i5,%l0 ! hx &= ~0x80000000
fmovd %f10,%f0
ba,pt %icc,.LOOP0
! delay slot
add %i1,%i2,%i1 ! x += stridex
.align 32
! .BIG1: lane 1 argument is too large for the main path.
!
! hx in %l1 is compared with 0x7ff00000:
!   hx <  0x7ff00000 : the annulled delay slot copies %l7 into LIM_l6, which
!                      .ENDLOOP0 later tests to decide whether to call
!                      __vlibm_vcos_big; nothing is stored now.
!   hx >= 0x7ff00000 : the result x - x is computed and stored through %o1.
!
! Afterwards the count is decremented and lane 1 is refilled from the
! values staged for lane 2 (%l2 and %f20:%f21), x is advanced and the loop
! resumes at .LOOP1, or control goes to .ENDLOOP1 if nothing remains.
.BIG1:
sethi %hi(0x7ff00000),%o7
cmp %l1,%o7
bl,a,pt %icc,1f ! if hx < 0x7ff00000
! delay slot, annulled if branch not taken
mov %l7,LIM_l6 ! set biguns flag or
fsubd %f10,%f10,%f10 ! y = x - x
st %f10,[%o1]
st %f11,[%o1+4]
1:
addcc %i0,-1,%i0
ble,pn %icc,.ENDLOOP1
! delay slot, harmless if branch taken
andn %l2,%i5,%l1 ! hx &= ~0x80000000
fmovd %f20,%f10
ba,pt %icc,.LOOP1
! delay slot
add %i1,%i2,%i1 ! x += stridex
.align 32
! .BIG2: lane 2 argument is too large for the main path.
!
! hx in %l2 is compared with 0x7ff00000:
!   hx <  0x7ff00000 : the annulled delay slot copies %l7 into LIM_l6, which
!                      .ENDLOOP0 later tests to decide whether to call
!                      __vlibm_vcos_big; nothing is stored now.
!   hx >= 0x7ff00000 : the result x - x is computed and stored through %o2.
!
! Afterwards the count is decremented.  There is no later lane to borrow
! from, so the next x is read from memory (high word into %l2 and %f20,
! low word into %f21), x is advanced and the loop resumes at .LOOP2, or
! control goes to .ENDLOOP2 if nothing remains.
.BIG2:
sethi %hi(0x7ff00000),%o7
cmp %l2,%o7
bl,a,pt %icc,1f ! if hx < 0x7ff00000
! delay slot, annulled if branch not taken
mov %l7,LIM_l6 ! set biguns flag or
fsubd %f20,%f20,%f20 ! y = x - x
st %f20,[%o2]
st %f21,[%o2+4]
1:
addcc %i0,-1,%i0
ble,pn %icc,.ENDLOOP2
! delay slot
nop
ld [%i1],%l2
ld [%i1],%f20
ld [%i1+4],%f21
andn %l2,%i5,%l2 ! hx &= ~0x80000000
ba,pt %icc,.LOOP2
! delay slot
add %i1,%i2,%i1 ! x += stridex
SET_SIZE(__vcos)