common/vis/__vlog.S

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

    .file   "__vlog.S"

#include "libm.h"

    RO_DATA
    .align  32
! TBL: 32 entries of 16 bytes each (byte offsets 0x000-0x1ff).  Every entry
! is a pair of doubles, each written as two .word values, high word first.
! __vlog indexes the table with j = ((ix + 0x94000) >> 11) & 0x1f0; the
! first double of the entry is added to n * ln2hi (giving h) and the second
! to n * ln2lo (giving t).
! NOTE(review): the values look like log(v) split into a high part and a
! low correction for each rounded mantissa v, with the all-zero entry
! corresponding to v == 1 - confirm against the table generator.
TBL:
    .word   0xbfd522ae, 0x0738a000
    .word   0xbd2ebe70, 0x8164c759
    .word   0xbfd3c252, 0x77333000
    .word   0xbd183b54, 0xb606bd5c
    .word   0xbfd26962, 0x1134e000
    .word   0x3d31b61f, 0x10522625
    .word   0xbfd1178e, 0x8227e000
    .word   0xbd31ef78, 0xce2d07f2
    .word   0xbfcf991c, 0x6cb3c000
    .word   0x3d390d04, 0xcd7cc834
    .word   0xbfcd1037, 0xf2656000
    .word   0x3d084a7e, 0x75b6f6e4
    .word   0xbfca93ed, 0x3c8ae000
    .word   0x3d287243, 0x50562169
    .word   0xbfc823c1, 0x6551a000
    .word   0xbd1e0ddb, 0x9a631e83
    .word   0xbfc5bf40, 0x6b544000
    .word   0x3d127023, 0xeb68981c
    .word   0xbfc365fc, 0xb015a000
    .word   0x3d3fd3a0, 0xafb9691b
    .word   0xbfc1178e, 0x8227e000
    .word   0xbd21ef78, 0xce2d07f2
    .word   0xbfbda727, 0x63844000
    .word   0xbd1a8940, 0x1fa71733
    .word   0xbfb9335e, 0x5d594000
    .word   0xbd23115c, 0x3abd47da
    .word   0xbfb4d311, 0x5d208000
    .word   0x3cf53a25, 0x82f4e1ef
    .word   0xbfb08598, 0xb59e4000
    .word   0x3d17e5dd, 0x7009902c
    .word   0xbfa894aa, 0x149f8000
    .word   0xbd39a19a, 0x8be97661
    .word   0xbfa0415d, 0x89e78000
    .word   0x3d3dddc7, 0xf461c516
    .word   0xbf902056, 0x58930000
    .word   0xbd3611d2, 0x7c8e8417
    .word   0x00000000, 0x00000000
    .word   0x00000000, 0x00000000
    .word   0x3f9f829b, 0x0e780000
    .word   0x3d298026, 0x7c7e09e4
    .word   0x3faf0a30, 0xc0110000
    .word   0x3d48a998, 0x5f325c5c
    .word   0x3fb6f0d2, 0x8ae58000
    .word   0xbd34b464, 0x1b664613
    .word   0x3fbe2707, 0x6e2b0000
    .word   0xbd2a342c, 0x2af0003c
    .word   0x3fc29552, 0xf8200000
    .word   0xbd35b967, 0xf4471dfc
    .word   0x3fc5ff30, 0x70a78000
    .word   0x3d43d3c8, 0x73e20a07
    .word   0x3fc9525a, 0x9cf44000
    .word   0x3d46b476, 0x41307539
    .word   0x3fcc8ff7, 0xc79a8000
    .word   0x3d4a21ac, 0x25d81ef3
    .word   0x3fcfb918, 0x6d5e4000
    .word   0xbd0d572a, 0xab993c87
    .word   0x3fd1675c, 0xababa000
    .word   0x3d38380e, 0x731f55c4
    .word   0x3fd2e8e2, 0xbae12000
    .word   0xbd267b1e, 0x99b72bd8
    .word   0x3fd4618b, 0xc21c6000
    .word   0xbd13d82f, 0x484c84cc
    .word   0x3fd5d1bd, 0xbf580000
    .word   0x3d4394a1, 0x1b1c1ee4
! constants:
! These follow the table directly, so the first one sits at TBL + 0x200.
! The order here must match the offset defines that come after this data.
    .word   0x40000000,0x00000000   ! 2.0
    .word   0x3fe55555,0x555571da   ! polynomial coefficient, added last
    .word   0x3fd99999,0x8702be3a   ! polynomial coefficient, added second
    .word   0x3fd24af7,0x3f4569b1   ! polynomial coefficient, multiplies z first
    .word   0x3ea62e42,0xfee00000   ! scaled by 2**-20
    .word   0x3caa39ef,0x35793c76   ! scaled by 2**-20
    .word   0xffff8000,0x00000000   ! 64-bit mask ANDed with v
    .word   0x43200000              ! subtracted from n after the fxtod rescale
    .word   0xfff00000              ! mask for n; also the high word of -inf
    .word   0xc0194000              ! added to ix to form n
    .word   0x4000                  ! added to the high word of u to form v

! Byte offsets from TBL of the constants stored after the 32-entry table
! (the table itself occupies 0x000-0x1ff).  The first seven are 8-byte
! values read with ldd; the last four are 4-byte words read with ld.
#define two     0x200
#define A1      0x208
#define A2      0x210
#define A3      0x218
#define ln2hi       0x220
#define ln2lo       0x228
#define mask        0x230
#define ox43200000  0x238
#define oxfff00000  0x23c
#define oxc0194000  0x240
#define ox4000      0x244

! local storage indices
! These are offsets from %fp.  The three output pointers start out aimed at
! the first slot, so results stored before the pipeline has filled land in
! scratch space.  The other three slots carry (2.0 - v) - u for register
! slots 0, 1 and 2 from one pass of the unrolled loop to a later one.

#define jnk     STACK_BIAS-0x8
#define tmp2        STACK_BIAS-0x10
#define tmp1        STACK_BIAS-0x18
#define tmp0        STACK_BIAS-0x20
! sizeof temp storage - must be a multiple of 16 for V9
#define tmps        0x20

! register use

! i0  n
! i1  x
! i2  stridex
! i3  y
! i4  stridey
! i5

! g1  TBL

! l0  j0
! l1  j1
! l2  j2
! l3
! l4  0x94000
! l5
! l6  0x000fffff
! l7  0x7ff00000

! o0  py0
! o1  py1
! o2  py2
! o3
! o4
! o5
! o7

! f0  u0,q0
! f2  v0,(two-v0)-u0,z0
! f4  n0,f0,q0
! f6  s0
! f8  q
! f10 u1,q1
! f12 v1,(two-v1)-u1,z1
! f14 n1,f1,q1
! f16 s1
! f18 t
! f20 u2,q2
! f22 v2,(two-v2)-u2,z2
! f24 n2,f2,q2
! f26 s2
! f28 0xfff00000
! f29 0x43200000
! f30 0x4000
! f31 0xc0194000
! f32 t0
! f34 h0,f0-(c0-h0)
! f36 c0
! f38 A1
! f40 two
! f42 t1
! f44 h1,f1-(c1-h1)
! f46 c1
! f48 A2
! f50 0xffff8000...
! f52 t2
! f54 h2,f2-(c2-h2)
! f56 c2
! f58 A3
! f60 ln2hi
! f62 ln2lo

    ENTRY(__vlog)
! Computes y[i*stridey] = log(x[i*stridex]) for i = 0 .. n-1.
! Incoming registers: i0 = n, i1 = x, i2 = stridex, i3 = y, i4 = stridey.
! Strides are counted in 8-byte elements (they are shifted left by 3 below).
! NOTE(review): presumably n and the strides are int and x, y are double
! pointers - confirm against the C prototype, which is not in this file.
! NOTE(review): x[0] is read below with an ordinary faulting load before n
! is tested, so callers are presumably expected to pass n >= 1 - confirm.
!
! The main loop is unrolled three ways (.loop0, .loop1, .loop2) and software
! pipelined.  Each pass starts one new argument in its own register slot
! and, interleaved with that, continues the work of the arguments started
! in the two preceding passes.  The result for an argument that entered at
! .loopK is stored the next time .loopK runs, through the output pointer
! saved for that slot (o0, o1 or o2).
!
! Per argument (as annotated on the instructions):
!   n = (ix + 0xc0194000) & 0xfff00000,  u = x with n removed from ix
!   v = (u + 0x4000) & 0xffff8000...,    f = u - v,  s = f / (u + v)
!   h = n * ln2hi + TBL[j],  t = n * ln2lo + TBL[j+1],  c = h + f
!   y = c + (t + q)
    save    %sp,-SA(MINFRAME)-tmps,%sp
    PIC_SETUP(l7)
    PIC_SET(l7,TBL,o0)
    mov %o0,%g1
    wr  %g0,0x82,%asi       ! set %asi for non-faulting loads
! integer constants used by the range test and the table index
    sethi   %hi(0x94000),%l4
    sethi   %hi(0x000fffff),%l6
    or  %l6,%lo(0x000fffff),%l6
    sethi   %hi(0x7ff00000),%l7
! floating point constants, loaded from the area that follows TBL
    ldd [%g1+two],%f40
    ldd [%g1+A1],%f38
    ldd [%g1+A2],%f48
    ldd [%g1+A3],%f58
    ldd [%g1+ln2hi],%f60
    ldd [%g1+ln2lo],%f62
    ldd [%g1+mask],%f50
    ld  [%g1+ox43200000],%f29
    ld  [%g1+oxfff00000],%f28
    ld  [%g1+oxc0194000],%f31
    ld  [%g1+ox4000],%f30
    sll %i2,3,%i2       ! scale strides
    sll %i4,3,%i4
! Point all three output pointers at the scratch slot so that the stores
! issued before any real result exists are harmless.
    add %fp,jnk,%o0     ! precondition loop
    add %fp,jnk,%o1
    add %fp,jnk,%o2
! Zero the registers and spill slots that the first passes read as values
! from earlier iterations.
    fzero   %f2
    fzero   %f6
    fzero   %f18
    fzero   %f36
    fzero   %f12
    fzero   %f14
    fzero   %f16
    fzero   %f42
    fzero   %f44
    fzero   %f46
    std %f46,[%fp+tmp1]
    fzero   %f24
    fzero   %f26
    fzero   %f52
    fzero   %f54
    std %f54,[%fp+tmp2]
! y is advanced before it is captured in each pass, so start one stride back
    sub %i3,%i4,%i3
    ld  [%i1],%l0       ! ix
    ld  [%i1],%f0       ! u.l[0] = *x
    ba  .loop0
    ld  [%i1+4],%f1     ! u.l[1] = *(1+x)

    .align  16
! -- 16 byte aligned
! Slot 0 pass: the argument is in l0 (high word) and f0/f1.
.loop0:
! (ix - 0x7ff00000) & (0x000fffff - ix) has its sign bit set only when ix
! lies strictly between the two bounds; otherwise branch to .range0
    sub %l0,%l7,%o3
    sub %l6,%l0,%o4
    fpadd32s %f0,%f31,%f4       ! n = (ix + 0xc0194000) & 0xfff00000
    fmuld   %f6,%f2,%f8     ! (previous iteration)

    andcc   %o3,%o4,%o4
    bge,pn  %icc,.range0        ! ix <= 0x000fffff or >= 0x7ff00000
! delay slot
    fands   %f4,%f28,%f4

    add %i1,%i2,%i1     ! x += stridex
    add %i3,%i4,%i3     ! y += stridey
    fpsub32s %f0,%f4,%f0        ! u.l[0] -= n

! .range0 rejoins here for rescaled subnormals, with x and y already advanced
.cont0:
    lda [%i1]%asi,%l1       ! preload next argument
    add %l0,%l4,%l0     ! j = ix + 0x94000
    fpadd32s %f0,%f30,%f2       ! v.l[0] = u.l[0] + 0x4000

    lda [%i1]%asi,%f10
    srl %l0,11,%l0      ! j = (j >> 11) & 0x1f0
    fand    %f2,%f50,%f2        ! v.l &= 0xffff8000...

    lda [%i1+4]%asi,%f11
    and %l0,0x1f0,%l0
    fitod   %f4,%f32        ! (double) n

    add %l0,8,%l3
    fsubd   %f0,%f2,%f4     ! f = u.d - v.d

    faddd   %f0,%f2,%f6     ! s = f / (u.d + v.d)

    fsubd   %f40,%f2,%f2        ! two - v.d
    fmuld   %f32,%f60,%f34      ! h = n * ln2hi + TBL[j]

    faddd   %f8,%f18,%f8        ! y = c + (t + q)
    fmuld   %f32,%f62,%f32      ! t = n * ln2lo + TBL[j+1]

    fdivd   %f4,%f6,%f6

    faddd   %f54,%f24,%f56      ! c = h + f
    fmuld   %f26,%f26,%f22      ! z = s * s

! store the finished result of the previous slot 0 argument, then remember
! where the result of the current one has to go
    faddd   %f8,%f36,%f8
    st  %f8,[%o0]

    st  %f9,[%o0+4]
    mov %i3,%o0
    faddd   %f14,%f38,%f14

    fsubd   %f56,%f54,%f54      ! t += f - (c - h)
    fmuld   %f22,%f58,%f20      ! q = ...

    fsubd   %f2,%f0,%f2     ! (two - v.d) - u.d
    ldd [%g1+%l0],%f36

    faddd   %f42,%f44,%f18
    fmuld   %f12,%f14,%f14
    ldd [%fp+tmp1],%f12

    faddd   %f20,%f48,%f20
    nop

    faddd   %f34,%f36,%f34
    ldd [%g1+%l3],%f0

    faddd   %f14,%f12,%f12

    fsubd   %f24,%f54,%f54
    fmuld   %f22,%f20,%f24

! spill (two - v) - u for slot 0; it is reloaded from tmp0 in the slot 2 pass
    std %f2,[%fp+tmp0]
    addcc   %i0,-1,%i0
    ble,pn  %icc,.endloop0
! delay slot
    faddd   %f32,%f0,%f32

! -- 16 byte aligned
! Slot 1 pass: same schedule as .loop0 with the register roles rotated;
! the argument is in l1 and f10/f11 and the result pointer is o1.
.loop1:
    sub %l1,%l7,%o3
    sub %l6,%l1,%o4
    fpadd32s %f10,%f31,%f14     ! n = (ix + 0xc0194000) & 0xfff00000
    fmuld   %f16,%f12,%f8       ! (previous iteration)

    andcc   %o3,%o4,%o4
    bge,pn  %icc,.range1        ! ix <= 0x000fffff or >= 0x7ff00000
! delay slot
    fands   %f14,%f28,%f14

    add %i1,%i2,%i1     ! x += stridex
    add %i3,%i4,%i3     ! y += stridey
    fpsub32s %f10,%f14,%f10     ! u.l[0] -= n

.cont1:
    lda [%i1]%asi,%l2       ! preload next argument
    add %l1,%l4,%l1     ! j = ix + 0x94000
    fpadd32s %f10,%f30,%f12     ! v.l[0] = u.l[0] + 0x4000

    lda [%i1]%asi,%f20
    srl %l1,11,%l1      ! j = (j >> 11) & 0x1f0
    fand    %f12,%f50,%f12      ! v.l &= 0xffff8000...

    lda [%i1+4]%asi,%f21
    and %l1,0x1f0,%l1
    fitod   %f14,%f42       ! (double) n

    add %l1,8,%l3
    fsubd   %f10,%f12,%f14      ! f = u.d - v.d

    faddd   %f10,%f12,%f16      ! s = f / (u.d + v.d)

    fsubd   %f40,%f12,%f12      ! two - v.d
    fmuld   %f42,%f60,%f44      ! h = n * ln2hi + TBL[j]

    faddd   %f8,%f18,%f8        ! y = c + (t + q)
    fmuld   %f42,%f62,%f42      ! t = n * ln2lo + TBL[j+1]

    fdivd   %f14,%f16,%f16

    faddd   %f34,%f4,%f36       ! c = h + f
    fmuld   %f6,%f6,%f2     ! z = s * s

    faddd   %f8,%f46,%f8
    st  %f8,[%o1]

    st  %f9,[%o1+4]
    mov %i3,%o1
    faddd   %f24,%f38,%f24

    fsubd   %f36,%f34,%f34      ! t += f - (c - h)
    fmuld   %f2,%f58,%f0        ! q = ...

    fsubd   %f12,%f10,%f12      ! (two - v.d) - u.d
    ldd [%g1+%l1],%f46

    faddd   %f52,%f54,%f18
    fmuld   %f22,%f24,%f24
    ldd [%fp+tmp2],%f22

    faddd   %f0,%f48,%f0
    nop

    faddd   %f44,%f46,%f44
    ldd [%g1+%l3],%f10

    faddd   %f24,%f22,%f22

    fsubd   %f4,%f34,%f34
    fmuld   %f2,%f0,%f4

    std %f12,[%fp+tmp1]
    addcc   %i0,-1,%i0
    ble,pn  %icc,.endloop1
! delay slot
    faddd   %f42,%f10,%f42

! -- 16 byte aligned
! Slot 2 pass: same schedule again; the argument is in l2 and f20/f21 and
! the result pointer is o2.  The next argument is preloaded back into the
! slot 0 registers.
.loop2:
    sub %l2,%l7,%o3
    sub %l6,%l2,%o4
    fpadd32s %f20,%f31,%f24     ! n = (ix + 0xc0194000) & 0xfff00000
    fmuld   %f26,%f22,%f8       ! (previous iteration)

    andcc   %o3,%o4,%o4
    bge,pn  %icc,.range2        ! ix <= 0x000fffff or >= 0x7ff00000
! delay slot
    fands   %f24,%f28,%f24

    add %i1,%i2,%i1     ! x += stridex
    add %i3,%i4,%i3     ! y += stridey
    fpsub32s %f20,%f24,%f20     ! u.l[0] -= n

.cont2:
    lda [%i1]%asi,%l0       ! preload next argument
    add %l2,%l4,%l2     ! j = ix + 0x94000
    fpadd32s %f20,%f30,%f22     ! v.l[0] = u.l[0] + 0x4000

    lda [%i1]%asi,%f0
    srl %l2,11,%l2      ! j = (j >> 11) & 0x1f0
    fand    %f22,%f50,%f22      ! v.l &= 0xffff8000...

    lda [%i1+4]%asi,%f1
    and %l2,0x1f0,%l2
    fitod   %f24,%f52       ! (double) n

    add %l2,8,%l3
    fsubd   %f20,%f22,%f24      ! f = u.d - v.d

    faddd   %f20,%f22,%f26      ! s = f / (u.d + v.d)

    fsubd   %f40,%f22,%f22      ! two - v.d
    fmuld   %f52,%f60,%f54      ! h = n * ln2hi + TBL[j]

    faddd   %f8,%f18,%f8        ! y = c + (t + q)
    fmuld   %f52,%f62,%f52      ! t = n * ln2lo + TBL[j+1]

    fdivd   %f24,%f26,%f26

    faddd   %f44,%f14,%f46      ! c = h + f
    fmuld   %f16,%f16,%f12      ! z = s * s

    faddd   %f8,%f56,%f8
    st  %f8,[%o2]

    st  %f9,[%o2+4]
    mov %i3,%o2
    faddd   %f4,%f38,%f4

    fsubd   %f46,%f44,%f44      ! t += f - (c - h)
    fmuld   %f12,%f58,%f10      ! q = ...

    fsubd   %f22,%f20,%f22      ! (two - v.d) - u.d
    ldd [%g1+%l2],%f56

    faddd   %f32,%f34,%f18
    fmuld   %f2,%f4,%f4
    ldd [%fp+tmp0],%f2

    faddd   %f10,%f48,%f10
    nop

    faddd   %f54,%f56,%f54
    ldd [%g1+%l3],%f20

    faddd   %f4,%f2,%f2

    fsubd   %f14,%f44,%f44
    fmuld   %f12,%f10,%f14

    std %f22,[%fp+tmp2]
    addcc   %i0,-1,%i0
    bg,pt   %icc,.loop0
! delay slot
    faddd   %f52,%f20,%f52


! Once we get to the last element, we loop three more times to finish
! the computations in progress.  This means we will load past the end
! of the argument vector, but since we use non-faulting loads and never
! use the data, the only potential problem is cache miss.  (Note that
! when the argument is 2, the only exception that occurs in the compu-
! tation is an inexact result in the final addition, and we break out
! of the "extra" iterations before then.)
!
! Each .endloopK replaces the preloaded next argument with 2.0 (high word
! 0x40000000 in the integer register, f40 in the fp pair) and re-enters the
! following pass for as long as the count in i0 is still above -3.  The
! fmovd sits in an annulled delay slot, so it only runs when the branch is
! taken.
.endloop2:
    sethi   %hi(0x40000000),%l0 ! "next argument" = two
    cmp %i0,-3
    bg,a,pt %icc,.loop0
! delay slot
    fmovd   %f40,%f0
    ret
    restore

    .align  16
.endloop0:
    sethi   %hi(0x40000000),%l1 ! "next argument" = two
    cmp %i0,-3
    bg,a,pt %icc,.loop1
! delay slot
    fmovd   %f40,%f10
    ret
    restore

    .align  16
.endloop1:
    sethi   %hi(0x40000000),%l2 ! "next argument" = two
    cmp %i0,-3
    bg,a,pt %icc,.loop2
! delay slot
    fmovd   %f40,%f20
    ret
    restore


    .align  16
! Slot 0 special cases, reached when ix <= 0x000fffff (zero or subnormal)
! or when ix, taken as unsigned, is >= 0x7ff00000 (inf, NaN or sign bit set).
! Subnormals are rescaled and rejoin the pipeline at .cont0.  Every other
! case stores its result straight to y, bypassing the pipeline, and then
! fetches a fresh argument for this same slot.
.range0:
    cmp %l0,%l7
    bgeu,pn %icc,2f         ! if (unsigned) ix >= 0x7ff00000
! delay slot
    ld  [%i1+4],%o5
! zero or subnormal: convert the 64-bit pattern as an integer to a double
    fxtod   %f0,%f0         ! scale by 2**1074 w/o trapping
    st  %f0,[%fp+tmp0]
    add %i1,%i2,%i1     ! x += stridex
    orcc    %l0,%o5,%g0
    be,pn   %icc,1f         ! if x == 0
! delay slot
    add %i3,%i4,%i3     ! y += stridey
    fpadd32s %f0,%f31,%f4       ! n = (ix + 0xc0194000) & 0xfff00000
    fands   %f4,%f28,%f4
    fpsub32s %f0,%f4,%f0        ! u.l[0] -= n
! reload the rescaled high word as ix for the table index at .cont0
    ld  [%fp+tmp0],%l0
    ba,pt   %icc,.cont0
! delay slot
    fpsub32s %f4,%f29,%f4       ! n -= 0x43200000
! x is +0 or -0: the low word in f1 is zero here, so it serves as the
! divisor; the low word of the result is written by the st at label 3
1:
    fdivs   %f29,%f1,%f4        ! raise div-by-zero
    ba,pt   %icc,3f
! delay slot
    st  %f28,[%i3]      ! store -inf
! sign bit set, inf or NaN
2:
    sll %l0,1,%l0       ! lop off sign bit
    add %i1,%i2,%i1     ! x += stridex
    orcc    %l0,%o5,%g0
    be,pn   %icc,1b         ! if x == -0
! delay slot
    add %i3,%i4,%i3     ! y += stridey
    fabsd   %f0,%f4         ! *y = (x + |x|) * inf
    faddd   %f0,%f4,%f0
! f28:f29 AND f50:f51 gives 0xfff00000 00000000 (-inf); negate it to +inf
    fand    %f28,%f50,%f4
    fnegd   %f4,%f4
    fmuld   %f0,%f4,%f0
    st  %f0,[%i3]
! Count this element.  The store in the delay slot is not annulled, so the
! low word of y is written whether or not the branch is taken.  .endloop2
! is the drain entry that resumes at .loop0.
3:
    addcc   %i0,-1,%i0
    ble,pn  %icc,.endloop2
! delay slot
    st  %f1,[%i3+4]
    ld  [%i1],%l0       ! get next argument
    ld  [%i1],%f0
    ba,pt   %icc,.loop0
! delay slot
    ld  [%i1+4],%f1


    .align  16
! Slot 1 special cases: same logic as .range0, using l1, f10/f11, f14 and
! tmp1, and resuming at .cont1, .loop1 or .endloop0.
.range1:
    cmp %l1,%l7
    bgeu,pn %icc,2f         ! if (unsigned) ix >= 0x7ff00000
! delay slot
    ld  [%i1+4],%o5
    fxtod   %f10,%f10       ! scale by 2**1074 w/o trapping
    st  %f10,[%fp+tmp1]
    add %i1,%i2,%i1     ! x += stridex
    orcc    %l1,%o5,%g0
    be,pn   %icc,1f         ! if x == 0
! delay slot
    add %i3,%i4,%i3     ! y += stridey
    fpadd32s %f10,%f31,%f14     ! n = (ix + 0xc0194000) & 0xfff00000
    fands   %f14,%f28,%f14
    fpsub32s %f10,%f14,%f10     ! u.l[0] -= n
    ld  [%fp+tmp1],%l1
    ba,pt   %icc,.cont1
! delay slot
    fpsub32s %f14,%f29,%f14     ! n -= 0x43200000
1:
    fdivs   %f29,%f11,%f14      ! raise div-by-zero
    ba,pt   %icc,3f
! delay slot
    st  %f28,[%i3]      ! store -inf
2:
    sll %l1,1,%l1       ! lop off sign bit
    add %i1,%i2,%i1     ! x += stridex
    orcc    %l1,%o5,%g0
    be,pn   %icc,1b         ! if x == -0
! delay slot
    add %i3,%i4,%i3     ! y += stridey
    fabsd   %f10,%f14       ! *y = (x + |x|) * inf
    faddd   %f10,%f14,%f10
    fand    %f28,%f50,%f14
    fnegd   %f14,%f14
    fmuld   %f10,%f14,%f10
    st  %f10,[%i3]
3:
    addcc   %i0,-1,%i0
    ble,pn  %icc,.endloop0
! delay slot
    st  %f11,[%i3+4]
    ld  [%i1],%l1       ! get next argument
    ld  [%i1],%f10
    ba,pt   %icc,.loop1
! delay slot
    ld  [%i1+4],%f11


    .align  16
! Slot 2 special cases: same logic as .range0, using l2, f20/f21, f24 and
! tmp2, and resuming at .cont2, .loop2 or .endloop1.
.range2:
    cmp %l2,%l7
    bgeu,pn %icc,2f         ! if (unsigned) ix >= 0x7ff00000
! delay slot
    ld  [%i1+4],%o5
    fxtod   %f20,%f20       ! scale by 2**1074 w/o trapping
    st  %f20,[%fp+tmp2]
    add %i1,%i2,%i1     ! x += stridex
    orcc    %l2,%o5,%g0
    be,pn   %icc,1f         ! if x == 0
! delay slot
    add %i3,%i4,%i3     ! y += stridey
    fpadd32s %f20,%f31,%f24     ! n = (ix + 0xc0194000) & 0xfff00000
    fands   %f24,%f28,%f24
    fpsub32s %f20,%f24,%f20     ! u.l[0] -= n
    ld  [%fp+tmp2],%l2
    ba,pt   %icc,.cont2
! delay slot
    fpsub32s %f24,%f29,%f24     ! n -= 0x43200000
1:
    fdivs   %f29,%f21,%f24      ! raise div-by-zero
    ba,pt   %icc,3f
! delay slot
    st  %f28,[%i3]      ! store -inf
2:
    sll %l2,1,%l2       ! lop off sign bit
    add %i1,%i2,%i1     ! x += stridex
    orcc    %l2,%o5,%g0
    be,pn   %icc,1b         ! if x == -0
! delay slot
    add %i3,%i4,%i3     ! y += stridey
    fabsd   %f20,%f24       ! *y = (x + |x|) * inf
    faddd   %f20,%f24,%f20
    fand    %f28,%f50,%f24
    fnegd   %f24,%f24
    fmuld   %f20,%f24,%f20
    st  %f20,[%i3]
3:
    addcc   %i0,-1,%i0
    ble,pn  %icc,.endloop1
! delay slot
    st  %f21,[%i3+4]
    ld  [%i1],%l2       ! get next argument
    ld  [%i1],%f20
    ba,pt   %icc,.loop2
! delay slot
    ld  [%i1+4],%f21

    SET_SIZE(__vlog)