!
! Copyright 2005 Sun Microsystems, Inc. All rights reserved.
! Use is subject to license terms.
!
! CDDL HEADER START
!
! The contents of this file are subject to the terms of the
! Common Development and Distribution License, Version 1.0 only
! (the "License"). You may not use this file except in compliance
! with the License.
!
! You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
! or http://www.opensolaris.org/os/licensing.
! See the License for the specific language governing permissions
! and limitations under the License.
!
! When distributing Covered Code, include this CDDL HEADER in each
! file and include the License file at usr/src/OPENSOLARIS.LICENSE.
! If applicable, add the following below this CDDL HEADER, with the
! fields enclosed by brackets "[]" replaced with your own identifying
! information: Portions Copyright [yyyy] [name of copyright owner]
!
! CDDL HEADER END
!
.ident "%Z%%M% %I% %E% SMI"
! /*
! * This file contains __quad_mag_add and __quad_mag_sub, the core
! * of the quad precision add and subtract operations. This SPARC V9
! * version is hand-coded in assembly to use the 64-bit integer
! * registers.
! */
.file "__quad_mag64.s"
#include <sys/asm_linkage.h>
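!
! /*
! * For reference (SPARC V9 FSR layout; these FSR_* names match the
! * values assumed by the C versions quoted below):
! *
! *	fsr >> 30		rounding direction: FSR_RN = 0,
! *				FSR_RZ = 1, FSR_RP = 2, FSR_RM = 3
! *	fsr & 0x1f		current exceptions (cexc): FSR_NXC = 0x01,
! *				FSR_DZC = 0x02, FSR_UFC = 0x04,
! *				FSR_OFC = 0x08, FSR_NVC = 0x10
! *	(fsr >> 23) & 0x1f	trap enable mask (TEM); FSR_UFM enables
! *				the underflow trap (bit 25 of the FSR)
! */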
! union longdouble {
! struct {
! unsigned int msw;
! unsigned int frac2;
! unsigned int frac3;
! unsigned int frac4;
! } l;
! struct {
! unsigned long msll;
! unsigned long frac;
! } ll;
! long double d;
! };
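!
! /*
! * For example (a sketch, not part of the interface), a caller can
! * view a long double q through the union and pick apart its fields
! * the same way the routines below do; on SPARC the storage is
! * big-endian, so msll holds the sign, exponent, and the high-order
! * fraction bits:
! *
! *	union longdouble u;
! *	unsigned int ex;
! *	unsigned long lx;
! *
! *	u.d = q;
! *	ex = (u.ll.msll >> 48) & 0x7fff;	biased exponent
! *	lx = u.ll.msll & ~0xffff000000000000ul;	fraction bits 111..64
! */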
!
! /*
! * __quad_mag_add(x, y, z, fsr)
! *
! * Sets *z = *x + *y, rounded according to the rounding mode in *fsr,
! * and updates the current exceptions in *fsr. This routine assumes
! * *x and *y are finite, with the same sign (i.e., an addition of
! * magnitudes), |*x| >= |*y|, and *z already has its sign bit set.
! */
! void
! __quad_mag_add(const union longdouble *x, const union longdouble *y,
! union longdouble *z, unsigned int *fsr)
! {
! unsigned long lx, ly, frac, sticky;
! unsigned int ex, ey, round, rm;
! int e, uflo;
!
! /* get the leading significand double-words and exponents */
! ex = (x->ll.msll >> 48) & 0x7fff;
! lx = x->ll.msll & ~0xffff000000000000ul;
! if (ex == 0)
! ex = 1;
! else
! lx |= 0x0001000000000000ul;
!
! ey = (y->ll.msll >> 48) & 0x7fff;
! ly = y->ll.msll & ~0xffff000000000000ul;
! if (ey == 0)
! ey = 1;
! else
! ly |= 0x0001000000000000ul;
!
! /* prenormalize y */
! e = (int) ex - (int) ey;
! round = sticky = 0;
! if (e >= 114) {
! frac = x->ll.frac;
! sticky = ly | y->ll.frac;
! } else {
! frac = y->ll.frac;
! if (e >= 64) {
! sticky = frac & 0x7ffffffffffffffful;
! round = frac >> 63;
! frac = ly;
! ly = 0;
! e -= 64;
! }
! if (e) {
! sticky |= round | (frac & ((1ul << (e - 1)) - 1));
! round = (frac >> (e - 1)) & 1;
! frac = (frac >> e) | (ly << (64 - e));
! ly >>= e;
! }
!
! /* add, propagating carries */
! frac += x->ll.frac;
! lx += ly;
! if (frac < x->ll.frac)
! lx++;
!
! /* postnormalize */
! if (lx >= 0x0002000000000000ul) {
! sticky |= round;
! round = frac & 1;
! frac = (frac >> 1) | (lx << 63);
! lx >>= 1;
! ex++;
! }
! }
!
! /* keep track of whether the result before rounding is tiny */
! uflo = (lx < 0x0001000000000000ul);
!
! /* get the rounding mode, fudging directed rounding modes
! as though the result were positive */
! rm = *fsr >> 30;
! if (z->l.msw)
! rm ^= (rm >> 1);
!
! /* see if we need to round */
! if (round | sticky) {
! *fsr |= FSR_NXC;
!
! /* round up if necessary */
! if (rm == FSR_RP || (rm == FSR_RN && round &&
! (sticky || (frac & 1)))) {
! if (++frac == 0)
! if (++lx >= 0x0002000000000000ul) {
! lx >>= 1;
! ex++;
! }
! }
! }
!
! /* check for overflow */
! if (ex >= 0x7fff) {
! /* store the default overflowed result */
! *fsr |= FSR_OFC | FSR_NXC;
! if (rm == FSR_RN || rm == FSR_RP) {
! z->l.msw |= 0x7fff0000;
! z->l.frac2 = 0;
! z->ll.frac = 0;
! } else {
! z->l.msw |= 0x7ffeffff;
! z->l.frac2 = 0xffffffff;
! z->ll.frac = 0xfffffffffffffffful;
! }
! } else {
! /* store the result */
! if (lx >= 0x0001000000000000ul)
! z->l.msw |= (ex << 16);
! z->l.msw |= (lx >> 32) & 0xffff;
! z->l.frac2 = (lx & 0xffffffff);
! z->ll.frac = frac;
!
! /* if the pre-rounded result was tiny and underflow trapping
! is enabled, simulate underflow */
! if (uflo && (*fsr & FSR_UFM))
! *fsr |= FSR_UFC;
! }
! }
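!
! /*
! * The "add, propagating carries" step above uses the standard
! * double-word idiom: after an unsigned add, the low word is less
! * than an addend exactly when the add wrapped around. In isolation
! * (a minimal sketch):
! *
! *	zlo = xlo + ylo;
! *	zhi = xhi + yhi;
! *	if (zlo < xlo)		carry out of the low 64 bits
! *		zhi++;
! */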
ENTRY(__quad_mag_add)
save %sp,-SA(MINFRAME),%sp
sethi %hi(0xffff0000),%g1
sllx %g1,32,%g1 ! g1 = 0xffff000000000000
sethi %hi(0x7fff),%l7
or %l7,%lo(0x7fff),%l7 ! l7 = 0x7fff
ldx [%i0],%o0
srlx %o0,48,%l0
andcc %l0,%l7,%l0 ! l0 = ex
beq,pn %icc,1f
andn %o0,%g1,%o0 ! o0 = lx
ba,pt %icc,2f
sub %o0,%g1,%o0
1:
mov 1,%l0
2:
ldx [%i1],%o1
srlx %o1,48,%l1
andcc %l1,%l7,%l1 ! l1 = ey
beq,pn %icc,1f
andn %o1,%g1,%o1 ! o1 = ly
ba,pt %icc,2f
sub %o1,%g1,%o1
1:
mov 1,%l1
2:
sub %l0,%l1,%l1 ! l1 = e = ex - ey
cmp %l1,114 ! see if we need to prenormalize y
bge,pn %icc,1f
mov 0,%l6 ! l6 = round
mov 0,%o7 ! o7 = sticky
cmp %l1,64
bl,pt %icc,3f
ldx [%i1+8],%o2 ! o2 = frac
sllx %o2,1,%o7 ! lop off high order bit
srlx %o2,63,%l6
mov %o1,%o2
mov 0,%o1
sub %l1,64,%l1
3:
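! if e != 0, shift y right e more bits, collecting round and sticky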
tst %l1
beq,pn %icc,4f
sub %l1,1,%l2
mov 1,%o3
sllx %o3,%l2,%o3
sub %o3,1,%o3
and %o3,%o2,%o3
or %o3,%l6,%o3
or %o7,%o3,%o7
srlx %o2,%l2,%o4
and %o4,1,%l6
srlx %o2,%l1,%o2
mov 64,%l3
sub %l3,%l1,%l3
sllx %o1,%l3,%o5
or %o2,%o5,%o2
srlx %o1,%l1,%o1
4:
ldx [%i0+8],%o3
add %o2,%o3,%o2 ! add, propagating carry
cmp %o2,%o3
bgeu,pt %xcc,5f
add %o0,%o1,%o0
add %o0,1,%o0
5:
srlx %o0,49,%o5 ! if sum carried out, postnormalize
tst %o5
beq,pt %icc,2f
nop
or %o7,%l6,%o7
and %o2,1,%l6
srlx %o2,1,%o2
sllx %o0,63,%o3
or %o2,%o3,%o2
srlx %o0,1,%o0
ba,pt %icc,2f
add %l0,1,%l0
1:
ldx [%i0+8],%o2 ! (full prenormalization shift case)
ldx [%i1+8],%o3
or %o1,%o3,%o7
2:
add %o0,%g1,%o1 ! see if sum is tiny
srlx %o1,63,%l2 ! l2 = uflo
ld [%i3],%i4 ! get the rounding mode
srl %i4,30,%l3 ! l3 = rm
ld [%i2],%l4 ! l4 = z->l.msw
tst %l4
beq,pn %icc,1f
srl %l3,1,%l5
xor %l3,%l5,%l3
1:
orcc %o7,%l6,%g0 ! see if we need to round
beq,pn %xcc,1f
andcc %l3,1,%g0
or %i4,1,%i4
bne,pn %icc,1f
tst %l3
bne,pn %icc,2f
tst %l6
beq,pn %icc,1f
and %o2,1,%o3
orcc %o3,%o7,%g0
beq,pn %xcc,1f
nop
2:
addcc %o2,1,%o2 ! round up and check for carry out
bne,pt %xcc,1f
nop
add %o0,1,%o0
srlx %o0,49,%o1
tst %o1
beq,pt %icc,1f
nop
srlx %o0,1,%o0
add %l0,1,%l0
1:
cmp %l0,%l7 ! check for overflow
bge,pn %icc,1f
addcc %o0,%g1,%g0
bl,pn %xcc,2f
sll %l0,16,%l1
or %l4,%l1,%l4
2:
sllx %o0,16,%o1
srlx %o1,48,%o1
or %l4,%o1,%l4
st %l4,[%i2]
st %o0,[%i2+4]
stx %o2,[%i2+8]
tst %l2 ! see if we need to raise underflow
beq,pt %icc,3f
srl %i4,23,%i5
andcc %i5,4,%i5
ba,pt %icc,3f
or %i4,%i5,%i4
1:
andcc %l3,1,%g0
bne,pn %icc,2f
or %i4,9,%i4 ! overflow
sll %l7,16,%l7 ! 7fff00000...
or %l4,%l7,%l4
st %l4,[%i2]
st %g0,[%i2+4]
ba,pt %icc,3f
stx %g0,[%i2+8]
2:
mov -1,%o0 ! 7ffeffff...
sll %l7,16,%l7
add %o0,%l7,%l7
or %l4,%l7,%l4
st %l4,[%i2]
st %o0,[%i2+4]
stx %o0,[%i2+8]
3:
st %i4,[%i3]
ret
restore
SET_SIZE(__quad_mag_add)
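!
! /*
! * Example invocation from C (a sketch; assumes the union shown at
! * the top of this file and a caller that has already screened out
! * NaNs and infinities and ordered the operands so |x| >= |y|):
! *
! *	union longdouble x, y, z;
! *	unsigned int fsr;
! *
! *	x.d = 1.0L;
! *	y.d = 1.0e-30L;
! *	fsr = 0;			round to nearest, no traps
! *	z.l.msw = 0;			sign of the result: positive
! *	__quad_mag_add(&x, &y, &z, &fsr);
! *	now fsr & FSR_NXC indicates whether the sum was rounded
! */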
! /*
! * __quad_mag_sub(x, y, z, fsr)
! *
! * Sets *z = *x - *y, rounded according to the rounding mode in *fsr,
! * and updates the current exceptions in *fsr. This routine assumes
! * *x and *y are finite, with opposite signs (i.e., a subtraction of
! * magnitudes), |*x| >= |*y|, and *z already has its sign bit set.
! */
! void
! __quad_mag_sub(const union longdouble *x, const union longdouble *y,
! union longdouble *z, unsigned int *fsr)
! {
! unsigned long lx, ly, frac, sticky;
! unsigned int ex, ey, gr, borrow, rm;
! int e;
!
! /* get the leading significand double-words and exponents */
! ex = (x->ll.msll >> 48) & 0x7fff;
! lx = x->ll.msll & ~0xffff000000000000ul;
! if (ex == 0)
! ex = 1;
! else
! lx |= 0x0001000000000000ul;
!
! ey = (y->ll.msll >> 48) & 0x7fff;
! ly = y->ll.msll & ~0xffff000000000000ul;
! if (ey == 0)
! ey = 1;
! else
! ly |= 0x0001000000000000ul;
!
! /* prenormalize y */
! e = (int) ex - (int) ey;
! gr = sticky = 0;
! if (e > 114) {
! sticky = ly | y->ll.frac;
! ly = frac = 0;
! } else {
! frac = y->ll.frac;
! if (e >= 64) {
! gr = frac >> 62;
! sticky = frac << 2;
! frac = ly;
! ly = 0;
! e -= 64;
! }
! if (e > 1) {
! sticky |= gr | (frac & ((1ul << (e - 2)) - 1));
! gr = (frac >> (e - 2)) & 3;
! frac = (frac >> e) | (ly << (64 - e));
! ly >>= e;
! } else if (e == 1) {
! sticky |= (gr & 1);
! gr = (gr >> 1) | ((frac & 1) << 1);
! frac = (frac >> 1) | (ly << 63);
! ly >>= 1;
! }
! }
!
! /* complement guard, round, and sticky as need be */
! gr <<= 1;
! if (sticky)
! gr |= 1;
! gr = (-gr & 7);
! if (gr)
! if (++frac == 0)
! ly++;
!
! /* subtract, propagating borrows */
! frac = x->ll.frac - frac;
! lx -= ly;
! if (frac > x->ll.frac)
! lx--;
!
! /* get the rounding mode */
! rm = *fsr >> 30;
!
! /* handle zero result */
! if (!(lx | frac | gr)) {
! z->l.msw = ((rm == FSR_RM)? 0x80000000 : 0);
! z->l.frac2 = z->l.frac3 = z->l.frac4 = 0;
! return;
! }
!
! /* postnormalize */
! if (lx < 0x0001000000000000ul) {
! /* if cancellation occurred or the exponent is 1,
! the result is exact */
! if (lx < 0x0000800000000000ul || ex == 1) {
! if ((lx | (frac & 0xfffe000000000000ul)) == 0 &&
! ex > 64) {
! lx = frac;
! frac = (unsigned long) gr << 61;
! gr = 0;
! ex -= 64;
! }
! while (lx < 0x0001000000000000ul && ex > 1) {
! lx = (lx << 1) | (frac >> 63);
! frac = (frac << 1) | (gr >> 2);
! gr = 0;
! ex--;
! }
! if (lx >= 0x0001000000000000ul)
! z->l.msw |= (ex << 16);
! z->l.msw |= ((lx >> 32) & 0xffff);
! z->l.frac2 = (lx & 0xffffffff);
! z->ll.frac = frac;
!
! /* if the result is tiny and underflow trapping is
! enabled, simulate underflow */
! if (lx < 0x0001000000000000ul && (*fsr & FSR_UFM))
! *fsr |= FSR_UFC;
! return;
! }
!
! /* otherwise we only borrowed one place */
! lx = (lx << 1) | (frac >> 63);
! frac = (frac << 1) | (gr >> 2);
! gr &= 3;
! ex--;
! }
! else
! gr = (gr >> 1) | (gr & 1);
!
! /* fudge directed rounding modes as though the result were positive */
! if (z->l.msw)
! rm ^= (rm >> 1);
!
! /* see if we need to round */
! if (gr) {
! *fsr |= FSR_NXC;
!
! /* round up if necessary */
! if (rm == FSR_RP || (rm == FSR_RN && (gr & 2) &&
! ((gr & 1) || (frac & 1)))) {
! if (++frac == 0)
! if (++lx >= 0x0002000000000000ul) {
! lx >>= 1;
! ex++;
! }
! }
! }
!
! /* store the result */
! z->l.msw |= (ex << 16) | ((lx >> 32) & 0xffff);
! z->l.frac2 = (lx & 0xffffffff);
! z->ll.frac = frac;
! }
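!
! /*
! * Note on the "complement guard, round, and sticky" step above:
! * rather than borrowing y's discarded bits out of frac one at a
! * time, the three bits g,r,s are negated modulo 8 and, when they
! * are nonzero, one extra unit is borrowed from frac up front,
! * using the identity (for grs != 0)
! *
! *	x - (frac + grs/8) == x - (frac + 1) + (8 - grs)/8
! */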
ENTRY(__quad_mag_sub)
save %sp,-SA(MINFRAME),%sp
sethi %hi(0xffff0000),%g1
sllx %g1,32,%g1 ! g1 = 0xffff000000000000
sethi %hi(0x7fff),%l7
or %l7,%lo(0x7fff),%l7 ! l7 = 0x7fff
ldx [%i0],%o0
srlx %o0,48,%l0
andcc %l0,%l7,%l0 ! l0 = ex
beq,pn %icc,1f
andn %o0,%g1,%o0 ! o0 = lx
ba,pt %icc,2f
sub %o0,%g1,%o0
1:
mov 1,%l0
2:
ldx [%i1],%o1
srlx %o1,48,%l1
andcc %l1,%l7,%l1 ! l1 = ey
beq,pn %icc,1f
andn %o1,%g1,%o1 ! o1 = ly
ba,pt %icc,2f
sub %o1,%g1,%o1
1:
mov 1,%l1
2:
sub %l0,%l1,%l1 ! l1 = e = ex - ey
cmp %l1,114 ! see if we need to prenormalize y
bg,pn %icc,1f
mov 0,%l6 ! l6 = gr
mov 0,%o7 ! o7 = sticky
cmp %l1,64
bl,pt %icc,3f
ldx [%i1+8],%o2 ! o2 = frac
srlx %o2,62,%l6
sllx %o2,2,%o7 ! lop off top two bits
mov %o1,%o2
mov 0,%o1
sub %l1,64,%l1
3:
cmp %l1,1
ble,pn %icc,4f
sub %l1,2,%l2 ! shift more than one bit
mov 1,%o3
sllx %o3,%l2,%o3
sub %o3,1,%o3
and %o3,%o2,%o3
or %o3,%l6,%o3
or %o7,%o3,%o7
srlx %o2,%l2,%o4
and %o4,3,%l6
srlx %o2,%l1,%o2
mov 64,%l3
sub %l3,%l1,%l3
sllx %o1,%l3,%o5
or %o2,%o5,%o2
ba,pt %icc,2f
srlx %o1,%l1,%o1
4:
bne,pn %icc,2f
and %l6,1,%o3 ! shift one bit
or %o7,%o3,%o7
and %o2,1,%o4
sllx %o4,1,%o4
srl %l6,1,%l6
or %l6,%o4,%l6
srlx %o2,1,%o2
sllx %o1,63,%o5
or %o2,%o5,%o2
ba,pt %icc,2f
srlx %o1,1,%o1
1:
ldx [%i1+8],%o3 ! (full prenormalization shift case)
or %o1,%o3,%o7
mov 0,%o1
mov 0,%o2
2:
tst %o7 ! complement guard, round, and
beq,pn %xcc,1f ! sticky as need be
sll %l6,1,%l6
or %l6,1,%l6
1:
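! negate gr mod 8; if the shifted-out bits were nonzero, carry into frac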
subcc %g0,%l6,%l6
beq,pn %icc,1f
and %l6,7,%l6
addcc %o2,1,%o2
beq,a,pn %xcc,1f
add %o1,1,%o1
1:
ldx [%i0+8],%o3 ! subtract, propagating borrows
sub %o3,%o2,%o2
cmp %o3,%o2
bgeu,pt %xcc,5f
sub %o0,%o1,%o0
sub %o0,1,%o0
5:
ld [%i3],%i4 ! get the rounding mode
srl %i4,30,%l3 ! l3 = rm
or %o0,%o2,%o1 ! look for zero result
orcc %o1,%l6,%g0
bne,pt %xcc,1f
srl %l3,1,%l4
and %l3,%l4,%l4
sll %l4,31,%l4
st %l4,[%i2]
st %g0,[%i2+4]
stx %g0,[%i2+8]
ret
restore
1:
addcc %o0,%g1,%g0 ! postnormalize
bl,pt %xcc,1f
ld [%i2],%l4 ! l4 = z->l.msw
and %l6,1,%l5 ! (no cancellation or borrow case)
srl %l6,1,%l6
ba,pt %icc,2f
or %l6,%l5,%l6
1:
srax %g1,1,%o7
addcc %o0,%o7,%g0
bl,pn %xcc,1f
cmp %l0,1
beq,pt %icc,1f
srlx %o2,63,%o3 ! borrowed one place
sllx %o0,1,%o0
or %o0,%o3,%o0
srl %l6,2,%o4
sllx %o2,1,%o2
or %o2,%o4,%o2
and %l6,3,%l6
ba,pt %icc,2f
sub %l0,1,%l0
1:
srlx %o2,49,%o3 ! cancellation or tiny result
orcc %o0,%o3,%g0
bne,pt %xcc,1f
cmp %l0,64
ble,pn %icc,1f
nop
mov %o2,%o0
sllx %l6,61,%o2
mov 0,%l6
sub %l0,64,%l0
1:
addcc %o0,%g1,%g0 ! normalization loop
bge,pn %xcc,1f
cmp %l0,1
ble,pn %icc,1f
srl %l6,2,%l6
srlx %o2,63,%o3
sllx %o0,1,%o0
or %o0,%o3,%o0
sllx %o2,1,%o2
or %o2,%l6,%o2
ba,pt %icc,1b
sub %l0,1,%l0
1:
sllx %o0,16,%o1
srlx %o1,48,%l5
or %l4,%l5,%l4
addcc %o0,%g1,%g0 ! see if result is tiny
bl,pn %xcc,1f
sll %l0,16,%l5
or %l4,%l5,%l4
1:
st %l4,[%i2]
st %o0,[%i2+4]
bge,pt %xcc,1f
stx %o2,[%i2+8]
srl %i4,23,%i5
andcc %i5,4,%g0 ! see if we need to raise underflow
beq,pt %icc,1f
or %i4,4,%i4
st %i4,[%i3]
1:
ret
restore
2:
tst %l4 ! fudge directed rounding modes
beq,pn %icc,1f
srl %l3,1,%l5
xor %l3,%l5,%l3
1:
tst %l6 ! see if we need to round
beq,pn %icc,1f
or %i4,1,%i4
st %i4,[%i3]
andcc %l3,1,%g0
bne,pn %icc,1f
tst %l3
bne,pn %icc,2f
andcc %l6,2,%g0
beq,pn %icc,1f
or %l6,%o2,%o3
andcc %o3,1,%o3
beq,pn %xcc,1f
nop
2:
addcc %o2,1,%o2 ! round up and check for carry
bne,pt %xcc,1f
nop
add %o0,1,%o0
srlx %o0,49,%o1
tst %o1
beq,pt %icc,1f
nop
srlx %o0,1,%o0
add %l0,1,%l0
1:
sllx %o0,16,%o1
srlx %o1,48,%o1
or %l4,%o1,%l4
sll %l0,16,%l5
or %l4,%l5,%l4
st %l4,[%i2]
st %o0,[%i2+4]
stx %o2,[%i2+8]
ret
restore
SET_SIZE(__quad_mag_sub)
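!
! /*
! * As with __quad_mag_add, the caller is expected to have screened
! * out NaNs and infinities, ordered the operands so that |x| >= |y|,
! * and stored the result's sign in z->l.msw. For example (a sketch):
! *
! *	union longdouble x, y, z;
! *	unsigned int fsr = 0;		round to nearest, no traps
! *
! *	x.d = 2.0L;
! *	y.d = 1.0L;
! *	z.l.msw = 0;			magnitudes subtract to +1.0L
! *	__quad_mag_sub(&x, &y, &z, &fsr);
! */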