common/vis/__vhypotf.S

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

    .file   "__vhypotf.S"

#include "libm.h"

    RO_DATA
    .align  64

/*
 * Read-only constant table.  Every entry occupies one 8-byte slot and the
 * function prologue loads the entries by fixed byte offset from .CONST_TBL:
 *   +0 K1, +8 K2, +16 DC0, +24 DC1, +32 DC2, +40 DA0, +48 DFMAX,
 *   +56 FMAX (first word) and +60 SCALE (second word), +64 DA1.
 * Reordering or inserting entries requires updating those load offsets.
 */
.CONST_TBL:
    .word   0x3fe00001, 0x80007e00  ! K1  =  5.00000715259318464227e-01
    .word   0xbfc00003, 0xc0017a01  ! K2  = -1.25000447037521686593e-01
    .word   0x000fffff, 0xffffffff  ! DC0 = 0x000fffffffffffff
    .word   0x3ff00000, 0x00000000  ! DC1 = 0x3ff0000000000000
    .word   0x7ffff000, 0x00000000  ! DC2 = 0x7ffff00000000000
    .word   0x7fe00000, 0x00000000  ! DA0 = 0x7fe0000000000000
    .word   0x47efffff, 0xe0000000  ! DFMAX = 3.402823e+38
    .word   0x7f7fffff, 0x80808080  ! FMAX = 3.402823e+38 , SCALE = 0x80808080
    .word   0x20000000, 0x00000000  ! DA1 = 0x2000000000000000

/*
 * Floating-point registers that hold the constants from .CONST_TBL for the
 * whole function.  FMAX and SCALE are single-precision registers; the
 * others are double-precision registers.
 */
#define DC0     %f12
#define DC1     %f10
#define DC2     %f42
#define DA0     %f6
#define DA1     %f4
#define K2      %f26
#define K1      %f28
#define SCALE       %f3
#define FMAX        %f2
#define DFMAX       %f50

/*
 * Integer registers with a fixed role: the three strides (converted to
 * byte strides in the prologue), the integer comparison/mask constants
 * named after their values, and TBL, the base of the lookup table
 * (set from __vlibm_TBL_sqrtf in the prologue).
 */
#define stridex     %l6
#define stridey     %i4
#define stridez     %l5
#define _0x7fffffff %o1
#define _0x7f3504f3 %o2
#define _0x1ff0     %l2
#define TBL     %l1

/* Number of elements still to be produced by the current pipeline run. */
#define counter     %l0

/*
 * Scratch slots in the stack frame, used as offsets from %fp.
 * tmp_px/tmp_py (8 bytes each) and tmp_counter (4 bytes) carry the restart
 * position and remaining count across a pipeline restart.  tmp0..tmp4 are
 * 4-byte slots used to move the high word of a double from a floating-point
 * register to an integer register (st %f / ld pair).
 */
#define tmp_px      STACK_BIAS-0x30
#define tmp_py      STACK_BIAS-0x28
#define tmp_counter STACK_BIAS-0x20
#define tmp0        STACK_BIAS-0x18
#define tmp1        STACK_BIAS-0x10
#define tmp2        STACK_BIAS-0x0c
#define tmp3        STACK_BIAS-0x08
#define tmp4        STACK_BIAS-0x04

! sizeof temp storage - must be a multiple of 16 for V9
#define tmps        0x30

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!      !!!!!   algorithm   !!!!!
!  hx0 = *(int*)px;
!  x0 = *px;
!  px += stridex;
!
!  hy0 = *(int*)py;
!  y0 = *py;
!  py += stridey;
!
!  hx0 &= 0x7fffffff;
!  hy0 &= 0x7fffffff;
!
!  if ( hx0 >= 0x7f3504f3 || hy0 >= 0x7f3504f3 )
!  {
!    if ( hx0 >= 0x7f800000 || hy0 >= 0x7f800000 )
!    {
!      if ( hx0 == 0x7f800000 || hy0 == 0x7f800000 )
!        *(int*)pz = 0x7f800000;
!      else *pz = x0 * y0;
!    }
!    else
!    {
!      hyp = sqrt(x0 * (double)x0 + y0 * (double)y0);
!      if ( hyp <= DFMAX ) ftmp0 = (float)hyp;
!      else ftmp0 = FMAX * FMAX;
!      *pz = ftmp0;
!    }
!    pz += stridez;
!    continue;
!  }
!  if ( (hx | hy) == 0 )
!  {
!    *pz = 0;
!    pz += stridez;
!    continue;
!  }
!  dx0 = x0 * (double)x0;
!  dy0 = y0 * (double)y0;
!  db0 = dx0 + dy0;
!
!  iexp0 = ((int*)&db0)[0];
!
!  h0 = vis_fand(db0,DC0);
!  h0 = vis_for(h0,DC1);
!  h_hi0 = vis_fand(h0,DC2);
!
!  db0 = vis_fand(db0,DA0);
!  db0 = vis_fmul8x16(SCALE, db0);
!  db0 = vis_fpadd32(db0,DA1);
!
!  iexp0 >>= 8;
!  di0 = iexp0 & 0x1ff0;
!  si0 = (char*)sqrt_arr + di0;
!
!  dtmp0 = ((double*)((char*)div_arr + di0))[0];
!  xx0 = h0 - h_hi0;
!  xx0 *= dtmp0;
!
!  dtmp0 = ((double*)si0)[1];
!  res0 = K2 * xx0;
!  res0 += K1;
!  res0 *= xx0;
!  res0 += DC1;
!  res0 = dtmp0 * res0;
!  res0 *= db0;
!  ftmp0 = (float)res0;
!  *pz = ftmp0;
!  pz += stridez;
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    /*
     * __vhypotf entry and one-time setup.
     *
     * Incoming arguments as used below (C prototype not visible here;
     * presumably (n, x, stridex, y, stridey, z, stridez) - confirm against
     * the header):
     *   %i0 = element count, %i1 = px, %i2 = stridex, %i3 = py,
     *   %i4 = stridey, %i5 = pz, seventh argument (stridez) on the stack.
     *
     * The count and the x/y pointers are parked in the frame (tmp_counter,
     * tmp_px, tmp_py) because .begin reloads them from there each time the
     * pipeline is (re)started.  The strides are given in elements and are
     * scaled by 4 (sll ..,2) to byte strides.  The constant loads are
     * interleaved with the integer setup; their offsets match the layout of
     * .CONST_TBL.  %l7 holds the output pointer pz from here on.
     */
    ENTRY(__vhypotf)
    save    %sp,-SA(MINFRAME)-tmps,%sp
    PIC_SETUP(l7)
    PIC_SET(l7,.CONST_TBL,o3)
    PIC_SET(l7,__vlibm_TBL_sqrtf,l1)

    /* Seventh argument lives in the caller's frame; offset depends on ABI. */
#ifdef __sparcv9
    ldx [%fp+STACK_BIAS+176],stridez
#else
    ld  [%fp+STACK_BIAS+92],stridez
#endif
    st  %i0,[%fp+tmp_counter]

    stx %i1,[%fp+tmp_px]

    stx %i3,[%fp+tmp_py]

    ldd [%o3],K1
    sethi   %hi(0x7ffffc00),%o1

    ldd [%o3+8],K2
    sethi   %hi(0x7f350400),%o2

    ldd [%o3+16],DC0
    /* Complete the two 32-bit constants started by the sethi's above. */
    add %o1,1023,_0x7fffffff
    add %o2,0xf3,_0x7f3504f3

    ldd [%o3+24],DC1
    sll %i2,2,stridex

    ld  [%o3+56],FMAX

    ldd [%o3+32],DC2
    sll %i4,2,stridey

    ldd [%o3+40],DA0
    sll stridez,2,stridez

    ldd [%o3+48],DFMAX

    ld  [%o3+60],SCALE
    /* 0x1ff0 does not fit a 13-bit immediate: build it as 0xff8 << 1. */
    or  %g0,0xff8,%l2

    ldd [%o3+64],DA1
    sll %l2,1,_0x1ff0
    or  %g0,%i5,%l7

/*
 * .begin: (re)start the software pipeline.
 *
 * Reloads the remaining element count and the current x/y pointers from the
 * frame and then clears tmp_counter, so that unless one of the .updateN
 * handlers stores a new remainder, the next pass through .begin finds a
 * count of zero and leaves through .exit (defined outside this view).
 *
 * .begin1: the first element of a run is examined for real (not
 * speculatively): values with |x| or |y| >= 0x7f3504f3 go to .spec, the
 * x == y == 0 case goes to .spec1, and both come back to .begin1.
 *
 * Everything after that, up to the branch to .main_loop, is the pipeline
 * fill: it starts the computation for the following elements before any
 * result is stored.  Each looked-ahead element is checked the same way and,
 * if it needs special handling, control detours through the matching
 * .updateN handler and returns to the .contN label.  The "(s_k)" prefixes in
 * the existing comments tag which pipeline slot an instruction belongs to
 * (NOTE(review): exact meaning of the second number not stated in the
 * source).
 *
 * Input loads use "lda [..]0x82"; ASI 0x82 is the primary no-fault ASI in
 * SPARC V9, presumably so that lookahead loads past the end of the arrays
 * cannot trap.
 *
 * At the end, fewer than 5 remaining elements go straight to .tail;
 * otherwise counter is pre-decremented by 5 (delay slot) and the steady
 * state loop is entered.
 */
.begin:
    ld  [%fp+tmp_counter],counter
    ldx [%fp+tmp_px],%i1
    ldx [%fp+tmp_py],%i2
    st  %g0,[%fp+tmp_counter]
.begin1:
    cmp counter,0
    ble,pn  %icc,.exit
    lda [%i1]0x82,%l3       ! (3_0) hx0 = *(int*)px;

    lda [%i2]0x82,%l4       ! (3_0) hy0 = *(int*)py;

    lda [%i1]0x82,%f17      ! (3_0) x0 = *px;
    and %l3,_0x7fffffff,%l3 ! (3_0) hx0 &= 0x7fffffff;

    cmp %l3,_0x7f3504f3     ! (3_0) hx ? 0x7f3504f3
    bge,pn  %icc,.spec      ! (3_0) if ( hx >= 0x7f3504f3 )
    and %l4,_0x7fffffff,%l4 ! (3_0) hy0 &= 0x7fffffff;

    cmp %l4,_0x7f3504f3     ! (3_0) hy ? 0x7f3504f3
    bge,pn  %icc,.spec      ! (3_0) if ( hy >= 0x7f3504f3 )
    or  %g0,%i2,%o7

    orcc    %l3,%l4,%g0
    bz,pn   %icc,.spec1

    /* Delay slot of the branch above: executed on the way to .spec1 too. */
    add %i1,stridex,%i1     ! px += stridex
    fsmuld  %f17,%f17,%f44      ! (3_0) dx0 = x0 * (double)x0;
    lda [%i2]0x82,%f17      ! (3_0) y0 = *py;

    lda [%i1]0x82,%l3       ! (4_0) hx0 = *(int*)px;

    lda [stridey+%o7]0x82,%l4   ! (4_0) hy0 = *(int*)py;

    and %l3,_0x7fffffff,%l3 ! (4_0) hx0 &= 0x7fffffff;

    fsmuld  %f17,%f17,%f24      ! (3_0) dy0 = y0 * (double)y0;
    cmp %l3,_0x7f3504f3     ! (4_0) hx ? 0x7f3504f3
    bge,pn  %icc,.update0       ! (4_0) if ( hx >= 0x7f3504f3 )
    and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff;

    orcc    %l3,%l4,%g0
    bz,pn   %icc,.update0
    lda [%i1]0x82,%f17      ! (4_0) x0 = *px;
.cont0:
    faddd   %f44,%f24,%f24      ! (3_0) db0 = dx0 + dy0;

    fsmuld  %f17,%f17,%f40      ! (4_1) dy0 = x0 * (double)x0;
    cmp %l4,_0x7f3504f3     ! (4_1) hy ? 0x7f3504f3
    lda [stridey+%o7]0x82,%f17  ! (4_1) hy0 = *py;

    add %o7,stridey,%i5     ! py += stridey
    lda [%i1+stridex]0x82,%l3   ! (0_0) hx0 = *(int*)px;

    bge,pn  %icc,.update1       ! (4_1) if ( hy >= 0x7f3504f3 )
    st  %f24,[%fp+tmp0]     ! (3_1) iexp0 = ((int*)&db0)[0];
.cont1:
    and %l3,_0x7fffffff,%l3 ! (0_0) hx0 &= 0x7fffffff;

    fsmuld  %f17,%f17,%f48      ! (4_1) dy0 = y0 * (double)y0;
    lda [%i1+stridex]0x82,%f8   ! (0_0) x0 = *px;

    add %i1,stridex,%i1     ! px += stridex

    lda [%i5+stridey]0x82,%l4   ! (0_0) hy0 = *(int*)py;
    cmp %l3,_0x7f3504f3     ! (0_0) hx ? 0x7f3504f3
    bge,pn  %icc,.update2       ! (0_0) if ( hx >= 0x7f3504f3 )
    add %i5,stridey,%o4     ! py += stridey
.cont2:
    faddd   %f40,%f48,%f20      ! (4_1) db0 = dx0 + dy0;

    fsmuld  %f8,%f8,%f40        ! (0_0) dx0 = x0 * (double)x0;
    and %l4,_0x7fffffff,%l4 ! (0_0) hy0 &= 0x7fffffff;
    lda [%i5+stridey]0x82,%f17  ! (0_0) hy0 = *py;

    cmp %l4,_0x7f3504f3     ! (0_0) hy ? 0x7f3504f3
    bge,pn  %icc,.update3       ! (0_0) if ( hy >= 0x7f3504f3 )
    st  %f20,[%fp+tmp1]     ! (4_1) iexp0 = ((int*)&db0)[0];

    orcc    %l3,%l4,%g0
    bz,pn   %icc,.update3
.cont3:
    lda [%i1+stridex]0x82,%l3   ! (1_0) hx0 = *(int*)px;

    fand    %f24,DC0,%f60       ! (3_1) h0 = vis_fand(db0,DC0);

    and %l3,_0x7fffffff,%l3 ! (1_0) hx0 &= 0x7fffffff;

    fsmuld  %f17,%f17,%f34      ! (0_0) dy0 = y0 * (double)y0;
    cmp %l3,_0x7f3504f3     ! (1_0) hx ? 0x7f3504f3
    lda [%o4+stridey]0x82,%l4   ! (1_0) hy0 = *(int*)py;

    add %i1,stridex,%i1     ! px += stridex

    lda [%i1]0x82,%f17      ! (1_0) x0 = *px;
    bge,pn  %icc,.update4       ! (1_0) if ( hx >= 0x7f3504f3 )
    add %o4,stridey,%i5     ! py += stridey
.cont4:
    and %l4,_0x7fffffff,%l4 ! (1_0) hy0 &= 0x7fffffff;
    for %f60,DC1,%f46       ! (3_1) h0 = vis_for(h0,DC1);

    cmp %l4,_0x7f3504f3     ! (1_0) hy ? 0x7f3504f3
    ld  [%fp+tmp0],%o0      ! (3_1) iexp0 = ((int*)&db0)[0];
    faddd   %f40,%f34,%f0       ! (0_0) db0 = dx0 + dy0;

    fsmuld  %f17,%f17,%f40      ! (1_0) dx0 = x0 * (double)x0;
    add %i1,stridex,%i1     ! px += stridex
    lda [%o4+stridey]0x82,%f17  ! (1_0) y0 = *py;

    srax    %o0,8,%o0       ! (3_1) iexp0 >>= 8;
    bge,pn  %icc,.update5       ! (1_0) if ( hy >= 0x7f3504f3 )
    fand    %f46,DC2,%f38       ! (3_1) h_hi0 = vis_fand(h0,DC2);

    orcc    %l3,%l4,%g0
    bz,pn   %icc,.update5
.cont5:
    lda [%i1]0x82,%l3       ! (2_0) hx0 = *(int*)px;

    and %o0,_0x1ff0,%o0     ! (3_1) di0 = iexp0 & 0x1ff0;
    st  %f0,[%fp+tmp2]      ! (0_0) iexp0 = ((int*)&db0)[0];
    fand    %f20,DC0,%f60       ! (4_1) h0 = vis_fand(db0,DC0);

    ldd [TBL+%o0],%f22      ! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
    fsubd   %f46,%f38,%f38      ! (3_1) xx0 = h0 - h_hi0;

    fsmuld  %f17,%f17,%f32      ! (1_0) dy0 = y0 * (double)y0;
    add %i5,stridey,%i2     ! py += stridey
    lda [stridey+%i5]0x82,%l4   ! (2_0) hy0 = *(int*)py;

    and %l3,_0x7fffffff,%l3 ! (2_0) hx0 &= 0x7fffffff;

    lda [%i1]0x82,%f17      ! (2_0) x0 = *px;
    cmp %l3,_0x7f3504f3     ! (2_0) hx ? 0x7f3504f3

    fmuld   %f38,%f22,%f38      ! (3_1) xx0 *= dmp0;
    and %l4,_0x7fffffff,%l4 ! (2_0) hy0 &= 0x7fffffff;
    for %f60,DC1,%f46       ! (4_1) h0 = vis_for(h0,DC1);

    bge,pn  %icc,.update6       ! (2_0) if ( hx >= 0x7f3504f3 )
    ld  [%fp+tmp1],%o3      ! (4_1) iexp0 = ((int*)&db0)[0];
.cont6:
    faddd   %f40,%f32,%f18      ! (1_0) db0 = dx0 + dy0;

    fsmuld  %f17,%f17,%f44      ! (2_0) dx0 = x0 * (double)x0;
    cmp %l4,_0x7f3504f3     ! (2_0) hy ? 0x7f3504f3
    lda [stridey+%i5]0x82,%f17  ! (2_0) y0 = *py;

    add %i1,stridex,%i1     ! px += stridex
    bge,pn  %icc,.update7       ! (2_0) if ( hy >= 0x7f3504f3 )
    fand    %f46,DC2,%f58       ! (4_1) h_hi0 = vis_fand(h0,DC2);

    orcc    %l3,%l4,%g0
    bz,pn   %icc,.update7
    nop
.cont7:
    fmuld   K2,%f38,%f56        ! (3_1) res0 = K2 * xx0;
    srax    %o3,8,%o3       ! (4_1) iexp0 >>= 8;
    lda [%i1]0x82,%l3       ! (3_0) hx0 = *(int*)px;

    and %o3,_0x1ff0,%o3     ! (4_1) di0 = iexp0 & 0x1ff0;
    st  %f18,[%fp+tmp3]     ! (1_0) iexp0 = ((int*)&db0)[0];
    fand    %f0,DC0,%f60        ! (0_0) h0 = vis_fand(db0,DC0);

    ldd [TBL+%o3],%f22      ! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
    add %i2,stridey,%o7     ! py += stridey
    fsubd   %f46,%f58,%f58      ! (4_1) xx0 = h0 - h_hi0;

    fsmuld  %f17,%f17,%f30      ! (2_0) dy0 = y0 * (double)y0;
    lda [stridey+%i2]0x82,%l4   ! (3_0) hy0 = *(int*)py;
    and %l3,_0x7fffffff,%l3 ! (3_0) hx0 &= 0x7fffffff;

    faddd   %f56,K1,%f54        ! (3_1) res0 += K1;
    cmp %l3,_0x7f3504f3     ! (3_0) hx ? 0x7f3504f3

    lda [%i1]0x82,%f17      ! (3_0) x0 = *px;
    add %i1,stridex,%i1     ! px += stridex
    bge,pn  %icc,.update8       ! (3_0) if ( hx >= 0x7f3504f3 )

    fmuld   %f58,%f22,%f58      ! (4_1) xx0 *= dmp0;
.cont8:
    and %l4,_0x7fffffff,%l4 ! (3_0) hy0 &= 0x7fffffff;
    for %f60,DC1,%f46       ! (0_0) h0 = vis_for(h0,DC1);

    cmp %l4,_0x7f3504f3     ! (3_0) hy ? 0x7f3504f3
    ld  [%fp+tmp2],%g1      ! (0_0) iexp0 = ((int*)&db0)[0];
    faddd   %f44,%f30,%f30      ! (2_0) db0 = dx0 + dy0;

    fsmuld  %f17,%f17,%f44      ! (3_0) dx0 = x0 * (double)x0;
    bge,pn  %icc,.update9       ! (3_0) if ( hy >= 0x7f3504f3 )
    lda [stridey+%i2]0x82,%f17  ! (3_0) y0 = *py;

    orcc    %l3,%l4,%g0
    bz,pn   %icc,.update9
    nop
.cont9:
    fmuld   %f54,%f38,%f40      ! (3_1) res0 *= xx0;
    lda [%i1]0x82,%l3       ! (4_0) hx0 = *(int*)px;
    fand    %f46,DC2,%f38       ! (0_0) h_hi0 = vis_fand(h0,DC2);

    fmuld   K2,%f58,%f54        ! (4_1) res0 = K2 * xx0;
    srax    %g1,8,%o5       ! (0_0) iexp0 >>= 8;
    lda [stridey+%o7]0x82,%l4   ! (4_0) hy0 = *(int*)py;
    fand    %f24,DA0,%f56       ! (3_1) db0 = vis_fand(db0,DA0);

    and %o5,_0x1ff0,%o5     ! (0_0) di0 = iexp0 & 0x1ff0;
    st  %f30,[%fp+tmp4]     ! (2_0) iexp0 = ((int*)&db0)[0];
    fand    %f18,DC0,%f60       ! (1_0) h0 = vis_fand(db0,DC0);

    ldd [TBL+%o5],%f22      ! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0];
    add %o0,TBL,%g1     ! (3_1) si0 = (char*)sqrt_arr + di0;
    and %l3,_0x7fffffff,%l3 ! (4_0) hx0 &= 0x7fffffff;
    fsubd   %f46,%f38,%f38      ! (0_0) xx0 = h0 - h_hi0;

    fsmuld  %f17,%f17,%f24      ! (3_0) dy0 = y0 * (double)y0;
    cmp %l3,_0x7f3504f3     ! (4_0) hx ? 0x7f3504f3
    bge,pn  %icc,.update10      ! (4_0) if ( hx >= 0x7f3504f3 )
    faddd   %f40,DC1,%f40       ! (3_1) res0 += DC1;

    fmul8x16    SCALE,%f56,%f36 ! (3_1) db0 = vis_fmul8x16(SCALE, db0);
    and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff;
    ldd [%g1+8],%f56        ! (3_1) dtmp0 = ((double*)si0)[1];
    faddd   %f54,K1,%f54        ! (4_1) res0 += K1;

    lda [%i1]0x82,%f17      ! (4_0) x0 = *px;
.cont10:
    fmuld   %f38,%f22,%f38      ! (0_0) xx0 *= dmp0;
    cmp counter,5
    for %f60,DC1,%f46       ! (1_0) h0 = vis_for(h0,DC1);

    ld  [%fp+tmp3],%g1      ! (1_0) iexp0 = ((int*)&db0)[0];
    fmuld   %f56,%f40,%f62      ! (3_1) res0 = dtmp0 * res0;
    faddd   %f44,%f24,%f24      ! (3_0) db0 = dx0 + dy0;

    /* Fewer than 5 elements in this run: skip the loop and just drain. */
    bl,pn   %icc,.tail
    nop

    ba  .main_loop
    sub counter,5,counter

    .align  16
/*
 * .main_loop: steady state of the software pipeline.
 *
 * One trip stores five results to pz (the five "*pz = ftmp0" stores) while
 * starting the computation for five later elements.  counter was already
 * reduced by 5 before entry; at the bottom it is reduced by 5 again and the
 * loop repeats while the result is non-negative (bpos).  On exit the final
 * "add counter,5,counter" undoes the last subtraction, so counter is the
 * number of in-flight results still to be stored by .tail.
 *
 * Every looked-ahead element is range checked; an element with
 * |x| or |y| >= 0x7f3504f3, or with x == y == 0, diverts to the matching
 * .updateN handler, which returns to the .contN label that follows the
 * branch.  The integer registers holding px, py and pz rotate through
 * several names (%i1/%i0/%l7, %o7/%i5/%o4/%i2, %l7/%o7/%i0/%i3/%o4) within
 * one trip; %o4 is reused first as a py value and later as a pz value.
 */
.main_loop:
    fsmuld  %f17,%f17,%f40      ! (4_1) dy0 = x0 * (double)x0;
    cmp %l4,_0x7f3504f3     ! (4_1) hy ? 0x7f3504f3
    lda [stridey+%o7]0x82,%f17  ! (4_1) hy0 = *py;
    fpadd32 %f36,DA1,%f36       ! (3_2) db0 = vis_fpadd32(db0,DA1);

    fmuld   %f54,%f58,%f58      ! (4_2) res0 *= xx0;
    add %o7,stridey,%i5     ! py += stridey
    st  %f24,[%fp+tmp0]     ! (3_1) iexp0 = ((int*)&db0)[0];
    fand    %f46,DC2,%f44       ! (1_1) h_hi0 = vis_fand(h0,DC2);

    fmuld   K2,%f38,%f56        ! (0_1) res0 = K2 * xx0;
    srax    %g1,8,%g5       ! (1_1) iexp0 >>= 8;
    bge,pn  %icc,.update11      ! (4_1) if ( hy >= 0x7f3504f3 )
    fand    %f20,DA0,%f54       ! (4_2) db0 = vis_fand(db0,DA0);

    orcc    %l3,%l4,%g0
    nop
    bz,pn   %icc,.update11
    fzero   %f52
.cont11:
    fmuld   %f62,%f36,%f62      ! (3_2) res0 *= db0;
    and %g5,_0x1ff0,%g5     ! (1_1) di0 = iexp0 & 0x1ff0;
    lda [%i1+stridex]0x82,%l3   ! (0_0) hx0 = *(int*)px;
    fand    %f30,DC0,%f60       ! (2_1) h0 = vis_fand(db0,DC0);

    ldd [%g5+TBL],%f22      ! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
    add %o3,TBL,%g1     ! (4_2) si0 = (char*)sqrt_arr + di0;
    add %i1,stridex,%i0     ! px += stridex
    fsubd   %f46,%f44,%f44      ! (1_1) xx0 = h0 - h_hi0;

    fsmuld  %f17,%f17,%f48      ! (4_1) dy0 = y0 * (double)y0;
    nop
    lda [%i1+stridex]0x82,%f8   ! (0_0) x0 = *px;
    faddd   %f58,DC1,%f36       ! (4_2) res0 += DC1;

    faddd   %f56,K1,%f58        ! (0_1) res0 += K1;
    and %l3,_0x7fffffff,%l3 ! (0_0) hx0 &= 0x7fffffff;
    ldd [%g1+8],%f56        ! (4_2) dtmp0 = ((double*)si0)[1];
    fmul8x16    SCALE,%f54,%f54 ! (4_2) db0 = vis_fmul8x16(SCALE, db0);

    lda [%i5+stridey]0x82,%l4   ! (0_0) hy0 = *(int*)py;
    cmp %l3,_0x7f3504f3     ! (0_0) hx ? 0x7f3504f3
    bge,pn  %icc,.update12      ! (0_0) if ( hx >= 0x7f3504f3 )
    fdtos   %f62,%f14       ! (3_2) ftmp0 = (float)res0;
.cont12:
    fmuld   %f44,%f22,%f44      ! (1_1) xx0 *= dmp0;
    add %l7,stridez,%o7     ! pz += stridez
    st  %f14,[%l7]      ! (3_2) *pz = ftmp0;
    for %f60,DC1,%f46       ! (2_1) h0 = vis_for(h0,DC1);

    fmuld   %f56,%f36,%f36      ! (4_2) res0 = dtmp0 * res0;
    add %i5,stridey,%o4     ! py += stridey
    ld  [%fp+tmp4],%g1      ! (2_1) iexp0 = ((int*)&db0)[0];
    faddd   %f40,%f48,%f20      ! (4_1) db0 = dx0 + dy0;

    fsmuld  %f8,%f8,%f40        ! (0_0) dx0 = x0 * (double)x0;
    and %l4,_0x7fffffff,%l4 ! (0_0) hy0 &= 0x7fffffff;
    lda [%i5+stridey]0x82,%f17  ! (0_0) hy0 = *py;
    fpadd32 %f54,DA1,%f62       ! (4_2) db0 = vis_fpadd32(db0,DA1);

    fmuld   %f58,%f38,%f38      ! (0_1) res0 *= xx0;
    cmp %l4,_0x7f3504f3     ! (0_0) hy ? 0x7f3504f3
    st  %f20,[%fp+tmp1]     ! (4_1) iexp0 = ((int*)&db0)[0];
    fand    %f46,DC2,%f58       ! (2_1) h_hi0 = vis_fand(h0,DC2);

    fmuld   K2,%f44,%f56        ! (1_1) res0 = K2 * xx0;
    srax    %g1,8,%g1       ! (2_1) iexp0 >>= 8;
    bge,pn  %icc,.update13      ! (0_0) if ( hy >= 0x7f3504f3 )
    fand    %f0,DA0,%f54        ! (0_1) db0 = vis_fand(db0,DA0);

    orcc    %l3,%l4,%g0
    nop
    bz,pn   %icc,.update13
    fzero   %f52
.cont13:
    fmuld   %f36,%f62,%f62      ! (4_2) res0 *= db0;
    and %g1,_0x1ff0,%g1     ! (2_1) di0 = iexp0 & 0x1ff0;
    lda [%i0+stridex]0x82,%l3   ! (1_0) hx0 = *(int*)px;
    fand    %f24,DC0,%f60       ! (3_1) h0 = vis_fand(db0,DC0);

    ldd [TBL+%g1],%f22      ! (2_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
    add %o5,TBL,%o0     ! (0_1) si0 = (char*)sqrt_arr + di0;
    add %i0,stridex,%i1     ! px += stridex
    fsubd   %f46,%f58,%f58      ! (2_1) xx0 = h0 - h_hi0;

    fsmuld  %f17,%f17,%f34      ! (0_0) dy0 = y0 * (double)y0;
    add %o7,stridez,%i0     ! pz += stridez
    lda [%o4+stridey]0x82,%l4   ! (1_0) hy0 = *(int*)py;
    faddd   %f38,DC1,%f36       ! (0_1) res0 += DC1;

    faddd   %f56,K1,%f38        ! (1_1) res0 += K1;
    and %l3,_0x7fffffff,%l3 ! (1_0) hx0 &= 0x7fffffff;
    ldd [%o0+8],%f56        ! (0_1) dtmp0 = ((double*)si0)[1];
    fmul8x16    SCALE,%f54,%f54 ! (0_1) db0 = vis_fmul8x16(SCALE, db0);

    lda [%i1]0x82,%f17      ! (1_0) x0 = *px;
    cmp %l3,_0x7f3504f3     ! (1_0) hx ? 0x7f3504f3
    bge,pn  %icc,.update14      ! (1_0) if ( hx >= 0x7f3504f3 )
    fdtos   %f62,%f14       ! (4_2) ftmp0 = (float)res0;
.cont14:
    fmuld   %f58,%f22,%f58      ! (2_1) xx0 *= dmp0;
    and %l4,_0x7fffffff,%l4 ! (1_0) hy0 &= 0x7fffffff;
    add %o4,stridey,%i5     ! py += stridey
    for %f60,DC1,%f46       ! (3_1) h0 = vis_for(h0,DC1);

    fmuld   %f56,%f36,%f36      ! (0_1) res0 = dtmp0 * res0;
    cmp %l4,_0x7f3504f3     ! (1_0) hy ? 0x7f3504f3
    ld  [%fp+tmp0],%o0      ! (3_1) iexp0 = ((int*)&db0)[0];
    faddd   %f40,%f34,%f0       ! (0_0) db0 = dx0 + dy0;

    fsmuld  %f17,%f17,%f40      ! (1_0) dx0 = x0 * (double)x0;
    add %i1,stridex,%i1     ! px += stridex
    lda [%o4+stridey]0x82,%f17  ! (1_0) y0 = *py;
    fpadd32 %f54,DA1,%f62       ! (0_1) db0 = vis_fpadd32(db0,DA1);

    fmuld   %f38,%f44,%f44      ! (1_1) res0 *= xx0;
    st  %f14,[%o7]      ! (4_2) *pz = ftmp0;
    bge,pn  %icc,.update15      ! (1_0) if ( hy >= 0x7f3504f3 )
    fand    %f46,DC2,%f38       ! (3_1) h_hi0 = vis_fand(h0,DC2);

    orcc    %l3,%l4,%g0
    bz,pn   %icc,.update15
    nop
.cont15:
    fmuld   K2,%f58,%f54        ! (2_1) res0 = K2 * xx0;
    srax    %o0,8,%o0       ! (3_1) iexp0 >>= 8;
    st  %f0,[%fp+tmp2]      ! (0_0) iexp0 = ((int*)&db0)[0];
    fand    %f18,DA0,%f56       ! (1_1) db0 = vis_fand(db0,DA0);

    fmuld   %f36,%f62,%f62      ! (0_1) res0 *= db0;
    and %o0,_0x1ff0,%o0     ! (3_1) di0 = iexp0 & 0x1ff0;
    lda [%i1]0x82,%l3       ! (2_0) hx0 = *(int*)px;
    fand    %f20,DC0,%f60       ! (4_1) h0 = vis_fand(db0,DC0);

    ldd [TBL+%o0],%f22      ! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
    add %g5,TBL,%o3     ! (1_1) si0 = (char*)sqrt_arr + di0;
    add %i0,stridez,%i3     ! pz += stridez
    fsubd   %f46,%f38,%f38      ! (3_1) xx0 = h0 - h_hi0;

    fsmuld  %f17,%f17,%f32      ! (1_0) dy0 = y0 * (double)y0;
    add %i5,stridey,%i2     ! py += stridey
    lda [stridey+%i5]0x82,%l4   ! (2_0) hy0 = *(int*)py;
    faddd   %f44,DC1,%f44       ! (1_1) res0 += DC1;

    fmul8x16    SCALE,%f56,%f36 ! (1_1) db0 = vis_fmul8x16(SCALE, db0);
    and %l3,_0x7fffffff,%l3 ! (2_0) hx0 &= 0x7fffffff;
    ldd [%o3+8],%f56        ! (1_1) dtmp0 = ((double*)si0)[1];
    faddd   %f54,K1,%f54        ! (2_1) res0 += K1;

    lda [%i1]0x82,%f17      ! (2_0) x0 = *px;
    cmp %l3,_0x7f3504f3     ! (2_0) hx ? 0x7f3504f3
    add %i3,stridez,%o4     ! pz += stridez
    fdtos   %f62,%f14       ! (0_1) ftmp0 = (float)res0;

    fmuld   %f38,%f22,%f38      ! (3_1) xx0 *= dmp0;
    and %l4,_0x7fffffff,%l4 ! (2_0) hy0 &= 0x7fffffff;
    st  %f14,[%i0]      ! (0_1) *pz = ftmp0;
    for %f60,DC1,%f46       ! (4_1) h0 = vis_for(h0,DC1);

    fmuld   %f56,%f44,%f62      ! (1_1) res0 = dtmp0 * res0;
    bge,pn  %icc,.update16      ! (2_0) if ( hx >= 0x7f3504f3 )
    ld  [%fp+tmp1],%o3      ! (4_1) iexp0 = ((int*)&db0)[0];
    faddd   %f40,%f32,%f18      ! (1_0) db0 = dx0 + dy0;
.cont16:
    fsmuld  %f17,%f17,%f44      ! (2_0) dx0 = x0 * (double)x0;
    cmp %l4,_0x7f3504f3     ! (2_0) hy ? 0x7f3504f3
    lda [stridey+%i5]0x82,%f17  ! (2_0) y0 = *py;
    fpadd32 %f36,DA1,%f36       ! (1_1) db0 = vis_fpadd32(db0,DA1);

    fmuld   %f54,%f58,%f54      ! (2_1) res0 *= xx0;
    add %i1,stridex,%l7     ! px += stridex
    bge,pn  %icc,.update17      ! (2_0) if ( hy >= 0x7f3504f3 )
    fand    %f46,DC2,%f58       ! (4_1) h_hi0 = vis_fand(h0,DC2);

    orcc    %l3,%l4,%g0
    nop
    bz,pn   %icc,.update17
    fzero   %f52
.cont17:
    fmuld   K2,%f38,%f56        ! (3_1) res0 = K2 * xx0;
    srax    %o3,8,%o3       ! (4_1) iexp0 >>= 8;
    st  %f18,[%fp+tmp3]     ! (1_0) iexp0 = ((int*)&db0)[0];
    fand    %f30,DA0,%f40       ! (2_1) db0 = vis_fand(db0,DA0);

    fmuld   %f62,%f36,%f62      ! (1_1) res0 *= db0;
    and %o3,_0x1ff0,%o3     ! (4_1) di0 = iexp0 & 0x1ff0;
    lda [%l7]0x82,%l3       ! (3_0) hx0 = *(int*)px;
    fand    %f0,DC0,%f60        ! (0_0) h0 = vis_fand(db0,DC0);

    ldd [TBL+%o3],%f22      ! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
    add %g1,TBL,%g1     ! (2_1) si0 = (char*)sqrt_arr + di0;
    add %i2,stridey,%o7     ! py += stridey
    fsubd   %f46,%f58,%f58      ! (4_1) xx0 = h0 - h_hi0;

    fsmuld  %f17,%f17,%f30      ! (2_0) dy0 = y0 * (double)y0;
    lda [stridey+%i2]0x82,%l4   ! (3_0) hy0 = *(int*)py;
    add %l7,stridex,%i1     ! px += stridex
    faddd   %f54,DC1,%f36       ! (2_1) res0 += DC1;

    faddd   %f56,K1,%f54        ! (3_1) res0 += K1;
    and %l3,_0x7fffffff,%l3 ! (3_0) hx0 &= 0x7fffffff;
    ldd [%g1+8],%f56        ! (2_1) dtmp0 = ((double*)si0)[1];
    fmul8x16    SCALE,%f40,%f40 ! (2_1) db0 = vis_fmul8x16(SCALE, db0);

    lda [%l7]0x82,%f17      ! (3_0) x0 = *px;
    cmp %l3,_0x7f3504f3     ! (3_0) hx ? 0x7f3504f3
    bge,pn  %icc,.update18      ! (3_0) if ( hx >= 0x7f3504f3 )
    fdtos   %f62,%f14       ! (1_1) ftmp0 = (float)res0;
.cont18:
    fmuld   %f58,%f22,%f58      ! (4_1) xx0 *= dmp0;
    and %l4,_0x7fffffff,%l4 ! (3_0) hy0 &= 0x7fffffff;
    st  %f14,[%i3]      ! (1_1) *pz = ftmp0;
    for %f60,DC1,%f46       ! (0_0) h0 = vis_for(h0,DC1);

    fmuld   %f56,%f36,%f36      ! (2_1) res0 = dtmp0 * res0;
    cmp %l4,_0x7f3504f3     ! (3_0) hy ? 0x7f3504f3
    ld  [%fp+tmp2],%g1      ! (0_0) iexp0 = ((int*)&db0)[0];
    faddd   %f44,%f30,%f30      ! (2_0) db0 = dx0 + dy0;

    fsmuld  %f17,%f17,%f44      ! (3_0) dx0 = x0 * (double)x0;
    bge,pn  %icc,.update19      ! (3_0) if ( hy >= 0x7f3504f3 )
    lda [stridey+%i2]0x82,%f17  ! (3_0) y0 = *py;
    fpadd32 %f40,DA1,%f62       ! (2_1) db0 = vis_fpadd32(db0,DA1);

.cont19:
    fmuld   %f54,%f38,%f40      ! (3_1) res0 *= xx0;
    orcc    %l3,%l4,%g0
    st  %f30,[%fp+tmp4]     ! (2_0) iexp0 = ((int*)&db0)[0];
    fand    %f46,DC2,%f38       ! (0_0) h_hi0 = vis_fand(h0,DC2);

    fmuld   K2,%f58,%f54        ! (4_1) res0 = K2 * xx0;
    srax    %g1,8,%o5       ! (0_0) iexp0 >>= 8;
    lda [%i1]0x82,%l3       ! (4_0) hx0 = *(int*)px;
    fand    %f24,DA0,%f56       ! (3_1) db0 = vis_fand(db0,DA0);

    fmuld   %f36,%f62,%f62      ! (2_1) res0 *= db0;
    and %o5,_0x1ff0,%o5     ! (0_0) di0 = iexp0 & 0x1ff0;
    bz,pn   %icc,.update19a
    fand    %f18,DC0,%f60       ! (1_0) h0 = vis_fand(db0,DC0);
.cont19a:
    ldd [TBL+%o5],%f22      ! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0];
    add %o0,TBL,%g1     ! (3_1) si0 = (char*)sqrt_arr + di0;
    and %l3,_0x7fffffff,%l3 ! (4_0) hx0 &= 0x7fffffff;
    fsubd   %f46,%f38,%f38      ! (0_0) xx0 = h0 - h_hi0;

    fsmuld  %f17,%f17,%f24      ! (3_0) dy0 = y0 * (double)y0;
    cmp %l3,_0x7f3504f3     ! (4_0) hx ? 0x7f3504f3
    lda [stridey+%o7]0x82,%l4   ! (4_0) hy0 = *(int*)py;
    faddd   %f40,DC1,%f40       ! (3_1) res0 += DC1;

    fmul8x16    SCALE,%f56,%f36 ! (3_1) db0 = vis_fmul8x16(SCALE, db0);
    bge,pn  %icc,.update20      ! (4_0) if ( hx >= 0x7f3504f3 )
    ldd [%g1+8],%f56        ! (3_1) dtmp0 = ((double*)si0)[1];
    faddd   %f54,K1,%f54        ! (4_1) res0 += K1;

    lda [%i1]0x82,%f17      ! (4_0) x0 = *px;
.cont20:
    subcc   counter,5,counter   ! counter -= 5
    add %o4,stridez,%l7     ! pz += stridez
    fdtos   %f62,%f14       ! (2_1) ftmp0 = (float)res0;

    fmuld   %f38,%f22,%f38      ! (0_0) xx0 *= dmp0;
    and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff;
    st  %f14,[%o4]      ! (2_1) *pz = ftmp0;
    for %f60,DC1,%f46       ! (1_0) h0 = vis_for(h0,DC1);

    ld  [%fp+tmp3],%g1      ! (1_0) iexp0 = ((int*)&db0)[0];
    fmuld   %f56,%f40,%f62      ! (3_1) res0 = dtmp0 * res0;
    bpos,pt %icc,.main_loop
    faddd   %f44,%f24,%f24      ! (3_0) db0 = dx0 + dy0;

    /* Loop exit: undo the last "counter -= 5" before draining in .tail. */
    add counter,5,counter

/*
 * .tail: drain the software pipeline.
 *
 * Finishes and stores up to four results that are still in flight, with no
 * further input loads.  Before each result counter is decremented; once it
 * goes negative control returns to .begin, which either restarts the
 * pipeline at a position saved by an .updateN handler or finds nothing left
 * to do.  From the second check on, the branch delay slot (always executed)
 * copies the next output address into %l7, the register in which the code
 * reached from .begin expects pz.
 */
.tail:
    subcc   counter,1,counter
    bneg    .begin
    nop

    fpadd32 %f36,DA1,%f36       ! (3_2) db0 = vis_fpadd32(db0,DA1);

    fmuld   %f54,%f58,%f58      ! (4_2) res0 *= xx0;
    fand    %f46,DC2,%f44       ! (1_1) h_hi0 = vis_fand(h0,DC2);

    fmuld   K2,%f38,%f56        ! (0_1) res0 = K2 * xx0;
    srax    %g1,8,%g5       ! (1_1) iexp0 >>= 8;
    fand    %f20,DA0,%f54       ! (4_2) db0 = vis_fand(db0,DA0);

    fmuld   %f62,%f36,%f62      ! (3_2) res0 *= db0;
    and %g5,_0x1ff0,%g5     ! (1_1) di0 = iexp0 & 0x1ff0;

    ldd [%g5+TBL],%f22      ! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
    add %o3,TBL,%g1     ! (4_2) si0 = (char*)sqrt_arr + di0;
    fsubd   %f46,%f44,%f44      ! (1_1) xx0 = h0 - h_hi0;

    faddd   %f58,DC1,%f36       ! (4_2) res0 += DC1;

    faddd   %f56,K1,%f58        ! (0_1) res0 += K1;
    ldd [%g1+8],%f56        ! (4_2) dtmp0 = ((double*)si0)[1];
    fmul8x16    SCALE,%f54,%f54 ! (4_2) db0 = vis_fmul8x16(SCALE, db0);

    fdtos   %f62,%f14       ! (3_2) ftmp0 = (float)res0;

    fmuld   %f44,%f22,%f44      ! (1_1) xx0 *= dmp0;
    add %l7,stridez,%o7     ! pz += stridez
    st  %f14,[%l7]      ! (3_2) *pz = ftmp0;

    subcc   counter,1,counter
    bneg    .begin
    or  %g0,%o7,%l7

    fmuld   %f56,%f36,%f36      ! (4_2) res0 = dtmp0 * res0;

    fpadd32 %f54,DA1,%f62       ! (4_2) db0 = vis_fpadd32(db0,DA1);

    fmuld   %f58,%f38,%f38      ! (0_1) res0 *= xx0;

    fmuld   K2,%f44,%f56        ! (1_1) res0 = K2 * xx0;
    fand    %f0,DA0,%f54        ! (0_1) db0 = vis_fand(db0,DA0);

    fmuld   %f36,%f62,%f62      ! (4_2) res0 *= db0;

    add %o5,TBL,%o0     ! (0_1) si0 = (char*)sqrt_arr + di0;

    faddd   %f38,DC1,%f36       ! (0_1) res0 += DC1;

    faddd   %f56,K1,%f38        ! (1_1) res0 += K1;
    ldd [%o0+8],%f56        ! (0_1) dtmp0 = ((double*)si0)[1];
    fmul8x16    SCALE,%f54,%f54 ! (0_1) db0 = vis_fmul8x16(SCALE, db0);

    add %o7,stridez,%i0     ! pz += stridez
    fdtos   %f62,%f14       ! (4_2) ftmp0 = (float)res0;

    fmuld   %f56,%f36,%f36      ! (0_1) res0 = dtmp0 * res0;

    fpadd32 %f54,DA1,%f62       ! (0_1) db0 = vis_fpadd32(db0,DA1);

    fmuld   %f38,%f44,%f44      ! (1_1) res0 *= xx0;
    add %i0,stridez,%i3     ! pz += stridez
    st  %f14,[%o7]      ! (4_2) *pz = ftmp0;

    subcc   counter,1,counter
    bneg    .begin
    or  %g0,%i0,%l7

    fand    %f18,DA0,%f56       ! (1_1) db0 = vis_fand(db0,DA0);

    fmuld   %f36,%f62,%f62      ! (0_1) res0 *= db0;

    add %g5,TBL,%o3     ! (1_1) si0 = (char*)sqrt_arr + di0;

    faddd   %f44,DC1,%f44       ! (1_1) res0 += DC1;

    fmul8x16    SCALE,%f56,%f36 ! (1_1) db0 = vis_fmul8x16(SCALE, db0);
    ldd [%o3+8],%f56        ! (1_1) dtmp0 = ((double*)si0)[1];

    add %i3,stridez,%o4     ! pz += stridez
    fdtos   %f62,%f14       ! (0_1) ftmp0 = (float)res0;

    st  %f14,[%i0]      ! (0_1) *pz = ftmp0;

    subcc   counter,1,counter
    bneg    .begin
    or  %g0,%i3,%l7

    fmuld   %f56,%f44,%f62      ! (1_1) res0 = dtmp0 * res0;

    fpadd32 %f36,DA1,%f36       ! (1_1) db0 = vis_fpadd32(db0,DA1);

    fmuld   %f62,%f36,%f62      ! (1_1) res0 *= db0;

    fdtos   %f62,%f14       ! (1_1) ftmp0 = (float)res0;

    st  %f14,[%i3]      ! (1_1) *pz = ftmp0;

    ba  .begin
    or  %g0,%o4,%l7

    .align  16
/*
 * .spec1: first element of a run has x == +-0 and y == +-0, so the result
 * is +0: store an all-zero word to *pz.  px is not advanced here because
 * the delay slot of the branch in .begin1 that leads here has already done
 * it.  The counter decrement sits in the delay slot of the branch back to
 * .begin1.
 */
.spec1:
    st  %g0,[%l7]       ! *pz = 0;
    add %l7,stridez,%l7     ! pz += stridez

    add %i2,stridey,%i2     ! py += stridey
    ba  .begin1
    sub counter,1,counter   ! counter--

    .align  16
/*
 * .spec: slow path for the first element of a run when the sign-stripped
 * bit pattern of x or y is >= 0x7f3504f3 (large finite value, Inf or NaN).
 *
 * On entry (from .begin1): %l3/%l4 = sign-stripped bit patterns of x and y,
 * %f17 = x, %i1/%i2/%l7 = addresses of the current x, y and z elements.
 * y is loaded into %f8 in the first delay slot.  %i0 is used as scratch for
 * the constant 0x7f800000.
 *
 *  - Both operands finite: compute sqrt(x*x + y*y) in double precision.
 *    If that exceeds DFMAX (or the compare is unordered) the annulled delay
 *    slot substitutes FMAX*FMAX, otherwise the value is rounded to float.
 *  - Otherwise (local label 2): if x or y is an infinity (pattern equal to
 *    0x7f800000) the word 0x7f800000 (+Inf) is stored; if not, the remaining
 *    case is a NaN operand and x*y is stored.  The fcmps result is not
 *    used; per its original comment it is there for its exception side
 *    effect.
 *
 * Every path then advances pz, px and py, decrements counter (in the delay
 * slot) and resumes at .begin1.
 */
.spec:
    sethi   %hi(0x7f800000),%i0
    cmp %l3,%i0         ! hx ? 0x7f800000
    bge,pt  %icc,2f         ! if ( hx >= 0x7f800000 )
    ld  [%i2],%f8

    cmp %l4,%i0         ! hy ? 0x7f800000
    bge,pt  %icc,2f         ! if ( hy >= 0x7f800000 )
    nop

    fsmuld  %f17,%f17,%f44      ! x * (double)x
    fsmuld  %f8,%f8,%f24        ! y * (double)y
    faddd   %f44,%f24,%f24      ! x * (double)x + y * (double)y
    fsqrtd  %f24,%f24       ! hyp = sqrt(x * (double)x + y * (double)y);
    fcmped  %f24,DFMAX      ! hyp ? DMAX
    fbug,a  1f          ! if ( hyp > DMAX )
    fmuls   FMAX,FMAX,%f20      ! ftmp0 = FMAX * FMAX;

    fdtos   %f24,%f20       ! ftmp0 = (float)hyp;
1:
    st  %f20,[%l7]      ! *pz = ftmp0;
    add %l7,stridez,%l7     ! pz += stridez
    add %i1,stridex,%i1     ! px += stridex

    add %i2,stridey,%i2     ! py += stridey
    ba  .begin1
    sub counter,1,counter   ! counter--
2:
    fcmps   %f17,%f8        ! exceptions
    cmp %l3,%i0         ! hx ? 0x7f800000
    be,a    %icc,1f         ! if ( hx == 0x7f800000 )
    st  %i0,[%l7]       ! *(int*)pz = 0x7f800000;

    cmp %l4,%i0         ! hy ? 0x7f800000
    be,a    %icc,1f         ! if ( hy == 0x7f800000
    st  %i0,[%l7]       ! *(int*)pz = 0x7f800000;

    fmuls   %f17,%f8,%f8        ! x * y
    st  %f8,[%l7]       ! *pz = x * y;

1:
    add %l7,stridez,%l7     ! pz += stridez
    add %i1,stridex,%i1     ! px += stridex

    add %i2,stridey,%i2     ! py += stridey
    ba  .begin1
    sub counter,1,counter   ! counter--

    .align  16
/*
 * .update0: the element one past the start of the run failed the fast-path
 * check on x (|x| >= 0x7f3504f3) or has x == y == 0.
 * The delay slot always zeroes %f17 so the pipeline squares 0 instead of
 * the offending x.  If that element lies inside the run (counter > 1), its
 * px (%i1) and py (%o7 + stridey) and the count left from that element on
 * are saved in the frame, and counter is cut to 1 so the pipeline
 * drains after one result and .begin restarts at the saved position.
 */
.update0:
    cmp counter,1
    ble .cont0
    fzeros  %f17

    stx %i1,[%fp+tmp_px]

    add %o7,stridey,%i5
    stx %i5,[%fp+tmp_py]

    sub counter,1,counter
    st  counter,[%fp+tmp_counter]

    ba  .cont0
    or  %g0,1,counter

    .align  16
/*
 * .update1: the element one past the start of the run failed the fast-path
 * check on y (|y| >= 0x7f3504f3).
 * The delay slot always zeroes the just-loaded y in %f17.  If that element
 * lies inside the run (counter > 1), its px (%i1) and py (%i5) and the
 * count left from that element on are saved in the frame, and counter is cut
 * to 1 so the pipeline drains and .begin restarts at the saved position.
 */
.update1:
    cmp counter,1
    ble .cont1
    fzeros  %f17

    stx %i1,[%fp+tmp_px]
    stx %i5,[%fp+tmp_py]

    sub counter,1,counter
    st  counter,[%fp+tmp_counter]

    ba  .cont1
    or  %g0,1,counter

    .align  16
/*
 * .update2: the element two past the start of the run failed the fast-path
 * check on x (|x| >= 0x7f3504f3).
 * The delay slot always zeroes the just-loaded x in %f8.  If that element
 * lies inside the run (counter > 2), its px (%i1) and py (%o4) and the
 * count left from that element on are saved in the frame, and counter is cut
 * to 2 so the pipeline drains and .begin restarts at the saved position.
 */
.update2:
    cmp counter,2
    ble .cont2
    fzeros  %f8

    stx %i1,[%fp+tmp_px]
    stx %o4,[%fp+tmp_py]

    sub counter,2,counter
    st  counter,[%fp+tmp_counter]

    ba  .cont2
    or  %g0,2,counter

    .align  16
/*
 * .update3: the element two past the start of the run failed the fast-path
 * check on y (|y| >= 0x7f3504f3) or has x == y == 0.
 * The delay slot always zeroes the just-loaded y in %f17.  If that element
 * lies inside the run (counter > 2), its px (%i1) and py (%o4) and the
 * count left from that element on are saved in the frame, and counter is cut
 * to 2 so the pipeline drains and .begin restarts at the saved position.
 */
.update3:
    cmp counter,2
    ble .cont3
    fzeros  %f17

    stx %i1,[%fp+tmp_px]
    stx %o4,[%fp+tmp_py]

    sub counter,2,counter
    st  counter,[%fp+tmp_counter]

    ba  .cont3
    or  %g0,2,counter

    .align  16
/*
 * .update4: the element three past the start of the run failed the
 * fast-path check on x (|x| >= 0x7f3504f3).
 * The delay slot always zeroes the just-loaded x in %f17.  If that element
 * lies inside the run (counter > 3), its px (%i1) and py (%i5) and the
 * count left from that element on are saved in the frame, and counter is cut
 * to 3 so the pipeline drains and .begin restarts at the saved position.
 */
.update4:
    cmp counter,3
    ble .cont4
    fzeros  %f17

    stx %i1,[%fp+tmp_px]
    stx %i5,[%fp+tmp_py]

    sub counter,3,counter
    st  counter,[%fp+tmp_counter]

    ba  .cont4
    or  %g0,3,counter

    .align  16
/*
 * .update5: the element three past the start of the run failed the
 * fast-path check on y (|y| >= 0x7f3504f3) or has x == y == 0.
 * The delay slot always zeroes the just-loaded y in %f17.  If that element
 * lies inside the run (counter > 3), its position and the count left from
 * it on are saved in the frame, and counter is cut to 3 so the pipeline
 * drains and .begin restarts there.  %i1 has already been advanced past
 * the element, so its px is recomputed as %i1 - stridex (using %i2 as
 * scratch); its py is in %i5.
 */
.update5:
    cmp counter,3
    ble .cont5
    fzeros  %f17

    sub %i1,stridex,%i2
    stx %i2,[%fp+tmp_px]
    stx %i5,[%fp+tmp_py]

    sub counter,3,counter
    st  counter,[%fp+tmp_counter]

    ba  .cont5
    or  %g0,3,counter

    .align  16
/*
 * .update6: the element four past the start of the run failed the
 * fast-path check on x (|x| >= 0x7f3504f3).
 * The delay slot always zeroes the just-loaded x in %f17.  If that element
 * lies inside the run (counter > 4), its px (%i1) and py (%i2) and the
 * count left from that element on are saved in the frame, and counter is cut
 * to 4 so the pipeline drains and .begin restarts at the saved position.
 */
.update6:
    cmp counter,4
    ble .cont6
    fzeros  %f17

    stx %i1,[%fp+tmp_px]
    stx %i2,[%fp+tmp_py]

    sub counter,4,counter
    st  counter,[%fp+tmp_counter]

    ba  .cont6
    or  %g0,4,counter

    .align  16
! .update7: cap the current pass at 4 elements.
! If counter > 4: store %i1 - stridex to tmp_px, %i2 to tmp_py and
! counter - 4 to tmp_counter, then continue at .cont7 with counter = 4.
! If counter <= 4: continue at .cont7 with counter unchanged.
! %f17 is zeroed on both paths (the delay slot is not annulled).
! The save path overwrites %o7.
! NOTE(review): the branch into this stub and the later reload of the
! tmp_* slots are outside this excerpt - confirm the intent there.
.update7:
    cmp counter,4
    ble .cont7      ! signed counter <= 4: nothing to save
    fzeros  %f17        ! delay slot, runs on both paths: %f17 = 0

    sub %i1,stridex,%o7     ! %o7 = %i1 - stridex (one x stride back)
    stx %o7,[%fp+tmp_px]    ! tmp_px = %o7
    stx %i2,[%fp+tmp_py]    ! tmp_py = %i2

    sub counter,4,counter
    st  counter,[%fp+tmp_counter]   ! tmp_counter = old counter - 4

    ba  .cont7
    or  %g0,4,counter       ! delay slot: counter = 4

    .align  16
! .update8: cap the current pass at 5 elements.
! If counter > 5: store %i1 - stridex to tmp_px, %o7 to tmp_py and
! counter - 5 to tmp_counter, then continue at .cont8 with counter = 5.
! If counter <= 5: continue at .cont8 with counter unchanged.
! %f17 is zeroed on both paths (the delay slot is not annulled).
! The save path overwrites %o5.
! NOTE(review): the branch into this stub and the later reload of the
! tmp_* slots are outside this excerpt - confirm the intent there.
.update8:
    cmp counter,5
    ble .cont8      ! signed counter <= 5: nothing to save
    fzeros  %f17        ! delay slot, runs on both paths: %f17 = 0

    sub %i1,stridex,%o5     ! %o5 = %i1 - stridex (one x stride back)
    stx %o5,[%fp+tmp_px]    ! tmp_px = %o5
    stx %o7,[%fp+tmp_py]    ! tmp_py = %o7

    sub counter,5,counter
    st  counter,[%fp+tmp_counter]   ! tmp_counter = old counter - 5

    ba  .cont8
    or  %g0,5,counter       ! delay slot: counter = 5

    .align  16
! .update9: cap the current pass at 5 elements.
! If counter > 5: store %i1 - stridex to tmp_px, %o7 to tmp_py and
! counter - 5 to tmp_counter, then continue at .cont9 with counter = 5.
! If counter <= 5: continue at .cont9 with counter unchanged.
! %f17 is zeroed on both paths (the delay slot is not annulled).
! The save path overwrites %o5.
! NOTE(review): the branch into this stub and the later reload of the
! tmp_* slots are outside this excerpt - confirm the intent there.
.update9:
    cmp counter,5
    ble .cont9      ! signed counter <= 5: nothing to save
    fzeros  %f17        ! delay slot, runs on both paths: %f17 = 0

    sub %i1,stridex,%o5     ! %o5 = %i1 - stridex (one x stride back)
    stx %o5,[%fp+tmp_px]    ! tmp_px = %o5
    stx %o7,[%fp+tmp_py]    ! tmp_py = %o7

    sub counter,5,counter
    st  counter,[%fp+tmp_counter]   ! tmp_counter = old counter - 5

    ba  .cont9
    or  %g0,5,counter       ! delay slot: counter = 5

    .align  16
! .update10: cap the current pass at 6 elements.
! The first four instructions are pipeline work that executes before the
! counter test, so they run on both paths.
! If counter > 6: store %i1 to tmp_px, %o7 + stridey to tmp_py and
! counter - 6 to tmp_counter, then continue at .cont10 with counter = 6.
! If counter <= 6: continue at .cont10 with counter unchanged.
! %f17 is zeroed on both paths (the delay slot is not annulled).
! The save path overwrites %i5.
! NOTE(review): presumably the leading instructions stand in for work
! skipped at the branch site, which is outside this excerpt - confirm.
.update10:
    fmul8x16    SCALE,%f56,%f36 ! (3_1) db0 = vis_fmul8x16(SCALE, db0);
    and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff;
    ldd [%g1+8],%f56        ! (3_1) dtmp0 = ((double*)si0)[1];
    faddd   %f54,K1,%f54        ! (4_1) res0 += K1;

    cmp counter,6
    ble .cont10     ! signed counter <= 6: nothing to save
    fzeros  %f17        ! delay slot, runs on both paths: %f17 = 0

    stx %i1,[%fp+tmp_px]    ! tmp_px = %i1
    add %o7,stridey,%i5     ! %i5 = %o7 + stridey (overwrites %i5)
    stx %i5,[%fp+tmp_py]    ! tmp_py = %i5

    sub counter,6,counter
    st  counter,[%fp+tmp_counter]   ! tmp_counter = old counter - 6

    ba  .cont10
    or  %g0,6,counter       ! delay slot: counter = 6

    .align  16
! .update11: cap the current pass at 1 element.
! If counter > 1: store %i1 to tmp_px, %i5 to tmp_py and counter - 1
! to tmp_counter, then continue at .cont11 with counter = 1.
! If counter <= 1: continue at .cont11 with counter unchanged.
! %f17 is zeroed on both paths (the delay slot is not annulled).
! NOTE(review): the branch into this stub and the later reload of the
! tmp_* slots are outside this excerpt - confirm the intent there.
.update11:
    cmp counter,1
    ble .cont11     ! signed counter <= 1: nothing to save
    fzeros  %f17        ! delay slot, runs on both paths: %f17 = 0

    stx %i1,[%fp+tmp_px]    ! tmp_px = %i1
    stx %i5,[%fp+tmp_py]    ! tmp_py = %i5

    sub counter,1,counter
    st  counter,[%fp+tmp_counter]   ! tmp_counter = old counter - 1

    ba  .cont11
    or  %g0,1,counter       ! delay slot: counter = 1

    .align  16
! .update12: cap the current pass at 2 elements.
! If counter > 2: store %i0 to tmp_px, %i5 + stridey to tmp_py and
! counter - 2 to tmp_counter, then continue at .cont12 with counter = 2.
! If counter <= 2: continue at .cont12 with counter unchanged.
! %f8 (not %f17 as in most sibling stubs) is zeroed on both paths.
! The save path overwrites %o4.
! NOTE(review): tmp_px is written from %i0 here while most sibling stubs
! use %i1; the loop that branches here is outside this excerpt, so
! confirm which register holds the x pointer at that point.
.update12:
    cmp counter,2
    ble .cont12     ! signed counter <= 2: nothing to save
    fzeros  %f8     ! delay slot, runs on both paths: %f8 = 0

    stx %i0,[%fp+tmp_px]    ! tmp_px = %i0
    add %i5,stridey,%o4     ! %o4 = %i5 + stridey (overwrites %o4)
    stx %o4,[%fp+tmp_py]    ! tmp_py = %o4

    sub counter,2,counter
    st  counter,[%fp+tmp_counter]   ! tmp_counter = old counter - 2

    ba  .cont12
    or  %g0,2,counter       ! delay slot: counter = 2

    .align  16
! .update13: cap the current pass at 2 elements.
! If counter > 2: store %i0 to tmp_px, %o4 to tmp_py and counter - 2
! to tmp_counter, then continue at .cont13 with counter = 2.
! If counter <= 2: continue at .cont13 with counter unchanged.
! %f17 is zeroed on both paths (the delay slot is not annulled).
! NOTE(review): tmp_px is written from %i0 here while most sibling stubs
! use %i1; the loop that branches here is outside this excerpt, so
! confirm which register holds the x pointer at that point.
.update13:
    cmp counter,2
    ble .cont13     ! signed counter <= 2: nothing to save
    fzeros  %f17        ! delay slot, runs on both paths: %f17 = 0

    stx %i0,[%fp+tmp_px]    ! tmp_px = %i0
    stx %o4,[%fp+tmp_py]    ! tmp_py = %o4

    sub counter,2,counter
    st  counter,[%fp+tmp_counter]   ! tmp_counter = old counter - 2

    ba  .cont13
    or  %g0,2,counter       ! delay slot: counter = 2

    .align  16
! .update14: cap the current pass at 3 elements.
! If counter > 3: store %i1 to tmp_px, %o4 + stridey to tmp_py and
! counter - 3 to tmp_counter, then continue at .cont14 with counter = 3.
! If counter <= 3: continue at .cont14 with counter unchanged.
! %f17 is zeroed on both paths (the delay slot is not annulled).
! The save path overwrites %i5.
! NOTE(review): the branch into this stub and the later reload of the
! tmp_* slots are outside this excerpt - confirm the intent there.
.update14:
    cmp counter,3
    ble .cont14     ! signed counter <= 3: nothing to save
    fzeros  %f17        ! delay slot, runs on both paths: %f17 = 0

    stx %i1,[%fp+tmp_px]    ! tmp_px = %i1
    add %o4,stridey,%i5     ! %i5 = %o4 + stridey (overwrites %i5)
    stx %i5,[%fp+tmp_py]    ! tmp_py = %i5

    sub counter,3,counter
    st  counter,[%fp+tmp_counter]   ! tmp_counter = old counter - 3

    ba  .cont14
    or  %g0,3,counter       ! delay slot: counter = 3

    .align  16
! .update15: cap the current pass at 3 elements.
! If counter > 3: store %i1 - stridex to tmp_px, %i5 to tmp_py and
! counter - 3 to tmp_counter, then continue at .cont15 with counter = 3.
! If counter <= 3: continue at .cont15 with counter unchanged.
! %f17 is zeroed on both paths (the delay slot is not annulled).
! The save path overwrites %i2.
! NOTE(review): the branch into this stub and the later reload of the
! tmp_* slots are outside this excerpt - confirm the intent there.
.update15:
    cmp counter,3
    ble .cont15     ! signed counter <= 3: nothing to save
    fzeros  %f17        ! delay slot, runs on both paths: %f17 = 0

    sub %i1,stridex,%i2     ! %i2 = %i1 - stridex (one x stride back)
    stx %i2,[%fp+tmp_px]    ! tmp_px = %i2
    stx %i5,[%fp+tmp_py]    ! tmp_py = %i5

    sub counter,3,counter
    st  counter,[%fp+tmp_counter]   ! tmp_counter = old counter - 3

    ba  .cont15
    or  %g0,3,counter       ! delay slot: counter = 3

    .align  16
! .update16: cap the current pass at 4 elements.
! The leading faddd executes before the counter test, so it runs on both
! paths.
! If counter > 4: store %i1 to tmp_px, %i2 to tmp_py and counter - 4
! to tmp_counter, then continue at .cont16 with counter = 4.
! If counter <= 4: continue at .cont16 with counter unchanged.
! %f17 is zeroed on both paths (the delay slot is not annulled).
! NOTE(review): presumably the leading faddd stands in for work skipped
! at the branch site, which is outside this excerpt - confirm.
.update16:
    faddd   %f40,%f32,%f18      ! (1_0) db0 = dx0 + dy0;
    cmp counter,4
    ble .cont16     ! signed counter <= 4: nothing to save
    fzeros  %f17        ! delay slot, runs on both paths: %f17 = 0

    stx %i1,[%fp+tmp_px]    ! tmp_px = %i1
    stx %i2,[%fp+tmp_py]    ! tmp_py = %i2

    sub counter,4,counter
    st  counter,[%fp+tmp_counter]   ! tmp_counter = old counter - 4

    ba  .cont16
    or  %g0,4,counter       ! delay slot: counter = 4

    .align  16
! .update17: cap the current pass at 4 elements.
! If counter > 4: store %i1 to tmp_px, %i2 to tmp_py and counter - 4
! to tmp_counter, then continue at .cont17 with counter = 4.
! If counter <= 4: continue at .cont17 with counter unchanged.
! %f17 is zeroed on both paths (the delay slot is not annulled).
! NOTE(review): the branch into this stub and the later reload of the
! tmp_* slots are outside this excerpt - confirm the intent there.
.update17:
    cmp counter,4
    ble .cont17     ! signed counter <= 4: nothing to save
    fzeros  %f17        ! delay slot, runs on both paths: %f17 = 0

    stx %i1,[%fp+tmp_px]    ! tmp_px = %i1
    stx %i2,[%fp+tmp_py]    ! tmp_py = %i2

    sub counter,4,counter
    st  counter,[%fp+tmp_counter]   ! tmp_counter = old counter - 4

    ba  .cont17
    or  %g0,4,counter       ! delay slot: counter = 4

    .align  16
! .update18: cap the current pass at 5 elements.
! If counter > 5: store %l7 to tmp_px, %o7 to tmp_py and counter - 5
! to tmp_counter, then continue at .cont18 with counter = 5.
! If counter <= 5: continue at .cont18 with counter unchanged.
! %f17 is zeroed on both paths (the delay slot is not annulled).
! NOTE(review): tmp_px is written from %l7 here while most sibling stubs
! use %i1; the loop that branches here is outside this excerpt, so
! confirm which register holds the x pointer at that point.
.update18:
    cmp counter,5
    ble .cont18     ! signed counter <= 5: nothing to save
    fzeros  %f17        ! delay slot, runs on both paths: %f17 = 0

    stx %l7,[%fp+tmp_px]    ! tmp_px = %l7
    stx %o7,[%fp+tmp_py]    ! tmp_py = %o7

    sub counter,5,counter
    st  counter,[%fp+tmp_counter]   ! tmp_counter = old counter - 5

    ba  .cont18
    or  %g0,5,counter       ! delay slot: counter = 5

    .align  16
! .update19: cap the current pass at 5 elements.
! The leading fpadd32 executes before the counter test, so it runs on
! both paths.
! If counter > 5: store %l7 to tmp_px, %o7 to tmp_py and counter - 5
! to tmp_counter, then continue at .cont19 with counter = 5.
! If counter <= 5: continue at .cont19 with counter unchanged.
! %f17 is zeroed on both paths (the delay slot is not annulled).
! NOTE(review): tmp_px is written from %l7 here while most sibling stubs
! use %i1; the loop that branches here is outside this excerpt, so
! confirm which register holds the x pointer at that point.
.update19:
    fpadd32 %f40,DA1,%f62       ! (2_1) db0 = vis_fpadd32(db0,DA1);
    cmp counter,5
    ble .cont19     ! signed counter <= 5: nothing to save
    fzeros  %f17        ! delay slot, runs on both paths: %f17 = 0

    stx %l7,[%fp+tmp_px]    ! tmp_px = %l7
    stx %o7,[%fp+tmp_py]    ! tmp_py = %o7

    sub counter,5,counter
    st  counter,[%fp+tmp_counter]   ! tmp_counter = old counter - 5

    ba  .cont19
    or  %g0,5,counter       ! delay slot: counter = 5

    .align  16
! .update19a: cap the current pass at 5 elements.
! Same save sequence as .update19 but without the leading fpadd32, and
! it continues at .cont19a.
! If counter > 5: store %l7 to tmp_px, %o7 to tmp_py and counter - 5
! to tmp_counter, then continue at .cont19a with counter = 5.
! If counter <= 5: continue at .cont19a with counter unchanged.
! %f17 is zeroed on both paths (the delay slot is not annulled).
! NOTE(review): tmp_px is written from %l7 here while most sibling stubs
! use %i1; the loop that branches here is outside this excerpt, so
! confirm which register holds the x pointer at that point.
.update19a:
    cmp counter,5
    ble .cont19a        ! signed counter <= 5: nothing to save
    fzeros  %f17        ! delay slot, runs on both paths: %f17 = 0

    stx %l7,[%fp+tmp_px]    ! tmp_px = %l7
    stx %o7,[%fp+tmp_py]    ! tmp_py = %o7

    sub counter,5,counter
    st  counter,[%fp+tmp_counter]   ! tmp_counter = old counter - 5

    ba  .cont19a
    or  %g0,5,counter       ! delay slot: counter = 5

    .align  16
! .update20: cap the current pass at 6 elements.
! The leading faddd executes before the counter test, so it runs on both
! paths.
! If counter > 6: store %i1 to tmp_px, %o7 + stridey to tmp_py and
! counter - 6 to tmp_counter, then continue at .cont20 with counter = 6.
! If counter <= 6: continue at .cont20 with counter unchanged.
! %f17 is zeroed on both paths (the delay slot is not annulled).
! The save path overwrites %g1.
! NOTE(review): presumably the leading faddd stands in for work skipped
! at the branch site, which is outside this excerpt - confirm.
.update20:
    faddd   %f54,K1,%f54        ! (4_1) res0 += K1;
    cmp counter,6
    ble .cont20     ! signed counter <= 6: nothing to save
    fzeros  %f17        ! delay slot, runs on both paths: %f17 = 0

    stx %i1,[%fp+tmp_px]    ! tmp_px = %i1
    add %o7,stridey,%g1     ! %g1 = %o7 + stridey (overwrites %g1)
    stx %g1,[%fp+tmp_py]    ! tmp_py = %g1

    sub counter,6,counter
    st  counter,[%fp+tmp_counter]   ! tmp_counter = old counter - 6

    ba  .cont20
    or  %g0,6,counter       ! delay slot: counter = 6

! .exit: common return point of the function.
.exit:
    ret             ! return to the caller
    restore         ! delay slot: restore the caller register window
    SET_SIZE(__vhypotf)