sgs/rtld.4.x/umultiply.s

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 *  .seg    "data"
 *  .asciz  "Copyr 1987 Sun Micro"
 *  .align  4
 */
    .seg    "text"

#ident  "%Z%%M% %I% %E% SMI"

!   Copyright (c) 1987 by Sun Microsystems, Inc.


#include <sys/asm_linkage.h>

/*
 * Procedure to perform a 32 by 32 unsigned integer multiply.
 * Pass the multiplier in %o0 and the multiplicand in %o1.
 * The least significant 32 bits of the result are returned in %o0,
 * and the most significant 32 bits in %o1.
 *
 * Most unsigned integer multiplies involve small numbers, so it is
 * worthwhile to optimize for short multiplies at the expense of long
 * multiplies.  This code checks the size of the multiplier, and has
 * special cases for the following:
 *
 *  4 or fewer bit multipliers: 19 or 21 instruction cycles
 *  8 or fewer bit multipliers: 26 or 28 instruction cycles
 *  12 or fewer bit multipliers:    34 or 36 instruction cycles
 *  16 or fewer bit multipliers:    42 or 44 instruction cycles
 *
 * Long multipliers require 58 or 60 instruction cycles.
 *
 * This code indicates that overflow has occurred by leaving the Z condition
 * code clear. The following call sequence would be used if you wish to
 * deal with overflow:
 *
 *      call    .umul
 *      nop     ( or set up last parameter here )
 *      bnz overflow_code   (or tnz to overflow handler)
 */

!   RTENTRY(.umul)
!
! .umul: unsigned 32 x 32 -> 64 bit multiply built from mulscc steps.
!
!   in:     %o0 = multiplier, %o1 = multiplicand
!   out:    %o0 = low 32 bits of the product, %o1 = high 32 bits,
!           Z condition code set only if the high 32 bits are zero
!   uses:   %o4 (partial product), %o5, %y and the condition codes
!
! The multiplier is classified by how many low-order bits it occupies
! so that only as many multiply steps as needed are run.  Each short
! case continues at its own umul_Nbit label and returns from there;
! anything wider than 16 bits falls through to the 33-step sequence.
!
    .global .umul
.umul:
    wr  %o0, %y         ! multiplier to Y register

    andncc  %o0, 0xf, %o4       ! mask out lower 4 bits; if branch
                    ! taken, %o4, N and V have been cleared

    be  umul_4bit       ! 4-bit multiplier
    sethi   %hi(0xffff0000), %o5    ! mask for 16-bit case; have to
                    ! wait 3 instructions after wr
                    ! before %y has stabilized anyway
                    ! (delay slot: runs on both paths)

    andncc  %o0, 0xff, %o4
    be,a    umul_8bit       ! 8-bit multiplier
    mulscc  %o4, %o1, %o4       ! first iteration of 9
                    ! (annulled unless the branch is taken)

    andncc  %o0, 0xfff, %o4
    be,a    umul_12bit      ! 12-bit multiplier
    mulscc  %o4, %o1, %o4       ! first iteration of 13
                    ! (annulled unless the branch is taken)

    andcc   %o0, %o5, %o4       ! test the upper 16 bits of the multiplier
    be,a    umul_16bit      ! 16-bit multiplier
    mulscc  %o4, %o1, %o4       ! first iteration of 17
                    ! (annulled unless the branch is taken)

    andcc   %g0, %g0, %o4       ! zero the partial product
                    ! and clear N and V conditions
    !
    ! long multiply
    !
    ! 32 add-and-shift steps, one per multiplier bit, followed by one
    ! shift-only step.  Afterwards %o4 holds the high word and %y the
    ! low word of the (not yet sign-corrected) product.
    !
    mulscc  %o4, %o1, %o4       ! first iteration of 33
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4       ! 32nd iteration
    mulscc  %o4, %g0, %o4       ! last iteration only shifts
    !
    ! For unsigned multiplies, a pure shift-and-add approach yields the
    ! correct result.  Signed multiplies introduce complications.
    !
    ! With 32-bit twos-complement numbers, -x can be represented as
    !
    !   ((2 - (x/(2**32)) mod 2) * 2**32.
    !
    ! To simplify the equations, the radix point can be moved to just
    ! to the left of the sign bit.  So:
    !
    !    x *  y = (xy) mod 2
    !   -x *  y = (2 - x) mod 2 * y = (2y - xy) mod 2
    !    x * -y = x * (2 - y) mod 2 = (2x - xy) mod 2
    !   -x * -y = (2 - x) * (2 - y) = (4 - 2x - 2y + xy) mod 2
    !
    ! Because of the way the shift into the partial product is calculated
    ! (N xor V), the extra term is automagically removed for negative
    ! multiplicands, so no adjustment is necessary.
    !
    ! But for unsigned multiplies, the high-order bit of the multiplicand
    ! is incorrectly treated as a sign bit.  For unsigned multiplies where
    ! the high-order bit of the multiplicand is one, the result is
    !
    !   xy - y * (2**32)
    !
    ! we fix that here
    !
    tst %o1             ! was the multiplicand high bit set?
    bge 1f              ! no: high word needs no correction
    nop

    add %o4, %o0, %o4       ! add (2**32) * %o0; bits 63-32
                    ! of the product are in %o4
    !
    ! The multiply hasn't overflowed if the high-order bits are 0
    !
    ! if you are not interested in detecting overflow,
    ! replace the following code with:
    !
    !   1:
    !       rd  %y, %o0
    !       retl
    !       mov %o4, %o1
    !
1:
    rd  %y, %o0         ! low-order 32 bits of the product
    retl                ! leaf routine return
    addcc   %o4, %g0, %o1       ! return high-order bits and set Z if
                    ! high order bits are 0
    !
    ! 4-bit multiply
    !
umul_4bit:
    ! Reached from .umul when the multiplier fits in 4 bits.  On entry
    ! %y holds the multiplier and the selecting andncc has left %o4 = 0
    ! with N and V clear, so all 5 steps are performed here.
    mulscc  %o4, %o1, %o4       ! first iteration of 5
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4       ! 4th iteration
    mulscc  %o4, %g0, %o4       ! last iteration only shifts

    ! Only 5 of the 33 possible steps were run, so the product is not
    ! yet aligned: %o4 holds product bits 4 and up, and the top 4 bits
    ! of %y hold product bits 3-0.  The code below realigns the pieces
    ! into %o1 (high word) and %o0 (low word).
    rd  %y, %o5
    !
    ! The following code adds (2**32) * %o0 to the product if the
    ! multiplicand had its high bit set (see 32-bit case for explanation)
    !
    tst %o1
    bge 2f
    sra %o4, 28, %o1        ! right shift high bits by 28 bits
                    ! (delay slot, always executed; the tst
                    ! above has already examined the
                    ! multiplicand, so %o1 can be reused)

    add %o1, %o0, %o1       ! high word += multiplier
    !
    ! The multiply hasn't overflowed if high-order bits are 0
    !
    ! if you are not interested in detecting overflow,
    ! replace the following code with:
    !
    !   2:
    !       sll %o4, 4, %o0
    !       srl %o5, 28, %o5
    !       retl
    !       or  %o5, %o0, %o0
    !
2:
    sll %o4, 4, %o0     ! left shift middle bits by 4 bits
    srl %o5, 28, %o5        ! right shift low bits by 28 bits
    or  %o5, %o0, %o0       ! merge for true product
    retl                ! leaf routine return
    tst %o1         ! set Z if high order bits are 0
    !
    ! 8-bit multiply
    !
umul_8bit:
    ! Reached from .umul when the multiplier fits in 8 bits.  The first
    ! of the 9 steps was already executed in the delay slot of the
    ! branch that came here, starting from %o4 = 0 with N and V clear.
    mulscc  %o4, %o1, %o4       ! second iteration of 9
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4       ! 8th iteration
    mulscc  %o4, %g0, %o4       ! last iteration only shifts

    ! Only 9 of the 33 possible steps were run, so the product is not
    ! yet aligned: %o4 holds product bits 8 and up, and the top 8 bits
    ! of %y hold product bits 7-0.  The code below realigns the pieces
    ! into %o1 (high word) and %o0 (low word).
    rd  %y, %o5
    !
    ! The following code adds (2**32) * %o0 to the product if the
    ! multiplicand had its high bit set (see 32-bit case for explanation)
    !
    tst %o1
    bge 3f
    sra %o4, 24, %o1        ! right shift high bits by 24 bits
                    ! (delay slot, always executed; the tst
                    ! above has already examined the
                    ! multiplicand, so %o1 can be reused)

    add %o1, %o0, %o1       ! high word += multiplier
    !
    ! The multiply hasn't overflowed if high-order bits are 0
    !
    ! if you are not interested in detecting overflow,
    ! replace the following code with:
    !
    !   3:
    !       sll %o4, 8, %o0
    !       srl %o5, 24, %o5
    !       retl
    !       or  %o5, %o0, %o0
    !
3:
    sll %o4, 8, %o0     ! left shift middle bits by 8 bits
    srl %o5, 24, %o5        ! right shift low bits by 24 bits
    or  %o5, %o0, %o0       ! merge for true product
    retl                ! leaf routine return
    tst %o1         ! set Z if high order bits are 0
    !
    ! 12-bit multiply
    !
umul_12bit:
    ! Reached from .umul when the multiplier fits in 12 bits.  The first
    ! of the 13 steps was already executed in the delay slot of the
    ! branch that came here, starting from %o4 = 0 with N and V clear.
    mulscc  %o4, %o1, %o4       ! second iteration of 13
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4       ! 12th iteration
    mulscc  %o4, %g0, %o4       ! last iteration only shifts

    ! Only 13 of the 33 possible steps were run, so the product is not
    ! yet aligned: %o4 holds product bits 12 and up, and the top 12 bits
    ! of %y hold product bits 11-0.  The code below realigns the pieces
    ! into %o1 (high word) and %o0 (low word).
    rd  %y, %o5
    !
    ! The following code adds (2**32) * %o0 to the product if the
    ! multiplicand had its high bit set (see 32-bit case for explanation)
    !
    tst %o1
    bge 4f
    sra %o4, 20, %o1        ! right shift high bits by 20 bits
                    ! (delay slot, always executed; the tst
                    ! above has already examined the
                    ! multiplicand, so %o1 can be reused)

    add %o1, %o0, %o1       ! high word += multiplier
    !
    ! The multiply hasn't overflowed if high-order bits are 0
    !
    ! if you are not interested in detecting overflow,
    ! replace the following code with:
    !
    !   4:
    !       sll %o4, 12, %o0
    !       srl %o5, 20, %o5
    !       retl
    !       or  %o5, %o0, %o0
    !
4:
    sll %o4, 12, %o0        ! left shift middle bits by 12 bits
    srl %o5, 20, %o5        ! right shift low bits by 20 bits
    or  %o5, %o0, %o0       ! merge for true product
    retl                ! leaf routine return
    tst %o1         ! set Z if high order bits are 0
    !
    ! 16-bit multiply
    !
umul_16bit:
    ! Reached from .umul when the multiplier fits in 16 bits.  The first
    ! of the 17 steps was already executed in the delay slot of the
    ! branch that came here, starting from %o4 = 0 with N and V clear.
    mulscc  %o4, %o1, %o4       ! second iteration of 17
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4
    mulscc  %o4, %o1, %o4       ! 16th iteration
    mulscc  %o4, %g0, %o4       ! last iteration only shifts

    ! Only 17 of the 33 possible steps were run, so the product is not
    ! yet aligned: %o4 holds product bits 16 and up, and the top 16 bits
    ! of %y hold product bits 15-0.  The code below realigns the pieces
    ! into %o1 (high word) and %o0 (low word).
    rd  %y, %o5
    !
    ! The following code adds (2**32) * %o0 to the product if the
    ! multiplicand had its high bit set (see 32-bit case for explanation)
    !
    tst %o1
    bge 5f
    sra %o4, 16, %o1        ! right shift high bits by 16 bits
                    ! (delay slot, always executed; the tst
                    ! above has already examined the
                    ! multiplicand, so %o1 can be reused)

    add %o1, %o0, %o1       ! high word += multiplier
    !
    ! The multiply hasn't overflowed if high-order bits are 0
    !
    ! if you are not interested in detecting overflow,
    ! replace the following code with:
    !
    !   5:
    !       sll %o4, 16, %o0
    !       srl %o5, 16, %o5
    !       retl
    !       or  %o5, %o0, %o0
    !
5:
    sll %o4, 16, %o0        ! left shift middle bits by 16 bits
    srl %o5, 16, %o5        ! right shift low bits by 16 bits
    or  %o5, %o0, %o0       ! merge for true product
    retl                ! leaf routine return
    tst %o1         ! set Z if high order bits are 0