/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* .seg "data"
* .asciz "Copyr 1987 Sun Micro"
* .align 4
*/
.seg "text"
#ident "%Z%%M% %I% %E% SMI"
! Copyright (c) 1987 by Sun Microsystems, Inc.
#include <sys/asm_linkage.h>
/*
* procedure to perform a 32 by 32 unsigned integer multiply.
* pass the multiplier into %o0, and the multiplicand into %o1
* the least significant 32 bits of the result will be returned in %o0,
* and the most significant in %o1
*
* Most unsigned integer multiplies involve small numbers, so it is
* worthwhile to optimize for short multiplies at the expense of long
* multiplies. This code checks the size of the multiplier, and has
* special cases for the following:
*
* 4 or fewer bit multipliers: 19 or 21 instruction cycles
* 8 or fewer bit multipliers: 26 or 28 instruction cycles
* 12 or fewer bit multipliers: 34 or 36 instruction cycles
* 16 or fewer bit multipliers: 42 or 44 instruction cycles
*
* Long multipliers require 58 or 60 instruction cycles:
*
* This code indicates that overflow has occured, by leaving the Z condition
* code clear. The following call sequence would be used if you wish to
* deal with overflow:
*
* call .umul
* nop ( or set up last parameter here )
* bnz overflow_code (or tnz to overflow handler)
*/
! RTENTRY(.umul)
.global .umul
.umul:
wr %o0, %y ! multiplier to Y register
andncc %o0, 0xf, %o4 ! mask out lower 4 bits; if branch
! taken, %o4, N and V have been cleared
be umul_4bit ! 4-bit multiplier
sethi %hi(0xffff0000), %o5 ! mask for 16-bit case; have to
! wait 3 instructions after wd
! before %y has stabilized anyway
andncc %o0, 0xff, %o4
be,a umul_8bit ! 8-bit multiplier
mulscc %o4, %o1, %o4 ! first iteration of 9
andncc %o0, 0xfff, %o4
be,a umul_12bit ! 12-bit multiplier
mulscc %o4, %o1, %o4 ! first iteration of 13
andcc %o0, %o5, %o4
be,a umul_16bit ! 16-bit multiplier
mulscc %o4, %o1, %o4 ! first iteration of 17
andcc %g0, %g0, %o4 ! zero the partial product
! and clear N and V conditions
!
! long multiply
!
mulscc %o4, %o1, %o4 ! first iteration of 33
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4 ! 32nd iteration
mulscc %o4, %g0, %o4 ! last iteration only shifts
!
! For unsigned multiplies, a pure shifty-add approach yields the
! correct result. Signed multiplies introduce complications.
!
! With 32-bit twos-complement numbers, -x can be represented as
!
! ((2 - (x/(2**32)) mod 2) * 2**32.
!
! To simplify the equations, the radix point can be moved to just
! to the left of the sign bit. So:
!
! x * y = (xy) mod 2
! -x * y = (2 - x) mod 2 * y = (2y - xy) mod 2
! x * -y = x * (2 - y) mod 2 = (2x - xy) mod 2
! -x * -y = (2 - x) * (2 - y) = (4 - 2x - 2y + xy) mod 2
!
! Because of the way the shift into the partial product is calculated
! (N xor V), the extra term is automagically removed for negative
! multiplicands, so no adjustment is necessary.
!
! But for unsigned multiplies, the high-order bit of the multiplicand
! is incorrectly treated as a sign bit. For unsigned multiplies where
! the high-order bit of the multiplicand is one, the result is
!
! xy - y * (2**32)
!
! we fix that here
!
tst %o1
bge 1f
nop
add %o4, %o0, %o4 ! add (2**32) * %o0; bits 63-32
! of the product are in %o4
!
! The multiply hasn't overflowed if the high-order bits are 0
!
! if you are not interested in detecting overflow,
! replace the following code with:
!
! 1:
! rd %y, %o0
! retl
! mov %o4, %o1
!
1:
rd %y, %o0
retl ! leaf routine return
addcc %o4, %g0, %o1 ! return high-order bits and set Z if
! high order bits are 0
!
! 4-bit multiply
!
umul_4bit:
mulscc %o4, %o1, %o4 ! first iteration of 5
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4 ! 4th iteration
mulscc %o4, %g0, %o4 ! last iteration only shifts
rd %y, %o5
!
! The folowing code adds (2**32) * %o0 to the product if the
! multiplicand had it's high bit set (see 32-bit case for explanation)
!
tst %o1
bge 2f
sra %o4, 28, %o1 ! right shift high bits by 28 bits
add %o1, %o0, %o1
!
! The multiply hasn't overflowed if high-order bits are 0
!
! if you are not interested in detecting overflow,
! replace the following code with:
!
! 2:
! sll %o4, 4, %o0
! srl %o5, 28, %o5
! retl
! or %o5, %o0, %o0
!
2:
sll %o4, 4, %o0 ! left shift middle bits by 4 bits
srl %o5, 28, %o5 ! right shift low bits by 28 bits
or %o5, %o0, %o0 ! merge for true product
retl ! leaf routine return
tst %o1 ! set Z if high order bits are 0
!
! 8-bit multiply
!
umul_8bit:
mulscc %o4, %o1, %o4 ! second iteration of 9
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4 ! 8th iteration
mulscc %o4, %g0, %o4 ! last iteration only shifts
rd %y, %o5
!
! The folowing code adds (2**32) * %o0 to the product if the
! multiplicand had it's high bit set (see 32-bit case for explanation)
!
tst %o1
bge 3f
sra %o4, 24, %o1 ! right shift high bits by 24 bits
add %o1, %o0, %o1
!
! The multiply hasn't overflowed if high-order bits are 0
!
! if you are not interested in detecting overflow,
! replace the following code with:
!
! 3:
! sll %o4, 8, %o0
! srl %o5, 24, %o5
! retl
! or %o5, %o0, %o0
!
3:
sll %o4, 8, %o0 ! left shift middle bits by 8 bits
srl %o5, 24, %o5 ! right shift low bits by 24 bits
or %o5, %o0, %o0 ! merge for true product
retl ! leaf routine return
tst %o1 ! set Z if high order bits are 0
!
! 12-bit multiply
!
umul_12bit:
mulscc %o4, %o1, %o4 ! second iteration of 13
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4 ! 12th iteration
mulscc %o4, %g0, %o4 ! last iteration only shifts
rd %y, %o5
!
! The folowing code adds (2**32) * %o0 to the product if the
! multiplicand had it's high bit set (see 32-bit case for explanation)
!
tst %o1
bge 4f
sra %o4, 20, %o1 ! right shift high bits by 20 bits
add %o1, %o0, %o1
!
! The multiply hasn't overflowed if high-order bits are 0
!
! if you are not interested in detecting overflow,
! replace the following code with:
!
! 4:
! sll %o4, 12, %o0
! srl %o5, 20, %o5
! retl
! or %o5, %o0, %o0
!
4:
sll %o4, 12, %o0 ! left shift middle bits by 12 bits
srl %o5, 20, %o5 ! right shift low bits by 20 bits
or %o5, %o0, %o0 ! merge for true product
retl ! leaf routine return
tst %o1 ! set Z if high order bits are 0
!
! 16-bit multiply
!
umul_16bit:
mulscc %o4, %o1, %o4 ! second iteration of 17
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4
mulscc %o4, %o1, %o4 ! 16th iteration
mulscc %o4, %g0, %o4 ! last iteration only shifts
rd %y, %o5
!
! The folowing code adds (2**32) * %o0 to the product if the
! multiplicand had it's high bit set (see 32-bit case for explanation)
!
tst %o1
bge 5f
sra %o4, 16, %o1 ! right shift high bits by 16 bits
add %o1, %o0, %o1
!
! The multiply hasn't overflowed if high-order bits are 0
!
! if you are not interested in detecting overflow,
! replace the following code with:
!
! 5:
! sll %o4, 16, %o0
! srl %o5, 16, %o5
! retl
! or %o5, %o0, %o0
!
5:
sll %o4, 16, %o0 ! left shift middle bits by 16 bits
srl %o5, 16, %o5 ! right shift low bits by 16 bits
or %o5, %o0, %o0 ! merge for true product
retl ! leaf routine return
tst %o1 ! set Z if high order bits are 0