/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
*/
#include <sys/asm_linkage.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#if defined(__lint)
uint32_t
bignum_use_sse2(void)
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
void
big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
{}

/* ARGSUSED */
void
big_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)
{}
#if defined(MMX_MANAGE)

/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
void
big_sqr_vec_sse2_fc(uint32_t *r, uint32_t *a, int len)
{}

#endif /* MMX_MANAGE */
/*
 * UMUL
 *
 */

/* ARGSUSED */
uint32_t
big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }
#else /* __lint */
#if defined(MMX_MANAGE)
#if defined(_KERNEL)

#define	TEST_TS(reg)			\
	movl	%cr0, reg;		\
	clts;				\
	testl	$CR0_TS, reg

#else /* _KERNEL */
#define KPREEMPT_DISABLE
#define KPREEMPT_ENABLE
#endif /* _KERNEL */
#endif /* MMX_MANAGE */
/ This file provides two implementations of each of
/	big_mul_set_vec()
/	big_mul_add_vec()
/	big_mul_vec()
/	big_sqr_vec()
/ one set for x86 models with the SSE2 instruction set extensions, and
/ one (UMUL) that uses only the basic 32 X 32 -> 64 MUL instruction.
/
/ Note:
/   The choice between the two sets is made at run time, based on the
/   return value of bignum_use_sse2().
/
/ uint32_t
/ bignum_use_sse2()
/
/ Returns non-zero if the SSE2 instruction set extensions are available.
#if defined(_KERNEL)
#else /* _KERNEL */
#endif /* _KERNEL */
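/
/ For reference, a user-level C routine with the same intent could be
/ written with the compiler's <cpuid.h> helpers.  This is only an
/ illustrative sketch (GCC/clang __get_cpuid() and bit_SSE2 are assumed);
/ it is not the code assembled here.
/
/	#include <cpuid.h>
/	#include <stdint.h>
/
/	uint32_t
/	bignum_use_sse2_sketch(void)
/	{
/		unsigned int eax, ebx, ecx, edx;
/
/		/* CPUID leaf 1: feature flags; SSE2 is bit 26 of EDX */
/		if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) == 0)
/			return (0);
/		return ((edx & bit_SSE2) != 0);
/	}
/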
/ ------------------------------------------------------------------------
/		MMX/SSE2 Implementations
/ ------------------------------------------------------------------------
/
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/
/ uint32_t
/ big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	%edx
/ a	%ebx
/ len	%ecx
/ digit	%mm3
/
/ N.B.:
/   All parameters are passed in registers; this routine does not use the
/   stack-based C calling convention and must not be called from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number. It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ the final code.
#define UNROLL 8
#define UNROLL32 32
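/
/ For reference, each unrolled instance below computes the following C,
/ using a 64-bit intermediate product (an illustrative sketch, not code
/ that is assembled here):
/
/	p = (uint64_t)digit * a[i] + cy;	/ pmuludq %mm3, %mm1; paddq %mm1, %mm0
/	r[i] = (uint32_t)p;			/ movd %mm0, (r)
/	cy = (uint32_t)(p >> 32);		/ psrlq $32, %mm0
/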
ENTRY(big_mul_set_vec_sse2_r)
xorl %eax, %eax / if (len == 0) return (0);
testl %ecx, %ecx
jz .L17
pxor %mm0, %mm0 / cy = 0
.L15:
cmpl $UNROLL, %ecx
jl .L16
movd 0(%ebx), %mm1 / 1: mm1 = a[i]
pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
paddq %mm1, %mm0 / 1: mm0 = digit * a[i] + cy;
movd 4(%ebx), %mm1 / 2: mm1 = a[i]
movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
psrlq $32, %mm0 / 1: cy = product[63..32]
pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
paddq %mm1, %mm0 / 2: mm0 = digit * a[i] + cy;
movd 8(%ebx), %mm1 / 3: mm1 = a[i]
movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
psrlq $32, %mm0 / 2: cy = product[63..32]
pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
paddq %mm1, %mm0 / 3: mm0 = digit * a[i] + cy;
movd 12(%ebx), %mm1 / 4: mm1 = a[i]
movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
psrlq $32, %mm0 / 3: cy = product[63..32]
pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
paddq %mm1, %mm0 / 4: mm0 = digit * a[i] + cy;
movd 16(%ebx), %mm1 / 5: mm1 = a[i]
movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
psrlq $32, %mm0 / 4: cy = product[63..32]
pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
paddq %mm1, %mm0 / 5: mm0 = digit * a[i] + cy;
movd 20(%ebx), %mm1 / 6: mm1 = a[i]
movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
psrlq $32, %mm0 / 5: cy = product[63..32]
pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
paddq %mm1, %mm0 / 6: mm0 = digit * a[i] + cy;
movd 24(%ebx), %mm1 / 7: mm1 = a[i]
movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
psrlq $32, %mm0 / 6: cy = product[63..32]
pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
paddq %mm1, %mm0 / 7: mm0 = digit * a[i] + cy;
movd 28(%ebx), %mm1 / 8: mm1 = a[i]
movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
psrlq $32, %mm0 / 7: cy = product[63..32]
pmuludq %mm3, %mm1 / 8: mm1 = digit * a[i]
paddq %mm1, %mm0 / 8: mm0 = digit * a[i] + cy;
movd %mm0, 28(%edx) / 8: r[i] = product[31..0]
psrlq $32, %mm0 / 8: cy = product[63..32]
leal UNROLL32(%ebx), %ebx / a += UNROLL
leal UNROLL32(%edx), %edx / r += UNROLL
subl $UNROLL, %ecx / len -= UNROLL
jz .L17
jmp .L15
.L16:
movd 0(%ebx), %mm1 / 1: mm1 = a[i]
pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
paddq %mm1, %mm0 / 1: mm0 = digit * a[i] + cy;
movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
psrlq $32, %mm0 / 1: cy = product[63..32]
subl $1, %ecx
jz .L17
movd 4(%ebx), %mm1 / 2: mm1 = a[i]
pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
paddq %mm1, %mm0 / 2: mm0 = digit * a[i] + cy;
movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
psrlq $32, %mm0 / 2: cy = product[63..32]
subl $1, %ecx
jz .L17
movd 8(%ebx), %mm1 / 3: mm1 = a[i]
pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
paddq %mm1, %mm0 / 3: mm0 = digit * a[i] + cy;
movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
psrlq $32, %mm0 / 3: cy = product[63..32]
subl $1, %ecx
jz .L17
movd 12(%ebx), %mm1 / 4: mm1 = a[i]
pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
paddq %mm1, %mm0 / 4: mm0 = digit * a[i] + cy;
movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
psrlq $32, %mm0 / 4: cy = product[63..32]
subl $1, %ecx
jz .L17
movd 16(%ebx), %mm1 / 5: mm1 = a[i]
pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
paddq %mm1, %mm0 / 5: mm0 = digit * a[i] + cy;
movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
psrlq $32, %mm0 / 5: cy = product[63..32]
subl $1, %ecx
jz .L17
movd 20(%ebx), %mm1 / 6: mm1 = a[i]
pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
paddq %mm1, %mm0 / 6: mm0 = digit * a[i] + cy;
movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
psrlq $32, %mm0 / 6: cy = product[63..32]
subl $1, %ecx
jz .L17
movd 24(%ebx), %mm1 / 7: mm1 = a[i]
pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
paddq %mm1, %mm0 / 7: mm0 = digit * a[i] + cy;
movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
psrlq $32, %mm0 / 7: cy = product[63..32]
.L17:
movd %mm0, %eax / return (cy)
/ no emms. caller is responsible for emms
ret
SET_SIZE(big_mul_set_vec_sse2_r)
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r 8(%ebp) %edx
/ a 12(%ebp) %ebx
/ len 16(%ebp) %ecx
/ digit 20(%ebp) %mm3
/
/ In userland, there is just the one function, big_mul_set_vec_sse2().
/ But in the kernel, there are two variations:
/ 1. big_mul_set_vec_sse2() which does what is necessary to save and
/    restore state, if necessary, and to ensure that preemption is
/    disabled (see the pseudocode sketch below).
/ 2. big_mul_set_vec_sse2_nsv() which just does the work; the caller is
/    responsible for saving and restoring the MMX state and for disabling
/    preemption.
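/
/ In outline, the MMX_MANAGE (kernel) wrapper behaves like the following
/ pseudocode; this is an illustrative sketch only (helper names such as
/ save_mmx_registers() are placeholders).  Compare the big_sqr_vec_sse2()
/ wrapper near the end of this file:
/
/	kpreempt_disable();
/	cr0 = read_cr0();
/	clts();				/ use MMX without taking a #NM trap
/	if ((cr0 & CR0_TS) == 0)
/		save_mmx_registers();	/ someone's FPU/MMX state is live
/	cy = big_mul_set_vec_sse2_r(r, a, len, digit);
/	if ((cr0 & CR0_TS) == 0)
/		restore_mmx_registers();
/	write_cr0(cr0);			/ put the original TS bit back
/	kpreempt_enable();
/	return (cy);
/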
#if defined(MMX_MANAGE)
#else /* !defined(MMX_MANAGE) */
/
#endif /* MMX_MANAGE */
/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/
/ uint32_t
/ big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	%edx
/ a	%ebx
/ len	%ecx
/ digit	%mm3
/
/ N.B.:
/   As with big_mul_set_vec_sse2_r(), all parameters are passed in
/   registers; this routine must not be called from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number. It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ the final code.
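/
/ For reference, each unrolled instance below computes (illustrative
/ sketch; the only difference from big_mul_set_vec_sse2_r() is the
/ extra r[i] term):
/
/	p = (uint64_t)digit * a[i] + r[i] + cy;
/	r[i] = (uint32_t)p;
/	cy = (uint32_t)(p >> 32);
/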
ENTRY(big_mul_add_vec_sse2_r)
xorl %eax, %eax
testl %ecx, %ecx
jz .L27
pxor %mm0, %mm0 / cy = 0
.L25:
cmpl $UNROLL, %ecx
jl .L26
movd 0(%ebx), %mm1 / 1: mm1 = a[i]
movd 0(%edx), %mm2 / 1: mm2 = r[i]
pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
paddq %mm1, %mm2 / 1: mm2 = digit * a[i] + r[i]
movd 4(%ebx), %mm1 / 2: mm1 = a[i]
paddq %mm2, %mm0 / 1: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
movd 4(%edx), %mm2 / 2: mm2 = r[i]
psrlq $32, %mm0 / 1: cy = product[63..32]
pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
paddq %mm1, %mm2 / 2: mm2 = digit * a[i] + r[i]
movd 8(%ebx), %mm1 / 3: mm1 = a[i]
paddq %mm2, %mm0 / 2: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
movd 8(%edx), %mm2 / 3: mm2 = r[i]
psrlq $32, %mm0 / 2: cy = product[63..32]
pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
paddq %mm1, %mm2 / 3: mm2 = digit * a[i] + r[i]
movd 12(%ebx), %mm1 / 4: mm1 = a[i]
paddq %mm2, %mm0 / 3: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
movd 12(%edx), %mm2 / 4: mm2 = r[i]
psrlq $32, %mm0 / 3: cy = product[63..32]
pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
paddq %mm1, %mm2 / 4: mm2 = digit * a[i] + r[i]
movd 16(%ebx), %mm1 / 5: mm1 = a[i]
paddq %mm2, %mm0 / 4: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
movd 16(%edx), %mm2 / 5: mm2 = r[i]
psrlq $32, %mm0 / 4: cy = product[63..32]
pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
paddq %mm1, %mm2 / 5: mm2 = digit * a[i] + r[i]
movd 20(%ebx), %mm1 / 6: mm1 = a[i]
paddq %mm2, %mm0 / 5: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
movd 20(%edx), %mm2 / 6: mm2 = r[i]
psrlq $32, %mm0 / 5: cy = product[63..32]
pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
paddq %mm1, %mm2 / 6: mm2 = digit * a[i] + r[i]
movd 24(%ebx), %mm1 / 7: mm1 = a[i]
paddq %mm2, %mm0 / 6: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
movd 24(%edx), %mm2 / 7: mm2 = r[i]
psrlq $32, %mm0 / 6: cy = product[63..32]
pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
paddq %mm1, %mm2 / 7: mm2 = digit * a[i] + r[i]
movd 28(%ebx), %mm1 / 8: mm1 = a[i]
paddq %mm2, %mm0 / 7: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
movd 28(%edx), %mm2 / 8: mm2 = r[i]
psrlq $32, %mm0 / 7: cy = product[63..32]
pmuludq %mm3, %mm1 / 8: mm1 = digit * a[i]
paddq %mm1, %mm2 / 8: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 8: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 28(%edx) / 8: r[i] = product[31..0]
psrlq $32, %mm0 / 8: cy = product[63..32]
leal UNROLL32(%ebx), %ebx / a += UNROLL
leal UNROLL32(%edx), %edx / r += UNROLL
subl $UNROLL, %ecx / len -= UNROLL
jz .L27
jmp .L25
.L26:
movd 0(%ebx), %mm1 / 1: mm1 = a[i]
movd 0(%edx), %mm2 / 1: mm2 = r[i]
pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
paddq %mm1, %mm2 / 1: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 1: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
psrlq $32, %mm0 / 1: cy = product[63..32]
subl $1, %ecx
jz .L27
movd 4(%ebx), %mm1 / 2: mm1 = a[i]
movd 4(%edx), %mm2 / 2: mm2 = r[i]
pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
paddq %mm1, %mm2 / 2: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 2: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
psrlq $32, %mm0 / 2: cy = product[63..32]
subl $1, %ecx
jz .L27
movd 8(%ebx), %mm1 / 3: mm1 = a[i]
movd 8(%edx), %mm2 / 3: mm2 = r[i]
pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
paddq %mm1, %mm2 / 3: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 3: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
psrlq $32, %mm0 / 3: cy = product[63..32]
subl $1, %ecx
jz .L27
movd 12(%ebx), %mm1 / 4: mm1 = a[i]
movd 12(%edx), %mm2 / 4: mm2 = r[i]
pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
paddq %mm1, %mm2 / 4: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 4: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
psrlq $32, %mm0 / 4: cy = product[63..32]
subl $1, %ecx
jz .L27
movd 16(%ebx), %mm1 / 5: mm1 = a[i]
movd 16(%edx), %mm2 / 5: mm2 = r[i]
pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
paddq %mm1, %mm2 / 5: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 5: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
psrlq $32, %mm0 / 5: cy = product[63..32]
subl $1, %ecx
jz .L27
movd 20(%ebx), %mm1 / 6: mm1 = a[i]
movd 20(%edx), %mm2 / 6: mm2 = r[i]
pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
paddq %mm1, %mm2 / 6: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 6: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
psrlq $32, %mm0 / 6: cy = product[63..32]
subl $1, %ecx
jz .L27
movd 24(%ebx), %mm1 / 7: mm1 = a[i]
movd 24(%edx), %mm2 / 7: mm2 = r[i]
pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
paddq %mm1, %mm2 / 7: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 7: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
psrlq $32, %mm0 / 7: cy = product[63..32]
.L27:
movd %mm0, %eax
/ no emms. caller is responsible for emms
ret
SET_SIZE(big_mul_add_vec_sse2_r)
/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r 8(%ebp) %edx
/ a 12(%ebp) %ebx
/ len 16(%ebp) %ecx
/ digit 20(%ebp) %mm3
/
/ In userland, there is just the one function, big_mul_add_vec_sse2().
/ But in the kernel, there are two variations:
/ 1. big_mul_add_vec_sse2() which does what is necessary to save and
/    restore state, if necessary, and to ensure that preemption is
/    disabled.
/ 2. big_mul_add_vec_sse2_nsv() which just does the work; the caller is
/    responsible for saving and restoring the MMX state and for disabling
/    preemption.
#if defined(MMX_MANAGE)
#else /* !defined(MMX_MANAGE) */
#endif /* MMX_MANAGE */
/ void
/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/ {
/	int i;
/
/	r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
/	for (i = 1; i < blen; ++i)
/		r[alen + i] = big_mul_add_vec_sse2(&r[i], a, alen, b[i]);
/ }
#if defined(MMX_MANAGE)
#else
#endif
#if defined(MMX_MANAGE)
#else
#endif
.align 16
#if defined(MMX_MANAGE)
#else
#endif
#if defined(MMX_MANAGE)
#endif
#if defined(MMX_MANAGE)
#else
#endif
#if defined(MMX_MANAGE)
#endif /* MMX_MANAGE */
/
#if defined(MMX_MANAGE)
#else
#endif
/ /* High-level vector C pseudocode */
/
/ for (i = 1; i < alen - 1; ++i)
/	r[2*i + 1 .. ] += a[i] * a[i+1 .. alen-1]
/
/ /* Same thing, but slightly lower level C-like pseudocode */
/
/ i = 1;
/ r = &arg_r[2 * i + 1];
/ a = &arg_a[i + 1];
/ cnt = alen - 2;
/ while (cnt != 0) {
/	r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, a[-1]);
/	r += 2;
/	++a;
/	--cnt;
/ }
/
/ /* Same thing, but even lower level
/  * For example, pointers are raw pointers,
/  * with no scaling by object size.
/  */
/
/ r = arg_r + 12;
/ a = arg_a + 8;
/ cnt = alen - 2;
/ while (cnt != 0) {
/	cy = big_mul_add_vec_sse2_r();
/	*(uint32_t *)(r + cnt * 4) = cy;
/	r += 8;
/	a += 4;
/	--cnt;
/ }
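/
/ For reference, the overall squaring strategy corresponds to the plain C
/ sketch below (illustrative only; the assembly interleaves these stages,
/ reuses big_mul_add_vec_sse2_r() for the cross products, and
/ special-cases the low and high corners):
/
/	static void
/	sqr_vec_sketch(uint32_t *r, const uint32_t *a, int alen)
/	{
/		uint64_t p, cy;
/		int i, j;
/
/		/* off-diagonal products: r[i+j] += a[i] * a[j] for i < j */
/		for (i = 0; i < 2 * alen; ++i)
/			r[i] = 0;
/		for (i = 0; i < alen - 1; ++i) {
/			cy = 0;
/			for (j = i + 1; j < alen; ++j) {
/				p = (uint64_t)a[i] * a[j] + r[i + j] + cy;
/				r[i + j] = (uint32_t)p;
/				cy = p >> 32;
/			}
/			r[i + alen] = (uint32_t)cy;
/		}
/
/		/* double, then add the diagonal squares a[i]**2 */
/		cy = 0;
/		for (i = 0; i < alen; ++i) {
/			uint64_t sq = (uint64_t)a[i] * a[i];
/
/			p = ((uint64_t)r[2 * i] << 1) + (uint32_t)sq + cy;
/			r[2 * i] = (uint32_t)p;
/			cy = p >> 32;
/			p = ((uint64_t)r[2 * i + 1] << 1) + (sq >> 32) + cy;
/			r[2 * i + 1] = (uint32_t)p;
/			cy = p >> 32;
/		}
/	}
/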
.L31:
.L32:

/ compute low-order corner
/ p = a[0]**2
/ r[0] = lo32(p)
/ cy = hi32(p)
/
/ p = 2 * r[1]
/ t = p + cy
/ r[1] = lo32(t)
/ cy = hi32(t)
.L34:
/ double the last two digits of the inner diagonal and fold in the carry:
/ r[$-3..$-2] = 2 * r[$-3..$-2] + cy
movd 8(%edx), %mm3 / mm3 = r[2*i]
psllq $1, %mm3 / mm3 = 2*r[2*i]
paddq %mm3, %mm2 / mm2 = 2*r[2*i] + cy
movd %mm2, 8(%edx) / r[2*i] = lo32(2*r[2*i] + cy)
psrlq $32, %mm2 / mm2 = cy = hi32(2*r[2*i] + cy)
movd 12(%edx), %mm3 / mm3 = r[2*i+1]
psllq $1, %mm3 / mm3 = 2*r[2*i+1]
paddq %mm3, %mm2 / mm2 = 2*r[2*i+1] + cy
movd %mm2, 12(%edx) / r[2*i+1] = mm2
psrlq $32, %mm2 / mm2 = cy
/ compute high-order corner and add it in
/ p = a[alen - 1]**2
/ t = p + cy
/ r[alen + alen - 2] += lo32(t)
/ cy = hi32(t)
/ r[alen + alen - 1] = cy
movd 4(%ebx), %mm0 / mm0 = a[$-1]
movd 8(%edx), %mm3 / mm3 = r[$-2]
pmuludq %mm0, %mm0 / mm0 = p = a[$-1]**2
paddq %mm0, %mm2 / mm2 = t = p + cy
paddq %mm3, %mm2 / mm2 = r[$-2] + t
movd %mm2, 8(%edx) / r[$-2] = lo32(r[$-2] + t)
psrlq $32, %mm2 / mm2 = cy = hi32(r[$-2] + t)
movd 12(%edx), %mm3
paddq %mm3, %mm2
movd %mm2, 12(%edx) / r[$-1] += cy
.L35:
emms
popl %esi
popl %edi
popl %ebx
#if defined(MMX_MANAGE)
ret
SET_SIZE(big_sqr_vec_sse2_fc)
#else
leave
ret
SET_SIZE(big_sqr_vec_sse2)
#endif
#if defined(MMX_MANAGE)
ENTRY(big_sqr_vec_sse2)
pushl %ebp
movl %esp, %ebp
KPREEMPT_DISABLE
TEST_TS(%ebx)
pushl %ebx
jnz .sqr_no_save
pushl %edi
SAVE_MMX_0TO4(%edi)
call big_sqr_vec_sse2_fc
RSTOR_MMX_0TO4(%edi)
popl %edi
jmp .sqr_rtn
.sqr_no_save:
call big_sqr_vec_sse2_fc
.sqr_rtn:
popl %ebx
movl %ebx, %cr0
KPREEMPT_ENABLE
leave
ret
SET_SIZE(big_sqr_vec_sse2)
#endif /* MMX_MANAGE */
/ ------------------------------------------------------------------------
/ UMUL Implementations
/ ------------------------------------------------------------------------
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ uint32_t
/ big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	 8(%ebp)	%edi
/ a	12(%ebp)	%esi
/ len	16(%ebp)	%ecx
/ digit	20(%ebp)
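/
/ For reference, the loop body below is the C statement (illustrative
/ sketch):
/
/	p = (uint64_t)a[i] * digit + cy;	/ mull 20(%ebp); addl %ebx, %eax; adcl $0, %edx
/	r[i] = (uint32_t)p;			/ movl %eax, (%edi)
/	cy = (uint32_t)(p >> 32);		/ movl %edx, %ebx
/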
ENTRY(big_mul_set_vec_umul)
pushl %ebp
movl %esp, %ebp
pushl %esi
pushl %edi
pushl %ebx
movl 16(%ebp), %ecx
xorl %ebx, %ebx / cy = 0
testl %ecx, %ecx
movl 8(%ebp), %edi
movl 12(%ebp), %esi
je .L57
.L55:
movl (%esi), %eax / eax = a[i]
leal 4(%esi), %esi / ++a
mull 20(%ebp) / edx:eax = a[i] * digit
addl %ebx, %eax
adcl $0, %edx / edx:eax = a[i] * digit + cy
movl %eax, (%edi) / r[i] = product[31..0]
movl %edx, %ebx / cy = product[63..32]
leal 4(%edi), %edi / ++r
decl %ecx / --len
jnz .L55 / while (len != 0)
.L57:
movl %ebx, %eax
popl %ebx
popl %edi
popl %esi
leave
ret
SET_SIZE(big_mul_set_vec_umul)
/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ uint32_t
/ big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	 8(%ebp)	%edi
/ a	12(%ebp)	%esi
/ len	16(%ebp)	%ecx
/ digit	20(%ebp)
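/
/ For reference, the loop body below is the C statement (illustrative
/ sketch); the two adcl $0, %edx instructions fold the carries from
/ adding r[i] and cy into the high half of the 64-bit product:
/
/	p = (uint64_t)a[i] * digit + r[i] + cy;
/	r[i] = (uint32_t)p;
/	cy = (uint32_t)(p >> 32);
/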
ENTRY(big_mul_add_vec_umul)
pushl %ebp
movl %esp, %ebp
pushl %esi
pushl %edi
pushl %ebx
movl 16(%ebp), %ecx
xorl %ebx, %ebx / cy = 0
testl %ecx, %ecx
movl 8(%ebp), %edi
movl 12(%ebp), %esi
je .L67
.align 4
.L65:
movl (%esi), %eax / eax = a[i]
leal 4(%esi), %esi / ++a
mull 20(%ebp) / edx:eax = a[i] * digit
addl (%edi), %eax
adcl $0, %edx / edx:eax = a[i] * digit + r[i]
addl %ebx, %eax
adcl $0, %edx / edx:eax = a[i] * digit + r[i] + cy
movl %eax, (%edi) / r[i] = product[31..0]
movl %edx, %ebx / cy = product[63..32]
leal 4(%edi), %edi / ++r
decl %ecx / --len
jnz .L65 / while (len != 0)
.L67:
movl %ebx, %eax
popl %ebx
popl %edi
popl %esi
leave
ret
SET_SIZE(big_mul_add_vec_umul)
#endif /* __lint */