bignum_i386_asm.s revision 7417cfdecea1902cef03c0d61a72df97d945925d
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/asm_linkage.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#if defined(__lint)
#include <sys/types.h>
uint32_t
bignum_use_sse2()
{ return (0); }
/* Not to be called by C code */
/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_r()
{ return (0); }
/* Not to be called by C code */
/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_r()
{ return (0); }
/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }
/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }
/* ARGSUSED */
void
big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
{}
/* ARGSUSED */
void
big_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)
{}
#if defined(MMX_MANAGE)
/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }
/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }
/* Not to be called by C code */
/* ARGSUSED */
void
big_sqr_vec_sse2_fc(uint32_t *r, uint32_t *a, int len)
{}
#endif /* MMX_MANAGE */
/*
* UMUL
*
*/
/* ARGSUSED */
uint32_t
big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }
/* ARGSUSED */
uint32_t
big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }
#else /* __lint */
#if defined(MMX_MANAGE)
#if defined(_KERNEL)
#define KPREEMPT_DISABLE call kpr_disable
#define KPREEMPT_ENABLE call kpr_enable
#define TEST_TS(reg) \
movl %cr0, reg; \
clts; \
testl $CR0_TS, reg
#else /* _KERNEL */
#define KPREEMPT_DISABLE
#define KPREEMPT_ENABLE
#define TEST_TS(reg) \
movl $0, reg; \
testl $CR0_TS, reg
#endif /* _KERNEL */
#define MMX_SIZE 8
#define MMX_ALIGN 8
#define SAVE_MMX_PROLOG(sreg, nreg) \
subl $_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp; \
movl %esp, sreg; \
addl $MMX_ALIGN, sreg; \
andl $-1![MMX_ALIGN-1], sreg;
#define RSTOR_MMX_EPILOG(nreg) \
addl $_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;
#define SAVE_MMX_0TO4(sreg) \
SAVE_MMX_PROLOG(sreg, 5); \
movq %mm0, 0(sreg); \
movq %mm1, 8(sreg); \
movq %mm2, 16(sreg); \
movq %mm3, 24(sreg); \
movq %mm4, 32(sreg)
#define RSTOR_MMX_0TO4(sreg) \
movq 0(sreg), %mm0; \
movq 8(sreg), %mm1; \
movq 16(sreg), %mm2; \
movq 24(sreg), %mm3; \
movq 32(sreg), %mm4; \
RSTOR_MMX_EPILOG(5)
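/ SAVE_MMX_PROLOG reserves nreg 8-byte slots of scratch space on the
/ stack and leaves sreg pointing at the first slot, rounded to an
/ MMX_ALIGN-byte boundary.  SAVE_MMX_0TO4/RSTOR_MMX_0TO4 use that area
/ to preserve %mm0-%mm4 around code that clobbers them, and
/ RSTOR_MMX_EPILOG gives the stack space back.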
#endif /* MMX_MANAGE */
/ Note: this file contains implementations for
/ big_mul_set_vec()
/ big_mul_add_vec()
/ big_mul_vec()
/ big_sqr_vec()
/ One set of implementations is for SSE2-capable models.
/ The other uses no MMX, SSE, or SSE2 instructions, only
/ the x86 32 X 32 -> 64 unsigned multiply instruction, MUL.
/
/ The code for the implementations is grouped by SSE2 vs UMUL,
/ rather than grouping pairs of implementations for each function.
/ This is because the bignum code gets "imprinted" on the correct
/ implementation at the time of first use, so none of the code for
/ the other implementation is ever executed. So, it is a no-brainer
/ to lay out the code to minimize the "footprint" of executed code.
/ Can we use SSE2 instructions? Return value is non-zero
/ if we can.
/
/ Note:
/ Using the cpuid instruction directly would work equally
/ well in userland and in the kernel, but we do not use
/ cpuid in the kernel; we use x86_featureset instead.
/ This means we honor any decisions the kernel startup
/ code may have made in setting this variable, including
/ disabling SSE2. It might even be a good idea to honor
/ this kind of setting in userland as well, but the
/ x86_featureset variable is not readily available to
/ userland processes.
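/
/ Roughly equivalent C, as a sketch only (TEST_BIT() and cpuid_leaf1_edx()
/ are stand-ins for the bt and cpuid instructions used below, not real
/ interfaces):
/
/	uint32_t
/	bignum_use_sse2(void)
/	{
/	#if defined(_KERNEL)
/		/* honor the feature set computed by kernel startup code */
/		return (TEST_BIT(x86_featureset, X86FSET_SSE2));
/	#else
/		/* CPUID leaf 1: SSE2 support is reported in %edx */
/		return (cpuid_leaf1_edx() & CPUID_INTC_EDX_SSE2);
/	#endif
/	}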
/
/ uint32_t
/ bignum_use_sse2()
ENTRY(bignum_use_sse2)
#if defined(_KERNEL)
xor %eax, %eax
bt $X86FSET_SSE2, x86_featureset
adc %eax, %eax
#else /* _KERNEL */
pushl %ebx
movl $1, %eax / Get feature information
cpuid
movl %edx, %eax / set return value
popl %ebx
andl $CPUID_INTC_EDX_SSE2, %eax
#endif /* _KERNEL */
ret
SET_SIZE(bignum_use_sse2)
/ ------------------------------------------------------------------------
/ SSE2 Implementations
/ ------------------------------------------------------------------------
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ uint32_t
/ big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r %edx
/ a %ebx
/ len %ecx
/ digit %mm3
/
/ Does not touch the following registers: %esi, %edi, %mm4
/
/ N.B.:
/ This is strictly for internal use.
/ The interface is very light-weight.
/ All parameters are passed in registers.
/ It does not conform to the SYSV x86 ABI.
/ So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number. It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ the final code.
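/
/ C-like sketch of what the code below computes (illustrative only;
/ the real routine takes its arguments in registers, not on the stack):
/
/	uint32_t cy = 0;
/	for (i = 0; i < len; i++) {
/		uint64_t p = (uint64_t)digit * a[i] + cy;
/		r[i] = (uint32_t)p;		/* product[31..0] */
/		cy = (uint32_t)(p >> 32);	/* product[63..32] */
/	}
/	return (cy);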
#define UNROLL 8
#define UNROLL32 32
ENTRY(big_mul_set_vec_sse2_r)
xorl %eax, %eax / if (len == 0) return (0);
testl %ecx, %ecx
jz .L17
pxor %mm0, %mm0 / cy = 0
.L15:
cmpl $UNROLL, %ecx
jl .L16
movd 0(%ebx), %mm1 / 1: mm1 = a[i]
pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
paddq %mm1, %mm0 / 1: mm0 = digit * a[i] + cy;
movd 4(%ebx), %mm1 / 2: mm1 = a[i]
movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
psrlq $32, %mm0 / 1: cy = product[63..32]
pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
paddq %mm1, %mm0 / 2: mm0 = digit * a[i] + cy;
movd 8(%ebx), %mm1 / 3: mm1 = a[i]
movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
psrlq $32, %mm0 / 2: cy = product[63..32]
pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
paddq %mm1, %mm0 / 3: mm0 = digit * a[i] + cy;
movd 12(%ebx), %mm1 / 4: mm1 = a[i]
movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
psrlq $32, %mm0 / 3: cy = product[63..32]
pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
paddq %mm1, %mm0 / 4: mm0 = digit * a[i] + cy;
movd 16(%ebx), %mm1 / 5: mm1 = a[i]
movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
psrlq $32, %mm0 / 4: cy = product[63..32]
pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
paddq %mm1, %mm0 / 5: mm0 = digit * a[i] + cy;
movd 20(%ebx), %mm1 / 6: mm1 = a[i]
movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
psrlq $32, %mm0 / 5: cy = product[63..32]
pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
paddq %mm1, %mm0 / 6: mm0 = digit * a[i] + cy;
movd 24(%ebx), %mm1 / 7: mm1 = a[i]
movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
psrlq $32, %mm0 / 6: cy = product[63..32]
pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
paddq %mm1, %mm0 / 7: mm0 = digit * a[i] + cy;
movd 28(%ebx), %mm1 / 8: mm1 = a[i]
movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
psrlq $32, %mm0 / 7: cy = product[63..32]
pmuludq %mm3, %mm1 / 8: mm1 = digit * a[i]
paddq %mm1, %mm0 / 8: mm0 = digit * a[i] + cy;
movd %mm0, 28(%edx) / 8: r[i] = product[31..0]
psrlq $32, %mm0 / 8: cy = product[63..32]
leal UNROLL32(%ebx), %ebx / a += UNROLL
leal UNROLL32(%edx), %edx / r += UNROLL
subl $UNROLL, %ecx / len -= UNROLL
jz .L17
jmp .L15
.L16:
movd 0(%ebx), %mm1 / 1: mm1 = a[i]
pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
paddq %mm1, %mm0 / 1: mm0 = digit * a[i] + cy;
movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
psrlq $32, %mm0 / 1: cy = product[63..32]
subl $1, %ecx
jz .L17
movd 4(%ebx), %mm1 / 2: mm1 = a[i]
pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
paddq %mm1, %mm0 / 2: mm0 = digit * a[i] + cy;
movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
psrlq $32, %mm0 / 2: cy = product[63..32]
subl $1, %ecx
jz .L17
movd 8(%ebx), %mm1 / 3: mm1 = a[i]
pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
paddq %mm1, %mm0 / 3: mm0 = digit * a[i] + cy;
movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
psrlq $32, %mm0 / 3: cy = product[63..32]
subl $1, %ecx
jz .L17
movd 12(%ebx), %mm1 / 4: mm1 = a[i]
pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
paddq %mm1, %mm0 / 4: mm0 = digit * a[i] + cy;
movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
psrlq $32, %mm0 / 4: cy = product[63..32]
subl $1, %ecx
jz .L17
movd 16(%ebx), %mm1 / 5: mm1 = a[i]
pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
paddq %mm1, %mm0 / 5: mm0 = digit * a[i] + cy;
movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
psrlq $32, %mm0 / 5: cy = product[63..32]
subl $1, %ecx
jz .L17
movd 20(%ebx), %mm1 / 6: mm1 = a[i]
pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
paddq %mm1, %mm0 / 6: mm0 = digit * a[i] + cy;
movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
psrlq $32, %mm0 / 6: cy = product[63..32]
subl $1, %ecx
jz .L17
movd 24(%ebx), %mm1 / 7: mm1 = a[i]
pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
paddq %mm1, %mm0 / 7: mm0 = digit * a[i] + cy;
movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
psrlq $32, %mm0 / 7: cy = product[63..32]
.L17:
movd %mm0, %eax / return (cy)
/ no emms. caller is responsible for emms
ret
SET_SIZE(big_mul_set_vec_sse2_r)
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r 8(%ebp) %edx
/ a 12(%ebp) %ebx
/ len 16(%ebp) %ecx
/ digit 20(%ebp) %mm3
/
/ In userland, there is just the one function, big_mul_set_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_set_vec_sse2(), which saves and restores MMX state
/       as necessary and ensures that preemption is disabled.
/    2. big_mul_set_vec_sse2_nsv(), which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.
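/
/ Outline of the kernel wrapper below (sketch only; kpr_disable/kpr_enable
/ and the TEST_TS/SAVE_MMX/RSTOR_MMX macros do the real work):
/
/	kpr_disable();			/* KPREEMPT_DISABLE */
/	saved_cr0 = %cr0; clts();	/* TEST_TS: clear TS so MMX won't fault */
/	if (!(saved_cr0 & CR0_TS))
/		save %mm0-%mm4;		/* live FPU/MMX state: preserve it */
/	cy = big_mul_set_vec_sse2_r(r, a, len, digit);
/	if (!(saved_cr0 & CR0_TS))
/		restore %mm0-%mm4;
/	emms();
/	%cr0 = saved_cr0;		/* put TS back the way we found it */
/	kpr_enable();			/* KPREEMPT_ENABLE */
/	return (cy);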
#if defined(MMX_MANAGE)
ENTRY(big_mul_set_vec_sse2)
pushl %ebp
movl %esp, %ebp
pushl %ebx
pushl %esi
KPREEMPT_DISABLE
TEST_TS(%ebx)
pushl %ebx
jnz .setvec_no_save
pushl %edi
SAVE_MMX_0TO4(%edi)
movl 8(%ebp), %edx
movl 12(%ebp), %ebx
movl 16(%ebp), %ecx
movd 20(%ebp), %mm3
call big_mul_set_vec_sse2_r
movl %eax, %esi
RSTOR_MMX_0TO4(%edi)
popl %edi
jmp .setvec_rtn
.setvec_no_save:
movl 8(%ebp), %edx
movl 12(%ebp), %ebx
movl 16(%ebp), %ecx
movd 20(%ebp), %mm3
call big_mul_set_vec_sse2_r
movl %eax, %esi
.setvec_rtn:
emms
popl %ebx
movl %ebx, %cr0
KPREEMPT_ENABLE
movl %esi, %eax
popl %esi
popl %ebx
leave
ret
SET_SIZE(big_mul_set_vec_sse2)
ENTRY(big_mul_set_vec_sse2_nsv)
pushl %ebp
movl %esp, %ebp
pushl %ebx
movl 8(%ebp), %edx
movl 12(%ebp), %ebx
movl 16(%ebp), %ecx
movd 20(%ebp), %mm3
call big_mul_set_vec_sse2_r
popl %ebx
leave
ret
SET_SIZE(big_mul_set_vec_sse2_nsv)
#else /* !defined(MMX_MANAGE) */
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r 8(%ebp) %edx
/ a 12(%ebp) %ebx
/ len 16(%ebp) %ecx
/ digit 20(%ebp) %mm3
ENTRY(big_mul_set_vec_sse2)
pushl %ebp
movl %esp, %ebp
pushl %ebx
movl 8(%ebp), %edx
movl 12(%ebp), %ebx
movl 16(%ebp), %ecx
movd 20(%ebp), %mm3
call big_mul_set_vec_sse2_r
popl %ebx
emms
leave
ret
SET_SIZE(big_mul_set_vec_sse2)
#endif /* MMX_MANAGE */
/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ uint32_t
/ big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r %edx
/ a %ebx
/ len %ecx
/ digit %mm3
/
/ N.B.:
/ This is strictly for internal use.
/ The interface is very light-weight.
/ All parameters are passed in registers.
/ It does not conform to the SYSV x86 ABI.
/ So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number. It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ the final code.
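/
/ C-like sketch of what the code below computes (illustrative only;
/ arguments are passed in registers, as for big_mul_set_vec_sse2_r):
/
/	uint32_t cy = 0;
/	for (i = 0; i < len; i++) {
/		uint64_t p = (uint64_t)digit * a[i] + r[i] + cy;
/		r[i] = (uint32_t)p;		/* product[31..0] */
/		cy = (uint32_t)(p >> 32);	/* product[63..32] */
/	}
/	return (cy);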
ENTRY(big_mul_add_vec_sse2_r)
xorl %eax, %eax
testl %ecx, %ecx
jz .L27
pxor %mm0, %mm0 / cy = 0
.L25:
cmpl $UNROLL, %ecx
jl .L26
movd 0(%ebx), %mm1 / 1: mm1 = a[i]
movd 0(%edx), %mm2 / 1: mm2 = r[i]
pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
paddq %mm1, %mm2 / 1: mm2 = digit * a[i] + r[i]
movd 4(%ebx), %mm1 / 2: mm1 = a[i]
paddq %mm2, %mm0 / 1: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
movd 4(%edx), %mm2 / 2: mm2 = r[i]
psrlq $32, %mm0 / 1: cy = product[63..32]
pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
paddq %mm1, %mm2 / 2: mm2 = digit * a[i] + r[i]
movd 8(%ebx), %mm1 / 3: mm1 = a[i]
paddq %mm2, %mm0 / 2: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
movd 8(%edx), %mm2 / 3: mm2 = r[i]
psrlq $32, %mm0 / 2: cy = product[63..32]
pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
paddq %mm1, %mm2 / 3: mm2 = digit * a[i] + r[i]
movd 12(%ebx), %mm1 / 4: mm1 = a[i]
paddq %mm2, %mm0 / 3: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
movd 12(%edx), %mm2 / 4: mm2 = r[i]
psrlq $32, %mm0 / 3: cy = product[63..32]
pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
paddq %mm1, %mm2 / 4: mm2 = digit * a[i] + r[i]
movd 16(%ebx), %mm1 / 5: mm1 = a[i]
paddq %mm2, %mm0 / 4: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
movd 16(%edx), %mm2 / 5: mm2 = r[i]
psrlq $32, %mm0 / 4: cy = product[63..32]
pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
paddq %mm1, %mm2 / 5: mm2 = digit * a[i] + r[i]
movd 20(%ebx), %mm1 / 6: mm1 = a[i]
paddq %mm2, %mm0 / 5: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
movd 20(%edx), %mm2 / 6: mm2 = r[i]
psrlq $32, %mm0 / 5: cy = product[63..32]
pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
paddq %mm1, %mm2 / 6: mm2 = digit * a[i] + r[i]
movd 24(%ebx), %mm1 / 7: mm1 = a[i]
paddq %mm2, %mm0 / 6: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
movd 24(%edx), %mm2 / 7: mm2 = r[i]
psrlq $32, %mm0 / 6: cy = product[63..32]
pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
paddq %mm1, %mm2 / 7: mm2 = digit * a[i] + r[i]
movd 28(%ebx), %mm1 / 8: mm1 = a[i]
paddq %mm2, %mm0 / 7: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
movd 28(%edx), %mm2 / 8: mm2 = r[i]
psrlq $32, %mm0 / 7: cy = product[63..32]
pmuludq %mm3, %mm1 / 8: mm1 = digit * a[i]
paddq %mm1, %mm2 / 8: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 8: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 28(%edx) / 8: r[i] = product[31..0]
psrlq $32, %mm0 / 8: cy = product[63..32]
leal UNROLL32(%ebx), %ebx / a += UNROLL
leal UNROLL32(%edx), %edx / r += UNROLL
subl $UNROLL, %ecx / len -= UNROLL
jz .L27
jmp .L25
.L26:
movd 0(%ebx), %mm1 / 1: mm1 = a[i]
movd 0(%edx), %mm2 / 1: mm2 = r[i]
pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
paddq %mm1, %mm2 / 1: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 1: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
psrlq $32, %mm0 / 1: cy = product[63..32]
subl $1, %ecx
jz .L27
movd 4(%ebx), %mm1 / 2: mm1 = a[i]
movd 4(%edx), %mm2 / 2: mm2 = r[i]
pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
paddq %mm1, %mm2 / 2: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 2: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
psrlq $32, %mm0 / 2: cy = product[63..32]
subl $1, %ecx
jz .L27
movd 8(%ebx), %mm1 / 3: mm1 = a[i]
movd 8(%edx), %mm2 / 3: mm2 = r[i]
pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
paddq %mm1, %mm2 / 3: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 3: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
psrlq $32, %mm0 / 3: cy = product[63..32]
subl $1, %ecx
jz .L27
movd 12(%ebx), %mm1 / 4: mm1 = a[i]
movd 12(%edx), %mm2 / 4: mm2 = r[i]
pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
paddq %mm1, %mm2 / 4: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 4: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
psrlq $32, %mm0 / 4: cy = product[63..32]
subl $1, %ecx
jz .L27
movd 16(%ebx), %mm1 / 5: mm1 = a[i]
movd 16(%edx), %mm2 / 5: mm2 = r[i]
pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
paddq %mm1, %mm2 / 5: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 5: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
psrlq $32, %mm0 / 5: cy = product[63..32]
subl $1, %ecx
jz .L27
movd 20(%ebx), %mm1 / 6: mm1 = a[i]
movd 20(%edx), %mm2 / 6: mm2 = r[i]
pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
paddq %mm1, %mm2 / 6: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 6: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
psrlq $32, %mm0 / 6: cy = product[63..32]
subl $1, %ecx
jz .L27
movd 24(%ebx), %mm1 / 7: mm1 = a[i]
movd 24(%edx), %mm2 / 7: mm2 = r[i]
pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
paddq %mm1, %mm2 / 7: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 7: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
psrlq $32, %mm0 / 7: cy = product[63..32]
.L27:
movd %mm0, %eax
/ no emms. caller is responsible for emms
ret
SET_SIZE(big_mul_add_vec_sse2_r)
/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r 8(%ebp) %edx
/ a 12(%ebp) %ebx
/ len 16(%ebp) %ecx
/ digit 20(%ebp) %mm3
/
/ In userland, there is just the one function, big_mul_add_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_add_vec_sse2(), which saves and restores MMX state
/       as necessary and ensures that preemption is disabled.
/    2. big_mul_add_vec_sse2_nsv(), which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.
#if defined(MMX_MANAGE)
ENTRY(big_mul_add_vec_sse2)
pushl %ebp
movl %esp, %ebp
pushl %ebx
pushl %esi
KPREEMPT_DISABLE
TEST_TS(%ebx)
pushl %ebx
jnz .addvec_no_save
pushl %edi
SAVE_MMX_0TO4(%edi)
movl 8(%ebp), %edx
movl 12(%ebp), %ebx
movl 16(%ebp), %ecx
movd 20(%ebp), %mm3
call big_mul_add_vec_sse2_r
movl %eax, %esi
RSTOR_MMX_0TO4(%edi)
popl %edi
jmp .addvec_rtn
.addvec_no_save:
movl 8(%ebp), %edx
movl 12(%ebp), %ebx
movl 16(%ebp), %ecx
movd 20(%ebp), %mm3
call big_mul_add_vec_sse2_r
movl %eax, %esi
.addvec_rtn:
emms
popl %ebx
movl %ebx, %cr0
KPREEMPT_ENABLE
movl %esi, %eax
popl %esi
popl %ebx
leave
ret
SET_SIZE(big_mul_add_vec_sse2)
ENTRY(big_mul_add_vec_sse2_nsv)
pushl %ebp
movl %esp, %ebp
pushl %ebx
movl 8(%ebp), %edx
movl 12(%ebp), %ebx
movl 16(%ebp), %ecx
movd 20(%ebp), %mm3
call big_mul_add_vec_sse2_r
popl %ebx
leave
ret
SET_SIZE(big_mul_add_vec_sse2_nsv)
#else /* !defined(MMX_MANAGE) */
ENTRY(big_mul_add_vec_sse2)
pushl %ebp
movl %esp, %ebp
pushl %ebx
movl 8(%ebp), %edx
movl 12(%ebp), %ebx
movl 16(%ebp), %ecx
movd 20(%ebp), %mm3
call big_mul_add_vec_sse2_r
popl %ebx
emms
leave
ret
SET_SIZE(big_mul_add_vec_sse2)
#endif /* MMX_MANAGE */
/ void
/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/ {
/ int i;
/
/ r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
/ for (i = 1; i < blen; ++i)
/ r[alen + i] = big_mul_add_vec_sse2(r+i, a, alen, b[i]);
/ }
#if defined(MMX_MANAGE)
ENTRY(big_mul_vec_sse2_fc)
#else
ENTRY(big_mul_vec_sse2)
#endif
subl $0x8, %esp
pushl %ebx
pushl %ebp
pushl %esi
pushl %edi
movl 40(%esp), %eax
movl %eax, 20(%esp)
pushl (%eax)
movl 40(%esp), %edi
pushl %edi
movl 40(%esp), %esi
pushl %esi
movl 40(%esp), %ebx
pushl %ebx
#if defined(MMX_MANAGE)
call big_mul_set_vec_sse2_nsv
#else
call big_mul_set_vec_sse2
#endif
addl $0x10, %esp
movl %eax, (%ebx,%edi,4)
movl 44(%esp), %eax
movl %eax, 16(%esp)
cmpl $0x1, %eax
jle .mulvec_rtn
movl $0x1, %ebp
.align 16
.mulvec_add:
movl 20(%esp), %eax
pushl (%eax,%ebp,4)
pushl %edi
pushl %esi
leal (%ebx,%ebp,4), %eax
pushl %eax
#if defined(MMX_MANAGE)
call big_mul_add_vec_sse2_nsv
#else
call big_mul_add_vec_sse2
#endif
addl $0x10, %esp
leal (%ebp,%edi), %ecx
movl %eax, (%ebx,%ecx,4)
incl %ebp
cmpl 16(%esp), %ebp
jl .mulvec_add
.mulvec_rtn:
#if defined(MMX_MANAGE)
emms
#endif
popl %edi
popl %esi
popl %ebp
popl %ebx
addl $0x8, %esp
ret
#if defined(MMX_MANAGE)
SET_SIZE(big_mul_vec_sse2_fc)
#else
SET_SIZE(big_mul_vec_sse2)
#endif
#if defined(MMX_MANAGE)
ENTRY(big_mul_vec_sse2)
pushl %ebp
movl %esp, %ebp
subl $8, %esp
pushl %edi
KPREEMPT_DISABLE
TEST_TS(%eax)
movl %eax, -8(%ebp)
jnz .mulvec_no_save
SAVE_MMX_0TO4(%edi)
movl %edi, -4(%ebp)
.mulvec_no_save:
movl 24(%ebp), %eax / blen
pushl %eax
movl 20(%ebp), %eax / b
pushl %eax
movl 16(%ebp), %eax / alen
pushl %eax
movl 12(%ebp), %eax / a
pushl %eax
movl 8(%ebp), %eax / r
pushl %eax
call big_mul_vec_sse2_fc
addl $20, %esp
movl -8(%ebp), %eax
testl $CR0_TS, %eax
jnz .mulvec_no_rstr
movl -4(%ebp), %edi
RSTOR_MMX_0TO4(%edi)
.mulvec_no_rstr:
movl %eax, %cr0
KPREEMPT_ENABLE
popl %edi
leave
ret
SET_SIZE(big_mul_vec_sse2)
#endif /* MMX_MANAGE */
#undef UNROLL
#undef UNROLL32
/ r = a * a, where a is a vector of length len and
/ r is a vector of length 2 * len
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ This function is not suitable for a truly general-purpose multiprecision
/ arithmetic library, because it does not work for "small" numbers, that is,
/ numbers of 1 or 2 digits. big_mul() just uses the ordinary big_mul_vec()
/ for any small numbers.
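/
/ The routine below exploits the identity (B == 2**32, a has digits
/ a[0..len-1]):
/
/	a**2 == SUM(i, a[i]**2 * B**(2*i))
/	      + 2 * SUM(i < j, a[i] * a[j] * B**(i+j))
/
/ First the "triangle" of cross products a[i] * a[j], i < j, is
/ accumulated into r; then that triangle is doubled in place and the
/ squared diagonal terms a[i]**2 are added in, propagating carries.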
#if defined(MMX_MANAGE)
ENTRY(big_sqr_vec_sse2_fc)
#else
ENTRY(big_sqr_vec_sse2)
pushl %ebp
movl %esp, %ebp
#endif
pushl %ebx
pushl %edi
pushl %esi
/ r[1..alen] = a[0] * a[1..alen-1]
movl 8(%ebp), %edi / r = arg(r)
movl 12(%ebp), %esi / a = arg(a)
movl 16(%ebp), %ecx / cnt = arg(alen)
movd %ecx, %mm4 / save_cnt = arg(alen)
leal 4(%edi), %edx / dst = &r[1]
movl %esi, %ebx / src = a
movd 0(%ebx), %mm3 / mm3 = a[0]
leal 4(%ebx), %ebx / src = &a[1]
subl $1, %ecx / --cnt
call big_mul_set_vec_sse2_r / r[1..alen-1] = a[0] * a[1..alen-1]
movl %edi, %edx / dst = r
movl %esi, %ebx / src = a
movd %mm4, %ecx / cnt = save_cnt
movl %eax, (%edx, %ecx, 4) / r[cnt] = cy
/ /* High-level vector C pseudocode */
/ for (i = 1; i < alen-1; ++i)
/ r[2*i + 1 ... ] += a[i] * a[i+1 .. alen-1]
/
/ /* Same thing, but slightly lower level C-like pseudocode */
/ i = 1;
/ r = &arg_r[2*i + 1];
/ a = &arg_a[i + 1];
/ digit = arg_a[i];
/ cnt = alen - 3;
/ while (cnt != 0) {
/ r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, digit);
/ r += 2;
/ ++a;
/ --cnt;
/ }
/
/ /* Same thing, but even lower level
/ * For example, pointers are raw pointers,
/ * with no scaling by object size.
/ */
/ r = arg_r + 12; /* i == 1; 2i + 1 == 3; 4*3 == 12; */
/ a = arg_a + 8;
/ digit = *(arg_a + 4);
/ cnt = alen - 3;
/ while (cnt != 0) {
/ cy = big_mul_add_vec_sse2_r();
/ *(r + 4 * cnt) = cy;
/ r += 8;
/ a += 4;
/ --cnt;
/ }
leal 4(%edi), %edi / r += 4; r = &r[1]
leal 4(%esi), %esi / a += 4; a = &a[1]
movd %mm4, %ecx / cnt = save_cnt
subl $2, %ecx / cnt = alen - 2; i in 1..alen-2
movd %ecx, %mm4 / save_cnt
jecxz .L32 / while (cnt != 0) {
.L31:
movd 0(%esi), %mm3 / digit = a[i]
leal 4(%esi), %esi / a += 4; a = &a[1]; a = &a[i + 1]
leal 8(%edi), %edi / r += 8; r = &r[2]; r = &r[2 * i + 1]
movl %edi, %edx / edx = r
movl %esi, %ebx / ebx = a
cmp $1, %ecx / The last triangle term is special
jz .L32
call big_mul_add_vec_sse2_r
movd %mm4, %ecx / cnt = save_cnt
movl %eax, (%edi, %ecx, 4) / r[cnt] = cy
subl $1, %ecx / --cnt
movd %ecx, %mm4 / save_cnt = cnt
jmp .L31 / }
.L32:
movd 0(%ebx), %mm1 / mm1 = a[i + 1]
movd 0(%edx), %mm2 / mm2 = r[2 * i + 1]
pmuludq %mm3, %mm1 / mm1 = p = digit * a[i + 1]
paddq %mm1, %mm2 / mm2 = r[2 * i + 1] + p
movd %mm2, 0(%edx) / r[2 * i + 1] += lo32(p)
psrlq $32, %mm2 / mm2 = cy
movd %mm2, 4(%edx) / r[2 * i + 2] = cy
pxor %mm2, %mm2
movd %mm2, 8(%edx) / r[2 * i + 3] = 0
movl 8(%ebp), %edx / r = arg(r)
movl 12(%ebp), %ebx / a = arg(a)
movl 16(%ebp), %ecx / cnt = arg(alen)
/ compute low-order corner
/ p = a[0]**2
/ r[0] = lo32(p)
/ cy = hi32(p)
movd 0(%ebx), %mm2 / mm2 = a[0]
pmuludq %mm2, %mm2 / mm2 = p = a[0]**2
movd %mm2, 0(%edx) / r[0] = lo32(p)
psrlq $32, %mm2 / mm2 = cy = hi32(p)
/ p = 2 * r[1]
/ t = p + cy
/ r[1] = lo32(t)
/ cy = hi32(t)
movd 4(%edx), %mm1 / mm1 = r[1]
psllq $1, %mm1 / mm1 = p = 2 * r[1]
paddq %mm1, %mm2 / mm2 = t = p + cy
movd %mm2, 4(%edx) / r[1] = low32(t)
psrlq $32, %mm2 / mm2 = cy = hi32(t)
/ r[2..$-3] = inner_diagonal[*]**2 + 2 * r[2..$-3]
subl $2, %ecx / cnt = alen - 2
.L34:
movd 4(%ebx), %mm0 / mm0 = diag = a[i+1]
pmuludq %mm0, %mm0 / mm0 = p = diag**2
paddq %mm0, %mm2 / mm2 = t = p + cy
movd %mm2, %eax
movd %eax, %mm1 / mm1 = lo32(t)
psrlq $32, %mm2 / mm2 = hi32(t)
movd 8(%edx), %mm3 / mm3 = r[2*i]
psllq $1, %mm3 / mm3 = 2*r[2*i]
paddq %mm3, %mm1 / mm1 = 2*r[2*i] + lo32(t)
movd %mm1, 8(%edx) / r[2*i] = 2*r[2*i] + lo32(t)
psrlq $32, %mm1
paddq %mm1, %mm2
movd 12(%edx), %mm3 / mm3 = r[2*i+1]
psllq $1, %mm3 / mm3 = 2*r[2*i+1]
paddq %mm3, %mm2 / mm2 = 2*r[2*i+1] + hi32(t)
movd %mm2, 12(%edx) / r[2*i+1] = mm2
psrlq $32, %mm2 / mm2 = cy
leal 8(%edx), %edx / r += 2
leal 4(%ebx), %ebx / ++a
subl $1, %ecx / --cnt
jnz .L34
/ Carry from the last triangle term must participate in the doubling,
/ but this step is not paired up with squaring an element
/ of the inner diagonal.
/ r[$-3..$-2] = 2 * r[$-3..$-2] + cy
movd 8(%edx), %mm3 / mm3 = r[2*i]
psllq $1, %mm3 / mm3 = 2*r[2*i]
paddq %mm3, %mm2 / mm2 = 2*r[2*i] + cy
movd %mm2, 8(%edx) / r[2*i] = lo32(2*r[2*i] + cy)
psrlq $32, %mm2 / mm2 = cy = hi32(2*r[2*i] + cy)
movd 12(%edx), %mm3 / mm3 = r[2*i+1]
psllq $1, %mm3 / mm3 = 2*r[2*i+1]
paddq %mm3, %mm2 / mm2 = 2*r[2*i+1] + cy
movd %mm2, 12(%edx) / r[2*i+1] = mm2
psrlq $32, %mm2 / mm2 = cy
/ compute high-order corner and add it in
/ p = a[alen - 1]**2
/ t = p + cy
/ r[alen + alen - 2] += lo32(t)
/ cy = hi32(t)
/ r[alen + alen - 1] = cy
movd 4(%ebx), %mm0 / mm0 = a[$-1]
movd 8(%edx), %mm3 / mm3 = r[$-2]
pmuludq %mm0, %mm0 / mm0 = p = a[$-1]**2
paddq %mm0, %mm2 / mm2 = t = p + cy
paddq %mm3, %mm2 / mm2 = r[$-2] + t
movd %mm2, 8(%edx) / r[$-2] = lo32(r[$-2] + t)
psrlq $32, %mm2 / mm2 = cy = hi32(r[$-2] + t)
movd 12(%edx), %mm3
paddq %mm3, %mm2
movd %mm2, 12(%edx) / r[$-1] += cy
.L35:
emms
popl %esi
popl %edi
popl %ebx
#if defined(MMX_MANAGE)
ret
SET_SIZE(big_sqr_vec_sse2_fc)
#else
leave
ret
SET_SIZE(big_sqr_vec_sse2)
#endif
#if defined(MMX_MANAGE)
ENTRY(big_sqr_vec_sse2)
pushl %ebp
movl %esp, %ebp
KPREEMPT_DISABLE
TEST_TS(%ebx)
pushl %ebx
jnz .sqr_no_save
pushl %edi
SAVE_MMX_0TO4(%edi)
call big_sqr_vec_sse2_fc
RSTOR_MMX_0TO4(%edi)
popl %edi
jmp .sqr_rtn
.sqr_no_save:
call big_sqr_vec_sse2_fc
.sqr_rtn:
popl %ebx
movl %ebx, %cr0
KPREEMPT_ENABLE
leave
ret
SET_SIZE(big_sqr_vec_sse2)
#endif /* MMX_MANAGE */
/ ------------------------------------------------------------------------
/ UMUL Implementations
/ ------------------------------------------------------------------------
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ uint32_t
/ big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r 8(%ebp) %edi
/ a 12(%ebp) %esi
/ len 16(%ebp) %ecx
/ digit 20(%ebp) (used in place as the mull memory operand)
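/
/ C-like sketch of the loop below (illustrative only); the add variant
/ further down differs only in adding r[i] into the product as well:
/
/	uint32_t cy = 0;
/	for (i = 0; i < len; i++) {
/		uint64_t p = (uint64_t)a[i] * digit + cy;	/* mull, then carry */
/		r[i] = (uint32_t)p;
/		cy = (uint32_t)(p >> 32);
/	}
/	return (cy);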
ENTRY(big_mul_set_vec_umul)
pushl %ebp
movl %esp, %ebp
pushl %esi
pushl %edi
pushl %ebx
movl 16(%ebp), %ecx
xorl %ebx, %ebx / cy = 0
testl %ecx, %ecx
movl 8(%ebp), %edi
movl 12(%ebp), %esi
je .L57
.L55:
movl (%esi), %eax / eax = a[i]
leal 4(%esi), %esi / ++a
mull 20(%ebp) / edx:eax = a[i] * digit
addl %ebx, %eax
adcl $0, %edx / edx:eax = a[i] * digit + cy
movl %eax, (%edi) / r[i] = product[31..0]
movl %edx, %ebx / cy = product[63..32]
leal 4(%edi), %edi / ++r
decl %ecx / --len
jnz .L55 / while (len != 0)
.L57:
movl %ebx, %eax
popl %ebx
popl %edi
popl %esi
leave
ret
SET_SIZE(big_mul_set_vec_umul)
/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ uint32_t
/ big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r 8(%ebp) %edi
/ a 12(%ebp) %esi
/ len 16(%ebp) %ecx
/ digit 20(%ebp) (used in place as the mull memory operand)
ENTRY(big_mul_add_vec_umul)
pushl %ebp
movl %esp, %ebp
pushl %esi
pushl %edi
pushl %ebx
movl 16(%ebp), %ecx
xorl %ebx, %ebx / cy = 0
testl %ecx, %ecx
movl 8(%ebp), %edi
movl 12(%ebp), %esi
je .L67
.align 4
.L65:
movl (%esi), %eax / eax = a[i]
leal 4(%esi), %esi / ++a
mull 20(%ebp) / edx:eax = a[i] * digit
addl (%edi), %eax
adcl $0, %edx / edx:eax = a[i] * digit + r[i]
addl %ebx, %eax
adcl $0, %edx / edx:eax = a[i] * digit + r[i] + cy
movl %eax, (%edi) / r[i] = product[31..0]
movl %edx, %ebx / cy = product[63..32]
leal 4(%edi), %edi / ++r
decl %ecx / --len
jnz .L65 / while (len != 0)
.L67:
movl %ebx, %eax
popl %ebx
popl %edi
popl %esi
leave
ret
SET_SIZE(big_mul_add_vec_umul)
#endif /* __lint */