bignum_i386_asm.s revision 7417cfdecea1902cef03c0d61a72df97d945925d
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/asm_linkage.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#if defined(__lint)
#include <sys/types.h>
uint32_t
bignum_use_sse2()
{ return (0); }
/* Not to be called by C code */
/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_r()
{ return (0); }
/* Not to be called by C code */
/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_r()
{ return (0); }
/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }
/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }
/* ARGSUSED */
void
big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
{}
/* ARGSUSED */
void
big_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)
{}
#if defined(MMX_MANAGE)
/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }
/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }
/* Not to be called by C code */
/* ARGSUSED */
void
big_sqr_vec_sse2_fc(uint32_t *r, uint32_t *a, int len)
{}
#endif /* MMX_MANAGE */
/*
* UMUL
*
*/
/* ARGSUSED */
uint32_t
big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }
/* ARGSUSED */
uint32_t
big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }
#else /* __lint */
#if defined(MMX_MANAGE)
#if defined(_KERNEL)
#define KPREEMPT_DISABLE call kpr_disable
#define KPREEMPT_ENABLE call kpr_enable
#define TEST_TS(reg) \
movl %cr0, reg; \
clts; \
testl $CR0_TS, reg
#else /* _KERNEL */
#define KPREEMPT_DISABLE
#define KPREEMPT_ENABLE
#define TEST_TS(reg) \
movl $0, reg; \
testl $CR0_TS, reg
#endif /* _KERNEL */
#define MMX_SIZE 8
#define MMX_ALIGN 8
#define SAVE_MMX_PROLOG(sreg, nreg) \
subl $_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp; \
movl %esp, sreg; \
addl $MMX_ALIGN, sreg; \
andl $-1![MMX_ALIGN-1], sreg;
#define RSTOR_MMX_EPILOG(nreg) \
addl $_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;
#define SAVE_MMX_0TO4(sreg) \
SAVE_MMX_PROLOG(sreg, 5); \
movq %mm0, 0(sreg); \
movq %mm1, 8(sreg); \
movq %mm2, 16(sreg); \
movq %mm3, 24(sreg); \
movq %mm4, 32(sreg)
#define RSTOR_MMX_0TO4(sreg) \
movq 0(sreg), %mm0; \
movq 8(sreg), %mm1; \
movq 16(sreg), %mm2; \
movq 24(sreg), %mm3; \
movq 32(sreg), %mm4; \
RSTOR_MMX_EPILOG(5)
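/ SAVE_MMX_PROLOG reserves nreg 8-byte slots of scratch space on the
/ stack and leaves sreg pointing at the first slot, rounded to an
/ MMX_ALIGN-byte boundary.  SAVE_MMX_0TO4/RSTOR_MMX_0TO4 use that area
/ to preserve %mm0-%mm4 around code that clobbers them, and
/ RSTOR_MMX_EPILOG gives the stack space back.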
#endif /* MMX_MANAGE */
/ Note: this file contains implementations for
/ big_mul_set_vec()
/ big_mul_add_vec()
/ big_mul_vec()
/ big_sqr_vec()
/ One set of implementations is for SSE2-capable models.
/ The other uses no MMX, SSE, or SSE2 instructions, only
/ the x86 32 X 32 -> 64 unsigned multiply instruction, MUL.
/
/ The code for the implementations is grouped by SSE2 vs UMUL,
/ rather than grouping pairs of implementations for each function.
/ This is because the bignum code gets "imprinted" on the correct
/ implementation at the time of first use, so none of the code for
/ the other implementation is ever executed. So, it is a no-brainer
/ to lay out the code to minimize the "footprint" of executed code.
/ Can we use SSE2 instructions? Return value is non-zero
/ if we can.
/
/ Note:
/ Using the cpuid instruction directly would work equally
/ well in userland and in the kernel, but we do not use
/ cpuid in the kernel; we use x86_featureset instead.
/ This means we honor any decisions the kernel startup
/ code may have made in setting this variable, including
/ disabling SSE2. It might even be a good idea to honor
/ this kind of setting in userland as well, but the
/ x86_featureset variable is not readily available to
/ userland processes.
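/
/ Roughly equivalent C, as a sketch only (TEST_BIT() and cpuid_leaf1_edx()
/ are stand-ins for the bt and cpuid instructions used below, not real
/ interfaces):
/
/	uint32_t
/	bignum_use_sse2(void)
/	{
/	#if defined(_KERNEL)
/		/* honor the feature set computed by kernel startup code */
/		return (TEST_BIT(x86_featureset, X86FSET_SSE2));
/	#else
/		/* CPUID leaf 1: SSE2 support is reported in %edx */
/		return (cpuid_leaf1_edx() & CPUID_INTC_EDX_SSE2);
/	#endif
/	}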
/
/ uint32_t
/ bignum_use_sse2()
ENTRY(bignum_use_sse2)
#if defined(_KERNEL)
xor %eax, %eax
bt $X86FSET_SSE2, x86_featureset
adc %eax, %eax
#else /* _KERNEL */
pushl %ebx
movl $1, %eax / Get feature information
cpuid
movl %edx, %eax / set return value
popl %ebx
andl $CPUID_INTC_EDX_SSE2, %eax
#endif /* _KERNEL */
ret
SET_SIZE(bignum_use_sse2)
/ ------------------------------------------------------------------------
/ SSE2 Implementations
/ ------------------------------------------------------------------------
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ uint32_t
/ big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r %edx
/ a %ebx
/ len %ecx
/ digit %mm3
/
/ Does not touch the following registers: %esi, %edi, %mm4
/
/ N.B.:
/ This is strictly for internal use.
/ The interface is very light-weight.
/ All parameters are passed in registers.
/ It does not conform to the SYSV x86 ABI.
/ So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number. It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ the final code.
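/
/ C-like sketch of what the code below computes (illustrative only;
/ the real routine takes its arguments in registers, not on the stack):
/
/	uint32_t cy = 0;
/	for (i = 0; i < len; i++) {
/		uint64_t p = (uint64_t)digit * a[i] + cy;
/		r[i] = (uint32_t)p;		/* product[31..0] */
/		cy = (uint32_t)(p >> 32);	/* product[63..32] */
/	}
/	return (cy);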
#define UNROLL 8
#define UNROLL32 32
ENTRY(big_mul_set_vec_sse2_r)
xorl %eax, %eax / if (len == 0) return (0);
testl %ecx, %ecx
jz .L17
pxor %mm0, %mm0 / cy = 0
.L15:
cmpl $UNROLL, %ecx
jl .L16
movd 0(%ebx), %mm1 / 1: mm1 = a[i]
pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
paddq %mm1, %mm0 / 1: mm0 = digit * a[i] + cy;
movd 4(%ebx), %mm1 / 2: mm1 = a[i]
movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
psrlq $32, %mm0 / 1: cy = product[63..32]
pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
paddq %mm1, %mm0 / 2: mm0 = digit * a[i] + cy;
movd 8(%ebx), %mm1 / 3: mm1 = a[i]
movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
psrlq $32, %mm0 / 2: cy = product[63..32]
pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
paddq %mm1, %mm0 / 3: mm0 = digit * a[i] + cy;
movd 12(%ebx), %mm1 / 4: mm1 = a[i]
movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
psrlq $32, %mm0 / 3: cy = product[63..32]
pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
paddq %mm1, %mm0 / 4: mm0 = digit * a[i] + cy;
movd 16(%ebx), %mm1 / 5: mm1 = a[i]
movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
psrlq $32, %mm0 / 4: cy = product[63..32]
pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
paddq %mm1, %mm0 / 5: mm0 = digit * a[i] + cy;
movd 20(%ebx), %mm1 / 6: mm1 = a[i]
movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
psrlq $32, %mm0 / 5: cy = product[63..32]
pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
paddq %mm1, %mm0 / 6: mm0 = digit * a[i] + cy;
movd 24(%ebx), %mm1 / 7: mm1 = a[i]
movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
psrlq $32, %mm0 / 6: cy = product[63..32]
pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
paddq %mm1, %mm0 / 7: mm0 = digit * a[i] + cy;
movd 28(%ebx), %mm1 / 8: mm1 = a[i]
movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
psrlq $32, %mm0 / 7: cy = product[63..32]
pmuludq %mm3, %mm1 / 8: mm1 = digit * a[i]
paddq %mm1, %mm0 / 8: mm0 = digit * a[i] + cy;
movd %mm0, 28(%edx) / 8: r[i] = product[31..0]
psrlq $32, %mm0 / 8: cy = product[63..32]
leal UNROLL32(%ebx), %ebx / a += UNROLL
leal UNROLL32(%edx), %edx / r += UNROLL
subl $UNROLL, %ecx / len -= UNROLL
jz .L17
jmp .L15
.L16:
movd 0(%ebx), %mm1 / 1: mm1 = a[i]
pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
paddq %mm1, %mm0 / 1: mm0 = digit * a[i] + cy;
movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
psrlq $32, %mm0 / 1: cy = product[63..32]
subl $1, %ecx
jz .L17
movd 4(%ebx), %mm1 / 2: mm1 = a[i]
pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
paddq %mm1, %mm0 / 2: mm0 = digit * a[i] + cy;
movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
psrlq $32, %mm0 / 2: cy = product[63..32]
subl $1, %ecx
jz .L17
movd 8(%ebx), %mm1 / 3: mm1 = a[i]
pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
paddq %mm1, %mm0 / 3: mm0 = digit * a[i] + cy;
movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
psrlq $32, %mm0 / 3: cy = product[63..32]
subl $1, %ecx
jz .L17
movd 12(%ebx), %mm1 / 4: mm1 = a[i]
pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
paddq %mm1, %mm0 / 4: mm0 = digit * a[i] + cy;
movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
psrlq $32, %mm0 / 4: cy = product[63..32]
subl $1, %ecx
jz .L17
movd 16(%ebx), %mm1 / 5: mm1 = a[i]
pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
paddq %mm1, %mm0 / 5: mm0 = digit * a[i] + cy;
movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
psrlq $32, %mm0 / 5: cy = product[63..32]
subl $1, %ecx
jz .L17
movd 20(%ebx), %mm1 / 6: mm1 = a[i]
pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
paddq %mm1, %mm0 / 6: mm0 = digit * a[i] + cy;
movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
psrlq $32, %mm0 / 6: cy = product[63..32]
subl $1, %ecx
jz .L17
movd 24(%ebx), %mm1 / 7: mm1 = a[i]
pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
paddq %mm1, %mm0 / 7: mm0 = digit * a[i] + cy;
movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
psrlq $32, %mm0 / 7: cy = product[63..32]
.L17:
movd %mm0, %eax / return (cy)
/ no emms. caller is responsible for emms
ret
SET_SIZE(big_mul_set_vec_sse2_r)
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r 8(%ebp) %edx
/ a 12(%ebp) %ebx
/ len 16(%ebp) %ecx
/ digit 20(%ebp) %mm3
/
/ In userland, there is just the one function, big_mul_set_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_set_vec_sse2(), which saves and restores MMX state
/       as necessary and ensures that preemption is disabled.
/    2. big_mul_set_vec_sse2_nsv(), which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.
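/
/ Outline of the kernel wrapper below (sketch only; kpr_disable/kpr_enable
/ and the TEST_TS/SAVE_MMX/RSTOR_MMX macros do the real work):
/
/	kpr_disable();			/* KPREEMPT_DISABLE */
/	saved_cr0 = %cr0; clts();	/* TEST_TS: clear TS so MMX won't fault */
/	if (!(saved_cr0 & CR0_TS))
/		save %mm0-%mm4;		/* live FPU/MMX state: preserve it */
/	cy = big_mul_set_vec_sse2_r(r, a, len, digit);
/	if (!(saved_cr0 & CR0_TS))
/		restore %mm0-%mm4;
/	emms();
/	%cr0 = saved_cr0;		/* put TS back the way we found it */
/	kpr_enable();			/* KPREEMPT_ENABLE */
/	return (cy);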
#if defined(MMX_MANAGE)
ENTRY(big_mul_set_vec_sse2)
pushl %ebp
movl %esp, %ebp
pushl %ebx
pushl %esi
KPREEMPT_DISABLE
TEST_TS(%ebx)
pushl %ebx
jnz .setvec_no_save
pushl %edi
SAVE_MMX_0TO4(%edi)
movl 8(%ebp), %edx
movl 12(%ebp), %ebx
movl 16(%ebp), %ecx
movd 20(%ebp), %mm3
call big_mul_set_vec_sse2_r
movl %eax, %esi
RSTOR_MMX_0TO4(%edi)
popl %edi
jmp .setvec_rtn
.setvec_no_save:
movl 8(%ebp), %edx
movl 12(%ebp), %ebx
movl 16(%ebp), %ecx
movd 20(%ebp), %mm3
call big_mul_set_vec_sse2_r
movl %eax, %esi
.setvec_rtn:
emms
popl %ebx
movl %ebx, %cr0
KPREEMPT_ENABLE
movl %esi, %eax
popl %esi
popl %ebx
leave
ret
SET_SIZE(big_mul_set_vec_sse2)
ENTRY(big_mul_set_vec_sse2_nsv)
pushl %ebp
movl %esp, %ebp
pushl %ebx
movl 8(%ebp), %edx
movl 12(%ebp), %ebx
movl 16(%ebp), %ecx
movd 20(%ebp), %mm3
call big_mul_set_vec_sse2_r
popl %ebx
leave
ret
SET_SIZE(big_mul_set_vec_sse2_nsv)
#else /* !defined(MMX_MANAGE) */
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r 8(%ebp) %edx
/ a 12(%ebp) %ebx
/ len 16(%ebp) %ecx
/ digit 20(%ebp) %mm3
ENTRY(big_mul_set_vec_sse2)
pushl %ebp
movl %esp, %ebp
pushl %ebx
movl 8(%ebp), %edx
movl 12(%ebp), %ebx
movl 16(%ebp), %ecx
movd 20(%ebp), %mm3
call big_mul_set_vec_sse2_r
popl %ebx
emms
leave
ret
SET_SIZE(big_mul_set_vec_sse2)
#endif /* MMX_MANAGE */
/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ uint32_t
/ big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r %edx
/ a %ebx
/ len %ecx
/ digit %mm3
/
/ N.B.:
/ This is strictly for internal use.
/ The interface is very light-weight.
/ All parameters are passed in registers.
/ It does not conform to the SYSV x86 ABI.
/ So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number. It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ the final code.
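/
/ C-like sketch of what the code below computes (illustrative only;
/ arguments are passed in registers, as for big_mul_set_vec_sse2_r):
/
/	uint32_t cy = 0;
/	for (i = 0; i < len; i++) {
/		uint64_t p = (uint64_t)digit * a[i] + r[i] + cy;
/		r[i] = (uint32_t)p;		/* product[31..0] */
/		cy = (uint32_t)(p >> 32);	/* product[63..32] */
/	}
/	return (cy);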
ENTRY(big_mul_add_vec_sse2_r)
xorl %eax, %eax
testl %ecx, %ecx
jz .L27
pxor %mm0, %mm0 / cy = 0
.L25:
cmpl $UNROLL, %ecx
jl .L26
movd 0(%ebx), %mm1 / 1: mm1 = a[i]
movd 0(%edx), %mm2 / 1: mm2 = r[i]
pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
paddq %mm1, %mm2 / 1: mm2 = digit * a[i] + r[i]
movd 4(%ebx), %mm1 / 2: mm1 = a[i]
paddq %mm2, %mm0 / 1: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
movd 4(%edx), %mm2 / 2: mm2 = r[i]
psrlq $32, %mm0 / 1: cy = product[63..32]
pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
paddq %mm1, %mm2 / 2: mm2 = digit * a[i] + r[i]
movd 8(%ebx), %mm1 / 3: mm1 = a[i]
paddq %mm2, %mm0 / 2: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
movd 8(%edx), %mm2 / 3: mm2 = r[i]
psrlq $32, %mm0 / 2: cy = product[63..32]
pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
paddq %mm1, %mm2 / 3: mm2 = digit * a[i] + r[i]
movd 12(%ebx), %mm1 / 4: mm1 = a[i]
paddq %mm2, %mm0 / 3: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
movd 12(%edx), %mm2 / 4: mm2 = r[i]
psrlq $32, %mm0 / 3: cy = product[63..32]
pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
paddq %mm1, %mm2 / 4: mm2 = digit * a[i] + r[i]
movd 16(%ebx), %mm1 / 5: mm1 = a[i]
paddq %mm2, %mm0 / 4: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
movd 16(%edx), %mm2 / 5: mm2 = r[i]
psrlq $32, %mm0 / 4: cy = product[63..32]
pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
paddq %mm1, %mm2 / 5: mm2 = digit * a[i] + r[i]
movd 20(%ebx), %mm1 / 6: mm1 = a[i]
paddq %mm2, %mm0 / 5: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
movd 20(%edx), %mm2 / 6: mm2 = r[i]
psrlq $32, %mm0 / 5: cy = product[63..32]
pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
paddq %mm1, %mm2 / 6: mm2 = digit * a[i] + r[i]
movd 24(%ebx), %mm1 / 7: mm1 = a[i]
paddq %mm2, %mm0 / 6: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
movd 24(%edx), %mm2 / 7: mm2 = r[i]
psrlq $32, %mm0 / 6: cy = product[63..32]
pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
paddq %mm1, %mm2 / 7: mm2 = digit * a[i] + r[i]
movd 28(%ebx), %mm1 / 8: mm1 = a[i]
paddq %mm2, %mm0 / 7: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
movd 28(%edx), %mm2 / 8: mm2 = r[i]
psrlq $32, %mm0 / 7: cy = product[63..32]
pmuludq %mm3, %mm1 / 8: mm1 = digit * a[i]
paddq %mm1, %mm2 / 8: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 8: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 28(%edx) / 8: r[i] = product[31..0]
psrlq $32, %mm0 / 8: cy = product[63..32]
leal UNROLL32(%ebx), %ebx / a += UNROLL
leal UNROLL32(%edx), %edx / r += UNROLL
subl $UNROLL, %ecx / len -= UNROLL
jz .L27
jmp .L25
.L26:
movd 0(%ebx), %mm1 / 1: mm1 = a[i]
movd 0(%edx), %mm2 / 1: mm2 = r[i]
pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
paddq %mm1, %mm2 / 1: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 1: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
psrlq $32, %mm0 / 1: cy = product[63..32]
subl $1, %ecx
jz .L27
movd 4(%ebx), %mm1 / 2: mm1 = a[i]
movd 4(%edx), %mm2 / 2: mm2 = r[i]
pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
paddq %mm1, %mm2 / 2: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 2: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
psrlq $32, %mm0 / 2: cy = product[63..32]
subl $1, %ecx
jz .L27
movd 8(%ebx), %mm1 / 3: mm1 = a[i]
movd 8(%edx), %mm2 / 3: mm2 = r[i]
pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
paddq %mm1, %mm2 / 3: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 3: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
psrlq $32, %mm0 / 3: cy = product[63..32]
subl $1, %ecx
jz .L27
movd 12(%ebx), %mm1 / 4: mm1 = a[i]
movd 12(%edx), %mm2 / 4: mm2 = r[i]
pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
paddq %mm1, %mm2 / 4: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 4: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
psrlq $32, %mm0 / 4: cy = product[63..32]
subl $1, %ecx
jz .L27
movd 16(%ebx), %mm1 / 5: mm1 = a[i]
movd 16(%edx), %mm2 / 5: mm2 = r[i]
pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
paddq %mm1, %mm2 / 5: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 5: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
psrlq $32, %mm0 / 5: cy = product[63..32]
subl $1, %ecx
jz .L27
movd 20(%ebx), %mm1 / 6: mm1 = a[i]
movd 20(%edx), %mm2 / 6: mm2 = r[i]
pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
paddq %mm1, %mm2 / 6: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 6: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
psrlq $32, %mm0 / 6: cy = product[63..32]
subl $1, %ecx
jz .L27
movd 24(%ebx), %mm1 / 7: mm1 = a[i]
movd 24(%edx), %mm2 / 7: mm2 = r[i]
pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
paddq %mm1, %mm2 / 7: mm2 = digit * a[i] + r[i]
paddq %mm2, %mm0 / 7: mm0 = digit * a[i] + r[i] + cy;
movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
psrlq $32, %mm0 / 7: cy = product[63..32]
.L27:
movd %mm0, %eax
/ no emms. caller is responsible for emms
ret
SET_SIZE(big_mul_add_vec_sse2_r)
/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r 8(%ebp) %edx
/ a 12(%ebp) %ebx
/ len 16(%ebp) %ecx
/ digit 20(%ebp) %mm3
/
/ In userland, there is just the one function, big_mul_add_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_add_vec_sse2(), which saves and restores MMX state
/       as necessary and ensures that preemption is disabled.
/    2. big_mul_add_vec_sse2_nsv(), which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.
#if defined(MMX_MANAGE)
ENTRY(big_mul_add_vec_sse2)
pushl %ebp
movl %esp, %ebp
pushl %ebx
pushl %esi
KPREEMPT_DISABLE
TEST_TS(%ebx)
pushl %ebx
jnz .addvec_no_save
pushl %edi
SAVE_MMX_0TO4(%edi)
movl 8(%ebp), %edx
movl 12(%ebp), %ebx
movl 16(%ebp), %ecx
movd 20(%ebp), %mm3
call big_mul_add_vec_sse2_r
movl %eax, %esi
RSTOR_MMX_0TO4(%edi)
popl %edi
jmp .addvec_rtn
.addvec_no_save:
movl 8(%ebp), %edx
movl 12(%ebp), %ebx
movl 16(%ebp), %ecx
movd 20(%ebp), %mm3
call big_mul_add_vec_sse2_r
movl %eax, %esi
.addvec_rtn:
emms
popl %ebx
movl %ebx, %cr0
KPREEMPT_ENABLE
movl %esi, %eax
popl %esi
popl %ebx
leave
ret
SET_SIZE(big_mul_add_vec_sse2)
ENTRY(big_mul_add_vec_sse2_nsv)
pushl %ebp
movl %esp, %ebp
pushl %ebx
movl 8(%ebp), %edx
movl 12(%ebp), %ebx
movl 16(%ebp), %ecx
movd 20(%ebp), %mm3
call big_mul_add_vec_sse2_r
popl %ebx
leave
ret
SET_SIZE(big_mul_add_vec_sse2_nsv)
#else /* !defined(MMX_MANAGE) */
ENTRY(big_mul_add_vec_sse2)
pushl %ebp
movl %esp, %ebp
pushl %ebx
movl 8(%ebp), %edx
movl 12(%ebp), %ebx
movl 16(%ebp), %ecx
movd 20(%ebp), %mm3
call big_mul_add_vec_sse2_r
popl %ebx
emms
leave
ret
SET_SIZE(big_mul_add_vec_sse2)
#endif /* MMX_MANAGE */
/ void
/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/ {
/ int i;
/
/ r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
/ for (i = 1; i < blen; ++i)
/ r[alen + i] = big_mul_add_vec_sse2(r+i, a, alen, b[i]);
/ }
#if defined(MMX_MANAGE)
ENTRY(big_mul_vec_sse2_fc)
#else
ENTRY(big_mul_vec_sse2)
#endif
subl $0x8, %esp
pushl %ebx
pushl %ebp
pushl %esi
pushl %edi
movl 40(%esp), %eax
movl %eax, 20(%esp)
pushl (%eax)
movl 40(%esp), %edi
pushl %edi
movl 40(%esp), %esi
pushl %esi
movl 40(%esp), %ebx
pushl %ebx
#if defined(MMX_MANAGE)
call big_mul_set_vec_sse2_nsv
#else
call big_mul_set_vec_sse2
#endif
addl $0x10, %esp
movl %eax, (%ebx,%edi,4)
movl 44(%esp), %eax
movl %eax, 16(%esp)
cmpl $0x1, %eax
jle .mulvec_rtn
movl $0x1, %ebp
.align 16
.mulvec_add:
movl 20(%esp), %eax
pushl (%eax,%ebp,4)
pushl %edi
pushl %esi
leal (%ebx,%ebp,4), %eax
pushl %eax
#if defined(MMX_MANAGE)
call big_mul_add_vec_sse2_nsv
#else
call big_mul_add_vec_sse2
#endif
addl $0x10, %esp
leal (%ebp,%edi), %ecx
movl %eax, (%ebx,%ecx,4)
incl %ebp
cmpl 16(%esp), %ebp
jl .mulvec_add
.mulvec_rtn:
#if defined(MMX_MANAGE)
emms
#endif
popl %edi
popl %esi
popl %ebp
popl %ebx
addl $0x8, %esp
ret
#if defined(MMX_MANAGE)
SET_SIZE(big_mul_vec_sse2_fc)
#else
SET_SIZE(big_mul_vec_sse2)
#endif
#if defined(MMX_MANAGE)
ENTRY(big_mul_vec_sse2)
pushl %ebp
movl %esp, %ebp
subl $8, %esp
pushl %edi
KPREEMPT_DISABLE
TEST_TS(%eax)
movl %eax, -8(%ebp)
jnz .mulvec_no_save
SAVE_MMX_0TO4(%edi)
movl %edi, -4(%ebp)
.mulvec_no_save:
movl 24(%ebp), %eax / blen
pushl %eax
movl 20(%ebp), %eax / b
pushl %eax
movl 16(%ebp), %eax / alen
pushl %eax
movl 12(%ebp), %eax / a
pushl %eax
movl 8(%ebp), %eax / r
pushl %eax
call big_mul_vec_sse2_fc
addl $20, %esp
movl -8(%ebp), %eax
testl $CR0_TS, %eax
jnz .mulvec_no_rstr
movl -4(%ebp), %edi
RSTOR_MMX_0TO4(%edi)
.mulvec_no_rstr:
movl %eax, %cr0
KPREEMPT_ENABLE
popl %edi
leave
ret
SET_SIZE(big_mul_vec_sse2)
#endif /* MMX_MANAGE */
#undef UNROLL
#undef UNROLL32
/ r = a * a, where a is a vector of length len and
/ r is a vector of length 2 * len
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ This function is not suitable for a truly general-purpose multiprecision
/ arithmetic library, because it does not work for "small" numbers, that is,
/ numbers of 1 or 2 digits. big_mul() just uses the ordinary big_mul_vec()
/ for any small numbers.
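/
/ The routine below exploits the identity (B == 2**32, a has digits
/ a[0..len-1]):
/
/	a**2 == SUM(i, a[i]**2 * B**(2*i))
/	      + 2 * SUM(i < j, a[i] * a[j] * B**(i+j))
/
/ First the "triangle" of cross products a[i] * a[j], i < j, is
/ accumulated into r; then that triangle is doubled in place and the
/ squared diagonal terms a[i]**2 are added in, propagating carries.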
#if defined(MMX_MANAGE)
ENTRY(big_sqr_vec_sse2_fc)
#else
ENTRY(big_sqr_vec_sse2)
pushl %ebp
movl %esp, %ebp
#endif
pushl %ebx
pushl %edi
pushl %esi
/ r[1..alen] = a[0] * a[1..alen-1]
movl 8(%ebp), %edi / r = arg(r)
movl 12(%ebp), %esi / a = arg(a)
movl 16(%ebp), %ecx / cnt = arg(alen)
movd %ecx, %mm4 / save_cnt = arg(alen)
leal 4(%edi), %edx / dst = &r[1]
movl %esi, %ebx / src = a
movd 0(%ebx), %mm3 / mm3 = a[0]
leal 4(%ebx), %ebx / src = &a[1]
subl $1, %ecx / --cnt
call big_mul_set_vec_sse2_r / r[1..alen-1] = a[0] * a[1..alen-1]
movl %edi, %edx / dst = r
movl %esi, %ebx / src = a
movd %mm4, %ecx / cnt = save_cnt
movl %eax, (%edx, %ecx, 4) / r[cnt] = cy
/ /* High-level vector C pseudocode */
/ for (i = 1; i < alen-1; ++i)
/ r[2*i + 1 ... ] += a[i] * a[i+1 .. alen-1]
/
/ /* Same thing, but slightly lower level C-like pseudocode */
/ i = 1;
/ r = &arg_r[2*i + 1];
/ a = &arg_a[i + 1];
/ digit = arg_a[i];
/ cnt = alen - 3;
/ while (cnt != 0) {
/ r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, digit);
/ r += 2;
/ ++a;
/ --cnt;
/ }
/
/ /* Same thing, but even lower level
/ * For example, pointers are raw pointers,
/ * with no scaling by object size.
/ */
/ r = arg_r + 12; /* i == 1; 2i + 1 == 3; 4*3 == 12; */
/ a = arg_a + 8;
/ digit = *(arg_a + 4);
/ cnt = alen - 3;
/ while (cnt != 0) {
/ cy = big_mul_add_vec_sse2_r();
/ *(r + 4 * cnt) = cy;
/ r += 8;
/ a += 4;
/ --cnt;
/ }
leal 4(%edi), %edi / r += 4; r = &r[1]
leal 4(%esi), %esi / a += 4; a = &a[1]
movd %mm4, %ecx / cnt = save_cnt
subl $2, %ecx / cnt = alen - 2; i in 1..alen-2
movd %ecx, %mm4 / save_cnt
jecxz .L32 / while (cnt != 0) {
.L31:
movd 0(%esi), %mm3 / digit = a[i]
leal 4(%esi), %esi / a += 4; a = &a[1]; a = &a[i + 1]
leal 8(%edi), %edi / r += 8; r = &r[2]; r = &r[2 * i + 1]
movl %edi, %edx / edx = r
movl %esi, %ebx / ebx = a
cmp $1, %ecx / The last triangle term is special
jz .L32
call big_mul_add_vec_sse2_r
movd %mm4, %ecx / cnt = save_cnt
movl %eax, (%edi, %ecx, 4) / r[cnt] = cy
subl $1, %ecx / --cnt
movd %ecx, %mm4 / save_cnt = cnt
jmp .L31 / }
.L32:
movd 0(%ebx), %mm1 / mm1 = a[i + 1]
movd 0(%edx), %mm2 / mm2 = r[2 * i + 1]
pmuludq %mm3, %mm1 / mm1 = p = digit * a[i + 1]
paddq %mm1, %mm2 / mm2 = r[2 * i + 1] + p
movd %mm2, 0(%edx) / r[2 * i + 1] += lo32(p)
psrlq $32, %mm2 / mm2 = cy
movd %mm2, 4(%edx) / r[2 * i + 2] = cy
pxor %mm2, %mm2
movd %mm2, 8(%edx) / r[2 * i + 3] = 0
movl 8(%ebp), %edx / r = arg(r)
movl 12(%ebp), %ebx / a = arg(a)
movl 16(%ebp), %ecx / cnt = arg(alen)
/ compute low-order corner
/ p = a[0]**2
/ r[0] = lo32(p)
/ cy = hi32(p)
movd 0(%ebx), %mm2 / mm2 = a[0]
pmuludq %mm2, %mm2 / mm2 = p = a[0]**2
movd %mm2, 0(%edx) / r[0] = lo32(p)
psrlq $32, %mm2 / mm2 = cy = hi32(p)
/ p = 2 * r[1]
/ t = p + cy
/ r[1] = lo32(t)
/ cy = hi32(t)
movd 4(%edx), %mm1 / mm1 = r[1]
psllq $1, %mm1 / mm1 = p = 2 * r[1]
paddq %mm1, %mm2 / mm2 = t = p + cy
movd %mm2, 4(%edx) / r[1] = low32(t)
psrlq $32, %mm2 / mm2 = cy = hi32(t)
/ r[2..$-3] = inner_diagonal[*]**2 + 2 * r[2..$-3]
subl $2, %ecx / cnt = alen - 2
.L34:
movd 4(%ebx), %mm0 / mm0 = diag = a[i+1]
pmuludq %mm0, %mm0 / mm0 = p = diag**2
paddq %mm0, %mm2 / mm2 = t = p + cy
movd %mm2, %eax
movd %eax, %mm1 / mm1 = lo32(t)
psrlq $32, %mm2 / mm2 = hi32(t)
movd 8(%edx), %mm3 / mm3 = r[2*i]
psllq $1, %mm3 / mm3 = 2*r[2*i]
paddq %mm3, %mm1 / mm1 = 2*r[2*i] + lo32(t)
movd %mm1, 8(%edx) / r[2*i] = 2*r[2*i] + lo32(t)
psrlq $32, %mm1
paddq %mm1, %mm2
movd 12(%edx), %mm3 / mm3 = r[2*i+1]
psllq $1, %mm3 / mm3 = 2*r[2*i+1]
paddq %mm3, %mm2 / mm2 = 2*r[2*i+1] + hi32(t)
movd %mm2, 12(%edx) / r[2*i+1] = mm2
psrlq $32, %mm2 / mm2 = cy
leal 8(%edx), %edx / r += 2
leal 4(%ebx), %ebx / ++a
subl $1, %ecx / --cnt
jnz .L34
/ Carry from the last triangle term must participate in the doubling,
/ but this step is not paired up with squaring an element
/ of the inner diagonal.
/ r[$-3..$-2] = 2 * r[$-3..$-2] + cy
movd 8(%edx), %mm3 / mm3 = r[2*i]
psllq $1, %mm3 / mm3 = 2*r[2*i]
paddq %mm3, %mm2 / mm2 = 2*r[2*i] + cy
movd %mm2, 8(%edx) / r[2*i] = lo32(2*r[2*i] + cy)
psrlq $32, %mm2 / mm2 = cy = hi32(2*r[2*i] + cy)
movd 12(%edx), %mm3 / mm3 = r[2*i+1]
psllq $1, %mm3 / mm3 = 2*r[2*i+1]
paddq %mm3, %mm2 / mm2 = 2*r[2*i+1] + cy
movd %mm2, 12(%edx) / r[2*i+1] = mm2
psrlq $32, %mm2 / mm2 = cy
/ compute high-order corner and add it in
/ p = a[alen - 1]**2
/ t = p + cy
/ r[alen + alen - 2] += lo32(t)
/ cy = hi32(t)
/ r[alen + alen - 1] = cy
movd 4(%ebx), %mm0 / mm0 = a[$-1]
movd 8(%edx), %mm3 / mm3 = r[$-2]
pmuludq %mm0, %mm0 / mm0 = p = a[$-1]**2
paddq %mm0, %mm2 / mm2 = t = p + cy
paddq %mm3, %mm2 / mm2 = r[$-2] + t
movd %mm2, 8(%edx) / r[$-2] = lo32(r[$-2] + t)
psrlq $32, %mm2 / mm2 = cy = hi32(r[$-2] + t)
movd 12(%edx), %mm3
paddq %mm3, %mm2
movd %mm2, 12(%edx) / r[$-1] += cy
.L35:
emms
popl %esi
popl %edi
popl %ebx
#if defined(MMX_MANAGE)
ret
SET_SIZE(big_sqr_vec_sse2_fc)
#else
leave
ret
SET_SIZE(big_sqr_vec_sse2)
#endif
#if defined(MMX_MANAGE)
ENTRY(big_sqr_vec_sse2)
pushl %ebp
movl %esp, %ebp
KPREEMPT_DISABLE
TEST_TS(%ebx)
pushl %ebx
jnz .sqr_no_save
pushl %edi
SAVE_MMX_0TO4(%edi)
call big_sqr_vec_sse2_fc
RSTOR_MMX_0TO4(%edi)
popl %edi
jmp .sqr_rtn
.sqr_no_save:
call big_sqr_vec_sse2_fc
.sqr_rtn:
popl %ebx
movl %ebx, %cr0
KPREEMPT_ENABLE
leave
ret
SET_SIZE(big_sqr_vec_sse2)
#endif /* MMX_MANAGE */
/ ------------------------------------------------------------------------
/ UMUL Implementations
/ ------------------------------------------------------------------------
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ uint32_t
/ big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r 8(%ebp) %edi
/ a 12(%ebp) %esi
/ len 16(%ebp) %ecx
/ digit 20(%ebp) (used in place as the mull memory operand)
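/
/ C-like sketch of the loop below (illustrative only); the add variant
/ further down differs only in adding r[i] into the product as well:
/
/	uint32_t cy = 0;
/	for (i = 0; i < len; i++) {
/		uint64_t p = (uint64_t)a[i] * digit + cy;	/* mull, then carry */
/		r[i] = (uint32_t)p;
/		cy = (uint32_t)(p >> 32);
/	}
/	return (cy);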
ENTRY(big_mul_set_vec_umul)
pushl %ebp
movl %esp, %ebp
pushl %esi
pushl %edi
pushl %ebx
movl 16(%ebp), %ecx
xorl %ebx, %ebx / cy = 0
testl %ecx, %ecx
movl 8(%ebp), %edi
movl 12(%ebp), %esi
je .L57
.L55:
movl (%esi), %eax / eax = a[i]
leal 4(%esi), %esi / ++a
mull 20(%ebp) / edx:eax = a[i] * digit
addl %ebx, %eax
adcl $0, %edx / edx:eax = a[i] * digit + cy
movl %eax, (%edi) / r[i] = product[31..0]
movl %edx, %ebx / cy = product[63..32]
leal 4(%edi), %edi / ++r
decl %ecx / --len
jnz .L55 / while (len != 0)
.L57:
movl %ebx, %eax
popl %ebx
popl %edi
popl %esi
leave
ret
SET_SIZE(big_mul_set_vec_umul)
/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ uint32_t
/ big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r 8(%ebp) %edi
/ a 12(%ebp) %esi
/ len 16(%ebp) %ecx
/ digit 20(%ebp) (used in place as the mull memory operand)
ENTRY(big_mul_add_vec_umul)
pushl %ebp
movl %esp, %ebp
pushl %esi
pushl %edi
pushl %ebx
movl 16(%ebp), %ecx
xorl %ebx, %ebx / cy = 0
testl %ecx, %ecx
movl 8(%ebp), %edi
movl 12(%ebp), %esi
je .L67
.align 4
.L65:
movl (%esi), %eax / eax = a[i]
leal 4(%esi), %esi / ++a
mull 20(%ebp) / edx:eax = a[i] * digit
addl (%edi), %eax
adcl $0, %edx / edx:eax = a[i] * digit + r[i]
addl %ebx, %eax
adcl $0, %edx / edx:eax = a[i] * digit + r[i] + cy
movl %eax, (%edi) / r[i] = product[31..0]
movl %edx, %ebx / cy = product[63..32]
leal 4(%edi), %edi / ++r
decl %ecx / --len
jnz .L65 / while (len != 0)
.L67:
movl %ebx, %eax
popl %ebx
popl %edi
popl %esi
leave
ret
SET_SIZE(big_mul_add_vec_umul)
#endif /* __lint */