/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2004 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/asm_linkage.h>
#if defined(lint) || defined(__lint)
#include <sys/types.h>
/* ARGSUSED */
uint64_t
big_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{ return (0); }
/* ARGSUSED */
uint64_t
big_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{ return (0); }
/* ARGSUSED */
void
big_sqr_vec64(uint64_t *r, uint64_t *a, int len)
{}
#else /* lint */
/ ------------------------------------------------------------------------
/
/ Implementation of big_mul_set_vec which exploits
/ the 64X64->128 bit unsigned multiply instruction.
/
/ As defined in Sun's bignum library for pkcs11, bignums are
/ composed of an array of 32-bit "digits" along with descriptive
/ information. The arrays of digits are only required to be
/ aligned on a 32-bit boundary. This implementation works only
/ when the operand and result vectors happen to be 64-bit aligned
/ and have an even number of digits.
/
/ ------------------------------------------------------------------------
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64-bit aligned.
/
/ uint64_t
/ big_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
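/
/ The following C sketch is illustrative only (it is not the library's
/ reference code): uint128_t stands for a 128-bit unsigned type, and
/ lo64()/hi64() denote the low and high 64 bits of such a value, as in
/ the register comments below. It shows the operation the unrolled
/ code performs:
/
/	uint64_t
/	big_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/	{
/		uint128_t p;
/		uint64_t cy = 0;
/		int i;
/
/		for (i = 0; i < len; i++) {
/			p = (uint128_t)a[i] * digit + cy;
/			r[i] = lo64(p);
/			cy = hi64(p);
/		}
/		return (cy);
/	}
/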
ENTRY(big_mul_set_vec64)
xorq %rax, %rax / if (len == 0) return (0)
testq %rdx, %rdx
jz .L17
movq %rdx, %r8 / Use r8 for len; %rdx is used by mul
xorq %r9, %r9 / cy = 0
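/ Unrolled loop: multiply and store eight 64-bit digits per pass.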
.L15:
cmpq $8, %r8 / len < 8 ?
jb .L16
movq 0(%rsi), %rax / rax = a[0]
movq 8(%rsi), %r11 / prefetch a[1]
mulq %rcx / p = a[0] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 0(%rdi) / r[0] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 16(%rsi), %r11 / prefetch a[2]
mulq %rcx / p = a[1] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 8(%rdi) / r[1] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 24(%rsi), %r11 / prefetch a[3]
mulq %rcx / p = a[2] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 16(%rdi) / r[2] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 32(%rsi), %r11 / prefetch a[4]
mulq %rcx / p = a[3] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 24(%rdi) / r[3] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 40(%rsi), %r11 / prefetch a[5]
mulq %rcx / p = a[4] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 32(%rdi) / r[4] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 48(%rsi), %r11 / prefetch a[6]
mulq %rcx / p = a[5] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 40(%rdi) / r[5] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 56(%rsi), %r11 / prefetch a[7]
mulq %rcx / p = a[6] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 48(%rdi) / r[6] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
mulq %rcx / p = a[7] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 56(%rdi) / r[7] = lo(p)
movq %rdx, %r9 / cy = hi(p)
addq $64, %rsi
addq $64, %rdi
subq $8, %r8
jz .L17
jmp .L15
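/ Tail: multiply the remaining 1 to 7 digits one at a time.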
.L16:
movq 0(%rsi), %rax
mulq %rcx / p = a[0] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 0(%rdi) / r[0] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 8(%rsi), %rax
mulq %rcx / p = a[1] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 8(%rdi) / r[1] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 16(%rsi), %rax
mulq %rcx / p = a[2] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 16(%rdi) / r[2] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 24(%rsi), %rax
mulq %rcx / p = a[3] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 24(%rdi) / r[3] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 32(%rsi), %rax
mulq %rcx / p = a[4] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 32(%rdi) / r[4] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 40(%rsi), %rax
mulq %rcx / p = a[5] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 40(%rdi) / r[5] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 48(%rsi), %rax
mulq %rcx / p = a[6] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 48(%rdi) / r[6] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
.L17:
movq %r9, %rax
ret
SET_SIZE(big_mul_set_vec64)
/ ------------------------------------------------------------------------
/
/ Implementation of big_mul_add_vec which exploits
/ the 64X64->128 bit unsigned multiply instruction.
/
/ As defined in Sun's bignum library for pkcs11, bignums are
/ composed of an array of 32-bit "digits" along with descriptive
/ information. The arrays of digits are only required to be
/ aligned on a 32-bit boundary. This implementation works only
/ when the operand and result vectors happen to be 64-bit aligned
/ and have an even number of digits.
/
/ ------------------------------------------------------------------------
/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64-bit aligned.
/
/ uint64_t
/ big_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
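/
/ The structure is the same as big_mul_set_vec64 above; the only
/ difference is that each product also accumulates the existing r[i].
/ In C terms (illustrative sketch only, using the same uint128_t and
/ lo64()/hi64() notation), the loop body is:
/
/	p = (uint128_t)a[i] * digit + r[i] + cy;
/	r[i] = lo64(p);
/	cy = hi64(p);
/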
ENTRY(big_mul_add_vec64)
xorq %rax, %rax / if (len == 0) return (0)
testq %rdx, %rdx
jz .L27
movq %rdx, %r8 / Use r8 for len; %rdx is used by mul
xorq %r9, %r9 / cy = 0
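/ Unrolled loop: multiply-accumulate eight 64-bit digits per pass.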
.L25:
cmpq $8, %r8 / len < 8 ?
jb .L26
movq 0(%rsi), %rax / rax = a[0]
movq 0(%rdi), %r10 / r10 = r[0]
movq 8(%rsi), %r11 / prefetch a[1]
mulq %rcx / p = a[0] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[0]
movq 8(%rdi), %r10 / prefetch r[1]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 0(%rdi) / r[0] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 16(%rsi), %r11 / prefetch a[2]
mulq %rcx / p = a[1] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[1]
movq 16(%rdi), %r10 / prefetch r[2]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 8(%rdi) / r[1] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 24(%rsi), %r11 / prefetch a[3]
mulq %rcx / p = a[2] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[2]
movq 24(%rdi), %r10 / prefetch r[3]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 16(%rdi) / r[2] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 32(%rsi), %r11 / prefetch a[4]
mulq %rcx / p = a[3] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[3]
movq 32(%rdi), %r10 / prefetch r[4]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 24(%rdi) / r[3] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 40(%rsi), %r11 / prefetch a[5]
mulq %rcx / p = a[4] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[4]
movq 40(%rdi), %r10 / prefetch r[5]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 32(%rdi) / r[4] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 48(%rsi), %r11 / prefetch a[6]
mulq %rcx / p = a[5] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[5]
movq 48(%rdi), %r10 / prefetch r[6]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 40(%rdi) / r[5] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 56(%rsi), %r11 / prefetch a[7]
mulq %rcx / p = a[6] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[6]
movq 56(%rdi), %r10 / prefetch r[7]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 48(%rdi) / r[6] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
mulq %rcx / p = a[7] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[7]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 56(%rdi) / r[7] = lo(p)
movq %rdx, %r9 / cy = hi(p)
addq $64, %rsi
addq $64, %rdi
subq $8, %r8
jz .L27
jmp .L25
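/ Tail: multiply-accumulate the remaining 1 to 7 digits one at a time.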
.L26:
movq 0(%rsi), %rax
movq 0(%rdi), %r10
mulq %rcx / p = a[0] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[0]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 0(%rdi) / r[0] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 8(%rsi), %rax
movq 8(%rdi), %r10
mulq %rcx / p = a[1] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[1]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 8(%rdi) / r[1] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 16(%rsi), %rax
movq 16(%rdi), %r10
mulq %rcx / p = a[2] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[2]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 16(%rdi) / r[2] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 24(%rsi), %rax
movq 24(%rdi), %r10
mulq %rcx / p = a[3] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[3]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 24(%rdi) / r[3] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 32(%rsi), %rax
movq 32(%rdi), %r10
mulq %rcx / p = a[4] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[4]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 32(%rdi) / r[4] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 40(%rsi), %rax
movq 40(%rdi), %r10
mulq %rcx / p = a[5] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[5]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 40(%rdi) / r[5] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 48(%rsi), %rax
movq 48(%rdi), %r10
mulq %rcx / p = a[6] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[6]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 48(%rdi) / r[6] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
.L27:
movq %r9, %rax
ret
SET_SIZE(big_mul_add_vec64)
/ void
/ big_sqr_vec64(uint64_t *r, uint64_t *a, int len)
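/
/ r = a * a, where r must be 2 * len digits long.
/
/ The computation is done in two passes. The first pass, sketched
/ below in C (illustrative only; tr, ta and tlen mirror the names in
/ the register comments), accumulates the off-diagonal products
/ a[i] * a[j], i < j, into r[1 .. 2*len-2] using the two routines
/ above:
/
/	tr = r + 1;
/	ta = a;
/	tlen = len - 1;
/	tr[tlen] = big_mul_set_vec64(tr, ta + 1, tlen, ta[0]);
/	while (--tlen > 0) {
/		tr += 2;
/		++ta;
/		tr[tlen] = big_mul_add_vec64(tr, ta + 1, tlen, ta[0]);
/	}
/
/ The second pass (see the comment at .L32 below) doubles those
/ off-diagonal terms and adds in the diagonal squares a[i] * a[i].
/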
ENTRY(big_sqr_vec64)
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
pushq %rdx / save arg3, len
pushq %rsi / save arg2, a
pushq %rdi / save arg1, r
leaq 8(%rdi), %r13 / tr = r + 1
movq %rsi, %r14 / ta = a
movq %rdx, %r15 / tlen = len
decq %r15 / tlen = len - 1
movq %r13, %rdi / arg1 = tr
leaq 8(%r14), %rsi / arg2 = ta + 1
movq %r15, %rdx / arg3 = tlen
movq 0(%r14), %rcx / arg4 = ta[0]
call big_mul_set_vec64
movq %rax, 0(%r13, %r15, 8) / tr[tlen] = cy
.L31:
decq %r15 / --tlen
jz .L32 / while (--tlen != 0)
addq $16, %r13 / tr += 2
addq $8, %r14 / ++ta
movq %r13, %rdi / arg1 = tr
leaq 8(%r14), %rsi / arg2 = ta + 1
movq %r15, %rdx / arg3 = tlen
movq 0(%r14), %rcx / arg4 = ta[0]
call big_mul_add_vec64
movq %rax, 0(%r13, %r15, 8) / tr[tlen] = cy
jmp .L31
.L32:
/ No more function calls after this.
/ Restore arguments to registers.
/ However, don't use %rdx for arg3, len, because it is heavily
/ used by the hardware MUL instruction. Use %r8, instead.
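/
/ What follows is the second pass: double the off-diagonal sums
/ already in r[] and add the diagonal squares a[row] * a[row].
/ In C terms (illustrative sketch only; uint128_t and lo64()/hi64()
/ as in the comments above):
/
/	s = (uint128_t)a[0] * a[0];
/	r[0] = lo64(s);
/	cy = hi64(s);
/	p = ((uint128_t)r[1] << 1) + cy;
/	r[1] = lo64(p);
/	cy = hi64(p);
/	row = 1;
/	col = 2;
/	while (row < len) {
/		s = (uint128_t)a[row] * a[row];
/		p = (uint128_t)r[col] << 1;
/		t = p + s;
/		t2 = lo64(t) + cy;
/		r[col] = lo64(t2);
/		cy = hi64(t) + hi64(t2);
/		if (row == len - 1)
/			break;
/		p = ((uint128_t)r[col + 1] << 1) + cy;
/		r[col + 1] = lo64(p);
/		cy = hi64(p);
/		++row;
/		col += 2;
/	}
/	r[col + 1] = lo64(cy);
/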
movq 0(%rsp), %rdi / %rdi == arg1 == r
movq 8(%rsp), %rsi / %rsi == arg2 == a
movq 16(%rsp), %r8 / %r8 == arg3 == len
movq 0(%rsi), %rax / %rax = a[0];
mulq %rax / s = %rdx:%rax = a[0]**2
movq %rax, 0(%rdi) / r[0] = lo64(s)
movq %rdx, %r9 / cy = hi64(s)
xorq %rdx, %rdx
movq 8(%rdi), %rax / p = %rdx:%rax = r[1]
addq %rax, %rax
adcq $0, %rdx / p = p << 1
addq %r9, %rax
adcq $0, %rdx / p = (r[1] << 1) + cy
movq %rax, 8(%rdi) / r[1] = lo64(p)
movq %rdx, %r9 / cy = hi64(p)
movq $1, %r11 / row = 1
movq $2, %r12 / col = 2
movq %r8, %r15
decq %r15 / tlen = len - 1
.L33:
cmpq %r8, %r11 / row - len
jae .L34 / while (row < len)
movq 0(%rsi, %r11, 8), %rax / s = (uint128_t)a[row]
mulq %rax / s = s * s
xorq %rbx, %rbx
movq 0(%rdi, %r12, 8), %rcx / p = (uint128_t)r[col]
addq %rcx, %rcx
adcq $0, %rbx / p = p << 1
addq %rcx, %rax
adcq %rbx, %rdx / t = p + s
xorq %r10, %r10
movq %rax, %rbp / t2 = 0:lo64(t)
addq %r9, %rbp
adcq $0, %r10 / t2 = %r10:%rbp = lo64(t) + cy
movq %rbp, 0(%rdi, %r12, 8) / r[col] = lo64(t2)
xorq %rcx, %rcx
movq %rdx, %r9
addq %r10, %r9
adcq $0, %rcx / cy = hi64(t) + hi64(t2)
cmpq %r11, %r15
je .L34 / if (row == len - 1) break
xorq %rdx, %rdx
movq 8(%rdi, %r12, 8), %rax
addq %rax, %rax
adcq $0, %rdx
addq %r9, %rax
adcq %rcx, %rdx / p = (lo64(r[col+1]) << 1) + cy
movq %rax, 8(%rdi, %r12, 8) / r[col+1] = lo64(p)
movq %rdx, %r9 / cy = hi64(p)
incq %r11 / ++row
addq $2, %r12 / col += 2
jmp .L33
.L34:
movq %r9, 8(%rdi, %r12, 8) / r[col+1] = lo64(cy)
addq $24, %rsp / skip %rdi, %rsi, %rdx
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
ret
SET_SIZE(big_sqr_vec64)
#endif /* lint */