/* x86_64-gcc.c, revision 7c478bd95313f5f23a4c958a745db2134aa03244 */
/*
 * x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 *
 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
 *    versions, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see, it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there must be room for fine-tuning.
 *
 * Q. Why inline assembler?
 * A. x86_64 features its own ABI that I'm not familiar with, which is
 *    why I decided to let the compiler take care of the subroutine
 *    prologue/epilogue as well as register allocation.
 *
 * Q. How much faster does it get?
 * A. Unfortunately, people sitting on x86_64 hardware are prohibited
 *    from disclosing performance numbers, so they (SuSE Labs, to be
 *    specific) wouldn't tell me. However! A very similar coding
 *    technique (reaching out for the 128-bit result of a 64x64-bit
 *    multiplication) yields a >3x performance improvement on MIPS,
 *    and I see no reason why the gain on x86_64 would be much
 *    different:-)
 */
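/*
 * For readers unfamiliar with the technique named above: on GCC/x86_64
 * the full 128-bit product of a 64x64-bit multiplication can also be
 * obtained portably via the unsigned __int128 extension, which the
 * compiler lowers to a single mulq. A minimal sketch, kept under
 * "#if 0" like the reference macros further down; it is an
 * illustration, not part of the accelerator itself.
 */
#if 0
static void mul64x64(unsigned long a, unsigned long b,
	unsigned long *lo, unsigned long *hi)
{
	unsigned __int128 t = (unsigned __int128)a * b;	/* one mulq */

	*lo = (unsigned long)t;		/* low  64 bits, i.e. %rax */
	*hi = (unsigned long)(t >> 64);	/* high 64 bits, i.e. %rdx */
}
#endif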
#define BN_ULONG unsigned long
/*
 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
 * "g"(0) lets the compiler decide where it wants to keep
 * the value of zero;
 */
asm ("mulq %3" \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "cc"); \
} while (0)
asm ("mulq %3" \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "cc"); \
} while (0)
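/*
 * In portable C the mul_add() macro above amounts to the following
 * sketch (mul_add_ref is for documentation only, not part of the
 * original code): r += a*word + carry, with the high word of the
 * result going back out through carry.
 */
#if 0
static void mul_add_ref(BN_ULONG *r, BN_ULONG a, BN_ULONG word,
	BN_ULONG *carry)
{
	/* (2^64-1)^2 + 2*(2^64-1) == 2^128-1, so the sum cannot overflow */
	unsigned __int128 t = (unsigned __int128)a * word + *r + *carry;

	*r = (BN_ULONG)t;
	*carry = (BN_ULONG)(t >> 64);
}
#endif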
asm ("mulq %2" \
: "a"(a) \
: "cc");
/* rp[i] += ap[i]*w for num words; returns the leftover high word */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
	{
	BN_ULONG c1=0;

	if (num <= 0) return(c1);

	while (num&~3)
		{
		mul_add(rp[0],ap[0],w,c1);
		mul_add(rp[1],ap[1],w,c1);
		mul_add(rp[2],ap[2],w,c1);
		mul_add(rp[3],ap[3],w,c1);
		ap+=4; rp+=4; num-=4;
		}
	if (num)
		{
		mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
		mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
		mul_add(rp[2],ap[2],w,c1); return c1;
		}
	return(c1);
	}
/* rp[i] = ap[i]*w (carry-chained); returns the final carry word */
BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
	{
	BN_ULONG c1=0;

	if (num <= 0) return(c1);

	while (num&~3)
		{
		mul(rp[0],ap[0],w,c1);
		mul(rp[1],ap[1],w,c1);
		mul(rp[2],ap[2],w,c1);
		mul(rp[3],ap[3],w,c1);
		ap+=4; rp+=4; num-=4;
		}
	if (num)
		{
		mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
		mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
		mul(rp[2],ap[2],w,c1);
		}
	return(c1);
	}
/* (r[2i+1],r[2i]) = a[i]^2 for n words */
void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
	{
	if (n <= 0) return;

	while (n&~3)
		{
		sqr(r[0],r[1],a[0]);
		sqr(r[2],r[3],a[1]);
		sqr(r[4],r[5],a[2]);
		sqr(r[6],r[7],a[3]);
		a+=4; r+=8; n-=4;
		}
	if (n)
		{
		sqr(r[0],r[1],a[0]); if (--n == 0) return;
		sqr(r[2],r[3],a[1]); if (--n == 0) return;
		sqr(r[4],r[5],a[2]);
		}
	}
asm ("divq %4"
: "a"(l),"d"(h),"g"(d)
: "cc");
return ret;
}
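/*
 * divq computes the quotient of the 128-bit value (h<<64)|l divided
 * by d; it raises #DE unless the quotient fits in 64 bits, i.e.
 * callers must guarantee h < d. A hypothetical sanity check against
 * __int128 arithmetic (illustration only, not part of the original
 * code):
 */
#if 0
static int bn_div_words_check(void)
{
	BN_ULONG h = 1, l = 0, d = 3;	/* h < d, so divq cannot fault */
	unsigned __int128 x = ((unsigned __int128)h << 64) | l;

	return bn_div_words(h, l, d) == (BN_ULONG)(x / d);
}
#endif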
/* rp[] = ap[] + bp[] for n words; returns the final carry */
BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
{	BN_ULONG ret=0,i=0;

	if (n <= 0) return 0;

	asm (
	"	subq	%2,%2		\n"
	".align 16			\n"
	"1:	movq	(%4,%2,8),%0	\n"
	"	adcq	(%5,%2,8),%0	\n"
	"	movq	%0,(%3,%2,8)	\n"
	"	leaq	1(%2),%2	\n"
	"	loop	1b		\n"
	"	sbbq	%0,%0		\n"
		: "=&a"(ret),"+c"(n),"=&r"(i)
		: "r"(rp),"r"(ap),"r"(bp)
		: "cc"
	);

	return ret&1;
}
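/*
 * The loop above clears the index register with subq (which also
 * clears CF), then rides a single adcq chain across all n words;
 * the final sbbq materializes CF as 0 or -1, hence the "ret&1".
 * A portable reference version (a sketch, not part of the original
 * code):
 */
#if 0
static BN_ULONG bn_add_words_ref(BN_ULONG *rp, const BN_ULONG *ap,
	const BN_ULONG *bp, int n)
{
	BN_ULONG c = 0;	/* running carry, always 0 or 1 */
	int i;

	for (i = 0; i < n; i++) {
		BN_ULONG t = ap[i] + c;
		c = (t < c);		/* carry out of ap[i]+c */
		rp[i] = t + bp[i];
		c += (rp[i] < t);	/* carry out of t+bp[i] */
	}
	return c;
}
#endif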
#ifndef SIMICS
/* rp[] = ap[] - bp[] for n words; returns the final borrow */
BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
{	BN_ULONG ret=0,i=0;

	if (n <= 0) return 0;

	asm (
	"	subq	%2,%2		\n"
	".align 16			\n"
	"1:	movq	(%4,%2,8),%0	\n"
	"	sbbq	(%5,%2,8),%0	\n"
	"	movq	%0,(%3,%2,8)	\n"
	"	leaq	1(%2),%2	\n"
	"	loop	1b		\n"
	"	sbbq	%0,%0		\n"
		: "=&a"(ret),"+c"(n),"=&r"(i)
		: "r"(rp),"r"(ap),"r"(bp)
		: "cc"
	);

	return ret&1;
}
#else
/* Simics 1.4<7 has buggy sbbq:-( */
#define BN_MASK2 0xffffffffffffffffL
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
	{
	BN_ULONG t1,t2;
	int c=0;

	if (n <= 0) return((BN_ULONG)0);

	for (;;)
		{
		t1=a[0]; t2=b[0];
		r[0]=(t1-t2-c)&BN_MASK2;
		if (t1 != t2) c=(t1 < t2);
		if (--n <= 0) break;

		t1=a[1]; t2=b[1];
		r[1]=(t1-t2-c)&BN_MASK2;
		if (t1 != t2) c=(t1 < t2);
		if (--n <= 0) break;

		t1=a[2]; t2=b[2];
		r[2]=(t1-t2-c)&BN_MASK2;
		if (t1 != t2) c=(t1 < t2);
		if (--n <= 0) break;

		t1=a[3]; t2=b[3];
		r[3]=(t1-t2-c)&BN_MASK2;
		if (t1 != t2) c=(t1 < t2);
		if (--n <= 0) break;

		a+=4;
		b+=4;
		r+=4;
		}
	return(c);
	}
#endif
/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
#if 0
/* original macros are kept for reference purposes */
#define mul_add_c(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b);		\
	t1 = ta * tb;			\
	t2 = BN_UMULT_HIGH(ta,tb);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define mul_add_c2(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b),t0;	\
	t1 = BN_UMULT_HIGH(ta,tb);	\
	t0 = ta * tb;			\
	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}
#else
asm ("mulq %3" \
: "a"(a),"m"(b) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "cc"); \
} while (0)
asm ("mulq %2" \
: "a"(a[i]) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "cc"); \
} while (0)
asm ("mulq %3" \
: "a"(a),"m"(b) \
: "cc"); \
asm ("addq %0,%0; adcq %2,%1" \
: "g"(0) \
: "cc"); \
asm ("addq %0,%0; adcq %2,%1" \
: "g"(0) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "cc"); \
} while (0)
#endif
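/*
 * The Comba routines below accumulate one column of partial products
 * at a time in a three-word accumulator (c2,c1,c0), store the lowest
 * word into r[], and rotate the roles of c1,c2,c3 between columns.
 * In portable terms, mul_add_c amounts to this sketch (mul_add_c_ref
 * is illustrative, not part of the original code):
 */
#if 0
static void mul_add_c_ref(BN_ULONG a, BN_ULONG b,
	BN_ULONG *c0, BN_ULONG *c1, BN_ULONG *c2)
{
	unsigned __int128 t = (unsigned __int128)a * b;
	BN_ULONG lo = (BN_ULONG)t, hi = (BN_ULONG)(t >> 64);

	*c0 += lo; hi += (*c0 < lo);	/* hi <= 2^64-2, cannot overflow */
	*c1 += hi; *c2 += (*c1 < hi);
}
#endif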
/* r[0..15] = a[0..7] * b[0..7], computed column by column */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	mul_add_c(a[4],b[0],c2,c3,c1);
	mul_add_c(a[3],b[1],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[1],b[3],c2,c3,c1);
	mul_add_c(a[0],b[4],c2,c3,c1);
	r[4]=c2;
	c2=0;
	mul_add_c(a[0],b[5],c3,c1,c2);
	mul_add_c(a[1],b[4],c3,c1,c2);
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	mul_add_c(a[4],b[1],c3,c1,c2);
	mul_add_c(a[5],b[0],c3,c1,c2);
	r[5]=c3;
	c3=0;
	mul_add_c(a[6],b[0],c1,c2,c3);
	mul_add_c(a[5],b[1],c1,c2,c3);
	mul_add_c(a[4],b[2],c1,c2,c3);
	mul_add_c(a[3],b[3],c1,c2,c3);
	mul_add_c(a[2],b[4],c1,c2,c3);
	mul_add_c(a[1],b[5],c1,c2,c3);
	mul_add_c(a[0],b[6],c1,c2,c3);
	r[6]=c1;
	c1=0;
	mul_add_c(a[0],b[7],c2,c3,c1);
	mul_add_c(a[1],b[6],c2,c3,c1);
	mul_add_c(a[2],b[5],c2,c3,c1);
	mul_add_c(a[3],b[4],c2,c3,c1);
	mul_add_c(a[4],b[3],c2,c3,c1);
	mul_add_c(a[5],b[2],c2,c3,c1);
	mul_add_c(a[6],b[1],c2,c3,c1);
	mul_add_c(a[7],b[0],c2,c3,c1);
	r[7]=c2;
	c2=0;
	mul_add_c(a[7],b[1],c3,c1,c2);
	mul_add_c(a[6],b[2],c3,c1,c2);
	mul_add_c(a[5],b[3],c3,c1,c2);
	mul_add_c(a[4],b[4],c3,c1,c2);
	mul_add_c(a[3],b[5],c3,c1,c2);
	mul_add_c(a[2],b[6],c3,c1,c2);
	mul_add_c(a[1],b[7],c3,c1,c2);
	r[8]=c3;
	c3=0;
	mul_add_c(a[2],b[7],c1,c2,c3);
	mul_add_c(a[3],b[6],c1,c2,c3);
	mul_add_c(a[4],b[5],c1,c2,c3);
	mul_add_c(a[5],b[4],c1,c2,c3);
	mul_add_c(a[6],b[3],c1,c2,c3);
	mul_add_c(a[7],b[2],c1,c2,c3);
	r[9]=c1;
	c1=0;
	mul_add_c(a[7],b[3],c2,c3,c1);
	mul_add_c(a[6],b[4],c2,c3,c1);
	mul_add_c(a[5],b[5],c2,c3,c1);
	mul_add_c(a[4],b[6],c2,c3,c1);
	mul_add_c(a[3],b[7],c2,c3,c1);
	r[10]=c2;
	c2=0;
	mul_add_c(a[4],b[7],c3,c1,c2);
	mul_add_c(a[5],b[6],c3,c1,c2);
	mul_add_c(a[6],b[5],c3,c1,c2);
	mul_add_c(a[7],b[4],c3,c1,c2);
	r[11]=c3;
	c3=0;
	mul_add_c(a[7],b[5],c1,c2,c3);
	mul_add_c(a[6],b[6],c1,c2,c3);
	mul_add_c(a[5],b[7],c1,c2,c3);
	r[12]=c1;
	c1=0;
	mul_add_c(a[6],b[7],c2,c3,c1);
	mul_add_c(a[7],b[6],c2,c3,c1);
	r[13]=c2;
	c2=0;
	mul_add_c(a[7],b[7],c3,c1,c2);
	r[14]=c3;
	r[15]=c1;
	}
/* r[0..7] = a[0..3] * b[0..3], computed column by column */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	mul_add_c(a[1],b[3],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[3],b[1],c2,c3,c1);
	r[4]=c2;
	c2=0;
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	r[5]=c3;
	c3=0;
	mul_add_c(a[3],b[3],c1,c2,c3);
	r[6]=c1;
	r[7]=c2;
	}
/* r[0..15] = a[0..7]^2; off-diagonal products are doubled via sqr_add_c2 */
void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	sqr_add_c(a,0,c1,c2,c3);
	r[0]=c1;
	c1=0;
	sqr_add_c2(a,1,0,c2,c3,c1);
	r[1]=c2;
	c2=0;
	sqr_add_c(a,1,c3,c1,c2);
	sqr_add_c2(a,2,0,c3,c1,c2);
	r[2]=c3;
	c3=0;
	sqr_add_c2(a,3,0,c1,c2,c3);
	sqr_add_c2(a,2,1,c1,c2,c3);
	r[3]=c1;
	c1=0;
	sqr_add_c(a,2,c2,c3,c1);
	sqr_add_c2(a,3,1,c2,c3,c1);
	sqr_add_c2(a,4,0,c2,c3,c1);
	r[4]=c2;
	c2=0;
	sqr_add_c2(a,5,0,c3,c1,c2);
	sqr_add_c2(a,4,1,c3,c1,c2);
	sqr_add_c2(a,3,2,c3,c1,c2);
	r[5]=c3;
	c3=0;
	sqr_add_c(a,3,c1,c2,c3);
	sqr_add_c2(a,4,2,c1,c2,c3);
	sqr_add_c2(a,5,1,c1,c2,c3);
	sqr_add_c2(a,6,0,c1,c2,c3);
	r[6]=c1;
	c1=0;
	sqr_add_c2(a,7,0,c2,c3,c1);
	sqr_add_c2(a,6,1,c2,c3,c1);
	sqr_add_c2(a,5,2,c2,c3,c1);
	sqr_add_c2(a,4,3,c2,c3,c1);
	r[7]=c2;
	c2=0;
	sqr_add_c(a,4,c3,c1,c2);
	sqr_add_c2(a,5,3,c3,c1,c2);
	sqr_add_c2(a,6,2,c3,c1,c2);
	sqr_add_c2(a,7,1,c3,c1,c2);
	r[8]=c3;
	c3=0;
	sqr_add_c2(a,7,2,c1,c2,c3);
	sqr_add_c2(a,6,3,c1,c2,c3);
	sqr_add_c2(a,5,4,c1,c2,c3);
	r[9]=c1;
	c1=0;
	sqr_add_c(a,5,c2,c3,c1);
	sqr_add_c2(a,6,4,c2,c3,c1);
	sqr_add_c2(a,7,3,c2,c3,c1);
	r[10]=c2;
	c2=0;
	sqr_add_c2(a,7,4,c3,c1,c2);
	sqr_add_c2(a,6,5,c3,c1,c2);
	r[11]=c3;
	c3=0;
	sqr_add_c(a,6,c1,c2,c3);
	sqr_add_c2(a,7,5,c1,c2,c3);
	r[12]=c1;
	c1=0;
	sqr_add_c2(a,7,6,c2,c3,c1);
	r[13]=c2;
	c2=0;
	sqr_add_c(a,7,c3,c1,c2);
	r[14]=c3;
	r[15]=c1;
	}
/* r[0..7] = a[0..3]^2; off-diagonal products are doubled via sqr_add_c2 */
void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	sqr_add_c(a,0,c1,c2,c3);
	r[0]=c1;
	c1=0;
	sqr_add_c2(a,1,0,c2,c3,c1);
	r[1]=c2;
	c2=0;
	sqr_add_c(a,1,c3,c1,c2);
	sqr_add_c2(a,2,0,c3,c1,c2);
	r[2]=c3;
	c3=0;
	sqr_add_c2(a,3,0,c1,c2,c3);
	sqr_add_c2(a,2,1,c1,c2,c3);
	r[3]=c1;
	c1=0;
	sqr_add_c(a,2,c2,c3,c1);
	sqr_add_c2(a,3,1,c2,c3,c1);
	r[4]=c2;
	c2=0;
	sqr_add_c2(a,3,2,c3,c1,c2);
	r[5]=c3;
	c3=0;
	sqr_add_c(a,3,c1,c2,c3);
	r[6]=c1;
	r[7]=c2;
	}
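/*
 * An illustrative self-test for the 4x4 Comba multiply, comparing it
 * against schoolbook long multiplication done with __int128. The
 * harness (bn_mul_comba4_check) is hypothetical and not part of the
 * original code.
 */
#if 0
static int bn_mul_comba4_check(BN_ULONG *a, BN_ULONG *b)
{
	BN_ULONG r[8], ref[8] = {0};
	int i, j;

	/* schoolbook reference: ref[] = a[] * b[] */
	for (i = 0; i < 4; i++) {
		BN_ULONG c = 0;
		for (j = 0; j < 4; j++) {
			unsigned __int128 t = (unsigned __int128)a[i] * b[j]
				+ ref[i + j] + c;
			ref[i + j] = (BN_ULONG)t;
			c = (BN_ULONG)(t >> 64);
		}
		ref[i + 4] = c;
	}

	bn_mul_comba4(r, a, b);
	for (i = 0; i < 8; i++)
		if (r[i] != ref[i]) return 0;
	return 1;
}
#endif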