sparcv8plus.S revision 7c478bd95313f5f23a4c958a745db2134aa03244
.ident "sparcv8plus.s, Version 1.4"
.ident "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
/*
* ====================================================================
* Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
* project.
*
* Rights for redistribution and usage in source and binary forms are
* granted according to the OpenSSL license. Warranty of any kind is
* disclaimed.
* ====================================================================
*/
/*
* This is my modest contribution to the OpenSSL project (see
* http://www.openssl.org/ for more information about it) and is
* a drop-in UltraSPARC ISA replacement for the crypto/bn/bn_asm.c
* module. For updates see http://fy.chalmers.se/~appro/hpe/.
*
* Questions-n-answers.
*
* Q. How to compile?
* A. With Sun's C compiler (SC4.x/SC5.x):
*
* cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
*
* and with gcc:
*
* gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
*
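* or if the above fails (it does if you have gas installed),
* preprocess with gcc and assemble with the native Sun as:
*
* gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
*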
* Quick-n-dirty way to fuse the module into the library.
* Provided that the library is already configured and built
* (in 0.9.2 case with no-asm option):
*
* # cd crypto/bn
* # cp /some/place/bn_asm.sparc.v8plus.S .
* # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
* # make
* # cd ../..
* # make; make test
*
* Quick-n-dirty way to get rid of it:
*
* # cd crypto/bn
* # touch bn_asm.c
* # make
* # cd ../..
* # make; make test
*
* Q. V8plus architecture? What kind of beast is that?
* A. Well, it's rather a programming model than an architecture...
* It's actually a v9-compliant (i.e. *any* UltraSPARC) CPU under
* special conditions, namely when the kernel doesn't preserve the
* upper 32 bits of otherwise 64-bit registers during a context
* switch.
*
* Q. Why just UltraSPARC? What about SuperSPARC?
* A. Original release did target UltraSPARC only. Now a SuperSPARC
* version is provided alongside. Both versions share bn_*comba[48]
* implementations (see comment later in code for explanation).
* But what's so special about this UltraSPARC implementation?
* Why didn't I let compiler do the job? Trouble is that most of
* available compilers (well, SC5.0 is the only exception) don't
* attempt to take advantage of UltraSPARC's 64-bitness under
* 32-bit kernels even though it's perfectly possible (see next
* question).
*
* Q. 64-bit registers under 32-bit kernels? Didn't you just say it
* doesn't work?
* A. You can't address *all* registers as 64-bit wide:-( The catch is
* that you actually may rely upon %o0-%o5 and %g1-%g4 being fully
* preserved if you're in a leaf function, i.e. one that never calls
* any other functions. All functions in this module are leaf and
* 10 registers is a handful. And as a matter of fact non-"comba"
* routines don't require even that much and I could even afford to
* not allocate own stack frame for 'em:-)
*
* Q. What about 64-bit kernels?
* A. What about 'em? Just kidding:-) Pure 64-bit version is currently
* under evaluation and development...
*
* Q. What about shared libraries?
* A. What about 'em? Kidding again:-) Code does *not* contain any
* code position dependencies and it's safe to include it into
* shared library as is.
*
* Q. How much faster does it go?
* A. Do you have a good benchmark? In either case below is what I
* experience with crypto/bn/expspeed.c test program:
*
* v8plus module on U10/300MHz against bn_asm.c compiled with:
*
* cc-5.0 -xarch=v8plus -xO5 -xdepend +7-12%
* cc-4.2 -xarch=v8plus -xO5 -xdepend +25-35%
* egcs-1.1.2 -mcpu=ultrasparc -O3 +35-45%
*
* v8 module on SS10/60MHz against bn_asm.c compiled with:
*
* cc-5.0 -xarch=v8 -xO5 -xdepend +7-10%
* cc-4.2 -xarch=v8 -xO5 -xdepend +10%
* egcs-1.1.2 -mv8 -O3 +35-45%
*
* As you can see it's damn hard to beat the new Sun C compiler,
* and it's first and foremost GNU C users who will appreciate this
* assembler implementation:-)
*/
/*
* Revision history.
*
* 1.0 - initial release;
* 1.1 - new loop unrolling model(*);
* - some more fine tuning;
* 1.2 - made gas friendly;
* - updates to documentation concerning v9;
* - new performance comparison matrix;
* 1.4 - native V9 bn_*_comba[48] implementation (15% more efficient)
* resulting in slight overall performance kick;
* - some retunes;
* - support for GNU as added;
*
* (*) Originally unrolled loop looked like this:
* for (;;) {
* op(p+0); if (--n==0) break;
* op(p+1); if (--n==0) break;
* op(p+2); if (--n==0) break;
* op(p+3); if (--n==0) break;
* p+=4;
* }
* I unroll according to following:
* while (n&~3) {
* op(p+0); op(p+1); op(p+2); op(p+3);
* p+=4; n-=4;
* }
* if (n) {
* op(p+0); if (--n==0) return;
* op(p+1); if (--n==0) return;
* op(p+2); return;
* }
*/
/*
* GNU assembler can't stand stuw:-(
*/
.align 32
/*
* BN_ULONG bn_mul_add_words(rp,ap,num,w)
* BN_ULONG *rp,*ap;
* int num;
* BN_ULONG w;
*/
.align 32
/*
* BN_ULONG bn_mul_words(rp,ap,num,w)
* BN_ULONG *rp,*ap;
* int num;
* BN_ULONG w;
*/
.align 32
/*
* void bn_sqr_words(r,a,n)
* BN_ULONG *r,*a;
* int n;
*/
.align 32
/*
* BN_ULONG bn_div_words(h,l,d)
* BN_ULONG h,l,d;
*/
.align 32
/*
* BN_ULONG bn_add_words(rp,ap,bp,n)
* BN_ULONG *rp,*ap,*bp;
* int n;
*/
/*
* BN_ULONG bn_sub_words(rp,ap,bp,n)
* BN_ULONG *rp,*ap,*bp;
* int n;
*/
/*
* Code below depends on the fact that upper parts of the %l0-%l7
* and %i0-%i7 are zeroed by kernel after context switch. In
* previous versions this comment stated that "the trouble is that
* it's not feasible to implement the mumbo-jumbo in less V9
* instructions:-(" which apparently isn't true thanks to
* 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
* results not from the shorter code, but from elimination of
* multicycle non-pairable 'rd %y,%rd' instructions.
*
* Andy.
*/
#define FRAME_SIZE -96
/*
* Here is register usage map for *all* routines below.
*/
.align 32
/*
* void bn_mul_comba8(r,a,b)
* BN_ULONG *r,*a,*b;
*/
.align 32
/*
* void bn_mul_comba4(r,a,b)
* BN_ULONG *r,*a,*b;
*/
.align 32
.align 32
/*
* void bn_sqr_comba4(r,a)
* BN_ULONG *r,*a;
*/
.align 32