3421N/A#!/usr/bin/env perl
3421N/A#
3421N/A# ====================================================================
3421N/A# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
3421N/A# project. The module is, however, dual licensed under OpenSSL and
3421N/A# CRYPTOGAMS licenses depending on where you obtain it. For further
3421N/A# details see http://www.openssl.org/~appro/cryptogams/.
3421N/A# ====================================================================
3421N/A#
3421N/A# October 2012
3421N/A#
3421N/A# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
3421N/A# in bn_gf2m.c. It's a kind of low-hanging mechanical port from C for
3421N/A# the time being... except that it has two code paths: one suitable
3421N/A# for all SPARCv9 processors and one for VIS3-capable ones. The former
3421N/A# delivers ~25-45% more performance (more for longer keys) on the
3421N/A# heaviest DH and DSA verify operations on the venerable UltraSPARC II.
3421N/A# On T4 the VIS3 code is ~100-230% faster than gcc-generated code and
3421N/A# ~35-90% faster than the pure SPARCv9 code path.
3421N/A
# Extra stack bytes requested by the software path's `save` (on top of
# STACK_FRAME): room for the 16-entry lookup table of 8-byte words that
# the generated code fills with stx at offsets 0*8 .. 15*8.
3421N/A$locals=16*8;
3421N/A
# Register that holds the base address of that table.
3421N/A$tab="%l0";
3421N/A
# Two-element register pairs. The unrolled-round generator rotates both
# pairs after every round, so consecutive rounds alternate registers.
# @T receives the shifted pieces that are XORed into the result; @i holds
# a table byte offset and then the table entry loaded from that offset.
3421N/A@T=("%g2","%g3");
3421N/A@i=("%g4","%g5");
3421N/A
# $a1,$a2,$a4,$a8 (%o0..%o3) hold the masked multiples of operand a that
# seed the table; $a12/$a48 (%o4/%o5) first carry the bit masks and are
# then reused for the a1^a2 and a4^a8 combinations.
3421N/A($a1,$a2,$a4,$a8,$a12,$a48)=map("%o$_",(0..5));
# $lo/$hi accumulate the low/high 64 bits of the product, $b is the second
# operand. Two aliases are deliberate and rely on the instruction order of
# the generated code: $hi shares $a8's register (in the code below $a8 is
# last used when tab[8] is stored, before $hi is first written) and $a
# shares $lo's register %g1 (a is last read while a1/a2/a4 are derived,
# before $lo is first written).
3421N/A($lo,$hi,$b)=("%g1",$a8,"%o7"); $a=$lo;
3421N/A
# First part of the generated assembly: file preamble, the entry point of
# bn_GF2m_mul_2x2, the VIS3 fast path and the start of the software path.
#
# Entry: word 0 of OPENSSL_sparcv9cap_P is loaded and tested against
# SPARCV9_VIS3; when the bit is clear control branches to .Lsoftware.
#
# VIS3 path: the operands are assembled as (%o1<<32)|%o2 and (%o3<<32)|%o4
# and multiplied with xmulx/xmulxhi (emitted as raw .word encodings, see
# the inline comments). The 128-bit result is stored as four 32-bit words
# at %o0+0/4/8/12, lowest word first; the last store sits right after retl.
#
# Software path: after `save` reserves $locals extra bytes, a and b are
# assembled the same way from %i1:%i2 and %i3:%i4. The top three bits of a
# are masked out of a1/a2/a4 and handled separately: each is broadcast
# with srax, ANDed with b, and folded into lo/hi as b<<63 / b>>1,
# b<<62 / b>>2 and b<<61 / b>>3. tab[0..15] is filled with every XOR
# combination of a1, a2, a4 and a8 (a8 = a1<<3). The block ends with the
# lookups for the two lowest 4-bit nibbles of b and the offset computation
# for the third; table offsets are the nibble scaled by 8 bytes, hence the
# "-3" shift adjustments and the 0xf<<3 mask.
#
# Text between backticks is left in place here; it is evaluated by the
# substitution at the end of this script.
3421N/A$code.=<<___;
3421N/A#include <sparc_arch.h>
3421N/A#include <openssl/fipssyms.h>
3421N/A
3421N/A#ifdef __arch64__
3421N/A.register %g2,#scratch
3421N/A.register %g3,#scratch
3421N/A#endif
3421N/A
3421N/A#ifdef __PIC__
3421N/ASPARC_PIC_THUNK(%g1)
3421N/A#endif
3421N/A
3421N/A.globl bn_GF2m_mul_2x2
3421N/A.align 16
3421N/Abn_GF2m_mul_2x2:
3421N/A SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
3421N/A ld [%g1+0],%g1 ! OPENSSL_sparcv9cap_P[0]
3421N/A
3421N/A andcc %g1, SPARCV9_VIS3, %g0
3421N/A bz,pn %icc,.Lsoftware
3421N/A nop
3421N/A
3421N/A sllx %o1, 32, %o1
3421N/A sllx %o3, 32, %o3
3421N/A or %o2, %o1, %o1
3421N/A or %o4, %o3, %o3
3421N/A .word 0x95b262ab ! xmulx %o1, %o3, %o2
3421N/A .word 0x99b262cb ! xmulxhi %o1, %o3, %o4
3421N/A srlx %o2, 32, %o1 ! 13 cycles later
3421N/A st %o2, [%o0+0]
3421N/A st %o1, [%o0+4]
3421N/A srlx %o4, 32, %o3
3421N/A st %o4, [%o0+8]
3421N/A retl
3421N/A st %o3, [%o0+12]
3421N/A
3421N/A.align 16
3421N/A.Lsoftware:
3421N/A save %sp,-STACK_FRAME-$locals,%sp
3421N/A
3421N/A sllx %i1,32,$a
3421N/A mov -1,$a12
3421N/A sllx %i3,32,$b
3421N/A or %i2,$a,$a
3421N/A srlx $a12,1,$a48 ! 0x7fff...
3421N/A or %i4,$b,$b
3421N/A srlx $a12,2,$a12 ! 0x3fff...
3421N/A add %sp,STACK_BIAS+STACK_FRAME,$tab
3421N/A
3421N/A sllx $a,2,$a4
3421N/A mov $a,$a1
3421N/A sllx $a,1,$a2
3421N/A
3421N/A srax $a4,63,@i[1] ! broadcast 61st bit
3421N/A and $a48,$a4,$a4 ! (a<<2)&0x7fff...
3421N/A srlx $a48,2,$a48
3421N/A srax $a2,63,@i[0] ! broadcast 62nd bit
3421N/A and $a12,$a2,$a2 ! (a<<1)&0x3fff...
3421N/A srax $a1,63,$lo ! broadcast 63rd bit
3421N/A and $a48,$a1,$a1 ! (a<<0)&0x1fff...
3421N/A
3421N/A sllx $a1,3,$a8
3421N/A and $b,$lo,$lo
3421N/A and $b,@i[0],@i[0]
3421N/A and $b,@i[1],@i[1]
3421N/A
3421N/A stx %g0,[$tab+0*8] ! tab[0]=0
3421N/A xor $a1,$a2,$a12
3421N/A stx $a1,[$tab+1*8] ! tab[1]=a1
3421N/A stx $a2,[$tab+2*8] ! tab[2]=a2
3421N/A xor $a4,$a8,$a48
3421N/A stx $a12,[$tab+3*8] ! tab[3]=a1^a2
3421N/A xor $a4,$a1,$a1
3421N/A
3421N/A stx $a4,[$tab+4*8] ! tab[4]=a4
3421N/A xor $a4,$a2,$a2
3421N/A stx $a1,[$tab+5*8] ! tab[5]=a1^a4
3421N/A xor $a4,$a12,$a12
3421N/A stx $a2,[$tab+6*8] ! tab[6]=a2^a4
3421N/A xor $a48,$a1,$a1
3421N/A stx $a12,[$tab+7*8] ! tab[7]=a1^a2^a4
3421N/A xor $a48,$a2,$a2
3421N/A
3421N/A stx $a8,[$tab+8*8] ! tab[8]=a8
3421N/A xor $a48,$a12,$a12
3421N/A stx $a1,[$tab+9*8] ! tab[9]=a1^a8
3421N/A xor $a4,$a1,$a1
3421N/A stx $a2,[$tab+10*8] ! tab[10]=a2^a8
3421N/A xor $a4,$a2,$a2
3421N/A stx $a12,[$tab+11*8] ! tab[11]=a1^a2^a8
3421N/A
3421N/A xor $a4,$a12,$a12
3421N/A stx $a48,[$tab+12*8] ! tab[12]=a4^a8
3421N/A srlx $lo,1,$hi
3421N/A stx $a1,[$tab+13*8] ! tab[13]=a1^a4^a8
3421N/A sllx $lo,63,$lo
3421N/A stx $a2,[$tab+14*8] ! tab[14]=a2^a4^a8
3421N/A srlx @i[0],2,@T[0]
3421N/A stx $a12,[$tab+15*8] ! tab[15]=a1^a2^a4^a8
3421N/A
3421N/A sllx @i[0],62,$a1
3421N/A sllx $b,3,@i[0]
3421N/A srlx @i[1],3,@T[1]
3421N/A and @i[0],`0xf<<3`,@i[0]
3421N/A sllx @i[1],61,$a2
3421N/A ldx [$tab+@i[0]],@i[0]
3421N/A srlx $b,4-3,@i[1]
3421N/A xor @T[0],$hi,$hi
3421N/A and @i[1],`0xf<<3`,@i[1]
3421N/A xor $a1,$lo,$lo
3421N/A ldx [$tab+@i[1]],@i[1]
3421N/A xor @T[1],$hi,$hi
3421N/A
3421N/A xor @i[0],$lo,$lo
3421N/A srlx $b,8-3,@i[0]
3421N/A xor $a2,$lo,$lo
3421N/A and @i[0],`0xf<<3`,@i[0]
3421N/A___
# Emit 13 unrolled rounds (n = 1 .. 13). Round n folds the table entry
# already held in @i[1] into the result (shifted left by 4*n into lo and
# right by 64-4*n into hi), loads the entry whose offset is in @i[0], and
# computes the table offset for nibble n+2 of b.
#
# $n is intentionally a plain (non-lexical) variable: its value after the
# loop, 14, is used by the code emitted right after it. The `...`
# expressions are evaluated later by the substitution at the end of the
# script, so e.g. `($n+2)*4`-3 reaches the assembler as "<number>-3".
3421N/Afor($n=1;$n<14;$n++) {
3421N/A$code.=<<___;
3421N/A sllx @i[1],`$n*4`,@T[0]
3421N/A ldx [$tab+@i[0]],@i[0]
3421N/A srlx @i[1],`64-$n*4`,@T[1]
3421N/A xor @T[0],$lo,$lo
3421N/A srlx $b,`($n+2)*4`-3,@i[1]
3421N/A xor @T[1],$hi,$hi
3421N/A and @i[1],`0xf<<3`,@i[1]
3421N/A___
# Rotate both two-element register pairs so the next round consumes the
# entry just loaded and reuses the registers that were just freed.
3421N/A push(@i,shift(@i)); push(@T,shift(@T));
3421N/A}
# Tail of the software path. With $n left at 14 by the preceding loop,
# this folds the entry held in @i[1] shifted by 4*14 bits, loads the last
# table entry and folds it shifted by 4*15 bits. lo and hi are then
# stored as four 32-bit words at %i0+0/4/8/12 (lowest word first) and the
# function returns via ret/restore. The closing directives set the symbol
# type and size and embed the attribution string.
3421N/A$code.=<<___;
3421N/A sllx @i[1],`$n*4`,@T[0]
3421N/A ldx [$tab+@i[0]],@i[0]
3421N/A srlx @i[1],`64-$n*4`,@T[1]
3421N/A xor @T[0],$lo,$lo
3421N/A
3421N/A sllx @i[0],`($n+1)*4`,@T[0]
3421N/A xor @T[1],$hi,$hi
3421N/A srlx @i[0],`64-($n+1)*4`,@T[1]
3421N/A xor @T[0],$lo,$lo
3421N/A xor @T[1],$hi,$hi
3421N/A
3421N/A srlx $lo,32,%i1
3421N/A st $lo,[%i0+0]
3421N/A st %i1,[%i0+4]
3421N/A srlx $hi,32,%i2
3421N/A st $hi,[%i0+8]
3421N/A st %i2,[%i0+12]
3421N/A
3421N/A ret
3421N/A restore
3421N/A.type bn_GF2m_mul_2x2,#function
3421N/A.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
3421N/A.asciz "GF(2^m) Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
3421N/A.align 4
3421N/A___
3421N/A
# Post-process and emit the generated assembly.
#
# Every `...` span in $code is replaced by the result of evaluating its
# contents as a Perl expression (/e), for all occurrences (/g); this is
# what turns the shift counts and masks written between backticks into
# plain numbers.
3421N/A$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Output is buffered, so a write failure (full disk, closed pipe) may only
# surface when STDOUT is closed. Check both calls so that a truncated
# assembly file makes the build fail instead of being silently accepted.
3421N/Aprint $code or die "error writing STDOUT: $!";
3421N/Aclose STDOUT or die "error closing STDOUT: $!";
3421N/A