# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
# ====================================================================
# Performance improvement is not really impressive on pre-T1 CPU: +8%
# over Sun C and +25% over gcc [3.3]. While on T1, a.k.a. Niagara, it
# turned out to be 40% faster than 64-bit code generated by Sun C 5.8
# and >2x faster than 64-bit code generated by gcc 3.4. And there is a
# gimmick. The X[16] vector is packed into 8 64-bit registers and as a
# result nothing is spilled to the stack. In addition, input data is
# loaded in a compact instruction sequence, thus minimizing the window
# when the code is subject to [inter-thread] cache-thrashing hazard.
# The goal is to ensure scalability on UltraSPARC T1, or rather to
# avoid decay when the number of active threads exceeds the number of
# physical cores.
# SPARC T4 SHA1 hardware achieves 3.72 cycles per byte, which is 3.1x
# faster than software. Multi-process benchmark saturates at 11x
# single-process result on 8-core processor, or ~9GBps per 2.85GHz
# socket.
# The first command-line argument names the output file.  When one is
# given, STDOUT is redirected to it; a failure to open it is fatal, so the
# generated assembly can never silently end up somewhere else.  The
# three-argument form of open takes the name literally (no mode characters
# are parsed out of it).  Without an argument, output stays on STDOUT.
$output=shift;
if (defined($output) && length($output)) {
    open(STDOUT,'>',$output) or die "can't open $output: $!";
}

# Register names substituted into the assembler templates.
# @X: the eight registers holding the packed X[16] vector (two 32-bit
# words per 64-bit register, see the header comment).
@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
$rot1m="%g2";
$tmp64="%g3";
$Xi="%g4";

# Working variables; @V is the order that is rotated after every round.
$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
@V=($A,$B,$C,$D,$E);

# Registers for the per-20-round constants.  The round generators refer to
# them as @K[$i/20], so they are also collected in @K in that order.
$K_00_19="%l5";
$K_20_39="%l6";
$K_40_59="%l7";
$K_60_79="%g5";
@K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);

# NOTE(review): presumably the context pointer, input pointer and block
# count -- confirm against the function prologue.
$ctx="%i0";
$inp="%i1";
$len="%i2";

# Scratch registers.
$tmp0="%i3";
$tmp1="%i4";
$tmp2="%i5";
# BODY_00_15($i,$a,$b,$c,$d,$e): takes an index $i and five register
# names; judging by its name it presumably generates one of rounds 0..15.
# NOTE(review): the bare instruction lines in this sub look like assembler
# template text using @K[$i/20], $b, $c, $e, $tmp0 and $tmp2 as operands;
# the statement that would append them to $code is not visible in this
# view -- confirm against the complete file.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e)=@_;
add @K[$i/20],$e,$e
and $c,$b,$tmp0
srl $b,2,$b
or $tmp2,$b,$b
# For odd $i below 15, append an srlx that shifts register
# @X[(($i+1)/2)%8] right by 32 bits into $Xi.
if ($i&1 && $i<15) {
$code.=
" srlx @X[(($i+1)/2)%8],32,$Xi\n";
}
}
# Xupdate($i,$a,$b,$c,$d,$e): helper called first thing by BODY_16_19,
# BODY_20_39 and BODY_40_59 with their own argument list.  $j=$i/2 picks
# the @X element (modulo 8) that the even-$i branch XORs the elements at
# $j+1 and $j+4 into; both branches reference @K[$i/20] and $e.
# NOTE(review): the bare instruction lines look like assembler template
# text; the statements that would append them to $code are not visible in
# this view -- confirm against the complete file.
sub Xupdate {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i/2;
if ($i&1) {
add @K[$i/20],$e,$e
} else {
xor @X[($j+1)%8],@X[$j%8],@X[$j%8]
xor @X[($j+4)%8],@X[$j%8],@X[$j%8]
add @K[$i/20],$e,$e !!
}
}
# BODY_16_19($i,$a,$b,$c,$d,$e): called from the top-level loop while
# $i<20.  Runs Xupdate with the same arguments, then, for even $i, appends
# an srlx that shifts register @X[($i/2)%8] right by 32 bits into $xi.
# NOTE(review): $xi is neither declared nor assigned anywhere in the
# visible part of this sub, so as shown it refers to a package variable
# and may interpolate as an empty string -- confirm against the complete
# file.  The bare instruction lines look like assembler template text
# whose enclosing statement is likewise not visible here.
sub BODY_16_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
&Xupdate(@_);
if ($i&1) {
} else {
$code.="\tsrlx @X[($i/2)%8],32,$xi\n";
}
and $c,$b,$tmp0
srl $b,2,$b
or $tmp2,$b,$b
}
# BODY_20_39($i,$a,$b,$c,$d,$e): called from the top-level loops both
# while $i<40 and again while $i<80.  Runs Xupdate with the same
# arguments, then, for even $i, appends an srlx that shifts register
# @X[($i/2)%8] right by 32 bits into $xi.  The visible instruction lines
# combine $c and $b with xor (the 16_19 and 40_59 variants start with and).
# NOTE(review): no assignment to the lexical $xi is visible, so as shown it
# is undefined when interpolated; the bare instruction lines look like
# assembler template text whose enclosing statement is not visible here --
# confirm both against the complete file.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;
&Xupdate(@_);
if ($i&1) {
} else {
$code.="\tsrlx @X[($i/2)%8],32,$xi\n";
}
xor $c,$b,$tmp0
srl $b,2,$b
or $tmp2,$b,$b
}
# BODY_40_59($i,$a,$b,$c,$d,$e): called from the top-level loop while
# $i<60.  Runs Xupdate with the same arguments, then, for even $i, appends
# an srlx that shifts register @X[($i/2)%8] right by 32 bits into $xi.
# The visible instruction lines compute both "and" and "or" of $c and $b
# (into $tmp0 and $tmp1 respectively).
# NOTE(review): no assignment to the lexical $xi is visible, so as shown it
# is undefined when interpolated; the bare instruction lines look like
# assembler template text whose enclosing statement is not visible here --
# confirm both against the complete file.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;
&Xupdate(@_);
if ($i&1) {
} else {
$code.="\tsrlx @X[($i/2)%8],32,$xi\n";
}
and $c,$b,$tmp0
or $c,$b,$tmp1
srl $b,2,$b
or $tmp2,$b,$b
}
# NOTE(review): the lines in this region that are not Perl statements
# (preprocessor directives, alignment directives, labels, instructions and
# the .asciz banner) look like assembler template text; the statements
# that would append them to $code are not visible in this view -- confirm
# against the complete file.
#include "sparc_arch.h"
#include <openssl/fipssyms.h>
#ifdef __arch64__
#endif
#ifdef __PIC__
#endif
.align 32
.Lhw_loop:
.align 8
.align 16
.Lloop:
# Seven iterations, each referencing $Xi and @X[$i].
for($i=0;$i<7;$i++)
or $Xi,@X[$i],@X[$i]
}
.Laligned:
# Generate the remaining rounds up to index 79.  Each call receives the
# round index and the current register order in @V; afterwards @V is
# rotated by one (its last element moves to the front) for the next round.
# $i is not reset here: every loop continues from where the previous one
# stopped.  Indices 60..79 reuse BODY_20_39.
for (;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
add $A,@X[0],$A
add $B,@X[1],$B
add $C,@X[2],$C
add $D,@X[3],$D
add $E,@X[4],$E
.asciz "SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
# The purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to reserve the option to produce a "universal" binary and let
# the programmer detect at run-time whether the current CPU is VIS capable.
# unvis: builds the textual instruction "$mnemonic\t$rs1,$rs2,$rd" in $ref
# and returns either a ".word 0x........ !<text>" line (the text kept as a
# trailing assembler comment) or, on the fallback path, the text itself.
# Register numbers of 32 and above are folded into five bits by
# ($1|$1>>5)&31.
# NOTE(review): much of this sub is not visible in this view -- the
# argument unpacking, the table whose last entry maps "for" to 0x07c, the
# pattern match that sets $1, the conditions guarding the two returns, and
# the value passed to sprintf's %08x -- confirm against the complete file.
sub unvis {
"for" => 0x07c );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
$_=$1;
if ($1>=32) {
# re-encode for upper double register addressing
$_=($1|$1>>5)&31;
}
}
return sprintf ".word\t0x%08x !%s",
$ref;
} else {
return $ref;
}
}
# unalignaddr: builds the textual instruction "$mnemonic\t$rs1,$rs2,$rd"
# in $ref and returns either that text unchanged (the else path) or a
# ".word 0x........ !<text>" line with the text kept as a trailing
# assembler comment.
# NOTE(review): the argument unpacking, the condition that the visible
# "else" belongs to, and the value passed to sprintf's %08x are not
# visible in this view -- confirm against the complete file.
sub unalignaddr {
my $ref="$mnemonic\t$rs1,$rs2,$rd";
else { return $ref; }
}
return sprintf ".word\t0x%08x !%s",
$ref;
}
# Post-process the generated text in $code one line at a time and print
# each resulting line, newline-terminated, to STDOUT.
#   - Text between backticks is replaced by the result of eval'ing it.
#   - The second substitution matches an f-prefixed mnemonic followed by
#     three %f register operands (one or two digits each).
# NOTE(review): the replacement expression of that second substitution,
# and the statement that the second "/ge;" line belongs to, are not
# visible in this view -- confirm against the complete file.
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
/ge;
/ge;
print $_,"\n";
}
# Flush and close the output.  Buffered write errors (for example a full
# disk) only surface at close time, so a failure here must abort the run
# instead of leaving a truncated file behind with a zero exit status.
close STDOUT or die "error closing STDOUT: $!";