# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
# ====================================================================
# SHA256 performance improvement over compiler generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] to 8 64-bit registers.
#
# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x than 32-bit code. X[16] resides on stack, but access to it
# is scheduled for L2 latency and staged through 32 least significant
# bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
# duality. Nevertheless it's ~40% faster than SHA256, which is pretty
# good [optimal coefficient is 50%].
#
# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running 4 times physical cores threads and that it leaves gcc
# [3.4] behind by over 4x factor! If compared to SHA256, single thread
# performance is only 10% better, but overall throughput for maximum
# amount of threads for given CPU exceeds corresponding one of SHA256
# by 30% [again, optimal coefficient is 50%].
#
# (*) Unlike pre-T1 UltraSPARC, loads on T1 are executed strictly
# in-order, i.e. a load instruction has to complete before the next
# instruction in the given thread is executed, even if the latter
# does not depend on the load result! This means that on T1 two 32-bit
# loads are always slower than one 64-bit load. Once again this
# is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
# 2x32-bit loads can be as fast as 1x64-bit ones.
#
# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
# which is 9.3x/11.1x faster than software. Multi-process benchmark
# saturates at 11.5x single-process result on 8-core processor, or
# ~11/16GBps per 2.85GHz socket.
# The first command-line argument names the file that receives the generated
# assembly; the same name is matched against /512/ further down to pick the
# SHA flavour.
$output=shift;
# Redirect STDOUT only when a real file name was supplied, so that running
# the script without an argument (or with "-") keeps writing to the inherited
# STDOUT.  The three-argument form stops characters in the name from being
# interpreted as part of the open mode, and a failed open aborts the run
# instead of silently sending the generated code somewhere else.
if (defined($output) && $output ne "" && $output ne "-") {
    open STDOUT,'>',$output or die "can't open $output: $!";
}
# Pick the SHA flavour from the output file name: a name containing "512"
# selects the SHA512 parameter set, anything else the SHA256 one.
# Each branch sets $label, $SZ, $lastK, $rounds and $align, names the eight
# state registers $A..$H and collects them, in order, in @V.
if ($output =~ /512/) {
    ($label,$SZ,$lastK,$rounds,$align)=("512",8,0x817,80,4);
    # state registers: %o0-%o5, %g1, %o7
    @V=($A,$B,$C,$D,$E,$F,$G,$H)=qw(%o0 %o1 %o2 %o3 %o4 %o5 %g1 %o7);
} else {
    ($label,$SZ,$lastK,$rounds,$align)=("256",4,0x8f2,64,8);
    # @X names the eight registers that hold the packed X[16] words
    @X=qw(%o0 %o1 %o2 %o3 %o4 %o5 %g1 %o7);
    # state registers: %l0-%l7
    @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%l$_",0..7);
}
# Register names shared by both flavours.
# Scratch registers %g2-%g5: $T1 plus three temporaries.
($T1,$tmp0,$tmp1,$tmp2)=map("%g$_",2..5);
# Input registers %i0-%i5: context, input pointer, length, K table pointer
# and two further temporaries.
($ctx,$inp,$len,$Ktbl,$tmp31,$tmp32)=map("%i$_",0..5);
########### SHA256
# $Xload, SHA256 flavour (the assignment only takes effect when $SZ==4).
# Appends to $code the instructions that add message word $i to $h and leave
# the sum in $T1.
# Arguments: round index $i followed by the eight state register names
# ($a..$h); of the registers only $h is referenced in the lines visible here.
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
# NOTE(review): the $i==0 branch below holds bare assembly text, a for(...)
# header without a body and an extra closing brace, so the block does not
# parse as Perl as it stands.  The template text that presumably surrounded
# these lines is not visible here -- confirm against the complete file.
if ($i==0) {
for($j=0;$j<7;$j++)
or $tmp1,@X[$j],@X[$j]
}
.Laligned:
}
# Every @X register serves two consecutive round indices (index $i/2, which
# Perl truncates to an integer): odd $i adds the register as is, even $i
# first shifts it right by 32 bits into $T1 and adds $h to that.
if ($i&1) {
$code.="\tadd @X[$i/2],$h,$T1\n";
} else {
$code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
}
} if ($SZ==4);
########### SHA512
# $Xload, SHA512 flavour (the assignment only takes effect when $SZ==8).
# Arguments: round index $i followed by the eight state register names.
# NOTE(review): everything after the argument unpacking is assembly template
# text rather than Perl; the wrapper that appended it to $code is not
# visible here, and @pair is not defined in the visible lines -- confirm
# against the complete file.
# The back-quoted fragments are Perl expressions meant to be expanded by the
# output loop at the end of the script, which evals back-quoted text: for
# $i<12 they yield two "ld" instructions reading from $inp+32+$i*8 and
# $inp+36+$i*8 into @pair[0] and @pair[1].
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
cmp $tmp31,0
`"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
`"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
} if ($SZ==8);
########### common
# BODY_00_15: generator for one round, shared by both flavours.
# Arguments: round index $i followed by the eight state register names in
# their current rotation ($a..$h).
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
# For the first sixteen rounds the flavour-specific $Xload emits the code
# that brings in the message word; for later rounds only "add $h,$T1,$T1"
# is emitted (presumably $T1 already holds the word prepared by
# $BODY_16_XX -- TODO confirm).
if ($i<16) {
&$Xload(@_);
} else {
$code.="\tadd $h,$T1,$T1\n";
}
# NOTE(review): the remaining lines are assembly template text, not Perl;
# the wrapper that appended them to $code is not visible here, and the
# repeated xor pairs look like what is left of a longer instruction
# sequence -- confirm against the complete file.
xor $f,$g,$tmp2
xor $tmp1,$h,$h
xor $tmp0,$h,$h
xor $tmp1,$h,$h
xor $tmp0,$h,$h
xor $tmp1,$h,$h
xor $tmp0,$h,$h
xor $tmp1,$h,$h
xor $tmp0,$h,$h
or $a,$b,$tmp0
and $a,$b,$tmp1
}
########### SHA256
# $BODY_16_XX, SHA256 flavour (the assignment only takes effect when $SZ==4):
# generator for the rounds that follow the first sixteen.
#
# Arguments: round index $i followed by the eight state register names; the
# whole list is handed on unchanged to BODY_00_15, whose result is returned.
#
# Before delegating, one srlx is appended to $code that shifts an @X register
# right by 32 bits into the scratch register $tmp32:
#   odd  $i : @X[(($i+1)/2)%8]
#   even $i : @X[(($i+14)/2)%8]
# Both index expressions are whole numbers for the parity they are used with.
# The destination has to be a real register name: interpolating an unset
# variable here would emit an srlx without a destination operand.
# NOTE(review): $tmp32 is used because it is the scratch register declared
# next to $tmp31 and not referenced anywhere else in the visible code --
# confirm against the complete generator.
$BODY_16_XX = sub {
    my $i=$_[0];

    if ($i&1) {
        $code.="\tsrlx @X[(($i+1)/2)%8],32,$tmp32\n";
    } else {
        $code.="\tsrlx @X[(($i+14)/2)%8],32,$tmp32\n";
    }

    &BODY_00_15(@_);
} if ($SZ==4);
########### SHA512
# $BODY_16_XX, SHA512 flavour (the assignment only takes effect when $SZ==8).
# Receives the round index followed by the eight state register names and
# passes the complete argument list straight on to BODY_00_15.
$BODY_16_XX = sub {
    my ($idx)=@_;       # round index, first argument
    return &BODY_00_15(@_);
} if ($SZ==8);
# NOTE(review): the lines from here down to .Lloop are pieces of the assembly
# template (C preprocessor directives, alignment directives, the K${label}
# table label and the loop labels), not Perl statements.  The wrapper that
# appended them to $code and the instructions between them are not visible
# here -- confirm against the complete file.
#include "sparc_arch.h"
#include <openssl/fipssyms.h>
#ifdef __arch64__
#endif
.align 64
K${label}:
# Flavour switch following the K${label} label ($SZ==4 is the SHA256
# configuration); both branches are empty in the visible code, presumably
# they emitted the flavour-specific table contents -- TODO confirm.
if ($SZ==4) {
} else {
}
#ifdef __PIC__
#endif
.align 32
.align 16
.Lhwloop:
.align 8
.align 16
.Lloop:
# Emit the .L16_xx label, then generate rounds up to and including 31 with
# the flavour-specific $BODY_16_XX.  After every round the last element of @V
# is moved to the front, so the register passed as $h in one round is passed
# as $a in the next.
# NOTE(review): the loop has no initialiser and $i is not set anywhere in the
# visible lines; presumably a preceding loop over the first sixteen rounds
# leaves it at 16 -- confirm against the complete file.
$code.=".L16_xx:\n";
for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# NOTE(review): the remaining lines are assembly template text (eight adds
# combining $A..$H with @X[0..7], the identification string and an alignment
# directive), not Perl; the wrapper that appended them to $code is not
# visible here.
add $A,@X[0],$A
add $B,@X[1],$B
add $C,@X[2],$C
add $D,@X[3],$D
add $E,@X[4],$E
add $F,@X[5],$F
add $G,@X[6],$G
add $H,@X[7],$H
.asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
# unvis: according to the comment block preceding this sub, its job is to
# encode a VIS instruction explicitly.  In the visible lines it builds $ref,
# the textual form "mnemonic<TAB>rs1,rs2,rd", and returns either a
# ".word 0x........ !<ref>" line (the original text kept as a trailing
# assembler comment) or $ref unchanged.
# NOTE(review): the body is fragmentary here -- the argument unpacking, the
# opening of the opcode table ending in "for" => 0x07c, the loop over the
# operands and the value passed to sprintf are not visible, and the braces do
# not balance.  Confirm against the complete file before changing anything.
sub unvis {
"for" => 0x07c );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
$_=$1;
if ($1>=32) {
# re-encode for upper double register addressing
# (for numbers 32..63 this moves bit 5 into bit 0 and keeps 5 bits)
$_=($1|$1>>5)&31;
}
}
return sprintf ".word\t0x%08x !%s",
$ref;
} else {
return $ref;
}
}
# unalignaddr: builds $ref, the textual form "mnemonic<TAB>rs1,rs2,rd", and
# returns either $ref unchanged or a ".word 0x........ !<ref>" line that keeps
# the original text as a trailing assembler comment.
# NOTE(review): the body is fragmentary here -- the argument unpacking, the
# condition that belongs to the "else", and the value passed to sprintf are
# not visible, and the braces do not balance.  Confirm against the complete
# file before changing anything.
sub unalignaddr {
my $ref="$mnemonic\t$rs1,$rs2,$rd";
else { return $ref; }
}
return sprintf ".word\t0x%08x !%s",
$ref;
}
# Post-process the accumulated $code line by line and print the result.
# NOTE(review): the replacement part of the second substitution is empty and
# it is followed by a stray "/ge;" line, so one substitution and both
# replacement expressions appear to be missing from the visible code
# (presumably the calls into unvis and unalignaddr -- TODO confirm).
foreach (split("\n",$code)) {
# Replace every back-quoted fragment by the result of eval'ing its contents.
s/\`([^\`]*)\`/eval $1/ge;
# Matches a mnemonic starting with "f" followed by three %f register operands.
s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
/ge;
/ge;
print $_,"\n";
}
# Flush and close the generated output.  Buffered write errors (for example
# a full disk) only surface at close time, so a failure here has to abort the
# script rather than leave a truncated assembly file behind unnoticed.
close STDOUT or die "error closing STDOUT: $!";