#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 - >40%. No magical
# tricks, just straight implementation... I really wonder why gcc
# [being armed with inline assembler] fails to generate as fast code.
# The only thing which is cool about this module is that it's very
# same instruction sequence used for both SHA-256 and SHA-512. In
# former case the instructions operate on 32-bit operands, while in
# latter - on 64-bit ones. All I had to do is to get one flavor right,
# the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to IA-64 implementation, which maintains
# X[16] in register bank[!], tends to 4 instructions per CPU clock
# cycle and runs in 1003 cycles, 1275 is very good result for 3-way
# issue Opteron pipeline and X[16] maintained in memory. So that *if*
# there is a way to improve it, *then* the only way would be to try to
# offload X[16] updates to SSE unit, but that would require "deeper"
# loop unroll, which in turn would naturally cause size blow-up, not
# to mention increased complexity! And once again, only *if* it's
# actually possible to noticeably improve overall ILP, instruction
# level parallelism, on a given CPU implementation in this case.
#
# Special note on Intel EM64T. While Opteron CPU exhibits perfect
# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
# [currently available] EM64T CPUs apparently are far from it. On the
# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
# sha256_block:-( This is presumably because 64-bit shifts/rotates
# apparently are not atomic instructions, but implemented in microcode.
#
# OpenSolaris OS modifications
#
# Sun elects to use this software under the BSD license.
#
# This source originates from OpenSSL file sha512-x86_64.pl
# (presumably for future OpenSSL release 0.9.8h), with these changes:
#
# 1. Added perl "use strict" and declared variables.
#
# 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
# /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
#
# 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
# assemblers). Replaced the .picmeup macro with assembler code.
#
# 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype",
# at the beginning of SHA2_CTX (the next field is 8-byte aligned).
#
use strict;
# Name of the assembler file to generate.  The name also selects the flavour:
# a name matching /512/ produces the SHA-512 routine, anything else SHA-256.
my $output = shift;
# Refuse to run without a file name: three-argument open with an undefined
# name would quietly open an anonymous temporary file instead of failing.
defined $output or die "usage: $0 output-file\n";
# Everything this script generates is printed to STDOUT, so point STDOUT at
# the requested file.  The three-argument form keeps characters in the name
# from being read as part of the open mode, and the check makes a failure
# visible instead of silently discarding the generated code.
open STDOUT, '>', $output or die "$0: cannot open $output for writing: $!\n";
#
# OpenSSL library:
# void sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num);
# void sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num);
#
# OpenSolaris OS:
# void SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num);
# void SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num);
# Note: the OpenSolaris SHA2_CTX structure begins with an extra 4-byte field
# that, with alignment padding, occupies 8 bytes (compared with OpenSSL's
# SHA512_CTX or SHA256_CTX structure).
#
# Select the flavour to generate from the output file name: a name containing
# "512" yields the SHA-512 routine, any other name the SHA-256 routine.  Both
# flavours share the same instruction sequence; only these settings differ.
if ($output =~ /512/) {
$func="SHA512TransformBlocks";	# OpenSolaris routine name for this flavour
$TABLE="K512";			# label of the table emitted near the end
$SZ=8;				# operand size in bytes (64-bit operands)
"%r8", "%r9", "%r10","%r11");
$rounds=80;			# round count for this flavour
} else {
$func="SHA256TransformBlocks";	# OpenSolaris routine name for this flavour
$TABLE="K256";			# label of the table emitted near the end
$SZ=4;				# operand size in bytes (32-bit operands)
"%r8d","%r9d","%r10d","%r11d");
$rounds=64;			# round count for this flavour
}
# Operand strings used by the generated code.  The offsets are kept as
# unevaluated expressions in text form; nothing is computed here.
#
#   $Tbl                       the %rbp register (by its name, the table
#                              pointer -- its use is outside this block)
#   $_ctx $_inp $_end $_rsp    four consecutive 8-byte stack slots that start
#                              16*$SZ bytes above %rsp (the area below them is
#                              presumably the X[16] array the header mentions)
#   $framesz                   size of that 16*$SZ area plus the four slots
$Tbl = "%rbp";
($_ctx, $_inp, $_end, $_rsp) = map { "16*$SZ+$_*8(%rsp)" } 0 .. 3;
$framesz = "16*$SZ+4*8";
# ROUND_00_15($i, $a, $b, $c, $d, $e, $f, $g, $h)
#
# Code-generation helper for one round.  $i is the round index; $a..$h are
# the operands for the eight working variables, which the callers take from
# @ROT.  The assembler lines in the body carry their own comments: they form
# Ch(e,f,g) as ((f^g)&e)^g, accumulate Sigma0(a) into $h, and combine a, b
# and c through (a|c)&b and a&c.
#
# The empty prototype "()" is never applied, because every caller uses the
# "&ROUND_00_15(...)" form, which bypasses prototype checking.
sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
xor $g,$a2 # f^g
and $e,$a2 # (f^g)&e
xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
mov $a,$h
xor $a0,$h
or $c,$a1 # a|c
xor $a0,$h # h=Sigma0(a)
and $c,$a2 # a&c
and $b,$a1 # (a|c)&b
}
# ROUND_16_XX($i, $a, $b, $c, $d, $e, $f, $g, $h)
#
# Round helper for round indices 16 and above.  It takes the same arguments
# as ROUND_00_15 and passes its argument list on to that routine unchanged.
#
# As with ROUND_00_15, the empty prototype "()" is never applied because the
# routine is always called in the "&ROUND_16_XX(...)" form.
sub ROUND_16_XX()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
&ROUND_00_15(@_);
}
#
# Execution begins here
#
# What follows is the assembler source template for the routine.  It opens
# with a lint(1B) stub under "#if defined(lint) || defined(__lint)".  The real
# code is in the "#else" branch, which includes <sys/asm_linkage.h> and pushes
# %rbx, %rbp and %r12-%r15 ahead of the .Lloop label.  The matching pops, in
# reverse order, come after the round loops.
#
#if defined(lint) || defined(__lint)
/* ARGSUSED */
{
}
#else
#include <sys/asm_linkage.h>
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
.Llea:
.align 16
.Lloop:
# Rounds 0..15: for each round append a load of word $i from $inp into $T1
# and a byte swap of $T1, then generate the round itself.  The "&" call form
# is kept on purpose so the empty prototype of the round helpers is bypassed.
$i = 0;
while ($i < 16) {
    $code .= " mov $SZ*$i($inp),$T1\n"
           . " bswap $T1\n";
    &ROUND_00_15($i, @ROT);
    $i++;
}
.align 16
# Rounds 16..31: $i is not reset here; it continues from the value (16) that
# the loop above left behind.
while ($i < 32) {
    &ROUND_16_XX($i, @ROT);
    $i++;
}
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
# Choose the table that goes with the selected flavour: $SZ==4 is the SHA-256
# case, anything else the SHA-512 case.  Either way the table is aligned to
# 64 bytes and labelled $TABLE.
if ($SZ==4) {
# SHA256
.align 64
$TABLE:
} else {
# SHA512
.align 64
$TABLE:
}
#endif /* !lint && !__lint */
# Write out the accumulated assembler text.  STDOUT is buffered, so a failed
# write (for example a full file system) may only be reported when the handle
# is closed.  Check both calls so a truncated output file is never mistaken
# for a successful run.
print($code) or die "$0: error writing output: $!\n";
close STDOUT or die "$0: error closing output: $!\n";