55553f719b521a0bb4deab6efc944cd30c1a56aada# ====================================================================
55553f719b521a0bb4deab6efc944cd30c1a56aada# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
55553f719b521a0bb4deab6efc944cd30c1a56aada# project. Rights for redistribution and usage in source and binary
55553f719b521a0bb4deab6efc944cd30c1a56aada# forms are granted according to the OpenSSL license.
55553f719b521a0bb4deab6efc944cd30c1a56aada# ====================================================================
55553f719b521a0bb4deab6efc944cd30c1a56aada# sha256/512_block procedure for x86_64.
55553f719b521a0bb4deab6efc944cd30c1a56aada# 40% improvement over compiler-generated code on Opteron. On EM64T
55553f719b521a0bb4deab6efc944cd30c1a56aada# sha256 was observed to run >80% faster and sha512 - >40%. No magical
55553f719b521a0bb4deab6efc944cd30c1a56aada# tricks, just straight implementation... I really wonder why gcc
55553f719b521a0bb4deab6efc944cd30c1a56aada# [being armed with inline assembler] fails to generate as fast code.
55553f719b521a0bb4deab6efc944cd30c1a56aada# The only thing which is cool about this module is that it's the very
55553f719b521a0bb4deab6efc944cd30c1a56aada# same instruction sequence used for both SHA-256 and SHA-512. In the
55553f719b521a0bb4deab6efc944cd30c1a56aada# former case the instructions operate on 32-bit operands, while in the
55553f719b521a0bb4deab6efc944cd30c1a56aada# latter - on 64-bit ones. All I had to do is to get one flavor right,
55553f719b521a0bb4deab6efc944cd30c1a56aada# the other one passed the test right away:-)
55553f719b521a0bb4deab6efc944cd30c1a56aada# sha256_block runs in ~1005 cycles on Opteron, which gives you
55553f719b521a0bb4deab6efc944cd30c1a56aada# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
55553f719b521a0bb4deab6efc944cd30c1a56aada# frequency in GHz. sha512_block runs in ~1275 cycles, which results
55553f719b521a0bb4deab6efc944cd30c1a56aada# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
55553f719b521a0bb4deab6efc944cd30c1a56aada# Well, if you compare it to IA-64 implementation, which maintains
55553f719b521a0bb4deab6efc944cd30c1a56aada# X[16] in register bank[!], tends to 4 instructions per CPU clock
55553f719b521a0bb4deab6efc944cd30c1a56aada# cycle and runs in 1003 cycles, 1275 is a very good result for 3-way
55553f719b521a0bb4deab6efc944cd30c1a56aada# issue Opteron pipeline and X[16] maintained in memory. So that *if*
55553f719b521a0bb4deab6efc944cd30c1a56aada# there is a way to improve it, *then* the only way would be to try to
55553f719b521a0bb4deab6efc944cd30c1a56aada# offload X[16] updates to SSE unit, but that would require "deeper"
55553f719b521a0bb4deab6efc944cd30c1a56aada# loop unroll, which in turn would naturally cause size blow-up, not
55553f719b521a0bb4deab6efc944cd30c1a56aada# to mention increased complexity! And once again, only *if* it's
55553f719b521a0bb4deab6efc944cd30c1a56aada# actually possible to noticeably improve overall ILP, instruction
55553f719b521a0bb4deab6efc944cd30c1a56aada# level parallelism, on a given CPU implementation in this case.
55553f719b521a0bb4deab6efc944cd30c1a56aada# Special note on Intel EM64T. While Opteron CPU exhibits perfect
55553f719b521a0bb4deab6efc944cd30c1a56aada# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
55553f719b521a0bb4deab6efc944cd30c1a56aada# [currently available] EM64T CPUs apparently are far from it. On the
55553f719b521a0bb4deab6efc944cd30c1a56aada# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
55553f719b521a0bb4deab6efc944cd30c1a56aada# sha256_block:-( This is presumably because 64-bit shifts/rotates
55553f719b521a0bb4deab6efc944cd30c1a56aada# apparently are not atomic instructions, but implemented in microcode.
55553f719b521a0bb4deab6efc944cd30c1a56aada# OpenSolaris OS modifications
55553f719b521a0bb4deab6efc944cd30c1a56aada# Sun elects to use this software under the BSD license.
55553f719b521a0bb4deab6efc944cd30c1a56aada# This source originates from OpenSSL file sha512-x86_64.pl at
55553f719b521a0bb4deab6efc944cd30c1a56aada# ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
55553f719b521a0bb4deab6efc944cd30c1a56aada# (presumably for future OpenSSL release 0.9.8h), with these changes:
55553f719b521a0bb4deab6efc944cd30c1a56aada# 1. Added perl "use strict" and declared variables.
55553f719b521a0bb4deab6efc944cd30c1a56aada# 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
55553f719b521a0bb4deab6efc944cd30c1a56aada# /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
55553f719b521a0bb4deab6efc944cd30c1a56aada# 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
55553f719b521a0bb4deab6efc944cd30c1a56aada# assemblers). Replaced the .picmeup macro with assembler code.
55553f719b521a0bb4deab6efc944cd30c1a56aada# 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype",
55553f719b521a0bb4deab6efc944cd30c1a56aada# at the beginning of SHA2_CTX (the next field is 8-byte aligned).
55553f719b521a0bb4deab6efc944cd30c1a56aadause strict;
55553f719b521a0bb4deab6efc944cd30c1a56aadamy ($code, $func, $TABLE, $SZ, @Sigma0, @Sigma1, @sigma0, @sigma1, $rounds,
55553f719b521a0bb4deab6efc944cd30c1a56aada @ROT, $A, $B, $C, $D, $E, $F, $G, $H, $T1, $a0, $a1, $a2, $i,
55553f719b521a0bb4deab6efc944cd30c1a56aada $ctx, $round, $inp, $Tbl, $_ctx, $_inp, $_end, $_rsp, $framesz);
55553f719b521a0bb4deab6efc944cd30c1a56aadamy $output = shift;
55553f719b521a0bb4deab6efc944cd30c1a56aada# OpenSSL library:
55553f719b521a0bb4deab6efc944cd30c1a56aada# void sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num);
55553f719b521a0bb4deab6efc944cd30c1a56aada# void sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num);
55553f719b521a0bb4deab6efc944cd30c1a56aada# OpenSolaris OS:
55553f719b521a0bb4deab6efc944cd30c1a56aada# void SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num);
55553f719b521a0bb4deab6efc944cd30c1a56aada# void SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num);
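55553f719b521a0bb4deab6efc944cd30c1a56aada# Note: the OpenSolaris SHA2 structure has an extra 8 bytes at the
55553f719b521a0bb4deab6efc944cd30c1a56aada# beginning (the 4-byte "algotype" field, padded so the next field is
55553f719b521a0bb4deab6efc944cd30c1a56aada# 8-byte aligned) compared with OpenSSL's SHA512 or SHA256 structure.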
55553f719b521a0bb4deab6efc944cd30c1a56aada @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
55553f719b521a0bb4deab6efc944cd30c1a56aada @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
55553f719b521a0bb4deab6efc944cd30c1a56aada{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
55553f719b521a0bb4deab6efc944cd30c1a56aada{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
55553f719b521a0bb4deab6efc944cd30c1a56aada# Execution begins here
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson#if defined(lint) || defined(__lint)
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson$func(SHA2_CTX *ctx, const void *in, size_t num)
55553f719b521a0bb4deab6efc944cd30c1a56aada / The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts
55553f719b521a0bb4deab6efc944cd30c1a56aada / the address of the "next" instruction into the target register
55553f719b521a0bb4deab6efc944cd30c1a56aada /nop / .picmeup generates a nop for mod 8 alignment--not needed here
55553f719b521a0bb4deab6efc944cd30c1a56aada for(;$i<32;$i++) {
55553f719b521a0bb4deab6efc944cd30c1a56aada#endif /* !lint && !__lint */