use strict;
#if !defined(lint) && !defined(__lint)
/
/
/
.
ident "%Z%%M% %I% %E% SMI"
/
/
/
/
/
# Workaround for a Solaris "gas" assembler bug where compiling the source
# errors out and does not generate a valid "lea" instruction. Specifically,
# lea OFFSET(REGISTER,%r10d),REGISTER
#
# For Solaris as, "as -a32" must be used to compile this.
# For Solaris gas 2.15, this errors out with this message:
# Error: `0xf57c0faf(%eax,%r10d)' is not a valid 64 bit base/index expression
# This should be fixed in Solaris gas 2.16.
# It assembles with the Linux "as --64" gas 2.17 assembler and runs OK.
#
# For the ONBLD NV tools, the aw wrapper script fails when -a32 is used:
# aw: as->gas mapping failed at or near arg '-a32'
#
# For more information, see CRs 6644870 and 6628627.
# Note2: Solaris "as" uses "/" for comments; Linux "as" uses "#" for comments.
{
# Appends to $code the assembler text that stands in for the instruction
#     lea $offset($register,%r10d),$register
# The mnemonic form is written only as a "/" assembler comment; the
# instruction itself is emitted as raw data (.byte / .long).
#
# Values read here (none of them is declared inside this block):
#   $offset   - emitted verbatim as the operand of the trailing ".long"
#   $register - selects the two register-specific bytes; only "%eax",
#               "%ebx", "%ecx" and "%edx" have a case arm
#   $comment  - appended after the commented-out lea line
#
# NOTE(review): this copy has no `sub` header, no argument unpacking and no
# `switch ($register) {` line ahead of the `case` arms (the "}" following
# the `else` arm has no visible opener).  Presumably those lines were lost;
# confirm against the upstream file.
#
# Failed "lea" instruction.
# This instruction errors out from the Solaris as assembler.
# It assembles with the Linux "as --64" assembler and runs OK.
$
code .=
" / lea $offset($register,%r10d),$register" . " $comment\n";
# Workaround #1 (not used)
# One workaround is to generate two "add" instructions that are
# functionally equivalent to "lea." The problem is this workaround
# is about 4.5% slower than a lea, so is not used.
#$code .= " add %r10d,$register\n";
#$code .= " add \&0x%0x,$offset\n";
# Workaround #2 (used)
# This workaround hand-generates hex machine code for lea.
$
code .=
" / Solaris as assembly bug CR 6628627 errors out for\n";
$
code .=
" / the above, so we specify the machine code in hex:\n";
# Three bytes that are the same for every register (presumably the
# address-size prefix, a REX prefix and the lea opcode -- verify against
# the instruction-set reference).
$
code .=
" .byte 0x67,0x42,0x8d / lea offset(reg,%r10d),reg\n";
# Two further bytes that depend on $register.
case "%eax" { $
code .=
" .byte 0x84,0x10 / reg=%eax\n"; }
case "%ebx" { $
code .=
" .byte 0x9c,0x13 / reg=%ebx\n"; }
case "%ecx" { $
code .=
" .byte 0x8c,0x11 / reg=%ecx\n"; }
case "%edx" { $
code .=
" .byte 0x94,0x12 / reg=%edx\n"; }
# Any other register does not stop generation: an "ERROR: ..." line is
# appended to $code in place of the two bytes.
else { $
code .=
"ERROR: unknown register $register\n"; }
}
# The offset itself follows the hand-coded bytes as a .long.
$
code .=
" .long $offset / offset\n";
}
# round1_step() does:
# dst = x + ((dst + F(x,y,z) + X[k] + T_i) <<< s)
# %r10d = X[k_next]
# %r11d = z' (copy of z for the next step)
# Each round1_step() takes about 5.3 clocks (9 instructions, 1.7 IPC)
{
# Appends the assembler text for one round-1 step to $code.
#
# NOTE(review): this copy has no `sub` header and no argument unpacking,
# so $pos, $x, $y, $z, $dst and $T_i are not declared in view.  The lines
# after the commented-out lea are bare assembler text rather than Perl
# statements; presumably they were the body of a here-document whose
# opener and terminator were lost.  Confirm against the upstream file.

# Only when $pos == -1: load X[0] into %r10d and copy %edx (z) into %r11d.
$
code .=
" mov 0*4(%rsi), %r10d /* (NEXT STEP) X[0] */\n" if ($
pos == -
1);
$
code .=
" mov %edx, %r11d /* (NEXT STEP) z' = %edx */\n" if ($
pos == -
1);
# Emitted on every step, whatever $pos is.
$
code .=
" xor $y, %r11d /* y ^ ... */\n";
# The mnemonic lea is kept only as a comment.
#lea $T_i($dst,%r10d),$dst /* Const + dst + ... */
and $x, %
r11d /* x & ... */
xor $z, %
r11d /* z ^ ... */
}
# round2_step() does:
# dst = x + ((dst + G(x,y,z) + X[k] + T_i) <<< s)
# %r10d = X[k_next]
# %r11d = z' (copy of z for the next step)
# %r12d = z' (copy of z for the next step)
# Each round2_step() takes about 5.4 clocks (11 instructions, 2.0 IPC)
{
# Appends the assembler text for one round-2 step to $code.
#
# NOTE(review): this copy has no `sub` header and no argument unpacking,
# so $pos, $x, $y, $dst and $T_i are not declared in view.  The lines after
# the commented-out lea are bare assembler text rather than Perl
# statements; presumably they were the body of a here-document whose
# opener and terminator were lost.  Confirm against the upstream file.

# Only when $pos == -1: load X[1] into %r10d and copy %edx (z) into both
# %r11d and %r12d.
$
code .=
" mov 1*4(%rsi), %r10d /* (NEXT STEP) X[1] */\n" if ($
pos == -
1);
$
code .=
" mov %edx, %r11d /* (NEXT STEP) z' = %edx */\n" if ($
pos == -
1);
$
code .=
" mov %edx, %r12d /* (NEXT STEP) z' = %edx */\n" if ($
pos == -
1);
# Emitted on every step: complement the z copy held in %r11d.
$
code .=
" not %r11d /* not z */\n";
# The mnemonic lea is kept only as a comment.
#lea $T_i($dst,%r10d),$dst /* Const + dst + ... */
and $x, %
r12d /* x & z */
and $y, %
r11d /* y & (
not z) */
or %
r11d, %
r12d /* (y & (
not z)) | (x & z) */
}
# round3_step() does:
# dst = x + ((dst + H(x,y,z) + X[k] + T_i) <<< s)
# %r10d = X[k_next]
# %r11d = y' (copy of y for the next step)
# Each round3_step() takes about 4.2 clocks (8 instructions, 1.9 IPC)
{
# Appends the assembler text for one round-3 step to $code.
#
# NOTE(review): this copy has no `sub` header and no argument unpacking,
# so $pos, $x, $z, $dst and $T_i are not declared in view.  The lines after
# the commented-out lea are bare assembler text rather than Perl
# statements; presumably they were the body of a here-document whose
# opener and terminator were lost.  Confirm against the upstream file.

# Only when $pos == -1: load X[5] into %r10d and copy %ecx (y) into %r11d.
$
code .=
" mov 5*4(%rsi), %r10d /* (NEXT STEP) X[5] */\n" if ($
pos == -
1);
$
code .=
" mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */\n" if ($
pos == -
1);
# The mnemonic lea is kept only as a comment.
#lea $T_i($dst,%r10d),$dst /* Const + dst + ... */
xor $z, %
r11d /* z ^ ... */
xor $x, %
r11d /* x ^ ... */
}
# round4_step() does:
# dst = x + ((dst + I(x,y,z) + X[k] + T_i) <<< s)
# %r10d = X[k_next]
# %r11d = not z' (copy of not z for the next step)
# Each round4_step() takes about 5.2 clocks (9 instructions, 1.7 IPC)
{
# Appends the assembler text for one round-4 step to $code.
#
# NOTE(review): this copy has no `sub` header and no argument unpacking,
# so $pos, $x, $y, $dst and $T_i are not declared in view.  The lines after
# the third `$code .=` are bare assembler text rather than Perl statements;
# presumably they were the body of a here-document whose opener and
# terminator were lost.  Confirm against the upstream file.

# Only when $pos == -1: load X[0] into %r10d and set %r11d to all ones.
$
code .=
" mov 0*4(%rsi), %r10d /* (NEXT STEP) X[0] */\n" if ($
pos == -
1);
$
code .=
" mov \$0xffffffff, %r11d\n" if ($
pos == -
1);
# NOTE(review): unlike the two statements above, the one below has no
# `if ($pos == -1)` guard and no terminating ";", and the "#lea" comment
# sits on the same line as the string.  That looks like extraction damage
# rather than intent -- confirm against the upstream file.
$
code .=
" xor %edx, %r11d /* (NEXT STEP) not z' = not %edx*/\n" #lea $T_i($dst,%r10d),$dst /* Const + dst + ... */
or $x, %
r11d /* x | ... */
xor $y, %
r11d /* y ^ ... */
}
#
# Execution begins here.
#
# Redirect STDOUT to the file named by $output, aborting with the system
# error text if it cannot be opened for writing.
#
# The three-argument form of open() is used so that $output is always
# taken as a literal file name.  With the two-argument form, characters in
# $output (a leading "&" or ">", surrounding whitespace, ...) would be
# interpreted as part of the open mode.
open(STDOUT, '>', $output) or die "can't open $output: $!";
# rdi = arg #1 (ctx, MD5_CTX pointer)
# rsi = arg #2 (ptr, data pointer)
# rdx = arg #3 (nbr, number of 64-byte blocks to process)
# Generate all 64 MD5 steps: four rounds of sixteen steps, in order.
#
# Each row is one step's argument list and is handed unchanged to the
# round's generator.  The first element is -1 on the first step of a
# round, 1 on the last step and 0 in between; the next four are the
# working registers, rotated by one position from step to step; the last
# three are a message-word index, the step constant and the rotate count.
# NOTE(review): the generators' argument unpacking is not visible here;
# presumably the order is (pos, dst, x, y, z, k_next, T_i, s) -- confirm.

# Round 1
for my $step (
    [ -1, qw(%eax %ebx %ecx %edx 1  0xd76aa478 7)  ],
    [  0, qw(%edx %eax %ebx %ecx 2  0xe8c7b756 12) ],
    [  0, qw(%ecx %edx %eax %ebx 3  0x242070db 17) ],
    [  0, qw(%ebx %ecx %edx %eax 4  0xc1bdceee 22) ],
    [  0, qw(%eax %ebx %ecx %edx 5  0xf57c0faf 7)  ],
    [  0, qw(%edx %eax %ebx %ecx 6  0x4787c62a 12) ],
    [  0, qw(%ecx %edx %eax %ebx 7  0xa8304613 17) ],
    [  0, qw(%ebx %ecx %edx %eax 8  0xfd469501 22) ],
    [  0, qw(%eax %ebx %ecx %edx 9  0x698098d8 7)  ],
    [  0, qw(%edx %eax %ebx %ecx 10 0x8b44f7af 12) ],
    [  0, qw(%ecx %edx %eax %ebx 11 0xffff5bb1 17) ],
    [  0, qw(%ebx %ecx %edx %eax 12 0x895cd7be 22) ],
    [  0, qw(%eax %ebx %ecx %edx 13 0x6b901122 7)  ],
    [  0, qw(%edx %eax %ebx %ecx 14 0xfd987193 12) ],
    [  0, qw(%ecx %edx %eax %ebx 15 0xa679438e 17) ],
    [  1, qw(%ebx %ecx %edx %eax 0  0x49b40821 22) ],
) {
    round1_step(@{$step});
}

# Round 2
for my $step (
    [ -1, qw(%eax %ebx %ecx %edx 6  0xf61e2562 5)  ],
    [  0, qw(%edx %eax %ebx %ecx 11 0xc040b340 9)  ],
    [  0, qw(%ecx %edx %eax %ebx 0  0x265e5a51 14) ],
    [  0, qw(%ebx %ecx %edx %eax 5  0xe9b6c7aa 20) ],
    [  0, qw(%eax %ebx %ecx %edx 10 0xd62f105d 5)  ],
    [  0, qw(%edx %eax %ebx %ecx 15 0x2441453  9)  ],
    [  0, qw(%ecx %edx %eax %ebx 4  0xd8a1e681 14) ],
    [  0, qw(%ebx %ecx %edx %eax 9  0xe7d3fbc8 20) ],
    [  0, qw(%eax %ebx %ecx %edx 14 0x21e1cde6 5)  ],
    [  0, qw(%edx %eax %ebx %ecx 3  0xc33707d6 9)  ],
    [  0, qw(%ecx %edx %eax %ebx 8  0xf4d50d87 14) ],
    [  0, qw(%ebx %ecx %edx %eax 13 0x455a14ed 20) ],
    [  0, qw(%eax %ebx %ecx %edx 2  0xa9e3e905 5)  ],
    [  0, qw(%edx %eax %ebx %ecx 7  0xfcefa3f8 9)  ],
    [  0, qw(%ecx %edx %eax %ebx 12 0x676f02d9 14) ],
    [  1, qw(%ebx %ecx %edx %eax 0  0x8d2a4c8a 20) ],
) {
    round2_step(@{$step});
}

# Round 3
for my $step (
    [ -1, qw(%eax %ebx %ecx %edx 8  0xfffa3942 4)  ],
    [  0, qw(%edx %eax %ebx %ecx 11 0x8771f681 11) ],
    [  0, qw(%ecx %edx %eax %ebx 14 0x6d9d6122 16) ],
    [  0, qw(%ebx %ecx %edx %eax 1  0xfde5380c 23) ],
    [  0, qw(%eax %ebx %ecx %edx 4  0xa4beea44 4)  ],
    [  0, qw(%edx %eax %ebx %ecx 7  0x4bdecfa9 11) ],
    [  0, qw(%ecx %edx %eax %ebx 10 0xf6bb4b60 16) ],
    [  0, qw(%ebx %ecx %edx %eax 13 0xbebfbc70 23) ],
    [  0, qw(%eax %ebx %ecx %edx 0  0x289b7ec6 4)  ],
    [  0, qw(%edx %eax %ebx %ecx 3  0xeaa127fa 11) ],
    [  0, qw(%ecx %edx %eax %ebx 6  0xd4ef3085 16) ],
    [  0, qw(%ebx %ecx %edx %eax 9  0x4881d05  23) ],
    [  0, qw(%eax %ebx %ecx %edx 12 0xd9d4d039 4)  ],
    [  0, qw(%edx %eax %ebx %ecx 15 0xe6db99e5 11) ],
    [  0, qw(%ecx %edx %eax %ebx 2  0x1fa27cf8 16) ],
    [  1, qw(%ebx %ecx %edx %eax 0  0xc4ac5665 23) ],
) {
    round3_step(@{$step});
}

# Round 4
for my $step (
    [ -1, qw(%eax %ebx %ecx %edx 7  0xf4292244 6)  ],
    [  0, qw(%edx %eax %ebx %ecx 14 0x432aff97 10) ],
    [  0, qw(%ecx %edx %eax %ebx 5  0xab9423a7 15) ],
    [  0, qw(%ebx %ecx %edx %eax 12 0xfc93a039 21) ],
    [  0, qw(%eax %ebx %ecx %edx 3  0x655b59c3 6)  ],
    [  0, qw(%edx %eax %ebx %ecx 10 0x8f0ccc92 10) ],
    [  0, qw(%ecx %edx %eax %ebx 1  0xffeff47d 15) ],
    [  0, qw(%ebx %ecx %edx %eax 8  0x85845dd1 21) ],
    [  0, qw(%eax %ebx %ecx %edx 15 0x6fa87e4f 6)  ],
    [  0, qw(%edx %eax %ebx %ecx 6  0xfe2ce6e0 10) ],
    [  0, qw(%ecx %edx %eax %ebx 13 0xa3014314 15) ],
    [  0, qw(%ebx %ecx %edx %eax 4  0x4e0811a1 21) ],
    [  0, qw(%eax %ebx %ecx %edx 11 0xf7537e82 6)  ],
    [  0, qw(%edx %eax %ebx %ecx 2  0xbd3af235 10) ],
    [  0, qw(%ecx %edx %eax %ebx 9  0x2ad7d2bb 15) ],
    [  1, qw(%ebx %ecx %edx %eax 0  0xeb86d391 21) ],
) {
    round4_step(@{$step});
}
#else
#endif /* !lint && !__lint */