# Implementations of specific modes for SPARC Architecture 2011. There
# is a T4 dependency though: an ASI value that is not specified in the
# Architecture Manual. But as the SPARC universe is rather monocultural,
# we assume that a processor capable of executing crypto instructions
# can handle the ASI in question as well. This means that we ought to
# keep our eyes open when new processors emerge...
#
# As for the above-mentioned ASI: it is the so-called "block
# initializing store", which cancels the "read" in "read-update-write"
# on cache lines. This is a "cooperative" optimization, as it reduces
# overall pressure on the memory interface. The benefits can't be
# observed/quantified with usual benchmarks; on the contrary, you can
# notice that single-thread performance for parallelizable modes is
# ~1.5% worse for the largest block sizes [though a few percent better
# for not so long ones]. All of this is based on suggestions from
# David Miller.
}
# unified interface
# local variables
.align 32
cmp $len, 0
1:
cmp $len, 127
.L${bits}_cbc_enc_loop:
4:
.L${bits}_cbc_enc_abort:
.align 16
! and ~3x deterioration
.align 16
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}cbc_enc_blk:
.L${bits}_cbc_enc_blk_loop:
5:
membar #StoreLoad|#StoreStore
}
.align 32
cmp $len, 0
1:
cmp $len, 255
.L${bits}_cbc_dec_loop:
4:
.L${bits}_cbc_dec_abort:
.align 16
! and ~3x deterioration
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}_cbc_dec_loop2x:
4:
.align 16
! and ~3x deterioration
.align 16
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}cbc_dec_blk:
.L${bits}_cbc_dec_blk_loop2x:
5:
membar #StoreLoad|#StoreStore
}
# alg_ctr32_implement: the part of this routine visible here is assembler
# template text only: .align directives, a comparison of $len against 255,
# local labels whose names embed ${bits} (_ctr32_loop, _ctr32_loop2x,
# _ctr32_blk, _ctr32_blk_loop2x), numeric local labels, and a
# "membar #StoreLoad|#StoreStore".
# NOTE(review): the Perl statements that unpack the arguments and wrap this
# text are not visible in this view; presumably the text is appended to
# $::code, which emit_assembler later prints -- confirm against the full file.
sub alg_ctr32_implement {
.align 32
cmp $len, 255
.L${bits}_ctr32_loop:
4:
.align 16
! and ~3x deterioration
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}_ctr32_loop2x:
4:
.align 16
! and ~3x deterioration
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}_ctr32_blk:
.L${bits}_ctr32_blk_loop2x:
5:
membar #StoreLoad|#StoreStore
}
# alg_xts_implement: the part of this routine visible here is assembler
# template text only: .align directives, a comparison of $len against 255,
# numeric local labels (4, 5, 8) and a "membar #StoreLoad|#StoreStore".
# NOTE(review): the Perl statements that unpack the arguments and wrap this
# text are not visible in this view; presumably the text is appended to
# $::code, which emit_assembler later prints -- confirm against the full file.
sub alg_xts_implement {
.align 32
cmp $len, 255
4:
.align 16
! and ~3x deterioration
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
4:
.align 16
! and ~3x deterioration
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
5:
membar #StoreLoad|#StoreStore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.align 32
8:
}
# The purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to reserve the option to produce a "universal" binary and let
# the programmer detect whether the current CPU is VIS capable at run-time.
sub unvis {
# Tail of a mnemonic => numeric code table; the opening of the list is not
# visible in this view.
# NOTE(review): presumably these are the VIS "opf" field values -- confirm
# against the instruction set reference.
"bshuffle" => 0x04c,
"fnot2" => 0x066,
"fxor" => 0x06c,
"fsrc2" => 0x078 );
# Textual form of the instruction. It is returned unchanged by the final
# else branch and is also the string substituted after "!" in the .word line.
$ref = "$mnemonic\t$rs1,$rs2,$rd";
# $1 is capture group 1 of a match that is not visible in this view.
$_=$1;
if ($1>=32) {
# re-encode for upper double register addressing
# (bit 5 of the number is OR-ed into bit 0, then the value is cut to 5 bits)
$_=($1|$1>>5)&31;
}
}
# NOTE(review): the format below has two conversions (%08x and %s) but only
# $ref is visible as an argument; the expression for the instruction word
# appears to be missing from this view -- confirm against the full file.
return sprintf ".word\t0x%08x !%s",
$ref;
} else {
return $ref;
}
}
sub unvis3 {
# Tail of a mnemonic => numeric code table; the opening of the list is not
# visible in this view. The mnemonics listed here are among those matched
# by the last substitution in emit_assembler.
"addxccc" => 0x013,
"umulxhi" => 0x016,
"alignaddr" => 0x018,
"bmask" => 0x019,
"alignaddrl" => 0x01a );
# Textual form of the instruction. It is returned unchanged by the final
# else branch and is also the string substituted after "!" in the .word line.
$ref = "$mnemonic\t$rs1,$rs2,$rd";
}
# NOTE(review): the format below has two conversions (%08x and %s) but only
# $ref is visible as an argument; the expression for the instruction word
# appears to be missing from this view -- confirm against the full file.
return sprintf ".word\t0x%08x !%s",
$ref;
} else {
return $ref;
}
}
"aes_eround23" => 1,
"aes_dround01" => 2,
"aes_dround23" => 3,
"aes_eround01_l"=> 4,
"aes_eround23_l"=> 5,
"aes_dround01_l"=> 6,
"aes_dround23_l"=> 7,
"aes_kexpand1" => 8 );
$ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
$_=$1;
if ($1>=32) {
# re-encode for upper double register addressing
$_=($1|$1>>5)&31;
}
}
return sprintf ".word\t0x%08x !%s",
$ref;
} else {
return $ref;
}
}
"aes_kexpand2" => 0x131 );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
$_=$1;
if ($1>=32) {
# re-encode for upper double register addressing
$_=($1|$1>>5)&31;
}
}
return sprintf ".word\t0x%08x !%s",
$ref;
} else {
return $ref;
}
}
$ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
if (1) {
$_=$1;
if ($1>=32) {
# re-encode for upper double register addressing
$_=($1|$1>>5)&31;
}
}
return sprintf ".word\t0x%08x !%s",
$ref;
} else {
return $ref;
}
}
"camellia_fli" => 0x13d );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
$_=$1;
if ($1>=32) {
# re-encode for upper double register addressing
$_=($1|$1>>5)&31;
}
}
return sprintf ".word\t0x%08x !%s",
$ref;
} else {
return $ref;
}
}
"movstouw" => 0x111,
"movstosw" => 0x113,
"movxtod" => 0x118,
"movwtos" => 0x119 );
$ref = "$mnemonic\t$rs,$rd";
if ($2>=32) {
# re-encode for upper double register addressing
$_=($2|$2>>5)&31;
}
}
return sprintf ".word\t0x%08x !%s",
$ref;
} else {
return $ref;
}
}
sub undes {
# First argument is the mnemonic; all remaining arguments are kept in @args.
my ($mnemonic)=shift;
my @args=@_;
# "des_round" is handled as a special case, separately from the other
# mnemonics. The conditions guarding the second and third .word returns
# below are only partly visible (the last one is marked "2-arg").
if ($mnemonic eq "des_round") {
# $1 is capture group 1 of a match that is not visible in this view.
$_=$1;
if ($1>=32) {
# re-encode for upper double register addressing
# (bit 5 of the number is OR-ed into bit 0, then the value is cut to 5 bits)
$_=($1|$1>>5)&31;
}
}
# NOTE(review): here and in the two sprintf calls further down, the format
# has two conversions (%08x and %s) but only $ref is visible as an argument;
# the instruction-word expression appears to be missing from this view.
return sprintf ".word\t0x%08x !%s",
$ref;
# Same re-encoding as above, applied to capture group 2.
$_=$2;
if ($2>=32) {
# re-encode for upper double register addressing
$_=($2|$2>>5)&31;
}
}
return sprintf ".word\t0x%08x !%s",
$ref;
} else { # 2-arg
$_=$1;
if ($1>=32) {
# re-encode for upper double register addressing
$_=($1|$1>>5)&31;
}
}
return sprintf ".word\t0x%08x !%s",
$ref;
}
} else {
# Fallback: hand back the textual form in $ref unchanged.
return $ref;
}
}
# emit_assembler: post-process the text accumulated in $::code and print it
# to the currently selected output handle, one line at a time.
sub emit_assembler {
# Each line of $::code is processed in $_.
foreach (split("\n",$::code)) {
# Replace every `...` span with the result of eval-ing its contents as Perl.
s/\`([^\`]*)\`/eval $1/ge;
/geo or
/geo or
/geo or
/geo or
/geo or
/geo or
/geo or
/geo or
s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
/geo;
# NOTE(review): the lines above are the ends of a chain of substitutions
# joined with "or"; only the pattern of the last one is visible in this view
# (a mnemonic followed by three %g/%o/%l/%i registers), and none of the
# replacement expressions are. Presumably they call the un* encoder helpers
# in this file -- confirm against the full file.
print $_,"\n";
}
}
1;