/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include "../arcfour.h"
/* Initialize the key schedule 'key' using the key value */
void
arcfour_key_init(ARCFour_key *key, uchar_t *keyval, int keyvallen)
{
uchar_t ext_keyval[256];
uchar_t tmp;
int i, j;
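/*
* Extend the key: repeat 'keyval' cyclically until it fills all
* 256 bytes of ext_keyval.
*/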
for (i = j = 0; i < 256; i++, j++) {
if (j == keyvallen)
j = 0;
ext_keyval[i] = keyval[j];
}
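/*
* Standard RC4 key schedule: start from the identity permutation
* and swap elements as directed by the extended key.
*/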
for (i = 0; i < 256; i++)
key->arr[i] = (uchar_t)i;
j = 0;
for (i = 0; i < 256; i++) {
j = (j + key->arr[i] + ext_keyval[i]) % 256;
tmp = key->arr[i];
key->arr[i] = key->arr[j];
key->arr[j] = tmp;
}
key->i = 0;
key->j = 0;
}
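/*
* Example usage (a minimal sketch; the key and buffer names below are
* illustrative only):
*
* ARCFour_key key;
* uchar_t keyval[16], buf[64];
*
* arcfour_key_init(&key, keyval, sizeof (keyval));
* arcfour_crypt(&key, buf, buf, sizeof (buf));  -- in-place is allowed
*
* Decryption uses the same calls, since the cipher simply XORs the
* keystream with the data.
*/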
/*
* Encipher 'in' using 'key'.
* 'in' and 'out' can point to the same location.
*/
void
arcfour_crypt(ARCFour_key *key, uchar_t *in, uchar_t *out, size_t len)
{
size_t ii;
unsigned long long in0, merge = 0, merge0 = 0, merge1, mask = 0;
uchar_t i, j, *base, jj, *base1, tmp;
unsigned int tmp0, tmp1, i_accum, shift = 0, i1;
int index;
base = key->arr;
index = (((uintptr_t)in) & 0x7);
/* Process single bytes until 'in' is brought up to an 8-byte alignment */
if (index > 0) {
i = key->i;
j = key->j;
for (index = 8 - index; (index-- > 0) && len > 0;
len--, in++, out++) {
i = i + 1;
j = j + key->arr[i];
tmp = key->arr[i];
key->arr[i] = key->arr[j];
key->arr[j] = tmp;
tmp = key->arr[i] + key->arr[j];
*out = *in ^ key->arr[tmp];
}
key->i = i;
key->j = j;
}
if (len == 0)
return;
/*
* See if we're fortunate and 'out' got aligned as well.
*
* The Niagara (sun4v) optimized version below is used when both the
* input and output buffers are aligned on an 8-byte boundary.
*/
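/*
* On non-sun4v builds only the generic byte-at-a-time loop below is
* compiled in; on sun4v it is also used whenever 'out' is not 8-byte
* aligned.
*/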
#ifdef sun4v
if ((((uintptr_t)out) & 7) != 0) {
#endif /* sun4v */
i = key->i;
j = key->j;
for (ii = 0; ii < len; ii++) {
i = i + 1;
tmp0 = base[i];
j = j + tmp0;
tmp1 = base[j];
base[i] = (uchar_t)tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
out[ii] = in[ii] ^ base[tmp0];
}
key->i = i;
key->j = j;
#ifdef sun4v
} else {
i = key->i;
j = key->j;
/*
* Want to align base[i] on a 2B boundary -- allows updates
* via [i] to be performed in 2B chunks (reducing # of stores).
* Requires appropriate alias detection.
*/
if (((i+1) % 2) != 0) {
i = i + 1;
tmp0 = base[i];
j = j + tmp0;
tmp1 = base[j];
base[i] = (uchar_t)tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
merge0 = (unsigned long long)(base[tmp0]) << 56;
shift = 8; mask = 0xff;
}
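/*
* One keystream byte was just produced ahead of the main loop;
* shift/mask arrange for it to be carried into the first 8-byte
* store below, leaving a 1-byte overrun to flush after the loop.
*/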
/*
* Note - 'in' and 'out' may now be misaligned, so updating [out] in
* 8-byte chunks needs to handle this possibility. There could also be
* a 1-byte overrun, so the loop has to drop out early as a result.
*/
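/*
* The loop bound (len-1) & ~7 leaves at least one byte for the tail
* code below, so the possible 1-byte overrun store never writes
* past 'len'.
*/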
for (ii = 0, i1 = i; ii < ((len-1) & (~7));
ii += 8, i1 = i1&0xff) {
/*
* If i1 is less than 248, we know it won't wrap around (i % 256)
* within this chunk, so we don't need to bother masking i1 after
* each increment.
*/
if (i1 < 248) {
/* BYTE 0 */
i1 = (i1 + 1);
/*
* Creating this base pointer reduces the subsequent arithmetic ops
* required to load [i].
*
* N.B. we don't need to check whether [j] aliases: [i] and [j] end
* up with the same values anyway.
*/
base1 = &base[i1];
tmp0 = base1[0];
j = j + tmp0;
tmp1 = base[j];
/*
* Don't store [i] yet
*/
i_accum = tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
/*
* Check whether [tmp0] aliases with [i]; if it does, use the value
* held in i_accum instead of re-reading base[].
*
* [out] is accumulated in 8-byte chunks via 'merge'.
*/
if (i1 == tmp0) {
merge =
(unsigned long long)(i_accum) << 56;
} else {
merge =
(unsigned long long)(base[tmp0]) <<
56;
}
/* BYTE 1 */
tmp0 = base1[1];
j = j + tmp0;
/*
* [j] can now alias with [i] and [i-1].
* If it aliases, abort the speculation.
*/
if ((i1 ^ j) < 2) {
base1[0] = (uchar_t)i_accum;
tmp1 = base[j];
base1[1] = (uchar_t)tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
merge |= (unsigned long long)
(base[tmp0]) << 48;
} else {
tmp1 = base[j];
i_accum = i_accum << 8;
i_accum |= tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
/*
* Speculation succeeded! Update [i] in a 2-byte chunk.
*/
/* LINTED E_BAD_PTR_CAST_ALIGN */
*((unsigned short *) &base[i1]) =
i_accum;
merge |=
(unsigned long long)(base[tmp0]) <<
48;
}
/*
* It is too expensive to perform [i] speculation for every byte.
* We just need to reduce the frequency of stores until
* store-buffer-full stalls are no longer the bottleneck.
*/
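/*
* BYTE 2 through BYTE 5 therefore use the plain per-byte swap with
* immediate stores; only BYTE 0/1 and BYTE 6/7 use the combined
* 2-byte store to [i].
*/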
/* BYTE 2 */
tmp0 = base1[2];
j = j + tmp0;
tmp1 = base[j];
base1[2] = (uchar_t)tmp1;
base[j] = (uchar_t)tmp0;
tmp1 += tmp0;
tmp1 = tmp1 & 0xff;
merge |= (unsigned long long)(base[tmp1]) << 40;
/* BYTE 3 */
tmp0 = base1[3];
j = j + tmp0;
tmp1 = base[j];
base1[3] = (uchar_t)tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
merge |= (unsigned long long)(base[tmp0]) << 32;
/* BYTE 4 */
tmp0 = base1[4];
j = j + tmp0;
tmp1 = base[j];
base1[4] = (uchar_t)tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
merge |= (unsigned long long)(base[tmp0]) << 24;
/* BYTE 5 */
tmp0 = base1[5];
j = j + tmp0;
tmp1 = base[j];
base1[5] = (uchar_t)tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
merge |= (unsigned long long)(base[tmp0]) << 16;
/* BYTE 6 */
i1 = (i1+6);
tmp0 = base1[6];
j = j + tmp0;
tmp1 = base[j];
i_accum = tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
if (i1 == tmp0) {
merge |=
(unsigned long long)(i_accum) << 8;
} else {
merge |=
(unsigned long long)(base[tmp0]) <<
8;
}
/* BYTE 7 */
tmp0 = base1[7];
/*
* Perform [i] speculation again. Identical to that performed for
* BYTE 0 and BYTE 1.
*/
j = j + tmp0;
if ((i1 ^ j) < 2) {
base1[6] = (uchar_t)i_accum;
tmp1 = base[j];
base1[7] = (uchar_t)tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
merge |=
(unsigned long long)(base[tmp0]);
} else {
tmp1 = base[j];
i_accum = i_accum << 8;
i_accum |= tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
/* LINTED E_BAD_PTR_CAST_ALIGN */
*((unsigned short *) &base[i1]) =
i_accum;
merge |=
(unsigned long long)(base[tmp0]);
}
i1++;
} else {
/*
* i is too close to wrapping around for the masking to be
* disregarded.
*/
/*
* Same old speculation as above for BYTE 0 and BYTE 1.
*/
/* BYTE 0 */
i1 = (i1 + 1) & 0xff;
jj = (uchar_t)i1;
tmp0 = base[i1];
j = j + tmp0;
tmp1 = base[j];
i_accum = tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
if (i1 == tmp0) {
merge =
(unsigned long long)(i_accum) << 56;
} else {
merge =
(unsigned long long)(base[tmp0]) <<
56;
}
/* BYTE 1 */
tmp0 = base[i1+1];
j = j + tmp0;
if ((jj ^ j) < 2) {
base[jj] = (uchar_t)i_accum;
tmp1 = base[j];
base[i1+1] = (uchar_t)tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
merge |=
(unsigned long long)(base[tmp0]) <<
48;
} else {
tmp1 = base[j];
i_accum = i_accum << 8;
i_accum |= tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
/* LINTED E_BAD_PTR_CAST_ALIGN */
*((unsigned short *) &base[jj]) =
i_accum;
merge |=
(unsigned long long)(base[tmp0]) <<
48;
}
/* BYTE 2 */
/*
* Since i must be even when we enter the loop (to satisfy
* alignment), wrap-around can only occur on the even bytes, so the
* mask only needs to be applied every second byte.
*/
i1 = (i1 + 2) & 0xff;
tmp0 = base[i1];
j = j + tmp0;
tmp1 = base[j];
base[i1] = (uchar_t)tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
merge |= (unsigned long long)(base[tmp0]) << 40;
/* BYTE 3 */
tmp0 = base[i1+1];
j = j + tmp0;
tmp1 = base[j];
base[i1+1] = (uchar_t)tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
merge |= (unsigned long long)(base[tmp0]) << 32;
/* BYTE 4 */
i1 = (i1 + 2) & 0xff;
tmp0 = base[i1];
j = j + tmp0;
tmp1 = base[j];
base[i1] = (uchar_t)tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
merge |= (unsigned long long)(base[tmp0]) << 24;
/* BYTE 5 */
tmp0 = base[i1+1];
j = j + tmp0;
tmp1 = base[j];
base[i1+1] = (uchar_t)tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
merge |= (unsigned long long)(base[tmp0]) << 16;
/* BYTE 6 */
i1 = (i1+2) &0xff;
jj = (uchar_t)i1;
tmp0 = base[i1];
j = j + tmp0;
tmp1 = base[j];
i_accum = tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
if (i1 == tmp0) {
merge |=
(unsigned long long)(i_accum) << 8;
} else {
merge |=
(unsigned long long)(base[tmp0]) <<
8;
}
/* BYTE 7 */
i1++;
tmp0 = base[i1];
j = j + tmp0;
if ((jj ^ j) < 2) {
base[jj] = (uchar_t)i_accum;
tmp1 = base[j];
base[i1] = (uchar_t)tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
merge |=
(unsigned long long)(base[tmp0]);
} else {
tmp1 = base[j];
i_accum = i_accum << 8;
i_accum |= tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
/* LINTED E_BAD_PTR_CAST_ALIGN */
*((unsigned short *) &base[jj]) =
i_accum;
merge |=
(unsigned long long)(base[tmp0]);
}
}
/*
* Perform the update to [out].
* Remember there could be alignment issues.
*/
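/*
* 'merge' holds the 8 keystream bytes for this chunk, most
* significant byte first. When 'shift' is set, merge1 folds in the
* byte carried over in merge0 and the low byte of 'merge' is carried
* forward to the next iteration; otherwise merge1 is simply 'merge'.
*/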
/* LINTED E_BAD_PTR_CAST_ALIGN */
in0 = *((unsigned long long *) (&in[ii]));
merge1 = merge0 | (merge >> shift);
merge0 = (merge & mask) << 56;
in0 = in0 ^ merge1;
/* LINTED E_BAD_PTR_CAST_ALIGN */
*((unsigned long long *) (&out[ii])) = in0;
}
i = (uchar_t)i1;
/*
* Handle any 1-byte overrun (the keystream byte still held in
* merge0 when 'shift' was set).
*/
if (shift) {
out[ii] = in[ii] ^ (merge0 >> 56);
ii++;
}
/*
* Handle final few bytes
*/
for (; ii < len; ii++) {
i = i + 1;
tmp0 = base[i];
j = j + tmp0;
tmp1 = base[j];
base[i] = (uchar_t)tmp1;
base[j] = (uchar_t)tmp0;
tmp0 += tmp1;
tmp0 = tmp0 & 0xff;
out[ii] = in[ii] ^ base[tmp0];
}
key->i = i;
key->j = j;
}
#endif /* sun4v */
}