/*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* Define the conversions between sequences of small integers and raw bytes.
* This is a schema of encodings which incorporates varying lengths,
* varying degrees of length variability, and varying amounts of signed-ness.
* @author John Rose
*/
/*
Coding schema for single integers, parameterized by (B,H,S):
Let B in [1,5], H in [1,256], S in [0,3].
(S limit is arbitrary. B follows the 32-bit limit. H is byte size.)
A given (B,H,S) code varies in length from 1 to B bytes.
The 256 values a byte may take on are divided into L=(256-H) and H
values, with all the H values larger than the L values.
(That is, the L values are [0,L) and the H are [L,256).)
The last byte is always either the B-th byte, a byte with "L value"
(<L), or both. There is no other byte that satisfies these conditions.
All bytes before the last always have "H values" (>=L).
Therefore, if L==0, the code always has the full length of B bytes.
The coding then becomes a classic B-byte little-endian unsigned integer.
(Also, if L==128, the high bit of each byte acts signals the presence
of a following byte, up to the maximum length.)
In the unsigned case (S==0), the coding is compact and monotonic
in the ordering of byte sequences defined by appending zero bytes
to pad them to a common length B, reversing them, and ordering them
lexicographically. (This agrees with "little-endian" byte order.)
Therefore, the unsigned value of a byte sequence may be defined as:
<pre>
U(b0) == b0
in [0..L)
or [0..256) if B==1 (**)
U(b0,b1) == b0 + b1*H
in [L..L*(1+H))
or [L..L*(1+H) + H^2) if B==2
U(b0,b1,b2) == b0 + b1*H + b2*H^2
in [L*(1+H)..L*(1+H+H^2))
or [L*(1+H)..L*(1+H+H^2) + H^3) if B==3
U(b[i]: i<n) == Sum[i<n]( b[i] * H^i )
up to L*Sum[i<n]( H^i )
or to L*Sum[i<n]( H^i ) + H^n if n==B
</pre>
(**) If B==1, the values H,L play no role in the coding.
As a convention, we require that any (1,H,S) code must always
encode values less than H. Thus, a simple unsigned byte is coded
specifically by the code (1,256,0).
(Properly speaking, the unsigned case should be parameterized as
S==Infinity. If the schema were regular, the case S==0 would really
denote a numbering in which all coded values are negative.)
If S>0, the unsigned value of a byte sequence is regarded as a binary
integer. If any of the S low-order bits are zero, the corresponding
signed value will be non-negative. If all of the S low-order bits
(S>0) are one, the the corresponding signed value will be negative.
The non-negative signed values are compact and monotonically increasing
(from 0) in the ordering of the corresponding unsigned values.
The negative signed values are compact and monotonically decreasing
(from -1) in the ordering of the corresponding unsigned values.
In essence, the low-order S bits function as a collective sign bit
for negative signed numbers, and as a low-order base-(2^S-1) digit
for non-negative signed numbers.
Therefore, the signed value corresponding to an unsigned value is:
<pre>
Sgn(x) == x if S==0
Sgn(x) == (x / 2^S)*(2^S-1) + (x % 2^S), if S>0, (x % 2^S) < 2^S-1
Sgn(x) == -(x / 2^S)-1, if S>0, (x % 2^S) == 2^S-1
</pre>
Finally, the value of a byte sequence, given the coding parameters
(B,H,S), is defined as:
<pre>
V(b[i]: i<n) == Sgn(U(b[i]: i<n))
</pre>
The extremal positive and negative signed value for a given range
of unsigned values may be found by sign-encoding the largest unsigned
value which is not 2^S-1 mod 2^S, and that which is, respectively.
Because B,H,S are variable, this is not a single coding but a schema
of codings. For optimal compression, it is necessary to adaptively
select specific codings to the data being compressed.
For example, if a sequence of values happens never to be negative,
S==0 is the best choice. If the values are equally balanced between
negative and positive, S==1. If negative values are rare, then S>1
is more appropriate.
A (B,H,S) encoding is called a "subrange" if it does not encode
the largest 32-bit value, and if the number R of values it does
encode can be expressed as a positive 32-bit value. (Note that
B=1 implies R<=256, B=2 implies R<=65536, etc.)
A delta version of a given (B,H,S) coding encodes an array of integers
by writing their successive differences in the (B,H,S) coding.
The original integers themselves may be recovered by making a
running accumulation of sum of the differences as they are read.
As a special case, if a (B,H,S) encoding is a subrange, its delta
version will only encode arrays of numbers in the coding's unsigned
range, [0..R-1]. The coding of deltas is still in the normal signed
range, if S!=0. During delta encoding, all subtraction results are
reduced to the signed range, by adding multiples of R. Likewise,
. during encoding, all addition results are reduced to the unsigned range.
This special case for subranges allows the benefits of wraparound
when encoding correlated sequences of very small positive numbers.
*/
// Code-specific limits:
private static int saturate32(long x) {
return (int)x;
}
private static long codeRangeLong(int B, int H) {
return codeRangeLong(B, H, B);
}
// Code range for a all (B,H) codes of length <=nMax (<=B).
// n < B: L*Sum[i<n]( H^i )
// n == B: L*Sum[i<B]( H^i ) + H^B
assert(B >= 1 && B <= 5);
assert(H >= 1 && H <= 256);
if (B == 1) return H; // special case; see (**) above
int L = 256-H;
long sum = 0;
long H_i = 1;
for (int n = 1; n <= nMax; n++) {
H_i *= H;
}
sum *= L;
if (nMax == B)
return sum;
}
/** Largest int representable by (B,H,S) in up to nMax bytes. */
//assert(S >= 0 && S <= S_MAX);
if (range == 0)
return -1; // degenerate max value for empty set of codes
while (isNegativeCode(maxPos, S)) {
--maxPos;
}
// check for 32-bit wraparound:
if (smax < 0)
return smax;
}
/** Smallest int representable by (B,H,S) in up to nMax bytes.
Returns Integer.MIN_VALUE if 32-bit wraparound covers
the entire negative range.
*/
//assert(S >= 0 && S <= S_MAX);
// Can code negative values via 32-bit wraparound.
}
if (S == 0) {
return 0;
}
while (!isNegativeCode(maxNeg, S))
--maxNeg;
return decodeSign32(maxNeg, S);
}
// Some of the arithmetic below is on unsigned 32-bit integers.
// These must be represented in Java as longs in the range [0..2^32-1].
// The conversion to a signed int is just the Java cast (int), but
// the conversion to an unsigned int is the following little method:
}
// Sign encoding:
assert(S > 0);
}
assert(S > 0);
// If S>=2 very low negatives are coded by 32-bit-wrapped positives.
// The lowest negative representable by a negative coding is
// ~(umax32 >> S), and the next lower number is coded by wrapping
// the highest positive:
// CodePos(umax32-1) -> (umax32-1)-((umax32-1)>>S)
// which simplifies to ~(umax32 >> S)-1.
}
if (S == 0) {
return (int) ux; // cast to signed int
}
int sx;
if (isNegativeCode(ux, S)) {
// Sgn(x) == -(x / 2^S)-1
} else {
// Sgn(x) == (x / 2^S)*(2^S-1) + (x % 2^S)
}
// Assert special case of S==1:
return sx;
}
if (S == 0) {
}
long ux;
if (!hasNegativeCode(sx, S)) {
// InvSgn(sx) = (sx / (2^S-1))*2^S + (sx % (2^S-1))
} else {
// InvSgn(sx) = (-sx-1)*2^S + (2^S-1)
}
return ux;
}
// Top-level coding of single integers:
assert(ux < codeRangeLong(B, H))
int L = 256-H;
for (int i = 0; i < B-1; i++) {
if (sum < L)
break;
sum -= L;
sum /= H;
}
// Report number of bytes written by updating outpos[0]:
// Check right away for mis-coding.
//assert(sx == readInt(out, new int[1], B, H, S));
}
// U(b[i]: i<n) == Sum[i<n]( b[i] * H^i )
int L = 256-H;
long sum = 0;
long H_i = 1;
for (int i = 0; i < B; i++) {
H_i *= H;
if (b_i < L) break;
}
//assert(sum >= 0 && sum < codeRangeLong(B, H));
// Report number of bytes read by updating inpos[0]:
return decodeSign32(sum, S);
}
// The Stream version doesn't fetch a byte unless it is needed for coding.
// U(b[i]: i<n) == Sum[i<n]( b[i] * H^i )
int L = 256-H;
long sum = 0;
long H_i = 1;
for (int i = 0; i < B; i++) {
H_i *= H;
if (b_i < L) break;
}
return decodeSign32(sum, S);
}
// END OF STATICS.
private final int B; /*1..5*/ // # bytes (1..5)
private final int H; /*1..256*/ // # codes requiring a higher byte
private final int L; /*0..255*/ // # codes requiring a higher byte
private final int S; /*0..3*/ // # low-order bits representing sign
private Coding(int B, int H, int S) {
this(B, H, S, 0);
}
this.B = B;
this.H = H;
this.L = 256-H;
this.S = S;
this.byteMin = new int[B];
this.byteMax = new int[B];
}
}
if (!(x instanceof Coding)) return false;
if (this.B != that.B) return false;
if (this.H != that.H) return false;
if (this.S != that.S) return false;
return true;
}
public int hashCode() {
}
return x1;
}
}
return of(B, H, S, 0);
}
public boolean canRepresentValue(int x) {
if (isSubrange())
return canRepresentUnsigned(x);
else
return canRepresentSigned(x);
}
/** Can this coding represent a single value, possibly a delta?
* This ignores the D property. That is, for delta codings,
* this tests whether a delta value of 'x' can be coded.
* For signed delta codings which produce unsigned end values,
* use canRepresentUnsigned.
*/
public boolean canRepresentSigned(int x) {
}
/** Can this coding, apart from its S property,
* represent a single value? (Negative values
* can only be represented via 32-bit overflow,
* so this returns true for negative values
* if isFullRange is true.)
*/
public boolean canRepresentUnsigned(int x) {
}
}
}
// Stream versions
return readIntFrom(in, B, H, S);
}
byte[] buf = new byte[B];
int[] pos = new int[1];
}
// %%% use byte[] buffer
long state = 0;
state += a[i];
// Reduce array values to the required range.
if (isSubrange()) {
}
a[i] = (int) state;
}
}
}
int[] deltas;
if (!isSubrange())
else
a = deltas;
start = 0;
}
// The following code is a buffered version of this loop:
// for (int i = start; i < end; i++)
// writeTo(out, a[i]);
int[] pos = { 0 };
if (i >= end) break;
}
}
}
/** Tell if the range of this coding (number of distinct
* representable values) can be expressed in 32 bits.
*/
boolean isSubrange() {
}
/** Tell if this coding can represent all 32-bit values.
* Note: Some codings, such as unsigned ones, can be neither
* subranges nor full-range codings.
*/
boolean isFullRange() {
}
/** Return the number of values this coding (a subrange) can represent. */
int getRange() {
assert(isSubrange());
}
/** Return a coding suitable for representing summed, modulo-reduced values. */
if (isDelta())
else
return this;
}
/** Reduce the given value to be within this coding's unsigned range,
* by adding or subtracting a multiple of (max-min+1).
*/
// already in unsigned range
return (int)value;
assert(range > 0);
assert(canRepresentUnsigned((int)value));
return (int)value;
}
if (canRepresentSigned(value))
// already in signed range
return value;
}
assert(range > 0);
// 32-bit overflow, but the next '%=' op needs to be unsigned
assert(value >= 0);
}
return value;
}
/** Does this coding support at least one negative value?
Includes codings that can do so via 32-bit wraparound.
*/
boolean isSigned() {
return min < 0;
}
/** Does this coding code arrays by making successive differences? */
boolean isDelta() {
return del != 0;
}
public int B() { return B; }
public int H() { return H; }
public int L() { return L; }
public int S() { return S; }
if (dkey == 0)
if (dkey == 0)
if (dkey == 0)
return dkey;
}
/** Heuristic measure of the difference between two codings. */
int diffHL;
if (this.H == that.H) {
diffHL = 0;
} else {
// Distance in log space of H (<=128) and L (<128).
// Double the accuracy of the log:
else
}
return norm;
}
private int getHL() {
// Follow H in log space by the multiplicative inverse of L.
if (H <= 128) return H;
if (L >= 1) return 128*128/L;
return 128*256;
}
/** ceiling(log[2](x)): {1->0, 2->1, 3->2, 4->2, ...} */
static int ceil_lg2(int x) {
assert(x-1 >= 0); // x in range (int.MIN_VALUE -> 32)
x -= 1;
int lg = 0;
while (x != 0) {
lg++;
x >>= 1;
}
return lg;
}
static {
}
for (int i = 10; i >= 0; i = (i << 1) - (i >> 3)) {
}
}
/** Number of significant bits in i, not counting sign bits.
* For positive i, it is ceil_lg2(i + 1).
*/
static int bitWidth(int i) {
if (i < 0) i = ~i; // change sign
int w = 0;
int lo = i;
return byteBitWidths[lo];
int hi;
if (hi != 0) {
w += 16;
}
if (hi != 0) {
w += 8;
}
w += byteBitWidths[lo];
//assert(w == ceil_lg2(i + 1));
return w;
}
/** Create an array of successive differences.
* If min==max, accept any and all 32-bit overflow.
* Otherwise, avoid 32-bit overflow, and reduce all differences
* to a value in the given range, by adding or subtracting
* multiples of the range cardinality (max-min+1).
* Also, the values are assumed to be in the range [0..(max-min)].
*/
int state = 0;
for (int i = 0; i < count; i++) {
}
} else {
for (int i = 0; i < count; i++) {
// Reduce delta values to the required range.
}
}
return deltas;
}
if (del > 0) {
if (isSubrange()) {
// We will force the values to reduce to the right subrange.
return canRepresentUnsigned(maxValue)
} else {
// Huge range; delta values must assume full 32-bit range.
return isFullRange();
}
}
else
// final values must be representable
return canRepresentSigned(maxValue)
}
if (len == 0) return true;
if (isFullRange()) return true;
// Calculate max, min:
for (int i = 1; i < len; i++) {
}
}
}
/** How many bytes are in the coding of this value?
* Returns Integer.MAX_VALUE if the value has no coding.
* The coding must not be a delta coding, since there is no
* definite size for a single value apart from its context.
*/
if (isDelta() && isSubrange()) {
if (!canRepresentUnsigned(value))
}
if (value >= 0) {
for (int n = 0; n < B; n++) {
}
} else {
for (int n = 0; n < B; n++) {
}
}
}
if (B == 1) return len;
if (L == 0) return len * B;
if (isDelta()) {
int[] deltas;
if (!isSubrange())
else
//return Coding.of(B, H, S).getLength(deltas, 0, len);
start = 0;
}
// add extra bytes for extra-long values
for (int n = 1; n <= B; n++) {
// what is the coding interval [min..max] for n bytes?
for (int i = 0; i < len; i++) {
if (value >= 0) {
} else {
}
}
}
return sum;
}
if (dflt == this) return new byte[]{ (byte) _meta_default };
if (canonicalIndex > 0)
return new byte[]{ (byte) canonicalIndex };
return new byte[]{
(byte)_meta_arb,
(byte)(H-1)
};
}
assert(c != null);
res[0] = c;
return pos;
}
int H = H_1+1;
if (!((1 <= B && B <= B_MAX) &&
(0 <= S && S <= S_MAX) &&
(1 <= H && H <= H_MAX) &&
|| (B == 1 && H != 256)
|| (B == 5 && H == 256)) {
}
return pos;
}
}
}
// If -ea, print out more informative strings!
//assert((str = stringForDebug()) != null);
return str;
}
static boolean verboseStringForDebug = false;
if (isSubrange())
str += " subrange";
else if (!isFullRange())
str += " MIDRANGE";
if (verboseStringForDebug) {
str += " {";
int prev_range = 0;
for (int n = 1; n <= B; n++) {
range_n -= prev_range;
}
str += " }";
}
return str;
}
}