2N/A/* xz_dec_bcj.c - Branch/Call/Jump (BCJ) filter decoders */
2N/A/*
2N/A * GRUB -- GRand Unified Bootloader
2N/A * Copyright (C) 2010 Free Software Foundation, Inc.
2N/A *
2N/A * GRUB is free software: you can redistribute it and/or modify
2N/A * it under the terms of the GNU General Public License as published by
2N/A * the Free Software Foundation, either version 3 of the License, or
2N/A * (at your option) any later version.
2N/A *
2N/A * GRUB is distributed in the hope that it will be useful,
2N/A * but WITHOUT ANY WARRANTY; without even the implied warranty of
2N/A * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2N/A * GNU General Public License for more details.
2N/A *
2N/A * You should have received a copy of the GNU General Public License
2N/A * along with GRUB. If not, see <http://www.gnu.org/licenses/>.
2N/A */
2N/A/*
2N/A * This file is based on code from XZ embedded project
2N/A * http://tukaani.org/xz/embedded.html
2N/A */
2N/A
2N/A#include "xz_private.h"
2N/A
2N/Astruct xz_dec_bcj {
2N/A /* Type of the BCJ filter being used */
2N/A enum {
2N/A BCJ_X86 = 4, /* x86 or x86-64 */
2N/A BCJ_POWERPC = 5, /* Big endian only */
2N/A BCJ_IA64 = 6, /* Big or little endian */
2N/A BCJ_ARM = 7, /* Little endian only */
2N/A BCJ_ARMTHUMB = 8, /* Little endian only */
2N/A BCJ_SPARC = 9 /* Big or little endian */
2N/A } type;
2N/A
2N/A /*
2N/A * Return value of the next filter in the chain. We need to preserve
2N/A * this information across calls, because we must not call the next
2N/A * filter anymore once it has returned XZ_STREAM_END.
2N/A */
2N/A enum xz_ret ret;
2N/A
2N/A /* True if we are operating in single-call mode. */
2N/A bool single_call;
2N/A
2N/A /*
2N/A * Absolute position relative to the beginning of the uncompressed
2N/A * data (in a single .xz Block). We care only about the lowest 32
2N/A * bits so this doesn't need to be uint64_t even with big files.
2N/A */
2N/A uint32_t pos;
2N/A
2N/A /* x86 filter state */
2N/A uint32_t x86_prev_mask;
2N/A
2N/A /* Temporary space to hold the variables from struct xz_buf */
2N/A uint8_t *out;
2N/A size_t out_pos;
2N/A size_t out_size;
2N/A
2N/A struct {
2N/A /* Amount of already filtered data in the beginning of buf */
2N/A size_t filtered;
2N/A
2N/A /* Total amount of data currently stored in buf */
2N/A size_t size;
2N/A
2N/A /*
2N/A * Buffer to hold a mix of filtered and unfiltered data. This
2N/A * needs to be big enough to hold Alignment + 2 * Look-ahead:
2N/A *
2N/A * Type Alignment Look-ahead
2N/A * x86 1 4
2N/A * PowerPC 4 0
2N/A * IA-64 16 0
2N/A * ARM 4 0
2N/A * ARM-Thumb 2 2
2N/A * SPARC 4 0
2N/A */
2N/A uint8_t buf[16];
2N/A } temp;
2N/A};
2N/A
2N/A#ifdef XZ_DEC_X86
2N/A/*
2N/A * This is macro used to test the most significant byte of a memory address
2N/A * in an x86 instruction.
2N/A */
2N/A#define bcj_x86_test_msbyte(b) ((b) == 0x00 || (b) == 0xFF)
2N/A
2N/Astatic noinline_for_stack size_t bcj_x86(
2N/A struct xz_dec_bcj *s, uint8_t *buf, size_t size)
2N/A{
2N/A static const bool mask_to_allowed_status[8]
2N/A = { true, true, true, false, true, false, false, false };
2N/A
2N/A static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 };
2N/A
2N/A size_t i;
2N/A size_t prev_pos = (size_t)-1;
2N/A uint32_t prev_mask = s->x86_prev_mask;
2N/A uint32_t src;
2N/A uint32_t dest;
2N/A uint32_t j;
2N/A uint8_t b;
2N/A
2N/A if (size <= 4)
2N/A return 0;
2N/A
2N/A size -= 4;
2N/A for (i = 0; i < size; ++i) {
2N/A if ((buf[i] & 0xFE) != 0xE8)
2N/A continue;
2N/A
2N/A prev_pos = i - prev_pos;
2N/A if (prev_pos > 3) {
2N/A prev_mask = 0;
2N/A } else {
2N/A prev_mask = (prev_mask << (prev_pos - 1)) & 7;
2N/A if (prev_mask != 0) {
2N/A b = buf[i + 4 - mask_to_bit_num[prev_mask]];
2N/A if (!mask_to_allowed_status[prev_mask]
2N/A || bcj_x86_test_msbyte(b)) {
2N/A prev_pos = i;
2N/A prev_mask = (prev_mask << 1) | 1;
2N/A continue;
2N/A }
2N/A }
2N/A }
2N/A
2N/A prev_pos = i;
2N/A
2N/A if (bcj_x86_test_msbyte(buf[i + 4])) {
2N/A src = get_unaligned_le32(buf + i + 1);
2N/A while (true) {
2N/A dest = src - (s->pos + (uint32_t)i + 5);
2N/A if (prev_mask == 0)
2N/A break;
2N/A
2N/A j = mask_to_bit_num[prev_mask] * 8;
2N/A b = (uint8_t)(dest >> (24 - j));
2N/A if (!bcj_x86_test_msbyte(b))
2N/A break;
2N/A
2N/A src = dest ^ (((uint32_t)1 << (32 - j)) - 1);
2N/A }
2N/A
2N/A dest &= 0x01FFFFFF;
2N/A dest |= (uint32_t)0 - (dest & 0x01000000);
2N/A put_unaligned_le32(dest, buf + i + 1);
2N/A i += 4;
2N/A } else {
2N/A prev_mask = (prev_mask << 1) | 1;
2N/A }
2N/A }
2N/A
2N/A prev_pos = i - prev_pos;
2N/A s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1);
2N/A return i;
2N/A}
2N/A#endif
2N/A
2N/A#ifdef XZ_DEC_POWERPC
2N/Astatic noinline_for_stack size_t bcj_powerpc(
2N/A struct xz_dec_bcj *s, uint8_t *buf, size_t size)
2N/A{
2N/A size_t i;
2N/A uint32_t instr;
2N/A
2N/A for (i = 0; i + 4 <= size; i += 4) {
2N/A instr = get_unaligned_be32(buf + i);
2N/A if ((instr & 0xFC000003) == 0x48000001) {
2N/A instr &= 0x03FFFFFC;
2N/A instr -= s->pos + (uint32_t)i;
2N/A instr &= 0x03FFFFFC;
2N/A instr |= 0x48000001;
2N/A put_unaligned_be32(instr, buf + i);
2N/A }
2N/A }
2N/A
2N/A return i;
2N/A}
2N/A#endif
2N/A
2N/A#ifdef XZ_DEC_IA64
2N/Astatic noinline_for_stack size_t bcj_ia64(
2N/A struct xz_dec_bcj *s, uint8_t *buf, size_t size)
2N/A{
2N/A static const uint8_t branch_table[32] = {
2N/A 0, 0, 0, 0, 0, 0, 0, 0,
2N/A 0, 0, 0, 0, 0, 0, 0, 0,
2N/A 4, 4, 6, 6, 0, 0, 7, 7,
2N/A 4, 4, 0, 0, 4, 4, 0, 0
2N/A };
2N/A
2N/A /*
2N/A * The local variables take a little bit stack space, but it's less
2N/A * than what LZMA2 decoder takes, so it doesn't make sense to reduce
2N/A * stack usage here without doing that for the LZMA2 decoder too.
2N/A */
2N/A
2N/A /* Loop counters */
2N/A size_t i;
2N/A size_t j;
2N/A
2N/A /* Instruction slot (0, 1, or 2) in the 128-bit instruction word */
2N/A uint32_t slot;
2N/A
2N/A /* Bitwise offset of the instruction indicated by slot */
2N/A uint32_t bit_pos;
2N/A
2N/A /* bit_pos split into byte and bit parts */
2N/A uint32_t byte_pos;
2N/A uint32_t bit_res;
2N/A
2N/A /* Address part of an instruction */
2N/A uint32_t addr;
2N/A
2N/A /* Mask used to detect which instructions to convert */
2N/A uint32_t mask;
2N/A
2N/A /* 41-bit instruction stored somewhere in the lowest 48 bits */
2N/A uint64_t instr;
2N/A
2N/A /* Instruction normalized with bit_res for easier manipulation */
2N/A uint64_t norm;
2N/A
2N/A for (i = 0; i + 16 <= size; i += 16) {
2N/A mask = branch_table[buf[i] & 0x1F];
2N/A for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) {
2N/A if (((mask >> slot) & 1) == 0)
2N/A continue;
2N/A
2N/A byte_pos = bit_pos >> 3;
2N/A bit_res = bit_pos & 7;
2N/A instr = 0;
2N/A for (j = 0; j < 6; ++j)
2N/A instr |= (uint64_t)(buf[i + j + byte_pos])
2N/A << (8 * j);
2N/A
2N/A norm = instr >> bit_res;
2N/A
2N/A if (((norm >> 37) & 0x0F) == 0x05
2N/A && ((norm >> 9) & 0x07) == 0) {
2N/A addr = (norm >> 13) & 0x0FFFFF;
2N/A addr |= ((uint32_t)(norm >> 36) & 1) << 20;
2N/A addr <<= 4;
2N/A addr -= s->pos + (uint32_t)i;
2N/A addr >>= 4;
2N/A
2N/A norm &= ~((uint64_t)0x8FFFFF << 13);
2N/A norm |= (uint64_t)(addr & 0x0FFFFF) << 13;
2N/A norm |= (uint64_t)(addr & 0x100000)
2N/A << (36 - 20);
2N/A
2N/A instr &= (1 << bit_res) - 1;
2N/A instr |= norm << bit_res;
2N/A
2N/A for (j = 0; j < 6; j++)
2N/A buf[i + j + byte_pos]
2N/A = (uint8_t)(instr >> (8 * j));
2N/A }
2N/A }
2N/A }
2N/A
2N/A return i;
2N/A}
2N/A#endif
2N/A
2N/A#ifdef XZ_DEC_ARM
2N/Astatic noinline_for_stack size_t bcj_arm(
2N/A struct xz_dec_bcj *s, uint8_t *buf, size_t size)
2N/A{
2N/A size_t i;
2N/A uint32_t addr;
2N/A
2N/A for (i = 0; i + 4 <= size; i += 4) {
2N/A if (buf[i + 3] == 0xEB) {
2N/A addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8)
2N/A | ((uint32_t)buf[i + 2] << 16);
2N/A addr <<= 2;
2N/A addr -= s->pos + (uint32_t)i + 8;
2N/A addr >>= 2;
2N/A buf[i] = (uint8_t)addr;
2N/A buf[i + 1] = (uint8_t)(addr >> 8);
2N/A buf[i + 2] = (uint8_t)(addr >> 16);
2N/A }
2N/A }
2N/A
2N/A return i;
2N/A}
2N/A#endif
2N/A
2N/A#ifdef XZ_DEC_ARMTHUMB
2N/Astatic noinline_for_stack size_t bcj_armthumb(
2N/A struct xz_dec_bcj *s, uint8_t *buf, size_t size)
2N/A{
2N/A size_t i;
2N/A uint32_t addr;
2N/A
2N/A for (i = 0; i + 4 <= size; i += 2) {
2N/A if ((buf[i + 1] & 0xF8) == 0xF0
2N/A && (buf[i + 3] & 0xF8) == 0xF8) {
2N/A addr = (((uint32_t)buf[i + 1] & 0x07) << 19)
2N/A | ((uint32_t)buf[i] << 11)
2N/A | (((uint32_t)buf[i + 3] & 0x07) << 8)
2N/A | (uint32_t)buf[i + 2];
2N/A addr <<= 1;
2N/A addr -= s->pos + (uint32_t)i + 4;
2N/A addr >>= 1;
2N/A buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07));
2N/A buf[i] = (uint8_t)(addr >> 11);
2N/A buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07));
2N/A buf[i + 2] = (uint8_t)addr;
2N/A i += 2;
2N/A }
2N/A }
2N/A
2N/A return i;
2N/A}
2N/A#endif
2N/A
2N/A#ifdef XZ_DEC_SPARC
2N/Astatic noinline_for_stack size_t bcj_sparc(
2N/A struct xz_dec_bcj *s, uint8_t *buf, size_t size)
2N/A{
2N/A size_t i;
2N/A uint32_t instr;
2N/A
2N/A for (i = 0; i + 4 <= size; i += 4) {
2N/A instr = get_unaligned_be32(buf + i);
2N/A if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) {
2N/A instr <<= 2;
2N/A instr -= s->pos + (uint32_t)i;
2N/A instr >>= 2;
2N/A instr = ((uint32_t)0x40000000 - (instr & 0x400000))
2N/A | 0x40000000 | (instr & 0x3FFFFF);
2N/A put_unaligned_be32(instr, buf + i);
2N/A }
2N/A }
2N/A
2N/A return i;
2N/A}
2N/A#endif
2N/A
2N/A/*
2N/A * Apply the selected BCJ filter. Update *pos and s->pos to match the amount
2N/A * of data that got filtered.
2N/A *
2N/A * NOTE: This is implemented as a switch statement to avoid using function
2N/A * pointers, which could be problematic in the kernel boot code, which must
2N/A * avoid pointers to static data (at least on x86).
2N/A */
2N/Astatic void bcj_apply(struct xz_dec_bcj *s,
2N/A uint8_t *buf, size_t *pos, size_t size)
2N/A{
2N/A size_t filtered;
2N/A
2N/A buf += *pos;
2N/A size -= *pos;
2N/A
2N/A switch (s->type) {
2N/A#ifdef XZ_DEC_X86
2N/A case BCJ_X86:
2N/A filtered = bcj_x86(s, buf, size);
2N/A break;
2N/A#endif
2N/A#ifdef XZ_DEC_POWERPC
2N/A case BCJ_POWERPC:
2N/A filtered = bcj_powerpc(s, buf, size);
2N/A break;
2N/A#endif
2N/A#ifdef XZ_DEC_IA64
2N/A case BCJ_IA64:
2N/A filtered = bcj_ia64(s, buf, size);
2N/A break;
2N/A#endif
2N/A#ifdef XZ_DEC_ARM
2N/A case BCJ_ARM:
2N/A filtered = bcj_arm(s, buf, size);
2N/A break;
2N/A#endif
2N/A#ifdef XZ_DEC_ARMTHUMB
2N/A case BCJ_ARMTHUMB:
2N/A filtered = bcj_armthumb(s, buf, size);
2N/A break;
2N/A#endif
2N/A#ifdef XZ_DEC_SPARC
2N/A case BCJ_SPARC:
2N/A filtered = bcj_sparc(s, buf, size);
2N/A break;
2N/A#endif
2N/A default:
2N/A /* Never reached but silence compiler warnings. */
2N/A filtered = 0;
2N/A break;
2N/A }
2N/A
2N/A *pos += filtered;
2N/A s->pos += filtered;
2N/A}
2N/A
2N/A/*
2N/A * Flush pending filtered data from temp to the output buffer.
2N/A * Move the remaining mixture of possibly filtered and unfiltered
2N/A * data to the beginning of temp.
2N/A */
2N/Astatic void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b)
2N/A{
2N/A size_t copy_size;
2N/A
2N/A copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos);
2N/A memcpy(b->out + b->out_pos, s->temp.buf, copy_size);
2N/A b->out_pos += copy_size;
2N/A
2N/A s->temp.filtered -= copy_size;
2N/A s->temp.size -= copy_size;
2N/A memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size);
2N/A}
2N/A
2N/A/*
2N/A * The BCJ filter functions are primitive in sense that they process the
2N/A * data in chunks of 1-16 bytes. To hide this issue, this function does
2N/A * some buffering.
2N/A */
2N/Aenum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s,
2N/A struct xz_dec_lzma2 *lzma2, struct xz_buf *b)
2N/A{
2N/A size_t out_start;
2N/A
2N/A /*
2N/A * Flush pending already filtered data to the output buffer. Return
2N/A * immediatelly if we couldn't flush everything, or if the next
2N/A * filter in the chain had already returned XZ_STREAM_END.
2N/A */
2N/A if (s->temp.filtered > 0) {
2N/A bcj_flush(s, b);
2N/A if (s->temp.filtered > 0)
2N/A return XZ_OK;
2N/A
2N/A if (s->ret == XZ_STREAM_END)
2N/A return XZ_STREAM_END;
2N/A }
2N/A
2N/A /*
2N/A * If we have more output space than what is currently pending in
2N/A * temp, copy the unfiltered data from temp to the output buffer
2N/A * and try to fill the output buffer by decoding more data from the
2N/A * next filter in the chain. Apply the BCJ filter on the new data
2N/A * in the output buffer. If everything cannot be filtered, copy it
2N/A * to temp and rewind the output buffer position accordingly.
2N/A */
2N/A if (s->temp.size < b->out_size - b->out_pos) {
2N/A out_start = b->out_pos;
2N/A memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size);
2N/A b->out_pos += s->temp.size;
2N/A
2N/A s->ret = xz_dec_lzma2_run(lzma2, b);
2N/A if (s->ret != XZ_STREAM_END
2N/A && (s->ret != XZ_OK || s->single_call))
2N/A return s->ret;
2N/A
2N/A bcj_apply(s, b->out, &out_start, b->out_pos);
2N/A
2N/A /*
2N/A * As an exception, if the next filter returned XZ_STREAM_END,
2N/A * we can do that too, since the last few bytes that remain
2N/A * unfiltered are meant to remain unfiltered.
2N/A */
2N/A if (s->ret == XZ_STREAM_END)
2N/A return XZ_STREAM_END;
2N/A
2N/A s->temp.size = b->out_pos - out_start;
2N/A b->out_pos -= s->temp.size;
2N/A memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size);
2N/A }
2N/A
2N/A /*
2N/A * If we have unfiltered data in temp, try to fill by decoding more
2N/A * data from the next filter. Apply the BCJ filter on temp. Then we
2N/A * hopefully can fill the actual output buffer by copying filtered
2N/A * data from temp. A mix of filtered and unfiltered data may be left
2N/A * in temp; it will be taken care on the next call to this function.
2N/A */
2N/A if (s->temp.size > 0) {
2N/A /* Make b->out{,_pos,_size} temporarily point to s->temp. */
2N/A s->out = b->out;
2N/A s->out_pos = b->out_pos;
2N/A s->out_size = b->out_size;
2N/A b->out = s->temp.buf;
2N/A b->out_pos = s->temp.size;
2N/A b->out_size = sizeof(s->temp.buf);
2N/A
2N/A s->ret = xz_dec_lzma2_run(lzma2, b);
2N/A
2N/A s->temp.size = b->out_pos;
2N/A b->out = s->out;
2N/A b->out_pos = s->out_pos;
2N/A b->out_size = s->out_size;
2N/A
2N/A if (s->ret != XZ_OK && s->ret != XZ_STREAM_END)
2N/A return s->ret;
2N/A
2N/A bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size);
2N/A
2N/A /*
2N/A * If the next filter returned XZ_STREAM_END, we mark that
2N/A * everything is filtered, since the last unfiltered bytes
2N/A * of the stream are meant to be left as is.
2N/A */
2N/A if (s->ret == XZ_STREAM_END)
2N/A s->temp.filtered = s->temp.size;
2N/A
2N/A bcj_flush(s, b);
2N/A if (s->temp.filtered > 0)
2N/A return XZ_OK;
2N/A }
2N/A
2N/A return s->ret;
2N/A}
2N/A
2N/A#ifdef GRUB_EMBED_DECOMPRESSOR
2N/Astruct xz_dec_bcj bcj;
2N/A#endif
2N/A
2N/Astruct xz_dec_bcj * xz_dec_bcj_create(bool single_call)
2N/A{
2N/A struct xz_dec_bcj *s;
2N/A#ifdef GRUB_EMBED_DECOMPRESSOR
2N/A s = &bcj;
2N/A#else
2N/A s = kmalloc(sizeof(*s), GFP_KERNEL);
2N/A#endif
2N/A if (s != NULL)
2N/A s->single_call = single_call;
2N/A
2N/A return s;
2N/A}
2N/A
2N/Aenum xz_ret xz_dec_bcj_reset(
2N/A struct xz_dec_bcj *s, uint8_t id)
2N/A{
2N/A switch (id) {
2N/A#ifdef XZ_DEC_X86
2N/A case BCJ_X86:
2N/A#endif
2N/A#ifdef XZ_DEC_POWERPC
2N/A case BCJ_POWERPC:
2N/A#endif
2N/A#ifdef XZ_DEC_IA64
2N/A case BCJ_IA64:
2N/A#endif
2N/A#ifdef XZ_DEC_ARM
2N/A case BCJ_ARM:
2N/A#endif
2N/A#ifdef XZ_DEC_ARMTHUMB
2N/A case BCJ_ARMTHUMB:
2N/A#endif
2N/A#ifdef XZ_DEC_SPARC
2N/A case BCJ_SPARC:
2N/A#endif
2N/A break;
2N/A
2N/A default:
2N/A /* Unsupported Filter ID */
2N/A return XZ_OPTIONS_ERROR;
2N/A }
2N/A
2N/A s->type = id;
2N/A s->ret = XZ_OK;
2N/A s->pos = 0;
2N/A s->x86_prev_mask = 0;
2N/A s->temp.filtered = 0;
2N/A s->temp.size = 0;
2N/A
2N/A return XZ_OK;
2N/A}