cmd/od/od.c

	od.c revision 84441f85b19f6b8080883f30109e58e43c893709
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
 */

/*
 * od - octal dump.  Not really just octal anymore; read the POSIX
 * specification for it -- it's more complex than you think!
 *
 * NB: We followed the POSIX semantics fairly strictly, where the
 * legacy code's behavior was in conflict.  In many cases the legacy
 * Solaris code was so completely broken as to be completely unusable.
 * (For example, the long double support was broken beyond
 * imagination!)  Note that GNU coreutils violates POSIX in a few
 * interesting ways, such as changing the numbering of the addresses
 * when skipping.  (Address starts should always be at 0, according to
 * the sample output in the Open Group man page.)
 */

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <string.h>
#include <err.h>
#include <wchar.h>
#include <locale.h>
#include <unistd.h>
#include <sys/stat.h>

/* Shorthand for looking up the translation of a user-visible message. */
#define _(x)    gettext(x)


/*
 * Message domain handed to textdomain() in main(); this fallback is used
 * only when the build has not already defined TEXT_DOMAIN.
 */
#ifndef TEXT_DOMAIN
#define TEXT_DOMAIN "SYS_TEST"
#endif

/*
 * Address formats.  afmt is the printf format used for the address
 * column on the first output line of a block; cfmt is the blank filler
 * written in its place on the lines of any additional output types.
 * Both only ever point at string literals, hence the const.
 */
static const char *afmt  =    "%07llo";
static const char *cfmt  =    "       ";

/* stream currently being read; NULL once all input is exhausted */
static FILE *input = NULL;
/* least common multiple of the widths of the selected output types */
static size_t lcm = 1;
/* bytes shown per output line: lcm doubled until it is at least 16 */
static size_t blocksize = 16;
/* number of entries in files[] */
static int numfiles = 0;
/* index into files[] of the next file to open */
static int curfile = 0;
/* file operands from the command line; NULL when reading stdin */
static char **files = NULL;
/* bytes still allowed by -N; negative means no limit */
static off_t limit = -1;

/*
 * This structure describes our ring buffer.  It's always a power of 2
 * in size to make wrap around calculations fast using a mask instead
 * of doing modulo.
 *
 * The size is calculated thusly: We need three "blocks" of data, as
 * we process a block at a time (one block == one line of od output.)
 *
 * We need lookahead of an extra block to support multibyte chars.  We
 * also have a look behind so that we can avoid printing lines that
 * are identical to what we've already printed.  Finally, we need the
 * current block.
 *
 * The block size is determined by the least common multiple of the
 * data items being displayed.  Usually it will be 16, but sometimes
 * it is 24 (when 12-byte long doubles are presented.)
 *
 * The data buffer is allocated via memalign to make sure it is
 * properly aligned.
 */
typedef struct buffer {
    char    *data;      /* ring storage, mask + 1 bytes long */
    int prod;       /* producer index: where refill() stores next */
    int cons;       /* consumer index: start of the block to print */
    int mask;       /* buffer size - 1, wraparound index */
    int navail;     /* input bytes read but not yet printed */
} buffer_t;

/*
 * This structure is used to provide information on a specific output
 * format.  We link them together in a list representing the output
 * formats that the user has selected.
 *
 * For each block, main() walks the list and calls func once per item,
 * passing the ring buffer and the index of the item's first byte, and
 * stepping the index by width between calls.
 */
typedef struct output {
    int width;              /* bytes consumed per call */
    void    (*func)(buffer_t *, int);   /* output function */
    struct output   *next;          /* link node */
} output_t;

/*
 * Specifiers
 *
 * Short aliases for the data types od can display.  The names are
 * pasted into identifiers by DECL_GET/DECL_OUT (get_u8, get_s16, ...),
 * which is why each must be a single token.
 *
 * s8 is spelled "signed char" explicitly: the signedness of plain char
 * is implementation-defined, and the signed one-byte format must show
 * negative values on every platform.
 */

typedef unsigned char       u8;
typedef unsigned short      u16;
typedef unsigned int        u32;
typedef unsigned long long  u64;
typedef signed char     s8;
typedef short           s16;
typedef int         s32;
typedef long long       s64;
typedef float           fF;
typedef double          fD;
typedef long double     fL;

/*
 * Print the command synopsis on stderr and exit with status 1.
 *
 * The (possibly translated) message is written with fputs() rather
 * than being handed to fprintf() as a format, so a '%' appearing in a
 * message catalog can never be interpreted as a conversion.
 */
static void
usage(void)
{
    (void) fputs(_("usage: od [-bcCdDfFoOsSvxX] "
        "[-t types ]... [-A base] [-j skip] [-N count] [file]...\n"),
        stderr);
    exit(1);
}

/*
 * DECL_GET(typ) defines get_<typ>(b, index), which returns the object
 * of type `typ' whose first byte sits at ring-buffer position `index'.
 *
 * The bytes are gathered one at a time, each position reduced with the
 * buffer mask, instead of being loaded through a cast pointer.  That
 * keeps the access inside the ring when an item straddles the
 * wrap-around point, and it avoids misaligned and type-punned loads.
 * For an item that lies contiguously in the buffer the result is the
 * same as a direct load.
 */
#define DECL_GET(typ)                           \
static typ                              \
get_ ## typ(buffer_t *b, int index)                 \
{                                   \
    typ val;                            \
    unsigned char *dst = (unsigned char *)&val;         \
    size_t i;                           \
                                    \
    for (i = 0; i < sizeof (val); i++) {                \
        dst[i] = (unsigned char)                \
            b->data[(index + (int)i) & b->mask];        \
    }                               \
    return (val);                           \
}
DECL_GET(u8)
DECL_GET(u16)
DECL_GET(u32)
DECL_GET(u64)
DECL_GET(s8)
DECL_GET(s16)
DECL_GET(s32)
DECL_GET(s64)
DECL_GET(fF)
DECL_GET(fD)
DECL_GET(fL)

/*
 * DECL_OUT(nm, typ, fmt) defines two things for one output format:
 *
 *   do_<nm>(buf, index)  fetches one `typ' from the ring buffer with
 *                        get_<typ>() and prints it using `fmt';
 *   output_<nm>          an output_t template whose width is
 *                        sizeof (typ) and whose func is do_<nm>.
 *
 * add_out() copies these templates onto the list of selected formats.
 *
 * The hexadecimal formats fetch unsigned types so that the argument
 * matches what the %x / %llx conversions expect; the printed digits
 * are the same as for the signed types of equal width.
 */
#define DECL_OUT(nm, typ, fmt)                  \
static void                         \
do_ ## nm(buffer_t *buf, int index)             \
{                               \
    typ v = get_ ## typ(buf, index);            \
    (void) printf(fmt, v);                  \
}                               \
                                \
static output_t output_ ## nm =  {              \
    sizeof (typ), do_ ## nm                 \
};

/* unsigned octal */
DECL_OUT(oct_b, u8, " %03o")
DECL_OUT(oct_w, u16, " %06ho")
DECL_OUT(oct_d, u32, " %011o")
DECL_OUT(oct_q, u64, " %022llo")
/* unsigned decimal */
DECL_OUT(dec_b, u8, " %03u")
DECL_OUT(dec_w, u16, " %05hu")
DECL_OUT(dec_d, u32, " %010u")
DECL_OUT(dec_q, u64, " %020llu")
/* signed decimal */
DECL_OUT(sig_b, s8, " %03d")
DECL_OUT(sig_w, s16, " %6.05hd")
DECL_OUT(sig_d, s32, " %11.010d")
DECL_OUT(sig_q, s64, " %20.019lld")
/* hexadecimal */
DECL_OUT(hex_b, u8, " %02x")
DECL_OUT(hex_w, u16, " %04hx")
DECL_OUT(hex_d, u32, " %08x")
DECL_OUT(hex_q, u64, " %016llx")
/* floating point */
DECL_OUT(float, fF, " %14.7e")
DECL_OUT(double, fD, " %21.14e")
DECL_OUT(ldouble, fL, " %24.14Le")

/*
 * Names printed by the "a" (named character) output type, indexed by
 * the low seven bits of the byte.  Every entry is exactly three
 * characters wide so that the columns line up.
 */
static const char *const ascii[] = {
    "nul", "soh", "stx", "etx", "eot", "enq", "ack", "bel",
    " bs", " ht", " lf", " vt", " ff", " cr", " so", " si",
    "dle", "dc1", "dc2", "dc3", "dc4", "nak", "syn", "etb",
    "can", " em", "sub", "esc", " fs", " gs", " rs", " us",
    " sp", "  !", "  \"", "  #", "  $", "  %", "  &", "  '",
    "  (", "  )", "  *", "  +", "  ,", "  -", "  .", "  /",
    "  0", "  1", "  2", "  3", "  4", "  5", "  6", "  7",
    "  8", "  9", "  :", "  ;", "  <", "  =", "  >", "  ?",
    "  @", "  A", "  B", "  C", "  D", "  E", "  F", "  G",
    "  H", "  I", "  J", "  K", "  L", "  M", "  N", "  O",
    "  P", "  Q", "  R", "  S", "  T", "  U", "  V", "  W",
    "  X", "  Y", "  Z", "  [", "  \\", "  ]", "  ^", "  _",
    "  `", "  a", "  b", "  c", "  d", "  e", "  f", "  g",
    "  h", "  i", "  j", "  k", "  l", "  m", "  n", "  o",
    "  p", "  q", "  r", "  s", "  t", "  u", "  v", "  w",
    "  x", "  y", "  z", "  {", "  |", "  }", "  ~", "del"
};

/*
 * Output function for the named-character format: print a space and
 * then the three-character name of the byte at `index'.
 */
static void
do_ascii(buffer_t *buf, int index)
{
    const uint8_t byte = get_u8(buf, index);

    /* only the low seven bits select the name; the top bit is dropped */
    (void) printf(" %s", ascii[byte & 0x7f]);
}

/* template for the named-character format: one byte per item */
static output_t output_ascii = {
    1, do_ascii, NULL
};

/*
 * Output function for the character format.  Prints the byte at
 * `index' in a four-column field as one of:
 *
 *   - the (possibly multibyte) character itself, when it is printable
 *     in the current locale;
 *   - "**" for each trailing byte of a printable multibyte character;
 *   - one of the escapes \0 \b \f \n \r \t;
 *   - a three-digit octal value for anything else.
 *
 * The function keeps state between calls (nresid, printable) because a
 * multibyte character is decoded when its first byte is seen but the
 * remaining bytes are presented by the following calls.
 */
static void
do_char(buffer_t *buf, int index)
{
    static int  nresid = 0;     /* trailing bytes still to show */
    static int  printable = 0;  /* was that character printable? */
    int     cnt;
    int     avail;
    int     nb;
    char        scratch[10];
    wchar_t     wc;
    int     which;

    uint8_t v = get_u8(buf, index);

    /*
     * If there were residual bytes from an earlier
     * character, then just display the ** continuation
     * indication.
     */
    if (nresid) {
        if (printable) {
            (void) fputs("  **", stdout);
        } else {
            (void) printf(" %03o", v);
        }
        nresid--;
        return;
    }

    /*
     * Peek ahead up to MB_CUR_MAX characters.  This has to be
     * done carefully because we might need to look into the next
     * block to really know for sure.
     *
     * The count is also capped so that the bytes plus the
     * terminating NUL stored below always fit in scratch, whatever
     * MB_CUR_MAX is for the current locale.
     */
    scratch[0] = v;
    avail = buf->navail;
    if ((size_t)avail > MB_CUR_MAX)
        avail = (int)MB_CUR_MAX;
    if (avail > (int)sizeof (scratch) - 1)
        avail = (int)sizeof (scratch) - 1;
    for (cnt = 1, which = index + 1; cnt < avail; cnt++, which++) {
        scratch[cnt] = buf->data[which & buf->mask];
    }

    /* now see if the value is a real character */
    nresid = 0;
    wc = 0;
    nb = mbtowc(&wc, scratch, avail);
    if (nb < 0) {
        /* not a valid character: show the raw byte */
        (void) printf(" %03o", v);
        return;
    }
    if (nb == 0) {
        (void) fputs("  \\0", stdout);
        return;
    }
    /* the remaining nb - 1 bytes are handled by the following calls */
    nresid = nb - 1;
    if (nb && iswprint(wc)) {
        scratch[nb] = 0;
        (void) fputs("   ", stdout);
        (void) fputs(scratch, stdout);
        printable = 1;
        return;
    }
    printable = 0;
    if (wc == 0) {
        (void) fputs("  \\0", stdout);
    } else if (wc == '\b') {
        (void) fputs("  \\b", stdout);
    } else if (wc == '\f') {
        (void) fputs("  \\f", stdout);
    } else if (wc == '\n') {
        (void) fputs("  \\n", stdout);
    } else if (wc == '\r') {
        (void) fputs("  \\r", stdout);
    } else if (wc == '\t') {
        (void) fputs("  \\t", stdout);
    } else {
        (void) printf(" %03o", v);
    }
}

/* template for the character format: one byte per item */
static output_t output_char = {
    1, do_char,
};

/*
 * The output formats selected so far, kept as a singly linked list.
 * tailp always addresses the pointer that the next node must be
 * stored through, so appending needs no list walk.
 */
static output_t *head = NULL;
static output_t **tailp = &head;

/*
 * Append a private copy of the format template `src' to the list and
 * recompute the block geometry: lcm becomes the least common multiple
 * of all selected widths, and blocksize is lcm doubled until it is at
 * least 16.  Exits via err() if the copy cannot be allocated.
 */
static void
add_out(output_t *src)
{
    output_t    *node = calloc(1, sizeof (*src));
    int     multiple;

    if (node == NULL) {
        err(1, "malloc");
    }

    /* step through multiples of lcm until one is divisible by width */
    for (multiple = lcm; (multiple % src->width) != 0; multiple += lcm)
        ;
    lcm = multiple;

    for (blocksize = lcm; blocksize < 16; blocksize *= 2)
        ;

    *node = *src;
    *tailp = node;
    tailp = &node->next;
}

/*
 * Advance to the next file operand that can be opened.
 *
 * Starting at files[curfile], each name is opened for reading --
 * reusing the current stream through freopen() when there is one --
 * until an open succeeds.  Names that cannot be opened are reported
 * with warn() and skipped.  curfile is left one past the name that was
 * tried last.
 *
 * Returns the opened stream (also stored in `input'), or NULL when no
 * operands remain.
 */
static FILE *
next_input(void)
{
    while (curfile < numfiles) {
        const char *path = files[curfile];

        input = (input == NULL) ?
            fopen(path, "r") :
            freopen(path, "r", input);

        if (input != NULL) {
            curfile++;
            return (input);
        }

        warn("open: %s", path);
        curfile++;
    }

    return (NULL);
}

/*
 * Top up the ring buffer so that, input permitting, at least two
 * blocks of data are available to the consumer.
 *
 * Data is read a block at a time.  A block may be assembled from
 * several files: when one stream reaches end of file (or fails) the
 * next operand is opened and reading continues.  Whatever part of a
 * block cannot be filled -- because input ran out or the -N limit was
 * reached -- is zeroed, and the producer index still advances by a
 * whole block.  Only bytes actually read are counted in navail and
 * charged against `limit'.
 *
 * When the limit reaches zero the input is closed and set to NULL,
 * which ends refilling for good.
 */
static void
refill(buffer_t *b)
{
    size_t n;
    int want;
    int zero;

    /*
     * If we have 2 blocks of bytes available, we're done.  Note
     * that each iteration usually loads up 16 bytes, unless we
     * run out of data.
     */
    while ((input != NULL) && ((size_t)b->navail < (2 * blocksize))) {

        /* we preload the next one in advance */

        if (limit == 0) {
            (void) fclose(input);
            input = NULL;
            continue;
        }

        /* we want to read a whole block if possible */
        want = blocksize;
        if ((limit >= 0) && (want > limit)) {
            want = limit;
        }
        zero = blocksize;

        while (want && input) {
            int c;

            b->prod &= b->mask;
            /*
             * Never read past the physical end of the ring:
             * when the request would wrap, read only the
             * (mask + 1) - prod bytes up to the end and pick
             * up the rest on the next pass.  This is always
             * at least one byte, so a zero return from
             * fread() below really does mean EOF or error.
             */
            c = (b->prod + want > (b->mask + 1)) ?
                (b->mask + 1) - b->prod :
                want;

            n = fread(b->data + b->prod, 1, c, input);
            if (n == 0) {
                /*
                 * fread() returns an unsigned count, so
                 * failure has to be told apart from end
                 * of file with ferror().
                 */
                if (ferror(input)) {
                    warn("read: %s",
                        files ? files[curfile-1] : "stdin");
                }
                input = next_input();
                continue;
            }
            if (limit >= 0)
                limit -= (off_t)n;
            b->navail += (int)n;
            b->prod += (int)n;
            want -= (int)n;
            zero -= (int)n;
        }

        /* pad the unread remainder of the block with zero bytes */
        while (zero) {
            b->data[b->prod & b->mask] = 0;
            b->prod++;
            b->prod &= b->mask;
            zero--;
        }
    }
}

#define STR1    "C1"
#define STR2    "S2"
#ifdef  _LP64
#define STR8    "L8"
#define STR4    "I4"
#else
#define STR8    "8"
#define STR4    "IL4"
#endif

static void
do_type_string(char *typestr)
{
    if (*typestr == 0) {
        errx(1, _("missing type string"));
    }
    while (*typestr) {
        switch (*typestr) {
        case 'a':
            typestr++;
            add_out(&output_ascii);
            break;
        case 'c':
            add_out(&output_char);
            typestr++;
            break;
        case 'f':
            typestr++;
            switch (*typestr) {
            case 'F':
            case '4':
                add_out(&output_float);
                typestr++;
                break;
            case '8':
            case 'D':
                add_out(&output_double);
                typestr++;
                break;
            case 'L':
                add_out(&output_ldouble);
                typestr++;
                break;
            default:
                add_out(&output_float);
                break;
            }
            break;


        case 'd':
            typestr++;
            if (strchr(STR1, *typestr)) {
                typestr++;
                add_out(&output_sig_b);
            } else if (strchr(STR2, *typestr)) {
                typestr++;
                add_out(&output_sig_w);
            } else if (strchr(STR4, *typestr)) {
                typestr++;
                add_out(&output_sig_d);
            } else if (strchr(STR8, *typestr)) {
                typestr++;
                add_out(&output_sig_q);
            } else {
                add_out(&output_sig_d);
            }
            break;

        case 'u':
            typestr++;
            if (strchr(STR1, *typestr)) {
                typestr++;
                add_out(&output_dec_b);
            } else if (strchr(STR2, *typestr)) {
                typestr++;
                add_out(&output_dec_w);
            } else if (strchr(STR4, *typestr)) {
                typestr++;
                add_out(&output_dec_d);
            } else if (strchr(STR8, *typestr)) {
                typestr++;
                add_out(&output_dec_q);
            } else {
                add_out(&output_dec_d);
            }
            break;

        case 'o':
            typestr++;
            if (strchr(STR1, *typestr)) {
                typestr++;
                add_out(&output_oct_b);
            } else if (strchr(STR2, *typestr)) {
                typestr++;
                add_out(&output_oct_w);
            } else if (strchr(STR4, *typestr)) {
                typestr++;
                add_out(&output_oct_d);
            } else if (strchr(STR8, *typestr)) {
                typestr++;
                add_out(&output_oct_q);
            } else {
                add_out(&output_oct_d);
            }
            break;

        case 'x':
            typestr++;
            if (strchr(STR1, *typestr)) {
                typestr++;
                add_out(&output_hex_b);
            } else if (strchr(STR2, *typestr)) {
                typestr++;
                add_out(&output_hex_w);
            } else if (strchr(STR4, *typestr)) {
                typestr++;
                add_out(&output_hex_d);
            } else if (strchr(STR8, *typestr)) {
                typestr++;
                add_out(&output_hex_q);
            } else {
                add_out(&output_hex_d);
            }
            break;

        default:
            errx(1, _("unrecognized type string character: %c"),
                *typestr);
            exit(1);
        }
    }
}

int
main(int argc, char **argv)
{
    int     c;
    int     i;
    buffer_t    buffer;
    boolean_t   first = B_TRUE;
    boolean_t   doall = B_FALSE;
    boolean_t   same = B_FALSE;
    boolean_t   newarg = B_FALSE;
    off_t       offset = 0;
    off_t       skip = 0;
    char        *eptr;
    char        *offstr = 0;

    input = stdin;

    (void) setlocale(LC_ALL, "");
    (void) textdomain(TEXT_DOMAIN);

    while ((c = getopt(argc, argv, "A:bCcdDfFj:N:oOsSxXvt:")) != EOF) {
        switch (c) {
        case 'A':
            newarg = B_TRUE;
            if (strlen(optarg) > 1) {
                afmt = NULL;
            }
            switch (*optarg) {
            case 'o':
                afmt = "%07llo";
                cfmt = "       ";
                break;
            case 'd':
                afmt = "%07lld";
                cfmt = "       ";
                break;
            case 'x':
                afmt = "%07llx";
                cfmt = "       ";
                break;
            case 'n':
                /*
                 * You could argue that the code should
                 * use the same 7 spaces.  Legacy uses 8
                 * though.  Oh well.  Better to avoid
                 * gratuitous change.
                 */
                afmt = "        ";
                cfmt = "        ";
                break;
            default:
                afmt = NULL;
                break;
            }
            if (strlen(optarg) != 1) {
                afmt = NULL;
            }
            if (afmt == NULL)
                warnx(_("invalid address base, "
                    "must be o, d, x, or n"));
            break;

        case 'b':
            add_out(&output_oct_b);
            break;

        case 'c':
        case 'C':
            add_out(&output_char);
            break;

        case 'f':
            add_out(&output_float);
            break;

        case 'F':
            add_out(&output_double);
            break;

        case 'd':
            add_out(&output_dec_w);
            break;

        case 'D':
            add_out(&output_dec_d);
            break;

        case 't':
            newarg = B_TRUE;
            do_type_string(optarg);
            break;

        case 'o':
            add_out(&output_oct_w);
            break;

        case 'O':
            add_out(&output_oct_d);
            break;

        case 's':
            add_out(&output_sig_w);
            break;

        case 'S':
            add_out(&output_sig_d);
            break;

        case 'x':
            add_out(&output_hex_w);
            break;

        case 'X':
            add_out(&output_hex_d);
            break;

        case 'v':
            doall = B_TRUE;
            break;

        case 'j':
            newarg = B_TRUE;
            skip = strtoll(optarg, &eptr, 0);
            if (*eptr == 'b') {
                skip <<= 9; /* 512 bytes */
                eptr++;
            } else if (*eptr == 'k') {
                skip <<= 10;    /* 1k */
                eptr++;
            } else if (*eptr == 'm') {
                skip <<= 20;    /* 1m */
                eptr++;
            } else if (*eptr == 'g') {
                skip <<= 30;    /* 1g */
                eptr++;
            }
            if ((skip < 0) || (eptr[0] != 0)) {
                warnx(_("invalid skip count '%s' specified"),
                    optarg);
                exit(1);
            }
            break;

        case 'N':
            newarg = B_TRUE;
            limit = strtoll(optarg, &eptr, 0);
            /*
             * POSIX doesn't specify this, but I think these
             * may be helpful.
             */
            if (*eptr == 'b') {
                limit <<= 9;
                eptr++;
            } else if (*eptr == 'k') {
                limit <<= 10;
                eptr++;
            } else if (*eptr == 'm') {
                limit <<= 20;
                eptr++;
            } else if (*eptr == 'g') {
                limit <<= 30;
                eptr++;
            }
            if ((limit < 0) || (eptr[0] != 0)) {
                warnx(_("invalid byte count '%s' specified"),
                    optarg);
                exit(1);
            }
            break;

        default:
            usage();
            break;
        }
    }

    /* this finds the smallest power of two size we can use */
    buffer.mask = (1 << (ffs(blocksize * 3) + 1)) - 1;
    buffer.data = memalign(16, buffer.mask + 1);
    if (buffer.data == NULL) {
        err(1, "memalign");
    }


    /*
     * Wow.  This option parsing is hideous.
     *
     * If the we've not seen a new option, and there is just one
     * operand, if it starts with a "+", then treat it as an
     * offset.  Otherwise if two operands, and the second operand
     * starts with + or a digit, then it is an offset.
     */
    if (!newarg) {
        if (((argc - optind) == 1) && (argv[optind][0] == '+')) {
            offstr = argv[optind];
            argc--;
        } else if (((argc - optind) == 2) &&
            (strchr("+0123456789", (argv[optind + 1][0])) != NULL)) {
            offstr = argv[optind + 1];
            argc--;
        }
    }
    if (offstr) {
        int base = 0;
        int mult = 1;
        int l;
        if (*offstr == '+') {
            offstr++;
        }
        l = strlen(offstr);
        if ((strncmp(offstr, "0x", 2) == 0)) {
            afmt = "%07llx";
            base = 16;
            offstr += 2;
            if (offstr[l - 1] == 'B') {
                offstr[l - 1] = 0;
                l--;
                mult = 512;
            }
        } else {
            base = 8;
            afmt = "%07llo";
            if ((offstr[l - 1] == 'B') || (offstr[l - 1] == 'b')) {
                offstr[l - 1] = 0;
                l--;
                mult = 512;
            }
            if (offstr[l - 1] == '.') {
                offstr[l - 1] = 0;
                base = 10;
                afmt = "%07lld";
            }
        }
        skip = strtoll(offstr, &eptr, base);
        if (*eptr != '\0') {
            errx(1, _("invalid offset string specified"));
        }
        skip *= mult;
        offset += skip;
    }

    /*
     * Allocate an array for all the input files.
     */
    if (argc > optind) {
        files = calloc(sizeof (char *), argc - optind);
        for (i = 0; i < argc - optind; i++) {
            files[i] = argv[optind + i];
            numfiles++;
        }
        input = next_input();
    } else {
        input = stdin;
    }

    /*
     * We need to seek ahead.  fseek would be faster.
     */
    while (skip && (input != NULL)) {
        struct stat sbuf;

        /*
         * Only fseek() on regular files.  (Others
         * we have to read().
         */
        if (fstat(fileno(input), &sbuf) < 0) {
            warn("fstat: %s", files[curfile-1]);
            input = next_input();
            continue;
        }
        if (S_ISREG(sbuf.st_mode)) {
            /*
             * No point in seeking a file that is too
             * short to begin with.
             */
            if (sbuf.st_size < skip) {
                skip -= sbuf.st_size;
                input = next_input();
                continue;
            }
            if (fseeko(input, skip, SEEK_SET) < 0) {
                err(1, "fseek:%s", files[curfile-1]);
            }
            /* Done seeking. */
            skip = 0;
            break;
        }

        /*
         * fgetc seems like it would be slow, but it uses
         * buffered I/O, so it should be fast enough.
         */
        flockfile(input);
        while (skip) {
            if (getc_unlocked(input) == EOF) {
                funlockfile(input);
                if (ferror(input)) {
                    warn("read: %s", files[curfile-1]);
                }
                input = next_input();
                if (input != NULL) {
                    flockfile(input);
                }
                break;
            }
            skip--;
        }
        if (input != NULL)
            funlockfile(input);
    }

    if (head == NULL) {
        add_out(&output_oct_w);
    }

    buffer.navail = 0;
    buffer.prod = 0;
    buffer.cons = 0;

    for (refill(&buffer); buffer.navail > 0; refill(&buffer)) {
        output_t *out;
        int mx;
        int j, k;

        /*
         * If this buffer was the same as last, then just
         * dump an asterisk.
         */
        if ((!first) && (buffer.navail >= blocksize) && (!doall)) {
            j = buffer.cons;
            k = j - blocksize;
            for (i = 0; i < blocksize; i++) {
                if (buffer.data[j & buffer.mask] !=
                    buffer.data[k & buffer.mask]) {
                    break;
                }
                j++;
                k++;
            }
            if (i == blocksize) {
                if (!same) {
                    (void) fputs("*\n", stdout);
                    same = B_TRUE;
                }
                buffer.navail -= blocksize;
                offset += blocksize;
                buffer.cons += blocksize;
                buffer.cons &= buffer.mask;
                continue;
            }
        }

        first = B_FALSE;
        same = B_FALSE;
        mx = (buffer.navail > blocksize) ? blocksize : buffer.navail;

        for (out = head; out != NULL; out = out->next) {

            if (out == head) {
                /*LINTED E_SEC_PRINTF_VAR_FMT*/
                (void) printf(afmt, offset);
            } else {
                (void) fputs(cfmt, stdout);
            }
            for (i = 0, j = buffer.cons; i < mx; i += out->width) {
                out->func(&buffer, j);
                j += out->width;
                j &= buffer.mask;
            }
            (void) fputs("\n", stdout);
        }
        buffer.cons += mx;
        buffer.cons &= buffer.mask;
        offset += mx;
        buffer.navail -= mx;
    }
    /*LINTED E_SEC_PRINTF_VAR_FMT*/
    (void) printf(afmt, offset);
    (void) fputs("\n", stdout);
    return (0);
}