/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <strings.h>
#include <ctype.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <limits.h>
#include <dtrace.h>
#include <sys/lockstat.h>
#include <alloca.h>
#include <signal.h>
#include <assert.h>
/*
 * Per-event data record collected from DTrace.
 * NOTE(review): the member list is missing from this excerpt -- as written
 * the struct is empty, so sizeof (lsrec_t) is meaningless; the full source
 * declares fields (lock address, caller, count, time, stack frames, etc.).
 * Restore from the complete file before building.
 */
typedef struct lsrec {
} lsrec_t;
/*
 * State threaded through the dtrace_consume()/aggregate-walk callbacks.
 * NOTE(review): members are also missing from this excerpt -- confirm
 * against the full source.
 */
typedef struct lsdata {
} lsdata_t;
/*
* Definitions for the types of experiments which can be run. They are
* listed in increasing order of memory cost and processing time cost.
* The numerical value of each type is the number of bytes needed per record.
*/
/* Symbol-table setup, provided by a sibling source file. */
extern int symtab_init(void);
extern char *strtok_r(char *, const char *, char **);
/* Requested stack-trace depth (-s option). */
static int g_stkdepth;
/* Command-line option flags; each is set by the matching getopt case. */
static int g_rates = 0;		/* -R: report rates rather than counts */
static int g_pflag = 0;		/* -p: parsable (awk-friendly) output */
static int g_Pflag = 0;		/* -P: sort by count * avg_time product */
static int g_wflag = 0;		/* -w: coalesce events across callers */
static int g_Wflag = 0;		/* -W: coalesce events across locks */
static int g_cflag = 0;		/* -c: coalesce lock arrays */
static int g_kflag = 0;		/* -k: coalesce PCs within functions */
static int g_gflag = 0;		/* -g: totals per generating function */
static int g_Vflag = 0;		/* -V: dump the generated D program */
static int g_tracing = 0;	/* -T: trace rather than sample events */
/* Number of records actually consumed from the data buffer. */
static int g_nrecs_used;
/* Accumulated D predicates for normal and interrupt probes. */
static char *g_predicate;
static char *g_ipredicate;
/* The generated D program text and its current length. */
static char *g_prog;
static int g_proglen;
/* Count of records dropped by DTrace (reported at exit). */
static int g_dropped;
/*
 * Table describing every lockstat event: its class ('C'ontention, 'H'old,
 * 'I'nterrupt, 'E'rror), display strings, units, the DTrace probe that
 * implements it, an optional probe predicate, and (for hold events) the
 * matching acquire probe.
 * NOTE(review): this excerpt is truncated -- the struct declaration below
 * is missing several members (name, units, probe specs), its closing brace,
 * and the "} ls_event_info_t; static ls_event_info_t g_event_info[] = {"
 * lines that should precede the initializer list.  Restore from the full
 * source before building.
 */
typedef struct ls_event_info {
char ev_type;
char *ev_predicate;
char *ev_acquire;
{ 'C', "Lock", "Adaptive mutex spin", "nsec",
"lockstat:::adaptive-spin" },
{ 'C', "Lock", "Adaptive mutex block", "nsec",
"lockstat:::adaptive-block" },
{ 'C', "Lock", "Spin lock spin", "nsec",
"lockstat:::spin-spin" },
{ 'C', "Lock", "Thread lock spin", "nsec",
"lockstat:::thread-spin" },
{ 'C', "Lock", "R/W writer blocked by writer", "nsec",
"lockstat:::rw-block", "arg2 == 0 && arg3 == 1" },
{ 'C', "Lock", "R/W writer blocked by readers", "nsec",
"lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" },
{ 'C', "Lock", "R/W reader blocked by writer", "nsec",
"lockstat:::rw-block", "arg2 != 0 && arg3 == 1" },
{ 'C', "Lock", "R/W reader blocked by write wanted", "nsec",
"lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" },
{ 'C', "Lock", "Unknown event (type 8)", "units" },
{ 'C', "Lock", "Unknown event (type 9)", "units" },
{ 'C', "Lock", "Unknown event (type 10)", "units" },
{ 'C', "Lock", "Unknown event (type 11)", "units" },
{ 'C', "Lock", "Unknown event (type 12)", "units" },
{ 'C', "Lock", "Unknown event (type 13)", "units" },
{ 'C', "Lock", "Unknown event (type 14)", "units" },
{ 'C', "Lock", "Unknown event (type 15)", "units" },
{ 'C', "Lock", "Unknown event (type 16)", "units" },
{ 'C', "Lock", "Unknown event (type 17)", "units" },
{ 'C', "Lock", "Unknown event (type 18)", "units" },
{ 'C', "Lock", "Unknown event (type 19)", "units" },
{ 'C', "Lock", "Unknown event (type 20)", "units" },
{ 'C', "Lock", "Unknown event (type 21)", "units" },
{ 'C', "Lock", "Unknown event (type 22)", "units" },
{ 'C', "Lock", "Unknown event (type 23)", "units" },
{ 'C', "Lock", "Unknown event (type 24)", "units" },
{ 'C', "Lock", "Unknown event (type 25)", "units" },
{ 'C', "Lock", "Unknown event (type 26)", "units" },
{ 'C', "Lock", "Unknown event (type 27)", "units" },
{ 'C', "Lock", "Unknown event (type 28)", "units" },
{ 'C', "Lock", "Unknown event (type 29)", "units" },
{ 'C', "Lock", "Unknown event (type 30)", "units" },
{ 'C', "Lock", "Unknown event (type 31)", "units" },
{ 'H', "Lock", "Adaptive mutex hold", "nsec",
"lockstat:::adaptive-release", NULL,
"lockstat:::adaptive-acquire" },
{ 'H', "Lock", "Spin lock hold", "nsec",
"lockstat:::spin-release", NULL,
"lockstat:::spin-acquire" },
{ 'H', "Lock", "R/W writer hold", "nsec",
"lockstat:::rw-release", "arg1 == 0",
"lockstat:::rw-acquire" },
{ 'H', "Lock", "R/W reader hold", "nsec",
"lockstat:::rw-release", "arg1 != 0",
"lockstat:::rw-acquire" },
{ 'H', "Lock", "Unknown event (type 36)", "units" },
{ 'H', "Lock", "Unknown event (type 37)", "units" },
{ 'H', "Lock", "Unknown event (type 38)", "units" },
{ 'H', "Lock", "Unknown event (type 39)", "units" },
{ 'H', "Lock", "Unknown event (type 40)", "units" },
{ 'H', "Lock", "Unknown event (type 41)", "units" },
{ 'H', "Lock", "Unknown event (type 42)", "units" },
{ 'H', "Lock", "Unknown event (type 43)", "units" },
{ 'H', "Lock", "Unknown event (type 44)", "units" },
{ 'H', "Lock", "Unknown event (type 45)", "units" },
{ 'H', "Lock", "Unknown event (type 46)", "units" },
{ 'H', "Lock", "Unknown event (type 47)", "units" },
{ 'H', "Lock", "Unknown event (type 48)", "units" },
{ 'H', "Lock", "Unknown event (type 49)", "units" },
{ 'H', "Lock", "Unknown event (type 50)", "units" },
{ 'H', "Lock", "Unknown event (type 51)", "units" },
{ 'H', "Lock", "Unknown event (type 52)", "units" },
{ 'H', "Lock", "Unknown event (type 53)", "units" },
{ 'H', "Lock", "Unknown event (type 54)", "units" },
{ 'H', "Lock", "Unknown event (type 55)", "units" },
{ 'I', "CPU+PIL", "Profiling interrupt", "nsec",
"profile:::profile-97", NULL },
{ 'I', "Lock", "Unknown event (type 57)", "units" },
{ 'I', "Lock", "Unknown event (type 58)", "units" },
{ 'I', "Lock", "Unknown event (type 59)", "units" },
/* NOTE(review): entry below is missing its closing "}," -- truncated. */
{ 'E', "Lock", "Recursive lock entry detected", "(N/A)",
{ 'E', "Lock", "Lockstat enter failure", "(N/A)" },
{ 'E', "Lock", "Lockstat exit failure", "nsec" },
{ 'E', "Lock", "Lockstat record failure", "(N/A)" },
};
/*
 * Fatal-error helper: report the error (message emission is missing from
 * this excerpt) and exit with status 2.  NOTE(review): the function name,
 * parameter list (presumably including do_perror and a printf-style format)
 * and the fprintf/perror calls have been stripped -- restore from the full
 * source.
 */
static void
{
if (do_perror)
exit(2);
}
/*
 * DTrace-failure helper: exits with status 2.  NOTE(review): name,
 * parameters and the message/dtrace_errmsg() reporting are missing from
 * this excerpt -- presumably this is dfail(), used throughout below.
 */
static void
{
exit(2);
}
/*
 * Print the range of event IDs belonging to one event class for the usage
 * message: scan g_event_info[] for the first and last matching entries.
 * NOTE(review): the function name, parameters, the match test inside the
 * loop and the fprintf consuming the format string below are missing from
 * this excerpt.
 */
static void
{
for (i = 0; i < LS_MAX_EVENTS; i++) {
continue;
if (first == -1)
first = i;
last = i;
}
"\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n",
}
/*
 * Print the command-line usage summary to stderr and exit(1).
 * NOTE(review): the fprintf(stderr, ...) call wrapping this format string,
 * and the trailing arguments supplying the default rate and record count
 * for the %d conversions, are missing from this excerpt.
 */
static void
usage(void)
{
"Usage: lockstat [options] command [args]\n"
"\nEvent selection options:\n\n"
" -C watch contention events [on by default]\n"
" -E watch error events [off by default]\n"
" -H watch hold events [off by default]\n"
" -I watch interrupt events [off by default]\n"
" -A watch all lock events [equivalent to -CH]\n"
" -e event_list only watch the specified events (shown below);\n"
" <event_list> is a comma-separated list of\n"
" events or ranges of events, e.g. 1,4-7,35\n"
" -i rate interrupt rate for -I [default: %d Hz]\n"
"\nData gathering options:\n\n"
" -b basic statistics (lock, caller, event count)\n"
" -t timing for all events [default]\n"
" -h histograms for event times\n"
" -s depth stack traces <depth> deep\n"
" -x opt[=val] enable or modify DTrace options\n"
"\nData filtering options:\n\n"
" -n nrecords maximum number of data records [default: %d]\n"
" -l lock[,size] only watch <lock>, which can be specified as a\n"
" symbolic name or hex address; <size> defaults\n"
" to the ELF symbol size if available, 1 if not\n"
" -f func[,size] only watch events generated by <func>\n"
" -d duration only watch events longer than <duration>\n"
" -T trace (rather than sample) events\n"
"\nData reporting options:\n\n"
" -c coalesce lock data for arrays like pse_mutex[]\n"
" -k coalesce PCs within functions\n"
" -g show total events generated by function\n"
" -w wherever: don't distinguish events by caller\n"
" -W whichever: don't distinguish events by lock\n"
" -R display rates rather than counts\n"
" -p parsable output format (awk(1)-friendly)\n"
" -P sort lock data by (count * avg_time) product\n"
" -D n only display top <n> events of each type\n"
" -o filename send output to <filename>\n",
exit(1);
}
/*
 * qsort-style comparator over two lsrec_t pointers: compares event, then
 * the stack frames from deepest to shallowest, then (presumably) caller
 * and lock.  Returns -1/0/1.  NOTE(review): the function name, parameters
 * and every comparison expression guarding these returns are missing from
 * this excerpt -- only the control skeleton remains.
 */
static int
{
int i;
return (-1);
return (1);
for (i = g_stkdepth - 1; i >= 0; i--) {
return (-1);
return (1);
}
return (-1);
return (1);
return (-1);
return (1);
return (0);
}
/*
 * Comparator skeleton (two-way, never returns equality as written).
 * NOTE(review): name, parameters and the comparison conditions are missing
 * from this excerpt.
 */
static int
{
return (-1);
return (1);
}
/*
 * Comparator skeleton with a tie-breaking secondary key (note the reversed
 * -1/1 ordering of the middle pair, suggesting a descending secondary
 * sort).  NOTE(review): name, parameters and conditions are missing from
 * this excerpt.
 */
static int
{
return (-1);
return (1);
return (1);
return (-1);
return (0);
}
/*
 * Comparator skeleton with primary and secondary keys.  NOTE(review):
 * name, parameters and comparison conditions are missing from this
 * excerpt.
 */
static int
{
return (-1);
return (1);
return (-1);
return (1);
return (0);
}
/*
 * Comparator skeleton; as written it has no equality path and falls off
 * the end of a non-void function -- a symptom of truncation, not a real
 * defect to fix here.  NOTE(review): restore from the full source.
 */
static int
{
return (-1);
return (1);
return (-1);
return (1);
}
/*
 * Comparator skeleton: event key, then stack frames deepest-first, then a
 * final key, with an equality path.  NOTE(review): name, parameters and
 * all comparison conditions are missing from this excerpt.
 */
static int
{
int i;
return (-1);
return (1);
for (i = g_stkdepth - 1; i >= 0; i--) {
return (-1);
return (1);
}
return (-1);
return (1);
return (0);
}
/*
 * Comparator skeleton: event key, stack frames deepest-first, then a final
 * two-way key with no equality path (falls off the end as written --
 * another truncation symptom).  NOTE(review): restore from the full
 * source.
 */
static int
{
int i;
return (-1);
return (1);
for (i = g_stkdepth - 1; i >= 0; i--) {
return (-1);
return (1);
}
return (-1);
return (1);
}
/*
 * Recursive merge sort over an array of record pointers using scratch
 * buffer b: split at the midpoint, sort each half (the recursive calls are
 * missing from this excerpt), copy the first half forward and the second
 * half reversed into b, then merge back into a with comparator cmp.  The
 * reversed second half lets the merge loop run with a single i < j test
 * (a classic "bitonic" merge trick).  NOTE(review): the function name,
 * parameters (cmp, a, b, n) and the recursive calls under the two if
 * statements are missing.
 */
static void
{
int m = n / 2;
int i, j;
if (m > 1)
if (n - m > 1)
for (i = m; i > 0; i--)
b[i - 1] = a[i - 1];
for (j = m - 1; j < n - 1; j++)
b[n + m - j - 2] = a[j + 1];
while (i < j)
*a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--];
*a = b[i];
}
/*
 * Walk a sorted record array and merge adjacent records that compare
 * equal; the inner 64-iteration loop presumably sums the per-bucket
 * histogram counts (64 matches the quantize bucket layout used elsewhere).
 * NOTE(review): name, parameters, the equality tests and the merge
 * statements are missing from this excerpt.
 */
static void
{
int i, j;
for (i = 1; i < n; i++) {
continue;
}
continue;
continue;
for (j = 0; j < 64; j++)
}
}
/*
 * NOTE(review): entire body missing from this excerpt -- only the empty
 * shell remains.  Restore from the full source.
 */
static void
{
}
/*
 * Append a clause to a dynamically grown predicate string (pred is
 * presumably a char ** given the *pred[0] dereferences): allocate a larger
 * buffer ("new"), combine the existing predicate with the added term using
 * and/or logic, and install it.  NOTE(review): the function name,
 * parameters, allocation and sprintf logic are missing from this excerpt;
 * only the branch skeleton remains.
 */
static void
{
char *new;
return;
*pred[0] = '\0';
}
if (*pred[0] != '\0') {
} else {
}
} else {
} else {
}
}
}
/*
 * NOTE(review): entire body missing from this excerpt -- only the empty
 * shell remains.  Restore from the full source.
 */
static void
{
}
/*
 * Append a term to a filter string (filt is presumably a char **, given
 * the *filt[0] write).  NOTE(review): name, parameters and nearly all of
 * the body are missing from this excerpt.
 */
static void
{
*filt[0] = '\0';
}
}
/*
 * NOTE(review): entire body missing from this excerpt -- only the empty
 * shell remains.  Restore from the full source.
 */
static void
{
}
/*
 * Append formatted text to the generated D program buffer (g_prog /
 * g_proglen): when the program is empty start at offset 0, otherwise
 * append at the current length.  NOTE(review): the function name,
 * varargs parameters, the vsnprintf/realloc machinery and the offs
 * computation in the else branch are missing from this excerpt.
 */
static void
{
char c;
if (g_proglen == 0) {
offs = 0;
} else {
}
}
/*
* This function may read like an open sewer, but keep in mind that programs
* that generate other programs are rarely pretty. If one has the unenviable
* task of maintaining or -- worse -- extending this code, use the -V option
* to examine the D program as generated by this function.
*/
/*
 * Generate the D clauses for one enabled event: probe description,
 * predicate, and either a trace() action (-T) or @avg/@hist aggregations.
 * Hold events additionally get an acquire clause that timestamps the
 * acquisition; the recursive-entry error event gets a clause that
 * decrements its thread-local count.  NOTE(review): the function name,
 * parameters, and most of the dprog_add() format calls are missing from
 * this excerpt -- only the skeleton and original commentary remain.
 */
static void
{
int depth;
return;
/*
 * For interrupt events, arg0 (normally the lock pointer) is
 * the CPU address plus the current pil, and arg1 (normally
 * the number of nanoseconds) is the number of nanoseconds
 * late -- and it's stored in arg2.
 */
arg0 = "(uintptr_t)curthread->t_cpu + \n"
"\t curthread->t_cpu->cpu_profile_pil";
caller = "(uintptr_t)arg0";
arg1 = "arg2";
} else {
arg0 = "(uintptr_t)arg0";
caller = "caller";
}
continue;
if (g_tracing) {
} else {
}
} else {
}
/*
 * If this is a hold event, we need to generate an additional
 * clause for the acquire; the clause for the release will be
 * generated with the aggregating statement, below.
 */
dprog_add("{\n");
} else {
/*
 * If this isn't a hold event, it's the recursive
 * error event. For this, we simply bump the
 * thread-local, per-lock count.
 */
}
dprog_add("}\n\n");
/*
 * If this is the recursive lock error event, we need
 * to generate an additional clause to decrement the
 * thread-local, per-lock count. This assures that we
 * only execute the aggregating clause if we have
 * recursive entry.
 */
}
"self->ev%d[(uintptr_t)arg0]", event);
}
} else {
else
}
dprog_add("{\n");
if (g_tracing) {
} else {
/*
 * The ordering here is important: when we process the
 * aggregate, we count on the fact that @avg appears before
 * @hist in program order to assure that @avg is assigned the
 * first aggregation variable ID and @hist assigned the
 * second; see the comment in process_aggregate() for details.
 */
dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n",
dprog_add("\t@hist[%dULL, %s, %s%s] = quantize"
}
}
dprog_add("}\n\n");
}
/*
 * Compile the generated D program, enable its probes and start tracing;
 * with -V the program text is dumped first.  Each step dfail()s on error.
 * NOTE(review): the function name and the dtrace_program_strcompile /
 * dtrace_program_exec / dtrace_go calls guarding these dfail()s are
 * missing from this excerpt.
 */
static void
{
if (g_Vflag) {
}
dfail("failed to compile program");
dfail("failed to enable probes");
dfail("couldn't start tracing");
}
/*
 * Periodic-timer callback: deliberately a no-op.  Its only job is
 * (presumably) to interrupt a blocking call so the main loop can call
 * status_check() -- NOTE(review): the signal/timer wiring lives in
 * status_init() and main(), which are truncated in this excerpt; confirm
 * against the full source.
 */
static void
status_fire(void)
{
}
/*
 * Set up the periodic status timer: read the 'statusrate' (and aggregation
 * rate) options from DTrace, then create and arm a CLOCK_REALTIME timer at
 * the more frequent of the two rates.  NOTE(review): the dtrace_getopt(),
 * sigevent setup, timer_create() and timer_settime() calls guarding these
 * dfail()s are missing from this excerpt.
 */
static void
status_init(void)
{
dfail("failed to get 'statusrate'");
dfail("failed to get 'statusrate'");
/*
 * We would want to awaken at a rate that is the GCD of the statusrate
 * and the aggrate -- but that seems a bit absurd. Instead, we'll
 * simply awaken at a rate that is the more frequent of the two, which
 * assures that we're never later than the interval implied by the
 * more frequent rate.
 */
dfail("cannot create CLOCK_REALTIME timer");
dfail("cannot set time on CLOCK_REALTIME timer");
}
/*
 * Periodic status poll: snapshot the aggregate and query dtrace_status(),
 * dfail()ing if either call errors.  NOTE(review): the
 * dtrace_aggregate_snap() and dtrace_status() calls guarding these
 * dfail()s are missing from this excerpt.
 */
static void
status_check(void)
{
dfail("failed to snap aggregate");
dfail("dtrace_status()");
}
/*
 * Decode one DTrace record group into an lsrec_t: first record is the
 * event ID, second the lock address, third the caller, and (when stack
 * tracing) the remaining frames follow.  Each record's size is validated
 * before use.  NOTE(review): the function name, parameters, size checks
 * and the assignments after each "rec++" are missing from this excerpt.
 */
static void
{
fail(0, "truncated DTrace record");
fail(0, "bad event size in first record");
/* LINTED - alignment */
rec++;
fail(0, "bad lock address size in second record");
/* LINTED - alignment */
rec++;
fail(0, "bad caller size in third record");
/* LINTED - alignment */
rec++;
int frames, i;
/* LINTED - alignment */
for (i = 1; i < frames; i++)
}
}
/*
 * Aggregate-walk callback that (presumably) just counts records -- the
 * increment of the caller-supplied counter is missing from this excerpt.
 * Always continues the walk.
 */
/*ARGSUSED*/
static int
{
return (DTRACE_AGGWALK_NEXT);
}
/*
 * Aggregate-walk callback that converts one aggregation record into an
 * lsrec_t: @hist records (identified by aggregation variable ID, see the
 * comment below) have their quantize buckets copied into lc_hist; @avg
 * records supply the count/time.  NOTE(review): the function name,
 * parameters, the variable-ID tests and the copy statements are missing
 * from this excerpt.
 */
static int
{
int i, j;
/*
 * Aggregation variable IDs are guaranteed to be generated in program
 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE
 * plus one. As "avg" appears before "hist" in program order, we know
 * that "avg" will be allocated the first aggregation variable ID, and
 * "hist" will be allocated the second aggregation variable ID -- and
 * we therefore use the aggregation variable ID to differentiate the
 * cases.
 */
/*
 * If this is the histogram entry. We'll copy the quantized
 * data into lc_hist, and jump over the rest.
 */
fail(0, "bad variable ID in aggregation record");
DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))
fail(0, "bad quantize size in aggregation record");
/* LINTED - alignment */
for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0;
i < DTRACE_QUANTIZE_NBUCKETS; i++, j++)
goto out;
}
fail(0, "bad avg size in aggregation record");
/* LINTED - alignment */
return (DTRACE_AGGWALK_NEXT);
out:
return (DTRACE_AGGWALK_NEXT);
}
/*
 * dtrace_consume() probe callback for -T tracing mode; always continues
 * consuming.  NOTE(review): name, parameters and the record handling
 * between the two returns are missing from this excerpt.
 */
static int
{
return (DTRACE_CONSUME_NEXT);
return (DTRACE_CONSUME_NEXT);
}
/*
 * Drain collected data into the record buffer: in tracing mode consume the
 * principal buffer, otherwise walk the aggregate with process_aggregate().
 * dfail()s if either operation errors.  NOTE(review): the function name,
 * parameters and the dtrace_consume()/dtrace_aggregate_walk_keyvarsorted()
 * call sites are partially missing from this excerpt.
 */
static int
{
/* LINTED - alignment */
if (g_tracing) {
dfail("failed to consume buffer");
}
process_aggregate, &lsdata) != 0)
dfail("failed to walk aggregate");
}
/*
 * DTrace drop handler: count dropped records in g_dropped (reported to the
 * user later) and tell libdtrace to carry on.
 */
/*ARGSUSED*/
static int
{
g_dropped++;
return (DTRACE_HANDLE_OK);
}
/*
 * lockstat entry point: open libdtrace, parse options, generate and
 * compile the D program, run the target command, then collect, coalesce,
 * sort and display the results.  NOTE(review): this excerpt is severely
 * truncated -- the parameter list (argc/argv), the getopt loop header,
 * most option-argument parsing, and large portions of the data-processing
 * code are missing.  The surviving skeleton is annotated below; restore
 * the full body from the original source before building.
 */
int
{
char *data_buf;
char c;
int status;
int i, j;
int events_specified = 0;
int exec_errno = 0;
long ncpus;
int dynvar = 0;
int err;
fail(0, "cannot open dtrace library: %s",
}
dfail("couldn't establish drop handler");
if (symtab_init() == -1)
/* Option parsing: each case corresponds to a flag documented in usage(). */
switch (c) {
case 'b':
break;
case 't':
break;
case 'h':
break;
case 's':
usage();
if (g_stkdepth > LS_MAX_STACK_DEPTH)
fail(0, "max stack depth is %d",
break;
case 'n':
usage();
break;
case 'd':
usage();
/*
 * XXX -- durations really should be per event
 * since the units are different, but it's hard
 * to express this nicely in the interface.
 * Not clear yet what the cleanest solution is.
 */
for (i = 0; i < LS_MAX_EVENTS; i++)
g_min_duration[i] = duration;
break;
case 'i':
usage();
if (i <= 0)
usage();
if (i > MAX_HZ)
for (j = 0; j < LS_MAX_EVENTS; j++)
"Profiling interrupt") == 0)
break;
"profile:::profile-%d", i);
break;
case 'l':
case 'f':
if (addrp[0] == '0') {
} else {
if (size == 0)
size = 1;
}
if (c == 'l') {
} else {
}
break;
case 'e':
while (evp) {
char *evp2;
fail(0, "-e events out of range");
g_enabled[i] = 1;
}
events_specified = 1;
break;
case 'c':
g_cflag = 1;
break;
case 'k':
g_kflag = 1;
break;
case 'w':
g_wflag = 1;
break;
case 'W':
g_Wflag = 1;
break;
case 'g':
g_gflag = 1;
break;
case 'C':
case 'E':
case 'H':
case 'I':
/* Enable every event whose class letter matches the option. */
for (i = 0; i < LS_MAX_EVENTS; i++)
if (g_event_info[i].ev_type == c)
g_enabled[i] = 1;
events_specified = 1;
break;
case 'A':
for (i = 0; i < LS_MAX_EVENTS; i++)
g_enabled[i] = 1;
events_specified = 1;
break;
case 'T':
g_tracing = 1;
break;
case 'D':
usage();
break;
case 'R':
g_rates = 1;
break;
case 'p':
g_pflag = 1;
break;
case 'P':
g_Pflag = 1;
break;
case 'o':
break;
case 'V':
g_Vflag = 1;
break;
default:
usage();
}
}
}
}
if (g_recsize == 0) {
if (g_gflag) {
} else {
}
}
fail(0, "'-g' requires at least '-s 1' data gathering");
/*
 * Make sure the alignment is reasonable
 */
for (i = 0; i < LS_MAX_EVENTS; i++) {
/*
 * If no events were specified, enable -C.
 */
g_enabled[i] = 1;
}
for (i = 0; i < LS_MAX_EVENTS; i++) {
if (!g_enabled[i])
continue;
/*
 * If we've enabled a hold event, we must explicitly
 * allocate dynamic variable space.
 */
dynvar = 1;
}
dprog_addevent(i);
}
/*
 * Make sure there are remaining arguments to specify a child command
 * to execute.
 */
usage();
dfail("couldn't determine number of online CPUs");
/*
 * By default, we set our data buffer size to be the number of records
 * multiplied by the size of the record, doubled to account for some
 * DTrace slop and divided by the number of CPUs. We silently clamp
 * the aggregation size at both a minimum and a maximum to prevent
 * absurdly low or high values.
 */
if (aggsize > MAX_AGGSIZE)
if (!g_tracing) {
dfail("failed to set 'bufsize'");
dfail("failed to set 'aggsize'");
if (dynvar) {
/*
 * If we're using dynamic variables, we set our
 * dynamic variable size to be one megabyte per CPU,
 * with a hard-limit of 32 megabytes. This may still
 * be too small in some cases, but it can be tuned
 * manually via -x if need be.
 */
dfail("failed to set 'dynvarsize'");
}
} else {
dfail("failed to set 'bufsize'");
}
dfail("failed to set 'statusrate'");
/* Second getopt pass: apply -x DTrace option overrides. */
optind = 1;
switch (c) {
case 'x':
*p++ = '\0';
break;
}
}
status_init();
/*
 * Spawn the specified command and wait for it to complete.
 */
if (child == -1)
if (child == 0) {
(void) dtrace_close(g_dtp);
exec_errno = errno;
exit(127);
}
status_check();
if (WEXITSTATUS(status) != 0) {
if (exec_errno != 0) {
errno = exec_errno;
}
"lockstat: warning: %s exited with code %d\n",
}
} else {
"lockstat: warning: %s died on signal %d\n",
}
dfail("failed to stop dtrace");
/*
 * Before we read out the results, we need to allocate our buffer.
 * If we're tracing, then we'll just use the precalculated size. If
 * we're not, then we'll take a snapshot of the aggregate, and walk
 * it to count the number of records.
 */
if (!g_tracing) {
if (dtrace_aggregate_snap(g_dtp) != 0)
dfail("failed to snap aggregate");
g_nrecs = 0;
count_aggregate, &g_nrecs) != 0)
dfail("failed to walk aggregate");
}
/*
 * Read out the DTrace data.
 */
"ran out of data records (use -n for more)\n");
/* LINTED - alignment */
/* LINTED - alignment */
}
/*
 * If -g was specified, convert stacks into individual records.
 */
if (g_gflag) {
/* LINTED - alignment */
/* LINTED - alignment */
int fr;
int caller_in_stack = 0;
continue;
break;
caller_in_stack = 1;
/* LINTED - alignment */
}
if (!caller_in_stack) {
/* LINTED - alignment */
}
}
g_nrecs = g_nrecs_used =
g_stkdepth = 0;
}
sizeof (void *))) == NULL)
/*
 * Build the sort buffer, discarding zero-count records along the way.
 */
/* LINTED - alignment */
/* LINTED - alignment */
}
if (g_nrecs_used == 0)
exit(0);
/*
 * Add a sentinel after the last record
 */
if (g_tracing) {
return (0);
}
/*
 * Application of -g may have resulted in multiple records
 * with the same signature; coalesce them.
 */
if (g_gflag) {
}
/*
 * Coalesce locks within the same symbol if -c option specified.
 * Coalesce PCs within the same function if -k option specified.
 */
for (i = 0; i < g_nrecs_used; i++) {
int fr;
if (g_cflag)
if (g_kflag) {
}
}
}
/*
 * Coalesce callers if -w option specified
 */
if (g_wflag) {
}
/*
 * Coalesce locks if -W option specified
 */
if (g_Wflag) {
}
/*
 * Sort data by contention count (ls_count) or total time (ls_time),
 * depending on g_Pflag. Override g_Pflag if time wasn't measured.
 */
g_Pflag = 0;
if (g_Pflag)
else
/*
 * Display data by event type
 */
current++;
}
return (0);
}
/*
 * Format an address into a caller-supplied buffer as "symbol" or
 * "symbol+offset" (hex offset) or a raw address when no symbol is found,
 * and return the buffer.  NOTE(review): the function name, parameters,
 * the symbol lookup and the sprintf calls are missing from this excerpt.
 */
static char *
{
char *symname;
else if (symoff == 0)
(unsigned long long)symoff);
else
return (buf);
}
/*
 * Report aggregated statistics for one event class: print the header
 * (unless -p), then up to g_topn records with count/rate, percentage,
 * average time, lock, caller and -- in histogram mode -- a 64-bucket time
 * distribution with stack frames interleaved.  NOTE(review): the function
 * name, parameters, and most printf statements are missing from this
 * excerpt; only the loop/branch skeleton and separator strings remain.
 */
static void
{
double percent;
int i, j, fr;
int displayed;
int rectype;
if (g_topn == 0) {
g_rates == 0 ? total_count :
return;
}
if (!g_pflag)
"----------------------------------------------\n");
}
displayed = 0;
for (i = 0; i < nrecs; i++) {
break;
if (g_pflag) {
int j;
/* Parsable mode: dump all 64 histogram buckets on one line. */
for (j = 0; j < 64; j++)
}
for (j = 0; j < LS_MAX_STACK_DEPTH; j++) {
break;
}
continue;
}
"----------------------------------------------\n");
}
if (g_Pflag && total_time != 0)
else
else
buf[0] = '\0';
if (g_gflag)
else
continue;
"------ Time Distribution ------",
/* Trim empty leading/trailing histogram buckets before drawing. */
first_bin = 0;
first_bin++;
last_bin = 63;
last_bin--;
max_bin_count = 0;
total_bin_count = 0;
}
/*
 * If we went a few frames below the caller, ignore them
 */
break;
1ULL << j,
" " + depth,
g_elapsed));
continue;
}
fr++;
}
fr++;
}
}
if (!g_pflag)
"----------------------------------------------\n");
}
/*
 * Report individual events in -T tracing mode: print the column header
 * (unless -p), then one line per used record with the event, time, owner,
 * lock and caller, plus any stack frames.  NOTE(review): the function
 * name, parameters and the printf statements are missing from this
 * excerpt.
 */
static void
{
int i, fr;
int rectype;
if (!g_pflag) {
"Event", "Time", "Owner", "Lock", "Caller");
"----------------------------------------------\n");
}
for (i = 0; i < g_nrecs_used; i++) {
continue;
continue;
/*
 * If we went a few frames below the caller, ignore them
 */
break;
fr++;
}
}
}