/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <strings.h>
#include <ctype.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <limits.h>
#include <dtrace.h>
#include <sys/lockstat.h>
#include <alloca.h>
#include <signal.h>
#include <assert.h>
/*
 * Per-event data record collected from DTrace.
 * NOTE(review): the member list is missing from this excerpt -- as written
 * the struct is empty, so sizeof (lsrec_t) is meaningless; the full source
 * declares fields (lock address, caller, count, time, stack frames, etc.).
 * Restore from the complete file before building.
 */
typedef struct lsrec {
} lsrec_t;
/*
 * State threaded through the dtrace_consume()/aggregate-walk callbacks.
 * NOTE(review): members are also missing from this excerpt -- confirm
 * against the full source.
 */
typedef struct lsdata {
} lsdata_t;
/*
* Definitions for the types of experiments which can be run. They are
* listed in increasing order of memory cost and processing time cost.
* The numerical value of each type is the number of bytes needed per record.
*/
/* Symbol-table setup, provided by a sibling source file. */
extern int symtab_init(void);
extern char *strtok_r(char *, const char *, char **);
/* Requested stack-trace depth (-s option). */
static int g_stkdepth;
/* Command-line option flags; each is set by the matching getopt case. */
static int g_rates = 0;		/* -R: report rates rather than counts */
static int g_pflag = 0;		/* -p: parsable (awk-friendly) output */
static int g_Pflag = 0;		/* -P: sort by count * avg_time product */
static int g_wflag = 0;		/* -w: coalesce events across callers */
static int g_Wflag = 0;		/* -W: coalesce events across locks */
static int g_cflag = 0;		/* -c: coalesce lock arrays */
static int g_kflag = 0;		/* -k: coalesce PCs within functions */
static int g_gflag = 0;		/* -g: totals per generating function */
static int g_Vflag = 0;		/* -V: dump the generated D program */
static int g_tracing = 0;	/* -T: trace rather than sample events */
/* Number of records actually consumed from the data buffer. */
static int g_nrecs_used;
/* Accumulated D predicates for normal and interrupt probes. */
static char *g_predicate;
static char *g_ipredicate;
/* The generated D program text and its current length. */
static char *g_prog;
static int g_proglen;
/* Count of records dropped by DTrace (reported at exit). */
static int g_dropped;
/*
 * Table describing every lockstat event: its class ('C'ontention, 'H'old,
 * 'I'nterrupt, 'E'rror), display strings, units, the DTrace probe that
 * implements it, an optional probe predicate, and (for hold events) the
 * matching acquire probe.
 * NOTE(review): this excerpt is truncated -- the struct declaration below
 * is missing several members (name, units, probe specs), its closing brace,
 * and the "} ls_event_info_t; static ls_event_info_t g_event_info[] = {"
 * lines that should precede the initializer list.  Restore from the full
 * source before building.
 */
typedef struct ls_event_info {
char ev_type;
char *ev_predicate;
char *ev_acquire;
{ 'C', "Lock", "Adaptive mutex spin", "nsec",
"lockstat:::adaptive-spin" },
{ 'C', "Lock", "Adaptive mutex block", "nsec",
"lockstat:::adaptive-block" },
{ 'C', "Lock", "Spin lock spin", "nsec",
"lockstat:::spin-spin" },
{ 'C', "Lock", "Thread lock spin", "nsec",
"lockstat:::thread-spin" },
{ 'C', "Lock", "R/W writer blocked by writer", "nsec",
"lockstat:::rw-block", "arg2 == 0 && arg3 == 1" },
{ 'C', "Lock", "R/W writer blocked by readers", "nsec",
"lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" },
{ 'C', "Lock", "R/W reader blocked by writer", "nsec",
"lockstat:::rw-block", "arg2 != 0 && arg3 == 1" },
{ 'C', "Lock", "R/W reader blocked by write wanted", "nsec",
"lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" },
{ 'C', "Lock", "Unknown event (type 8)", "units" },
{ 'C', "Lock", "Unknown event (type 9)", "units" },
{ 'C', "Lock", "Unknown event (type 10)", "units" },
{ 'C', "Lock", "Unknown event (type 11)", "units" },
{ 'C', "Lock", "Unknown event (type 12)", "units" },
{ 'C', "Lock", "Unknown event (type 13)", "units" },
{ 'C', "Lock", "Unknown event (type 14)", "units" },
{ 'C', "Lock", "Unknown event (type 15)", "units" },
{ 'C', "Lock", "Unknown event (type 16)", "units" },
{ 'C', "Lock", "Unknown event (type 17)", "units" },
{ 'C', "Lock", "Unknown event (type 18)", "units" },
{ 'C', "Lock", "Unknown event (type 19)", "units" },
{ 'C', "Lock", "Unknown event (type 20)", "units" },
{ 'C', "Lock", "Unknown event (type 21)", "units" },
{ 'C', "Lock", "Unknown event (type 22)", "units" },
{ 'C', "Lock", "Unknown event (type 23)", "units" },
{ 'C', "Lock", "Unknown event (type 24)", "units" },
{ 'C', "Lock", "Unknown event (type 25)", "units" },
{ 'C', "Lock", "Unknown event (type 26)", "units" },
{ 'C', "Lock", "Unknown event (type 27)", "units" },
{ 'C', "Lock", "Unknown event (type 28)", "units" },
{ 'C', "Lock", "Unknown event (type 29)", "units" },
{ 'C', "Lock", "Unknown event (type 30)", "units" },
{ 'C', "Lock", "Unknown event (type 31)", "units" },
{ 'H', "Lock", "Adaptive mutex hold", "nsec",
"lockstat:::adaptive-release", NULL,
"lockstat:::adaptive-acquire" },
{ 'H', "Lock", "Spin lock hold", "nsec",
"lockstat:::spin-release", NULL,
"lockstat:::spin-acquire" },
{ 'H', "Lock", "R/W writer hold", "nsec",
"lockstat:::rw-release", "arg1 == 0",
"lockstat:::rw-acquire" },
{ 'H', "Lock", "R/W reader hold", "nsec",
"lockstat:::rw-release", "arg1 != 0",
"lockstat:::rw-acquire" },
{ 'H', "Lock", "Unknown event (type 36)", "units" },
{ 'H', "Lock", "Unknown event (type 37)", "units" },
{ 'H', "Lock", "Unknown event (type 38)", "units" },
{ 'H', "Lock", "Unknown event (type 39)", "units" },
{ 'H', "Lock", "Unknown event (type 40)", "units" },
{ 'H', "Lock", "Unknown event (type 41)", "units" },
{ 'H', "Lock", "Unknown event (type 42)", "units" },
{ 'H', "Lock", "Unknown event (type 43)", "units" },
{ 'H', "Lock", "Unknown event (type 44)", "units" },
{ 'H', "Lock", "Unknown event (type 45)", "units" },
{ 'H', "Lock", "Unknown event (type 46)", "units" },
{ 'H', "Lock", "Unknown event (type 47)", "units" },
{ 'H', "Lock", "Unknown event (type 48)", "units" },
{ 'H', "Lock", "Unknown event (type 49)", "units" },
{ 'H', "Lock", "Unknown event (type 50)", "units" },
{ 'H', "Lock", "Unknown event (type 51)", "units" },
{ 'H', "Lock", "Unknown event (type 52)", "units" },
{ 'H', "Lock", "Unknown event (type 53)", "units" },
{ 'H', "Lock", "Unknown event (type 54)", "units" },
{ 'H', "Lock", "Unknown event (type 55)", "units" },
{ 'I', "CPU+PIL", "Profiling interrupt", "nsec",
"profile:::profile-97", NULL },
{ 'I', "Lock", "Unknown event (type 57)", "units" },
{ 'I', "Lock", "Unknown event (type 58)", "units" },
{ 'I', "Lock", "Unknown event (type 59)", "units" },
/* NOTE(review): entry below is missing its closing "}," -- truncated. */
{ 'E', "Lock", "Recursive lock entry detected", "(N/A)",
{ 'E', "Lock", "Lockstat enter failure", "(N/A)" },
{ 'E', "Lock", "Lockstat exit failure", "nsec" },
{ 'E', "Lock", "Lockstat record failure", "(N/A)" },
};
/*
 * Fatal-error helper: report the error (message emission is missing from
 * this excerpt) and exit with status 2.  NOTE(review): the function name,
 * parameter list (presumably including do_perror and a printf-style format)
 * and the fprintf/perror calls have been stripped -- restore from the full
 * source.
 */
static void
{
if (do_perror)
exit(2);
}
/*
 * DTrace-failure helper: exits with status 2.  NOTE(review): name,
 * parameters and the message/dtrace_errmsg() reporting are missing from
 * this excerpt -- presumably this is dfail(), used throughout below.
 */
static void
{
exit(2);
}
/*
 * Print the range of event IDs belonging to one event class for the usage
 * message: scan g_event_info[] for the first and last matching entries.
 * NOTE(review): the function name, parameters, the match test inside the
 * loop and the fprintf consuming the format string below are missing from
 * this excerpt.
 */
static void
{
for (i = 0; i < LS_MAX_EVENTS; i++) {
continue;
if (first == -1)
first = i;
last = i;
}
"\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n",
}
/*
 * Print the command-line usage summary to stderr and exit(1).
 * NOTE(review): the fprintf(stderr, ...) call wrapping this format string,
 * and the trailing arguments supplying the default rate and record count
 * for the %d conversions, are missing from this excerpt.
 */
static void
usage(void)
{
"Usage: lockstat [options] command [args]\n"
"\nEvent selection options:\n\n"
" -C watch contention events [on by default]\n"
" -E watch error events [off by default]\n"
" -H watch hold events [off by default]\n"
" -I watch interrupt events [off by default]\n"
" -A watch all lock events [equivalent to -CH]\n"
" -e event_list only watch the specified events (shown below);\n"
" <event_list> is a comma-separated list of\n"
" events or ranges of events, e.g. 1,4-7,35\n"
" -i rate interrupt rate for -I [default: %d Hz]\n"
"\nData gathering options:\n\n"
" -b basic statistics (lock, caller, event count)\n"
" -t timing for all events [default]\n"
" -h histograms for event times\n"
" -s depth stack traces <depth> deep\n"
" -x opt[=val] enable or modify DTrace options\n"
"\nData filtering options:\n\n"
" -n nrecords maximum number of data records [default: %d]\n"
" -l lock[,size] only watch <lock>, which can be specified as a\n"
" symbolic name or hex address; <size> defaults\n"
" to the ELF symbol size if available, 1 if not\n"
" -f func[,size] only watch events generated by <func>\n"
" -d duration only watch events longer than <duration>\n"
" -T trace (rather than sample) events\n"
"\nData reporting options:\n\n"
" -c coalesce lock data for arrays like pse_mutex[]\n"
" -k coalesce PCs within functions\n"
" -g show total events generated by function\n"
" -w wherever: don't distinguish events by caller\n"
" -W whichever: don't distinguish events by lock\n"
" -R display rates rather than counts\n"
" -p parsable output format (awk(1)-friendly)\n"
" -P sort lock data by (count * avg_time) product\n"
" -D n only display top <n> events of each type\n"
" -o filename send output to <filename>\n",
exit(1);
}
/*
 * qsort-style comparator over two lsrec_t pointers: compares event, then
 * the stack frames from deepest to shallowest, then (presumably) caller
 * and lock.  Returns -1/0/1.  NOTE(review): the function name, parameters
 * and every comparison expression guarding these returns are missing from
 * this excerpt -- only the control skeleton remains.
 */
static int
{
int i;
return (-1);
return (1);
for (i = g_stkdepth - 1; i >= 0; i--) {
return (-1);
return (1);
}
return (-1);
return (1);
return (-1);
return (1);
return (0);
}
/*
 * Comparator skeleton (two-way, never returns equality as written).
 * NOTE(review): name, parameters and the comparison conditions are missing
 * from this excerpt.
 */
static int
{
return (-1);
return (1);
}
/*
 * Comparator skeleton with a tie-breaking secondary key (note the reversed
 * -1/1 ordering of the middle pair, suggesting a descending secondary
 * sort).  NOTE(review): name, parameters and conditions are missing from
 * this excerpt.
 */
static int
{
return (-1);
return (1);
return (1);
return (-1);
return (0);
}
/*
 * Comparator skeleton with primary and secondary keys.  NOTE(review):
 * name, parameters and comparison conditions are missing from this
 * excerpt.
 */
static int
{
return (-1);
return (1);
return (-1);
return (1);
return (0);
}
/*
 * Comparator skeleton; as written it has no equality path and falls off
 * the end of a non-void function -- a symptom of truncation, not a real
 * defect to fix here.  NOTE(review): restore from the full source.
 */
static int
{
return (-1);
return (1);
return (-1);
return (1);
}
/*
 * Comparator skeleton: event key, then stack frames deepest-first, then a
 * final key, with an equality path.  NOTE(review): name, parameters and
 * all comparison conditions are missing from this excerpt.
 */
static int
{
int i;
return (-1);
return (1);
for (i = g_stkdepth - 1; i >= 0; i--) {
return (-1);
return (1);
}
return (-1);
return (1);
return (0);
}
/*
 * Comparator skeleton: event key, stack frames deepest-first, then a final
 * two-way key with no equality path (falls off the end as written --
 * another truncation symptom).  NOTE(review): restore from the full
 * source.
 */
static int
{
int i;
return (-1);
return (1);
for (i = g_stkdepth - 1; i >= 0; i--) {
return (-1);
return (1);
}
return (-1);
return (1);
}
/*
 * Recursive merge sort over an array of record pointers using scratch
 * buffer b: split at the midpoint, sort each half (the recursive calls are
 * missing from this excerpt), copy the first half forward and the second
 * half reversed into b, then merge back into a with comparator cmp.  The
 * reversed second half lets the merge loop run with a single i < j test
 * (a classic "bitonic" merge trick).  NOTE(review): the function name,
 * parameters (cmp, a, b, n) and the recursive calls under the two if
 * statements are missing.
 */
static void
{
int m = n / 2;
int i, j;
if (m > 1)
if (n - m > 1)
for (i = m; i > 0; i--)
b[i - 1] = a[i - 1];
for (j = m - 1; j < n - 1; j++)
b[n + m - j - 2] = a[j + 1];
while (i < j)
*a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--];
*a = b[i];
}
/*
 * Walk a sorted record array and merge adjacent records that compare
 * equal; the inner 64-iteration loop presumably sums the per-bucket
 * histogram counts (64 matches the quantize bucket layout used elsewhere).
 * NOTE(review): name, parameters, the equality tests and the merge
 * statements are missing from this excerpt.
 */
static void
{
int i, j;
for (i = 1; i < n; i++) {
continue;
}
continue;
continue;
for (j = 0; j < 64; j++)
}
}
/*
 * NOTE(review): entire body missing from this excerpt -- only the empty
 * shell remains.  Restore from the full source.
 */
static void
{
}
/*
 * Append a clause to a dynamically grown predicate string (pred is
 * presumably a char ** given the *pred[0] dereferences): allocate a larger
 * buffer ("new"), combine the existing predicate with the added term using
 * and/or logic, and install it.  NOTE(review): the function name,
 * parameters, allocation and sprintf logic are missing from this excerpt;
 * only the branch skeleton remains.
 */
static void
{
char *new;
return;
*pred[0] = '\0';
}
if (*pred[0] != '\0') {
} else {
}
} else {
} else {
}
}
}
/*
 * NOTE(review): entire body missing from this excerpt -- only the empty
 * shell remains.  Restore from the full source.
 */
static void
{
}
/*
 * Append a term to a filter string (filt is presumably a char **, given
 * the *filt[0] write).  NOTE(review): name, parameters and nearly all of
 * the body are missing from this excerpt.
 */
static void
{
*filt[0] = '\0';
}
}
/*
 * NOTE(review): entire body missing from this excerpt -- only the empty
 * shell remains.  Restore from the full source.
 */
static void
{
}
/*
 * Append formatted text to the generated D program buffer (g_prog /
 * g_proglen): when the program is empty start at offset 0, otherwise
 * append at the current length.  NOTE(review): the function name,
 * varargs parameters, the vsnprintf/realloc machinery and the offs
 * computation in the else branch are missing from this excerpt.
 */
static void
{
char c;
if (g_proglen == 0) {
offs = 0;
} else {
}
}
/*
* This function may read like an open sewer, but keep in mind that programs
* that generate other programs are rarely pretty. If one has the unenviable
* task of maintaining or -- worse -- extending this code, use the -V option
* to examine the D program as generated by this function.
*/
/*
 * Generate the D clauses for one enabled event: probe description,
 * predicate, and either a trace() action (-T) or @avg/@hist aggregations.
 * Hold events additionally get an acquire clause that timestamps the
 * acquisition; the recursive-entry error event gets a clause that
 * decrements its thread-local count.  NOTE(review): the function name,
 * parameters, and most of the dprog_add() format calls are missing from
 * this excerpt -- only the skeleton and original commentary remain.
 */
static void
{
int depth;
return;
/*
 * For interrupt events, arg0 (normally the lock pointer) is
 * the CPU address plus the current pil, and arg1 (normally
 * the number of nanoseconds) is the number of nanoseconds
 * late -- and it's stored in arg2.
 */
arg0 = "(uintptr_t)curthread->t_cpu + \n"
"\t curthread->t_cpu->cpu_profile_pil";
caller = "(uintptr_t)arg0";
arg1 = "arg2";
} else {
arg0 = "(uintptr_t)arg0";
caller = "caller";
}
continue;
if (g_tracing) {
} else {
}
} else {
}
/*
 * If this is a hold event, we need to generate an additional
 * clause for the acquire; the clause for the release will be
 * generated with the aggregating statement, below.
 */
dprog_add("{\n");
} else {
/*
 * If this isn't a hold event, it's the recursive
 * error event. For this, we simply bump the
 * thread-local, per-lock count.
 */
}
dprog_add("}\n\n");
/*
 * If this is the recursive lock error event, we need
 * to generate an additional clause to decrement the
 * thread-local, per-lock count. This assures that we
 * only execute the aggregating clause if we have
 * recursive entry.
 */
}
"self->ev%d[(uintptr_t)arg0]", event);
}
} else {
else
}
dprog_add("{\n");
if (g_tracing) {
} else {
/*
 * The ordering here is important: when we process the
 * aggregate, we count on the fact that @avg appears before
 * @hist in program order to assure that @avg is assigned the
 * first aggregation variable ID and @hist assigned the
 * second; see the comment in process_aggregate() for details.
 */
dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n",
dprog_add("\t@hist[%dULL, %s, %s%s] = quantize"
}
}
dprog_add("}\n\n");
}
/*
 * Compile the generated D program, enable its probes and start tracing;
 * with -V the program text is dumped first.  Each step dfail()s on error.
 * NOTE(review): the function name and the dtrace_program_strcompile /
 * dtrace_program_exec / dtrace_go calls guarding these dfail()s are
 * missing from this excerpt.
 */
static void
{
if (g_Vflag) {
}
dfail("failed to compile program");
dfail("failed to enable probes");
dfail("couldn't start tracing");
}
/*
 * Periodic-timer callback: deliberately a no-op.  Its only job is
 * (presumably) to interrupt a blocking call so the main loop can call
 * status_check() -- NOTE(review): the signal/timer wiring lives in
 * status_init() and main(), which are truncated in this excerpt; confirm
 * against the full source.
 */
static void
status_fire(void)
{
}
/*
 * Set up the periodic status timer: read the 'statusrate' (and aggregation
 * rate) options from DTrace, then create and arm a CLOCK_REALTIME timer at
 * the more frequent of the two rates.  NOTE(review): the dtrace_getopt(),
 * sigevent setup, timer_create() and timer_settime() calls guarding these
 * dfail()s are missing from this excerpt.
 */
static void
status_init(void)
{
dfail("failed to get 'statusrate'");
dfail("failed to get 'statusrate'");
/*
 * We would want to awaken at a rate that is the GCD of the statusrate
 * and the aggrate -- but that seems a bit absurd. Instead, we'll
 * simply awaken at a rate that is the more frequent of the two, which
 * assures that we're never later than the interval implied by the
 * more frequent rate.
 */
dfail("cannot create CLOCK_REALTIME timer");
dfail("cannot set time on CLOCK_REALTIME timer");
}
/*
 * Periodic status poll: snapshot the aggregate and query dtrace_status(),
 * dfail()ing if either call errors.  NOTE(review): the
 * dtrace_aggregate_snap() and dtrace_status() calls guarding these
 * dfail()s are missing from this excerpt.
 */
static void
status_check(void)
{
dfail("failed to snap aggregate");
dfail("dtrace_status()");
}
/*
 * Decode one DTrace record group into an lsrec_t: first record is the
 * event ID, second the lock address, third the caller, and (when stack
 * tracing) the remaining frames follow.  Each record's size is validated
 * before use.  NOTE(review): the function name, parameters, size checks
 * and the assignments after each "rec++" are missing from this excerpt.
 */
static void
{
fail(0, "truncated DTrace record");
fail(0, "bad event size in first record");
/* LINTED - alignment */
rec++;
fail(0, "bad lock address size in second record");
/* LINTED - alignment */
rec++;
fail(0, "bad caller size in third record");
/* LINTED - alignment */
rec++;
int frames, i;
/* LINTED - alignment */
for (i = 1; i < frames; i++)
}
}
/*
 * Aggregate-walk callback that (presumably) just counts records -- the
 * increment of the caller-supplied counter is missing from this excerpt.
 * Always continues the walk.
 */
/*ARGSUSED*/
static int
{
return (DTRACE_AGGWALK_NEXT);
}
/*
 * Aggregate-walk callback that converts one aggregation record into an
 * lsrec_t: @hist records (identified by aggregation variable ID, see the
 * comment below) have their quantize buckets copied into lc_hist; @avg
 * records supply the count/time.  NOTE(review): the function name,
 * parameters, the variable-ID tests and the copy statements are missing
 * from this excerpt.
 */
static int
{
int i, j;
/*
 * Aggregation variable IDs are guaranteed to be generated in program
 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE
 * plus one. As "avg" appears before "hist" in program order, we know
 * that "avg" will be allocated the first aggregation variable ID, and
 * "hist" will be allocated the second aggregation variable ID -- and
 * we therefore use the aggregation variable ID to differentiate the
 * cases.
 */
/*
 * If this is the histogram entry. We'll copy the quantized
 * data into lc_hist, and jump over the rest.
 */
fail(0, "bad variable ID in aggregation record");
DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))
fail(0, "bad quantize size in aggregation record");
/* LINTED - alignment */
for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0;
i < DTRACE_QUANTIZE_NBUCKETS; i++, j++)
goto out;
}
fail(0, "bad avg size in aggregation record");
/* LINTED - alignment */
return (DTRACE_AGGWALK_NEXT);
out:
return (DTRACE_AGGWALK_NEXT);
}
/*
 * dtrace_consume() probe callback for -T tracing mode; always continues
 * consuming.  NOTE(review): name, parameters and the record handling
 * between the two returns are missing from this excerpt.
 */
static int
{
return (DTRACE_CONSUME_NEXT);
return (DTRACE_CONSUME_NEXT);
}
/*
 * Drain collected data into the record buffer: in tracing mode consume the
 * principal buffer, otherwise walk the aggregate with process_aggregate().
 * dfail()s if either operation errors.  NOTE(review): the function name,
 * parameters and the dtrace_consume()/dtrace_aggregate_walk_keyvarsorted()
 * call sites are partially missing from this excerpt.
 */
static int
{
/* LINTED - alignment */
if (g_tracing) {
dfail("failed to consume buffer");
}
process_aggregate, &lsdata) != 0)
dfail("failed to walk aggregate");
}
/*
 * DTrace drop handler: count dropped records in g_dropped (reported to the
 * user later) and tell libdtrace to carry on.
 */
/*ARGSUSED*/
static int
{
g_dropped++;
return (DTRACE_HANDLE_OK);
}
/*
 * lockstat entry point: open libdtrace, parse options, generate and
 * compile the D program, run the target command, then collect, coalesce,
 * sort and display the results.  NOTE(review): this excerpt is severely
 * truncated -- the parameter list (argc/argv), the getopt loop header,
 * most option-argument parsing, and large portions of the data-processing
 * code are missing.  The surviving skeleton is annotated below; restore
 * the full body from the original source before building.
 */
int
{
char *data_buf;
char c;
int status;
int i, j;
int events_specified = 0;
int exec_errno = 0;
long ncpus;
int dynvar = 0;
int err;
fail(0, "cannot open dtrace library: %s",
}
dfail("couldn't establish drop handler");
if (symtab_init() == -1)
/* Option parsing: each case corresponds to a flag documented in usage(). */
switch (c) {
case 'b':
break;
case 't':
break;
case 'h':
break;
case 's':
usage();
if (g_stkdepth > LS_MAX_STACK_DEPTH)
fail(0, "max stack depth is %d",
break;
case 'n':
usage();
break;
case 'd':
usage();
/*
 * XXX -- durations really should be per event
 * since the units are different, but it's hard
 * to express this nicely in the interface.
 * Not clear yet what the cleanest solution is.
 */
for (i = 0; i < LS_MAX_EVENTS; i++)
g_min_duration[i] = duration;
break;
case 'i':
usage();
if (i <= 0)
usage();
if (i > MAX_HZ)
for (j = 0; j < LS_MAX_EVENTS; j++)
"Profiling interrupt") == 0)
break;
"profile:::profile-%d", i);
break;
case 'l':
case 'f':
if (addrp[0] == '0') {
} else {
if (size == 0)
size = 1;
}
if (c == 'l') {
} else {
}
break;
case 'e':
while (evp) {
char *evp2;
fail(0, "-e events out of range");
g_enabled[i] = 1;
}
events_specified = 1;
break;
case 'c':
g_cflag = 1;
break;
case 'k':
g_kflag = 1;
break;
case 'w':
g_wflag = 1;
break;
case 'W':
g_Wflag = 1;
break;
case 'g':
g_gflag = 1;
break;
case 'C':
case 'E':
case 'H':
case 'I':
/* Enable every event whose class letter matches the option. */
for (i = 0; i < LS_MAX_EVENTS; i++)
if (g_event_info[i].ev_type == c)
g_enabled[i] = 1;
events_specified = 1;
break;
case 'A':
for (i = 0; i < LS_MAX_EVENTS; i++)
g_enabled[i] = 1;
events_specified = 1;
break;
case 'T':
g_tracing = 1;
break;
case 'D':
usage();
break;
case 'R':
g_rates = 1;
break;
case 'p':
g_pflag = 1;
break;
case 'P':
g_Pflag = 1;
break;
case 'o':
break;
case 'V':
g_Vflag = 1;
break;
default:
usage();
}
}
}
}
if (g_recsize == 0) {
if (g_gflag) {
} else {
}
}
fail(0, "'-g' requires at least '-s 1' data gathering");
/*
 * Make sure the alignment is reasonable
 */
for (i = 0; i < LS_MAX_EVENTS; i++) {
/*
 * If no events were specified, enable -C.
 */
g_enabled[i] = 1;
}
for (i = 0; i < LS_MAX_EVENTS; i++) {
if (!g_enabled[i])
continue;
/*
 * If we've enabled a hold event, we must explicitly
 * allocate dynamic variable space.
 */
dynvar = 1;
}
dprog_addevent(i);
}
/*
 * Make sure there are remaining arguments to specify a child command
 * to execute.
 */
usage();
dfail("couldn't determine number of online CPUs");
/*
 * By default, we set our data buffer size to be the number of records
 * multiplied by the size of the record, doubled to account for some
 * DTrace slop and divided by the number of CPUs. We silently clamp
 * the aggregation size at both a minimum and a maximum to prevent
 * absurdly low or high values.
 */
if (aggsize > MAX_AGGSIZE)
if (!g_tracing) {
dfail("failed to set 'bufsize'");
dfail("failed to set 'aggsize'");
if (dynvar) {
/*
 * If we're using dynamic variables, we set our
 * dynamic variable size to be one megabyte per CPU,
 * with a hard-limit of 32 megabytes. This may still
 * be too small in some cases, but it can be tuned
 * manually via -x if need be.
 */
dfail("failed to set 'dynvarsize'");
}
} else {
dfail("failed to set 'bufsize'");
}
dfail("failed to set 'statusrate'");
/* Second getopt pass: apply -x DTrace option overrides. */
optind = 1;
switch (c) {
case 'x':
*p++ = '\0';
break;
}
}
status_init();
/*
 * Spawn the specified command and wait for it to complete.
 */
if (child == -1)
if (child == 0) {
(void) dtrace_close(g_dtp);
exec_errno = errno;
exit(127);
}
status_check();
if (WEXITSTATUS(status) != 0) {
if (exec_errno != 0) {
errno = exec_errno;
}
"lockstat: warning: %s exited with code %d\n",
}
} else {
"lockstat: warning: %s died on signal %d\n",
}
dfail("failed to stop dtrace");
/*
 * Before we read out the results, we need to allocate our buffer.
 * If we're tracing, then we'll just use the precalculated size. If
 * we're not, then we'll take a snapshot of the aggregate, and walk
 * it to count the number of records.
 */
if (!g_tracing) {
if (dtrace_aggregate_snap(g_dtp) != 0)
dfail("failed to snap aggregate");
g_nrecs = 0;
count_aggregate, &g_nrecs) != 0)
dfail("failed to walk aggregate");
}
/*
 * Read out the DTrace data.
 */
"ran out of data records (use -n for more)\n");
/* LINTED - alignment */
/* LINTED - alignment */
}
/*
 * If -g was specified, convert stacks into individual records.
 */
if (g_gflag) {
/* LINTED - alignment */
/* LINTED - alignment */
int fr;
int caller_in_stack = 0;
continue;
break;
caller_in_stack = 1;
/* LINTED - alignment */
}
if (!caller_in_stack) {
/* LINTED - alignment */
}
}
g_nrecs = g_nrecs_used =
g_stkdepth = 0;
}
sizeof (void *))) == NULL)
/*
 * Build the sort buffer, discarding zero-count records along the way.
 */
/* LINTED - alignment */
/* LINTED - alignment */
}
if (g_nrecs_used == 0)
exit(0);
/*
 * Add a sentinel after the last record
 */
if (g_tracing) {
return (0);
}
/*
 * Application of -g may have resulted in multiple records
 * with the same signature; coalesce them.
 */
if (g_gflag) {
}
/*
 * Coalesce locks within the same symbol if -c option specified.
 * Coalesce PCs within the same function if -k option specified.
 */
for (i = 0; i < g_nrecs_used; i++) {
int fr;
if (g_cflag)
if (g_kflag) {
}
}
}
/*
 * Coalesce callers if -w option specified
 */
if (g_wflag) {
}
/*
 * Coalesce locks if -W option specified
 */
if (g_Wflag) {
}
/*
 * Sort data by contention count (ls_count) or total time (ls_time),
 * depending on g_Pflag. Override g_Pflag if time wasn't measured.
 */
g_Pflag = 0;
if (g_Pflag)
else
/*
 * Display data by event type
 */
current++;
}
return (0);
}
/*
 * Format an address into a caller-supplied buffer as "symbol" or
 * "symbol+offset" (hex offset) or a raw address when no symbol is found,
 * and return the buffer.  NOTE(review): the function name, parameters,
 * the symbol lookup and the sprintf calls are missing from this excerpt.
 */
static char *
{
char *symname;
else if (symoff == 0)
(unsigned long long)symoff);
else
return (buf);
}
/*
 * Report aggregated statistics for one event class: print the header
 * (unless -p), then up to g_topn records with count/rate, percentage,
 * average time, lock, caller and -- in histogram mode -- a 64-bucket time
 * distribution with stack frames interleaved.  NOTE(review): the function
 * name, parameters, and most printf statements are missing from this
 * excerpt; only the loop/branch skeleton and separator strings remain.
 */
static void
{
double percent;
int i, j, fr;
int displayed;
int rectype;
if (g_topn == 0) {
g_rates == 0 ? total_count :
return;
}
if (!g_pflag)
"----------------------------------------------\n");
}
displayed = 0;
for (i = 0; i < nrecs; i++) {
break;
if (g_pflag) {
int j;
/* Parsable mode: dump all 64 histogram buckets on one line. */
for (j = 0; j < 64; j++)
}
for (j = 0; j < LS_MAX_STACK_DEPTH; j++) {
break;
}
continue;
}
"----------------------------------------------\n");
}
if (g_Pflag && total_time != 0)
else
else
buf[0] = '\0';
if (g_gflag)
else
continue;
"------ Time Distribution ------",
/* Trim empty leading/trailing histogram buckets before drawing. */
first_bin = 0;
first_bin++;
last_bin = 63;
last_bin--;
max_bin_count = 0;
total_bin_count = 0;
}
/*
 * If we went a few frames below the caller, ignore them
 */
break;
1ULL << j,
" " + depth,
g_elapsed));
continue;
}
fr++;
}
fr++;
}
}
if (!g_pflag)
"----------------------------------------------\n");
}
/*
 * Report individual events in -T tracing mode: print the column header
 * (unless -p), then one line per used record with the event, time, owner,
 * lock and caller, plus any stack frames.  NOTE(review): the function
 * name, parameters and the printf statements are missing from this
 * excerpt.
 */
static void
{
int i, fr;
int rectype;
if (!g_pflag) {
"Event", "Time", "Owner", "Lock", "Caller");
"----------------------------------------------\n");
}
for (i = 0; i < g_nrecs_used; i++) {
continue;
continue;
/*
 * If we went a few frames below the caller, ignore them
 */
break;
fr++;
}
}
}