xdt.c revision 411e7d8f94b93b43156abc905a0ad18cb5f1469c
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Xen event provider for DTrace
*
* NOTE: This provider is PRIVATE. It is intended as a short-term solution and
 * may disappear or be re-implemented at any time.
*
* This provider isn't suitable as a general-purpose solution for a number of
* reasons. First and foremost, we rely on the Xen tracing mechanism and don't
* have any way to gather data other than that collected by the Xen trace
* buffers. Further, it does not fit into the DTrace model (see "Interacting
* with DTrace" below.)
*
*
* Tracing in Xen
* --------------
*
* Xen implements a tracing facility for generating and collecting execution
 * event traces from the hypervisor. When tracing is enabled, compiled-in
 * probes record events in contiguous per-CPU trace buffers.
*
* +---------+
* +------+ | |
* | CPUn |----> | BUFFERn |
* +------+ | |
* +---------+- tbuf.va + (tbuf.size * n)
* : :
* +---------+
* +------+ | |
* | CPU1 |----> | BUFFER1 |
* +------+ | |
* +---------+- tbuf.va + tbuf.size
* +------+ | |
* | CPU0 |----> | BUFFER0 |
* +------+ | |
* +---------+- tbuf.va
*
 * Each CPU buffer consists of a metadata header followed by the trace records.
 * The metadata header contains two fields, prod and cons,
 * that point to the next record to be written and the next record to be read
 * respectively. The trace record format is as follows:
*
* +--------------------------------------------------------------------------+
* | CPUID(uint_t) | TSC(uint64_t) | EVENTID(uint32_t) | DATA FIELDS |
* +--------------------------------------------------------------------------+
*
* DATA FIELDS:
* +--------------------------------------------------------------------------+
* | D1(uint32_t) | D2(uint32_t) | D3(uint32_t) | D4(uint32_t) | D5(uint32_t) |
* +--------------------------------------------------------------------------+
*
*
* Interacting with DTrace
* -----------------------
*
* Every xdt_poll_nsec nano-seconds we poll the trace buffers for data and feed
* each entry into dtrace_probe() with the corresponding probe ID for the event.
* As a result of this periodic collection implementation probe firings are
* asynchronous. This is the only sensible way to implement this form of
* provider, but because of its asynchronous nature asking things like
* "current CPU" and, more importantly, arbitrary questions about the context
* surrounding the probe firing are not meaningful. So, consumers should not
* attempt to infer anything beyond what is supplied via the probe arguments.
*/
#include <sys/sysmacros.h>
#include <vm/seg_kmem.h>
#include <sys/hypervisor.h>
/*
 * NOTE(review): this comment was truncated in this revision; it refers to
 * definitions shipped in the xVM gate -- restore the full text from the
 * upstream xdt.c.
 */
/*
 * Macros to extract the domid and vcpuid from an HVM trace data field,
 * which packs them as (domid << 16) | vcpuid.  The arguments are fully
 * parenthesized so the macros expand correctly when passed compound
 * expressions (e.g. HVM_VCPUID(a | b)).
 */
#define HVM_DOMID(d) ((d) >> 16)
#define HVM_VCPUID(d) ((d) & 0xFFFF)
if (id) \
} \
/* Probe classes */
#define XDT_SCHED 0
#define XDT_MEM 1
#define XDT_HVM 2
#define XDT_NCLASSES 3
/* Probe events */
#define XDT_EVT_INVALID (-(int)1)
#define XDT_SCHED_OFF_CPU 0
#define XDT_SCHED_ON_CPU 1
#define XDT_SCHED_IDLE_OFF_CPU 2
#define XDT_SCHED_IDLE_ON_CPU 3
#define XDT_SCHED_BLOCK 4
#define XDT_SCHED_SLEEP 5
#define XDT_SCHED_WAKE 6
#define XDT_SCHED_YIELD 7
#define XDT_SCHED_SHUTDOWN_POWEROFF 8
#define XDT_SCHED_SHUTDOWN_REBOOT 9
#define XDT_SCHED_SHUTDOWN_SUSPEND 10
#define XDT_SCHED_SHUTDOWN_CRASH 11
#define XDT_MEM_PAGE_GRANT_MAP 12
#define XDT_MEM_PAGE_GRANT_UNMAP 13
#define XDT_MEM_PAGE_GRANT_TRANSFER 14
#define XDT_HVM_VMENTRY 15
#define XDT_HVM_VMEXIT 16
#define XDT_NEVENTS 17
/*
 * Description of one XDT probe: the DTrace (module, name) pair it is
 * advertised under and the internal XDT_* event id it maps to.
 */
typedef struct {
const char *pr_mod; /* probe module */
const char *pr_name; /* probe name */
int evt_id; /* event id */
} xdt_probe_t;
typedef struct {
typedef struct {
static struct {
/* per-cpu buffers */
/* statistics */
} tbuf;
/*
 * Names of the named kstats exported by this provider.  The order of
 * entries here must match the assignment order in the kstat update
 * routine (see the "Assignment order should match" comment there).
 */
static char *xdt_stats[] = {
"dropped_recs",
};
/*
* Tunable variables
*
 * The following may be set in /etc/system. Note that the /etc/system syntax
 * includes both the name of the module ("xdt") and the name of the variable.
* For example:
* set xdt:xdt_tbuf_pages = 40
*/
/*
 * The following may be tuned by adding a line to /etc/system.
* For example:
* xdt_poll_nsec = 200000000;
*/
/*
* Internal variables
*/
static dev_info_t *xdt_devi;
static dtrace_provider_id_t xdt_id;
static kstat_t *xdt_kstats;
static xdt_probe_t xdt_probe[] = {
/* Sched probes */
XDT_SCHED },
/* Memory probes */
/* HVM probes */
{ NULL }
};
extern uint_t xen_get_nphyscpus(void);
static inline uint32_t
{
int i;
for (i = 0; i < XDT_NCLASSES; i++)
return (tot);
}
static void
xdt_init_trace_masks(void)
{
}
static int
{
if (flag != KSTAT_READ)
return (EACCES);
/*
* Assignment order should match that of the names in
* xdt_stats.
*/
return (0);
}
static void
xdt_kstat_init(void)
{
return;
while (nstats > 0) {
knp++;
cp++;
nstats--;
}
}
static int
{
int xerr;
return (xen_xlate_errcode(xerr));
return (0);
}
static int
{
int xerr;
/*
* Ask the HAT to load a throwaway mapping to page zero, then
* overwrite it with the hypervisor mapping. It gets removed
* later via hat_unload().
*/
| PT_FOREIGN | PT_WRITABLE;
if (xerr != 0) {
/* unmap pages loaded so far */
return (xen_xlate_errcode(xerr));
}
mfn++;
}
return (0);
}
static int
xdt_attach_trace_buffers(void)
{
int err;
uint_t i;
/*
* Xen does not support trace buffer re-sizing. If the buffers
* have already been allocated we just use them as is.
*/
return (err);
/* set trace buffer size */
(void) xdt_sysctl_tbuf(&tbuf_op);
/* get trace buffer info */
return (err);
return (ENOBUFS);
}
}
return (err);
}
KM_SLEEP);
KM_SLEEP);
sizeof (struct t_buf));
/* throw away stale trace records */
}
return (0);
}
static void
xdt_detach_trace_buffers(void)
{
}
static inline void
{
int eid;
return;
}
/*
* Sched probes
*/
case TRC_SCHED_SWITCH_INFPREV:
/*
* Info on vCPU being de-scheduled
*
* rec->data[0] = prev domid
* rec->data[1] = time spent on pcpu
*/
break;
case TRC_SCHED_SWITCH_INFNEXT:
/*
* Info on next vCPU to be scheduled
*
* rec->data[0] = next domid
* rec->data[1] = time spent waiting to get on cpu
* rec->data[2] = time slice
*/
break;
case TRC_SCHED_SWITCH:
/*
* vCPU switch
*
* rec->data[0] = prev domid
* rec->data[1] = prev vcpuid
* rec->data[2] = next domid
* rec->data[3] = next vcpuid
*/
/* prev and next info don't match doms being sched'd */
return;
}
break;
case TRC_SCHED_BLOCK:
/*
* vCPU blocked
*
* rec->data[0] = domid
* rec->data[1] = vcpuid
*/
break;
case TRC_SCHED_SLEEP:
/*
* Put vCPU to sleep
*
* rec->data[0] = domid
* rec->data[1] = vcpuid
*/
break;
case TRC_SCHED_WAKE:
/*
* Wake up vCPU
*
* rec->data[0] = domid
* rec->data[1] = vcpuid
*/
break;
case TRC_SCHED_YIELD:
/*
* vCPU yielded
*
* rec->data[0] = domid
* rec->data[1] = vcpuid
*/
break;
case TRC_SCHED_SHUTDOWN:
/*
* Guest shutting down
*
* rec->data[0] = domid
* rec->data[1] = initiating vcpu
* rec->data[2] = shutdown code
*/
case SHUTDOWN_poweroff:
break;
case SHUTDOWN_reboot:
break;
case SHUTDOWN_suspend:
break;
case SHUTDOWN_crash:
break;
default:
return;
}
break;
/*
* Mem probes
*/
case TRC_MEM_PAGE_GRANT_MAP:
/*
* Guest mapped page grant
*
* rec->data[0] = domid
*/
break;
case TRC_MEM_PAGE_GRANT_UNMAP:
/*
* Guest unmapped page grant
*
* rec->data[0] = domid
*/
break;
/*
* Page grant is being transferred
*
* rec->data[0] = target domid
*/
break;
/*
* HVM probes
*/
case TRC_HVM_VMENTRY:
/*
* Return to guest via vmx_launch/vmrun
*
* rec->data[0] = (domid<<16 + vcpuid)
*/
break;
case TRC_HVM_VMEXIT:
/*
* Entry into VMEXIT handler
*
* rec->data[0] = (domid<<16 + vcpuid)
* rec->data[1] = guest rip
* rec->data[2] = cpu vendor specific exit code
*/
break;
case TRC_LOST_RECORDS:
break;
default:
break;
}
}
/*ARGSUSED*/
static void
xdt_tbuf_scan(void *arg)
{
/* scan all cpu buffers for new records */
membar_consumer(); /* read prod /then/ data */
membar_exit(); /* read data /then/ update cons */
}
}
}
static void
xdt_cyclic_enable(void)
{
}
static void
{
return;
p->pr_name, dtrace_mach_aframes(), p);
}
/*ARGSUSED*/
static void
{
int i;
xdt_probe_create(&xdt_probe[i]);
}
} else {
break;
}
return;
xdt_probe_create(&xdt_probe[i]);
}
}
/*ARGSUSED*/
static void
{
xdt_probe_t *p = parg;
}
static void
{
(void) xdt_sysctl_tbuf(&tbuf_op);
}
/*ARGSUSED*/
static void
{
xdt_probe_t *p = parg;
/* set the trace mask for this class */
}
if (xdt_cyclic == CYCLIC_NONE) {
/*
* DTrace doesn't have the notion of failing an enabling. It
* works on the premise that, if you have advertised a probe
* via the pops->dtps_provide() function, you can enable it.
* Failure is not an option. In the case where we can't enable
* Xen tracing the consumer will carry on regardless and
* think all is OK except the probes will never fire.
*/
if (xdt_sysctl_tbuf(&tbuf_op) != 0) {
return;
}
}
}
/*ARGSUSED*/
static void
{
xdt_probe_t *p = parg;
int i, err;
/*
* We could be here in the slight window between the cyclic firing and
* a call to dtrace_probe() occurring. We need to be careful if we tear
* down any shared state.
*/
xdt_probemap[p->evt_id] = 0;
if (xdt_nr_active_probes() == 0) {
cur_trace_mask = 0;
if (xdt_cyclic == CYCLIC_NONE)
return;
/*
* We will try to disable the trace buffers. If we fail for some
* reason we will try again, up to a count of XDT_TBUF_RETRY.
* If we still aren't successful we try to set the trace mask
* to 0 in order to prevent trace records from being written.
*/
i = 0;
do {
} while ((err != 0) && (++i < XDT_TBUF_RETRY));
if (err != 0) {
"Couldn't disable hypervisor tracing.");
} else {
/*
* We don't bother making the hypercall to set
* the trace mask, since it will be reset when
* tracing is re-enabled.
*/
}
/* other probes are enabled, so add the sub-class mask back */
cur_trace_mask |= 0xF000;
}
}
static dtrace_pattr_t xdt_attr = {
};
/*
 * DTrace provider operations vector for XDT.  Only provide, enable,
 * disable, and destroy are implemented; the remaining operations are
 * intentionally NULL (no per-module provide, suspend/resume, argument
 * description/value lookup, or usermode check).
 */
static dtrace_pops_t xdt_pops = {
xdt_provide, /* dtps_provide() */
NULL, /* dtps_provide_module() */
xdt_enable, /* dtps_enable() */
xdt_disable, /* dtps_disable() */
NULL, /* dtps_suspend() */
NULL, /* dtps_resume() */
NULL, /* dtps_getargdesc() */
NULL, /* dtps_getargval() */
NULL, /* dtps_usermode() */
xdt_destroy /* dtps_destroy() */
};
static int
{
int val;
if (!DOMAIN_IS_INITDOMAIN(xen_info))
return (DDI_FAILURE);
switch (cmd) {
case DDI_ATTACH:
break;
case DDI_RESUME:
/*
* return DDI_FAILURE for now.
*/
return (DDI_FAILURE);
default:
return (DDI_FAILURE);
}
DDI_FAILURE || xdt_attach_trace_buffers() != 0 ||
return (DDI_FAILURE);
}
"xdt_poll_nsec", XDT_POLL_DEFAULT);
sizeof (xdt_schedinfo_t), KM_SLEEP);
return (DDI_SUCCESS);
}
static int
{
switch (cmd) {
case DDI_DETACH:
break;
case DDI_SUSPEND:
/*
* return DDI_FAILURE for now.
*/
return (DDI_FAILURE);
default:
return (DDI_FAILURE);
}
if (dtrace_unregister(xdt_id) != 0)
return (DDI_FAILURE);
if (xdt_cyclic != CYCLIC_NONE)
if (xdt_kstats != NULL)
xdt_devi = (void *)0;
return (DDI_SUCCESS);
}
/*ARGSUSED*/
static int
{
int error;
switch (infocmd) {
case DDI_INFO_DEVT2DEVINFO:
error = DDI_SUCCESS;
break;
case DDI_INFO_DEVT2INSTANCE:
*result = (void *)0;
error = DDI_SUCCESS;
break;
default:
error = DDI_FAILURE;
}
return (error);
}
static struct cb_ops xdt_cb_ops = {
nulldev, /* open(9E) */
nodev, /* close(9E) */
nodev, /* strategy(9E) */
nodev, /* print(9E) */
nodev, /* dump(9E) */
nodev, /* read(9E) */
nodev, /* write(9E) */
nodev, /* ioctl(9E) */
nodev, /* devmap(9E) */
nodev, /* mmap(9E) */
nodev, /* segmap(9E) */
nochpoll, /* chpoll(9E) */
ddi_prop_op, /* prop_op(9E) */
NULL, /* streamtab(9S) */
};
DEVO_REV, /* devo_rev */
0, /* devo_refcnt */
xdt_info, /* getinfo(9E) */
nulldev, /* identify(9E) */
nulldev, /* probe(9E) */
xdt_attach, /* attach(9E) */
xdt_detach, /* detach(9E) */
nulldev, /* devo_reset */
&xdt_cb_ops, /* devo_cb_ops */
NULL, /* devo_bus_ops */
NULL /* power(9E) */
};
"Hypervisor event tracing",
};
static struct modlinkage modlinkage = {
&modldrv,
};
/*
 * Loadable-module entry point: install this driver's module linkage.
 * Returns 0 on success or the error from mod_install(9F).
 */
int
_init(void)
{
	int rc;

	rc = mod_install(&modlinkage);
	return (rc);
}
/*
 * Loadable-module exit point: remove this driver's module linkage.
 * Returns 0 on success or the error from mod_remove(9F).
 */
int
_fini(void)
{
	int rc;

	rc = mod_remove(&modlinkage);
	return (rc);
}
int
{
}