/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/modctl.h>
#include <sys/dtrace.h>
#include <sys/kobj.h>
#include <sys/stat.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/conf.h>
/*
* Instruction bytes that fbt_provide_module() looks for while scanning
* function text.
*/
/*
* Frame-pointer push that marks an entry site (compared against the first
* byte on 32-bit x86; searched for anywhere in the function on amd64).
*/
#define FBT_PUSHL_EBP 0x55
/*
* The two accepted two-byte encodings of "movl %esp, %ebp" that must follow
* the push on 32-bit x86 (variant V0 is 0x8b 0xec, variant V1 is 0x89 0xe5).
*/
#define FBT_MOVL_ESP_EBP0_V0 0x8b
#define FBT_MOVL_ESP_EBP1_V0 0xec
#define FBT_MOVL_ESP_EBP0_V1 0x89
#define FBT_MOVL_ESP_EBP1_V1 0xe5
/*
* NOTE(review): not referenced in the visible code; presumably the REX
* prefix of "movq %rsp, %rbp" -- confirm before relying on it.
*/
#define FBT_REX_RSP_RBP 0x48
/*
* Bytes used to recognize return sites: on 32-bit x86 a one-byte popl/leave
* immediately followed by ret or ret imm16; on amd64 the ret itself.
*/
#define FBT_POPL_EBP 0x5d
#define FBT_RET 0xc3
#define FBT_RET_IMM16 0xc2
#define FBT_LEAVE 0xc9
/*
* Byte written over a patch point when a probe is enabled. On amd64 this is
* a breakpoint instruction. NOTE(review): the 32-bit value 0xf0 is presumably
* chosen to raise an invalid-opcode trap -- confirm.
*/
#ifdef __amd64
#define FBT_PATCHVAL 0xcc
#else
#define FBT_PATCHVAL 0xf0
#endif
/*
* Probe name components passed to dtrace_probe_create().
*/
#define FBT_ENTRY "entry"
#define FBT_RETURN "return"
/*
* Hash a patch-point address into an fbt_probetab bucket index: drop the low
* four bits of the address and mask with fbt_probetab_mask.
*/
#define FBT_ADDR2NDX(addr) ((((uintptr_t)(addr)) >> 4) & fbt_probetab_mask)
/*
* Default bucket count, used by fbt_attach() when fbt_probetab_size is zero.
*/
#define FBT_PROBETAB_SIZE 0x8000 /* 32k entries -- 128K total */
/*
* One instrumented site. An entry probe has a single fbt_probe_t; a return
* probe has one fbt_probe_t per return site in the function, linked through
* fbtp_next and all sharing the same fbtp_id.
*/
typedef struct fbt_probe {
struct fbt_probe *fbtp_hashnext; /* next probe in the same fbt_probetab bucket */
uint8_t *fbtp_patchpoint; /* address of the instruction byte that gets patched */
int8_t fbtp_rval; /* DTRACE_INVOP_* code fbt_invop() returns for this site */
uint8_t fbtp_patchval; /* byte written at the patch point when enabled */
uint8_t fbtp_savedval; /* original byte, restored when disabled */
uintptr_t fbtp_roffset; /* 0 for entry probes; else offset of the return site */
dtrace_id_t fbtp_id; /* probe id from dtrace_probe_create() */
char *fbtp_name; /* function name; points into the module's string table */
struct modctl *fbtp_ctl; /* module that contains the patch point */
int fbtp_loadcnt; /* ctl->mod_loadcnt at creation; detects module reloads */
int fbtp_symndx; /* index of the function in the module's symbol table */
int fbtp_primary; /* NOTE(review): never read or written in the visible code */
struct fbt_probe *fbtp_next; /* next return site belonging to the same probe */
} fbt_probe_t;
/* devinfo node recorded by fbt_attach() and reported by fbt_info(). */
static dev_info_t *fbt_devi;
/* Provider id filled in by dtrace_register() in fbt_attach(). */
static dtrace_provider_id_t fbt_id;
/* Hash table of all probes, indexed by FBT_ADDR2NDX(patch point). */
static fbt_probe_t **fbt_probetab;
/*
* Number of buckets in fbt_probetab; if still zero at attach time it is set
* to FBT_PROBETAB_SIZE. Because the index is formed by masking with
* (size - 1), a power-of-two value is expected.
*/
static int fbt_probetab_size;
/* fbt_probetab_size - 1, set in fbt_attach() and cleared in fbt_cleanup(). */
static int fbt_probetab_mask;
/* When non-zero, fbt_enable() logs why a probe could not be enabled. */
static int fbt_verbose = 0;
/*
 * Trap-time hook installed with dtrace_invop_add(). Looks up the probe whose
 * patch point is at addr and fires it.
 *
 *   addr   address of the instruction that trapped
 *   stack  pointer to the words on the stack at the time of the trap
 *   rval   value reported as the second argument of a return probe
 *
 * Returns the matching probe's fbtp_rval, or 0 if no probe is patched at
 * addr.
 */
static int
fbt_invop(uintptr_t addr, uintptr_t *stack, uintptr_t rval)
{
	fbt_probe_t *probe;

	for (probe = fbt_probetab[FBT_ADDR2NDX(addr)]; probe != NULL;
	    probe = probe->fbtp_hashnext) {
		uintptr_t arg0, arg1, arg2, arg3, arg4;
		uintptr_t *sp;

		if ((uintptr_t)probe->fbtp_patchpoint != addr)
			continue;

		if (probe->fbtp_roffset != 0) {
			/*
			 * Return probe: arg0 is the offset of the return
			 * site, arg1 is the return value.
			 */
#ifdef __amd64
			/*
			 * amd64 instruments the ret rather than the leave,
			 * so the caller has to be recorded explicitly for
			 * the top frame of a stack() action to be right.
			 */
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
			CPU->cpu_dtrace_caller = stack[0];
			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT |
			    CPU_DTRACE_BADADDR);
#endif
			dtrace_probe(probe->fbtp_id, probe->fbtp_roffset,
			    rval, 0, 0, 0);
			CPU->cpu_dtrace_caller = NULL;
			return (probe->fbtp_rval);
		}

		/*
		 * Entry probe: pull the first five arguments off the stack.
		 * The reads may run past the end of the stack, so they are
		 * bracketed by NOFAULT; that is safe because interrupts are
		 * already disabled here.
		 */
		sp = stack;
		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
		CPU->cpu_dtrace_caller = *sp++;
#ifdef __amd64
		/*
		 * On amd64 stack[0] is the dereferenced stack pointer and
		 * stack[1] / stack[2] hold savfp / savpc; skip those two.
		 */
		sp += 2;
#endif
		arg0 = *sp++;
		arg1 = *sp++;
		arg2 = *sp++;
		arg3 = *sp++;
		arg4 = *sp++;
		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);

		dtrace_probe(probe->fbtp_id, arg0, arg1, arg2, arg3, arg4);
		CPU->cpu_dtrace_caller = NULL;
		return (probe->fbtp_rval);
	}

	return (0);
}
/*
* Provider callback that creates FBT probes for one kernel module.
*
* Walks the module's symbol table and, for every eligible function, creates
* an "entry" probe at the frame-pointer push and a "return" probe covering
* each return site found by disassembling the function body. Every site is
* recorded in fbt_probetab and counted in mp->fbt_nentries.
*
* Nothing is created for the dtrace module itself, for modules that list
* dtrace among their requisites, for kmdbmod, for modules whose string or
* symbol table is not available, or for modules that already have FBT
* entries.
*
* arg is unused; ctl is the module to instrument.
*/
/*ARGSUSED*/
static void
fbt_provide_module(void *arg, struct modctl *ctl)
{
struct module *mp = ctl->mod_mp;
char *str = mp->strings;
int nsyms = mp->nsyms;
Shdr *symhdr = mp->symhdr;
char *modname = ctl->mod_modname;
char *name;
fbt_probe_t *fbt, *retfbt;
size_t symsize;
int i, size;
/*
* Employees of dtrace and their families are ineligible. Void
* where prohibited.
*/
if (strcmp(modname, "dtrace") == 0)
return;
if (ctl->mod_requisites != NULL) {
struct modctl_list *list;
list = (struct modctl_list *)ctl->mod_requisites;
for (; list != NULL; list = list->modl_next) {
if (strcmp(list->modl_modp->mod_modname, "dtrace") == 0)
return;
}
}
/*
* KMDB is ineligible for instrumentation -- it may execute in
* any context, including probe context.
*/
if (strcmp(modname, "kmdbmod") == 0)
return;
if (str == NULL || symhdr == NULL || symhdr->sh_addr == NULL) {
/*
* If this module doesn't (yet) have its string or symbol
* table allocated, clear out.
*/
return;
}
symsize = symhdr->sh_entsize;
if (mp->fbt_nentries) {
/*
* This module has some FBT entries allocated; we're afraid
* to screw with it.
*/
return;
}
/*
* Examine every symbol except index 0 (the loop starts at 1). Each
* "continue" below -- including those after the "again" label --
* moves on to the next symbol.
*/
for (i = 1; i < nsyms; i++) {
uint8_t *instr, *limit;
Sym *sym = (Sym *)(symhdr->sh_addr + i * symsize);
int j;
if (ELF_ST_TYPE(sym->st_info) != STT_FUNC)
continue;
/*
* Weak symbols are not candidates. This could be made to
* work (where weak functions and their underlying function
* appear as two disjoint probes), but it's not simple.
*/
if (ELF_ST_BIND(sym->st_info) == STB_WEAK)
continue;
name = str + sym->st_name;
if (strstr(name, "dtrace_") == name &&
strstr(name, "dtrace_safe_") != name) {
/*
* Anything beginning with "dtrace_" may be called
* from probe context unless it explicitly indicates
* that it won't be called from probe context by
* using the prefix "dtrace_safe_".
*/
continue;
}
if (strstr(name, "kdi_") == name ||
strstr(name, "_kdi_") != NULL) {
/*
* Any function name beginning with "kdi_" or
* containing the string "_kdi_" is a part of the
* kernel debugger interface and may be called in
* arbitrary context -- including probe context.
*/
continue;
}
/*
* Due to 4524008, _init and _fini may have a bloated st_size.
* While this bug was fixed quite some time ago, old drivers
* may be lurking. We need to develop a better solution to
* this problem, such that correct _init and _fini functions
* (the vast majority) may be correctly traced. One solution
* may be to scan through the entire symbol table to see if
* any symbol overlaps with _init. If none does, set a bit in
* the module structure that this module has correct _init and
* _fini sizes. This will cause some pain the first time a
* module is scanned, but at least it would be O(N) instead of
* O(N log N)...
*/
if (strcmp(name, "_init") == 0)
continue;
if (strcmp(name, "_fini") == 0)
continue;
/*
* In order to be eligible, the function must begin with the
* following sequence:
*
* pushl %ebp
* movl %esp, %ebp
*
* Note that there are two variants of encodings that generate
* the movl; we must check for both. For 64-bit, we would
* normally insist that a function begin with the following
* sequence:
*
* pushq %rbp
* movq %rsp, %rbp
*
* However, the compiler for 64-bit often splits these two
* instructions -- and the first instruction in the function
* is often not the pushq. As a result, on 64-bit we look
* for any "pushq %rbp" in the function and we instrument
* this with a breakpoint instruction.
*/
instr = (uint8_t *)sym->st_value;
limit = (uint8_t *)(sym->st_value + sym->st_size);
#ifdef __amd64
while (instr < limit) {
if (*instr == FBT_PUSHL_EBP)
break;
if ((size = dtrace_instr_size(instr)) <= 0)
break;
instr += size;
}
if (instr >= limit || *instr != FBT_PUSHL_EBP) {
/*
* We either don't save the frame pointer in this
* function, or we ran into some disassembly
* screw-up. Either way, we bail.
*/
continue;
}
#else
if (instr[0] != FBT_PUSHL_EBP)
continue;
if (!(instr[1] == FBT_MOVL_ESP_EBP0_V0 &&
instr[2] == FBT_MOVL_ESP_EBP1_V0) &&
!(instr[1] == FBT_MOVL_ESP_EBP0_V1 &&
instr[2] == FBT_MOVL_ESP_EBP1_V1))
continue;
#endif
/*
* Create the entry probe at the push and add it to the head of its
* hash bucket. fbtp_roffset stays zero (from kmem_zalloc), which is
* how an entry probe is told apart from a return probe.
*/
fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP);
fbt->fbtp_name = name;
fbt->fbtp_id = dtrace_probe_create(fbt_id, modname,
name, FBT_ENTRY, 3, fbt);
fbt->fbtp_patchpoint = instr;
fbt->fbtp_ctl = ctl;
fbt->fbtp_loadcnt = ctl->mod_loadcnt;
fbt->fbtp_rval = DTRACE_INVOP_PUSHL_EBP;
fbt->fbtp_savedval = *instr;
fbt->fbtp_patchval = FBT_PATCHVAL;
fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
fbt->fbtp_symndx = i;
fbt_probetab[FBT_ADDR2NDX(instr)] = fbt;
mp->fbt_nentries++;
/*
* Now look for return sites, starting at the entry patch point and
* stepping one instruction at a time. retfbt tracks the most recent
* return site so that later ones can be chained onto it.
*/
retfbt = NULL;
again:
if (instr >= limit)
continue;
/*
* If this disassembly fails, then we've likely walked off into
* a jump table or some other unsuitable area. Bail out of the
* disassembly now.
*/
if ((size = dtrace_instr_size(instr)) <= 0)
continue;
#ifdef __amd64
/*
* We only instrument "ret" on amd64 -- we don't yet instrument
* ret imm16, largely because the compiler doesn't seem to
* (yet) emit them in the kernel...
*/
if (*instr != FBT_RET) {
instr += size;
goto again;
}
#else
if (!(size == 1 &&
(*instr == FBT_POPL_EBP || *instr == FBT_LEAVE) &&
(*(instr + 1) == FBT_RET ||
*(instr + 1) == FBT_RET_IMM16))) {
instr += size;
goto again;
}
#endif
/*
* We (desperately) want to avoid erroneously instrumenting a
* jump table, especially given that our markers are pretty
* short: two bytes on x86, and just one byte on amd64. To
* determine if we're looking at a true instruction sequence
* or an inline jump table that happens to contain the same
* byte sequences, we resort to some heuristic sleaze: we
* treat this instruction as being contained within a pointer,
* and see if that pointer points to within the body of the
* function. If it does, we refuse to instrument it.
*/
for (j = 0; j < sizeof (uintptr_t); j++) {
uintptr_t check = (uintptr_t)instr - j;
uint8_t *ptr;
if (check < sym->st_value)
break;
if (check + sizeof (uintptr_t) > (uintptr_t)limit)
continue;
ptr = *(uint8_t **)check;
if (ptr >= (uint8_t *)sym->st_value && ptr < limit) {
instr += size;
goto again;
}
}
/*
* We have a winner!
*/
fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP);
fbt->fbtp_name = name;
/*
* The first return site creates the return probe; every later site in
* the same function is chained through fbtp_next and reuses its id.
*/
if (retfbt == NULL) {
fbt->fbtp_id = dtrace_probe_create(fbt_id, modname,
name, FBT_RETURN, 3, fbt);
} else {
retfbt->fbtp_next = fbt;
fbt->fbtp_id = retfbt->fbtp_id;
}
retfbt = fbt;
fbt->fbtp_patchpoint = instr;
fbt->fbtp_ctl = ctl;
fbt->fbtp_loadcnt = ctl->mod_loadcnt;
#ifndef __amd64
if (*instr == FBT_POPL_EBP) {
fbt->fbtp_rval = DTRACE_INVOP_POPL_EBP;
} else {
ASSERT(*instr == FBT_LEAVE);
fbt->fbtp_rval = DTRACE_INVOP_LEAVE;
}
/*
* The patched popl/leave is one byte long and is followed directly by
* the ret, so the "+ 1" makes the recorded offset that of the ret.
*/
fbt->fbtp_roffset =
(uintptr_t)(instr - (uint8_t *)sym->st_value) + 1;
#else
ASSERT(*instr == FBT_RET);
fbt->fbtp_rval = DTRACE_INVOP_RET;
fbt->fbtp_roffset =
(uintptr_t)(instr - (uint8_t *)sym->st_value);
#endif
fbt->fbtp_savedval = *instr;
fbt->fbtp_patchval = FBT_PATCHVAL;
fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
fbt->fbtp_symndx = i;
fbt_probetab[FBT_ADDR2NDX(instr)] = fbt;
mp->fbt_nentries++;
instr += size;
goto again;
}
}
/*
 * Destroy callback (last slot of fbt_pops). Frees the probe passed in parg
 * together with every further site chained to it through fbtp_next,
 * unlinking each one from fbt_probetab first.
 *
 * For each site freed, the owning module's fbt_nentries count is decremented
 * provided the module is still loaded and has not been reloaded since the
 * probe was created. arg and id are unused.
 */
/*ARGSUSED*/
static void
fbt_destroy(void *arg, dtrace_id_t id, void *parg)
{
	fbt_probe_t *probe = parg;
	struct modctl *ctl = probe->fbtp_ctl;

	while (probe != NULL) {
		fbt_probe_t *following = probe->fbtp_next;
		fbt_probe_t **linkp;

		if (ctl != NULL && ctl->mod_loadcnt == probe->fbtp_loadcnt &&
		    ctl->mod_loaded) {
			((struct module *)(ctl->mod_mp))->fbt_nentries--;
		}

		/*
		 * Find the link (bucket head or a predecessor's
		 * fbtp_hashnext) that points at this probe and splice the
		 * probe out of the hash chain.
		 */
		linkp = &fbt_probetab[FBT_ADDR2NDX(probe->fbtp_patchpoint)];
		while (*linkp != probe) {
			ASSERT(*linkp != NULL);
			linkp = &(*linkp)->fbtp_hashnext;
		}
		*linkp = probe->fbtp_hashnext;

		kmem_free(probe, sizeof (fbt_probe_t));
		probe = following;
	}
}
/*
 * Enable callback. Bumps the owning module's mod_nenabled count and then
 * writes the patch byte over every site of the probe in parg.
 *
 * Nothing is patched if the module is no longer loaded or has been reloaded
 * since the probe was created; with fbt_verbose set, the reason is logged.
 * Always returns 0. arg and id are unused.
 */
/*ARGSUSED*/
static int
fbt_enable(void *arg, dtrace_id_t id, void *parg)
{
	fbt_probe_t *site = parg;
	struct modctl *owner = site->fbtp_ctl;

	owner->mod_nenabled++;

	if (!owner->mod_loaded) {
		if (fbt_verbose) {
			cmn_err(CE_NOTE, "fbt is failing for probe %s "
			    "(module %s unloaded)",
			    site->fbtp_name, owner->mod_modname);
		}
		return (0);
	}

	/*
	 * A load count that differs from the one recorded at creation means
	 * the module was unloaded and reloaded; leave its text alone.
	 */
	if (owner->mod_loadcnt != site->fbtp_loadcnt) {
		if (fbt_verbose) {
			cmn_err(CE_NOTE, "fbt is failing for probe %s "
			    "(module %s reloaded)",
			    site->fbtp_name, owner->mod_modname);
		}
		return (0);
	}

	while (site != NULL) {
		*site->fbtp_patchpoint = site->fbtp_patchval;
		site = site->fbtp_next;
	}

	return (0);
}
/*
 * Disable callback. Drops the owning module's mod_nenabled count and, if the
 * module is still loaded and has not been reloaded, restores the saved byte
 * at every site of the probe in parg. arg and id are unused.
 */
/*ARGSUSED*/
static void
fbt_disable(void *arg, dtrace_id_t id, void *parg)
{
	fbt_probe_t *site = parg;
	struct modctl *owner = site->fbtp_ctl;

	ASSERT(owner->mod_nenabled > 0);
	owner->mod_nenabled--;

	if (owner->mod_loaded && owner->mod_loadcnt == site->fbtp_loadcnt) {
		while (site != NULL) {
			*site->fbtp_patchpoint = site->fbtp_savedval;
			site = site->fbtp_next;
		}
	}
}
/*
 * Suspend callback. Restores the saved byte at every site of the probe in
 * parg without touching the module's mod_nenabled count. Does nothing if the
 * module is unloaded or has been reloaded. arg and id are unused.
 */
/*ARGSUSED*/
static void
fbt_suspend(void *arg, dtrace_id_t id, void *parg)
{
	fbt_probe_t *site = parg;
	struct modctl *owner = site->fbtp_ctl;

	ASSERT(owner->mod_nenabled > 0);

	if (owner->mod_loaded && owner->mod_loadcnt == site->fbtp_loadcnt) {
		while (site != NULL) {
			*site->fbtp_patchpoint = site->fbtp_savedval;
			site = site->fbtp_next;
		}
	}
}
/*
 * Resume callback. Writes the patch byte back at every site of the probe in
 * parg without touching the module's mod_nenabled count. Does nothing if the
 * module is unloaded or has been reloaded. arg and id are unused.
 */
/*ARGSUSED*/
static void
fbt_resume(void *arg, dtrace_id_t id, void *parg)
{
	fbt_probe_t *site = parg;
	struct modctl *owner = site->fbtp_ctl;

	ASSERT(owner->mod_nenabled > 0);

	if (owner->mod_loaded && owner->mod_loadcnt == site->fbtp_loadcnt) {
		while (site != NULL) {
			*site->fbtp_patchpoint = site->fbtp_patchval;
			site = site->fbtp_next;
		}
	}
}
/*
 * Argument-description callback. Fills desc->dtargd_native with the C type
 * name of argument desc->dtargd_ndx of the probe in parg, using the CTF data
 * of the probe's module (importing the CTF parent container if there is
 * one).
 *
 * For a return probe, argument 0 is always reported as "int" and argument 1
 * is the function's return type; no other index is valid. For an entry
 * probe, the index selects one of the function's arguments.
 *
 * On any failure -- module unloaded or reloaded, no CTF data, parent not
 * found, index out of range -- desc->dtargd_ndx is set to DTRACE_ARGNONE.
 * arg and id are unused.
 */
/*ARGSUSED*/
static void
fbt_getargdesc(void *arg, dtrace_id_t id, void *parg, dtrace_argdesc_t *desc)
{
	fbt_probe_t *fbt = parg;
	struct modctl *ctl = fbt->fbtp_ctl;
	struct module *mp = ctl->mod_mp;
	ctf_file_t *fp = NULL, *pfp;
	ctf_funcinfo_t f;
	int error;
	ctf_id_t argv[32], type;
	int argc = sizeof (argv) / sizeof (ctf_id_t);
	const char *parent;

	if (!ctl->mod_loaded || (ctl->mod_loadcnt != fbt->fbtp_loadcnt))
		goto err;

	if (fbt->fbtp_roffset != 0 && desc->dtargd_ndx == 0) {
		(void) strcpy(desc->dtargd_native, "int");
		return;
	}

	if ((fp = ctf_modopen(mp, &error)) == NULL) {
		/*
		 * We have no CTF information for this module -- and therefore
		 * no args[] information.
		 */
		goto err;
	}

	/*
	 * If we have a parent container, we must manually import it.
	 */
	if ((parent = ctf_parent_name(fp)) != NULL) {
		struct modctl *cur = &modules;
		struct modctl *mod = NULL;

		/*
		 * We must iterate over all modules to find the module that
		 * is our parent.
		 */
		do {
			if (strcmp(cur->mod_modname, parent) == 0) {
				mod = cur;
				break;
			}
		} while ((cur = cur->mod_next) != &modules);

		if (mod == NULL)
			goto err;

		if ((pfp = ctf_modopen(mod->mod_mp, &error)) == NULL) {
			goto err;
		}

		if (ctf_import(fp, pfp) != 0) {
			ctf_close(pfp);
			goto err;
		}

		ctf_close(pfp);
	}

	if (ctf_func_info(fp, fbt->fbtp_symndx, &f) == CTF_ERR)
		goto err;

	if (fbt->fbtp_roffset != 0) {
		/*
		 * Index 0 was handled above, so 1 (the return value) is the
		 * only index left that is valid. Reject everything else,
		 * negative values included, rather than relying on an ASSERT
		 * that is compiled out of non-DEBUG kernels.
		 */
		if (desc->dtargd_ndx != 1)
			goto err;

		type = f.ctc_return;
	} else {
		/*
		 * argv[] holds at most argc entries, so the index must be
		 * checked against that bound as well as against the
		 * function's own argument count; otherwise a large or
		 * negative index would read outside the array.
		 */
		if (desc->dtargd_ndx < 0 || desc->dtargd_ndx >= argc)
			goto err;

		if (desc->dtargd_ndx + 1 > f.ctc_argc)
			goto err;

		if (ctf_func_args(fp, fbt->fbtp_symndx, argc, argv) == CTF_ERR)
			goto err;

		type = argv[desc->dtargd_ndx];
	}

	if (ctf_type_name(fp, type, desc->dtargd_native,
	    DTRACE_ARGTYPELEN) != NULL) {
		ctf_close(fp);
		return;
	}

err:
	if (fp != NULL)
		ctf_close(fp);

	desc->dtargd_ndx = DTRACE_ARGNONE;
}
/*
* Stability attributes handed to dtrace_register(). Each row is a
* (name stability, data stability, dependency class) triple.
* NOTE(review): the rows presumably describe, in order, the provider, module,
* function, name and arguments -- confirm against the dtrace_pattr_t
* definition.
*/
static dtrace_pattr_t fbt_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
};
/*
* Provider operations vector handed to dtrace_register(). Slots left NULL are
* operations this provider does not implement.
* NOTE(review): the labels on the NULL slots are presumed from their position
* -- confirm against the dtrace_pops_t definition.
*/
static dtrace_pops_t fbt_pops = {
NULL, /* presumably the per-description provide hook */
fbt_provide_module, /* create probes for a module */
fbt_enable, /* patch probe sites */
fbt_disable, /* restore probe sites */
fbt_suspend, /* temporarily restore probe sites */
fbt_resume, /* re-patch probe sites */
fbt_getargdesc, /* describe an argument's type */
NULL, /* presumably the argument-value hook */
NULL, /* presumably the user-mode hook */
fbt_destroy /* free a probe */
};
/*
 * Undo the work of fbt_attach(): unhook fbt_invop(), remove the minor node
 * and release the probe hash table. Used both on detach and when attach
 * fails part-way through.
 */
static void
fbt_cleanup(dev_info_t *devi)
{
	/* Unhook the trap handler before the table it walks goes away. */
	dtrace_invop_remove(fbt_invop);

	ddi_remove_minor_node(devi, NULL);

	kmem_free(fbt_probetab, fbt_probetab_size * sizeof (fbt_probe_t *));
	fbt_probetab_mask = 0;
	fbt_probetab = NULL;
}
/*
 * attach(9E) entry point. For DDI_ATTACH, allocates the probe hash table,
 * hooks fbt_invop() into the invalid-opcode path, creates the "fbt" minor
 * node and registers the provider with DTrace. DDI_RESUME succeeds without
 * doing anything; every other command fails.
 *
 * Returns DDI_SUCCESS or DDI_FAILURE. On failure after allocation, all
 * partial state is torn down with fbt_cleanup().
 */
static int
fbt_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	if (cmd == DDI_RESUME)
		return (DDI_SUCCESS);

	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	/* A zero size means "not tuned"; fall back to the default. */
	if (fbt_probetab_size == 0)
		fbt_probetab_size = FBT_PROBETAB_SIZE;

	fbt_probetab_mask = fbt_probetab_size - 1;
	fbt_probetab =
	    kmem_zalloc(fbt_probetab_size * sizeof (fbt_probe_t *), KM_SLEEP);

	dtrace_invop_add(fbt_invop);

	if (ddi_create_minor_node(devi, "fbt", S_IFCHR, 0,
	    DDI_PSEUDO, NULL) == DDI_FAILURE) {
		fbt_cleanup(devi);
		return (DDI_FAILURE);
	}

	if (dtrace_register("fbt", &fbt_attr, DTRACE_PRIV_KERNEL, NULL,
	    &fbt_pops, NULL, &fbt_id) != 0) {
		fbt_cleanup(devi);
		return (DDI_FAILURE);
	}

	ddi_report_dev(devi);
	fbt_devi = devi;

	return (DDI_SUCCESS);
}
/*
 * detach(9E) entry point. For DDI_DETACH, unregisters the provider and, if
 * that succeeds, releases everything fbt_attach() set up. DDI_SUSPEND
 * succeeds without doing anything; every other command fails.
 *
 * Returns DDI_SUCCESS, or DDI_FAILURE if the command is unsupported or
 * dtrace_unregister() refuses.
 */
static int
fbt_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
{
	if (cmd == DDI_SUSPEND)
		return (DDI_SUCCESS);

	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	if (dtrace_unregister(fbt_id) != 0)
		return (DDI_FAILURE);

	fbt_cleanup(devi);

	return (DDI_SUCCESS);
}
/*
 * getinfo(9E) entry point. DDI_INFO_DEVT2DEVINFO stores the devinfo node
 * saved at attach time in *result; DDI_INFO_DEVT2INSTANCE stores instance 0.
 * Any other command fails and leaves *result untouched. dip and arg are
 * unused.
 */
/*ARGSUSED*/
static int
fbt_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)fbt_devi;
		return (DDI_SUCCESS);

	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}
}
/*
* open entry point for the fbt minor node. Does no work and always
* succeeds; all arguments are unused.
*/
/*ARGSUSED*/
static int
fbt_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
{
return (0);
}
/*
* Character/block entry points for the fbt device. Only open is backed by a
* function in this file; the remaining slots use the nodev / nulldev /
* nochpoll / ddi_prop_op stand-ins.
*/
static struct cb_ops fbt_cb_ops = {
fbt_open, /* open */
nodev, /* close */
nulldev, /* strategy */
nulldev, /* print */
nodev, /* dump */
nodev, /* read */
nodev, /* write */
nodev, /* ioctl */
nodev, /* devmap */
nodev, /* mmap */
nodev, /* segmap */
nochpoll, /* poll */
ddi_prop_op, /* cb_prop_op */
0, /* streamtab */
D_NEW | D_MP /* Driver compatibility flag */
};
/*
* Device operations for the fbt pseudo driver: getinfo, attach and detach
* are implemented in this file, and fbt_cb_ops supplies the character/block
* entry points.
*/
static struct dev_ops fbt_ops = {
DEVO_REV, /* devo_rev */
0, /* refcnt */
fbt_info, /* get_dev_info */
nulldev, /* identify */
nulldev, /* probe */
fbt_attach, /* attach */
fbt_detach, /* detach */
nodev, /* reset */
&fbt_cb_ops, /* driver operations */
NULL, /* bus operations */
nodev, /* dev power */
ddi_quiesce_not_needed, /* quiesce */
};
/*
* Module linkage information for the kernel: describes this module as a
* driver and ties it to fbt_ops.
*/
static struct modldrv modldrv = {
&mod_driverops, /* module type (this is a pseudo driver) */
"Function Boundary Tracing", /* name of module */
&fbt_ops, /* driver ops */
};
/*
* Linkage structure passed to mod_install(), mod_info() and mod_remove();
* its only element is modldrv, followed by the NULL terminator.
*/
static struct modlinkage modlinkage = {
MODREV_1,
(void *)&modldrv,
NULL
};
/*
* Module load entry point: installs the module via mod_install() and
* returns its result.
*/
int
_init(void)
{
return (mod_install(&modlinkage));
}
/*
* Module information entry point: passes modinfop to mod_info() and returns
* its result.
*/
int
_info(struct modinfo *modinfop)
{
return (mod_info(&modlinkage, modinfop));
}
/*
* Module unload entry point: removes the module via mod_remove() and
* returns its result.
*/
int
_fini(void)
{
return (mod_remove(&modlinkage));
}