devfs_subr.c revision 67027fa743be690176c02e0e021c95d48f81ba62
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* miscellaneous routines for the devfs
*/
#include <sys/sysmacros.h>
#include <sys/pathname.h>
#ifdef DEBUG
/*
 * Debug setting for devfs. Only defined in DEBUG builds; starts out as 0.
 * NOTE(review): presumably consulted by the dcmn_err*() debug macros --
 * confirm against their definitions.
 */
int devfs_debug = 0x0;
#endif
/*
 * The constant string "devfs".
 * NOTE(review): presumably the "%s" prefix passed to the dcmn_err5()
 * messages in this file -- confirm against the callers.
 */
const char dvnm[] = "devfs";
/*
* The devfs_clean_key is taken during a devfs_clean operation: it is used to
* prevent unnecessary code execution and for detection of potential deadlocks.
*/
/* prototype memory vattrs */
/*
 * Prototype attributes for a devfs directory node: type VDIR, mode
 * DV_DIRMODE_DEFAULT, owner DV_UID_DEFAULT/DV_GID_DEFAULT, and every
 * remaining field zero.
 *
 * NOTE(review): this is a positional initializer, so the per-field labels
 * are only correct if they track the declaration order of vattr_t --
 * confirm against the structure definition.
 */
vattr_t dv_vattr_dir = {
VDIR, /* va_type */
DV_DIRMODE_DEFAULT, /* va_mode */
DV_UID_DEFAULT, /* va_uid */
DV_GID_DEFAULT, /* va_gid */
0, /* va_fsid */
0, /* va_nodeid */
0, /* va_nlink */
0, /* va_size */
0, /* va_atime */
0, /* va_mtime */
0, /* va_ctime */
0, /* va_rdev */
0, /* va_blksize */
0, /* va_nblocks */
0, /* va_seq */
};
/*
 * Prototype attributes for a device node with default permissions: mode
 * DV_DEVMODE_DEFAULT, owner DV_UID_DEFAULT/DV_GID_DEFAULT. The type is
 * left as 0 here and every remaining field is zero. This template is
 * copied wholesale into a caller's vattr (*vap = dv_vattr_file) when
 * default permissions are wanted.
 *
 * NOTE(review): this is a positional initializer, so the per-field labels
 * are only correct if they track the declaration order of vattr_t --
 * confirm against the structure definition.
 */
vattr_t dv_vattr_file = {
0, /* va_type */
DV_DEVMODE_DEFAULT, /* va_mode */
DV_UID_DEFAULT, /* va_uid */
DV_GID_DEFAULT, /* va_gid */
0, /* va_fsid */
0, /* va_nodeid */
0, /* va_nlink */
0, /* va_size */
0, /* va_atime */
0, /* va_mtime */
0, /* va_ctime */
0, /* va_rdev */
0, /* va_blksize */
0, /* va_nblocks */
0, /* va_seq */
};
/*
 * Prototype attributes for a device node using the privileged default
 * mode: identical to dv_vattr_file except that the mode is
 * DV_DEVMODE_PRIV. The type is left as 0 and every remaining field is
 * zero. This template is copied wholesale into a caller's vattr
 * (*vap = dv_vattr_priv).
 *
 * NOTE(review): this is a positional initializer, so the per-field labels
 * are only correct if they track the declaration order of vattr_t --
 * confirm against the structure definition.
 */
vattr_t dv_vattr_priv = {
0, /* va_type */
DV_DEVMODE_PRIV, /* va_mode */
DV_UID_DEFAULT, /* va_uid */
DV_GID_DEFAULT, /* va_gid */
0, /* va_fsid */
0, /* va_nodeid */
0, /* va_nlink */
0, /* va_size */
0, /* va_atime */
0, /* va_mtime */
0, /* va_ctime */
0, /* va_rdev */
0, /* va_blksize */
0, /* va_nblocks */
0, /* va_seq */
};
extern dev_info_t *clone_dip;
extern major_t clone_major;
/* dv_node node constructor for kmem cache */
static int
{
return (-1);
}
return (0);
}
/* dv_node node destructor for kmem cache */
static void
{
vn_invalid(vp);
}
/* initialize dv_node node cache */
void
{
}
/* destroy dv_node node cache */
void
{
}
/*
* dv_mkino - Generate a unique inode number for devfs nodes.
*
* Although ino_t is 64 bits, the inode number is truncated to 32 bits for 32
* bit non-LARGEFILE applications. This means that there is a requirement to
* maintain the inode number as a 32 bit value or applications will have
* stat(2) calls fail with EOVERFLOW. We form a 32 bit inode number from the
* dev_t, but a minor number larger than L_MAXMIN32 has extra minor bits that
* would need to be folded into the high bits of the inode number (see below).
*
* To generate inode numbers for directories, we assume that we will never use
* more than half the major space - this allows for ~8190 drivers. We use this
* upper major number space to allocate inode numbers for directories by
* encoding the major and instance into this space.
*
* We also skew the result so that inode 2 is reserved for the root of the file
* system.
*
* As part of the future support for 64-bit dev_t APIs, the upper minor bits
* should be folded into the high inode bits by adding the following code
* after "ino |= 1":
*
* #if (L_BITSMINOR32 != L_BITSMINOR)
* |* fold overflow minor bits into high bits of inode number *|
* ino |= ((ino_t)(minor >> L_BITSMINOR32)) << L_BITSMINOR;
* #endif |* (L_BITSMINOR32 != L_BITSMINOR) *|
*
* This way only applications that use devices that overflow their minor
* space will have an application level impact.
*/
static ino_t
{
static int warn;
/* makedevice32 in high half of major number space */
} else {
/* makedevice32 */
/* make ino for VCHR different than VBLK */
ino <<= 1;
ino |= 1;
}
/*
* diagnose things a little early because adding the skew to a large
* minor number could roll over the major.
*/
warn = 1;
}
return (ino);
}
/*
* Compare two nodes lexicographically to order the avl tree
*/
static int
{
int rv;
return (0);
}
/*
* dv_mkroot
*
* Build the first VDIR dv_node.
*/
struct dv_node *
{
dcmn_err3(("dv_mkroot\n"));
dv->dv_namelen = 0;
dv->dv_dflt_mode = 0;
(int (*)(const void *, const void *))dv_compare_nodes,
return (dv);
}
/*
* dv_mkdir
*
* Given a probed or attached nexus node, create a VDIR dv_node.
* No dv_attrvp is created at this point.
*/
struct dv_node *
{
dv->dv_dflt_mode = 0;
(int (*)(const void *, const void *))dv_compare_nodes,
return (dv);
}
/*
* dv_mknod
*
* Given a minor node, create a VCHR or VBLK dv_node.
* No dv_attrvp is created at this point.
*/
static struct dv_node *
struct ddi_minor_data *dmd)
{
/* increment dev_ref with devi_lock held */
/*
* Minors created with ddi_create_priv_minor_node can specify
* a default mode permission other than the devfs default.
*/
dcmn_err5(("%s: dv_mknod default priv mode 0%o\n",
}
return (dv);
}
/*
* dv_destroy
*
* Destroy what we created in dv_mkdir or dv_mknod.
* In the case of a *referenced* directory, do nothing.
*/
void
{
/*
* We may be asked to unlink referenced directories.
* In this case, there is nothing to be done.
* The eventual memory free will be done in
* devfs_inactive.
*/
return;
}
}
}
}
}
/*
* Find and hold dv_node by name
*/
static struct dv_node *
{
if (dv) {
return (dv);
}
return (NULL);
}
/*
* Inserts a new dv_node in a parent directory
*/
void
{
} else {
}
/* enter node in the avl tree */
}
/*
* Unlink a dv_node from a parent directory
*/
void
{
/* verify linkage of arguments */
} else {
}
/* remove from avl tree */
}
/*
* Merge devfs node specific information into an attribute structure.
*
* NOTE: specfs provides ATIME,MTIME,CTIME,SIZE,BLKSIZE,NBLOCKS on leaf node.
*/
void
{
} else {
/* don't trust the shadow file type */
else
}
}
/*
* Get default device permission by consulting rules in
* privilege specification in minor node and /etc/minor_perm.
*
* This function is called from the devname filesystem to get default
* permissions for a device exported to a non-global zone.
*/
void
{
/* If vp isn't a dv_node, return something sensible */
if (no_fs_perm)
*no_fs_perm = 0;
*vap = dv_vattr_file;
return;
}
/*
* For minors not created by ddi_create_priv_minor_node(),
* use devfs defaults.
*/
*vap = dv_vattr_dir;
if (no_fs_perm)
*no_fs_perm = 1;
*vap = dv_vattr_priv;
} else {
/*
* look up perm bits from minor_perm
*/
*vap = dv_vattr_file;
dcmn_err5(("%s: minor perm mode 0%o\n",
dcmn_err5(("%s: priv mode 0%o\n",
}
}
}
/*
* dv_shadow_node
*
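* Given a VDIR dv_node, find/create the associated directory
* node in the shadow attribute filesystem.
*
* Given a VCHR/VBLK dv_node, find the associated
* node in the shadow attribute filesystem. These nodes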
* are only created to persist non-default attributes.
* Lack of such a node implies the default permissions
* are sufficient.
*
* Managing the attribute file entries is slightly tricky (mostly
* because we can't intercept VN_HOLD and VN_RELE except on the last
* release).
*
* We assert that if the dv_attrvp pointer is non-NULL, it points
* to a singly-held (by us) vnode that represents the shadow entry
* in the underlying filesystem. To avoid store-ordering issues,
* we assert that the pointer can only be tested under the dv_contents
* READERS lock.
*/
void
char *nm, /* name component */
int flags) /* optionally create shadow node */
{
int create_tried;
int error;
dcmn_err3(("dv_shadow_node: name %s attr %p\n",
if ((flags & DV_SHADOW_WRITE_HELD) == 0) {
return;
return;
}
}
} else {
return;
}
create_tried = 0;
/* factor out the snode since we only want the attribute node */
}
} else
/*
* All we want is the permissions (and maybe ACLs and
* extended attributes), and we want to perform lookups
* by name. Drivers occasionally change their minor
* number space. If something changes, there's not
* much we can do about it here.
*/
/* The shadow node checks out. We are done */
if (error == 0) {
/*
* Determine if we have non-trivial ACLs on this node.
* It is not necessary to VOP_RWLOCK since fs_acl_nontrivial
* only does VOP_GETSECATTR.
*/
/*
* If we have synced out the memory attributes, free
* them and switch back to using the persistent store.
*/
}
if ((flags & DV_SHADOW_WRITE_HELD) == 0)
return;
}
/*
* Failed to find attribute in persistent backing store,
* get default permission bits.
*/
/*
* Try to create shadow dir. This is necessary in case
* we need to create a shadow leaf node later, when user
* executes chmod.
*/
case VDIR:
create_tried = 1;
break;
case VCHR:
case VBLK:
/*
* Shadow nodes are only created on demand
*/
if (flags & DV_SHADOW_CREATE) {
create_tried = 1;
}
break;
default:
/*NOTREACHED*/
}
if (create_tried &&
goto lookup;
}
}
/* Store attribute in memory */
}
if ((flags & DV_SHADOW_WRITE_HELD) == 0)
}
/*
* Given a devinfo node, and a name, returns the appropriate
* minor information for that named node, if it exists.
*/
static int
{
struct ddi_minor_data *dmd;
/*
* Skip alias nodes and nodes without a name.
*/
continue;
dcmn_err4(("dv_find_leafnode: (%s,%s)\n",
if (r_mi->ddm_node_priv)
return (0);
}
}
return (ENOENT);
}
/*
* Special handling for clone node:
* Clone minor name is a driver name, the minor number will
* be the major number of the driver. There is no minor
* node under the clone driver, so we'll manufacture the
* dev_t.
*/
static struct dv_node *
{
char *devnm;
struct ddi_minor_data *dmd;
/*
* Make sure drvname is a STREAMS driver. We load the driver,
* but don't attach to any instances. This makes stat(2)
* relatively cheap.
*/
if (major == DDI_MAJOR_T_NONE)
return (NULL);
return (NULL);
return (NULL);
}
return (dvp);
}
/*
* Given the parent directory node, and a name in it, returns the
* named dv_node to the caller (as a vnode).
*
* (We need pnp and rdir for doing shadow lookups; they can be NULL)
*/
int
{
extern int isminiroot; /* see modctl.c */
int circ;
char *mnm;
struct ddi_minor_data *dmd;
return (ESTALE);
}
/*
* Empty name or ., return node itself.
*/
return (0);
}
/*
* .., return the parent directory
*/
return (0);
}
/*
* Fail anything without a valid device name component
*/
return (ENOENT);
}
/*
* So, now we have to deal with the trickier stuff.
*
* (a) search the existing list of dv_nodes on this directory
*/
if (tsd_get(devfs_clean_key)) {
return (EBUSY);
}
}
/*
* Common case - we already have attributes
*/
goto found;
}
/*
* No attribute vp, try and build one.
*
* dv_shadow_node() can briefly drop &dv->dv_contents lock
* if it is unable to upgrade it to a write lock. If the
* current thread has come in through the bottom-up device
* configuration devfs_clean() path, we may deadlock against
* a thread performing top-down device configuration if it
* grabs the contents lock. To avoid this, when we are on the
* devfs_clean() path we attempt to upgrade the dv_contents
* lock before we call dv_shadow_node().
*/
if (tsd_get(devfs_clean_key)) {
return (EBUSY);
}
}
goto found;
}
/*
* (b) Search the child devinfo nodes of our parent directory,
* looking for the named node. If we find it, build a new
* node, then grab the writers lock and search the directory
* again; if it's still not there, insert it.
*
* We drop the devfs locks before accessing the device tree.
* Take care to mark the node BUSY so that a forced devfs_clean
* doesn't mark the directory node stale.
*
* Also, check if we are called as part of devfs_clean or
* reset_perm. If so, simply return not found because there
* is nothing to clean.
*/
if (tsd_get(devfs_clean_key)) {
return (ENOENT);
}
/*
* We could be either READ or WRITE locked at
* this point. Upgrade if we are read locked.
*/
/*
* Things may have changed when we dropped
* the contents lock, so start from top again
*/
goto start;
}
was_busy++;
if (mnm)
*mnm = (char)0;
/*
* Configure one nexus child, will call nexus's bus_ops
* If successful, devi is held upon returning.
* Note: devfs lookup should not be configuring grandchildren.
*/
if (mnm)
*mnm = ':';
if (rv != NDI_SUCCESS) {
goto notfound;
}
/*
* If we configured a hidden node, consider it notfound.
*/
if (ndi_dev_is_hidden_node(devi)) {
goto notfound;
}
/*
* Don't make vhci clients visible under phci, unless we
* are in miniroot.
*/
goto notfound;
}
/*
* Invalidate cache to notice newly created minor nodes.
*/
/*
* mkdir for nexus drivers and leaf nodes as well. If we are racing
* and create a duplicate, the duplicate will be destroyed below.
*/
} else {
/*
* Allocate dmd first to avoid KM_SLEEP with active
* ndi_devi_enter.
*/
/*
* For clone minors, load the driver indicated by
* minor name.
*/
} else {
/*
* Find minor node and make a dv_node
*/
if (dmd->ddm_node_priv)
}
}
}
/*
* Release hold from ndi_devi_config_one()
*/
goto notfound;
}
/*
* We have released the dv_contents lock, need to check
* if another thread already created a duplicate node
*/
} else {
/*
* Duplicate found, use the existing node
*/
dv_destroy(dv, 0);
}
goto founddv;
/*NOTREACHED*/
/*
* Fail lookup of device that has now become hidden (typically via
* hot removal of open device).
*/
goto notfound;
}
/*
* Skip non-kernel lookups of internal nodes.
* This use of kcred to distinguish between user and
* internal kernel lookups is unfortunate. The information
* provided by the seg argument to lookupnameat should
* evolve into a lookup flag for filesystems that need
* this distinction.
*/
goto notfound;
}
/*
* If vnode is a device, return special vnode instead
* (though it knows all about -us- via sp->s_realvp,
* sp->s_devvp, and sp->s_dip)
*/
} else
if (was_busy)
return (rv);
}
/*
* The given directory node is out-of-date; that is, it has been
* marked as needing to be rebuilt, possibly because some new devinfo
* node has come into existence, or possibly because this is the first
* time we've been here.
*/
void
{
struct ddi_minor_data *dmd;
char devnm[MAXNAMELEN];
return;
}
/*
* While we know enough to create a directory at DS_INITIALIZED,
* the directory will be empty until DS_ATTACHED. The existence
* of an empty directory dv_node will cause a devi_ref, which
* interferes with DR type operations - making devfs_clean
* coordination even more sensitive and error prone. Given this,
* the 'continue' below is checking for DS_ATTACHED instead of
* DS_INITIALIZED.
*/
continue;
/* skip hidden nodes */
if (ndi_dev_is_hidden_node(devi))
continue;
char *addr;
/*
* Skip alias nodes, internal nodes, and nodes
* without a name. We allow DDM_DEFAULT nodes
* to appear in readdir.
*/
continue;
else
/* dv_node already exists */
continue;
}
}
/* directory doesn't exist */
}
}
}
/*
* Given a directory node, clean out all the nodes beneath.
*
* VDIR: Reinvoke to clean them, then delete the directory.
* VCHR, VBLK: Just blow them away.
*
* Mark the directories touched as in need of a rebuild, in case
* we fall over part way through. When DV_CLEAN_FORCE is specified,
* we mark referenced empty directories as stale to facilitate DR.
*/
int
{
int busy = 0;
/*
* We should always be holding the devfs_clean_key here: dv_cleandir()
* will be called as a result of a devfs_clean request and the
* devfs_clean_key will be set either in devfs_clean() itself or in
* devfs_clean_vhci().
*
* Since we are on the devfs_clean path, we return EBUSY if we cannot
* get the contents lock: if we blocked here we might deadlock against
* a thread performing top-down device configuration.
*/
if (!(flags & DV_CLEANDIR_LCK) &&
return (EBUSY);
/*
* If devnm is specified, the non-minor portion of the
* name must match devnm.
*/
if (devnm &&
continue;
/* check type of what we are cleaning */
/* recurse on directories */
goto set_busy;
}
/* A clean directory is an empty directory... */
/*
* ... but an empty directory can still have
* references to it. If we have dv_busy or
* DV_CLEAN_FORCE is *not* specified then a
* referenced directory is considered busy.
*/
goto set_busy;
}
/*
* Mark referenced directory stale so that DR
* will succeed (even if a shell has a
* VN_HOLD reference to an empty directory).
*/
}
} else {
goto set_busy;
}
}
/* unlink from directory */
/* drop locks */
/* destroy vnode if ref count is zero */
continue;
/*
* If devnm is not NULL we return immediately on busy,
* otherwise we continue destroying unused dv_node's.
*/
if (devnm)
break;
}
/*
* This code may be invoked to inform devfs that a new node has
* been created in the kernel device tree. So we always set
* the DV_BUILD flag to allow the next dv_filldir() to pick
* the new devinfo nodes.
*/
if (!(flags & DV_CLEANDIR_LCK))
}
/*
* Walk through the devfs hierarchy, correcting the permissions of
* devices with default permissions that do not match those specified
* by minor perm. This can only be done for all drivers for now.
*/
static int
{
int retval = 0;
char *nm;
int error = 0;
}
} else {
/*
* Check for permissions from minor_perm
* If there are none, we're done
*/
continue;
/*
* Allow a node's permissions to be altered
* permanently from the defaults by chmod,
* using the shadow node as backing store.
* Otherwise, update node to minor_perm permissions.
*/
/*
* No attribute vp, try to find one.
*/
}
continue;
}
dcmn_err5(("%s: no perm change: "
continue;
}
}
dcmn_err5(("%s: perm %d/%d/0%o -> %d/%d/0%o (%d)\n",
}
if (error != 0) {
}
}
return (retval);
}
int
{
int rval;
return (0);
return (rval);
}
/*
* Clean up dangling devfs shadow nodes for removed
* drivers so that, in the event the driver is re-added
* to the system, newly created nodes won't incorrectly
* pick up these stale shadow node permissions.
*
* This is accomplished by walking down the pathname
* to the directory, starting at the root's attribute
* node, then removing all minors matching the specified
* node name. Care must be taken to remove all entries
* in a directory before the directory itself, so that
* the clean-up associated with rem_drv'ing a nexus driver
* does not inadvertently result in an inconsistent
* filesystem underlying devfs.
*/
static int
{
int error;
int eof;
int ndirents = 64;
char *nm;
uio.uio_loffset = 0;
eof = 0;
error = 0;
break;
continue;
("rem_drv %s/%s lookup (%d)\n",
if (error)
continue;
if (error == 0) {
("rem_drv %s/%s rmdir (%d)\n",
}
} else {
NULL, 0);
("rem_drv %s/%s remove (%d)\n",
}
if (error) {
goto exit;
}
}
}
exit:
return (error);
}
int
{
int error;
int eof;
int ndirents = 64;
char *nm;
return (0);
pn_skipslash(&pn);
while (pn_pathleft(&pn)) {
if (error) {
dcmn_err5(("remdrv_cleanup %s lookup error %d\n",
return (0);
}
pn_skipslash(&pn);
}
uio.uio_loffset = 0;
eof = 0;
error = 0;
break;
continue;
continue;
("rem_drv %s/%s lookup (%d)\n",
if (error)
continue;
if (error == 0) {
("rem_drv %s/%s rmdir (%d)\n",
}
} else {
NULL, 0);
("rem_drv %s/%s remove (%d)\n",
}
if (error)
goto exit;
}
}
exit:
return (0);
}
struct dv_list {
};
void
char *devnm,
void *arg)
{
int len;
dcmn_err3(("dv_walk: ddv = %s, devnm = %s\n",
/*
* If devnm is not NULL and is not the empty string,
* select only dv_nodes with matching non-minor name
*/
continue;
continue;
if (tail)
else
}
while (head) {
}
}