zone.c revision e76e762ef75f893b9c9cd50e3212110e2dce7d6f
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* Zones
*
* A zone is a named collection of processes, namespace constraints,
* and other system resources which comprise a secure and manageable
* application containment facility.
*
* Zones (represented by the reference counted zone_t) are tracked in
* the kernel in the zonehash. Elsewhere in the kernel, Zone IDs
* (zoneid_t) are used to track zone association. Zone IDs are
* dynamically generated when the zone is created; if a persistent
* identifier is needed (core files, accounting logs, audit trail,
* etc.), the zone name should be used.
*
*
* Global Zone:
*
* The global zone (zoneid 0) is automatically associated with all
* system resources that have not been bound to a user-created zone.
* This means that even systems where zones are not in active use
* have a global zone, and all processes, mounts, etc. are
* associated with that zone. The global zone is generally
* unconstrained in terms of privileges and access, though the usual
* credential and privilege based restrictions apply.
*
*
* Zone States:
*
* The states a zone may be in, and the transitions between them, are
* as follows:
*
* ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
* initialized zone is added to the list of active zones on the system but
* isn't accessible.
*
* ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
* ready. The zone is made visible after the ZSD constructor callbacks are
* executed. A zone remains in this state until it transitions into
* the ZONE_IS_BOOTING state as a result of a call to zone_boot().
*
* ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
* init. Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
* state.
*
* ZONE_IS_RUNNING: The zone is open for business: zsched has
* successfully started init. A zone remains in this state until
* zone_shutdown() is called.
*
* ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
* killing all processes running in the zone. The zone remains
* in this state until there are no more user processes running in the zone.
* zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
* Since zone_shutdown() is restartable, it may be called successfully
* multiple times for the same zone_t. Setting of the zone's state to
* ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
* the zone's status without worrying about it being a moving target.
*
* ZONE_IS_EMPTY: zone_shutdown() has been called, and there
* are no more user processes in the zone. The zone remains in this
* state until there are no more kernel threads associated with the
* zone. zone_create(), zone_enter(), and zone_destroy() on this zone will
* fail.
*
* ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
* have exited. zone_shutdown() returns. Henceforth it is not possible to
* join the zone or create kernel threads therein.
*
* ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
* remains in this state until zsched exits. Calls to zone_find_by_*()
* return NULL from now on.
*
* ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0). There are no
* processes or threads doing work on behalf of the zone. The zone is
* removed from the list of active zones. zone_destroy() returns, and
* the zone can be recreated.
*
* ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
* callbacks are executed, and all memory associated with the zone is
* freed.
*
* Threads can wait for the zone to enter a requested state by using
* zone_status_wait() or zone_status_timedwait() with the desired
* state passed in as an argument. Zone state transitions are
* uni-directional; it is not possible to move back to an earlier state.
*
*
* Zone-Specific Data:
*
* Subsystems needing to maintain zone-specific data can store that
* data using the ZSD mechanism. This provides a zone-specific data
* store, similar to thread-specific data (see pthread_getspecific(3C)).
* ZSD can also be used to register callbacks to be invoked when a zone
* is created, shut down, or destroyed. This can be used to initialize
* zone-specific data for new zones and to clean up when zones go away.
*
*
* Data Structures:
*
* The per-zone structure (zone_t) is reference counted, and freed
* when all references are released. zone_hold and zone_rele can be
* used to adjust the reference count. In addition, reference counts
* associated with the cred_t structure are tracked separately using
* zone_cred_hold and zone_cred_rele.
*
* Pointers to active zone_t's are stored in two hash tables; one
* for searching by id, the other for searching by name. Lookups
* can be performed on either basis, using zone_find_by_id and
* zone_find_by_name. Both return zone_t pointers with the zone
* held, so zone_rele should be called when the pointer is no longer
* needed. Zones can also be searched by path; zone_find_by_path
* returns the zone with which a path name is associated (global
* zone if the path is not within some other zone's file system
* hierarchy). This currently requires iterating through each zone,
* so it is slower than an id or name search via a hash table.
*
*
* Locking:
*
* zonehash_lock: This is a top-level global lock used to protect the
* zone hash tables and lists. Zones cannot be created or destroyed
* while this lock is held.
* zone_status_lock: This is a global lock protecting zone state.
* Zones cannot change state while this lock is held. It also
* protects the list of kernel threads associated with a zone.
* zone_lock: This is a per-zone lock used to protect several fields of
* the zone_t. In addition, holding this lock means that the zone
* cannot go away.
* zone_nlwps_lock: This is a per-zone lock used to protect the fields
* related to the zone.max-lwps rctl.
* zone_mem_lock: This is a per-zone lock used to protect the fields
* related to the zone.max-locked-memory and zone.max-swap rctls.
* zsd_key_lock: This is a global lock protecting the key state for ZSD.
* zone_deathrow_lock: This is a global lock protecting the "deathrow"
* list (a list of zones in the ZONE_IS_DEAD state).
*
* Ordering requirements:
* pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
* zone_lock --> zsd_key_lock --> pidlock --> p_lock
*
* When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
* zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
* zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
*
* Blocking memory allocations are permitted while holding any of the
* zone locks.
*
*
* System Call Interface:
*
* The zone subsystem can be managed and queried from user level with
* the following system calls (all subcodes of the primary "zone"
* system call):
* - zone_create: creates a zone with selected attributes (name,
* root path, privileges, resource controls, ZFS datasets)
* - zone_enter: allows the current process to enter a zone
* - zone_getattr: reports attributes of a zone
* - zone_setattr: set attributes of a zone
* - zone_boot: set 'init' running for the zone
* - zone_list: lists all zones active in the system
* - zone_lookup: looks up zone id based on name
* - zone_shutdown: initiates shutdown process (see states above)
* - zone_destroy: completes shutdown process (see states above)
*
*/
#include <sys/priv_impl.h>
#include <sys/pathname.h>
#include <sys/sysevent.h>
#include <sys/systeminfo.h>
#include <sys/cred_impl.h>
#include <sys/contract_impl.h>
#include <sys/pool_pset.h>
#include <sys/sysmacros.h>
#include <sys/ipc_impl.h>
/*
* cv used to signal that all references to the zone have been released. This
* needs to be global since there may be multiple waiters, and the first to
* wake up will free the zone_t, hence we cannot use zone->zone_cv.
*/
static kcondvar_t zone_destroy_cv;
/*
* Lock used to serialize access to zone_cv. This could have been per-zone,
* but then we'd need another lock for zone_destroy_cv, and why bother?
*/
static kmutex_t zone_status_lock;
/*
* ZSD-related global variables.
*/
/*
* The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
*/
static zone_key_t zsd_keyval = 0;
/*
* Global list of registered keys. We use this when a new zone is created.
*/
static list_t zsd_registered_keys;
int zone_hash_size = 256;
static kmutex_t zonehash_lock;
static id_space_t *zoneid_space;
/*
* The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
* kernel proper runs, and which manages all other zones.
*
* Although not declared as static, the variable "zone0" should not be used
* except by code that needs to reference the global zone early on in boot,
* before it is fully initialized. All other consumers should use
* 'global_zone'.
*/
/*
* List of active zones, protected by zonehash_lock.
*/
static list_t zone_active;
/*
* List of destroyed zones that still have outstanding cred references.
* Used for debugging. Uses a separate lock to avoid lock ordering
* problems in zone_free.
*/
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;
/* number of zones is limited by virtual interface limit in IP */
/* Event channel to send zone state change notifications */
/*
* This table holds the mapping from kernel zone states to
* states visible in the state notification API.
* The idea is that we only expose "obvious" states and
* do not expose states which are just implementation details.
*/
/*
* Indexed by the kernel zone status value (looked up as
* zone_status_table[status]); entries are listed in state order, from
* uninitialized through dead. Several kernel states share one event
* name: booting is reported as ready; empty, down and dying are
* reported as shutting down; dead is reported as uninitialized.
*/
const char *zone_status_table[] = {
ZONE_EVENT_UNINITIALIZED, /* uninitialized */
ZONE_EVENT_READY, /* ready */
ZONE_EVENT_READY, /* booting */
ZONE_EVENT_RUNNING, /* running */
ZONE_EVENT_SHUTTING_DOWN, /* shutting_down */
ZONE_EVENT_SHUTTING_DOWN, /* empty */
ZONE_EVENT_SHUTTING_DOWN, /* down */
ZONE_EVENT_SHUTTING_DOWN, /* dying */
ZONE_EVENT_UNINITIALIZED, /* dead */
};
/*
* This isn't static so lint doesn't complain.
*/
/*
* Synchronization primitives used to synchronize between mounts and zone
* creation (see block_mounts()/resume_mounts() and
* mount_in_progress()/mount_completed()).
*
* mounts_in_progress is a signed counter: a positive value means mounts
* are in flight, a negative value means mounts have been blocked by
* (-mounts_in_progress) callers, and zero means neither side is active.
*
* NOTE(review): mount_cv and mount_lock are presumably the condition
* variable and mutex that guard the counter -- confirm against the
* four routines named above.
*/
static int mounts_in_progress;
static kcondvar_t mount_cv;
static kmutex_t mount_lock;
const char * const zone_default_initname = "/sbin/init";
static char * const zone_prefix = "/zone/";
static int zone_add_datalink(zoneid_t, char *);
static int zone_remove_datalink(zoneid_t, char *);
static int zone_check_datalink(zoneid_t *, char *);
static int zone_list_datalink(zoneid_t, int *, char *);
/*
* Bump this number when you alter the zone syscall interfaces; this is
* because we need to have support for previous API versions in libc
* to support patching; libc calls into the kernel to determine this number.
*
* Version 1 of the API is the version originally shipped with Solaris 10
* Version 2 alters the zone_create system call in order to support more
* arguments by moving the args into a structure; and to do better
* error reporting when zone_create() fails.
* Version 3 alters the zone_create system call in order to support the
* import of ZFS datasets to zones.
* Version 4 alters the zone_create system call in order to support
* Trusted Extensions.
* Version 5 alters the zone_boot system call, and converts its old
* bootargs parameter to be set by the zone_setattr API instead.
* Version 6 adds the flag argument to zone_create.
*/
static const int ZONE_SYSCALL_API_VERSION = 6;
/*
* Certain filesystems (such as NFS and autofs) need to know which zone
* the mount is being placed in. Because of this, we need to be able to
* ensure that a zone isn't in the process of being created such that
* nfs_mount() thinks it is in the global zone, while by the time it
* gets added to the list of mounted zones, it ends up on zoneA's mount
* list.
*
* The following functions: block_mounts()/resume_mounts() and
* mount_in_progress()/mount_completed() are used by zones and the VFS
* layer (respectively) to synchronize zone creation and new mounts.
*
* The semantics are like a reader-reader lock such that there may
* either be multiple mounts (or zone creations, if that weren't
* serialized by zonehash_lock) in progress at the same time, but not
* both.
*
* We use cv's so the user can ctrl-C out of the operation if it's
* taking too long.
*
* The semantics are such that there is unfair bias towards the
* "current" operation. This means that zone creations may starve if
* there is a rapid succession of new mounts coming in to the system, or
* there is a remote possibility that zones will be created at such a
* rate that new mounts will not be able to proceed.
*/
/*
* Prevent new mounts from progressing to the point of calling
* VFS_MOUNT(). If there are already mounts in this "region", wait for
* them to complete.
*
* Returns 1 on the normal path. retval is initialized to 0, which is
* presumably what the interrupted ("signaled") path returns -- that
* label is not part of this function as it stands; TODO confirm.
*/
static int
block_mounts(void)
{
int retval = 0;
/*
* Since it may block for a long time, block_mounts() shouldn't be
* called with zonehash_lock held.
*/
/* A positive count means the VFS layer still has mounts in flight. */
while (mounts_in_progress > 0) {
goto signaled;
}
/*
* A negative value of mounts_in_progress indicates that mounts
* have been blocked by (-mounts_in_progress) different callers.
*/
retval = 1;
return (retval);
}
/*
* The VFS layer may progress with new mounts as far as we're concerned.
* Allow them to progress if we were the last obstacle.
*
* Counterpart of block_mounts().
*/
static void
resume_mounts(void)
{
/*
* mounts_in_progress is negative while mounts are blocked; climbing
* back to exactly zero means the last blocker has just gone away.
*/
if (++mounts_in_progress == 0)
}
/*
* The VFS layer is busy with a mount; zones should wait until all
* mounts are completed to progress.
*
* Counterpart of mount_completed().
*/
void
mount_in_progress(void)
{
/* A negative count means mounts are currently blocked by zones. */
while (mounts_in_progress < 0)
}
/*
* VFS is done with one mount; wake up any waiting block_mounts()
* callers if this is the last mount.
*
* Counterpart of mount_in_progress().
*/
void
mount_completed(void)
{
/* Dropping to exactly zero means no mount is outstanding any more. */
if (--mounts_in_progress == 0)
}
/*
* ZSD routines.
*
* Zone Specific Data (ZSD) is modeled after Thread Specific Data as
* defined by the pthread_key_create() and related interfaces.
*
* Kernel subsystems may register a key together with callbacks to be
* executed when a zone is created, shut down, or destroyed.
*
* Unlike the thread counterpart, destructor callbacks will be executed
* even if the data pointer is NULL and/or there are no constructor
* callbacks, so it is the responsibility of such callbacks to check for
* NULL data values if necessary.
*
* The locking strategy and overall picture is as follows:
*
* When someone calls zone_key_create(), a template ZSD entry is added to the
* global list "zsd_registered_keys", protected by zsd_key_lock. The
* constructor callback is called immediately on all existing zones, and a
* copy of the ZSD entry added to the per-zone zone_zsd list (protected by
* zone_lock). As this operation requires the list of zones, the list of
* registered keys, and the per-zone list of ZSD entries to remain constant
* throughout the entire operation, it must grab zonehash_lock, zone_lock for
* all existing zones, and zsd_key_lock, in that order. Similar locking is
* needed when zone_key_delete() is called. It is thus sufficient to hold
* zsd_key_lock *or* zone_lock to prevent additions to or removals from the
* per-zone zone_zsd list.
*
* Note that this implementation does not make a copy of the ZSD entry if a
* constructor callback is not provided. A zone_getspecific() on such an
* uninitialized ZSD entry will return NULL.
*
* When new zones are created constructor callbacks for all registered ZSD
* entries will be called.
*
* The framework does not provide any locking around zone_getspecific() and
* zone_setspecific() apart from that needed for internal consistency, so
* callers interested in atomic "test-and-set" semantics will need to provide
* their own locking.
*/
void
{
struct zsd_entry *t;
ASSERT(zsd_keyval != 0);
t = kmem_alloc(sizeof (*t), KM_SLEEP);
t->zsd_create = create;
t->zsd_shutdown = shutdown;
t->zsd_destroy = destroy;
}
}
}
/*
* Helper function to find the zsd_entry associated with the key in the
* given list.
*/
static struct zsd_entry *
{
/*
* Move to head of list to keep list in MRU order.
*/
list_remove(l, zsd);
list_insert_head(l, zsd);
}
return (zsd);
}
}
return (NULL);
}
/*
* Function called when a module is being unloaded, or otherwise wishes
* to unregister its ZSD key and callbacks.
*/
int
{
goto notfound;
void *data;
} else {
}
if (zsdp->zsd_shutdown)
if (zsdp->zsd_destroy)
}
}
return (0);
return (-1);
}
/*
* ZSD counterpart of pthread_setspecific().
*/
int
{
struct zsd_entry *t;
if (t != NULL) {
/*
* Replace old value with new
*/
return (0);
}
/*
* If there was no previous value, go through the list of registered
* keys.
*
* We avoid grabbing zsd_key_lock until we are sure we need it; this is
* necessary for shutdown callbacks to be able to execute without fear
* of deadlock.
*/
return (-1);
}
/*
* Add a zsd_entry to this zone, using the template we just retrieved
* to initialize the constructor and destructor(s).
*/
t = kmem_alloc(sizeof (*t), KM_SLEEP);
return (0);
}
/*
* ZSD counterpart of pthread_getspecific().
*/
void *
{
struct zsd_entry *t;
void *data;
return (data);
}
/*
* Function used to initialize a zone's list of ZSD callbacks and data
* when the zone is being created. The callbacks are initialized from
* the template list (zsd_registered_keys), and the constructor
* callback executed (if one exists).
*
* This is called before the zone is made publicly available, hence no
* need to grab zone_lock.
*
* Although we grab and release zsd_key_lock, new entries cannot be
* added to or removed from the zsd_registered_keys list until we
* release zonehash_lock, so there isn't a window for a
* zone_key_create() to come in after we've dropped zsd_key_lock but
* before the zone is added to the zone list, such that the constructor
* callbacks aren't executed for the new zone.
*/
static void
{
struct zsd_entry *t;
t = kmem_alloc(sizeof (*t), KM_SLEEP);
}
}
}
/*
* Helper function to execute shutdown or destructor callbacks.
*/
static void
{
struct zsd_entry *t;
if (ct == ZSD_DESTROY) {
/*
* Make sure destructors are only called once.
*/
return;
}
}
/*
* Both zsd_key_lock and zone_lock need to be held in order to add or
* remove a ZSD key, (either globally as part of
* zone_key_create()/zone_key_delete(), or on a per-zone basis, as is
* possible through zone_setspecific()), so it's sufficient to hold
* zsd_key_lock here.
*
* This is a good thing, since we don't want to recursively try to grab
* zone_lock if a callback attempts to do something like a crfree() or
* zone_rele().
*/
/* Skip if no callbacks registered */
continue;
continue;
/*
* Call the callback with the zone-specific data if we can find
* any, otherwise with NULL.
*/
if (t != NULL) {
if (ct == ZSD_SHUTDOWN) {
} else {
}
} else {
if (ct == ZSD_SHUTDOWN) {
} else {
}
}
}
}
/*
* Called when the zone is going away; free ZSD-related memory, and
* destroy the zone_zsd list.
*/
static void
{
/*
* Free all the zsd_entry's we had on this zone.
*/
kmem_free(t, sizeof (*t));
}
}
/*
* Frees memory associated with the zone dataset list.
*/
static void
{
zone_dataset_t *t, *next;
kmem_free(t, sizeof (*t));
}
}
/*
* zone.cpu-shares resource control support.
*/
/*ARGSUSED*/
static rctl_qty_t
{
return (p->p_zone->zone_shares);
}
/*ARGSUSED*/
static int
{
return (0);
return (0);
}
static rctl_ops_t zone_cpu_shares_ops = {
};
/*
* zone.cpu-cap resource control support.
*/
/*ARGSUSED*/
static rctl_qty_t
{
return (cpucaps_zone_get(p->p_zone));
}
/*ARGSUSED*/
static int
{
return (0);
/*
* set cap to the new value.
*/
}
static rctl_ops_t zone_cpu_cap_ops = {
};
/*ARGSUSED*/
static rctl_qty_t
{
return (nlwps);
}
/*ARGSUSED*/
static int
{
return (0);
return (1);
return (0);
}
/*ARGSUSED*/
static int
{
return (0);
return (0);
}
static rctl_ops_t zone_lwps_ops = {
};
/*ARGSUSED*/
static int
{
rctl_qty_t v;
return (1);
return (0);
}
static rctl_ops_t zone_shmmax_ops = {
};
/*ARGSUSED*/
static int
{
rctl_qty_t v;
return (1);
return (0);
}
static rctl_ops_t zone_shmmni_ops = {
};
/*ARGSUSED*/
static int
{
rctl_qty_t v;
return (1);
return (0);
}
static rctl_ops_t zone_semmni_ops = {
};
/*ARGSUSED*/
static int
{
rctl_qty_t v;
return (1);
return (0);
}
static rctl_ops_t zone_msgmni_ops = {
};
/*ARGSUSED*/
static rctl_qty_t
{
rctl_qty_t q;
q = p->p_zone->zone_locked_mem;
return (q);
}
/*ARGSUSED*/
static int
{
rctl_qty_t q;
zone_t *z;
q = z->zone_locked_mem;
return (1);
return (0);
}
/*ARGSUSED*/
static int
{
return (0);
return (0);
}
static rctl_ops_t zone_locked_mem_ops = {
};
/*ARGSUSED*/
static rctl_qty_t
{
rctl_qty_t q;
mutex_enter(&z->zone_mem_lock);
q = z->zone_max_swap;
mutex_exit(&z->zone_mem_lock);
return (q);
}
/*ARGSUSED*/
static int
{
rctl_qty_t q;
zone_t *z;
q = z->zone_max_swap;
return (1);
return (0);
}
/*ARGSUSED*/
static int
{
return (0);
return (0);
}
static rctl_ops_t zone_max_swap_ops = {
};
/*
* Helper function to brand the zone with a unique ID.
*/
static void
{
}
/*
* Returns a held pointer to the "kcred" for the specified zone.
*/
struct cred *
{
return (NULL);
return (cr);
}
static int
{
if (rw == KSTAT_WRITE)
return (EACCES);
return (0);
}
static int
{
if (rw == KSTAT_WRITE)
return (EACCES);
return (0);
}
static void
{
sizeof (zone_kstat_t) / sizeof (kstat_named_t),
return;
sizeof (zone_kstat_t) / sizeof (kstat_named_t),
return;
}
static void
{
void *data;
}
}
}
/*
* Called very early on in boot to initialize the ZSD list so that
* zone_key_create() can be called before zone_init(). It also initializes
* portions of zone0 which may be used before zone_init() is called. The
* variable "global_zone" will be set when zone0 is fully initialized by
* zone_init().
*/
void
zone_zsd_init(void)
{
/* Start zone0's LWP, locked-memory and shmmax fields at zero. */
zone0.zone_nlwps = 0;
zone0.zone_locked_mem = 0;
zone0.zone_shmmax = 0;
/*
* NOTE(review): a zero CPU count looks like the "use the system-wide
* ncpus/ncpus_online" setting -- the pset-id setter in this file
* resets both fields to 0 when pools are disabled. Confirm.
*/
zone0.zone_ncpus = 0;
zone0.zone_ncpus_online = 0;
/*
* The root filesystem is not mounted yet, so zone_rootvp cannot be set
* to anything meaningful. It is assigned to be 'rootdir' in
* vfs_mountroot().
*/
/*
* The global zone has all privileges
*/
/*
* Add p0 to the global zone
*/
}
/*
* Compute a hash value based on the contents of the label and the DOI. The
* hash algorithm is somewhat arbitrary, but is based on the observation that
* humans will likely pick labels that differ by amounts that work out to be
* multiples of the number of hash chains, and thus stirring in some primes
* should help.
*/
static uint_t
{
int i;
/* we depend on alignment of label, but not representation */
i = 1;
/* using 2^n + 1, 1 <= n <= 16 as source of many primes */
up++;
i++;
}
return (hash);
}
/*
* All that mod_hash cares about here is zero (equal) versus non-zero (not
* equal). This may need to be changed if less than / greater than is ever
* needed.
*/
static int
{
}
/*
* Called by main() to initialize the zones framework.
*
* Completes the setup of zone0 begun in zone_zsd_init() and, once that
* is done, publishes it through the global_zone pointer. Failure to
* set up the zone event channel is treated as fatal (panic).
*/
void
zone_init(void)
{
int res;
/*
* Create ID space for zone IDs. ID 0 is reserved for the
* global zone.
*/
/*
* Initialize generic zone resource controls, if any.
*/
/*
* System V IPC resource controls
*/
/*
* Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach
* this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
*/
/*
* Initialize the ``global zone''.
*/
set = rctl_set_create();
e.rcep_t = RCENTITY_ZONE;
gp);
/*
* pool_default hasn't been initialized yet, so we let pool_init()
* take care of making sure the global zone is in the default pool.
*/
/*
* Initialize global zone kstats
*/
/*
* Initialize zone label.
* mlp are initialized when tnzonecfg is loaded.
*/
/* Brand zone0 with its unique ID. */
zone_uniqid(&zone0);
/*
* maintain zonehashbylabel only for labeled systems
*/
if (is_system_labeled())
zonecount = 1;
(mod_hash_val_t)&zone0);
(mod_hash_val_t)&zone0);
if (is_system_labeled()) {
(void) mod_hash_insert(zonehashbylabel,
}
/*
* We avoid setting zone_kcred until now, since kcred is initialized
* sometime after zone_zsd_init() and before zone_init().
*/
/*
* The global zone is fully initialized (except for zone_rootvp which
* will be set when the root filesystem is mounted).
*/
global_zone = &zone0;
/*
* Setup an event channel to send zone status change notifications on
*/
/* A non-zero result is unrecoverable this early, hence the panic. */
if (res)
panic("Sysevent_evc_bind failed during zone setup.\n");
}
static void
{
/*
* Remove any zone caps.
*/
/* remove from deathrow list */
}
if (zone->zone_rootpath)
}
/*
* See block comment at the top of this file for information about zone
* status values.
*/
/*
* Convenience function for setting zone status.
*/
static void
{
zone_status_table[status]) ||
#ifdef DEBUG
(void) printf(
"Failed to allocate and send zone state change event.\n");
#endif
}
}
/*
* Public function to retrieve the zone status. The zone status may
* change after it is retrieved.
*/
{
return (zone->zone_status);
}
static int
{
int err = 0;
goto done; /* EFAULT or ENAMETOOLONG */
done:
return (err);
}
static int
{
struct brand_attr *attrp;
return (EFAULT);
}
return (EINVAL);
/*
* This is the only place where a zone can change its brand.
* We already need to hold zone_status_lock to check the zone
* status, so we'll just use that lock to serialize zone
* branding requests as well.
*/
/* Re-Branding is not allowed and the zone can't be booted yet */
if ((ZONE_IS_BRANDED(zone)) ||
return (EINVAL);
}
if (is_system_labeled() &&
return (EPERM);
}
return (0);
}
static int
{
char initname[INITNAME_SZ];
int err = 0;
return (err); /* EFAULT or ENAMETOOLONG */
return (0);
}
static int
{
int err = 0;
return (err);
}
static int
{
char sched_class[PC_CLNMSZ];
int err;
return (err); /* EFAULT or ENAMETOOLONG */
return (0);
}
/*
* Block indefinitely waiting for (zone_status >= status)
*/
void
{
}
}
/*
* Private CPR-safe version of zone_status_wait().
*/
static void
{
str);
}
/*
* zone_status_lock is implicitly released by the following.
*/
}
/*
* Block until zone enters requested state or signal is received. Return (0)
* if signaled, non-zero otherwise.
*/
int
{
return (0);
}
}
return (1);
}
/*
* Block until the zone enters the requested state or the timeout expires,
* whichever happens first. Return (-1) if operation timed out, time remaining
* otherwise.
*/
{
}
return (timeleft);
}
/*
* Block until the zone enters the requested state, the current process is
* signaled, or the timeout expires, whichever happens first. Return (-1) if
* operation timed out, 0 if signaled, time remaining otherwise.
*/
{
tim);
if (timeleft <= 0)
break;
}
return (timeleft);
}
/*
* Zones have two reference counts: one for references from credential
* structures (zone_cred_ref), and one (zone_ref) for everything else.
* This is so we can allow a zone to be rebooted while there are still
* outstanding cred references, since certain drivers cache dblks (which
* implicitly results in cached creds). We wait for zone_ref to drop to
* 0 (actually 1), but not zone_cred_ref. The zone structure itself is
* later freed when the zone_cred_ref drops to 0, though nothing other
* than the zone id and privilege set should be accessed once the zone
* is "dead".
*
* A debugging flag, zone_wait_for_cred, can be set to a non-zero value
* to make zone destruction also wait for zone_cred_ref to drop to 0.
* This can be useful to flush out other sources of cached creds that
* may be less innocuous than the driver case.
*/
int zone_wait_for_cred = 0;
static void
{
z->zone_ref++;
}
void
{
mutex_enter(&z->zone_lock);
zone_hold_locked(z);
mutex_exit(&z->zone_lock);
}
/*
* If the non-cred ref count drops to 1 and either the cred ref count
* is 0 or we aren't waiting for cred references, the zone is ready to
* be destroyed.
*/
void
{
mutex_enter(&z->zone_lock);
z->zone_ref--;
if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
/* no more refs, free the structure */
mutex_exit(&z->zone_lock);
zone_free(z);
return;
}
/* signal zone_destroy so the zone can finish halting */
mutex_exit(&z->zone_lock);
if (wakeup) {
/*
* Grabbing zonehash_lock here effectively synchronizes with
* zone_destroy() to avoid missed signals.
*/
}
}
/*
 * Take one credential reference on zone 'z'.
 *
 * zone_cred_ref is bumped while the per-zone zone_lock is held.  The
 * ASSERT checks that the count is non-zero after the increment, i.e.
 * that the increment did not wrap it around.
 */
void
zone_cred_hold(zone_t *z)
{
	kmutex_t *zlock = &z->zone_lock;

	mutex_enter(zlock);
	z->zone_cred_ref += 1;
	ASSERT(z->zone_cred_ref != 0);
	mutex_exit(zlock);
}
/*
* Drop one credential reference on zone 'z'.
*
* When this leaves both zone_ref and zone_cred_ref at zero, the zone_t
* is handed to zone_free() and the function returns immediately.
* Otherwise zone_lock is released and, if 'wakeup' is set, the
* zone_destroy() waiter is signaled.
*/
void
zone_cred_rele(zone_t *z)
{
mutex_enter(&z->zone_lock);
/* Catch an unbalanced release before the count is decremented. */
ASSERT(z->zone_cred_ref != 0);
z->zone_cred_ref--;
if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
/* no more refs, free the structure */
/* zone_lock is dropped before z is handed to zone_free(). */
mutex_exit(&z->zone_lock);
zone_free(z);
return;
}
/*
* If zone_destroy is waiting for the cred references to drain
* out, and they have, signal it.
*/
zone_status_get(z) >= ZONE_IS_DEAD);
mutex_exit(&z->zone_lock);
if (wakeup) {
/*
* Grabbing zonehash_lock here effectively synchronizes with
* zone_destroy() to avoid missed signals.
*/
}
}
/*
 * Account for one more task in zone 'z'.
 *
 * zone_ntasks is bumped while the per-zone zone_lock is held.  The
 * ASSERT checks that the count is non-zero after the increment, i.e.
 * that the increment did not wrap it around.
 */
void
zone_task_hold(zone_t *z)
{
	kmutex_t *zlock = &z->zone_lock;

	mutex_enter(zlock);
	z->zone_ntasks += 1;
	ASSERT(z->zone_ntasks != 0);
	mutex_exit(zlock);
}
void
{
return;
}
if (refcnt == 1) {
/*
* See if the zone is shutting down.
*/
goto out;
}
/*
* Make sure the ntasks didn't change since we
* dropped zone_lock.
*/
goto out;
}
/*
* No more user processes in the zone. The zone is empty.
*/
goto out;
}
/*
* zsched has exited; the zone is dead.
*/
out:
}
getzoneid(void)
{
}
/*
* Internal versions of zone_find_by_*(). These don't zone_hold() or
* check the validity of a zone's state.
*/
static zone_t *
{
return (zone);
}
static zone_t *
{
/*
* zonehashbylabel is not maintained for unlabeled systems
*/
if (!is_system_labeled())
return (NULL);
return (zone);
}
/*
* Internal, name-keyed version of zone_find_by_name(): returns the
* zone_t without doing a zone_hold() and without checking the validity
* of the zone's state.
*/
static zone_t *
zone_find_all_by_name(char *name)
{
return (zone);
}
/*
* Public interface for looking up a zone by zoneid. Only returns the zone if
* it is fully initialized, and has not yet begun the zone_destroy() sequence.
* Caller must call zone_rele() once it is done with the zone.
*
* The zone may begin the zone_destroy() sequence immediately after this
* function returns, but may be safely used until zone_rele() is called.
*/
zone_t *
{
return (NULL);
}
/*
* For all practical purposes the zone doesn't exist.
*/
return (NULL);
}
return (zone);
}
/*
* Similar to zone_find_by_id, but using zone label as the key.
*/
zone_t *
{
return (NULL);
}
if (status > ZONE_IS_DOWN) {
/*
* For all practical purposes the zone doesn't exist.
*/
return (NULL);
}
return (zone);
}
/*
* Similar to zone_find_by_id, but using zone name as the key.
*/
zone_t *
zone_find_by_name(char *name)
{
return (NULL);
}
/*
* For all practical purposes the zone doesn't exist.
*/
return (NULL);
}
return (zone);
}
/*
* Similar to zone_find_by_id(), using the path as a key. For instance,
* a path that lies within the file system hierarchy of zone "foo"
* yields zone "foo".
*
* zone_find_by_path() always returns a non-NULL value, since at the
* very least every path will be contained in the global zone.
*
* As with the other zone_find_by_*() functions, the caller is
* responsible for zone_rele()ing the return value of this function.
*/
zone_t *
zone_find_by_path(const char *path)
{
/*
* Call from rootconf().
*/
return (global_zone);
}
}
/*
* Zone practically doesn't exist.
*/
zret = global_zone;
}
return (zret);
}
/*
* Get the number of cpus visible to this zone. The system-wide global
* 'ncpus' is returned if pools are disabled, the caller is in the
* global zone, or a NULL zone argument is passed in.
*/
int
{
}
/*
* Get the number of online cpus visible to this zone. The system-wide
* global 'ncpus_online' is returned if pools are disabled, the caller
* is in the global zone, or a NULL zone argument is passed in.
*/
int
{
}
/*
* Return the pool to which the zone is currently bound.
*/
pool_t *
{
ASSERT(pool_lock_held());
}
/*
* Set the zone's pool pointer and update the zone's visibility to match
* the resources in the new pool.
*/
void
{
ASSERT(pool_lock_held());
}
/*
* Return the cached value of the id of the processor set to which the
* zone is currently bound. The value will be ZONE_PS_INVAL if the pools
* facility is disabled.
*/
{
return (zone->zone_psetid);
}
/*
* Set the cached value of the id of the processor set to which the zone
* is currently bound. Also update the zone's visibility to match the
* resources in the new processor set.
*/
void
{
return;
/*
* Global zone sees all.
*/
if (zone != global_zone) {
if (newpsetid != ZONE_PS_INVAL)
if (oldpsetid != ZONE_PS_INVAL)
}
/*
* Disabling pools, so we should start using the global values
* for ncpus and ncpus_online.
*/
if (newpsetid == ZONE_PS_INVAL) {
zone->zone_ncpus = 0;
zone->zone_ncpus_online = 0;
}
}
/*
* Walk the list of active zones and issue the provided callback for
* each of them.
*
* Caller must not be holding any locks that may be acquired under
* zonehash_lock. See comment at the beginning of the file for a list of
* common locks and their interactions with zones.
*/
int
{
int ret = 0;
/*
* Skip zones that shouldn't be externally visible.
*/
continue;
/*
* Bail immediately if any callback invocation returns a
* non-zero value.
*/
if (ret != 0)
break;
}
return (ret);
}
static int
{
int trycount;
int error = 0;
char *path;
return (error);
/* prevent infinite loop */
trycount = 10;
for (;;) {
if (--trycount <= 0) {
goto out;
}
/*
* VOP_ACCESS() may cover 'vp' with a new
* filesystem, if 'vp' is an autoFS vnode.
* Get the new 'vp' if so.
*/
(!vn_ismntpt(vp) ||
/* Success! */
break;
}
}
goto out;
}
return (0);
out:
return (error);
}
((c) >= 'a' && (c) <= 'z') || \
((c) >= 'A' && (c) <= 'Z'))
static int
{
int i, err;
return (err); /* EFAULT or ENAMETOOLONG */
}
/* must be less than ZONENAME_MAX */
return (EINVAL);
}
/*
* Name must start with an alphanumeric and must contain only
* alphanumerics, '-', '_' and '.'.
*/
return (EINVAL);
}
kname[i] != '.') {
return (EINVAL);
}
}
return (0);
}
/*
* Similar to thread_create(), but makes sure the thread is in the appropriate
* zone's zsched process (curproc->p_zone->zone_zsched) before returning.
*/
/*ARGSUSED*/
void (*proc)(),
void *arg,
{
kthread_t *t;
/*
* No-one should be trying to create threads if the zone is shutting
* down and there aren't any kernel threads around. See comment
* in zthread_exit().
*/
/*
* Create a thread, but don't let it run until we've finished setting
* things up.
*/
} else {
}
zone->zone_kthreads = t;
t->t_proc_flag |= TP_ZTHREAD;
project_rele(t->t_proj);
/*
* Setup complete, let it run.
*/
thread_lock(t);
t->t_schedflag |= TS_ALLSTART;
setrun_locked(t);
thread_unlock(t);
return (t);
}
/*
* Similar to thread_exit(). Must be called by threads created via
* zthread_create().
*/
void
zthread_exit(void)
{
/*
* Reparent to p0
*/
t->t_proc_flag &= ~TP_ZTHREAD;
hat_thread_exit(t);
if (t->t_back == t) {
/*
* If the zone is empty, once the thread count
* goes to zero no further kernel threads can be
* created. This is because if the creator is a process
* in the zone, then it must have exited before the zone
* state could be set to ZONE_IS_EMPTY.
* Otherwise, if the creator is a kernel thread in the
* zone, the thread count is non-zero.
*
* This really means that non-zone kernel threads should
* not create zone kernel threads.
*/
/*
* Remove any CPU caps on this zone.
*/
}
} else {
if (zone->zone_kthreads == t)
}
thread_exit();
/* NOTREACHED */
}
static void
{
/* we're going to hold a reference here to the directory */
#ifdef C2_AUDIT
if (audit_active) /* update abs cwd/root path see c2audit.c */
#endif
}
/*
* Convert an rctl value represented by an nvlist_t into an rctl_val_t.
*/
static int
{
const char *name;
return (EINVAL);
/*
* Currently only privileged values are allowed, but
* this may change in the future.
*/
if (ui64 != RCPRIV_PRIVILEGED)
return (EINVAL);
if (ui64 != RCTL_LOCAL_NOACTION &&
ui64 != RCTL_LOCAL_DENY)
return (EINVAL);
action_set = B_TRUE;
} else {
return (EINVAL);
}
}
return (EINVAL);
rv->rcv_action_signal = 0;
rv->rcv_firing_time = 0;
return (0);
}
/*
* Non-global zone version of start_init.
*/
void
zone_start_init(void)
{
/*
* For all purposes (ZONE_ATTR_INITPID and restart_init),
* storing just the pid of init is sufficient.
*/
z->zone_proc_initpid = p->p_pid;
/*
* We maintain zone_boot_err so that we can return the cause of the
* failure back to the caller of the zone_boot syscall.
*/
if (z->zone_boot_err != 0) {
/*
* Make sure we are still in the booting state-- we could have
* raced and already be shutting down, or even further along.
*/
if (zone_status_get(z) == ZONE_IS_BOOTING) {
}
/* It's gone bad, dispose of the process */
mutex_enter(&p->p_lock);
lwp_exit();
}
} else {
if (zone_status_get(z) == ZONE_IS_BOOTING)
/* cause the process to return to userland. */
lwp_rtt();
}
}
struct zsched_arg {
};
/*
* Per-zone "sched" workalike. The similarity to "sched" doesn't have
* anything to do with scheduling, but rather with the fact that
* per-zone kernel threads are parented to zsched, just like regular
* kernel threads are parented to sched (p0).
*
* zsched is also responsible for launching init for the zone.
*/
static void
{
kproject_t *pj;
/*
* We are this zone's "zsched" process. As the zone isn't generally
* visible yet we don't need to grab any locks before initializing its
* zone_proc pointer.
*/
/*
* Disassociate process from its 'parent'; parent ourselves to init
* (pid 1) and change other values as needed.
*/
sess_create();
/* Decrement what newproc() incremented. */
/*
* Our credentials are about to become kcred-like, so we don't care
* about the caller's ruid.
*/
/*
* getting out of global zone, so decrement lwp counts
*/
/*
* Decrement locked memory counts on old zone and project.
*/
/*
* Create and join a new task in project '0' of this zone.
*
* We don't need to call holdlwps() since we know we're the only lwp in
* this process.
*
* task_join() returns with p_lock held.
*/
/*
* add lwp counts to zsched's zone, and increment project's task count
* due to the task created in the above tasksys_settaskid
*/
/*
* The process was created by a process in the global zone, hence the
* credentials are wrong. We might as well have kcred-ish credentials.
*/
/*
* Hold credentials again (for thread)
*/
/*
* p_lwpcnt can't change since this is a kernel process.
*/
/*
* Chroot
*/
/*
* Initialize zone's rctl set.
*/
set = rctl_set_create();
e.rcep_t = RCENTITY_ZONE;
/*
* Apply the rctls passed in to zone_create(). This is basically a list
* assignment: all of the old values are removed and the new ones
* inserted. That is, if an empty list is passed in, all values are
* removed.
*/
char *name;
int error; /* For ASSERT()s */
for (; /* ever */; ) {
break;
}
for (i = 0; i < nelem; i++) {
/*
* rctl_local_insert can fail if the value being
* inserted is a duplicate; this is OK.
*/
}
}
/*
* Tell the world that we're done setting up.
*
* At this point we want to set the zone status to ZONE_IS_READY
* and atomically set the zone's processor set visibility. Once
* we drop pool_lock() this zone will automatically get updated
* to reflect any future changes to the pools configuration.
*/
pool_lock();
if (pool_state == POOL_ENABLED)
pool_unlock();
/*
* Once we see the zone transition to the ZONE_IS_BOOTING state,
* we launch init, and set the state to running.
*/
/*
* Ok, this is a little complicated. We need to grab the
* zone's pool's scheduling class ID; note that by now, we
* are already bound to a pool if we need to be (zoneadmd
* will have done that to us while we're in the READY
* state). *But* the scheduling class for the zone's 'init'
* must be explicitly passed to newproc, which doesn't
* respect pool bindings.
*
* We hold the pool_lock across the call to newproc() to
* close the obvious race: the pool's scheduling class
* could change before we manage to create the LWP with
* classid 'cid'.
*/
pool_lock();
if (zone->zone_defaultcid > 0)
else
if (cid == -1)
cid = defaultcid;
/*
* If this fails, zone_boot will ultimately fail. The
* state of the zone will be set to SHUTTING_DOWN-- userland
* will have to tear down the zone, and fail, or try again.
*/
}
pool_unlock();
}
/*
* Wait for zone_destroy() to be called. This is what we spend
* most of our life doing.
*/
if (ct)
/*
* At this point the process contract should be empty.
* (Though if it isn't, it's not the end of the world.)
*/
/*
* Allow kcred to be freed when all referring processes
* (including this one) go away. We can't just do this in
* zone_free because we need to wait for the zone_cred_ref to
* drop to 0 before calling zone_free, and the existence of
* zone_kcred will prevent that. Thus, we call crfree here to
* balance the crdup in zone_create. The crhold calls earlier
* in zsched will be dropped when the thread and process exit.
*/
exit(CLD_EXITED, 0);
}
/*
* Helper function to determine if there are any submounts of the
* provided path. Used to make sure the zone doesn't "inherit" any
* mounts from before it is created.
*/
static uint_t
zone_mount_count(const char *rootpath)
{
/*
* Holding zonehash_lock prevents race conditions with
* vfs_list_add()/vfs_list_remove() since we serialize with
* zone_find_by_path().
*/
/*
* The rootpath must end with a '/'
*/
/*
* This intentionally does not count the rootpath itself if that
* happens to be a mount point.
*/
do {
rootpathlen) == 0)
count++;
return (count);
}
/*
* Helper function to make sure that a zone created on 'rootpath'
* wouldn't end up containing other zones' rootpaths.
*/
static boolean_t
zone_is_nested(const char *rootpath)
{
if (zone == global_zone)
continue;
return (B_TRUE);
}
return (B_FALSE);
}
static int
{
if (zone_privssz < sizeof (priv_set_t))
return (EFAULT);
}
return (0);
}
/*
* We make creative use of nvlists to pass in rctls from userland. The list is
* a list of the following structures:
*
* (name = rctl_name, value = nvpair_list_array)
*
* Where each element of the nvpair_list_array is of the form:
*
* [(name = "privilege", value = RCPRIV_PRIVILEGED),
* (name = "limit", value = uint64_t),
* (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
*/
static int
{
char *kbuf;
int error;
if (buflen == 0)
return (0);
return (ENOMEM);
goto out;
}
/*
* non-NULL, so we reset it here.
*/
goto out;
}
char *name;
goto out;
}
goto out;
}
for (i = 0; i < nelem; i++) {
goto out;
}
goto out;
}
}
error = 0;
out:
return (error);
}
int
}
}
}
static int
{
/* Get label from user */
return (EFAULT);
return (ENOMEM);
return (0);
}
/*
* Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
*/
static int
{
char *kbuf;
return (0);
return (ENOMEM);
return (EFAULT);
}
for (;;) {
else
break;
}
return (0);
}
/*
* System call to create/initialize a new zone named 'zone_name', rooted
* at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
* and initialized with the zone-wide rctls described in 'rctlbuf', and
* with labeling set by 'match', 'doi', and 'label'.
*
* If extended error is non-null, we may use it to return more detailed
* error information.
*/
static zoneid_t
int flags)
{
struct zsched_arg zarg;
int error;
int error2 = 0;
char *str;
if (secpolicy_zone_config(CRED()) != 0)
/* can't boot zone from within chroot environment */
zone->zone_ncpus = 0;
zone->zone_ncpus_online = 0;
if (flags & ZCF_NET_EXCL) {
}
}
}
}
/* initialize node name to be the same as zone name */
zone->zone_shmmax = 0;
zone->zone_nlwps = 0;
zone->zone_locked_mem = 0;
zone->zone_max_swap = 0;
/*
* Zsched initializes the rctls.
*/
}
}
/*
* Read in the trusted system parameters:
* match flag and sensitivity label.
*/
/* Fail if requested to set doi to anything but system's doi */
}
/* Always apply system's doi to the zone */
if (error != 0) {
}
} else {
/* all zones get an admin_low label if system is not labeled */
}
/*
* Stop all lwps since that's what normally happens as part of fork().
* This needs to happen before we grab any locks to avoid deadlock
* (another lwp in the process could be waiting for the held lock).
*/
if (rctls)
}
if (block_mounts() == 0) {
if (rctls)
}
/*
* Set up credential for kernel access. After this, any errors
* should go through the dance in errout rather than calling
* zone_free directly.
*/
/*
* Make sure zone doesn't already exist.
*
* If the system and zone are labeled,
* make sure no other zone exists that has the same label.
*/
else
goto errout;
}
/*
* Don't allow zone creations which would cause one zone's rootpath to
* be accessible from that of another (non-global) zone.
*/
goto errout;
}
goto errout;
}
goto errout;
}
/*
* Zone is still incomplete, but we need to drop all locks while
* zsched() initializes this zone's kernel process. We
* optimistically add the zone to the hashtable and associated
* lists so a parallel zone_create() doesn't try to create the
* same zone.
*/
zonecount++;
(void) mod_hash_insert(zonehashbyid,
if (insert_label_hash) {
(void) mod_hash_insert(zonehashbylabel,
}
/*
* Insert into active list. At this point there are no 'hold's
* on the zone, but everyone else knows not to use it, so we can
* continue to use it. zsched() will do a zone_hold() if the
* newproc() is successful.
*/
/*
* The process, task, and project rctls are probably wrong;
* we need an interface to get the default values of all rctls,
* and initialize zsched appropriately. I'm not sure that that
* makes much of a difference, though.
*/
/*
* We need to undo all globally visible state.
*/
(void) mod_hash_destroy(zonehashbylabel,
}
(void) mod_hash_destroy(zonehashbyname,
(void) mod_hash_destroy(zonehashbyid,
zonecount--;
goto errout;
}
/*
* Zone creation can't fail from now on.
*/
/*
* Create zone kstats
*/
/*
* Let the other lwps continue.
*/
/*
* Wait for zsched to finish initializing the zone.
*/
/*
* The zone is fully visible, so we can let mounts progress.
*/
if (rctls)
return (zoneid);
/*
* Let the other lwps continue.
*/
if (rctls)
/*
* There is currently one reference to the zone, a cred_ref from
* zone_kcred. To free the zone, we call crfree, which will call
* zone_cred_rele, which will call zone_free.
*/
}
/*
* Cause the zone to boot. This is pretty simple, since we let zoneadmd do
* the heavy lifting. initname is the path to the program to launch
* at the "top" of the zone; if this is NULL, we use the system default,
* which is stored at zone_default_initname.
*/
static int
{
int err;
if (secpolicy_zone_config(CRED()) != 0)
/*
* Look for zone under hash lock to prevent races with calls to
* zone_shutdown, zone_destroy, etc.
*/
}
}
}
/*
* Boot (starting init) might have failed, in which case the zone
* will go to the SHUTTING_DOWN state; an appropriate errno will
* be placed in zone->zone_boot_err, and so we return that.
*/
}
/*
* Kills all user processes in the zone, waiting for them all to exit
* before returning.
*/
static int
{
int waitstatus;
/*
* We need to drop zonehash_lock before killing all
* processes, otherwise we'll deadlock with zone_find_*
* which can be called from the exit path.
*/
ZONE_IS_EMPTY)) == -1) {
}
/*
* return EINTR if we were signaled
*/
if (waitstatus == 0)
return (EINTR);
return (0);
}
/*
* This function implements the policy for zone visibility.
*
* In standard Solaris, a non-global zone can only see itself.
*
* In Trusted Extensions, a labeled zone can lookup any zone whose label
* it dominates. For this test, the label of the global zone is treated as
* admin_high so it is special-cased instead of being checked for dominance.
*
* Returns true if zone attributes are viewable, false otherwise.
*/
static boolean_t
{
return (B_TRUE);
return (B_TRUE);
} else {
return (B_FALSE);
}
} else {
return (B_FALSE);
}
}
/*
* Systemcall to start the zone's halt sequence. By the time this
* function successfully returns, all user processes and kernel threads
* executing in it will have exited, ZSD shutdown callbacks executed,
* and the zone status set to ZONE_IS_DOWN.
*
* It is possible that the call will interrupt itself if the caller is the
* parent of any process running in the zone, and doesn't have SIGCHLD blocked.
*/
static int
{
int error;
if (secpolicy_zone_config(CRED()) != 0)
/*
* Block mounts so that VFS_MOUNT() can get an accurate view of
* the zone's status with regard to ZONE_IS_SHUTTING_DOWN.
*
* e.g. NFS can fail the mount if it determines that the zone
* has already begun the shutdown sequence.
*/
if (block_mounts() == 0)
/*
* Look for zone under hash lock to prevent races with other
* calls to zone_shutdown and zone_destroy.
*/
}
/*
* Fail if the zone isn't fully initialized yet.
*/
if (status < ZONE_IS_READY) {
}
/*
* If conditions required for zone_shutdown() to return have been met,
* return success.
*/
if (status >= ZONE_IS_DOWN) {
return (0);
}
/*
* If zone_shutdown() hasn't been called before, go through the motions.
* If it has, there's nothing to do but wait for the kernel threads to
* drain.
*/
if (status < ZONE_IS_EMPTY) {
/*
* There's still stuff running.
*/
}
if (ntasks == 1) {
/*
* The only way to create another task is through
* zone_enter(), which will block until we drop
* zonehash_lock. The zone is empty.
*/
/*
* Skip ahead to ZONE_IS_DOWN
*/
} else {
}
}
}
}
/*
* After the zone status goes to ZONE_IS_DOWN this zone will no
* longer be notified of changes to the pools configuration, so
* in order to not end up with a stale pool pointer, we point
* ourselves at the default pool and remove all resource
* visibility. This is especially important as the zone_t may
* languish on the deathrow for a very long time waiting for
* cred's to drain out.
*
* This rebinding of the zone can happen multiple times
* (presumably due to interrupted or parallel systemcalls)
* without any adverse effects.
*/
if (pool_lock_intr() != 0) {
}
if (pool_state == POOL_ENABLED) {
/*
* The zone no longer needs to be able to see any cpus.
*/
}
pool_unlock();
/*
* ZSD shutdown callbacks can be executed multiple times, hence
* it is safe to not be holding any locks across this call.
*/
/*
* Wait for kernel threads to drain.
*/
}
/*
* The zone can become down/destroyable even if the above wait
* returns EINTR, so any code added here may never execute.
* (i.e. don't add code here)
*/
return (0);
}
/*
* Systemcall entry point to finalize the zone halt process. The caller
* must have already successfully called zone_shutdown().
*
* Upon successful completion, the zone will have been fully destroyed:
* zsched will have exited, destructor callbacks executed, and the zone
* removed from the list of active zones.
*/
static int
{
if (secpolicy_zone_config(CRED()) != 0)
/*
* Look for zone under hash lock to prevent races with other
* calls to zone_destroy.
*/
}
}
if (status < ZONE_IS_DOWN) {
} else if (status == ZONE_IS_DOWN) {
}
/*
* wait for zsched to exit
*/
for (; /* ever */; ) {
/*
* The zone has gone away. Necessary conditions
* are met, so we return success.
*/
return (0);
}
if (unref) {
/*
* There is only one reference to the zone -- that
* added when the zone was added to the hashtables --
* and things will remain this way until we drop
* zonehash_lock... we can go ahead and cleanup the
* zone.
*/
break;
}
/* Signaled */
}
}
/*
* Remove CPU cap for this zone now since we're not going to
* fail below this point.
*/
/* Get rid of the zone's kstats */
/* Say goodbye to brand framework. */
/*
* It is now safe to let the zone be recreated; remove it from the
* lists. The memory will not be freed until the last cred
* reference goes away.
*/
zonecount--;
/* remove from active list and hash tables */
(void) mod_hash_destroy(zonehashbyname,
(void) mod_hash_destroy(zonehashbyid,
(void) mod_hash_destroy(zonehashbylabel,
/*
* Release the root vnode; we're not using it anymore. Nor should any
* other thread that might access it exist.
*/
}
/* add to deathrow list */
/*
* Drop last reference (which was added by zsched()), this will
* free the zone unless there are outstanding cred references.
*/
return (0);
}
/*
* Systemcall entry point for zone_getattr(2).
*/
static ssize_t
{
char *zonepath;
char *outstr;
}
if (zone_status < ZONE_IS_READY) {
}
/*
* If not in the global zone, don't show information about other zones,
* unless the system is labeled and the local zone's label dominates
* the other zone.
*/
if (!zone_list_access(zone)) {
}
switch (attr) {
case ZONE_ATTR_ROOT:
if (global) {
/*
* Copy the path to trim the trailing "/" (except for
* the global zone).
*/
if (zone != global_zone)
else
} else {
if (inzone || !is_system_labeled()) {
/*
* Caller is not in the global zone.
* if the query is on the current zone
* or the system is not labeled,
* just return faked-up path for current zone.
*/
zonepath = "/";
size = 2;
} else {
/*
* Return related path for current zone.
*/
}
}
}
break;
case ZONE_ATTR_NAME:
}
break;
case ZONE_ATTR_STATUS:
/*
* Since we're not holding zonehash_lock, the zone status
* may be anything; leave it up to userland to sort it out.
*/
size = sizeof (zone_status);
break;
case ZONE_ATTR_FLAGS:
break;
case ZONE_ATTR_PRIVSET:
size = sizeof (priv_set_t);
break;
case ZONE_ATTR_UNIQID:
break;
case ZONE_ATTR_POOLID:
{
if (pool_lock_intr() != 0) {
break;
}
pool_unlock();
}
break;
case ZONE_ATTR_SLBL:
bufsize) != 0)
break;
case ZONE_ATTR_INITPID:
if (initpid == -1) {
break;
}
break;
case ZONE_ATTR_BRAND:
}
break;
case ZONE_ATTR_INITNAME:
NULL);
}
break;
case ZONE_ATTR_BOOTARGS:
outstr = "";
else
}
break;
case ZONE_ATTR_PHYS_MCAP:
break;
case ZONE_ATTR_SCHED_CLASS:
outstr = "";
else
}
break;
default:
} else {
}
}
if (error)
}
/*
* Systemcall entry point for zone_setattr(2).
*/
/*ARGSUSED*/
static int
{
int err;
if (secpolicy_zone_config(CRED()) != 0)
/*
* Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
* global zone.
*/
}
}
/*
* At present most attributes can only be set on non-running,
* non-global zones.
*/
goto done;
switch (attr) {
case ZONE_ATTR_INITNAME:
break;
case ZONE_ATTR_BOOTARGS:
break;
case ZONE_ATTR_BRAND:
break;
case ZONE_ATTR_PHYS_MCAP:
break;
case ZONE_ATTR_SCHED_CLASS:
break;
default:
else
}
done:
}
/*
* Return zero if the process has at least one vnode mapped in to its
* address space which shouldn't be allowed to change zones.
*
* Also return zero if the process has any shared mappings which reserve
* swap. This is because the counting for zone.max-swap does not allow swap
* reservation to be shared between zones. Zone swap reservation is counted
* on zone->zone_max_swap.
*/
static int
as_can_change_zones(void)
{
int allow = 1;
/*
* Cannot enter zone with shared anon memory which
* reserves swap. See comment above.
*/
allow = 0;
break;
}
/*
* if we can't get a backing vnode for this segment then skip
* it.
*/
continue;
allow = 0;
break;
}
}
return (allow);
}
/*
* Count swap reserved by curproc's address space
*/
static size_t
as_swresv(void)
{
return (swap);
}
/*
* Systemcall entry point for zone_enter().
*
* The current process is injected into said zone. In the process
* it will change its project membership, privileges, rootdir/cwd,
* zone-wide rctls, and pool association to match those of the zone.
*
* The first zone_enter() called while the zone is in the ZONE_IS_READY
* state will transition it to ZONE_IS_RUNNING. Processes may only
* enter a zone that is "ready" or "running".
*/
static int
{
contract_t *ct;
int err = 0;
kthread_id_t t;
if (secpolicy_zone_config(CRED()) != 0)
/*
* Stop all lwps so we don't need to hold a lock to look at
* curproc->p_zone. This needs to happen before we grab any
* locks to avoid deadlock (another lwp in the process could
* be waiting for the held lock).
*/
/*
* Make sure we're not changing zones with files open or mapped in
* to our address space which shouldn't be changing zones.
*/
if (!files_can_change_zones()) {
goto out;
}
if (!as_can_change_zones()) {
goto out;
}
goto out;
}
goto out;
}
/*
* To prevent processes in a zone from holding contracts on
* extrazonal resources, and to avoid process contract
* memberships which span zones, contract holders and processes
* which aren't the sole members of their encapsulating process
* contracts are not allowed to zone_enter.
*/
goto out;
}
/*
* Moreover, we don't allow processes whose encapsulating
* process contracts have inherited extrazonal contracts.
* While it would be easier to eliminate all process contracts
* with inherited contracts, we need to be able to give a
* restarted init (or other zone-penetrating process) its
* predecessor's contracts.
*/
if (ctp->conp_ninherited != 0) {
goto out;
}
}
}
/*
* Can't join
*/
goto out;
}
/*
* Make sure new priv set is within the permitted set for caller
*/
goto out;
}
/*
* We want to momentarily drop zonehash_lock while we optimistically
* bind curproc to the pool it should be running in. This is safe
* since the zone can't disappear (we have a hold on it).
*/
/*
* Grab pool_lock to keep the pools configuration from changing
* and to stop ourselves from getting rebound to another pool
* until we join the zone.
*/
if (pool_lock_intr() != 0) {
goto out;
}
/*
* Bind ourselves to the pool currently associated with the zone.
*/
POOL_BIND_ALL)) != 0) {
pool_unlock();
goto out;
}
/*
* Grab cpu_lock now; we'll need it later when we call
* task_join().
*/
/*
* Make sure the zone hasn't moved on since we dropped zonehash_lock.
*/
/*
* Can't join anymore.
*/
if (pool_state == POOL_ENABLED &&
pool_unlock();
goto out;
}
/*
* a_lock must be held while transferring locked memory and swap
* reservation from the global zone to the non global zone because
* asynchronous faults on the processes' address space can lock
* memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
* segments respectively.
*/
/* verify that we do not exceed any task or lwp limits */
/* add new lwps to zone and zone's proj0 */
/* add 1 task to zone's proj0 */
/* remove lwps from proc's old zone and old project */
/*
* Joining the zone cannot fail from now on.
*
* This means that a lot of the following code can be commonized and
* shared with zsched().
*/
/*
* Reset the encapsulating process contract's zone.
*/
/*
* Create a new task and associate the process with the project keyed
* by (projid,zoneid).
*
* We might as well be in project 0; the global zone's projid doesn't
* make much sense in a zone anyhow.
*
* This also increments zone_ntasks, and returns with p_lock held.
*/
/*
* call RCTLOP_SET functions on this proc
*/
e.rcep_t = RCENTITY_ZONE;
/*
* We don't need to hold any of zsched's locks here; not only do we know
* the process and zone aren't going away, we know its session isn't
* changing either.
*
* By joining zsched's session here, we mimic the behavior in the
* global zone of init's sid being the pid of sched. We extend this
* to all zlogin-like zone_enter()'ing processes as well.
*/
/*
* If any threads are scheduled to be placed on zone wait queue they
* should abandon the idea since the wait queue is changing.
* We need to be holding pidlock & p_lock to do this.
*/
do {
thread_lock(t);
/*
* Kick this thread so that he doesn't sit
* on a wrong wait queue.
*/
if (ISWAITING(t))
setrun_locked(t);
if (t->t_schedflag & TS_ANYWAITQ)
t->t_schedflag &= ~ TS_ANYWAITQ;
thread_unlock(t);
}
/*
* If there is a default scheduling class for the zone and it is not
* the class we are currently in, change all of the threads in the
* process to the new class. We need to be holding pidlock & p_lock
* when we call parmsset so this is a good place to do it.
*/
if (zone->zone_defaultcid > 0 &&
pcparms.pc_clparms[0] = 0;
/*
* If setting the class fails, we still want to enter the zone.
*/
do {
}
}
/*
* We're firmly in the zone; let pools progress.
*/
pool_unlock();
/*
* We don't need to retain a hold on the zone since we already
* incremented zone_ntasks, so the zone isn't going anywhere.
*/
/*
* Chroot
*/
/*
* Change process credentials
*/
/*
* Restrict all process privilege sets to zone limit
*/
/*
* Adjust upcount to reflect zone entry.
*/
/*
* Set up core file path and content.
*/
out:
/*
* Let the other lwps continue.
*/
}
/*
* Systemcall entry point for zone_list(2).
*
* Processes running in a (non-global) zone only see themselves.
* On labeled systems, they see all zones whose label they dominate.
*/
static int
{
int error;
if (myzone != global_zone) {
if (!is_system_labeled()) {
/* just return current zone */
} else {
/* return all zones that are dominated */
domi_nzones = 0;
if (real_nzones > 0) {
continue;
continue;
/*
* Note that a label always dominates
* itself, so myzone is always included
* in the list.
*/
if (bldominates(mybslab,
zoneids[domi_nzones++] =
}
}
}
}
} else {
domi_nzones = 0;
if (real_nzones > 0) {
KM_SLEEP);
}
}
/*
* If user has allocated space for fewer entries than we found, then
* return only up to his limit. Either way, tell him exactly how many
* we found.
*/
if (domi_nzones < user_nzones)
error = 0;
user_nzones * sizeof (zoneid_t)) != 0)
}
if (real_nzones > 0)
if (error != 0)
else
return (0);
}
/*
* Systemcall entry point for zone_lookup(2).
*
* Non-global zones are only able to see themselves and (on labeled systems)
* the zones they dominate.
*/
static zoneid_t
zone_lookup(const char *zone_name)
{
char *kname;
int err;
/* return caller's zone id */
return (getzoneid());
}
}
/*
* In a non-global zone, can only lookup global and own name.
* In Trusted Extensions zone label dominance rules apply.
*/
!zone_list_access(zone)) {
} else {
return (zoneid);
}
}
static int
zone_version(int *version_arg)
{
int version = ZONE_SYSCALL_API_VERSION;
return (0);
}
/* ARGSUSED */
long
{
switch (cmd) {
case ZONE_CREATE:
if (get_udatamodel() == DATAMODEL_NATIVE) {
}
} else {
#ifdef _SYSCALL32_IMPL
}
zs.zone_privs =
(const struct priv_set *)
(unsigned long)zs32.zone_privs;
(int *)(unsigned long)zs32.extended_error;
#else
panic("get_udatamodel() returned bogus result\n");
#endif
}
case ZONE_BOOT:
case ZONE_DESTROY:
case ZONE_GETATTR:
case ZONE_SETATTR:
case ZONE_ENTER:
case ZONE_LIST:
case ZONE_SHUTDOWN:
case ZONE_LOOKUP:
return (zone_lookup((const char *)arg1));
case ZONE_VERSION:
return (zone_version((int *)arg1));
case ZONE_ADD_DATALINK:
(char *)arg2));
case ZONE_DEL_DATALINK:
(char *)arg2));
case ZONE_CHECK_DATALINK:
case ZONE_LIST_DATALINK:
default:
}
}
struct zarg {
};
static int
{
char *buf;
int error;
return (error);
}
static void
{
}
static void
{
char *zone_name;
int error;
int retry;
/*
* zoneadmd may be down, but at least we can empty out the zone.
* We can ignore the return value of zone_empty() since we're called
* from a kernel thread and know we won't be delivered any signals.
*/
(void) zone_empty(zone);
/*
* Since we're not holding a reference to the zone, any number of
* things can go wrong, including the zone disappearing before we get a
* chance to talk to zoneadmd.
*/
goto next;
}
break;
}
switch (error) {
case EINTR:
/* FALLTHROUGH */
case EAGAIN: /* process may be forking */
/*
* Back off for a bit
*/
break;
case EBADF:
/*
* zoneadmd may be dead, but it may come back to
* life later.
*/
break;
}
break;
default:
"zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
error);
goto out;
}
next:
/*
* If this isn't the same zone_t that we originally had in mind,
* then this is the same as if two kadmin requests come in at
* the same time: the first one wins. This means we lose, so we
* bail.
*/
/*
* Problem is solved.
*/
break;
}
/*
* zoneid recycled
*/
break;
}
/*
* We could zone_status_timedwait(), but there doesn't seem to
* be much point in doing that (plus, it would mean that
* zone_free() isn't called until this thread exits).
*/
}
out:
}
thread_exit();
}
/*
* Entry point for uadmin() to tell the zone to go away or reboot. Analog to
* kadmin(). The caller is a process in the zone.
*
* In order to shutdown the zone, we will hand off control to zoneadmd
* (running in the global zone) via a door. We do a half-hearted job at
* killing all processes in the zone, create a kernel thread to contact
* zoneadmd, and make note of the "uniqid" of the zone. The uniqid is
* a form of generation number used to let zoneadmd (as well as
* zone_destroy()) know exactly which zone they're talking about.
*/
int
{
switch (cmd) {
case A_SHUTDOWN:
switch (fcn) {
case AD_HALT:
case AD_POWEROFF:
break;
case AD_BOOT:
break;
case AD_IBOOT:
case AD_SBOOT:
case AD_SIBOOT:
case AD_NOSYNC:
return (ENOTSUP);
default:
return (EINVAL);
}
break;
case A_REBOOT:
break;
case A_FTRACE:
case A_REMOUNT:
case A_FREEZE:
case A_DUMP:
return (ENOTSUP);
default:
return (EINVAL);
}
return (EPERM);
/*
* zone_status can't be ZONE_IS_EMPTY or higher since curproc
* is in the zone.
*/
/*
* This zone is already on its way down.
*/
return (0);
}
/*
* Prevent future zone_enter()s
*/
/*
* Kill everyone now and call zoneadmd later.
* zone_ki_call_zoneadmd() will do a more thorough job of this
* later.
*/
/*
* Now, create the thread to contact zoneadmd and do the rest of the
* work. This thread can't be created in our zone otherwise
* zone_destroy() would deadlock.
*/
/* mdep was already copied in for us by uadmin */
exit(CLD_EXITED, 0);
return (EINVAL);
}
/*
* Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
* status to ZONE_IS_SHUTTING_DOWN.
*/
void
zone_shutdown_global(void)
{
}
/*
* Returns true if the named dataset is visible in the current zone.
* The 'write' parameter is set to 1 if the dataset is also writable.
*/
int
{
if (dataset[0] == '\0')
return (0);
/*
* Walk the list once, looking for datasets which match exactly, or
* specify a dataset underneath an exported dataset. If found, return
* true and note that it is writable.
*/
if (write)
*write = 1;
return (1);
}
}
/*
* Walk the list a second time, searching for datasets which are parents
* of exported datasets. These should be visible, but read-only.
*
* Note that we also have to support forms such as 'pool/dataset/', with
* a trailing slash.
*/
len--; /* Ignore trailing slash */
if (write)
*write = 0;
return (1);
}
}
return (0);
}
/*
* zone_find_by_any_path() -
*
* kernel-private routine similar to zone_find_by_path(), but which
* effectively compares against zone paths rather than zonerootpath
* (i.e., the last component of zonerootpaths, which should be "root/",
* are not compared.) This is done in order to accurately identify all
* paths, whether zone-visible or not, including those which are parallel
* to /root/, such as /dev/, /home/, etc...
*
* If the specified path does not fall under any zone path then global
* zone is returned.
*
* The treat_abs parameter indicates whether the path should be treated as
* an absolute path although it does not begin with "/". (This supports
* nfs mount syntax such as host:any/path.)
*
* The caller is responsible for zone_rele of the returned zone.
*/
zone_t *
{
int path_offset = 0;
return (global_zone);
}
if (*path != '/') {
path_offset = 1;
}
char *c;
char *rootpath_start;
continue;
/* scan backwards to find start of last component */
do {
c--;
} while (*c != '/');
break;
}
zone = global_zone;
return (zone);
}
/*
* List of data link names which are accessible from the zone.
* Entries are chained through dlnl_next.
*
* NOTE(review): no member holding the name itself is visible in this
* text - confirm against the complete source.
*/
struct dlnamelist {
struct dlnamelist *dlnl_next; /* next entry in the list */
};
/*
* Check whether the datalink name (dlname) itself is present.
* Returns the value of 'found', i.e. true if the name was found.
*
* NOTE(review): the function name, its parameter list, the declaration of
* 'found' and the loop being broken out of are not present in this text.
*/
static boolean_t
{
struct dlnamelist *dlnl;
break;
}
}
return (found);
}
/*
* Add a data link name for the zone. Does not check for duplicates
* within the same zone (see the comment in the body); the visible
* success path returns 0.
*
* NOTE(review): the function name, its parameter list and most statements
* are not present in this text.
*/
static int
{
struct dlnamelist *dlnl;
int err;
}
}
/*
* Verify that the datalink name isn't already used by a different
* zone while allowing duplicate entries for the same zone (e.g. due
* to both using IPv4 and IPv6 on an interface).
*/
continue;
}
}
return (0);
}
/*
* NOTE(review): this fragment has no header comment, function name or
* parameter list in this text. What remains looks for a match using what
* is presumably a comparison bounded by LIFNAMSIZ, jumps to a 'found'
* label (not visible here) on a match, and returns 0 on the visible
* fall-through path. Confirm against the complete source.
*/
static int
{
int err;
}
}
/* Look for match */
LIFNAMSIZ) == 0)
goto found;
}
return (0);
}
/*
* When ALL_ZONES is passed via zoneidp, look up which zone is using the
* datalink name (dlname); otherwise just check whether the zone specified
* via zoneidp has access to the datalink name.
*
* NOTE(review): the function name, its parameter list, the return
* statements and most of the body are not present in this text.
*/
static int
{
char *dln;
int err = 0;
}
}
/*
* Check whether datalink name is already used.
*/
continue;
if (allzones)
sizeof (*zoneidp));
}
}
/* datalink name is not found in any active zone. */
}
/*
* Get the names of the datalinks assigned to a zone.
* Here *nump is the number of datalinks, and the assumption
* is that the caller will guarantee that the supplied buffer is
* big enough to hold at least *nump datalink names, that is,
* LIFNAMSIZ x *nump.
* On return, *nump will be the "new" number of datalinks, if it
* ever changed.
*
* NOTE(review): the function name, its parameter list and most statements
* are not present in this text; the visible final path returns 0.
*/
static int
{
struct dlnamelist *dlnl;
char *ptr;
}
num = 0;
/*
* If the list changed and the new number is bigger
* than what the caller supplied, just count, don't
* do copyout.
*/
continue;
}
}
/* Increased or decreased, caller should be notified. */
}
}
return (0);
}
/*
* Public interface for looking up a zone by zoneid, tailored to
* netstack_zone_create(). Compared with the ordinary lookup it:
*   - does not take zonehash_lock, because it is reached from
*     zone_key_create() or zone_zsd_configure() with the lock already held;
*   - does not look at the zone's status;
*   - may be called before zone_init(); in that case the address of zone0
*     is handed back directly, and netstack_zone_create() only assigns
*     zone0.zone_netstack, which breaks nothing.
*/
zone_t *
{
	/* Hash table exists: do the real by-id lookup. */
	if (zonehashbyid != NULL)
		return (zone_find_all_by_id(zoneid));

	/* No id hash table yet: return zone0 directly. */
	return (&zone0);
}