dsl_dir.c revision 03bad06fbb261fd4a7151a70dfeff2f5041cce1f
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013 Martin Matuska. All rights reserved.
* Copyright (c) 2014 Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_deleg.h>
#include <sys/dmu_impl.h>
#include <sys/metaslab.h>
#include <sys/zfeature.h>
#include <sys/zfs_znode.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
/*
* Filesystem and Snapshot Limits
* ------------------------------
*
* These limits are used to restrict the number of filesystems and/or snapshots
* that can be created at a given level in the tree or below. A typical
* use-case is with a delegated dataset where the administrator wants to ensure
* that a user within the zone is not creating too many additional filesystems
* or snapshots, even though they're not exceeding their space quota.
*
* The filesystem and snapshot counts are stored as extensible properties. This
* capability is controlled by a feature flag and must be enabled to be used.
* Once enabled, the feature is not active until the first limit is set. At
* that point, future operations to create/destroy filesystems or snapshots
* will validate and update the counts.
*
* Because the count properties will not exist before the feature is active,
* the counts are updated when a limit is first set on an uninitialized
* dsl_dir node in the tree (The filesystem/snapshot count on a node includes
* all of the nested filesystems/snapshots. Thus, a new leaf node has a
* filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
* snapshot count properties on a node indicate uninitialized counts on that
* node.) When first setting a limit on an uninitialized node, the code starts
* at the filesystem with the new limit and descends into all sub-filesystems
* to add the count properties.
*
* In practice this is lightweight since a limit is typically set when the
* filesystem is created and thus has no children. Once valid, changing the
* limit value won't require a re-traversal since the counts are already valid.
* When recursively fixing the counts, if a node with a limit is encountered
* during the descent, the counts are known to be valid and there is no need to
* descend into that filesystem's children. The counts on filesystems above the
* one with the new limit will still be uninitialized, unless a limit is
* eventually set on one of those filesystems. The counts are always recursively
* updated when a limit is set on a dataset, unless there is already a limit.
* When a new limit value is set on a filesystem with an existing limit, it is
* possible for the new limit to be less than the current count at that level
* since a user who can change the limit is also allowed to exceed the limit.
*
* Once the feature is active, then whenever a filesystem or snapshot is
* created, the code recurses up the tree, validating the new count against the
* limit at each initialized level. In practice, most levels will not have a
* limit set. If there is a limit at any initialized level up the tree, the
* check must pass or the creation will fail. Likewise, when a filesystem or
* snapshot is destroyed, the counts are recursively adjusted all the way up
* the initialized nodes in the tree. Renaming a filesystem into a different
* point in the tree will first validate, then update the counts on each branch
* up to the common ancestor. A receive will also validate the counts and then
* update them.
*
* An exception to the above behavior is that the limit is not enforced if the
* user has permission to modify the limit. This is primarily so that
* recursive snapshots in the global zone always work. We want to prevent a
* denial-of-service in which a lower level delegated dataset could max out its
* limit and thus block recursive snapshots from being taken in the global zone.
* Because of this, it is possible for the snapshot count to be over the limit
* and snapshots taken in the global zone could cause a lower level dataset to
* hit or exceed its limit. The administrator taking the global zone recursive
* snapshot should be aware of this side-effect and behave accordingly.
* For consistency, the filesystem limit is also not enforced if the user can
* modify the limit.
*
* The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
* and updated by dsl_fs_ss_count_adjust(). A new limit value is set up in
* dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
* dsl_dir_init_fs_ss_count().
*
* There is a special case when we receive a filesystem that already exists. In
* this case a temporary clone name of %X is created (see dmu_recv_begin). We
* never update the filesystem counts for temporary clones.
*
* Likewise, we do not update the snapshot counts for temporary snapshots,
* such as those created by zfs diff.
*/
/*
* dsl_dir_evict: takes an opaque pointer (dbu) and loops t over
* 0 .. TXG_SIZE-1.
*
* NOTE(review): the name suggests this is the eviction callback for a
* dsl_dir, but the statements that use dbu and the body of the loop are not
* present in this view -- confirm against the full source before relying on
* this description.
*/
static void
dsl_dir_evict(void *dbu)
{
int t;
/* One iteration per index below TXG_SIZE; the loop body is empty here. */
for (t = 0; t < TXG_SIZE; t++) {
}
}
int
{
int err;
if (err != 0)
return (err);
#ifdef ZFS_DEBUG
{
}
#endif
if (err != 0)
goto errout;
if (tail) {
#ifdef ZFS_DEBUG
#endif
} else {
}
if (err != 0)
goto errout;
} else {
}
if (dsl_dir_is_clone(dd)) {
/*
* We can't open the origin dataset, because
* that would require opening this dsl_dir.
* Just look at its phys directly instead.
*/
&origin_bonus);
if (err != 0)
goto errout;
dd->dd_origin_txg =
}
} else {
}
}
/*
* The dsl_dir_t has both open-to-close and instantiate-to-evict
* holds on the spa. We need the open-to-close holds because
* otherwise the spa_refcnt wouldn't change when we open a
* dir which the spa also has open, so we could incorrectly
* think it was OK to unload/export/destroy the pool. We need
* the instantiate-to-evict hold because the dsl_dir_t has a
* pointer to the dd_pool, which has a pointer to the spa_t.
*/
return (0);
return (err);
}
void
{
}
/*
* Remove a reference to the given dsl dir that is being asynchronously
* released. Async releases occur from a taskq performing eviction of
* dsl datasets and dirs. This process is identical to a normal release
* with the exception of using the async API for releasing the reference on
* the spa.
*/
void
{
}
/* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
void
{
} else {
buf[0] = '\0';
}
/*
* recursive mutex so that we can use
* dprintf_dd() with dd_lock held
*/
} else {
}
}
/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
int
{
int result = 0;
/* parent's name + 1 for the "/" */
}
/* see dsl_dir_name */
} else {
}
return (result);
}
static int
{
char *p;
/* This would be a good place to reserve some namespace... */
if (p && (p[1] == '/' || p[1] == '@')) {
/* two separators in a row */
}
/*
* if the first thing is an @ or /, it had better be an
* @ and it had better not have any more ats or slashes,
* and it had better have something after the @.
*/
if (p != NULL &&
return (SET_ERROR(ENAMETOOLONG));
p = NULL;
} else if (p[0] == '/') {
if (p - path >= MAXNAMELEN)
return (SET_ERROR(ENAMETOOLONG));
p++;
} else if (p[0] == '@') {
/*
* if the next separator is an @, there better not be
* any more slashes.
*/
if (p - path >= MAXNAMELEN)
return (SET_ERROR(ENAMETOOLONG));
} else {
panic("invalid p=%p", (void *)p);
}
*nextp = p;
return (0);
}
/*
* Look up the dsl_dir_t for the given name, and possibly return the last
* component which couldn't be found in *tailp. The name must be in the
* specified dsl_pool_t. This thread must hold the dp_config_rwlock for the
* pool. Returns a nonzero error code if the path is bogus, or if tailp==NULL
* and we couldn't parse the whole name. (*tailp)[0] == '@' means that the
* last component is a snapshot.
*/
int
{
char buf[MAXNAMELEN];
int err;
if (err != 0)
return (err);
/* Make sure the name is in the specified pool. */
if (err != 0) {
return (err);
}
if (err != 0)
break;
if (next[0] == '@')
break;
dprintf("looking up %s in obj%lld\n",
if (err != 0) {
err = 0;
break;
}
if (err != 0)
break;
}
if (err != 0) {
return (err);
}
/*
* It's an error if there's more than one component left, or
* tailp==NULL and there's any component left.
*/
/* bad path name */
}
return (err);
}
/*
* If the counts are already initialized for this filesystem and its
* descendants then do nothing, otherwise initialize the counts.
*
* The counts on this filesystem, and those below, may be uninitialized due to
* either the use of a pre-existing pool which did not support the
* filesystem/snapshot limit feature, or one in which the feature had not yet
* been enabled.
*
* Recursively descend the filesystem tree and update the filesystem/snapshot
* counts on each filesystem below, then update the cumulative count on the
* current filesystem. If the filesystem already has a count set on it,
* then we know that its counts, and the counts on the filesystems below it,
* are already correct, so we don't have to update this filesystem.
*/
static void
{
/*
* If the filesystem count has already been initialized then we
* don't need to recurse down any further.
*/
return;
/* Iterate my child dirs */
&chld_dd));
/*
* Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and
* temporary datasets.
*/
continue;
}
my_fs_cnt++; /* count this child */
}
/* Count my snapshots (we counted children's snapshots above) */
zap_cursor_advance(zc)) {
/* Don't count temporary snapshots */
my_ss_cnt++;
}
/* we're in a sync task, update counts */
}
static int
{
int error;
if (error != 0)
return (error);
}
DD_FIELD_FILESYSTEM_COUNT) == 0) {
}
return (0);
}
static void
{
/*
* Since the feature was not active and we're now setting a
* limit, increment the feature-active counter so that the
* feature becomes active for the first time.
*
* We are already in a sync task so we can update the MOS.
*/
}
/*
* Since we are now setting a non-UINT64_MAX limit on the filesystem,
* we need to ensure the counts are correct. Descend down the tree from
* this point and update all of the counts to be accurate.
*/
}
/*
* Make sure the feature is enabled and activate it if necessary.
* Since we're setting a limit, ensure the on-disk counts are valid.
* This is only called by the ioctl path when setting a limit value.
*
* We do not need to validate the new limit, since users who can change the
* limit are also allowed to exceed the limit.
*/
int
dsl_dir_activate_fs_ss_limit(const char *ddname)
{
int error;
/*
* ddname is handed over (cast to void *) together with the
* dsl_dir_actv_fs_ss_limit_sync callback.
* NOTE(review): the start of this call and the condition guarding the
* "error = 0" reset below are not visible in this view -- presumably a
* sync-task dispatch whose benign failure is ignored; confirm against the
* full source.
*/
dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
error = 0;
/* Hand the resulting status back to the caller. */
return (error);
}
/*
* Used to determine if the filesystem_limit or snapshot_limit should be
* enforced. We allow the limit to be exceeded if the user has permission to
* write the property value. We pass in the creds that we got in the open
* context since we will always be the GZ root in syncing context. We also have
* to handle the case where we are allowed to change the limit on the current
* dataset, but there may be another limit in the tree above.
*
* We can never modify these two properties within a non-global zone. In
* addition, the other checks are modeled on zfs_secpolicy_write_perms. We
* can't use that function since we are already holding the dp_config_rwlock.
* In addition, we already have the dd and dealing with snapshots is simplified
* in this code.
*/
typedef enum {
static enforce_res_t
{
#ifdef _KERNEL
return (ENFORCE_ALWAYS);
if (secpolicy_zfs(cr) == 0)
return (ENFORCE_NEVER);
#endif
return (ENFORCE_ALWAYS);
return (ENFORCE_ALWAYS);
/* Only root can access zoned fs's from the GZ */
} else {
}
return (enforce);
}
/*
* Check if adding additional child filesystem(s) would exceed any filesystem
* limits or adding additional snapshot(s) would exceed any snapshot limits.
* The prop argument indicates which limit to check.
*
* Note that all filesystem limits up to the root (or the highest
* initialized) filesystem or the given ancestor must be satisfied.
*/
int
{
char *count_prop;
int err = 0;
/*
* If we're allowed to change the limit, don't enforce the limit
* e.g. this can happen if a snapshot is taken by an administrative
* user in the global zone (i.e. a recursive snapshot by root).
* However, we must handle the case of delegated permissions where we
* are allowed to change the limit on the current dataset, but there
* is another limit in the tree above.
*/
if (enforce == ENFORCE_NEVER)
return (0);
/*
* e.g. if renaming a dataset with no snapshots, count adjustment
* is 0.
*/
if (delta == 0)
return (0);
if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
/*
* We don't enforce the limit for temporary snapshots. This is
* indicated by a NULL cred_t argument.
*/
return (0);
} else {
}
/*
* If an ancestor has been provided, stop checking the limit once we
* hit that dir. We need this during rename so that we don't overcount
* the check once we recurse up to the common ancestor.
*/
return (0);
/*
* If we hit an uninitialized node while recursing up the tree, we can
* stop since we know there is no limit here (or above). The counts are
* not valid on this node and we know we won't touch this node's counts.
*/
return (0);
B_FALSE);
if (err != 0)
return (err);
/* Is there a limit which we've hit? */
return (err);
}
/*
* Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
* parents. When a new filesystem/snapshot is created, increment the count on
* all parents, and when a filesystem/snapshot is destroyed, decrement the
* count.
*/
void
{
int err;
/*
* When we receive an incremental stream into a filesystem that already
* exists, a temporary clone is created. We don't count this temporary
* clone, whose name begins with a '%'. We also ignore hidden ($FREE,
* $MOS & $ORIGIN) objsets.
*/
return;
/*
* e.g. if renaming a dataset with no snapshots, count adjustment is 0
*/
if (delta == 0)
return;
/*
* If we hit an uninitialized node while recursing up the tree, we can
* stop since we know the counts are not valid on this node and we
* know we shouldn't touch this node's counts. An uninitialized count
* on the node indicates that either the feature has not yet been
* activated or there are no limits on this part of the tree.
*/
return;
/* Use a signed verify to make sure we're not neg. */
tx));
/* Roll up this additional count into our ancestors */
}
{
if (pds) {
} else {
/* it's the root dir */
}
if (pds) {
/* update the filesystem counts */
}
return (ddobj);
}
{
}
void
{
}
if (dsl_dir_is_zapified(dd)) {
}
}
}
if (dsl_dir_is_clone(dd)) {
char buf[MAXNAMELEN];
}
}
void
{
/* up the hold count until we can be written out */
}
}
static int64_t
{
return (new_accounted - old_accounted);
}
void
{
/* release the hold from dsl_dir_dirty */
}
static uint64_t
{
int i;
for (i = 0; i < TXG_SIZE; i++) {
}
return (space);
}
/*
* How much space would dd have available if ancestor had delta applied
* to it? If ondiskonly is set, we're only interested in what's
* on-disk, not estimated pending changes.
*/
{
/*
* If there are no restrictions otherwise, assume we have
* unlimited space available.
*/
quota = UINT64_MAX;
}
if (!ondiskonly)
}
/*
* We have some space reserved, in addition to what our
* parent gave us.
*/
}
if (parentspace != UINT64_MAX)
parentspace -= delta;
}
/* over quota */
myspace = 0;
} else {
/*
* the lesser of the space provided by our parent and
* the space left in our quota
*/
}
return (myspace);
}
struct tempreserve {
};
static int
{
struct tempreserve *tr;
int i;
/*
* Check against the dsl_dir's quota. We don't add in the delta
* when checking for over-quota because they get one free hit.
*/
for (i = 0; i < TXG_SIZE; i++)
/*
* On the first iteration, fetch the dataset's used-on-disk and
* refreservation values. Also, if checkrefquota is set, test if
* allocating this space would exceed the dataset's refquota.
*/
int error;
if (error) {
return (error);
}
}
/*
* If this transaction will result in a net free of space,
* we want to let it through.
*/
quota = UINT64_MAX;
else
/*
* Adjust the quota against the actual pool size at the root
* minus any outstanding deferred frees.
* To ensure that it's possible to remove files from a full
* pool without inducing transient overcommits, we throttle
* netfree transactions against a quota that is slightly larger,
* but still within the pool's allocation slop. In cases where
* we're very close to full, this will allow a steady trickle of
* removes to get through.
*/
}
}
/*
* If they are requesting more space, and our current estimate
* is over quota, they get to try again unless the actual
* on-disk is over quota and there are no pending changes (which
* may free up space for us).
*/
"quota=%lluK tr=%lluK err=%d\n",
}
/* We need to up our estimated delta before dropping dd_lock */
/* see if it's OK with our parent */
} else {
return (0);
}
}
/*
* Reserve space in this dsl_dir, to be used in this tx's txg.
* After the space has been dirtied (and dsl_dir_willuse_space()
* has been called), the reservation should be canceled, using
* dsl_dir_tempreserve_clear().
*/
int
{
int err;
if (asize == 0) {
*tr_cookiep = NULL;
return (0);
}
if (err == 0) {
struct tempreserve *tr;
} else {
/*
* If arc_memory_throttle() detected that pageout
* is running and we are low on memory, we delay new
* non-pageout transactions to give pageout an
* advantage.
*
* It is unfortunate to be delaying while the caller's
* locks are held.
*/
}
}
if (err == 0) {
}
if (err != 0)
else
*tr_cookiep = tr_list;
return (err);
}
/*
* Clear a temporary reservation that we previously made with
* dsl_dir_tempreserve_space().
*/
void
{
struct tempreserve *tr;
return;
} else {
}
}
}
/*
* This should be called from open context when we think we're going to write
* or free space, for example when dirtying data. Be conservative; it's okay
* to write less space or free more, but we don't want to write more or free
* less than the amount specified.
*/
void
{
if (space > 0)
/* Make sure that we clean up dd_space_to* */
/* XXX this is potentially expensive and unnecessary... */
}
void
{
/*
* dsl_dataset_set_refreservation_sync_impl() calls this with
* dd_lock held, so that it can atomically update
* ds->ds_reserved and the dsl_dir accounting, so that
* dsl_dataset_check_quota() can see dataset and dir accounting
* consistently.
*/
if (needlock)
ASSERT(compressed >= 0 ||
ASSERT(uncompressed >= 0 ||
#ifdef DEBUG
dd_used_t t;
uint64_t u = 0;
for (t = 0; t < DD_USED_NUM; t++)
#endif
}
if (needlock)
}
}
void
{
if (delta == 0 ||
return;
}
typedef struct dsl_dir_set_qr_arg {
const char *ddsqra_name;
static int
{
int error;
if (error != 0)
return (error);
if (error != 0) {
return (error);
}
if (newval == 0) {
return (0);
}
/*
* If we are doing the preliminary check in open context, and
* there are pending changes, then don't fail it, since the
* pending changes could under-estimate the amount of space to be
* freed up.
*/
}
return (error);
}
static void
{
} else {
}
}
int
{
}
int
{
int error;
if (error != 0)
return (error);
/*
* If we are doing the preliminary check in open context, the
* space estimates may be inaccurate.
*/
if (!dmu_tx_is_syncing(tx)) {
return (0);
}
if (error != 0) {
return (error);
}
} else {
}
}
return (error);
}
void
{
/* Roll up this additional usage into our ancestors */
}
}
static void
{
} else {
(longlong_t)newval);
}
}
int
{
}
static dsl_dir_t *
{
return (dd);
}
}
return (NULL);
}
/*
* If delta is applied to dd, how much of that delta would be applied to
* ancestor? Syncing context only.
*/
static int64_t
{
return (delta);
}
typedef struct dsl_dir_rename_arg {
const char *ddra_oldname;
const char *ddra_newname;
/* ARGSUSED */
static int
{
char namebuf[MAXNAMELEN];
return (SET_ERROR(ENAMETOOLONG));
return (0);
}
static int
{
const char *mynewname;
int error;
/* target dir should exist */
if (error != 0)
return (error);
/* new parent should exist */
if (error != 0) {
return (error);
}
/* can't rename to different pool */
}
/* new name should not already exist */
}
/* if the name length is growing, validate child name lengths */
if (delta > 0) {
if (error != 0) {
return (error);
}
}
if (dmu_tx_is_syncing(tx)) {
/*
* Although this is the check function and we don't
* normally make on-disk changes in check functions,
* we need to do that here.
*
* Ensure this portion of the tree's counts have been
* initialized in case the new parent has limits set.
*/
}
}
/* is there enough space? */
if (dsl_dir_is_zapified(dd)) {
int err;
&fs_cnt);
return (err);
}
/*
* have to add 1 for the filesystem itself that we're
* moving
*/
fs_cnt++;
&ss_cnt);
return (err);
}
}
/* no rename into our descendant */
}
if (error != 0) {
return (error);
}
}
return (0);
}
static void
{
const char *mynewname;
int error;
&mynewname));
/* Log this before we change the name. */
/*
* We already made sure the dd counts were initialized in the
* check function.
*/
&fs_cnt));
/* add 1 for the filesystem itself that we're moving */
fs_cnt++;
&ss_cnt));
}
-unused_rsrv, 0, 0, tx);
unused_rsrv, 0, 0, tx);
}
}
/* remove from old parent zapobj */
/* add to new parent zapobj */
}
int
{
return (dsl_sync_task(oldname,
3, ZFS_SPACE_CHECK_RESERVED));
}
int
{
int err;
if (err != 0)
return (err);
if (err != 0)
return (err);
return (0);
}
{
timestruc_t t;
t = dd->dd_snap_cmtime;
return (t);
}
void
{
timestruc_t t;
gethrestime(&t);
dd->dd_snap_cmtime = t;
}
void
{
}
{
}