avs/nsctl/nskernd.c

	nskernd.c revision 570de38f63910201fdd77246630b7aa8f9dc5661
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/resource.h>
#include <sys/priocntl.h>
#include <sys/rtpriocntl.h>
#include <sys/tspriocntl.h>
#include <sys/wait.h>
#include <sys/stat.h>

#include <strings.h>
#include <thread.h>
#include <stdlib.h>
#include <signal.h>
#include <errno.h>
#include <stdio.h>
#include <fcntl.h>
#include <locale.h>
#include <unistd.h>
#include <syslog.h>

#include <sys/nsctl/cfg.h>
#include <sys/nsctl/nsctl.h>
#include <sys/nsctl/nsc_ioctl.h>
#include <sys/nskernd.h>
#include <nsctl.h>

#include <sys/mkdev.h>
#include <sys/nsctl/sv_efi.h>

/* Path of the nsctl control device opened by get_bsize() and main(). */
static const char *rdev = "/dev/nsctl";

/*
 * Define a minimal user stack size in bytes over and above the
 * libthread THR_MIN_STACK minimum value.
 *
 * This stack size needs to be sufficient to run _newlwp() and then
 * ioctl() down into the kernel.
 */
#define NSK_STACK_SIZE  512

/*
 * LWP scheduling control switches.
 *
 * allow_pri    - set to non-zero to enable priocntl() manipulations of
 *      created LWPs.
 * allow_rt - set to non-zero to use the RT rather than the TS
 *      scheduling class when manipulating the scheduling
 *      parameters for an LWP.  Only used if allow_pri is
 *      non-zero.
 */
static int allow_pri = 1;
static int allow_rt = 0;    /* disallow - bad interactions with timeout() */

/* descriptor for rdev once main() has opened it, -1 before that */
static int nsctl_fd = -1;

/*
 * Count of SIGTERMs seen by sighand(); the service loop in main() tests
 * it and resets it to zero.  It is written from a signal handler, so it
 * has to be a volatile sig_atomic_t rather than a plain int.
 */
static volatile sig_atomic_t sigterm;

static int nthreads;        /* number of threads in the kernel */
static int exiting;     /* shutdown in progress flag */
static mutex_t thr_mutex = DEFAULTMUTEX;    /* guards nthreads, exiting */
static mutex_t cfg_mutex = DEFAULTMUTEX;    /* held around cfg_open..close */

/* result of cfg_iscluster(), stored by iscluster(); -1 until then */
static int cl_nodeid = -1;

/* -g: have canshutdown() announce that it is waiting for threads */
static int display_msg = 0;
/* -d: seconds canshutdown() waits for kernel threads to drain */
static int delay_time = 30;

/*
 * Write the usage line to stderr and terminate the process with
 * exit status 255.  Never returns.
 */
static void
usage(void)
{
    const char *text = gettext("usage: nskernd\n");

    (void) fprintf(stderr, text);
    exit(255);
}


/*
 * Signal handler: bump the sigterm counter for each SIGTERM delivered.
 * Any other signal number is ignored.
 */
static void
sighand(int sig)
{
    if (sig != SIGTERM)
        return;

    sigterm += 1;
}


/*
 * Account for a thread that is about to enter the kernel.
 *
 * Returns: 1 - nthreads was incremented, the caller can enter the kernel;
 *      0 - shutdown in progress (exiting is set), do not enter
 *          the kernel.  nthreads is left untouched.
 */
int
nthread_inc(void)
{
    int allowed;

    (void) mutex_lock(&thr_mutex);
    allowed = !exiting;
    if (allowed) {
        nthreads++;
    }
    (void) mutex_unlock(&thr_mutex);

    return (allowed);
}


/*
 * Account for a thread that has come back out of the kernel: drops
 * nthreads by one under thr_mutex.  Pairs with a successful
 * nthread_inc().
 */
void
nthread_dec(void)
{
    (void) mutex_lock(&thr_mutex);
    nthreads -= 1;
    (void) mutex_unlock(&thr_mutex);
}


/*
 * Decide whether the daemon may shut down.
 *
 * If threads are still inside the kernel, poll once a second for up to
 * delay_time seconds waiting for nthreads to drop to zero; thr_mutex is
 * released while sleeping so those threads can call nthread_dec().
 * When no threads remain, the exiting flag is set so that nthread_inc()
 * turns away any later arrivals.
 *
 * returns: 1 - can shutdown; 0 - unable to shutdown
 */
int
canshutdown(void)
{
    time_t  began;
    int busy;

    (void) mutex_lock(&thr_mutex);

    if (nthreads > 0) {
        if (display_msg) {
            (void) fprintf(stderr,
                gettext("nskernd: unable to shutdown: "
                "%d kernel threads in use\n"), nthreads);
        }

        began = time(0);
        for (;;) {
            if (nthreads <= 0)
                break;
            if ((time(0) - began) >= delay_time)
                break;

            (void) mutex_unlock(&thr_mutex);
            (void) sleep(1);
            (void) mutex_lock(&thr_mutex);
            (void) fprintf(stderr,
                gettext("nskernd:   delay shutdown: "
                "%d kernel threads in use\n"), nthreads);
        }
    }

    busy = (nthreads > 0);
    if (!busy) {
        /* flag shutdown in progress */
        exiting = 1;
    }

    (void) mutex_unlock(&thr_mutex);

    return (busy ? 0 : 1);
}


/*
 * returns: 1 - shutdown successful; 0 - unable to shutdown
 */
int
shutdown(void)
{
    struct nskernd data;
    int rc;

    if (nsctl_fd < 0)
        return (1);

    bzero(&data, sizeof (data));
    data.command = NSKERND_STOP;

    if (!canshutdown()) {
        return (0);
    }

    rc = ioctl(nsctl_fd, NSCIOC_NSKERND, &data);
    if (rc < 0) {
        if (errno != EINTR || !sigterm) {
            (void) fprintf(stderr,
                gettext("nskernd: NSKERND_STOP failed\n"));
        }
    }

    return (1);
}


/*
 * First function run by a NSKERND_NEWLWP thread.
 *
 * Determines if it needs to change the scheduling priority of the LWP,
 * and then calls back into the kernel.
 */
static void *
_newlwp(void *arg)
{
    struct nskernd nsk;
    pcparms_t pcparms;
    pcinfo_t pcinfo;

    /* copy arguments onto stack and free heap memory */
    bcopy(arg, &nsk, sizeof (nsk));
    free(arg);

    if (nsk.data2 && allow_pri) {
        /* increase the scheduling priority of this LWP */

        bzero(&pcinfo, sizeof (pcinfo));
        (void) strcpy(pcinfo.pc_clname, allow_rt ? "RT" : "TS");

        if (priocntl(0, 0, PC_GETCID, (char *)&pcinfo) < 0) {
            (void) fprintf(stderr,
                gettext(
                "nskernd: priocntl(PC_GETCID) failed: %s\n"),
                strerror(errno));
            goto pri_done;
        }

        bzero(&pcparms, sizeof (pcparms));
        pcparms.pc_cid = pcinfo.pc_cid;

        if (allow_rt) {
            ((rtparms_t *)pcparms.pc_clparms)->rt_pri =
                (pri_t)0; /* minimum RT priority */
            ((rtparms_t *)pcparms.pc_clparms)->rt_tqsecs =
                (uint_t)RT_TQDEF;
            ((rtparms_t *)pcparms.pc_clparms)->rt_tqnsecs =
                RT_TQDEF;
        } else {
            ((tsparms_t *)pcparms.pc_clparms)->ts_uprilim =
                ((tsinfo_t *)&pcinfo.pc_clinfo)->ts_maxupri;
            ((tsparms_t *)pcparms.pc_clparms)->ts_upri =
                ((tsinfo_t *)&pcinfo.pc_clinfo)->ts_maxupri;
        }

        if (priocntl(P_LWPID, P_MYID,
            PC_SETPARMS, (char *)&pcparms) < 0) {
            (void) fprintf(stderr,
                gettext(
                "nskernd: priocntl(PC_SETPARMS) failed: %s\n"),
                strerror(errno));
        }
    }

pri_done:
    if (nthread_inc()) {
        (void) ioctl(nsctl_fd, NSCIOC_NSKERND, &nsk);
        nthread_dec();
    }
    return (NULL);
}


/*
 * Start a new thread bound to an LWP.
 *
 * This is the user level side of nsc_create_process().
 */
static void
newlwp(struct nskernd *req)
{
    struct nskernd *nskp;
    thread_t tid;
    int rc;

    nskp = malloc(sizeof (*nskp));
    if (!nskp) {
#ifdef DEBUG
        (void) fprintf(stderr, gettext("nskernd: malloc(%d) failed\n"),
            sizeof (*nskp));
#endif
        req->data1 = (uint64_t)ENOMEM;
        return;
    }

    /* copy args for child */
    bcopy(req, nskp, sizeof (*nskp));

    rc = thr_create(NULL, (THR_MIN_STACK + NSK_STACK_SIZE),
        _newlwp, nskp, THR_BOUND|THR_DETACHED, &tid);

    if (rc != 0) {
        /* thr_create failed */
#ifdef DEBUG
        (void) fprintf(stderr,
            gettext("nskernd: thr_create failed: %s\n"),
            strerror(errno));
#endif
        req->data1 = (uint64_t)errno;
        free(nskp);
    } else {
        /* success - _newlwp() will free nskp */
        req->data1 = (uint64_t)0;
    }
}

/*
 * Record II bitmap error flags for a set in the dscfg configuration.
 *
 * The "ii.set<N>" entries are scanned for the first one whose second
 * (shadow) field begins with `set'.  That entry is rewritten with its
 * options field replaced by "<NSKERN_II_BMP_OPTION>=0x<flags>", followed
 * by any other ';'-separated options that were already there; an older
 * NSKERN_II_BMP_OPTION value is dropped.
 *
 * If the configuration write lock is not available straight away, a
 * child process is forked to wait for the lock and do the update.  The
 * parent returns 0 immediately; the child never returns from this
 * function, it exits with the result as its exit status.
 *
 * set   - name to match against the shadow field; must be non-empty.
 * flags - value stored with NSKERN_II_BMP_OPTION.
 *
 * Returns 0 on success or when the update was deferred to a child,
 * EINVAL for an empty name or when no usable entry matches, ENXIO when
 * the configuration cannot be opened or written, or the errno left by a
 * failed fork().
 */
static int
log_iibmp_err(char *set, int flags)
{
    CFGFILE *cfg;
    char key[CFG_MAX_KEY];
    char buf[CFG_MAX_BUF];
    char newflags[CFG_MAX_BUF];
    char outbuf[CFG_MAX_BUF];
    char *mst, *shd, *bmp, *mode, *ovr, *cnode, *opt, *grp;
    int setno, found = 0;
    int setlen;
    int rc = 0;
    int err;
    size_t used;
    pid_t pid = -1;

    if (set == NULL || *set == '\0') {
        return (EINVAL);
    }
    setlen = strlen(set);

    (void) mutex_lock(&cfg_mutex);
    cfg = cfg_open("");
    if (!cfg) {
        (void) mutex_unlock(&cfg_mutex);
        return (ENXIO);
    }

    if (!cfg_lock(cfg, CFG_WRLOCK)) {
        /* lock is busy: let a child wait for it instead of us */
        cfg_close(cfg);
        (void) mutex_unlock(&cfg_mutex);

        pid = fork();

        if (pid == -1) {
            err = errno;    /* fprintf may disturb errno */
            (void) fprintf(stderr, gettext(
                "nskernd: Error forking\n"));
            return (err);
        } else if (pid > 0) {
            (void) fprintf(stdout, gettext(
                "nskernd: Attempting deferred bitmap error\n"));
            return (0);
        }

        /* child from here on */
        (void) mutex_lock(&cfg_mutex);
        cfg = cfg_open("");
        if (!cfg) {
            (void) mutex_unlock(&cfg_mutex);
            (void) fprintf(stderr, gettext(
                "nskernd: Failed cfg_open, deferred bitmap\n"));
            /*
             * Must exit, not return: returning would drop the
             * child into the caller's service loop as a second
             * daemon.
             */
            exit(ENXIO);
        }

        /* Sooner or later, this lock will be free */
        while (!cfg_lock(cfg, CFG_WRLOCK))
            (void) sleep(2);
    }

    /* find the proper set number */
    for (setno = 1; !found; setno++) {
        (void) snprintf(key, CFG_MAX_KEY, "ii.set%d", setno);
        if (cfg_get_cstring(cfg, key, buf, CFG_MAX_BUF) < 0) {
            break;
        }

        mst = strtok(buf, " ");
        shd = strtok(NULL, " ");
        if (shd == NULL) {
            /* entry has no shadow field - cannot be a match */
            continue;
        }
        if (strncmp(shd, set, setlen) == 0) {
            found = 1;

            bmp = strtok(NULL, " ");
            mode = strtok(NULL, " ");
            ovr = strtok(NULL, " ");
            cnode = strtok(NULL, " ");
            opt = strtok(NULL, " ");
            grp = strtok(NULL, " ");
            break;  /* leaves setno at the matching entry */
        }
    }

    /*
     * strtok() keeps returning NULL once the line is exhausted, so a
     * non-NULL last field means every earlier field is present too.
     * A short entry is treated as not found rather than formatting
     * NULL strings below.
     */
    if (found && grp == NULL) {
        found = 0;
    }

    if (found) {
        /* the new flag value always leads the options field */
        (void) snprintf(newflags, sizeof (newflags), "%s=0x%x",
            NSKERN_II_BMP_OPTION, flags);

        /* were there flags in the options field already? */
        if (strcmp(opt, "-") != 0) {
            for (opt = strtok(opt, ";"); opt != NULL;
                opt = strtok(NULL, ";")) {
                if (strncmp(opt, NSKERN_II_BMP_OPTION,
                    strlen(NSKERN_II_BMP_OPTION)) == 0) {
                    /* superseded by the new value */
                    continue;
                }
                /* bounded append of ";<opt>" */
                used = strlen(newflags);
                (void) snprintf(newflags + used,
                    sizeof (newflags) - used, ";%s", opt);
            }
        }
        (void) snprintf(key, CFG_MAX_KEY, "ii.set%d", setno);
        (void) snprintf(outbuf, CFG_MAX_BUF, "%s %s %s %s %s %s %s %s",
            mst, shd, bmp, mode, ovr, cnode, newflags, grp);
        if (cfg_put_cstring(cfg, key, outbuf, CFG_MAX_BUF) < 0) {
            (void) printf("Failed to put [%s]\n", outbuf);
            rc = ENXIO;
        } else {
            (void) cfg_commit(cfg);
            rc = 0;
        }
    } else {
        (void) fprintf(stderr, gettext(
            "nskernd: Failed deferred bitmap [%s]\n"), set);
        rc = EINVAL;
    }
    cfg_unlock(cfg);
    cfg_close(cfg);
    (void) mutex_unlock(&cfg_mutex);

    /*
     * if we are the fork'ed client, just exit, if parent just return
     */
    if (pid == 0) {
        exit(rc);
        /*NOTREACHED*/
    } else {
        return (rc);
    }
}

/*
 * First function run by a NSKERND_LOCK thread.
 *
 * Opens dscfg and locks it,
 * and then calls back into the kernel.
 *
 * Incoming:
 *  data1 is the kernel address of the sync structure.
 *  data2 is read(0)/write(1) lock mode.
 *
 * Returns:
 *  data1 as incoming.
 *  data2 errno.
 */
static void *
_dolock(void *arg)
{
    struct nskernd nsk;
    CFGFILE *cfg;
    int locked;
    int mode;
    int rc = 0;

    /* copy arguments onto stack and free heap memory */
    bcopy(arg, &nsk, sizeof (nsk));
    free(arg);

    (void) mutex_lock(&cfg_mutex);
    cfg = cfg_open("");
    if (cfg == NULL) {
#ifdef DEBUG
        (void) fprintf(stderr,
            gettext("nskernd: cfg_open failed: %s\n"),
            strerror(errno));
#endif
        rc = ENXIO;
    }

    if (nsk.data2 == 0) {
        mode = CFG_RDLOCK;
    } else {
        mode = CFG_WRLOCK;
    }

    locked = 0;
    if (rc == 0) {
        if (cfg_lock(cfg, mode)) {
            locked = 1;
        } else {
#ifdef DEBUG
            (void) fprintf(stderr,
                gettext("nskernd: cfg_lock failed: %s\n"),
                strerror(errno));
#endif
            rc = EINVAL;
        }
    }

    /* return to kernel */

    nsk.data2 = (uint64_t)rc;
    if (nthread_inc()) {
        (void) ioctl(nsctl_fd, NSCIOC_NSKERND, &nsk);
        nthread_dec();
    }

    /* cleanup */

    if (locked) {
        cfg_unlock(cfg);
        locked = 0;
    }

    if (cfg != NULL) {
        cfg_close(cfg);
        cfg = NULL;
    }
    (void) mutex_unlock(&cfg_mutex);

    return (NULL);
}


/*
 * Inter-node lock thread.
 *
 * This is the user level side of nsc_rmlock().
 */
static void
dolock(struct nskernd *req)
{
    struct nskernd *nskp;
    thread_t tid;
    int rc;

    /* create a new thread to do the lock and return to kernel */

    nskp = malloc(sizeof (*nskp));
    if (!nskp) {
#ifdef DEBUG
        (void) fprintf(stderr,
            gettext("nskernd:dolock: malloc(%d) failed\n"),
            sizeof (*nskp));
#endif
        req->data1 = (uint64_t)ENOMEM;
        return;
    }

    /* copy args for child */
    bcopy(req, nskp, sizeof (*nskp));

    rc = thr_create(NULL, (THR_MIN_STACK + NSK_STACK_SIZE),
        _dolock, nskp, THR_BOUND|THR_DETACHED, &tid);

    if (rc != 0) {
        /* thr_create failed */
#ifdef DEBUG
        (void) fprintf(stderr,
            gettext("nskernd: thr_create failed: %s\n"),
            strerror(errno));
#endif
        req->data1 = (uint64_t)errno;
        free(nskp);
    } else {
        /* success - _dolock() will free nskp */
        req->data1 = (uint64_t)0;
    }
}


/*
 * Convenience code for engineering test of multi-terabyte volumes.
 *
 * zvol (part of zfs) does not support DKIOCPARTITION but does use EFI
 * labels.  This code allocates a simple efi label structure and ioctls
 * to extract the size of a zvol.  It only handles the minimal EFI ioctl
 * implementation in zvol.
 */

static void
zvol_bsize(char *path, uint64_t *size, const int pnum)
{
    struct stat64 stb1, stb2;
    struct dk_minfo dkm;
    int fd = -1;
    int rc;

    if (cl_nodeid || pnum != 0)
        return;

    if ((fd = open(path, O_RDONLY)) < 0) {
        return;
    }

    if (stat64("/devices/pseudo/zfs@0:zfs", &stb1) != 0 ||
        fstat64(fd, &stb2) != 0 ||
        !S_ISCHR(stb1.st_mode) ||
        !S_ISCHR(stb2.st_mode) ||
        major(stb1.st_rdev) != major(stb2.st_rdev)) {
        (void) close(fd);
        return;
    }

    rc = ioctl(fd, DKIOCGMEDIAINFO, (void *)&dkm);
    if (rc >= 0) {
        *size = LE_64(dkm.dki_capacity) *
            (dkm.dki_lbsize) / 512;
    }

    (void) close(fd);
}

/*
 * Work out the partition number and size of the device that the kernel
 * has open as raw_fd, by asking the nsctl driver (NSCIOC_BSIZE on rdev)
 * for the device's dk_cinfo and vtoc.
 *
 * raw_fd     - passed through to the driver in the request.
 * size       - out: partition size, 0 if it could not be determined.
 * partitionp - out: partition number, -1 if it could not be determined.
 * path       - device path, only used for the zvol fallback.
 *
 * If the first request fails but did fill in a partition number, and
 * DKIOCPARTITION is available at build time, the EFI variants are tried
 * and finally zvol_bsize().  If it succeeds, the size is taken from the
 * vtoc after it has been sanity checked.
 */
/* ARGSUSED */
static void
get_bsize(uint64_t raw_fd, uint64_t *size, int *partitionp, char *path)
{
    struct nscioc_bsize bsize;
#ifdef DKIOCPARTITION
    struct partition64 p64;
#endif
    struct dk_cinfo dki_info;
    struct vtoc vtoc;
    int fd;

    *partitionp = -1;
    *size = (uint64_t)0;

    /* sentinel: lets us tell whether the driver filled this in */
    dki_info.dki_partition = (ushort_t)-1;
    bsize.dki_info = (uint64_t)(unsigned long)&dki_info;
    bsize.vtoc = (uint64_t)(unsigned long)&vtoc;
    bsize.raw_fd = raw_fd;
    bsize.efi = 0;

    fd = open(rdev, O_RDONLY);
    if (fd < 0)
        return;

    if (ioctl(fd, NSCIOC_BSIZE, &bsize) < 0) {
        if (dki_info.dki_partition != (ushort_t)-1) {
            /* assume part# is ok and just the size failed */
            *partitionp = (int)dki_info.dki_partition;

#ifdef DKIOCPARTITION
            /* see if this is an EFI label */
            bzero(&p64, sizeof (p64));
            p64.p_partno = (uint_t)*partitionp;
            if ((ioctl(fd, DKIOCPARTITION, &p64)) > 0) {
                *size = (uint64_t)p64.p_size;
            } else {
                bsize.p64 = (uint64_t)(unsigned long)&p64;
                bsize.efi = 1;

                if (ioctl(fd, NSCIOC_BSIZE, &bsize) < 0) {
                    /* see if this is a zvol */
                    zvol_bsize(path, size, *partitionp);
                } else {
                    *size = (uint64_t)p64.p_size;
                }
            }
#endif  /* DKIOCPARTITION */
        }

        (void) close(fd);
        return;
    }

    (void) close(fd);

    *partitionp = (int)dki_info.dki_partition;

    if (vtoc.v_sanity != VTOC_SANE)
        return;

    if (vtoc.v_version != V_VERSION && vtoc.v_version != 0)
        return;

    /*
     * v_part[] has V_NUMPAR entries, so V_NUMPAR itself is already
     * one past the end and must be rejected as well.
     */
    if (dki_info.dki_partition >= V_NUMPAR)
        return;

    *size = (uint64_t)vtoc.v_part[(int)dki_info.dki_partition].p_size;
}


/*
 * Find out if we are running in a cluster.
 *
 * Stores the result of cfg_iscluster() in cl_nodeid.  Returns TRUE when
 * that value is positive and FALSE when it is zero.  A negative value
 * is fatal: a message is printed on stderr and the process exits with
 * status 1.
 */
static int
iscluster(void)
{
    cl_nodeid = cfg_iscluster();

    if (cl_nodeid < 0) {
        (void) fprintf(stderr, "%s\n",
            gettext("nskernd: unable to ascertain environment"));
        exit(1);
        /* NOTREACHED */
    }

    return ((cl_nodeid > 0) ? TRUE : FALSE);
}

/*
 * Runtime Solaris release checking - build release == runtime release
 * is always considered success, so only keep entries in the map for
 * the special cases.
 *
 * The table is handed to nsc_check_release() by main() together with
 * BUILD_REV_STR.
 *
 * NOTE(review): each entry looks like a pair of release strings, one
 * for the build and one for an acceptable runtime, and { NULL, NULL }
 * looks like the end-of-table marker - confirm the field order and the
 * terminator convention against the nsc_release_t definition.
 */
static nsc_release_t nskernd_rel_map[] = {
/*  { "5.10", "5.10" },         */
    { "5.11", "5.10" },
    { NULL, NULL }
};


#ifdef lint
#define main    nskernd_main
#endif
/* ARGSUSED1 */
int
main(int argc, char *argv[])
{
    const char *dir = "/";
    struct nskernd data;
    struct rlimit rl;
    int i, run, rc;
    int partition;
    char *reqd;
    int syncpipe[2];
    int startup;

    (void) setlocale(LC_ALL, "");
    (void) textdomain("nskernd");

    rc = nsc_check_release(BUILD_REV_STR, nskernd_rel_map, &reqd);
    if (rc < 0) {
        (void) fprintf(stderr,
            gettext("nskernd: unable to determine the current "
            "Solaris release: %s\n"), strerror(errno));
        exit(1);
    } else if (rc == FALSE) {
        (void) fprintf(stderr,
            gettext("nskernd: incorrect Solaris release "
            "(requires %s)\n"), reqd);
        exit(1);
    }

    rc = 0;

    if (argc != 1)
        usage();

    /*
     * Usage: <progname> [-g] [-d <seconds to delay>]
     */
    while ((i = getopt(argc, argv, "gd:")) != EOF) {
        switch (i) {
            case 'g':
                display_msg = 1;
                break;
            case 'd':
                delay_time = atoi(optarg);
                if (delay_time <= 0) {
                    delay_time = 30;
                }
                break;
            default:
                syslog(LOG_ERR,
                "Usage: nskernd [-g] [-d <seconds to delay>]");
                exit(1);
                break;
        }
    }

    if (chroot(dir) < 0) {
        (void) fprintf(stderr, gettext("nskernd: chroot failed: %s\n"),
            strerror(errno));
        exit(1);
    }

    if (chdir(dir) < 0) {
        (void) fprintf(stderr, gettext("nskernd: chdir failed: %s\n"),
            strerror(errno));
        exit(1);
    }

    /*
     * Determine if we are in a Sun Cluster or not, before fork'ing
     */
    (void) iscluster();

    /*
     * create a pipe to synchronise the parent with the
     * child just before it enters its service loop.
     */
    if (pipe(syncpipe) < 0) {
        (void) fprintf(stderr,
            gettext("nskernd: cannot create pipe: %s\n"),
            strerror(errno));
        exit(1);
    }
    /*
     * Fork off a child that becomes the daemon.
     */

    if ((rc = fork()) > 0) {
        char c;
        int n;
        (void) close(syncpipe[1]);
        /*
         * wait for the close of the pipe.
         * If we get a char back, indicates good
         * status from child, so exit 0.
         * If we get a zero length read, then the
         * child has failed, so we do too.
         */
        n = read(syncpipe[0], &c, 1);
        exit((n <= 0) ? 1 : 0);
    } else if (rc < 0) {
        (void) fprintf(stderr, gettext("nskernd: cannot fork: %s\n"),
            strerror(errno));
        exit(1);
    }

    /*
     * In child - become daemon.
     */

    /* use closefrom(3C) from PSARC/2000/193 when possible */
    for (i = 0; i < syncpipe[1]; i++) {
        (void) close(i);
    }
    closefrom(syncpipe[1] + 1);

    (void) open("/dev/console", O_WRONLY|O_APPEND);
    (void) dup(0);
    (void) dup(0);
    (void) close(0);

    (void) setpgrp();

    /*
     * Ignore all signals apart from SIGTERM.
     */

    for (i = 1; i < _sys_nsig; i++)
        (void) sigset(i, SIG_IGN);

    (void) sigset(SIGTERM, sighand);

    /*
     * Increase the number of fd's that can be open.
     */

    rl.rlim_cur = RLIM_INFINITY;
    rl.rlim_max = RLIM_INFINITY;
    if (setrlimit(RLIMIT_NOFILE, &rl) < 0) {
        (void) fprintf(stderr,
            gettext("nskernd: could not increase RLIMIT_NOFILE: %s\n"),
            strerror(errno));
        (void) fprintf(stderr,
            gettext("nskernd: the maximum number of nsctl open "
            "devices may be reduced\n"));
    }

    /*
     * Open /dev/nsctl and startup.
     */

    nsctl_fd = open(rdev, O_RDONLY);
    if (nsctl_fd < 0) {
        (void) fprintf(stderr, gettext("nskernd: unable to open %s\n"),
            rdev);
        exit(1);
    }

    bzero(&data, sizeof (data));

    data.command = NSKERND_START;
    data.data1 = (uint64_t)cl_nodeid;
    run = 1;

    startup = 1;
    while (run) {
        rc = ioctl(nsctl_fd, NSCIOC_NSKERND, &data);
        if (rc < 0) {
            /* try and do kernel cleanup and exit */
            if (shutdown()) {
                run = 0;
            } else {
                sigterm = 0;
            }

            (void) fprintf(stderr,
                gettext("nskernd: NSCIOC_NSKERND failed: %s\n"),
                strerror(errno));
            continue;
        } else if (sigterm) {
            /* SIGTERM received - terminate */
            if (data.command != NSKERND_START &&
                (data.command != NSKERND_STOP ||
                data.data1 != (uint64_t)1)) {
                /* need to do kernel cleanup */
                if (shutdown()) {
                    run = 0;
                } else {
                    sigterm = 0;
                    data.command = NSKERND_START;
                    data.data1 = (uint64_t)cl_nodeid;
                }
            } else {
                /* just quit */
                if (canshutdown()) {
                    run = 0;
                } else {
                    /* cannot shutdown - threads active */
                    sigterm = 0;
                    data.command = NSKERND_START;
                    data.data1 = (uint64_t)cl_nodeid;
                }
            }
            continue;
        }
        if (startup) {
            char c = 0;
            (void) write(syncpipe[1], &c, 1);
            (void) close(syncpipe[1]);
            startup = 0;
        }
        switch (data.command) {
        case NSKERND_START: /* (re)start completion */
            if (rc == 1) {
                (void) fprintf(stderr,
                    gettext("nskernd: already started\n"));
                run = 0;
            } else if (rc == 2) {
                (void) fprintf(stderr,
                    gettext("nskernd: stopped by kernel\n"));
                run = 0;
            }
            data.command = NSKERND_WAIT;
            break;

        case NSKERND_STOP:  /* kernel telling daemon to stop */
            if (data.data1 != (uint64_t)1) {
                (void) shutdown();
                run = 0;
            }
            break;

        case NSKERND_BSIZE:
            /*
             * kernel requesting partsize
             * data1 - size return
             * data2 - raw_fd (entry)
             *   - partition number (return)
             */
            partition = -1;
            get_bsize(data.data2, &data.data1,
                &partition, data.char1);
            data.data2 = (uint64_t)partition;
            data.command = NSKERND_WAIT;
            break;

        case NSKERND_NEWLWP:    /* kernel requesting a new LWP */
            newlwp(&data);
            data.command = NSKERND_WAIT;
            break;

        case NSKERND_LOCK:      /* kernel requesting lock */
            dolock(&data);
            data.command = NSKERND_WAIT;
            break;

        case NSKERND_WAIT:  /* kernel retrying wait */
            /*
             * the kernel thread can be woken by the dr config
             * utilities (ie cfgadm) therefore we just reissue
             * the wait.
             */
            break;

        case NSKERND_IIBITMAP:
            rc = log_iibmp_err(data.char1, (int)data.data1);
            data.data1 = (uint64_t)rc;
            data.command = NSKERND_WAIT;
            break;

        default:
            (void) fprintf(stderr,
                gettext("nskernd: unknown command %d"),
                data.command);
            data.command = NSKERND_WAIT;
            break;
        }
    }

    (void) close(nsctl_fd);

    return (rc);
}