lgrpsys.c revision dc32d872cbeb56532bcea030255db9cd79bac7da
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright 2015 Joyent, Inc.
*/
/*
* lgroup system calls
*/
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/sunddi.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cpupart.h>
#include <sys/lgrp.h>
#include <sys/lgrp_user.h>
#include <sys/promif.h> /* for prom_printf() */
#include <sys/sysmacros.h>
#include <sys/policy.h>
#include <vm/as.h>
/* definitions for mi_validity */
#define VALID_ADDR 1
#define VALID_REQ 2
/*
* run through the given number of addresses and requests and return the
* corresponding memory information for each address
*/
static int
meminfo(int addr_count, struct meminfo *mip)
{
size_t in_size, out_size, req_size, val_size;
struct as *as;
struct hat *hat;
int i, j, out_idx, info_count;
lgrp_t *lgrp;
pfn_t pfn;
ssize_t pgsz;
int *req_array, *val_array;
uint64_t *in_array, *out_array;
uint64_t addr, paddr;
uintptr_t vaddr;
int ret = 0;
struct meminfo minfo;
#if defined(_SYSCALL32_IMPL)
struct meminfo32 minfo32;
#endif
/*
* Make sure that there is at least one address to translate and
* limit how many virtual addresses the kernel will process per call
*/
if (addr_count < 1)
return (set_errno(EINVAL));
else if (addr_count > MAX_MEMINFO_CNT)
addr_count = MAX_MEMINFO_CNT;
if (get_udatamodel() == DATAMODEL_NATIVE) {
if (copyin(mip, &minfo, sizeof (struct meminfo)))
return (set_errno(EFAULT));
}
#if defined(_SYSCALL32_IMPL)
else {
bzero(&minfo, sizeof (minfo));
if (copyin(mip, &minfo32, sizeof (struct meminfo32)))
return (set_errno(EFAULT));
minfo.mi_inaddr = (const uint64_t *)(uintptr_t)
minfo32.mi_inaddr;
minfo.mi_info_req = (const uint_t *)(uintptr_t)
minfo32.mi_info_req;
minfo.mi_info_count = minfo32.mi_info_count;
minfo.mi_outdata = (uint64_t *)(uintptr_t)
minfo32.mi_outdata;
minfo.mi_validity = (uint_t *)(uintptr_t)
minfo32.mi_validity;
}
#endif
/*
* all the input parameters have been copied in:
* addr_count - number of input addresses
* minfo.mi_inaddr - array of input addresses
* minfo.mi_info_req - array of types of information requested
* minfo.mi_info_count - no. of pieces of info requested for each addr
* minfo.mi_outdata - array into which the results are placed
* minfo.mi_validity - array containing bitwise result codes; 0th bit
* evaluates validity of corresponding input
* address, 1st bit validity of response to first
* member of info_req, etc.
*/
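/*
 * For example, with info_count == 2 a fully successful translation
 * leaves val_array[i] == VALID_ADDR | (VALID_REQ << 0) |
 * (VALID_REQ << 1) == 0x7, while an unmapped input address leaves
 * val_array[i] == 0.
 */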
/* make sure mi_info_count is within limit */
info_count = minfo.mi_info_count;
if (info_count < 1 || info_count > MAX_MEMINFO_REQ)
return (set_errno(EINVAL));
/*
* allocate buffer in_array for the input addresses and copy them in
*/
in_size = sizeof (uint64_t) * addr_count;
in_array = kmem_alloc(in_size, KM_SLEEP);
if (copyin(minfo.mi_inaddr, in_array, in_size)) {
kmem_free(in_array, in_size);
return (set_errno(EFAULT));
}
/*
* allocate buffer req_array for the input info_reqs and copy them in
*/
req_size = sizeof (uint_t) * info_count;
req_array = kmem_alloc(req_size, KM_SLEEP);
if (copyin(minfo.mi_info_req, req_array, req_size)) {
kmem_free(req_array, req_size);
kmem_free(in_array, in_size);
return (set_errno(EFAULT));
}
/*
* Validate privs for each req.
*/
for (i = 0; i < info_count; i++) {
switch (req_array[i] & MEMINFO_MASK) {
case MEMINFO_VLGRP:
case MEMINFO_VPAGESIZE:
break;
default:
if (secpolicy_meminfo(CRED()) != 0) {
kmem_free(req_array, req_size);
kmem_free(in_array, in_size);
return (set_errno(EPERM));
}
break;
}
}
/*
* allocate buffer out_array which holds the results and will have
* to be copied out later
*/
out_size = sizeof (uint64_t) * addr_count * info_count;
out_array = kmem_alloc(out_size, KM_SLEEP);
/*
* allocate buffer val_array which holds the validity bits and will
* have to be copied out later
*/
val_size = sizeof (uint_t) * addr_count;
val_array = kmem_alloc(val_size, KM_SLEEP);
if ((req_array[0] & MEMINFO_MASK) == MEMINFO_PLGRP) {
/* find the corresponding lgroup for each physical address */
for (i = 0; i < addr_count; i++) {
paddr = in_array[i];
pfn = btop(paddr);
lgrp = lgrp_pfn_to_lgrp(pfn);
if (lgrp) {
out_array[i] = lgrp->lgrp_id;
val_array[i] = VALID_ADDR | VALID_REQ;
} else {
out_array[i] = 0;
val_array[i] = 0;
}
}
} else {
/* get the corresponding memory info for each virtual address */
as = curproc->p_as;
AS_LOCK_ENTER(as, RW_READER);
hat = as->a_hat;
for (i = out_idx = 0; i < addr_count; i++, out_idx +=
info_count) {
addr = in_array[i];
vaddr = (uintptr_t)(addr & ~PAGEOFFSET);
if (!as_segat(as, (caddr_t)vaddr)) {
val_array[i] = 0;
continue;
}
val_array[i] = VALID_ADDR;
pfn = hat_getpfnum(hat, (caddr_t)vaddr);
if (pfn != PFN_INVALID) {
paddr = (uint64_t)((pfn << PAGESHIFT) |
(addr & PAGEOFFSET));
for (j = 0; j < info_count; j++) {
switch (req_array[j] & MEMINFO_MASK) {
case MEMINFO_VPHYSICAL:
/*
* return the physical address
* corresponding to the input
* virtual address
*/
out_array[out_idx + j] = paddr;
val_array[i] |= VALID_REQ << j;
break;
case MEMINFO_VLGRP:
/*
* return the lgroup of physical
* page corresponding to the
* input virtual address
*/
lgrp = lgrp_pfn_to_lgrp(pfn);
if (lgrp) {
out_array[out_idx + j] =
lgrp->lgrp_id;
val_array[i] |=
VALID_REQ << j;
}
break;
case MEMINFO_VPAGESIZE:
/*
* return the size of physical
* page corresponding to the
* input virtual address
*/
pgsz = hat_getpagesize(hat,
(caddr_t)vaddr);
if (pgsz != -1) {
out_array[out_idx + j] =
pgsz;
val_array[i] |=
VALID_REQ << j;
}
break;
case MEMINFO_VREPLCNT:
/*
* for future use:
* return the number of replicated
* physical pages corresponding
* to the input virtual address,
* so it is always 0 at the
* moment
*/
out_array[out_idx + j] = 0;
val_array[i] |= VALID_REQ << j;
break;
case MEMINFO_VREPL:
/*
* for future use:
* return the nth physical
* replica of the specified
* virtual address
*/
break;
case MEMINFO_VREPL_LGRP:
/*
* for future use:
* return the lgroup of nth
* physical replica of the
* specified virtual address
*/
break;
case MEMINFO_PLGRP:
/*
* this request is for physical
* addresses only and shouldn't be
* mixed with virtual addresses
*/
break;
default:
break;
}
}
}
}
AS_LOCK_EXIT(as);
}
/* copy out the results and validity bits and free the buffers */
if ((copyout(out_array, minfo.mi_outdata, out_size) != 0) ||
(copyout(val_array, minfo.mi_validity, val_size) != 0))
ret = set_errno(EFAULT);
kmem_free(in_array, in_size);
kmem_free(out_array, out_size);
kmem_free(req_array, req_size);
kmem_free(val_array, val_size);
return (ret);
}
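/*
 * A minimal userland sketch (not part of this file) of the interface
 * the handler above implements, assuming the documented meminfo(2)
 * libc wrapper; the function and variable names here are illustrative
 * only:
 *
 *	#include <sys/types.h>
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *
 *	static void
 *	show_placement(void *p)
 *	{
 *		uint64_t inaddr = (uint64_t)(uintptr_t)p;
 *		uint_t req[2] = { MEMINFO_VLGRP, MEMINFO_VPAGESIZE };
 *		uint64_t out[2];
 *		uint_t valid;
 *
 *		if (meminfo(&inaddr, 1, req, 2, out, &valid) == 0 &&
 *		    (valid & 0x7) == 0x7)
 *			(void) printf("lgrp %llu pagesize %llu\n",
 *			    (u_longlong_t)out[0], (u_longlong_t)out[1]);
 *	}
 *
 * Note that MEMINFO_VLGRP and MEMINFO_VPAGESIZE are the two request
 * types exempted from the secpolicy_meminfo() check above.
 */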
/*
* Initialize lgroup affinities for thread
*/
void
lgrp_affinity_init(lgrp_affinity_t **bufaddr)
{
if (bufaddr)
*bufaddr = NULL;
}
/*
* Free lgroup affinities for thread and set to NULL
* just in case thread gets recycled
*/
void
lgrp_affinity_free(lgrp_affinity_t **bufaddr)
{
if (bufaddr && *bufaddr) {
kmem_free(*bufaddr, nlgrpsmax * sizeof (lgrp_affinity_t));
*bufaddr = NULL;
}
}
#define P_ANY -2 /* cookie specifying any ID */
/*
* Find LWP with given ID in specified process and get its affinity for
* specified lgroup
*/
lgrp_affinity_t
lgrp_affinity_get_thread(proc_t *p, id_t lwpid, lgrp_id_t lgrp)
{
lgrp_affinity_t aff;
int found;
kthread_t *t;
ASSERT(MUTEX_HELD(&p->p_lock));
aff = LGRP_AFF_NONE;
found = 0;
t = p->p_tlist;
/*
* The process may be executing in proc_exit() and its p->p_tlist may
* already be NULL.
*/
if (t == NULL)
return (set_errno(ESRCH));
do {
if (t->t_tid == lwpid || lwpid == P_ANY) {
thread_lock(t);
/*
* Check to see whether caller has permission to get
* affinity for LWP
*/
if (t->t_cid == 0 || !hasprocperm(t->t_cred, CRED())) {
thread_unlock(t);
return (set_errno(EPERM));
}
if (t->t_lgrp_affinity)
aff = t->t_lgrp_affinity[lgrp];
thread_unlock(t);
found = 1;
break;
}
} while ((t = t->t_forw) != p->p_tlist);
if (!found)
aff = set_errno(ESRCH);
return (aff);
}
/*
* Get lgroup affinity for given LWP
*/
lgrp_affinity_t
lgrp_affinity_get(lgrp_affinity_args_t *ap)
{
lgrp_affinity_t aff;
lgrp_affinity_args_t args;
id_t id;
idtype_t idtype;
lgrp_id_t lgrp;
proc_t *p;
kthread_t *t;
/*
* Copyin arguments
*/
if (copyin(ap, &args, sizeof (lgrp_affinity_args_t)) != 0)
return (set_errno(EFAULT));
id = args.id;
idtype = args.idtype;
lgrp = args.lgrp;
/*
* Check for invalid lgroup
*/
if (lgrp < 0 || lgrp == LGRP_NONE)
return (set_errno(EINVAL));
/*
* Check for existing lgroup
*/
if (lgrp > lgrp_alloc_max)
return (set_errno(ESRCH));
/*
* Get lgroup affinity for given LWP or process
*/
switch (idtype) {
case P_LWPID:
/*
* LWP in current process
*/
p = curproc;
mutex_enter(&p->p_lock);
if (id != P_MYID) /* different thread */
aff = lgrp_affinity_get_thread(p, id, lgrp);
else { /* current thread */
aff = LGRP_AFF_NONE;
t = curthread;
thread_lock(t);
if (t->t_lgrp_affinity)
aff = t->t_lgrp_affinity[lgrp];
thread_unlock(t);
}
mutex_exit(&p->p_lock);
break;
case P_PID:
/*
* Process
*/
mutex_enter(&pidlock);
if (id == P_MYID)
p = curproc;
else {
p = prfind(id);
if (p == NULL) {
mutex_exit(&pidlock);
return (set_errno(ESRCH));
}
}
mutex_enter(&p->p_lock);
aff = lgrp_affinity_get_thread(p, P_ANY, lgrp);
mutex_exit(&p->p_lock);
mutex_exit(&pidlock);
break;
default:
aff = set_errno(EINVAL);
break;
}
return (aff);
}
/*
* Find lgroup for which this thread has most affinity in specified partition
* starting from home lgroup unless specified starting lgroup is preferred
*/
lpl_t *
lgrp_affinity_best(kthread_t *t, struct cpupart *cpupart, lgrp_id_t start,
boolean_t prefer_start)
{
lgrp_affinity_t *affs;
lgrp_affinity_t best_aff;
lpl_t *best_lpl;
lgrp_id_t finish;
lgrp_id_t home;
lgrp_id_t lgrpid;
lpl_t *lpl;
ASSERT(t != NULL);
ASSERT((MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0) ||
(MUTEX_HELD(&ttoproc(t)->p_lock) && THREAD_LOCK_HELD(t)));
ASSERT(cpupart != NULL);
if (t->t_lgrp_affinity == NULL)
return (NULL);
affs = t->t_lgrp_affinity;
/*
* Thread bound to CPU
*/
if (t->t_bind_cpu != PBIND_NONE) {
cpu_t *cp;
/*
* Find which lpl has most affinity among leaf lpl directly
* containing CPU and its ancestor lpls
*/
cp = cpu[t->t_bind_cpu];
best_lpl = lpl = cp->cpu_lpl;
best_aff = affs[best_lpl->lpl_lgrpid];
while (lpl->lpl_parent != NULL) {
lpl = lpl->lpl_parent;
lgrpid = lpl->lpl_lgrpid;
if (affs[lgrpid] > best_aff) {
best_lpl = lpl;
best_aff = affs[lgrpid];
}
}
return (best_lpl);
}
/*
* Start searching from home lgroup unless given starting lgroup is
* preferred or home lgroup isn't in given pset. Use root lgroup as
* starting point if both home and starting lgroups aren't in given
* pset.
*/
ASSERT(start >= 0 && start <= lgrp_alloc_max);
home = t->t_lpl->lpl_lgrpid;
if (!prefer_start && LGRP_CPUS_IN_PART(home, cpupart))
lgrpid = home;
else if (start != LGRP_NONE && LGRP_CPUS_IN_PART(start, cpupart))
lgrpid = start;
else
lgrpid = LGRP_ROOTID;
best_lpl = &cpupart->cp_lgrploads[lgrpid];
best_aff = affs[lgrpid];
finish = lgrpid;
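/*
 * For example, with lgrp_alloc_max == 3 and lgrpid == 2 here, the
 * loop below visits lgroups in the order 2, 3, 0, 1, skipping any
 * lgroup without CPU resources in the partition.
 */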
do {
/*
* Skip any lgroups that don't have CPU resources
* in this processor set.
*/
if (!LGRP_CPUS_IN_PART(lgrpid, cpupart)) {
if (++lgrpid > lgrp_alloc_max)
lgrpid = 0; /* wrap the search */
continue;
}
/*
* Find lgroup with most affinity
*/
lpl = &cpupart->cp_lgrploads[lgrpid];
if (affs[lgrpid] > best_aff) {
best_aff = affs[lgrpid];
best_lpl = lpl;
}
if (++lgrpid > lgrp_alloc_max)
lgrpid = 0; /* wrap the search */
} while (lgrpid != finish);
/*
* No lgroup (in this pset) with any affinity
*/
if (best_aff == LGRP_AFF_NONE)
return (NULL);
lgrpid = best_lpl->lpl_lgrpid;
ASSERT(LGRP_CPUS_IN_PART(lgrpid, cpupart) && best_lpl->lpl_ncpu > 0);
return (best_lpl);
}
/*
* Set thread's affinity for given lgroup
*/
int
lgrp_affinity_set_thread(kthread_t *t, lgrp_id_t lgrp, lgrp_affinity_t aff,
lgrp_affinity_t **aff_buf)
{
lgrp_affinity_t *affs;
lgrp_id_t best;
lpl_t *best_lpl;
lgrp_id_t home;
int retval;
ASSERT(t != NULL);
ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
retval = 0;
thread_lock(t);
/*
* Check to see whether caller has permission to set affinity for
* thread
*/
if (t->t_cid == 0 || !hasprocperm(t->t_cred, CRED())) {
thread_unlock(t);
return (set_errno(EPERM));
}
if (t->t_lgrp_affinity == NULL) {
if (aff == LGRP_AFF_NONE) {
thread_unlock(t);
return (0);
}
ASSERT(aff_buf != NULL && *aff_buf != NULL);
t->t_lgrp_affinity = *aff_buf;
*aff_buf = NULL;
}
affs = t->t_lgrp_affinity;
affs[lgrp] = aff;
/*
* Find lgroup for which thread has most affinity,
* starting with lgroup for which affinity being set
*/
best_lpl = lgrp_affinity_best(t, t->t_cpupart, lgrp, B_TRUE);
/*
* Rehome the thread if we found an lgroup with more affinity than its
* home, or if the lgroup whose affinity is being set has the same
* affinity as the home lgroup
*/
home = t->t_lpl->lpl_lgrpid;
if (best_lpl != NULL && best_lpl != t->t_lpl) {
best = best_lpl->lpl_lgrpid;
if (affs[best] > affs[home] || (affs[best] == affs[home] &&
best == lgrp))
lgrp_move_thread(t, best_lpl, 1);
}
thread_unlock(t);
return (retval);
}
/*
* Set process' affinity for specified lgroup
*/
int
lgrp_affinity_set_proc(proc_t *p, lgrp_id_t lgrp, lgrp_affinity_t aff,
lgrp_affinity_t **aff_buf_array)
{
lgrp_affinity_t *buf;
int err = 0;
int i;
int retval;
kthread_t *t;
ASSERT(MUTEX_HELD(&pidlock) && MUTEX_HELD(&p->p_lock));
ASSERT(aff_buf_array != NULL);
i = 0;
t = p->p_tlist;
if (t != NULL) {
do {
/*
* Set lgroup affinity for thread
*/
buf = aff_buf_array[i];
retval = lgrp_affinity_set_thread(t, lgrp, aff, &buf);
if (err == 0 && retval != 0)
err = retval;
/*
* Advance pointer to next buffer
*/
if (buf == NULL) {
ASSERT(i < p->p_lwpcnt);
aff_buf_array[i] = NULL;
i++;
}
} while ((t = t->t_forw) != p->p_tlist);
}
return (err);
}
/*
* Set LWP's or process' affinity for specified lgroup
*
* When setting affinities, pidlock, process p_lock, and thread_lock()
* need to be held in that order to protect target thread's pset, process,
* process contents, and thread contents. thread_lock() does splhigh(),
* which has a similar effect to kpreempt_disable(), so it protects
* calls to lgrp_move_thread() and lgrp_choose() from pset changes.
*/
int
lgrp_affinity_set(lgrp_affinity_args_t *ap)
{
lgrp_affinity_t aff;
lgrp_affinity_t *aff_buf;
lgrp_affinity_args_t args;
id_t id;
idtype_t idtype;
lgrp_id_t lgrp;
int nthreads;
proc_t *p;
int retval;
/*
* Copyin arguments
*/
if (copyin(ap, &args, sizeof (lgrp_affinity_args_t)) != 0)
return (set_errno(EFAULT));
idtype = args.idtype;
id = args.id;
lgrp = args.lgrp;
aff = args.aff;
/*
* Check for invalid lgroup
*/
if (lgrp < 0 || lgrp == LGRP_NONE)
return (set_errno(EINVAL));
/*
* Check for existing lgroup
*/
if (lgrp > lgrp_alloc_max)
return (set_errno(ESRCH));
/*
* Check for legal affinity
*/
if (aff != LGRP_AFF_NONE && aff != LGRP_AFF_WEAK &&
aff != LGRP_AFF_STRONG)
return (set_errno(EINVAL));
/*
* Must be process or LWP ID
*/
if (idtype != P_LWPID && idtype != P_PID)
return (set_errno(EINVAL));
/*
* Set given LWP's or process' affinity for specified lgroup
*/
switch (idtype) {
case P_LWPID:
/*
* Allocate memory for thread's lgroup affinities
* ahead of time w/o holding locks
*/
aff_buf = kmem_zalloc(nlgrpsmax * sizeof (lgrp_affinity_t),
KM_SLEEP);
p = curproc;
/*
* Set affinity for thread
*/
mutex_enter(&p->p_lock);
if (id == P_MYID) { /* current thread */
retval = lgrp_affinity_set_thread(curthread, lgrp, aff,
&aff_buf);
} else if (p->p_tlist == NULL) {
retval = set_errno(ESRCH);
} else { /* other thread */
int found = 0;
kthread_t *t;
t = p->p_tlist;
do {
if (t->t_tid == id) {
retval = lgrp_affinity_set_thread(t,
lgrp, aff, &aff_buf);
found = 1;
break;
}
} while ((t = t->t_forw) != p->p_tlist);
if (!found)
retval = set_errno(ESRCH);
}
mutex_exit(&p->p_lock);
/*
* Free memory for lgroup affinities
* if the thread didn't need it
*/
if (aff_buf)
kmem_free(aff_buf,
nlgrpsmax * sizeof (lgrp_affinity_t));
break;
case P_PID:
do {
lgrp_affinity_t **aff_buf_array;
int i;
size_t size;
/*
* Get process
*/
mutex_enter(&pidlock);
if (id == P_MYID)
p = curproc;
else
p = prfind(id);
if (p == NULL) {
mutex_exit(&pidlock);
return (set_errno(ESRCH));
}
/*
* Get number of threads in process
*
* NOTE: Only care about user processes,
* so p_lwpcnt should be number of threads.
*/
mutex_enter(&p->p_lock);
nthreads = p->p_lwpcnt;
mutex_exit(&p->p_lock);
mutex_exit(&pidlock);
if (nthreads < 1)
return (set_errno(ESRCH));
/*
* Preallocate memory for lgroup affinities for
* each thread in process now to avoid holding
* any locks. Allocate an array to hold a buffer
* for each thread.
*/
aff_buf_array = kmem_zalloc(nthreads *
sizeof (lgrp_affinity_t *), KM_SLEEP);
size = nlgrpsmax * sizeof (lgrp_affinity_t);
for (i = 0; i < nthreads; i++)
aff_buf_array[i] = kmem_zalloc(size, KM_SLEEP);
mutex_enter(&pidlock);
/*
* Look up the process again since we dropped locks to
* allocate memory (not needed for the current process)
*/
if (id != P_MYID)
p = prfind(id);
/*
* Process went away after we dropped locks and before
* reacquiring them, so drop locks, free memory, and
* return.
*/
if (p == NULL) {
mutex_exit(&pidlock);
for (i = 0; i < nthreads; i++)
kmem_free(aff_buf_array[i], size);
kmem_free(aff_buf_array,
nthreads * sizeof (lgrp_affinity_t *));
return (set_errno(ESRCH));
}
mutex_enter(&p->p_lock);
/*
* Check whether the number of threads is still the same.
* If not, drop locks, free memory, and try again.
*/
if (nthreads != p->p_lwpcnt) {
mutex_exit(&p->p_lock);
mutex_exit(&pidlock);
for (i = 0; i < nthreads; i++)
kmem_free(aff_buf_array[i], size);
kmem_free(aff_buf_array,
nthreads * sizeof (lgrp_affinity_t *));
continue;
}
/*
* Set lgroup affinity for threads in process
*/
retval = lgrp_affinity_set_proc(p, lgrp, aff,
aff_buf_array);
mutex_exit(&p->p_lock);
mutex_exit(&pidlock);
/*
* Free any leftover buffers, since threads that already
* had lgroup affinities allocated won't have consumed
* the buffers preallocated for them
*/
for (i = 0; i < nthreads; i++)
if (aff_buf_array[i] != NULL)
kmem_free(aff_buf_array[i], size);
kmem_free(aff_buf_array,
nthreads * sizeof (lgrp_affinity_t *));
break;
} while (nthreads != p->p_lwpcnt);
break;
default:
retval = set_errno(EINVAL);
break;
}
return (retval);
}
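/*
 * A minimal userland sketch (not part of this file), assuming the
 * documented liblgrp wrappers lgrp_affinity_set(3LGRP) and
 * lgrp_affinity_get(3LGRP) (linked with -llgrp) that funnel into this
 * syscall; "lgrp" here is an illustrative lgroup ID:
 *
 *	#include <sys/lgrp_user.h>
 *
 *	if (lgrp_affinity_set(P_PID, P_MYID, lgrp, LGRP_AFF_STRONG) != 0)
 *		perror("lgrp_affinity_set");
 *	lgrp_affinity_t aff = lgrp_affinity_get(P_LWPID, P_MYID, lgrp);
 */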
/*
* Return the latest generation number for the lgroup hierarchy
* with the given view
*/
lgrp_gen_t
lgrp_generation(lgrp_view_t view)
{
cpupart_t *cpupart;
uint_t gen;
kpreempt_disable();
/*
* Determine generation number for given view
*/
if (view == LGRP_VIEW_OS)
/*
* Return generation number of lgroup hierarchy for OS view
*/
gen = lgrp_gen;
else {
/*
* For caller's view, use generation numbers for lgroup
* hierarchy and caller's pset
* NOTE: Caller needs to check for change in pset ID
*/
cpupart = curthread->t_cpupart;
ASSERT(cpupart);
gen = lgrp_gen + cpupart->cp_gen;
}
kpreempt_enable();
return (gen);
}
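/*
 * Userland consumes this generation number indirectly through the
 * liblgrp snapshot cookie; a sketch of the intended revalidation
 * pattern, assuming the documented lgrp_init(3LGRP) and
 * lgrp_cookie_stale(3LGRP):
 *
 *	lgrp_cookie_t cookie = lgrp_init(LGRP_VIEW_CALLER);
 *	...
 *	if (lgrp_cookie_stale(cookie)) {
 *		(void) lgrp_fini(cookie);
 *		cookie = lgrp_init(LGRP_VIEW_CALLER);
 *	}
 */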
lgrp_id_t
lgrp_home_thread(kthread_t *t)
{
lgrp_id_t home;
ASSERT(t != NULL);
ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
thread_lock(t);
/*
* Check to see whether caller has permission to query the
* thread's home lgroup
*/
if (t->t_cid == 0 || !hasprocperm(t->t_cred, CRED())) {
thread_unlock(t);
return (set_errno(EPERM));
}
home = lgrp_home_id(t);
thread_unlock(t);
return (home);
}
/*
* Get home lgroup of given process or thread
*/
lgrp_id_t
lgrp_home_get(idtype_t idtype, id_t id)
{
proc_t *p;
lgrp_id_t retval;
kthread_t *t;
/*
* Get home lgroup of given LWP or process
*/
switch (idtype) {
case P_LWPID:
p = curproc;
/*
* Get home lgroup of thread
*/
mutex_enter(&p->p_lock);
if (id == P_MYID) { /* current thread */
retval = lgrp_home_thread(curthread);
} else if (p->p_tlist == NULL) {
retval = set_errno(ESRCH);
} else { /* other thread */
int found = 0;
t = p->p_tlist;
do {
if (t->t_tid == id) {
retval = lgrp_home_thread(t);
found = 1;
break;
}
} while ((t = t->t_forw) != p->p_tlist);
if (!found)
retval = set_errno(ESRCH);
}
mutex_exit(&p->p_lock);
break;
case P_PID:
/*
* Get process
*/
mutex_enter(&pidlock);
if (id == P_MYID)
p = curproc;
else
p = prfind(id);
if (p == NULL) {
mutex_exit(&pidlock);
return (set_errno(ESRCH));
}
mutex_enter(&p->p_lock);
t = p->p_tlist;
if (t == NULL)
retval = set_errno(ESRCH);
else
retval = lgrp_home_thread(t);
mutex_exit(&p->p_lock);
mutex_exit(&pidlock);
break;
default:
retval = set_errno(EINVAL);
break;
}
return (retval);
}
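/*
 * A minimal userland sketch (not part of this file), assuming the
 * documented lgrp_home(3LGRP) wrapper:
 *
 *	#include <sys/lgrp_user.h>
 *
 *	lgrp_id_t home = lgrp_home(P_LWPID, P_MYID);
 *	if (home == -1)
 *		perror("lgrp_home");
 */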
/*
* Return latency between "from" and "to" lgroups
*
* This latency number can only be used for relative comparison
* between lgroups on the running system, cannot be used across platforms,
* and may not reflect the actual latency. It is platform and implementation
* specific, so the platform gets to decide its value. It would be nice if the
* number were at least proportional to make comparisons more meaningful though.
*/
int
lgrp_latency(lgrp_id_t from, lgrp_id_t to)
{
lgrp_t *from_lgrp;
int i;
int latency;
int latency_max;
lgrp_t *to_lgrp;
ASSERT(MUTEX_HELD(&cpu_lock));
if (from < 0 || to < 0)
return (set_errno(EINVAL));
if (from > lgrp_alloc_max || to > lgrp_alloc_max)
return (set_errno(ESRCH));
from_lgrp = lgrp_table[from];
to_lgrp = lgrp_table[to];
if (!LGRP_EXISTS(from_lgrp) || !LGRP_EXISTS(to_lgrp)) {
return (set_errno(ESRCH));
}
/*
* Get latency for same lgroup
*/
if (from == to) {
latency = from_lgrp->lgrp_latency;
return (latency);
}
/*
* Get latency between leaf lgroups
*/
if (from_lgrp->lgrp_childcnt == 0 && to_lgrp->lgrp_childcnt == 0)
return (lgrp_plat_latency(from_lgrp->lgrp_plathand,
to_lgrp->lgrp_plathand));
/*
* Determine max latency between resources in two lgroups
*/
latency_max = 0;
for (i = 0; i <= lgrp_alloc_max; i++) {
lgrp_t *from_rsrc;
int j;
lgrp_t *to_rsrc;
from_rsrc = lgrp_table[i];
if (!LGRP_EXISTS(from_rsrc) ||
!klgrpset_ismember(from_lgrp->lgrp_set[LGRP_RSRC_CPU], i))
continue;
for (j = 0; j <= lgrp_alloc_max; j++) {
to_rsrc = lgrp_table[j];
if (!LGRP_EXISTS(to_rsrc) ||
klgrpset_ismember(to_lgrp->lgrp_set[LGRP_RSRC_MEM],
j) == 0)
continue;
latency = lgrp_plat_latency(from_rsrc->lgrp_plathand,
to_rsrc->lgrp_plathand);
if (latency > latency_max)
latency_max = latency;
}
}
return (latency_max);
}
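/*
 * Since the number is only good for relative comparison, a userland
 * sketch (assuming the documented lgrp_latency(3LGRP) wrapper, with
 * "home" and "other" as illustrative lgroup IDs) would compare rather
 * than interpret it:
 *
 *	int local = lgrp_latency(home, home);
 *	int remote = lgrp_latency(home, other);
 *	if (remote > local)
 *		... memory in "other" is farther from "home" CPUs ...
 */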
/*
* Return lgroup interface version number
* 0 - none
* 1 - original
* 2 - lgrp_latency_cookie() and lgrp_resources() added
*/
int
lgrp_version(int version)
{
/*
* Return LGRP_VER_NONE when requested version isn't supported
*/
if (version < LGRP_VER_NONE || version > LGRP_VER_CURRENT)
return (LGRP_VER_NONE);
/*
* Return current version when LGRP_VER_NONE passed in
*/
if (version == LGRP_VER_NONE)
return (LGRP_VER_CURRENT);
/*
* Otherwise, return supported version.
*/
return (version);
}
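/*
 * For example, a program built against LGRP_VER_CURRENT can verify
 * that the running kernel supports that version before proceeding:
 *
 *	if (lgrp_version(LGRP_VER_CURRENT) != LGRP_VER_CURRENT)
 *		... fall back or fail ...
 *
 * and passing LGRP_VER_NONE simply queries the current version.
 */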
/*
* Snapshot of lgroup hierarchy
*
* One snapshot is kept and is based on the kernel's native data model, so
* a 32-bit snapshot is kept for the 32-bit kernel and a 64-bit one for the
* 64-bit kernel. If a 32-bit user wants a snapshot from the 64-bit kernel,
* the kernel generates a 32-bit snapshot from the data in its 64-bit snapshot.
*
* The format is defined by lgroup snapshot header and the layout of
* the snapshot in memory is as follows:
* 1) lgroup snapshot header
* - specifies format of snapshot
* - defined by lgrp_snapshot_header_t
* 2) lgroup info array
* - contains information about each lgroup
* - one element for each lgroup
* - each element is defined by lgrp_info_t
* 3) lgroup CPU ID array
* - contains list (array) of CPU IDs for each lgroup
* - lgrp_info_t points into array and specifies how many CPUs belong to
* given lgroup
* 4) lgroup parents array
* - contains lgroup bitmask of parents for each lgroup
* - bitmask is an array of unsigned longs and its size depends on nlgrpsmax
* 5) lgroup children array
* - contains lgroup bitmask of children for each lgroup
* - bitmask is an array of unsigned longs and its size depends on nlgrpsmax
* 6) lgroup resources array
* - contains lgroup bitmask of resources for each lgroup
* - bitmask is an array of unsigned longs and its size depends on nlgrpsmax
* 7) lgroup latency table
* - contains latency from each lgroup to each of other lgroups
*
* NOTE: Must use nlgrpsmax for per lgroup data structures because lgroups
* may be sparsely allocated.
*/
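/*
 * The bitmask area therefore holds, in buffer order: one pset lgroup
 * set, nlgrpsmax parent masks, nlgrpsmax child masks, and
 * LGRP_RSRC_COUNT * nlgrpsmax resource masks, each mask being
 * BT_SIZEOFMAP(nlgrpsmax) bytes. This is where the
 * (((2 + LGRP_RSRC_COUNT) * nlgrpsmax) + 1) * bitmask_size sizing in
 * lgrp_snapshot() comes from.
 */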
lgrp_snapshot_header_t *lgrp_snap = NULL; /* lgroup snapshot */
static kmutex_t lgrp_snap_lock; /* snapshot lock */
/*
* Take a snapshot of lgroup hierarchy and return size of buffer
* needed to hold snapshot
*/
static int
lgrp_snapshot(void)
{
size_t bitmask_size;
size_t bitmasks_size;
size_t bufsize;
int cpu_index;
size_t cpuids_size;
int i;
int j;
size_t info_size;
size_t lats_size;
ulong_t *lgrp_children;
processorid_t *lgrp_cpuids;
lgrp_info_t *lgrp_info;
int **lgrp_lats;
ulong_t *lgrp_parents;
ulong_t *lgrp_rsets;
ulong_t *lgrpset;
int snap_ncpus;
int snap_nlgrps;
int snap_nlgrpsmax;
size_t snap_hdr_size;
#ifdef _SYSCALL32_IMPL
model_t model = DATAMODEL_NATIVE;
/*
* If we already have an up-to-date snapshot, check whether the caller
* is a 32-bit program and, if so, return the size of a 32-bit snapshot
* now.
*/
model = get_udatamodel();
if (model == DATAMODEL_ILP32 && lgrp_snap &&
lgrp_snap->ss_gen == lgrp_gen) {
snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max;
/*
* Calculate size of buffer needed for 32-bit snapshot,
* rounding up size of each object to allow for alignment
* of next object in buffer.
*/
snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header32_t),
sizeof (caddr32_t));
info_size =
P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info32_t),
sizeof (processorid_t));
cpuids_size =
P2ROUNDUP(lgrp_snap->ss_ncpus * sizeof (processorid_t),
sizeof (ulong_t));
/*
* lgroup bitmasks needed for parents, children, and resources
* for each lgroup and pset lgroup set
*/
bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax);
bitmasks_size = (((2 + LGRP_RSRC_COUNT) *
snap_nlgrpsmax) + 1) * bitmask_size;
/*
* Size of latency table and buffer
*/
lats_size = snap_nlgrpsmax * sizeof (caddr32_t) +
snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int);
bufsize = snap_hdr_size + info_size + cpuids_size +
bitmasks_size + lats_size;
return (bufsize);
}
#endif /* _SYSCALL32_IMPL */
/*
* Check whether snapshot is up-to-date
* Free it and take another one if not
*/
if (lgrp_snap) {
if (lgrp_snap->ss_gen == lgrp_gen)
return (lgrp_snap->ss_size);
kmem_free(lgrp_snap, lgrp_snap->ss_size);
lgrp_snap = NULL;
}
/*
* Allocate memory for snapshot
* w/o holding cpu_lock while waiting for memory
*/
while (lgrp_snap == NULL) {
int old_generation;
/*
* Take snapshot of lgroup generation number
* and configuration size dependent information
* NOTE: Only count number of online CPUs,
* since only online CPUs appear in lgroups.
*/
mutex_enter(&cpu_lock);
old_generation = lgrp_gen;
snap_ncpus = ncpus_online;
snap_nlgrps = nlgrps;
snap_nlgrpsmax = nlgrpsmax;
mutex_exit(&cpu_lock);
/*
* Calculate size of buffer needed for snapshot,
* rounding up size of each object to allow for alignment
* of next object in buffer.
*/
snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header_t),
sizeof (void *));
info_size = P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info_t),
sizeof (processorid_t));
cpuids_size = P2ROUNDUP(snap_ncpus * sizeof (processorid_t),
sizeof (ulong_t));
/*
* lgroup bitmasks needed for pset lgroup set and parents,
* children, and resource sets for each lgroup
*/
bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax);
bitmasks_size = (((2 + LGRP_RSRC_COUNT) *
snap_nlgrpsmax) + 1) * bitmask_size;
/*
* Size of latency table and buffer
*/
lats_size = snap_nlgrpsmax * sizeof (int *) +
snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int);
bufsize = snap_hdr_size + info_size + cpuids_size +
bitmasks_size + lats_size;
/*
* Allocate memory for buffer
*/
lgrp_snap = kmem_zalloc(bufsize, KM_NOSLEEP);
if (lgrp_snap == NULL)
return (set_errno(ENOMEM));
/*
* Check whether generation number has changed
*/
mutex_enter(&cpu_lock);
if (lgrp_gen == old_generation)
break; /* hasn't changed, so done */
/*
* Generation number changed, so free memory and try again.
*/
mutex_exit(&cpu_lock);
kmem_free(lgrp_snap, bufsize);
lgrp_snap = NULL;
}
/*
* Fill in lgroup snapshot header
* (including pointers to tables of lgroup info, CPU IDs, and parents
* and children)
*/
lgrp_snap->ss_version = LGRP_VER_CURRENT;
/*
* XXX For now, liblgrp only needs to know whether the hierarchy
* XXX has one level or more than one
*/
if (snap_nlgrps == 1)
lgrp_snap->ss_levels = 1;
else
lgrp_snap->ss_levels = 2;
lgrp_snap->ss_root = LGRP_ROOTID;
lgrp_snap->ss_nlgrps = lgrp_snap->ss_nlgrps_os = snap_nlgrps;
lgrp_snap->ss_nlgrps_max = snap_nlgrpsmax;
lgrp_snap->ss_ncpus = snap_ncpus;
lgrp_snap->ss_gen = lgrp_gen;
lgrp_snap->ss_view = LGRP_VIEW_OS;
lgrp_snap->ss_pset = 0; /* NOTE: caller should set if needed */
lgrp_snap->ss_size = bufsize;
lgrp_snap->ss_magic = (uintptr_t)lgrp_snap;
lgrp_snap->ss_info = lgrp_info =
(lgrp_info_t *)((uintptr_t)lgrp_snap + snap_hdr_size);
lgrp_snap->ss_cpuids = lgrp_cpuids =
(processorid_t *)((uintptr_t)lgrp_info + info_size);
lgrp_snap->ss_lgrpset = lgrpset =
(ulong_t *)((uintptr_t)lgrp_cpuids + cpuids_size);
lgrp_snap->ss_parents = lgrp_parents =
(ulong_t *)((uintptr_t)lgrpset + bitmask_size);
lgrp_snap->ss_children = lgrp_children =
(ulong_t *)((uintptr_t)lgrp_parents + (snap_nlgrpsmax *
bitmask_size));
lgrp_snap->ss_rsets = lgrp_rsets =
(ulong_t *)((uintptr_t)lgrp_children + (snap_nlgrpsmax *
bitmask_size));
lgrp_snap->ss_latencies = lgrp_lats =
(int **)((uintptr_t)lgrp_rsets + (LGRP_RSRC_COUNT *
snap_nlgrpsmax * bitmask_size));
/*
* Fill in lgroup information
*/
cpu_index = 0;
for (i = 0; i < snap_nlgrpsmax; i++) {
struct cpu *cp;
int cpu_count;
struct cpu *head;
int k;
lgrp_t *lgrp;
lgrp = lgrp_table[i];
if (!LGRP_EXISTS(lgrp)) {
bzero(&lgrp_info[i], sizeof (lgrp_info[i]));
lgrp_info[i].info_lgrpid = LGRP_NONE;
continue;
}
lgrp_info[i].info_lgrpid = i;
lgrp_info[i].info_latency = lgrp->lgrp_latency;
/*
* Fill in parents, children, and lgroup resources
*/
lgrp_info[i].info_parents =
(ulong_t *)((uintptr_t)lgrp_parents + (i * bitmask_size));
if (lgrp->lgrp_parent)
BT_SET(lgrp_info[i].info_parents,
lgrp->lgrp_parent->lgrp_id);
lgrp_info[i].info_children =
(ulong_t *)((uintptr_t)lgrp_children + (i * bitmask_size));
for (j = 0; j < snap_nlgrpsmax; j++)
if (klgrpset_ismember(lgrp->lgrp_children, j))
BT_SET(lgrp_info[i].info_children, j);
lgrp_info[i].info_rset =
(ulong_t *)((uintptr_t)lgrp_rsets +
(i * LGRP_RSRC_COUNT * bitmask_size));
for (j = 0; j < LGRP_RSRC_COUNT; j++) {
ulong_t *rset;
rset = (ulong_t *)((uintptr_t)lgrp_info[i].info_rset +
(j * bitmask_size));
for (k = 0; k < snap_nlgrpsmax; k++)
if (klgrpset_ismember(lgrp->lgrp_set[j], k))
BT_SET(rset, k);
}
/*
* Fill in CPU IDs
*/
cpu_count = 0;
lgrp_info[i].info_cpuids = NULL;
cp = head = lgrp->lgrp_cpu;
if (head != NULL) {
lgrp_info[i].info_cpuids = &lgrp_cpuids[cpu_index];
do {
lgrp_cpuids[cpu_index] = cp->cpu_id;
cpu_index++;
cpu_count++;
cp = cp->cpu_next_lgrp;
} while (cp != head);
}
ASSERT(cpu_count == lgrp->lgrp_cpucnt);
lgrp_info[i].info_ncpus = cpu_count;
/*
* Fill in memory sizes for lgroups that directly contain
* memory
*/
if (klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], i)) {
lgrp_info[i].info_mem_free =
lgrp_mem_size(i, LGRP_MEM_SIZE_FREE);
lgrp_info[i].info_mem_install =
lgrp_mem_size(i, LGRP_MEM_SIZE_INSTALL);
}
/*
* Fill in latency table and buffer
*/
lgrp_lats[i] = (int *)((uintptr_t)lgrp_lats + snap_nlgrpsmax *
sizeof (int *) + i * snap_nlgrpsmax * sizeof (int));
for (j = 0; j < snap_nlgrpsmax; j++) {
lgrp_t *to;
to = lgrp_table[j];
if (!LGRP_EXISTS(to))
continue;
lgrp_lats[i][j] = lgrp_latency(lgrp->lgrp_id,
to->lgrp_id);
}
}
ASSERT(cpu_index == snap_ncpus);
mutex_exit(&cpu_lock);
#ifdef _SYSCALL32_IMPL
/*
* Now that the snapshot has been taken/updated, check whether the
* caller is a 32-bit program and return the size of the 32-bit
* snapshot. This may not have been possible earlier if the snapshot
* was out of date or didn't exist yet.
*/
if (model == DATAMODEL_ILP32) {
snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max;
/*
* Calculate size of buffer needed for 32-bit snapshot,
* rounding up size of each object to allow for alignment
* of next object in buffer.
*/
snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header32_t),
sizeof (caddr32_t));
info_size =
P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info32_t),
sizeof (processorid_t));
cpuids_size =
P2ROUNDUP(lgrp_snap->ss_ncpus * sizeof (processorid_t),
sizeof (ulong_t));
bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax);
bitmasks_size = (((2 + LGRP_RSRC_COUNT) * snap_nlgrpsmax) +
1) * bitmask_size;
/*
* Size of latency table and buffer
*/
lats_size = (snap_nlgrpsmax * sizeof (caddr32_t)) +
(snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int));
bufsize = snap_hdr_size + info_size + cpuids_size +
bitmasks_size + lats_size;
return (bufsize);
}
#endif /* _SYSCALL32_IMPL */
return (lgrp_snap->ss_size);
}
/*
* Copy snapshot into given user buffer, fix up any pointers in buffer to point
* into user instead of kernel address space, and return size of buffer
* needed to hold snapshot
*/
static int
lgrp_snapshot_copy(char *buf, size_t bufsize)
{
size_t bitmask_size;
int cpu_index;
size_t cpuids_size;
int i;
size_t info_size;
lgrp_info_t *lgrp_info;
int retval;
size_t snap_hdr_size;
int snap_ncpus;
int snap_nlgrpsmax;
lgrp_snapshot_header_t *user_snap;
lgrp_info_t *user_info;
lgrp_info_t *user_info_buffer;
processorid_t *user_cpuids;
ulong_t *user_lgrpset;
ulong_t *user_parents;
ulong_t *user_children;
int **user_lats;
int **user_lats_buffer;
ulong_t *user_rsets;
if (lgrp_snap == NULL)
return (0);
if (buf == NULL || bufsize <= 0)
return (lgrp_snap->ss_size);
/*
* The user needs to try getting the size of the buffer again
* because the given buffer size is too small.
* The lgroup hierarchy may have changed after they asked for the size
* but before the snapshot was taken.
*/
if (bufsize < lgrp_snap->ss_size)
return (set_errno(EAGAIN));
snap_ncpus = lgrp_snap->ss_ncpus;
snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max;
/*
* Fill in lgrpset now because caller may have changed psets
*/
kpreempt_disable();
for (i = 0; i < snap_nlgrpsmax; i++) {
if (klgrpset_ismember(curthread->t_cpupart->cp_lgrpset,
i)) {
BT_SET(lgrp_snap->ss_lgrpset, i);
}
}
kpreempt_enable();
/*
* Copy lgroup snapshot (snapshot header, lgroup info, and CPU IDs)
* into user buffer all at once
*/
if (copyout(lgrp_snap, buf, lgrp_snap->ss_size) != 0)
return (set_errno(EFAULT));
/*
* Round up sizes of lgroup snapshot header and info for alignment
*/
snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header_t),
sizeof (void *));
info_size = P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info_t),
sizeof (processorid_t));
cpuids_size = P2ROUNDUP(snap_ncpus * sizeof (processorid_t),
sizeof (ulong_t));
bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax);
/*
* Calculate pointers into user buffer for lgroup snapshot header,
* info, and CPU IDs
*/
user_snap = (lgrp_snapshot_header_t *)buf;
user_info = (lgrp_info_t *)((uintptr_t)user_snap + snap_hdr_size);
user_cpuids = (processorid_t *)((uintptr_t)user_info + info_size);
user_lgrpset = (ulong_t *)((uintptr_t)user_cpuids + cpuids_size);
user_parents = (ulong_t *)((uintptr_t)user_lgrpset + bitmask_size);
user_children = (ulong_t *)((uintptr_t)user_parents +
(snap_nlgrpsmax * bitmask_size));
user_rsets = (ulong_t *)((uintptr_t)user_children +
(snap_nlgrpsmax * bitmask_size));
user_lats = (int **)((uintptr_t)user_rsets +
(LGRP_RSRC_COUNT * snap_nlgrpsmax * bitmask_size));
/*
* Copyout magic number (i.e., pointer to beginning of buffer)
*/
if (copyout(&buf, &user_snap->ss_magic, sizeof (buf)) != 0)
return (set_errno(EFAULT));
/*
* Fix up pointers in user buffer to point into user buffer
* not kernel snapshot
*/
if (copyout(&user_info, &user_snap->ss_info, sizeof (user_info)) != 0)
return (set_errno(EFAULT));
if (copyout(&user_cpuids, &user_snap->ss_cpuids,
sizeof (user_cpuids)) != 0)
return (set_errno(EFAULT));
if (copyout(&user_lgrpset, &user_snap->ss_lgrpset,
sizeof (user_lgrpset)) != 0)
return (set_errno(EFAULT));
if (copyout(&user_parents, &user_snap->ss_parents,
sizeof (user_parents)) != 0)
return (set_errno(EFAULT));
if (copyout(&user_children, &user_snap->ss_children,
sizeof (user_children)) != 0)
return (set_errno(EFAULT));
if (copyout(&user_rsets, &user_snap->ss_rsets,
sizeof (user_rsets)) != 0)
return (set_errno(EFAULT));
if (copyout(&user_lats, &user_snap->ss_latencies,
sizeof (user_lats)) != 0)
return (set_errno(EFAULT));
/*
* Make copies of lgroup info and latency table, fix up pointers,
* and then copy them into user buffer
*/
user_info_buffer = kmem_zalloc(info_size, KM_NOSLEEP);
if (user_info_buffer == NULL)
return (set_errno(ENOMEM));
user_lats_buffer = kmem_zalloc(snap_nlgrpsmax * sizeof (int *),
KM_NOSLEEP);
if (user_lats_buffer == NULL) {
kmem_free(user_info_buffer, info_size);
return (set_errno(ENOMEM));
}
lgrp_info = (lgrp_info_t *)((uintptr_t)lgrp_snap + snap_hdr_size);
bcopy(lgrp_info, user_info_buffer, info_size);
cpu_index = 0;
for (i = 0; i < snap_nlgrpsmax; i++) {
ulong_t *snap_rset;
/*
* Skip non-existent lgroups
*/
if (user_info_buffer[i].info_lgrpid == LGRP_NONE)
continue;
/*
* Update free memory size since it changes frequently
* Only do so for lgroups directly containing memory
*
* NOTE: This must be done before changing the pointers to
* point into user space since we need to dereference
* lgroup resource set
*/
snap_rset = &lgrp_info[i].info_rset[LGRP_RSRC_MEM *
BT_BITOUL(snap_nlgrpsmax)];
if (BT_TEST(snap_rset, i))
user_info_buffer[i].info_mem_free =
lgrp_mem_size(i, LGRP_MEM_SIZE_FREE);
/*
* Fix up pointers to parents, children, resources, and
* latencies
*/
user_info_buffer[i].info_parents =
(ulong_t *)((uintptr_t)user_parents + (i * bitmask_size));
user_info_buffer[i].info_children =
(ulong_t *)((uintptr_t)user_children + (i * bitmask_size));
user_info_buffer[i].info_rset =
(ulong_t *)((uintptr_t)user_rsets +
(i * LGRP_RSRC_COUNT * bitmask_size));
user_lats_buffer[i] = (int *)((uintptr_t)user_lats +
(snap_nlgrpsmax * sizeof (int *)) + (i * snap_nlgrpsmax *
sizeof (int)));
/*
* Fix up pointer to CPU IDs
*/
if (user_info_buffer[i].info_ncpus == 0) {
user_info_buffer[i].info_cpuids = NULL;
continue;
}
user_info_buffer[i].info_cpuids = &user_cpuids[cpu_index];
cpu_index += user_info_buffer[i].info_ncpus;
}
ASSERT(cpu_index == snap_ncpus);
/*
* Copy lgroup info and latency table with pointers fixed up to point
* into user buffer out to user buffer now
*/
retval = lgrp_snap->ss_size;
if (copyout(user_info_buffer, user_info, info_size) != 0)
retval = set_errno(EFAULT);
kmem_free(user_info_buffer, info_size);
if (copyout(user_lats_buffer, user_lats, snap_nlgrpsmax *
sizeof (int *)) != 0)
retval = set_errno(EFAULT);
kmem_free(user_lats_buffer, snap_nlgrpsmax * sizeof (int *));
return (retval);
}
#ifdef _SYSCALL32_IMPL
/*
* Make 32-bit copy of snapshot, fix up any pointers in buffer to point
* into user instead of kernel address space, copy 32-bit snapshot into
* given user buffer, and return size of buffer needed to hold snapshot
*/
static int
lgrp_snapshot_copy32(caddr32_t buf, size32_t bufsize)
{
size32_t bitmask_size;
size32_t bitmasks_size;
size32_t children_size;
int cpu_index;
size32_t cpuids_size;
int i;
int j;
size32_t info_size;
size32_t lats_size;
lgrp_info_t *lgrp_info;
lgrp_snapshot_header32_t *lgrp_snap32;
lgrp_info32_t *lgrp_info32;
processorid_t *lgrp_cpuids32;
caddr32_t *lgrp_lats32;
int **lgrp_lats32_kernel;
uint_t *lgrp_set32;
uint_t *lgrp_parents32;
uint_t *lgrp_children32;
uint_t *lgrp_rsets32;
size32_t parents_size;
size32_t rsets_size;
size32_t set_size;
size32_t snap_hdr_size;
int snap_ncpus;
int snap_nlgrpsmax;
size32_t snap_size;
if (lgrp_snap == NULL)
return (0);
snap_ncpus = lgrp_snap->ss_ncpus;
snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max;
/*
* Calculate size of buffer needed for 32-bit snapshot,
* rounding up size of each object to allow for alignment
* of next object in buffer.
*/
snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header32_t),
sizeof (caddr32_t));
info_size = P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info32_t),
sizeof (processorid_t));
cpuids_size = P2ROUNDUP(snap_ncpus * sizeof (processorid_t),
sizeof (ulong_t));
bitmask_size = BT_SIZEOFMAP32(snap_nlgrpsmax);
set_size = bitmask_size;
parents_size = snap_nlgrpsmax * bitmask_size;
children_size = snap_nlgrpsmax * bitmask_size;
rsets_size = P2ROUNDUP(LGRP_RSRC_COUNT * snap_nlgrpsmax *
(int)bitmask_size, sizeof (caddr32_t));
bitmasks_size = set_size + parents_size + children_size + rsets_size;
/*
* Size of latency table and buffer
*/
lats_size = (snap_nlgrpsmax * sizeof (caddr32_t)) +
(snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int));
snap_size = snap_hdr_size + info_size + cpuids_size + bitmasks_size +
lats_size;
if (buf == NULL || bufsize <= 0) {
return (snap_size);
}
/*
* The user needs to try getting the size of the buffer again
* because the given buffer size is too small.
* The lgroup hierarchy may have changed after they asked for the size
* but before the snapshot was taken.
*/
if (bufsize < snap_size)
return (set_errno(EAGAIN));
/*
* Make 32-bit copy of snapshot, fix up pointers to point into user
* buffer not kernel, and then copy whole thing into user buffer
*/
lgrp_snap32 = kmem_zalloc(snap_size, KM_NOSLEEP);
if (lgrp_snap32 == NULL)
return (set_errno(ENOMEM));
/*
* Calculate pointers into 32-bit copy of snapshot
* for lgroup info, CPU IDs, pset lgroup bitmask, parents, children,
* resources, and latency table and buffer
*/
lgrp_info32 = (lgrp_info32_t *)((uintptr_t)lgrp_snap32 +
snap_hdr_size);
lgrp_cpuids32 = (processorid_t *)((uintptr_t)lgrp_info32 + info_size);
lgrp_set32 = (uint_t *)((uintptr_t)lgrp_cpuids32 + cpuids_size);
lgrp_parents32 = (uint_t *)((uintptr_t)lgrp_set32 + set_size);
lgrp_children32 = (uint_t *)((uintptr_t)lgrp_parents32 + parents_size);
lgrp_rsets32 = (uint_t *)((uintptr_t)lgrp_children32 + children_size);
lgrp_lats32 = (caddr32_t *)((uintptr_t)lgrp_rsets32 + rsets_size);
/*
* Make temporary lgroup latency table of pointers for kernel to use
* to fill in rows of table with latencies from each lgroup
*/
lgrp_lats32_kernel = kmem_zalloc(snap_nlgrpsmax * sizeof (int *),
KM_NOSLEEP);
if (lgrp_lats32_kernel == NULL) {
kmem_free(lgrp_snap32, snap_size);
return (set_errno(ENOMEM));
}
/*
* Fill in 32-bit lgroup snapshot header
* (with pointers into user's buffer for lgroup info, CPU IDs,
* bit masks, and latencies)
*/
lgrp_snap32->ss_version = lgrp_snap->ss_version;
lgrp_snap32->ss_levels = lgrp_snap->ss_levels;
lgrp_snap32->ss_nlgrps = lgrp_snap32->ss_nlgrps_os =
lgrp_snap->ss_nlgrps;
lgrp_snap32->ss_nlgrps_max = snap_nlgrpsmax;
lgrp_snap32->ss_root = lgrp_snap->ss_root;
lgrp_snap32->ss_ncpus = lgrp_snap->ss_ncpus;
lgrp_snap32->ss_gen = lgrp_snap->ss_gen;
lgrp_snap32->ss_view = LGRP_VIEW_OS;
lgrp_snap32->ss_size = snap_size;
lgrp_snap32->ss_magic = buf;
lgrp_snap32->ss_info = buf + snap_hdr_size;
lgrp_snap32->ss_cpuids = lgrp_snap32->ss_info + info_size;
lgrp_snap32->ss_lgrpset = lgrp_snap32->ss_cpuids + cpuids_size;
lgrp_snap32->ss_parents = lgrp_snap32->ss_lgrpset + bitmask_size;
lgrp_snap32->ss_children = lgrp_snap32->ss_parents +
(snap_nlgrpsmax * bitmask_size);
lgrp_snap32->ss_rsets = lgrp_snap32->ss_children +
(snap_nlgrpsmax * bitmask_size);
lgrp_snap32->ss_latencies = lgrp_snap32->ss_rsets +
(LGRP_RSRC_COUNT * snap_nlgrpsmax * bitmask_size);
/*
* Fill in lgrpset now because caller may have changed psets
*/
kpreempt_disable();
for (i = 0; i < snap_nlgrpsmax; i++) {
if (klgrpset_ismember(curthread->t_cpupart->cp_lgrpset,
i)) {
BT_SET32(lgrp_set32, i);
}
}
kpreempt_enable();
/*
* Fill in 32-bit copy of lgroup info and fix up pointers
* to point into user's buffer instead of kernel's
*/
cpu_index = 0;
lgrp_info = lgrp_snap->ss_info;
for (i = 0; i < snap_nlgrpsmax; i++) {
uint_t *children;
uint_t *lgrp_rset;
uint_t *parents;
ulong_t *snap_rset;
/*
* Skip non-existent lgroups
*/
if (lgrp_info[i].info_lgrpid == LGRP_NONE) {
bzero(&lgrp_info32[i], sizeof (lgrp_info32[i]));
lgrp_info32[i].info_lgrpid = LGRP_NONE;
continue;
}
/*
* Fill in parents, children, lgroup resource set, and
* latencies from snapshot
*/
parents = (uint_t *)((uintptr_t)lgrp_parents32 +
i * bitmask_size);
children = (uint_t *)((uintptr_t)lgrp_children32 +
i * bitmask_size);
snap_rset = (ulong_t *)((uintptr_t)lgrp_snap->ss_rsets +
(i * LGRP_RSRC_COUNT * BT_SIZEOFMAP(snap_nlgrpsmax)));
lgrp_rset = (uint_t *)((uintptr_t)lgrp_rsets32 +
(i * LGRP_RSRC_COUNT * bitmask_size));
lgrp_lats32_kernel[i] = (int *)((uintptr_t)lgrp_lats32 +
snap_nlgrpsmax * sizeof (caddr32_t) + i * snap_nlgrpsmax *
sizeof (int));
for (j = 0; j < snap_nlgrpsmax; j++) {
int k;
uint_t *rset;
if (BT_TEST(&lgrp_snap->ss_parents[i], j))
BT_SET32(parents, j);
if (BT_TEST(&lgrp_snap->ss_children[i], j))
BT_SET32(children, j);
for (k = 0; k < LGRP_RSRC_COUNT; k++) {
rset = (uint_t *)((uintptr_t)lgrp_rset +
k * bitmask_size);
if (BT_TEST(&snap_rset[k], j))
BT_SET32(rset, j);
}
lgrp_lats32_kernel[i][j] =
lgrp_snap->ss_latencies[i][j];
}
/*
* Fix up pointer to latency buffer
*/
lgrp_lats32[i] = lgrp_snap32->ss_latencies +
snap_nlgrpsmax * sizeof (caddr32_t) + i * snap_nlgrpsmax *
sizeof (int);
/*
* Fix up pointers for parents, children, and resources
*/
lgrp_info32[i].info_parents = lgrp_snap32->ss_parents +
(i * bitmask_size);
lgrp_info32[i].info_children = lgrp_snap32->ss_children +
(i * bitmask_size);
lgrp_info32[i].info_rset = lgrp_snap32->ss_rsets +
(i * LGRP_RSRC_COUNT * bitmask_size);
/*
* Fill in memory and CPU info
* Only fill in memory for lgroups directly containing memory
*/
snap_rset = &lgrp_info[i].info_rset[LGRP_RSRC_MEM *
BT_BITOUL(snap_nlgrpsmax)];
if (BT_TEST(snap_rset, i)) {
lgrp_info32[i].info_mem_free = lgrp_mem_size(i,
LGRP_MEM_SIZE_FREE);
lgrp_info32[i].info_mem_install =
lgrp_info[i].info_mem_install;
}
lgrp_info32[i].info_ncpus = lgrp_info[i].info_ncpus;
lgrp_info32[i].info_lgrpid = lgrp_info[i].info_lgrpid;
lgrp_info32[i].info_latency = lgrp_info[i].info_latency;
if (lgrp_info32[i].info_ncpus == 0) {
lgrp_info32[i].info_cpuids = 0;
continue;
}
/*
* Fix up pointer for CPU IDs
*/
lgrp_info32[i].info_cpuids = lgrp_snap32->ss_cpuids +
(cpu_index * sizeof (processorid_t));
cpu_index += lgrp_info32[i].info_ncpus;
}
ASSERT(cpu_index == snap_ncpus);
/*
* Copy lgroup CPU IDs into 32-bit snapshot
* before copying it out into user's buffer
*/
bcopy(lgrp_snap->ss_cpuids, lgrp_cpuids32, cpuids_size);
/*
* Copy 32-bit lgroup snapshot into user's buffer all at once
*/
if (copyout(lgrp_snap32, (void *)(uintptr_t)buf, snap_size) != 0) {
kmem_free(lgrp_snap32, snap_size);
kmem_free(lgrp_lats32_kernel, snap_nlgrpsmax * sizeof (int *));
return (set_errno(EFAULT));
}
kmem_free(lgrp_snap32, snap_size);
kmem_free(lgrp_lats32_kernel, snap_nlgrpsmax * sizeof (int *));
return (snap_size);
}
#endif /* _SYSCALL32_IMPL */
int
lgrpsys(int subcode, long ia, void *ap)
{
size_t bufsize;
int latency;
switch (subcode) {
case LGRP_SYS_AFFINITY_GET:
return (lgrp_affinity_get((lgrp_affinity_args_t *)ap));
case LGRP_SYS_AFFINITY_SET:
return (lgrp_affinity_set((lgrp_affinity_args_t *)ap));
case LGRP_SYS_GENERATION:
return (lgrp_generation(ia));
case LGRP_SYS_HOME:
return (lgrp_home_get((idtype_t)ia, (id_t)(uintptr_t)ap));
case LGRP_SYS_LATENCY:
mutex_enter(&cpu_lock);
latency = lgrp_latency(ia, (lgrp_id_t)(uintptr_t)ap);
mutex_exit(&cpu_lock);
return (latency);
case LGRP_SYS_MEMINFO:
return (meminfo(ia, (struct meminfo *)ap));
case LGRP_SYS_VERSION:
return (lgrp_version(ia));
case LGRP_SYS_SNAPSHOT:
mutex_enter(&lgrp_snap_lock);
bufsize = lgrp_snapshot();
if (ap && ia > 0) {
if (get_udatamodel() == DATAMODEL_NATIVE)
bufsize = lgrp_snapshot_copy(ap, ia);
#ifdef _SYSCALL32_IMPL
else
bufsize = lgrp_snapshot_copy32(
(caddr32_t)(uintptr_t)ap, ia);
#endif /* _SYSCALL32_IMPL */
}
mutex_exit(&lgrp_snap_lock);
return (bufsize);
default:
break;
}
return (set_errno(EINVAL));
}