common/os/timers.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1982, 1986 Regents of the University of California.
 * All rights reserved.  The Berkeley software License Agreement
 * specifies the terms and conditions for redistribution.
 */

#include <sys/param.h>
#include <sys/user.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/timer.h>
#include <sys/debug.h>
#include <sys/sysmacros.h>
#include <sys/cyclic.h>

/* Expiry handler for ITIMER_REAL; handed to realtime_timeout(). */
static void realitexpire(void *);
/* Expiry handler for ITIMER_REALPROF; installed as a cyclic handler. */
static void realprofexpire(void *);
/* Advances a timer value by whole intervals until it passes hrestime. */
static void timeval_advance(struct timeval *, struct timeval *);

kmutex_t tod_lock;  /* protects time-of-day stuff */

/*
 * Tunable lower bound on ITIMER_REALPROF timer values, in microseconds;
 * defaults to 500 usecs.  xsetitimer() uses the larger of this value and
 * the cyclic resolution as the minimum it passes to itimerfix().  Setting
 * this value significantly lower may allow for denial-of-service attacks.
 */
int itimer_realprof_minimum = 500;

/*
 * Macro to compare a timeval (tvp) to a timestruc (tsp) with operator cmp.
 *
 * The result is true when the seconds fields satisfy cmp, or when the
 * seconds are equal and the timeval's microseconds (scaled by 1000 to
 * nanoseconds) satisfy cmp against the timestruc's nanoseconds.
 *
 * Because the first clause looks at seconds alone, cmp must be a strict
 * operator (< or >); with <= or >= equal seconds would satisfy the test
 * regardless of the sub-second fields.  Both pointer arguments are
 * evaluated more than once, so they must be free of side effects.
 */

#define TVTSCMP(tvp, tsp, cmp) \
    /* CSTYLED */ \
    ((tvp)->tv_sec cmp (tsp)->tv_sec || \
    ((tvp)->tv_sec == (tsp)->tv_sec && \
    /* CSTYLED */ \
    (tvp)->tv_usec * 1000 cmp (tsp)->tv_nsec))

/*
 * Time of day and interval timer support.
 *
 * These routines provide the kernel entry points to get and set
 * the time-of-day and per-process interval timers.  Subroutines
 * here provide support for adding and subtracting timeval structures
 * and decrementing interval timers, optionally reloading the interval
 * timers when they expire.
 */

/*
 * SunOS function to generate monotonically increasing time values.
 *
 * Reads the current time with gethrestime(), converts it to a timeval and
 * stores it in *tv.  The most recently returned value is remembered in a
 * function-static variable; if the new reading is not later than that value
 * (and is no more than 5 seconds earlier), the value returned is instead the
 * previous one plus one microsecond.  When the global "timechanged" differs
 * from the value seen on the previous call, the reading is accepted as is.
 *
 * The static state is read and updated under tod_lock.
 */
void
uniqtime(struct timeval *tv)
{
    static struct timeval last;     /* value handed out by the last call */
    static int last_timechanged;    /* "timechanged" seen by the last call */
    timestruc_t ts;
    time_t sec;
    int usec, nsec;

    /*
     * protect modification of last
     */
    mutex_enter(&tod_lock);
    gethrestime(&ts);

    /*
     * Fast algorithm to convert nsec to usec -- see hrt2ts()
     * in common/os/timers.c for a full description.
     */
    nsec = ts.tv_nsec;
    usec = nsec + (nsec >> 2);
    usec = nsec + (usec >> 1);
    usec = nsec + (usec >> 2);
    usec = nsec + (usec >> 4);
    usec = nsec - (usec >> 3);
    usec = nsec + (usec >> 2);
    usec = nsec + (usec >> 3);
    usec = nsec + (usec >> 4);
    usec = nsec + (usec >> 1);
    usec = nsec + (usec >> 6);
    usec = usec >> 10;
    sec = ts.tv_sec;

    /*
     * If the system hres time has been changed since the last time
     * we were called, then all bets are off; just update our
     * local copy of timechanged and accept the reported time as is.
     */
    if (last_timechanged != timechanged) {
        last_timechanged = timechanged;
    }
    /*
     * Try to keep timestamps unique, but don't be obsessive about
     * it in the face of large differences.
     */
    else if ((sec <= last.tv_sec) &&    /* same or lower seconds, and */
        ((sec != last.tv_sec) ||        /* either different second or */
        (usec <= last.tv_usec)) &&      /* lower microsecond, and */
        ((last.tv_sec - sec) <= 5)) {   /* not way back in time */
        /* Hand out the previous value plus 1 usec, carrying if needed. */
        sec = last.tv_sec;
        usec = last.tv_usec + 1;
        if (usec >= MICROSEC) {
            usec -= MICROSEC;
            sec++;
        }
    }
    last.tv_sec = sec;
    last.tv_usec = usec;
    mutex_exit(&tod_lock);

    tv->tv_sec = sec;
    tv->tv_usec = usec;
}

/*
 * 32-bit flavour of uniqtime(): obtains a unique timestamp and converts it
 * into the caller's timeval32 with TIMEVAL_TO_TIMEVAL32().
 *
 * Timestamps are exported from the kernel in several places.
 * Such timestamps are commonly used for either uniqueness or for
 * sequencing - truncation to 32-bits is fine for uniqueness,
 * but sequencing is going to take more work as we get closer to 2038!
 */
void
uniqtime32(struct timeval32 *tv32p)
{
    struct timeval native;

    uniqtime(&native);
    TIMEVAL_TO_TIMEVAL32(tv32p, &native);
}

int
gettimeofday(struct timeval *tp)
{
    struct timeval atv;

    if (tp) {
        uniqtime(&atv);
        if (get_udatamodel() == DATAMODEL_NATIVE) {
            if (copyout(&atv, tp, sizeof (atv)))
                return (set_errno(EFAULT));
        } else {
            struct timeval32 tv32;

            if (TIMEVAL_OVERFLOW(&atv))
                return (set_errno(EOVERFLOW));
            TIMEVAL_TO_TIMEVAL32(&tv32, &atv);

            if (copyout(&tv32, tp, sizeof (tv32)))
                return (set_errno(EFAULT));
        }
    }
    return (0);
}

/*
 * getitimer() entry point: report the current setting of interval timer
 * "which" to the user address itv.
 *
 * For a native data model caller, xgetitimer() copies the result out
 * itself.  Otherwise the value is fetched into a kernel buffer, checked
 * with ITIMERVAL_OVERFLOW() (EOVERFLOW on failure), converted to an
 * itimerval32 and copied out here (EFAULT on failure).  Any error is
 * reported through set_errno(); 0 is returned on success.
 */
int
getitimer(uint_t which, struct itimerval *itv)
{
    struct itimerval kernel_itv;
    struct itimerval32 compat_itv;
    int err;

    if (get_udatamodel() == DATAMODEL_NATIVE) {
        err = xgetitimer(which, itv, 0);
        if (err != 0)
            return (set_errno(err));
        return (0);
    }

    err = xgetitimer(which, &kernel_itv, 1);
    if (err != 0)
        return (set_errno(err));

    if (ITIMERVAL_OVERFLOW(&kernel_itv))
        return (set_errno(EOVERFLOW));

    ITIMERVAL_TO_ITIMERVAL32(&compat_itv, &kernel_itv);
    if (copyout(&compat_itv, itv, sizeof (compat_itv)) != 0)
        return (set_errno(EFAULT));

    return (0);
}

/*
 * Fetch the current setting of interval timer "which" for the calling
 * process/lwp into *itv.
 *
 * which    ITIMER_VIRTUAL, ITIMER_PROF, ITIMER_REAL or ITIMER_REALPROF.
 * itv      destination; a kernel address if iskaddr is non-zero, otherwise
 *          a user address of a native-data-model caller.
 * iskaddr  selects bcopy() (non-zero) or copyout() (zero) for the result.
 *
 * Returns 0 on success, EINVAL for an unknown timer, or EFAULT if the
 * copyout fails.  The error is returned directly, not via set_errno().
 * The timer state is sampled while holding p_lock.
 */
int
xgetitimer(uint_t which, struct itimerval *itv, int iskaddr)
{
    struct proc *p = curproc;
    struct itimerval snapshot;
    struct timeval now;
    hrtime_t first, interval, now_hrt, remain;

    mutex_enter(&p->p_lock);

    switch (which) {
    case ITIMER_VIRTUAL:
    case ITIMER_PROF:
        /* Per-lwp timers are reported exactly as stored. */
        snapshot = ttolwp(curthread)->lwp_timer[which];
        break;

    case ITIMER_REAL:
        uniqtime(&now);
        snapshot = p->p_realitimer;

        if (!timerisset(&snapshot.it_value))
            break;

        /*
         * The stored value is compared against, and reduced by, the
         * current time: report the time left, or zero if the stored
         * value is already behind the current time.
         */
        /*CSTYLED*/
        if (timercmp(&snapshot.it_value, &now, <)) {
            timerclear(&snapshot.it_value);
        } else {
            timevalsub(&snapshot.it_value, &now);
        }
        break;

    case ITIMER_REALPROF:
        if (p->p_rprof_cyclic == CYCLIC_NONE) {
            bzero(&snapshot, sizeof (snapshot));
            break;
        }

        snapshot = p->p_rprof_timer;

        first = tv2hrt(&snapshot.it_value);
        interval = tv2hrt(&snapshot.it_interval);
        now_hrt = gethrtime();

        if (now_hrt < first) {
            /*
             * Not yet fired for the first time: what remains is
             * the first firing time minus the current time.
             */
            remain = first - now_hrt;
        } else if (interval == 0) {
            /*
             * One-shot that has already fired: nothing remains.
             */
            remain = 0;
        } else {
            /*
             * Periodic: work out how far into the current
             * interval we are and subtract that from the
             * interval length.
             */
            remain = interval - ((now_hrt - first) % interval);
        }

        hrt2tv(remain, &snapshot.it_value);
        break;

    default:
        mutex_exit(&p->p_lock);
        return (EINVAL);
    }

    mutex_exit(&p->p_lock);

    if (iskaddr) {
        bcopy(&snapshot, itv, sizeof (*itv));
        return (0);
    }

    ASSERT(get_udatamodel() == DATAMODEL_NATIVE);
    if (copyout(&snapshot, itv, sizeof (*itv)))
        return (EFAULT);

    return (0);
}


/*
 * setitimer() entry point: optionally report the old setting of interval
 * timer "which" to oitv, then install the new setting read from itv.
 *
 * which  timer selector, passed through to getitimer()/xsetitimer().
 * itv    user address of the new setting; NULL means "query only".
 * oitv   user address for the previous setting; NULL means "don't report".
 *
 * Returns 0 on success.  On failure the error is reported through
 * set_errno(): EFAULT if the new value cannot be copied in, or whatever
 * getitimer()/xsetitimer() report.
 */
int
setitimer(uint_t which, struct itimerval *itv, struct itimerval *oitv)
{
    int error;

    /* getitimer() has already gone through set_errno() on failure. */
    if (oitv != NULL)
        if ((error = getitimer(which, oitv)) != 0)
            return (error);

    if (itv == NULL)
        return (0);

    if (get_udatamodel() == DATAMODEL_NATIVE)
        error = xsetitimer(which, itv, 0);
    else {
        struct itimerval32 itv32;
        struct itimerval kitv;

        /*
         * Bail out immediately if the copyin fails: itv32 is
         * uninitialized in that case, so it must not be converted
         * and handed to xsetitimer(), and the EFAULT must not be
         * overwritten by xsetitimer()'s return value.
         */
        if (copyin(itv, &itv32, sizeof (itv32)))
            return (set_errno(EFAULT));
        ITIMERVAL32_TO_ITIMERVAL(&kitv, &itv32);
        error = xsetitimer(which, &kitv, 1);
    }

    return (error ? (set_errno(error)) : 0);
}

/*
 * Install a new setting for interval timer "which" on behalf of the calling
 * process/lwp.
 *
 * which    ITIMER_REAL, ITIMER_REALPROF, ITIMER_VIRTUAL or ITIMER_PROF.
 * itv      new setting; a kernel address if iskaddr is non-zero, otherwise
 *          a user address of a native-data-model caller.  NULL is a no-op.
 * iskaddr  selects bcopy() (non-zero) or copyin() (zero) to fetch *itv.
 *
 * Returns 0 on success, EFAULT if the copyin fails, or EINVAL if the values
 * fail itimerfix() or "which" is unknown.  The error is returned directly,
 * not via set_errno().
 *
 * p_lock is taken here but is dropped and re-acquired in the ITIMER_REAL
 * and ITIMER_REALPROF cases; see the comments in those cases.
 */
int
xsetitimer(uint_t which, struct itimerval *itv, int iskaddr)
{
    struct itimerval aitv;
    struct timeval now;
    struct proc *p = curproc;
    kthread_t *t;
    timeout_id_t tmp_id;
    cyc_handler_t hdlr;
    cyc_time_t when;
    cyclic_id_t cyclic;
    hrtime_t ts;
    int min;

    if (itv == NULL)
        return (0);

    if (iskaddr) {
        bcopy(itv, &aitv, sizeof (aitv));
    } else {
        ASSERT(get_udatamodel() == DATAMODEL_NATIVE);
        if (copyin(itv, &aitv, sizeof (aitv)))
            return (EFAULT);
    }

    /*
     * Pick the smallest non-zero sub-second value we will accept, in
     * microseconds: for ITIMER_REALPROF the larger of the cyclic
     * resolution and itimer_realprof_minimum, otherwise one clock tick.
     */
    if (which == ITIMER_REALPROF) {
        min = MAX((int)(cyclic_getres() / (NANOSEC / MICROSEC)),
            itimer_realprof_minimum);
    } else {
        min = usec_per_tick;
    }

    /*
     * Validate (and round up) both fields.  Note that an invalid
     * it_interval is only rejected when it_value is non-zero.
     */
    if (itimerfix(&aitv.it_value, min) ||
        (itimerfix(&aitv.it_interval, min) && timerisset(&aitv.it_value)))
        return (EINVAL);

    mutex_enter(&p->p_lock);
    switch (which) {
    case ITIMER_REAL:
        /*
         * The SITBUSY flag prevents conflicts with multiple
         * threads attempting to perform setitimer(ITIMER_REAL)
         * at the same time, even when we drop p->p_lock below.
         * Any blocked thread returns successfully because the
         * effect is the same as if it got here first, finished,
         * and the other thread then came through and destroyed
         * what it did.  We are just protecting the system from
         * malfunctioning due to the race condition.
         */
        if (p->p_flag & SITBUSY) {
            mutex_exit(&p->p_lock);
            return (0);
        }
        p->p_flag |= SITBUSY;
        while ((tmp_id = p->p_itimerid) != 0) {
            /*
             * Avoid deadlock in callout_delete (called from
             * untimeout) which may go to sleep (while holding
             * p_lock). Drop p_lock and re-acquire it after
             * untimeout returns. Need to clear p_itimerid
             * while holding p_lock.
             */
            p->p_itimerid = 0;
            mutex_exit(&p->p_lock);
            (void) untimeout(tmp_id);
            mutex_enter(&p->p_lock);
        }
        if (timerisset(&aitv.it_value)) {
            /*
             * Convert the relative value to an absolute time of
             * day and schedule realitexpire() for that time.
             */
            uniqtime(&now);
            timevaladd(&aitv.it_value, &now);
            p->p_itimerid = realtime_timeout(realitexpire,
                p, hzto(&aitv.it_value));
        }
        p->p_realitimer = aitv;
        p->p_flag &= ~SITBUSY;
        break;

    case ITIMER_REALPROF:
        /*
         * Detach any existing cyclic from the proc while we still
         * hold p_lock; it is removed below under cpu_lock.
         */
        cyclic = p->p_rprof_cyclic;
        p->p_rprof_cyclic = CYCLIC_NONE;

        mutex_exit(&p->p_lock);

        /*
         * We're now going to acquire cpu_lock, remove the old cyclic
         * if necessary, and add our new cyclic.
         */
        mutex_enter(&cpu_lock);

        if (cyclic != CYCLIC_NONE)
            cyclic_remove(cyclic);

        if (!timerisset(&aitv.it_value)) {
            /*
             * If we were passed a value of 0, we're done.
             * (p_lock was already dropped above.)
             */
            mutex_exit(&cpu_lock);
            return (0);
        }

        hdlr.cyh_func = realprofexpire;
        hdlr.cyh_arg = p;
        hdlr.cyh_level = CY_LOW_LEVEL;

        /* ts is the absolute hrtime of the first expiry. */
        when.cyt_when = (ts = gethrtime() + tv2hrt(&aitv.it_value));
        when.cyt_interval = tv2hrt(&aitv.it_interval);

        if (when.cyt_interval == 0) {
            /*
             * Using the same logic as for CLOCK_HIGHRES timers, we
             * set the interval to be INT64_MAX - when.cyt_when to
             * effect a one-shot; see the comment in clock_highres.c
             * for more details on why this works.
             */
            when.cyt_interval = INT64_MAX - when.cyt_when;
        }

        cyclic = cyclic_add(&hdlr, &when);

        mutex_exit(&cpu_lock);

        /*
         * We have now successfully added the cyclic.  Reacquire
         * p_lock, and see if anyone has snuck in.
         */
        mutex_enter(&p->p_lock);

        if (p->p_rprof_cyclic != CYCLIC_NONE) {
            /*
             * We're racing with another thread establishing an
             * ITIMER_REALPROF interval timer.  We'll let the other
             * thread win (this is a race at the application level,
             * so letting the other thread win is acceptable).
             */
            mutex_exit(&p->p_lock);
            mutex_enter(&cpu_lock);
            cyclic_remove(cyclic);
            mutex_exit(&cpu_lock);

            return (0);
        }

        /*
         * Success.  Set our tracking variables in the proc structure,
         * cancel any outstanding ITIMER_PROF, and allocate the
         * per-thread SIGPROF buffers, if possible.
         *
         * The it_value recorded in p_rprof_timer is the absolute
         * first-expiry time (ts), not the relative value passed in;
         * xgetitimer() relies on this.
         */
        hrt2tv(ts, &aitv.it_value);
        p->p_rprof_timer = aitv;
        p->p_rprof_cyclic = cyclic;

        t = p->p_tlist;
        do {
            struct itimerval *itvp;

            itvp = &ttolwp(t)->lwp_timer[ITIMER_PROF];
            timerclear(&itvp->it_interval);
            timerclear(&itvp->it_value);

            if (t->t_rprof != NULL)
                continue;

            /*
             * KM_NOSLEEP: the result is stored unchecked here;
             * realprofexpire() retries when t_rprof is NULL.
             */
            t->t_rprof =
                kmem_zalloc(sizeof (struct rprof), KM_NOSLEEP);
            aston(t);
        } while ((t = t->t_forw) != p->p_tlist);

        break;

    case ITIMER_VIRTUAL:
        ttolwp(curthread)->lwp_timer[ITIMER_VIRTUAL] = aitv;
        break;

    case ITIMER_PROF:
        if (p->p_rprof_cyclic != CYCLIC_NONE) {
            /*
             * Silently ignore ITIMER_PROF if ITIMER_REALPROF
             * is in effect.
             */
            break;
        }

        ttolwp(curthread)->lwp_timer[ITIMER_PROF] = aitv;
        break;

    default:
        mutex_exit(&p->p_lock);
        return (EINVAL);
    }
    mutex_exit(&p->p_lock);
    return (0);
}

/*
 * Tear down the calling process's ITIMER_REALPROF interval timer, if any.
 * Called only from exec_args() when exec occurs.
 * The other ITIMER_* interval timers are specified
 * to be inherited across exec(), so they are left alone.
 *
 * Besides removing the cyclic, this discards any current or pending
 * SIGPROF for the process and the calling thread.
 */
void
delete_itimer_realprof(void)
{
    kthread_t *t = curthread;
    struct proc *p = ttoproc(t);
    klwp_t *lwp = ttolwp(t);
    cyclic_id_t old_cyclic;

    mutex_enter(&p->p_lock);

    /* we are performing execve(); assert we are single-threaded */
    ASSERT(t == p->p_tlist && t == t->t_forw);

    old_cyclic = p->p_rprof_cyclic;
    if (old_cyclic == CYCLIC_NONE) {
        /* No ITIMER_REALPROF in effect; nothing to do. */
        mutex_exit(&p->p_lock);
        return;
    }

    p->p_rprof_cyclic = CYCLIC_NONE;

    /*
     * Drop a SIGPROF that is currently being delivered.
     */
    if (lwp->lwp_cursig == SIGPROF) {
        lwp->lwp_cursig = 0;
        lwp->lwp_extsig = 0;
        if (lwp->lwp_curinfo) {
            siginfofree(lwp->lwp_curinfo);
            lwp->lwp_curinfo = NULL;
        }
    }

    /*
     * Drop pending SIGPROFs, first process-wide, then for this thread.
     */
    sigdelset(&p->p_sig, SIGPROF);
    sigdelset(&p->p_extsig, SIGPROF);
    sigdelq(p, NULL, SIGPROF);
    sigdelset(&t->t_sig, SIGPROF);
    sigdelset(&t->t_extsig, SIGPROF);
    sigdelq(p, t, SIGPROF);

    mutex_exit(&p->p_lock);

    /*
     * With p_lock released, remove the cyclic itself under cpu_lock.
     */
    mutex_enter(&cpu_lock);
    cyclic_remove(old_cyclic);
    mutex_exit(&cpu_lock);
}

/*
 * Real interval timer expired:
 * send the process whose timer expired an alarm signal.
 * If the timer is not set up to reload, clear it and return.
 * Otherwise compute the next time the timer should go off, which is
 * later than the current time, and schedule ourselves again.
 * This is where delay in processing this timeout causes multiple
 * SIGALRM calls to be compressed into one.
 *
 * arg is the proc the timer belongs to.  p_lock is held throughout.
 */
static void
realitexpire(void *arg)
{
    struct proc *p = arg;
    struct timeval *expiry = &p->p_realitimer.it_value;
    struct timeval *period = &p->p_realitimer.it_interval;
#if !defined(_LP64)
    clock_t remaining;
#endif

    mutex_enter(&p->p_lock);
#if !defined(_LP64)
    remaining = hzto(expiry);
    if (remaining > 1) {
        /*
         * If we are executing before we were meant to, it must be
         * because of an overflow in a prior hzto() calculation.
         * In this case, we want to go to sleep for the recalculated
         * number of ticks. For the special meaning of the value "1"
         * see comment in timespectohz().
         */
        p->p_itimerid = realtime_timeout(realitexpire, p, remaining);
        mutex_exit(&p->p_lock);
        return;
    }
#endif
    sigtoproc(p, NULL, SIGALRM);
    if (timerisset(period)) {
        /* periodic: advance timer value past current time and re-arm */
        timeval_advance(expiry, period);
        p->p_itimerid = realtime_timeout(realitexpire, p, hzto(expiry));
    } else {
        /* one-shot: the timer is now idle */
        timerclear(expiry);
        p->p_itimerid = 0;
    }
    mutex_exit(&p->p_lock);
}

/*
 * Real time profiling interval timer expired:
 * Increment microstate counters for each lwp in the process
 * and ensure that running lwps are kicked into the kernel.
 *
 * arg is the proc the ITIMER_REALPROF cyclic was installed for.  Nothing is
 * done if the proc no longer has a cyclic recorded or has no threads.
 * This handler does not re-arm anything itself; the interval is given to
 * the cyclic when it is added in xsetitimer().
 *
 * For each thread, under p_lock:
 *  - allocate its rprof buffer if missing (the thread is skipped for this
 *    expiry if the non-sleeping allocation fails);
 *  - classify the thread into a microstate and bump the matching
 *    rp_state[] counter, with the thread lock held;
 *  - post an AST, and poke the thread's CPU if it is running on a CPU
 *    other than the current one.
 */
static void
realprofexpire(void *arg)
{
    struct proc *p = arg;
    kthread_t *t;

    mutex_enter(&p->p_lock);
    if (p->p_rprof_cyclic == CYCLIC_NONE ||
        (t = p->p_tlist) == NULL) {
        mutex_exit(&p->p_lock);
        return;
    }
    do {
        int mstate;

        /*
         * Attempt to allocate the SIGPROF buffer, but don't sleep.
         */
        if (t->t_rprof == NULL)
            t->t_rprof = kmem_zalloc(sizeof (struct rprof),
                KM_NOSLEEP);
        if (t->t_rprof == NULL)
            continue;

        thread_lock(t);
        switch (t->t_state) {
        case TS_SLEEP:
            /*
             * Don't touch the lwp if it is swapped out.
             */
            if (!(t->t_schedflag & TS_LOAD)) {
                mstate = LMS_SLEEP;
                break;
            }
            /*
             * Report the lwp's previous microstate if it is a fault
             * or user-lock state; otherwise count it as sleeping.
             */
            switch (mstate = ttolwp(t)->lwp_mstate.ms_prev) {
            case LMS_TFAULT:
            case LMS_DFAULT:
            case LMS_KFAULT:
            case LMS_USER_LOCK:
                break;
            default:
                mstate = LMS_SLEEP;
                break;
            }
            break;
        case TS_RUN:
        case TS_WAIT:
            mstate = LMS_WAIT_CPU;
            break;
        case TS_ONPROC:
            /*
             * On a CPU: keep user/system/trap as is, and count
             * any other microstate as system time.
             */
            switch (mstate = t->t_mstate) {
            case LMS_USER:
            case LMS_SYSTEM:
            case LMS_TRAP:
                break;
            default:
                mstate = LMS_SYSTEM;
                break;
            }
            break;
        default:
            mstate = t->t_mstate;
            break;
        }
        t->t_rprof->rp_anystate = 1;
        t->t_rprof->rp_state[mstate]++;
        aston(t);
        /*
         * force the thread into the kernel
         * if it is not already there.
         */
        if (t->t_state == TS_ONPROC && t->t_cpu != CPU)
            poke_cpu(t->t_cpu->cpu_id);
        thread_unlock(t);
    } while ((t = t->t_forw) != p->p_tlist);

    mutex_exit(&p->p_lock);
}

/*
 * Advances the timer value *valp past the current time of day (hrestime)
 * in whole multiples of *intervalp.  Called from realitexpire() to compute
 * the next expiry of a periodic timer.
 *
 * Rather than adding the interval once per missed period, the step is
 * doubled on every addition until the value passes hrestime.  If more than
 * one addition was needed, the last (largest) step is backed out and the
 * search restarts from the base interval.  The loop ends when a single
 * addition of the base interval is enough to pass hrestime, so the value is
 * always advanced by at least one interval.
 */
static void
timeval_advance(struct timeval *valp, struct timeval *intervalp)
{
    int cnt2nth;                    /* number of doublings this pass */
    struct timeval interval2nth;    /* current step: interval * 2^cnt2nth */

    for (;;) {
        interval2nth = *intervalp;
        for (cnt2nth = 0; ; cnt2nth++) {
            timevaladd(valp, &interval2nth);
            /*CSTYLED*/
            if (TVTSCMP(valp, &hrestime, >))
                break;
            /* Still not past hrestime: double the step. */
            timevaladd(&interval2nth, &interval2nth);
        }
        /* Passed hrestime with a single base-interval step: done. */
        if (cnt2nth == 0)
            break;
        /* Overshot with a large step: undo it and refine. */
        timevalsub(valp, &interval2nth);
    }
}

/*
 * Validate a proposed .it_value or .it_interval for an interval timer and
 * round a too-small value up to the given minimum.
 *
 * Returns EINVAL if tv_sec is negative or greater than 100000000, or if
 * tv_usec is outside [0, MICROSEC).  Otherwise returns 0; in that case a
 * non-zero value of less than one second whose tv_usec is below "minimum"
 * (in microseconds) has its tv_usec raised to "minimum".
 */
int
itimerfix(struct timeval *tv, int minimum)
{
    /* Range-check the seconds, then the microseconds. */
    if (tv->tv_sec < 0 || tv->tv_sec > 100000000)
        return (EINVAL);
    if (tv->tv_usec < 0 || tv->tv_usec >= MICROSEC)
        return (EINVAL);

    /* A zero value means "disabled" and is left untouched. */
    if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < minimum)
        tv->tv_usec = minimum;

    return (0);
}

/*
 * Same as itimerfix, except a) it takes a timespec instead of a timeval and
 * b) it doesn't truncate based on timeout granularity; consumers of this
 * interface (e.g. timer_settime()) depend on the passed timespec not being
 * modified implicitly.
 *
 * Returns 0 if tv_sec is non-negative and tv_nsec lies in [0, NANOSEC),
 * EINVAL otherwise.
 */
int
itimerspecfix(timespec_t *tv)
{
    int valid;

    valid = (tv->tv_sec >= 0 && tv->tv_nsec >= 0 && tv->tv_nsec < NANOSEC);

    return (valid ? 0 : EINVAL);
}

/*
 * Decrement an interval timer by a specified number
 * of microseconds, which must be less than a second,
 * i.e. < 1000000.  If the timer expires, then reload
 * it.  In this case, carry over (usec - old value) to
 * reduce the value reloaded into the timer so that
 * the timer does not drift.  This routine assumes
 * that it is called in a context where the timers
 * on which it is operating cannot change in value.
 *
 * Returns 1 if the timer is still running after the decrement, and 0 if
 * it expired (whether or not it was reloaded from it_interval).
 */
int
itimerdecr(struct itimerval *itp, int usec)
{
    int overshoot = 0;      /* usecs by which we ran past the expiry */

    if (itp->it_value.tv_usec < usec && itp->it_value.tv_sec == 0) {
        /* expired, and already in next interval */
        overshoot = usec - itp->it_value.tv_usec;
    } else {
        if (itp->it_value.tv_usec < usec) {
            /* borrow a second to keep tv_usec non-negative */
            itp->it_value.tv_usec += MICROSEC;
            itp->it_value.tv_sec--;
        }
        itp->it_value.tv_usec -= usec;
        if (timerisset(&itp->it_value))
            return (1);
        /* expired, exactly at end of interval */
    }

    if (!timerisset(&itp->it_interval)) {
        itp->it_value.tv_usec = 0;      /* sec is already 0 */
        return (0);
    }

    /* Reload, shortened by however far we overshot. */
    itp->it_value = itp->it_interval;
    itp->it_value.tv_usec -= overshoot;
    if (itp->it_value.tv_usec < 0) {
        itp->it_value.tv_usec += MICROSEC;
        itp->it_value.tv_sec--;
    }
    return (0);
}

/*
 * Add and subtract routines for timevals.
 * N.B.: the subtract routine doesn't deal with
 * results which are before the beginning,
 * it just gets very confused in this case.
 * Caveat emptor.
 *
 * timevaladd() adds *t2 into *t1 field by field and then normalizes *t1
 * with timevalfix().
 */
void
timevaladd(struct timeval *t1, struct timeval *t2)
{
    t1->tv_usec += t2->tv_usec;
    t1->tv_sec += t2->tv_sec;
    timevalfix(t1);
}

/*
 * Subtract *t2 from *t1 field by field, then normalize *t1 with
 * timevalfix().
 */
void
timevalsub(struct timeval *t1, struct timeval *t2)
{
    t1->tv_usec -= t2->tv_usec;
    t1->tv_sec -= t2->tv_sec;
    timevalfix(t1);
}

/*
 * Normalize a timeval after a single add or subtract: borrow one second if
 * tv_usec went negative, and carry one second if tv_usec reached MICROSEC.
 * At most one second is moved in each direction.
 */
void
timevalfix(struct timeval *t1)
{
    if (t1->tv_usec < 0) {
        /* borrow */
        t1->tv_usec += MICROSEC;
        t1->tv_sec--;
    }
    if (t1->tv_usec >= MICROSEC) {
        /* carry */
        t1->tv_usec -= MICROSEC;
        t1->tv_sec++;
    }
}

/*
 * Same as the routines above. These routines take a timespec instead
 * of a timeval.
 *
 * timespecadd() adds *t2 into *t1 field by field and then normalizes *t1
 * with timespecfix().
 */
void
timespecadd(timespec_t *t1, timespec_t *t2)
{
    t1->tv_nsec += t2->tv_nsec;
    t1->tv_sec += t2->tv_sec;
    timespecfix(t1);
}

/*
 * Subtract *t2 from *t1 field by field, then normalize *t1 with
 * timespecfix().
 */
void
timespecsub(timespec_t *t1, timespec_t *t2)
{
    t1->tv_nsec -= t2->tv_nsec;
    t1->tv_sec -= t2->tv_sec;
    timespecfix(t1);
}

/*
 * Normalize a timespec after a single add or subtract: borrow one second
 * if tv_nsec went negative, otherwise carry one second if tv_nsec reached
 * NANOSEC.
 */
void
timespecfix(timespec_t *t1)
{
    if (t1->tv_nsec < 0) {
        /* borrow */
        t1->tv_nsec += NANOSEC;
        t1->tv_sec--;
    } else if (t1->tv_nsec >= NANOSEC) {
        /* carry */
        t1->tv_nsec -= NANOSEC;
        t1->tv_sec++;
    }
}

/*
 * Compute number of hz until specified time.
 * Used to compute third argument to timeout() from an absolute time.
 *
 * The timeval is widened to a timespec and handed to timespectohz()
 * together with the time obtained from gethrestime_lasttick().
 */
clock_t
hzto(struct timeval *tv)
{
    timespec_t target;
    timespec_t lasttick;

    /* microseconds -> nanoseconds */
    target.tv_sec = tv->tv_sec;
    target.tv_nsec = tv->tv_usec * 1000;
    gethrestime_lasttick(&lasttick);

    return (timespectohz(&target, lasttick));
}

/*
 * Compute number of hz until specified time for a given timespec value.
 * Used to compute third argument to timeout() from an absolute time.
 *
 * tv   absolute target time.
 * now  the time to measure from.
 *
 * Returns the number of ticks between now and the target, rounded up to a
 * whole tick; "1" if the target is before the next tick (so the caller
 * always gets some delay); and LONG_MAX if the result would overflow.
 */
clock_t
timespectohz(timespec_t *tv, timespec_t now)
{
    time_t  delta_sec;
    int delta_nsec;
    clock_t ticks;

    /*
     * Difference between target and now; adding nsec_per_tick - 1
     * rounds the sub-second part up to the next whole tick.
     */
    delta_sec = tv->tv_sec - now.tv_sec;
    delta_nsec = tv->tv_nsec - now.tv_nsec + nsec_per_tick - 1;

    /* Bring the nanoseconds back into [0, NANOSEC). */
    if (delta_nsec >= NANOSEC) {
        delta_sec++;
        delta_nsec -= NANOSEC;
    } else if (delta_nsec < 0) {
        delta_sec--;
        delta_nsec += NANOSEC;
    }

    ticks = NSEC_TO_TICK(delta_nsec);

    /*
     * Compute ticks, accounting for negative and overflow as above.
     * Overflow protection kicks in at about 70 weeks for hz=50
     * and at about 35 weeks for hz=100. (Rather longer for the 64-bit
     * kernel :-)
     */
    if (delta_sec < 0 || (delta_sec == 0 && ticks < 1))
        return (1);         /* protect vs nonpositive */
    if (delta_sec > (LONG_MAX - ticks) / hz)
        return (LONG_MAX);      /* protect vs overflow */

    return (ticks + delta_sec * hz);    /* common case */
}

/*
 * Compute number of hz with the timespec tv specified.
 * The return type must be 64 bit integer.
 *
 * Unlike timespectohz(), tv is treated as a duration rather than being
 * compared against a current time.  Returns the duration in ticks, rounded
 * up to a whole tick; 1 if the duration is negative or shorter than a
 * tick; and the largest positive 64-bit value if the result would overflow.
 */
int64_t
timespectohz64(timespec_t *tv)
{
    int64_t whole_sec;
    int64_t frac_nsec;
    int64_t ticks;

    /* Adding nsec_per_tick - 1 rounds up to the next whole tick. */
    whole_sec = tv->tv_sec;
    frac_nsec = tv->tv_nsec + nsec_per_tick - 1;

    /* Bring the nanoseconds back into [0, NANOSEC). */
    if (frac_nsec >= NANOSEC) {
        whole_sec++;
        frac_nsec -= NANOSEC;
    } else if (frac_nsec < 0) {
        whole_sec--;
        frac_nsec += NANOSEC;
    }

    ticks = NSEC_TO_TICK(frac_nsec);

    /*
     * Compute ticks, accounting for negative and overflow as above.
     * The overflow limit here is the largest positive 64-bit value,
     * so it is reached far later than in timespectohz() on a 32-bit
     * kernel.
     */
    if (whole_sec < 0 || (whole_sec == 0 && ticks < 1))
        return (1);             /* protect vs nonpositive */
    if (whole_sec > (((~0ULL) >> 1) - ticks) / hz)
        return ((~0ULL) >> 1);      /* protect vs overflow */

    return (ticks + whole_sec * hz);    /* common case */
}

/*
 * hrt2ts(): convert from hrtime_t to timestruc_t.
 *
 * All this routine really does is:
 *
 *  tsp->sec  = hrt / NANOSEC;
 *  tsp->nsec = hrt % NANOSEC;
 *
 * The black magic below avoids doing a 64-bit by 32-bit integer divide,
 * which is quite expensive.  There's actually much more going on here than
 * it might first appear -- don't try this at home.
 *
 * For the adventuresome, here's an explanation of how it works.
 *
 * Multiplication by a fixed constant is easy -- you just do the appropriate
 * shifts and adds.  For example, to multiply by 10, we observe that
 *
 *  x * 10  = x * (8 + 2)
 *      = (x * 8) + (x * 2)
 *      = (x << 3) + (x << 1).
 *
 * In general, you can read the algorithm right off the bits: the number 10
 * is 1010 in binary; bits 1 and 3 are ones, so x * 10 = (x << 1) + (x << 3).
 *
 * Sometimes you can do better.  For example, 15 is 1111 binary, so the normal
 * shift/add computation is x * 15 = (x << 0) + (x << 1) + (x << 2) + (x << 3).
 * But, it's cheaper if you capitalize on the fact that you have a run of ones:
 * 1111 = 10000 - 1, hence x * 15 = (x << 4) - (x << 0).  [You would never
 * actually perform the operation << 0, since it's a no-op; I'm just writing
 * it that way for clarity.]
 *
 * The other way you can win is if you get lucky with the prime factorization
 * of your constant.  The number 1,000,000,000, which we have to multiply
 * by below, is a good example.  One billion is 111011100110101100101000000000
 * in binary.  If you apply the bit-grouping trick, it doesn't buy you very
 * much, because it's only a win for groups of three or more equal bits:
 *
 * 111011100110101100101000000000 = 1000000000000000000000000000000
 *                -  000100011001010011011000000000
 *
 * Thus, instead of the 13 shift/add pairs (26 operations) implied by the LHS,
 * we have reduced this to 10 shift/add pairs (20 operations) on the RHS.
 * This is better, but not great.
 *
 * However, we can factor 1,000,000,000 = 2^9 * 5^9 = 2^9 * 125 * 125 * 125,
 * and multiply by each factor.  Multiplication by 125 is particularly easy,
 * since 128 is nearby: x * 125 = (x << 7) - x - x - x, which is just four
 * operations.  So, to multiply by 1,000,000,000, we perform three multipli-
 * cations by 125, then << 9, a total of only 3 * 4 + 1 = 13 operations.
 * This is the algorithm we actually use in both hrt2ts() and ts2hrt().
 *
 * Division is harder; there is no equivalent of the simple shift-add algorithm
 * we used for multiplication.  However, we can convert the division problem
 * into a multiplication problem by pre-computing the binary representation
 * of the reciprocal of the divisor.  For the case of interest, we have
 *
 *  1 / 1,000,000,000 = 1.0001001011100000101111101000001B-30,
 *
 * to 32 bits of precision.  (The notation B-30 means "* 2^-30", just like
 * E-18 means "* 10^-18".)
 *
 * So, to compute x / 1,000,000,000, we just multiply x by the 32-bit
 * integer 10001001011100000101111101000001, then normalize (shift) the
 * result.  This constant has several long runs of identical bits, so the
 * multiply is relatively cheap:
 *
 *  10001001011100000101111101000001 = 10001001100000000110000001000001
 *                   - 00000000000100000000000100000000
 *
 * Again, you can just read the algorithm right off the bits:
 *
 *          sec = hrt;
 *          sec += (hrt << 6);
 *          sec -= (hrt << 8);
 *          sec += (hrt << 13);
 *          sec += (hrt << 14);
 *          sec -= (hrt << 20);
 *          sec += (hrt << 23);
 *          sec += (hrt << 24);
 *          sec += (hrt << 27);
 *          sec += (hrt << 31);
 *          sec >>= (32 + 30);
 *
 * Voila!  The only problem is, since hrt is 64 bits, we need to use 96-bit
 * arithmetic to perform this calculation.  That's a waste, because ultimately
 * we only need the highest 32 bits of the result.
 *
 * The first thing we do is to realize that we don't need to use all of hrt
 * in the calculation.  The lowest 30 bits can contribute at most 1 to the
 * quotient (2^30 / 1,000,000,000 = 1.07...), so we'll deal with them later.
 * The highest 2 bits have to be zero, or hrt won't fit in a timestruc_t.
 * Thus, the only bits of hrt that matter for division are bits 30..61.
 * These 32 bits are just the lower-order word of (hrt >> 30).  This brings
 * us down from 96-bit math to 64-bit math, and our algorithm becomes:
 *
 *          tmp = (uint32_t) (hrt >> 30);
 *          sec = tmp;
 *          sec += (tmp << 6);
 *          sec -= (tmp << 8);
 *          sec += (tmp << 13);
 *          sec += (tmp << 14);
 *          sec -= (tmp << 20);
 *          sec += (tmp << 23);
 *          sec += (tmp << 24);
 *          sec += (tmp << 27);
 *          sec += (tmp << 31);
 *          sec >>= 32;
 *
 * Next, we're going to reduce this 64-bit computation to a 32-bit
 * computation.  We begin by rewriting the above algorithm to use relative
 * shifts instead of absolute shifts.  That is, instead of computing
 * tmp << 6, tmp << 8, tmp << 13, etc, we'll just shift incrementally:
 * tmp <<= 6, tmp <<= 2 (== 8 - 6), tmp <<= 5 (== 13 - 8), etc:
 *
 *          tmp = (uint32_t) (hrt >> 30);
 *          sec = tmp;
 *          tmp <<= 6; sec += tmp;
 *          tmp <<= 2; sec -= tmp;
 *          tmp <<= 5; sec += tmp;
 *          tmp <<= 1; sec += tmp;
 *          tmp <<= 6; sec -= tmp;
 *          tmp <<= 3; sec += tmp;
 *          tmp <<= 1; sec += tmp;
 *          tmp <<= 3; sec += tmp;
 *          tmp <<= 4; sec += tmp;
 *          sec >>= 32;
 *
 * Now for the final step.  Instead of throwing away the low 32 bits at
 * the end, we can throw them away as we go, only keeping the high 32 bits
 * of the product at each step.  So, for example, where we now have
 *
 *          tmp <<= 6; sec = sec + tmp;
 * we will instead have
 *          tmp <<= 6; sec = (sec + tmp) >> 6;
 * which is equivalent to
 *          sec = (sec >> 6) + tmp;
 *
 * The final shift ("sec >>= 32") goes away.
 *
 * All we're really doing here is long multiplication, just like we learned in
 * grade school, except that at each step, we only look at the leftmost 32
 * columns.  The cumulative error is, at most, the sum of all the bits we
 * throw away, which is 2^-32 + 2^-31 + ... + 2^-2 + 2^-1 == 1 - 2^-32.
 * Thus, the final result ("sec") is correct to +/- 1.
 *
 * It turns out to be important to keep "sec" positive at each step, because
 * we don't want to have to explicitly extend the sign bit.  Therefore,
 * starting with the last line of code above, each line that would have read
 * "sec = (sec >> n) - tmp" must be changed to "sec = tmp - (sec >> n)", and
 * the operators (+ or -) in all previous lines must be toggled accordingly.
 * Thus, we end up with:
 *
 *          tmp = (uint32_t) (hrt >> 30);
 *          sec = tmp + (sec >> 6);
 *          sec = tmp - (tmp >> 2);
 *          sec = tmp - (sec >> 5);
 *          sec = tmp + (sec >> 1);
 *          sec = tmp - (sec >> 6);
 *          sec = tmp - (sec >> 3);
 *          sec = tmp + (sec >> 1);
 *          sec = tmp + (sec >> 3);
 *          sec = tmp + (sec >> 4);
 *
 * This yields a value for sec that is accurate to +1/-1, so we have two
 * cases to deal with.  The mysterious-looking "+ 7" in the code below biases
 * the rounding toward zero, so that sec is always less than or equal to
 * the correct value.  With this modified code, sec is accurate to +0/-2, with
 * the -2 case being very rare in practice.  With this change, we only have to
 * deal with one case (sec too small) in the cleanup code.
 *
 * The other modification we make is to delete the second line above
 * ("sec = tmp + (sec >> 6);"), since it only has an effect when bit 31 is
 * set, and the cleanup code can handle that rare case.  This reduces the
 * *guaranteed* accuracy of sec to +0/-3, but speeds up the common cases.
 *
 * Finally, we compute nsec = hrt - (sec * 1,000,000,000).  nsec will always
 * be positive (since sec is never too large), and will at most be equal to
 * the error in sec (times 1,000,000,000) plus the low-order 30 bits of hrt.
 * Thus, nsec < 3 * 1,000,000,000 + 2^30, which is less than 2^32, so we can
 * safely assume that nsec fits in 32 bits.  Consequently, when we compute
 * sec * 1,000,000,000, we only need the low 32 bits, so we can just do 32-bit
 * arithmetic and let the high-order bits fall off the end.
 *
 * Since nsec < 3 * 1,000,000,000 + 2^30 == 4,073,741,824, the cleanup loop:
 *
 *          while (nsec >= NANOSEC) {
 *              nsec -= NANOSEC;
 *              sec++;
 *          }
 *
 * is guaranteed to complete in at most 4 iterations.  In practice, the loop
 * completes in 0 or 1 iteration over 95% of the time.
 *
 * On an SS2, this implementation of hrt2ts() takes 1.7 usec, versus about
 * 35 usec for software division -- about 20 times faster.
 */
void
hrt2ts(hrtime_t hrt, timestruc_t *tsp)
{
    uint32_t sec, nsec, tmp;

    /*
     * Approximate hrt / NANOSEC from bits 30 and up of hrt, following the
     * derivation in the block comment above.  The "+ 7" biases the
     * estimate downward so that sec is never larger than the true
     * quotient; the loop below makes up any shortfall.
     */
    tmp = (uint32_t)(hrt >> 30);
    sec = tmp - (tmp >> 2);
    sec = tmp - (sec >> 5);
    sec = tmp + (sec >> 1);
    sec = tmp - (sec >> 6) + 7;
    sec = tmp - (sec >> 3);
    sec = tmp + (sec >> 1);
    sec = tmp + (sec >> 3);
    sec = tmp + (sec >> 4);

    /*
     * tmp = sec * NANOSEC, truncated to 32 bits: (x << 7) - 3x is 125x,
     * and 125 * 125 * 125 * 512 (the final << 9) is 1,000,000,000.  Only
     * the low 32 bits matter because the remainder is known to fit in
     * 32 bits.
     */
    tmp = (sec << 7) - sec - sec - sec;
    tmp = (tmp << 7) - tmp - tmp - tmp;
    tmp = (tmp << 7) - tmp - tmp - tmp;
    nsec = (uint32_t)hrt - (tmp << 9);

    /* Cleanup: move whole seconds left in nsec over to sec. */
    while (nsec >= NANOSEC) {
        nsec -= NANOSEC;
        sec++;
    }
    tsp->tv_sec = (time_t)sec;
    tsp->tv_nsec = nsec;
}

/*
 * Convert from timestruc_t to hrtime_t.
 *
 * The code below is equivalent to:
 *
 *  hrt = tsp->tv_sec * NANOSEC + tsp->tv_nsec;
 *
 * but requires no integer multiply.
 */
hrtime_t
ts2hrt(const timestruc_t *tsp)
{
    hrtime_t acc = tsp->tv_sec;
    int pass;

    /*
     * NANOSEC factors as 125 * 125 * 125 * 512.  Each pass scales the
     * running value by 125, formed as 128x - 3x; the closing shift by 9
     * supplies the factor of 512 before the nanoseconds are added in.
     */
    for (pass = 0; pass < 3; pass++)
        acc = (acc << 7) - acc - acc - acc;

    return ((acc << 9) + tsp->tv_nsec);
}

/*
 * For the various 32-bit "compatibility" paths in the system.
 */
void
hrt2ts32(hrtime_t hrt, timestruc32_t *ts32p)
{
    timestruc_t native;

    /*
     * Do the conversion once with hrt2ts(), then let
     * TIMESPEC_TO_TIMESPEC32() fill in the caller's 32-bit structure.
     */
    hrt2ts(hrt, &native);
    TIMESPEC_TO_TIMESPEC32(ts32p, &native);
}

/*
 * If this ever becomes performance critical (ha!), we can borrow the
 * code from ts2hrt(), above, to multiply tv_sec by 1,000,000,000 and the
 * straightforward (x << 10) - (x << 5) + (x << 3) to multiply tv_usec by
 * 1,000.  For now, we'll opt for readability (besides, the compiler does
 * a passable job of optimizing constant multiplication into shifts and adds).
 */
hrtime_t
tv2hrt(struct timeval *tvp)
{
    return ((hrtime_t)tvp->tv_sec * NANOSEC +
        (hrtime_t)tvp->tv_usec * (NANOSEC / MICROSEC));
}

/*
 * Convert from hrtime_t to struct timeval: tv_sec receives the whole
 * seconds in hrt and tv_usec the leftover nanoseconds divided by 1000.
 * The seconds/nanoseconds split is the same shift-and-add sequence used
 * by hrt2ts().
 */
void
hrt2tv(hrtime_t hrt, struct timeval *tvp)
{
    uint32_t sec, nsec, tmp;
    uint32_t q, r, t;

    /* Downward-biased estimate of hrt / NANOSEC (see hrt2ts()). */
    tmp = (uint32_t)(hrt >> 30);
    sec = tmp - (tmp >> 2);
    sec = tmp - (sec >> 5);
    sec = tmp + (sec >> 1);
    sec = tmp - (sec >> 6) + 7;
    sec = tmp - (sec >> 3);
    sec = tmp + (sec >> 1);
    sec = tmp + (sec >> 3);
    sec = tmp + (sec >> 4);
    /* tmp << 9 is sec * NANOSEC truncated to 32 bits (125^3 * 512). */
    tmp = (sec << 7) - sec - sec - sec;
    tmp = (tmp << 7) - tmp - tmp - tmp;
    tmp = (tmp << 7) - tmp - tmp - tmp;
    nsec = (uint32_t)hrt - (tmp << 9);
    /* Cleanup: move whole seconds left in nsec over to sec. */
    while (nsec >= NANOSEC) {
        nsec -= NANOSEC;
        sec++;
    }
    tvp->tv_sec = (time_t)sec;
    /*
     * This routine is very similar to hrt2ts(), but requires microseconds
     * instead of nanoseconds, so an integer divide-by-1000 routine
     * completes the conversion.  q is a shift-and-add estimate of
     * nsec / 1000 and r is what remains of nsec after removing q * 1000;
     * (r + 24) >> 10 is 0 for r below 1000 and 1 for r from 1000 through
     * 2023, so it bumps q by one when the estimate came up one short.
     */
    t = (nsec >> 7) + (nsec >> 8) + (nsec >> 12);
    q = (nsec >> 1) + t + (nsec >> 15) + (t >> 11) + (t >> 14);
    q = q >> 9;
    r = nsec - q*1000;
    tvp->tv_usec = q + ((r + 24) >> 10);

}

/*
 * Put the calling thread to sleep for the interval stored at rqtp in user
 * memory.  When rmtp is non-NULL, the unslept time (zero unless the sleep
 * was cut short by a signal) is copied out to it.  Both structures are
 * read and written in the layout selected by the caller's data model.
 *
 * Returns 0 on success; otherwise the result of set_errno() with EFAULT
 * (a copyin/copyout failed), EINVAL (negative or out-of-range request) or
 * EINTR (the wait ended with cv_waituntil_sig() returning 0).
 */
int
nanosleep(timespec_t *rqtp, timespec_t *rmtp)
{
    model_t model = get_udatamodel();
    timespec_t wake;        /* request; later the absolute wakeup time */
    timespec_t left;        /* value reported through rmtp */
    timespec_t now;
    int tc;
    int rv = 1;             /* stays 1 if no wait is needed */

    /* Sample timechanged before reading the clock. */
    tc = timechanged;
    gethrestime(&now);

    /* Pull in the request, widening it for non-native callers. */
    if (model != DATAMODEL_NATIVE) {
        timespec32_t req32;

        if (copyin(rqtp, &req32, sizeof (req32)))
            return (set_errno(EFAULT));
        TIMESPEC32_TO_TIMESPEC(&wake, &req32);
    } else if (copyin(rqtp, &wake, sizeof (wake))) {
        return (set_errno(EFAULT));
    }

    if (wake.tv_sec < 0 || wake.tv_nsec < 0 || wake.tv_nsec >= NANOSEC)
        return (set_errno(EINVAL));

    if (timerspecisset(&wake)) {
        /* Turn the relative interval into an absolute deadline. */
        timespecadd(&wake, &now);
        mutex_enter(&curthread->t_delay_lock);
        /* Keep waiting for as long as the wait reports a plain wakeup. */
        do {
            rv = cv_waituntil_sig(&curthread->t_delay_cv,
                &curthread->t_delay_lock, &wake, tc);
        } while (rv > 0);
        mutex_exit(&curthread->t_delay_lock);
    }

    if (rmtp) {
        /*
         * Report the time remaining only when a signal ended the wait
         * (rv == 0) and the deadline is still in the future; in every
         * other case report zero.
         */
        left.tv_sec = 0;
        left.tv_nsec = 0;
        if (rv == 0) {
            timespec_t remain = wake;

            gethrestime(&now);
            timespecsub(&remain, &now);
            if (remain.tv_sec > 0 ||
                (remain.tv_sec == 0 && remain.tv_nsec > 0))
                left = remain;
        }

        if (model != DATAMODEL_NATIVE) {
            timespec32_t left32;

            TIMESPEC_TO_TIMESPEC32(&left32, &left);
            if (copyout(&left32, rmtp, sizeof (left32)))
                return (set_errno(EFAULT));
        } else if (copyout(&left, rmtp, sizeof (left))) {
            return (set_errno(EFAULT));
        }
    }

    if (rv == 0)
        return (set_errno(EINTR));
    return (0);
}

/*
 * Routines to convert standard UNIX time (seconds since Jan 1, 1970)
 * into year/month/day/hour/minute/second format, and back again.
 * Note: these routines require tod_lock held to protect cached state.
 */
static int days_thru_month[64] = {
    /*
     * Four rows of 16 slots, indexed by ((year & 3) << 4) + month with
     * month running 1..12.  Slot m holds the number of days in the year
     * that precede month m, and slot 13 holds the length of the year;
     * slots 0, 14 and 15 of each row are zero padding.  Row 0
     * (year & 3 == 0) is the leap-year row: February has 29 days and
     * the total is 366.  The other three rows are ordinary years.
     */
    0, 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366, 0, 0,
    0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365, 0, 0,
    0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365, 0, 0,
    0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365, 0, 0,
};

/*
 * Cache of the most recent utc_to_tod() conversion: saved_tod is the
 * result and saved_utc the time it was computed for.  utc_to_tod() adds
 * the change in utc to saved_tod.tod_sec and reuses the cached value when
 * the seconds stay within 0..59.  Starting saved_utc at -60 pushes tod_sec
 * to 60 or more on the first conversion (assuming nothing else has written
 * these variables), so that call always takes the full computation.
 */
todinfo_t saved_tod;
int saved_utc = -60;

/*
 * Break a count of seconds since the epoch into a todinfo_t.  tod_year is
 * expressed in years since 1900 (the epoch year is 70), tod_month and
 * tod_day start at 1, and tod_dow runs 1..7.  The caller must hold
 * tod_lock; the result is also remembered in saved_tod/saved_utc so that
 * a later call that changes only the seconds can return quickly.
 */
todinfo_t
utc_to_tod(time_t utc)
{
    long epoch_days, yday, slot, yr;
    time_t secs_today;
    todinfo_t out;

    ASSERT(MUTEX_HELD(&tod_lock));

    /*
     * Negative times should never happen; clamp them to the epoch.
     * tod_set_prev() relies on this clamping, so any change made to
     * this behavior would have to be reflected in that function as
     * well.
     */
    if (utc < 0)
        utc = 0;

    /*
     * Fast path: apply the elapsed seconds to the cached result.  If
     * the seconds field stays inside the same minute, nothing else in
     * the cached value has changed.
     */
    saved_tod.tod_sec += utc - saved_utc;
    saved_utc = utc;
    if (saved_tod.tod_sec >= 0 && saved_tod.tod_sec < 60)
        return (saved_tod);

    epoch_days = utc / 86400;
    secs_today = utc % 86400;

    out.tod_sec = secs_today % 60;
    out.tod_min = (secs_today % 3600) / 60;
    out.tod_hour = secs_today / 3600;
    out.tod_dow = (epoch_days + 4) % 7 + 1;     /* epoch was a Thursday */

    /*
     * Start from a year guess that is at or past the right answer and
     * walk backwards until the day-of-year stops being negative.
     */
    yr = epoch_days / 365 + 71;
    for (;;) {
        yday = epoch_days - 365 * (yr - 70) - ((yr - 69) >> 2);
        if (yday >= 0)
            break;
        yr--;
    }

    /* Scan the row of days_thru_month[] that matches this year. */
    for (slot = ((yr & 3) << 4) + 1; yday >= days_thru_month[slot + 1];
        slot++)
        continue;

    out.tod_day = yday - days_thru_month[slot] + 1;
    out.tod_month = slot & 15;
    out.tod_year = yr;

    saved_tod = out;
    return (out);
}

/*
 * Convert a todinfo_t (tod_year in years since 1900, tod_month and
 * tod_day starting at 1) into seconds since the epoch.  The caller must
 * hold tod_lock.
 *
 * Out-of-range fields are not rejected.  DEBUG kernels warn once per
 * field, and the arithmetic then proceeds on the values supplied.  The one
 * exception is the month: it selects an entry of days_thru_month[], so a
 * tod_month that would index outside that table is treated as
 * contributing zero days instead of being used as an array subscript.
 */
time_t
tod_to_utc(todinfo_t tod)
{
    time_t utc;
    int year = tod.tod_year;
    int row = (year & 3) << 4;      /* start of this year's table row */
    int days_before = 0;            /* days in the year before tod_month */
#ifdef DEBUG
    /* only warn once, not each time called */
    static int year_warn = 1;
    static int month_warn = 1;
    static int day_warn = 1;
    static int hour_warn = 1;
    static int min_warn = 1;
    static int sec_warn = 1;
    int days_diff = 0;              /* length of tod_month in days */
#endif

    ASSERT(MUTEX_HELD(&tod_lock));

    /*
     * Each row of days_thru_month[] has 16 slots, so only month values
     * 0..15 stay inside the table.  The month length also reads the
     * following slot, which limits that lookup to 0..14.  A month
     * outside those bounds leaves the corresponding value at zero; a
     * zero days_diff makes the DEBUG day check below fire.
     */
    if (tod.tod_month >= 0 && tod.tod_month <= 15) {
        int idx = row + tod.tod_month;

        days_before = days_thru_month[idx];
#ifdef DEBUG
        if (tod.tod_month <= 14)
            days_diff = days_thru_month[idx + 1] - days_before;
#endif
    }

#ifdef DEBUG
    if (year_warn && (year < 70 || year > 8029)) {
        cmn_err(CE_WARN,
            "The hardware real-time clock appears to have the "
            "wrong years value %d -- time needs to be reset\n",
            year);
        year_warn = 0;
    }

    if (month_warn && (tod.tod_month < 1 || tod.tod_month > 12)) {
        cmn_err(CE_WARN,
            "The hardware real-time clock appears to have the "
            "wrong months value %d -- time needs to be reset\n",
            tod.tod_month);
        month_warn = 0;
    }

    if (day_warn && (tod.tod_day < 1 || tod.tod_day > days_diff)) {
        cmn_err(CE_WARN,
            "The hardware real-time clock appears to have the "
            "wrong days value %d -- time needs to be reset\n",
            tod.tod_day);
        day_warn = 0;
    }

    if (hour_warn && (tod.tod_hour < 0 || tod.tod_hour > 23)) {
        cmn_err(CE_WARN,
            "The hardware real-time clock appears to have the "
            "wrong hours value %d -- time needs to be reset\n",
            tod.tod_hour);
        hour_warn = 0;
    }

    if (min_warn && (tod.tod_min < 0 || tod.tod_min > 59)) {
        cmn_err(CE_WARN,
            "The hardware real-time clock appears to have the "
            "wrong minutes value %d -- time needs to be reset\n",
            tod.tod_min);
        min_warn = 0;
    }

    if (sec_warn && (tod.tod_sec < 0 || tod.tod_sec > 59)) {
        cmn_err(CE_WARN,
            "The hardware real-time clock appears to have the "
            "wrong seconds value %d -- time needs to be reset\n",
            tod.tod_sec);
        sec_warn = 0;
    }
#endif

    utc = (year - 70);      /* next 3 lines: utc = 365y + y/4 */
    utc += (utc << 3) + (utc << 6);
    utc += (utc << 2) + ((year - 69) >> 2);
    utc += days_before + tod.tod_day - 1;
    utc = (utc << 3) + (utc << 4) + tod.tod_hour;   /* 24 * day + hour */
    utc = (utc << 6) - (utc << 2) + tod.tod_min;    /* 60 * hour + min */
    utc = (utc << 6) - (utc << 2) + tod.tod_sec;    /* 60 * min + sec */

    return (utc);
}