sun4/sys/clock.h

	clock.h revision b0fc0e77220f1fa4c933fd58a4e1dedcd650b0f1
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#ifndef _SYS_CLOCK_H
#define _SYS_CLOCK_H

#pragma ident   "%Z%%M% %I% %E% SMI"

#ifdef  __cplusplus
extern "C" {
#endif

#include <sys/spl.h>
#include <sys/time.h>
#include <sys/machclock.h>

#ifndef _ASM

#ifdef  _KERNEL

/*
 * Kernel-only declarations for the sun4 clock and timekeeping code.
 * Everything declared here is defined outside this header.
 */
extern void setcpudelay(void);

/*
 * Tick-to-nanosecond conversion state.  nsec_scale is the precomputed
 * scaling factor consumed by the NATIVE_TIME_TO_NSEC* macros later in
 * this file, and sys_tick_freq is the native tick frequency that factor
 * is derived from.
 * NOTE(review): nsec_shift and nsec_per_sys_tick are not referenced in
 * this header; confirm their exact meaning at the point of definition.
 */
extern uint_t   nsec_scale;
extern uint_t   nsec_shift;
extern uint_t   nsec_per_sys_tick;
extern uint64_t sys_tick_freq;

/*
 * NOTE(review): judging by the names only, a trap-trace timestamp
 * selector and the system clock frequency in Hz and MHz -- confirm.
 */
extern int  traptrace_use_stick;
extern uint64_t system_clock_freq;
extern uint_t   sys_clock_mhz;

/* Monitor clock control entry points. */
extern void mon_clock_init(void);
extern void mon_clock_start(void);
extern void mon_clock_stop(void);
extern void mon_clock_share(void);
extern void mon_clock_unshare(void);

/*
 * hrtime_base is the base value GET_HRTIME() adds the nanoseconds since
 * the last tick to; hres_tick() is the high level cyclic handler that
 * takes hres_lock.
 */
extern hrtime_t hrtime_base;
extern void hres_tick(void);
extern void clkstart(void);
/*
 * Declared with an explicit (void) prototype, like every other function
 * in this block, so that a call passing arguments is diagnosed instead
 * of being silently accepted.
 */
extern void cbe_level14(void);
extern hrtime_t tick2ns(hrtime_t, uint_t);

/*
 * Two 64-bit slots, one per interrupt level named in the field.
 * NOTE(review): presumably these hold the interrupt numbers registered
 * for the level-1 and level-10 handlers -- confirm against the code
 * that fills this structure in.
 */
typedef struct {
    uint64_t    cbe_level1_inum;    /* slot for level 1 */
    uint64_t    cbe_level10_inum;   /* slot for level 10 */
} cbe_data_t;

#endif  /* _KERNEL */

#endif  /* _ASM */


/*
 * Interrupt priority levels (PILs) used by the clock code.  These stay
 * plain #defines because this part of the header is outside the _ASM
 * guard and is therefore visible to assembly sources as well.
 */
#define CBE_LOW_PIL     1           /* PIL 1 */
#define CBE_LOCK_PIL    LOCK_LEVEL  /* follows LOCK_LEVEL */
#define CBE_HIGH_PIL    14          /* PIL 14; level raised to by CLOCK_LOCK() */

#define ADJ_SHIFT       4   /* used in get_hrestime and _level10 */

/*
 * Locking strategy for high-resolution timing services
 *
 * We generally construct timestamps from two or more components:
 * a hardware time source and one or more software time sources.
 * These components cannot all be loaded simultaneously, so we need
 * some sort of locking strategy to generate consistent timestamps.
 *
 * To minimize lock contention and cache thrashing we employ the
 * weakest possible synchronization model: writers (rare) serialize
 * on an acquisition-counting mutex, described below; readers (common)
 * execute in parallel with no synchronization at all -- they don't
 * exclude other readers, and they don't even exclude writers.  Instead,
 * readers just examine the writer lock's value before and after loading
 * all the components of a timestamp to detect writer intervention.
 * In the rare case when a writer does intervene, the reader will
 * detect it, discard the timestamp and try again.
 *
 * The writer lock, hres_lock, is a 32-bit integer consisting of an
 * 8-bit lock and a 24-bit acquisition count.  To acquire the lock we
 * set the lock field with ldstub, which sets the low-order 8 bits to
 * 0xff; to clear the lock, we increment it, which simultaneously clears
 * the lock field (0xff --> 0x00) and increments the acquisition count
 * (due to carry into bit 8).  Thus each acquisition transforms hres_lock
 * from N:0 to N:ff, and each release transforms N:ff into (N+1):0.
 *
 * Readers can detect writer intervention by loading hres_lock before
 * and after loading the time components they need; if either lock value
 * contains 0xff in the low-order bits (lock held), or if the lock values
 * are not equal (lock was acquired and released), a writer intervened
 * and the reader must try again.  If the lock values are equal and the
 * low-order 8 bits are clear, the timestamp must be valid.  We can check
 * both of these conditions with a single compare instruction by checking
 * whether (old_hres_lock & ~1) == new_hres_lock, as illustrated by the
 * following table of all possible lock states:
 *
 *  initial & ~1    final       result of compare
 *  ------------    -----       -----------------
 *  now:00      now:00      valid
 *  now:00      now:ff      invalid
 *  now:00      later:00    invalid
 *  now:00      later:ff    invalid
 *  now:fe      now:ff      invalid
 *  now:fe      later:00    invalid
 *  now:fe      later:ff    invalid
 *
 * Implementation considerations:
 *
 * (1) Load buffering.
 *
 * On a CPU that does load buffering we must ensure that the load of
 * hres_lock completes before the load of any timestamp components.
 * This is essential *even on a CPU that does in-order loads* because
 * accessing the hardware time source may not involve a memory reference
 * (e.g. rd %tick).  A convenient way to address this is to clear the
 * lower bit (andn with 1) of the old lock value right away, since this
 * generates a dependency on the load of hres_lock.  We have to do this
 * anyway to perform the lock comparison described above.
 *
 * (2) Out-of-order loads.
 *
 * On a CPU that does out-of-order loads we must ensure that the loads
 * of all timestamp components have completed before we load the final
 * value of hres_lock.  This can be done either by generating load
 * dependencies on the timestamp components or by membar #LoadLoad.
 *
 * (3) Interaction with the high level cyclic handler, hres_tick().
 *
 * One unusual property of hres_lock is that it's acquired in a high
 * level cyclic handler, hres_tick().  Thus, hres_lock must be acquired at
 * CBE_HIGH_PIL or higher to prevent single-CPU deadlock.
 *
 * (4) Cross-calls.
 *
 * If a cross-call happens while one CPU has hres_lock and another is
 * trying to acquire it in the clock interrupt path, the system will
 * deadlock: the first CPU will never release hres_lock since it's
 * waiting to be released from the cross-call, and the cross-call can't
 * complete because the second CPU is spinning on hres_lock with traps
 * disabled.  Thus cross-calls must be blocked while holding hres_lock.
 *
 * Together, (3) and (4) imply that hres_lock should only be acquired
 * at PIL >= max(XCALL_PIL, CBE_HIGH_PIL), or while traps are disabled.
 */
/*
 * Byte offset of the 8-bit lock field within the 32-bit hres_lock word.
 * The lock field is the low-order byte of the word (see the locking
 * strategy comment above).
 * NOTE(review): offset 3 presumably reflects SPARC's big-endian byte
 * order -- confirm before reusing this value anywhere else.
 */
#define HRES_LOCK_OFFSET 3

/*
 * CLOCK_LOCK(oldsplp): acquire the lock byte of hres_lock through
 * lock_set_spl(), raising to the spl that corresponds to CBE_HIGH_PIL.
 * 'oldsplp' is handed to lock_set_spl() unchanged; the value it stores
 * there is what the caller later passes to CLOCK_UNLOCK().
 */
#define CLOCK_LOCK(oldsplp) \
    lock_set_spl((lock_t *)&hres_lock + HRES_LOCK_OFFSET, \
        ipltospl(CBE_HIGH_PIL), oldsplp)

/*
 * CLOCK_UNLOCK(spl): release hres_lock and restore the interrupt level.
 *
 * Incrementing hres_lock clears the lock byte and bumps the acquisition
 * count in a single store (N:ff becomes (N+1):0); the membar is issued
 * ahead of that increment.  'spl' is then passed to splx() and the
 * release is reported to lockstat.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement: "if (cond) CLOCK_UNLOCK(s); else ..." compiles,
 * and an unbraced "if" guards the whole release rather than only the
 * membar.  Invoke it with a trailing semicolon.
 */
#define CLOCK_UNLOCK(spl)   \
    do {                \
        membar_ldst_stst(); \
        hres_lock++;        \
        splx(spl);      \
        LOCKSTAT_RECORD0(LS_CLOCK_UNLOCK_RELEASE,   \
            (lock_t *)&hres_lock + HRES_LOCK_OFFSET);   \
    /*CONSTCOND*/           \
    } while (0)

/*
 * NATIVE_TIME_TO_NSEC_SCALE is called with NSEC_SHIFT to convert hi-res
 * timestamps into nanoseconds. On systems that have a %stick register,
 * hi-res timestamps are in %stick units. On systems that do not have a
 * %stick register, hi-res timestamps are in %tick units.
 *
 * NATIVE_TIME_TO_NSEC_SCALE is called with TICK_NSEC_SHIFT to convert from
 * %tick units to nanoseconds on all implementations whether %stick is
 * available or not.
 */

/*
 * Pre-scale shift handed to NATIVE_TIME_TO_NSEC_SCALE when converting
 * %tick units to nanoseconds.  A shift of 4 relies on the CPU %tick
 * frequency being at least 62.5 MHz.
 */

#define TICK_NSEC_SHIFT 4

/*
 * Convert hi-res native time (V9's %tick in our case) into nanoseconds.
 *
 * The challenge is to multiply a %tick value by (NANOSEC / sys_tick_freq)
 * without using floating point and without overflowing 64-bit integers.
 * We assume that all sun4u systems will have a 16 nsec or better clock
 * (i.e. at least 62.5 MHz), which means that (ticks << 4) is never
 * less than the equivalent number of nanoseconds, so converting from
 * (ticks << 4) to nsec requires multiplication by a rational number,
 * R, between 0 and 1.
 * To avoid floating-point we precompute (R * 2^32) during boot and
 * stash this away in nsec_scale.  Thus we can compute (tick * R) as
 * (tick * nsec_scale) >> 32, which is accurate to about 1 part per billion.
 *
 * To avoid 64-bit overflow when multiplying (tick << 4) by nsec_scale,
 * we split (tick << 4) into its high and low 32-bit pieces, H and L,
 * multiply each piece separately, and add up the relevant bits of the
 * partial products.  Putting it all together we have:
 *
 * nsec = (tick << 4) * R
 *  = ((tick << 4) * nsec_scale) >> 32
 *  = (((H << 32) + L) * nsec_scale) >> 32
 *  = (H * nsec_scale) + ((L * nsec_scale) >> 32)
 *
 * The last line is the computation we actually perform: it requires no
 * floating point and all intermediate results fit in 64-bit registers.
 *
 * Note that we require that tick is less than (1 << (64 - NSEC_SHIFT));
 * greater values will result in overflow and misbehavior (not that this
 * is a serious problem; (1 << (64 - NSEC_SHIFT)) nanoseconds is over
 * thirty-six years).  Nonetheless, clients may wish to be aware of this
 * limitation; NATIVE_TIME_MAX() returns this maximum native time.
 *
 * We provide two versions of this macro: a "full-service" version that
 * just converts ticks to nanoseconds and a higher-performance version that
 * expects the scaling factor nsec_scale as its second argument (so that
 * callers can distance the load of nsec_scale from its use).  Note that
 * we take a fast path if we determine the ticks to be less than 32 bits
 * (as it often is for the delta between %tick values for successive
 * firings of the hres_tick() cyclic).
 *
 * Note that in the 32-bit path we don't even bother clearing NPT.
 * We get away with this by making hardclk.c ensure that nsec_scale
 * is even, so we can take advantage of the associativity of modular
 * arithmetic: multiplying %tick by any even number, say 2*n, is
 * equivalent to multiplying %tick by 2, then by n.  Multiplication
 * by 2 is equivalent to shifting left by one, which clears NPT.
 *
 * Finally, note that the macros use the labels "6:" and "7:"; these
 * labels must not be used across an invocation of either macro.
 */
/*
 * NATIVE_TIME_TO_NSEC_SCALE(out, scr1, scr2, shift)
 *
 *  out   - on entry, the native tick count; on exit, nanoseconds
 *  scr1  - on entry, the scaling factor (nsec_scale); overwritten on
 *          the 64-bit path, left untouched on the 32-bit fast path
 *  scr2  - scratch; overwritten on both paths
 *  shift - pre-scale shift count (NSEC_SHIFT or TICK_NSEC_SHIFT)
 *
 * The per-instruction comments below write "tick<<4"; the shift that is
 * actually applied is the 'shift' argument.  The instruction written
 * after each branch occupies that branch's delay slot, so the order of
 * the instructions must not be changed.
 */
#define NATIVE_TIME_TO_NSEC_SCALE(out, scr1, scr2, shift)       \
    srlx    out, 32, scr2;      /* check high 32 bits */    \
/* CSTYLED */                               \
    brz,a,pt scr2, 6f;      /* if clear, 32-bit fast path */\
    mulx    out, scr1, out;     /* delay: 32-bit fast path */   \
    sllx    out, shift, out;    /* clear NPT and pre-scale */   \
    srlx    out, 32, scr2;      /* scr2 = hi32(tick<<4) = H */  \
    mulx    scr2, scr1, scr2;   /* scr2 = (H*F) */      \
    srl out, 0, out;        /* out = lo32(tick<<4) = L */   \
    mulx    out, scr1, scr1;    /* scr1 = (L*F) */      \
    srlx    scr1, 32, scr1;     /* scr1 = (L*F) >> 32 */    \
    ba  7f;         /* branch over 32-bit path */   \
    add scr1, scr2, out;    /* out = (H*F) + ((L*F) >> 32) */\
6:                                  \
    srlx    out, 32 - shift, out;                   \
7:

/*
 * NATIVE_TIME_TO_NSEC(out, scr1, scr2): "full-service" conversion of the
 * native tick count in 'out' to nanoseconds, in place.  The scaling
 * factor is loaded from nsec_scale into scr1 and the work is handed to
 * NATIVE_TIME_TO_NSEC_SCALE with NSEC_SHIFT; scr1 and scr2 are scratch.
 *
 * nsec_scale is a 32-bit uint_t, so it is fetched with lduw, the same
 * mnemonic GET_HRESTIME and GET_HRTIME use for it.
 *
 * Labels "6:" and "7:" are consumed by the nested macro.
 */
#define NATIVE_TIME_TO_NSEC(out, scr1, scr2)                \
    sethi   %hi(nsec_scale), scr1;  /* load scaling factor */   \
    lduw    [scr1 + %lo(nsec_scale)], scr1;             \
    NATIVE_TIME_TO_NSEC_SCALE(out, scr1, scr2, NSEC_SHIFT);

/*
 * NATIVE_TIME_MAX(out): set 'out' to all ones shifted right (logically)
 * by NSEC_SHIFT, i.e. (1 << (64 - NSEC_SHIFT)) - 1, the largest native
 * time that stays below the overflow limit described above.
 * Unlike the other macros here, the expansion has no trailing ';'.
 */
#define NATIVE_TIME_MAX(out)                        \
    mov -1, out;                        \
    srlx    out, NSEC_SHIFT, out


/*
 * The following macros are only for use in the cpu module.
 */
#if defined(CPU_MODULE)

/*
 * NSEC_SHIFT and VTRACE_SHIFT constants are defined in
 * <sys/machclock.h> file.
 */


/*
 * NOTE: the macros below assume that the various time-related variables
 * (hrestime, hrestime_adj, hres_last_tick, timedelta, nsec_scale, etc)
 * are all stored together on a 64-byte boundary.  The primary motivation
 * is cache performance, but we also take advantage of a convenient side
 * effect: these variables all have the same high 22 address bits, so only
 * one sethi is needed to access them all.
 */

/*
 * GET_HRESTIME() returns the value of hrestime, hrestime_adj and the
 * number of nanoseconds since the last clock tick ('nslt').  It also
 * sets 'nano' to the value NANOSEC (one billion).
 *
 * Arguments (all are registers):
 *  hrestsec  - out: first long of hrestime (hrestime.sec)
 *  hrestnsec - out: the long at hrestime + CLONGSIZE (presumably the
 *              nanoseconds field -- confirm against hrestime's type)
 *  adj       - out: hrestime_adj; holds the current native time while
 *              the tick delta is being computed
 *  nslt      - out: nanoseconds since hres_last_tick; a negative tick
 *              delta is replaced by zero before conversion
 *  nano      - out: NANOSEC; holds nsec_scale during the conversion
 *  scr       - scratch: address base, then the second hres_lock value
 *  hrlock    - scratch: the first hres_lock value with bit 0 cleared
 *  gnt1/gnt2 - scratch for GET_NATIVE_TIME; gnt1 is reused as scratch
 *              by NATIVE_TIME_TO_NSEC_SCALE
 *
 * The whole sequence restarts at "5:" whenever the second hres_lock
 * value differs from the first one with bit 0 cleared (see the locking
 * strategy described earlier in this file).  The final 'or' is written
 * after the branch and so occupies its delay slot.
 *
 * This macro assumes that all registers are globals or outs so they can
 * safely contain 64-bit data, and that it's safe to use the label "5:".
 * Further, this macro calls the NATIVE_TIME_TO_NSEC_SCALE which in turn
 * uses the labels "6:" and "7:"; labels "5:", "6:" and "7:" must not
 * be used across invocations of this macro.
 */
#define GET_HRESTIME(hrestsec, hrestnsec, adj, nslt, nano, scr, hrlock, \
    gnt1, gnt2) \
5:  sethi   %hi(hres_lock), scr;                    \
    lduw    [scr + %lo(hres_lock)], hrlock; /* load clock lock */   \
    lduw    [scr + %lo(nsec_scale)], nano;  /* tick-to-ns factor */ \
    andn    hrlock, 1, hrlock;      /* see comments above! */   \
    ldx [scr + %lo(hres_last_tick)], nslt;          \
    ldn [scr + %lo(hrestime)], hrestsec; /* load hrestime.sec */\
    add scr, %lo(hrestime), hrestnsec;              \
    ldn [hrestnsec + CLONGSIZE], hrestnsec;         \
    GET_NATIVE_TIME(adj, gnt1, gnt2);   /* get current %tick */ \
    subcc   adj, nslt, nslt; /* nslt = ticks since last clockint */ \
    movneg  %xcc, %g0, nslt; /* ignore neg delta from tick skew */  \
    ldx [scr + %lo(hrestime_adj)], adj; /* load hrestime_adj */ \
    /* membar #LoadLoad; (see comment (2) above) */         \
    lduw    [scr + %lo(hres_lock)], scr; /* load clock lock */  \
    NATIVE_TIME_TO_NSEC_SCALE(nslt, nano, gnt1, NSEC_SHIFT);    \
    sethi   %hi(NANOSEC), nano;                 \
    xor hrlock, scr, scr;                   \
/* CSTYLED */                               \
    brnz,pn scr, 5b;                        \
    or  nano, %lo(NANOSEC), nano;

/*
 * Similar to GET_HRESTIME(), but returns the current gethrtime() value
 * in 'base'.
 *
 * Arguments (all are registers):
 *  base      - out: hrtime_base plus the nanoseconds since the last tick
 *  now       - scratch: the current native time
 *  nslt      - scratch: ticks, then nanoseconds, since hres_last_tick;
 *              a negative tick delta is replaced by zero
 *  scale     - scratch: nsec_scale
 *  scr       - scratch: address base, then the second hres_lock value
 *  hrlock    - scratch: the first hres_lock value with bit 0 cleared
 *  gnt1/gnt2 - scratch for GET_NATIVE_TIME; gnt1 is reused as scratch
 *              by NATIVE_TIME_TO_NSEC_SCALE
 *
 * Both reads of the 32-bit hres_lock use lduw, matching GET_HRESTIME().
 * The sequence restarts at "5:" whenever the second value differs from
 * the first one with bit 0 cleared.  The final 'add' is written after
 * the branch and so occupies its delay slot.
 *
 * Labels "5:", "6:" and "7:" must not be used across an invocation.
 */
#define GET_HRTIME(base, now, nslt, scale, scr, hrlock, gnt1, gnt2) \
5:  sethi   %hi(hres_lock), scr;                    \
    lduw    [scr + %lo(hres_lock)], hrlock; /* load clock lock */   \
    lduw    [scr + %lo(nsec_scale)], scale; /* tick-to-ns factor */ \
    andn    hrlock, 1, hrlock;      /* see comments above! */   \
    ldx [scr + %lo(hres_last_tick)], nslt;          \
    ldx [scr + %lo(hrtime_base)], base; /* load hrtime_base */  \
    GET_NATIVE_TIME(now, gnt1, gnt2);   /* get current %tick */ \
    subcc   now, nslt, nslt; /* nslt = ticks since last clockint */ \
    movneg  %xcc, %g0, nslt; /* ignore neg delta from tick skew */  \
    /* membar #LoadLoad; (see comment (2) above) */         \
    lduw    [scr + %lo(hres_lock)], scr; /* load clock lock */  \
    NATIVE_TIME_TO_NSEC_SCALE(nslt, scale, gnt1, NSEC_SHIFT);   \
    xor hrlock, scr, scr;                   \
/* CSTYLED */                               \
    brnz,pn scr, 5b;                        \
    add base, nslt, base;

/*
 * Maximum-performance timestamp for kernel tracing.  We don't bother
 * clearing NPT because vtrace expresses everything in 32-bit deltas,
 * so only the low-order 32 bits matter.  We do shift down a few bits,
 * however, so that the trace framework doesn't emit a ridiculous number
 * of 32_bit_elapsed_time records (trace points are more expensive when
 * the time since the last trace point doesn't fit in a 16-bit delta).
 * We currently shift by 4 (divide by 16) on the grounds that (1) there's
 * no point making the timing finer-grained than the trace point latency,
 * which exceeds 16 cycles; and (2) the cost and probe effect of many
 * 32-bit time records far exceeds the cost of the 'srlx' instruction.
 *
 * On exit 'out' holds the native time shifted right by VTRACE_SHIFT;
 * scr1 and scr2 are only passed through to GET_NATIVE_TIME as scratch.
 */
#define GET_VTRACE_TIME(out, scr1, scr2)                \
    GET_NATIVE_TIME(out, scr1, scr2);   /* get current %tick */ \
    srlx    out, VTRACE_SHIFT, out;

/*
 * Full 64-bit version for those truly rare occasions when you need it.
 * Currently this is only needed to generate the TR_START_TIME record.
 *
 * 'add out, out, out' doubles the value, i.e. shifts it left by one and
 * drops the top bit; the following shift is VTRACE_SHIFT + 1 to undo
 * that doubling.  The net result is the native time with its top bit
 * cleared, shifted right by VTRACE_SHIFT.  scr1 and scr2 are only
 * passed through to GET_NATIVE_TIME as scratch.
 */
#define GET_VTRACE_TIME_64(out, scr1, scr2)             \
    GET_NATIVE_TIME(out, scr1, scr2);   /* get current %tick */ \
    add out, out, out;                      \
    srlx    out, VTRACE_SHIFT + 1, out;

/*
 * Return the rate at which the vtrace clock runs: sys_tick_freq shifted
 * right by VTRACE_SHIFT, matching the shift GET_VTRACE_TIME applies to
 * its timestamps.  The result is left in 'out'; scr1 and scr2 are
 * accepted but not used.
 */
#define GET_VTRACE_FREQUENCY(out, scr1, scr2)               \
    sethi   %hi(sys_tick_freq), out;                \
    ldx [out + %lo(sys_tick_freq)], out;            \
    srlx    out, VTRACE_SHIFT, out;

#endif /* CPU_MODULE */

#ifdef  __cplusplus
}
#endif

#endif  /* !_SYS_CLOCK_H */