/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2004 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* STREAMS Buffering module
*
* This streams module collects incoming messages from modules below
* it on the stream and buffers them up into a smaller number of
* aggregated messages. Its main purpose is to reduce overhead by
* cutting down on the number of read (or getmsg) calls its client
* user process makes.
* - only M_DATA is buffered.
 * - multithreading assumes the module is configured as D_MTQPAIR.
 * - packets are lost only if flag SB_NO_HEADER is clear and header
 *   buffer allocation fails (drops due to flow control are governed
 *   by flag SB_NO_DROPS; see FLAGS below).
 * - in-order message transmission; this is enforced for all messages
 *   other than high-priority messages.
* - zero length messages on the read side are not passed up the
* stream but used internally for synchronization.
* FLAGS:
* - SB_NO_PROTO_CVT - no conversion of M_PROTO messages to M_DATA.
 * (conversion is the default for backwards compatibility,
 * hence the negative logic).
* - SB_NO_HEADER - no headers in buffered data.
 * (adding headers is the default for backwards compatibility,
 * hence the negative logic).
* - SB_DEFER_CHUNK - provides improved response time in question-answer
* applications. Buffering is not enabled until the second message
* is received on the read side within the sb_ticks interval.
* This option will often be used in combination with flag SB_SEND_ON_WRITE.
* - SB_SEND_ON_WRITE - a write message results in any pending buffered read
* data being immediately sent upstream.
 * - SB_NO_DROPS - bufmod behaves transparently in flow control and
 *   propagates the blocked flow condition downstream. If this flag is
 *   clear (the default), messages are dropped if the upstream flow is
 *   blocked.
*/
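/*
 * Usage sketch (userland, not part of this module): a packet-capture
 * client might configure bufmod roughly as follows. The descriptor fd,
 * the 50 ms timeout and the 16 KB chunk size are illustrative values
 * only; the SBIOC* ioctls are declared in <sys/bufmod.h>. Error
 * handling is elided.
 *
 *	struct timeval timeout;
 *	uint_t chunksz = 16384;
 *
 *	timeout.tv_sec = 0;
 *	timeout.tv_usec = 50000;
 *	if (ioctl(fd, I_PUSH, "bufmod") < 0 ||
 *	    ioctl(fd, SBIOCSTIME, &timeout) < 0 ||
 *	    ioctl(fd, SBIOCSCHUNK, &chunksz) < 0)
 *		(handle the error)
 */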
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/stropts.h>
#include <sys/time.h>
#include <sys/stream.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/kmem.h>
#include <sys/strsun.h>
#include <sys/bufmod.h>
#include <sys/modctl.h>
#include <sys/isa_defs.h>
/*
* Per-Stream state information.
*
* If sb_ticks is negative, we don't deliver chunks until they're
* full. If it's zero, we deliver every packet as it arrives. (In
* this case we force sb_chunk to zero, to make the implementation
* easier.) Otherwise, sb_ticks gives the number of ticks in a
 * buffering interval. The interval begins when a read-side data
 * message is received and no timeout is active. If sb_snap is
 * zero, messages are not truncated.
*/
struct sb {
queue_t *sb_rq; /* our rq */
mblk_t *sb_mp; /* partial chunk */
mblk_t *sb_head; /* pre-allocated space for the next header */
mblk_t *sb_tail; /* first mblk of last message appended */
uint_t sb_mlen; /* sb_mp length */
uint_t sb_mcount; /* input msg count in sb_mp */
uint_t sb_chunk; /* max chunk size */
clock_t sb_ticks; /* timeout interval */
timeout_id_t sb_timeoutid; /* qtimeout() id */
uint_t sb_drops; /* cumulative # discarded msgs */
uint_t sb_snap; /* snapshot length */
uint_t sb_flags; /* flags field */
uint_t sb_state; /* state variable */
};
/*
* Function prototypes.
*/
static int sbopen(queue_t *, dev_t *, int, int, cred_t *);
static int sbclose(queue_t *, int, cred_t *);
static void sbwput(queue_t *, mblk_t *);
static void sbrput(queue_t *, mblk_t *);
static void sbrsrv(queue_t *);
static void sbioctl(queue_t *, mblk_t *);
static void sbaddmsg(queue_t *, mblk_t *);
static void sbtick(void *);
static void sbclosechunk(struct sb *);
static void sbsendit(queue_t *, mblk_t *);
static struct module_info sb_minfo = {
21, /* mi_idnum */
"bufmod", /* mi_idname */
0, /* mi_minpsz */
INFPSZ, /* mi_maxpsz */
1, /* mi_hiwat */
0 /* mi_lowat */
};
static struct qinit sb_rinit = {
(int (*)())sbrput, /* qi_putp */
(int (*)())sbrsrv, /* qi_srvp */
sbopen, /* qi_qopen */
sbclose, /* qi_qclose */
NULL, /* qi_qadmin */
&sb_minfo, /* qi_minfo */
NULL /* qi_mstat */
};
static struct qinit sb_winit = {
(int (*)())sbwput, /* qi_putp */
NULL, /* qi_srvp */
NULL, /* qi_qopen */
NULL, /* qi_qclose */
NULL, /* qi_qadmin */
&sb_minfo, /* qi_minfo */
NULL /* qi_mstat */
};
static struct streamtab sb_info = {
&sb_rinit, /* st_rdinit */
&sb_winit, /* st_wrinit */
NULL, /* st_muxrinit */
NULL /* st_muxwinit */
};
/*
* This is the loadable module wrapper.
*/
static struct fmodsw fsw = {
"bufmod",
&sb_info,
D_MTQPAIR | D_MP
};
/*
* Module linkage information for the kernel.
*/
static struct modlstrmod modlstrmod = {
&mod_strmodops, "streams buffer mod", &fsw
};
static struct modlinkage modlinkage = {
MODREV_1, &modlstrmod, NULL
};
int
_init(void)
{
return (mod_install(&modlinkage));
}
int
_fini(void)
{
return (mod_remove(&modlinkage));
}
int
_info(struct modinfo *modinfop)
{
return (mod_info(&modlinkage, modinfop));
}
/* ARGSUSED */
static int
sbopen(queue_t *rq, dev_t *dev, int oflag, int sflag, cred_t *crp)
{
struct sb *sbp;
ASSERT(rq);
if (sflag != MODOPEN)
return (EINVAL);
if (rq->q_ptr)
return (0);
/*
* Allocate and initialize per-Stream structure.
*/
sbp = kmem_alloc(sizeof (struct sb), KM_SLEEP);
sbp->sb_rq = rq;
sbp->sb_ticks = -1;
sbp->sb_chunk = SB_DFLT_CHUNK;
sbp->sb_tail = sbp->sb_mp = sbp->sb_head = NULL;
sbp->sb_mlen = 0;
sbp->sb_mcount = 0;
sbp->sb_timeoutid = 0;
sbp->sb_drops = 0;
sbp->sb_snap = 0;
sbp->sb_flags = 0;
sbp->sb_state = 0;
rq->q_ptr = WR(rq)->q_ptr = sbp;
qprocson(rq);
return (0);
}
/* ARGSUSED1 */
static int
sbclose(queue_t *rq, int flag, cred_t *credp)
{
struct sb *sbp = (struct sb *)rq->q_ptr;
ASSERT(sbp);
qprocsoff(rq);
/*
* Cancel an outstanding timeout
*/
if (sbp->sb_timeoutid != 0) {
(void) quntimeout(rq, sbp->sb_timeoutid);
sbp->sb_timeoutid = 0;
}
/*
* Free the current chunk.
*/
if (sbp->sb_mp) {
freemsg(sbp->sb_mp);
sbp->sb_tail = sbp->sb_mp = sbp->sb_head = NULL;
sbp->sb_mlen = 0;
}
/*
* Free the per-Stream structure.
*/
kmem_free((caddr_t)sbp, sizeof (struct sb));
rq->q_ptr = WR(rq)->q_ptr = NULL;
return (0);
}
/*
 * The correction factor is introduced to compensate for whatever
 * assumptions the modules below have made about how much traffic is
 * flowing through the stream, and for the fact that bufmod may be
 * truncating messages to the sb_snap length.
*/
#define	SNIT_HIWAT(msgsize, fudge)	((4 * (msgsize) * (fudge)) + 512)
#define	SNIT_LOWAT(msgsize, fudge)	((2 * (msgsize) * (fudge)) + 256)
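/*
 * For example, with a chunk size of 8192 bytes and a fudge factor of 1,
 * SNIT_HIWAT yields 4 * 8192 * 1 + 512 = 33280 bytes and SNIT_LOWAT
 * yields 2 * 8192 * 1 + 256 = 16640 bytes.
 */
/*
 * Handle M_IOCDATA messages for the SBIOC* ioctls, i.e. the second
 * stage of a transparent ioctl, after the stream head has completed
 * the copyin or copyout of the user's argument.
 */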
static void
sbioc(queue_t *wq, mblk_t *mp)
{
struct iocblk *iocp;
struct sb *sbp = (struct sb *)wq->q_ptr;
clock_t ticks;
mblk_t *mop;
iocp = (struct iocblk *)mp->b_rptr;
switch (iocp->ioc_cmd) {
case SBIOCGCHUNK:
case SBIOCGSNAP:
case SBIOCGFLAGS:
case SBIOCGTIME:
miocack(wq, mp, 0, 0);
return;
case SBIOCSTIME:
#ifdef _SYSCALL32_IMPL
if ((iocp->ioc_flag & IOC_MODELS) != IOC_NATIVE) {
struct timeval32 *t32;
t32 = (struct timeval32 *)mp->b_cont->b_rptr;
if (t32->tv_sec < 0 || t32->tv_usec < 0) {
miocnak(wq, mp, 0, EINVAL);
break;
}
ticks = TIMEVAL_TO_TICK(t32);
} else
#endif /* _SYSCALL32_IMPL */
{
struct timeval *tb;
tb = (struct timeval *)mp->b_cont->b_rptr;
if (tb->tv_sec < 0 || tb->tv_usec < 0) {
miocnak(wq, mp, 0, EINVAL);
break;
}
ticks = TIMEVAL_TO_TICK(tb);
}
sbp->sb_ticks = ticks;
if (ticks == 0)
sbp->sb_chunk = 0;
miocack(wq, mp, 0, 0);
sbclosechunk(sbp);
return;
case SBIOCSCHUNK:
/*
		 * Set up hi/lo water marks on the stream head read queue.
		 * We are unlikely to run out of resources here; fix at a
		 * later date.
*/
if ((mop = allocb(sizeof (struct stroptions),
BPRI_MED)) != NULL) {
struct stroptions *sop;
uint_t chunk;
chunk = *(uint_t *)mp->b_cont->b_rptr;
mop->b_datap->db_type = M_SETOPTS;
mop->b_wptr += sizeof (struct stroptions);
sop = (struct stroptions *)mop->b_rptr;
sop->so_flags = SO_HIWAT | SO_LOWAT;
sop->so_hiwat = SNIT_HIWAT(chunk, 1);
sop->so_lowat = SNIT_LOWAT(chunk, 1);
qreply(wq, mop);
}
sbp->sb_chunk = *(uint_t *)mp->b_cont->b_rptr;
miocack(wq, mp, 0, 0);
sbclosechunk(sbp);
return;
case SBIOCSFLAGS:
sbp->sb_flags = *(uint_t *)mp->b_cont->b_rptr;
miocack(wq, mp, 0, 0);
return;
case SBIOCSSNAP:
/*
		 * If chunking, don't worry about the effect of snapshot
		 * truncation on stream head flow control, since it has
		 * relatively little bearing on the data rate onto the
		 * stream head.
*/
if (!sbp->sb_chunk) {
/*
			 * Set up hi/lo water marks on the stream head read
			 * queue. We are unlikely to run out of resources
			 * here; fix at a later date.
*/
if ((mop = allocb(sizeof (struct stroptions),
BPRI_MED)) != NULL) {
struct stroptions *sop;
uint_t snap;
int fudge;
snap = *(uint_t *)mp->b_cont->b_rptr;
mop->b_datap->db_type = M_SETOPTS;
mop->b_wptr += sizeof (struct stroptions);
sop = (struct stroptions *)mop->b_rptr;
sop->so_flags = SO_HIWAT | SO_LOWAT;
fudge = snap <= 100 ? 4 :
snap <= 400 ? 2 :
1;
sop->so_hiwat = SNIT_HIWAT(snap, fudge);
sop->so_lowat = SNIT_LOWAT(snap, fudge);
qreply(wq, mop);
}
}
sbp->sb_snap = *(uint_t *)mp->b_cont->b_rptr;
miocack(wq, mp, 0, 0);
return;
default:
ASSERT(0);
return;
}
}
/*
* Write-side put procedure. Its main task is to detect ioctls
* for manipulating the buffering state and hand them to sbioctl.
* Other message types are passed on through.
*/
static void
sbwput(queue_t *wq, mblk_t *mp)
{
struct sb *sbp = (struct sb *)wq->q_ptr;
struct copyresp *resp;
if (sbp->sb_flags & SB_SEND_ON_WRITE)
sbclosechunk(sbp);
switch (mp->b_datap->db_type) {
case M_IOCTL:
sbioctl(wq, mp);
break;
case M_IOCDATA:
resp = (struct copyresp *)mp->b_rptr;
if (resp->cp_rval) {
/*
* Just free message on failure.
*/
freemsg(mp);
break;
}
switch (resp->cp_cmd) {
case SBIOCSTIME:
case SBIOCSCHUNK:
case SBIOCSFLAGS:
case SBIOCSSNAP:
case SBIOCGTIME:
case SBIOCGCHUNK:
case SBIOCGSNAP:
case SBIOCGFLAGS:
sbioc(wq, mp);
break;
default:
putnext(wq, mp);
break;
}
break;
default:
putnext(wq, mp);
break;
}
}
/*
* Read-side put procedure. It's responsible for buffering up incoming
* messages and grouping them into aggregates according to the current
* buffering parameters.
*/
static void
sbrput(queue_t *rq, mblk_t *mp)
{
struct sb *sbp = (struct sb *)rq->q_ptr;
ASSERT(sbp);
switch (mp->b_datap->db_type) {
case M_PROTO:
if (sbp->sb_flags & SB_NO_PROTO_CVT) {
sbclosechunk(sbp);
sbsendit(rq, mp);
break;
} else {
/*
* Convert M_PROTO to M_DATA.
*/
mp->b_datap->db_type = M_DATA;
}
/* FALLTHRU */
case M_DATA:
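		/*
		 * With SB_DEFER_CHUNK set, the first message received
		 * after an idle period (SB_FRCVD clear) is passed up
		 * immediately and chunking starts with the second
		 * message; this keeps response time low for
		 * question-answer traffic.
		 */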
if ((sbp->sb_flags & SB_DEFER_CHUNK) &&
!(sbp->sb_state & SB_FRCVD)) {
sbclosechunk(sbp);
sbsendit(rq, mp);
sbp->sb_state |= SB_FRCVD;
} else
sbaddmsg(rq, mp);
if ((sbp->sb_ticks > 0) && !(sbp->sb_timeoutid))
sbp->sb_timeoutid = qtimeout(sbp->sb_rq, sbtick,
sbp, sbp->sb_ticks);
break;
case M_FLUSH:
if (*mp->b_rptr & FLUSHR) {
/*
* Reset timeout, flush the chunk currently in
* progress, and start a new chunk.
*/
if (sbp->sb_timeoutid) {
(void) quntimeout(sbp->sb_rq,
sbp->sb_timeoutid);
sbp->sb_timeoutid = 0;
}
if (sbp->sb_mp) {
freemsg(sbp->sb_mp);
sbp->sb_tail = sbp->sb_mp = sbp->sb_head = NULL;
sbp->sb_mlen = 0;
sbp->sb_mcount = 0;
}
flushq(rq, FLUSHALL);
}
putnext(rq, mp);
break;
case M_CTL:
/*
* Zero-length M_CTL means our timeout() popped.
*/
if (MBLKL(mp) == 0) {
freemsg(mp);
sbclosechunk(sbp);
} else {
sbclosechunk(sbp);
sbsendit(rq, mp);
}
break;
default:
if (mp->b_datap->db_type <= QPCTL) {
sbclosechunk(sbp);
sbsendit(rq, mp);
} else {
/* Note: out of band */
putnext(rq, mp);
}
break;
}
}
/*
 * Read-side service procedure. Drains queued messages, honoring
 * downstream flow control for all but high-priority messages.
*/
/* ARGSUSED */
static void
sbrsrv(queue_t *rq)
{
mblk_t *mp;
/*
	 * High-priority messages shouldn't get here, but if one
	 * does, jam it through to avoid an infinite loop.
*/
while ((mp = getq(rq)) != NULL) {
if (!canputnext(rq) && (mp->b_datap->db_type <= QPCTL)) {
			/* should only get here if SB_NO_DROPS */
(void) putbq(rq, mp);
return;
}
putnext(rq, mp);
}
}
/*
* Handle write-side M_IOCTL messages.
*/
static void
sbioctl(queue_t *wq, mblk_t *mp)
{
struct sb *sbp = (struct sb *)wq->q_ptr;
struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
struct timeval *t;
clock_t ticks;
mblk_t *mop;
int transparent = iocp->ioc_count;
mblk_t *datamp;
int error;
switch (iocp->ioc_cmd) {
case SBIOCSTIME:
if (iocp->ioc_count == TRANSPARENT) {
#ifdef _SYSCALL32_IMPL
if ((iocp->ioc_flag & IOC_MODELS) != IOC_NATIVE) {
mcopyin(mp, NULL, sizeof (struct timeval32),
NULL);
} else
#endif /* _SYSCALL32_IMPL */
{
mcopyin(mp, NULL, sizeof (*t), NULL);
}
qreply(wq, mp);
} else {
/*
* Verify argument length.
*/
#ifdef _SYSCALL32_IMPL
if ((iocp->ioc_flag & IOC_MODELS) != IOC_NATIVE) {
struct timeval32 *t32;
error = miocpullup(mp,
sizeof (struct timeval32));
if (error != 0) {
miocnak(wq, mp, 0, error);
break;
}
t32 = (struct timeval32 *)mp->b_cont->b_rptr;
if (t32->tv_sec < 0 || t32->tv_usec < 0) {
miocnak(wq, mp, 0, EINVAL);
break;
}
ticks = TIMEVAL_TO_TICK(t32);
} else
#endif /* _SYSCALL32_IMPL */
{
error = miocpullup(mp, sizeof (struct timeval));
if (error != 0) {
miocnak(wq, mp, 0, error);
break;
}
t = (struct timeval *)mp->b_cont->b_rptr;
if (t->tv_sec < 0 || t->tv_usec < 0) {
miocnak(wq, mp, 0, EINVAL);
break;
}
ticks = TIMEVAL_TO_TICK(t);
}
sbp->sb_ticks = ticks;
if (ticks == 0)
sbp->sb_chunk = 0;
miocack(wq, mp, 0, 0);
sbclosechunk(sbp);
}
break;
case SBIOCGTIME: {
struct timeval *t;
/*
* Verify argument length.
*/
if (transparent != TRANSPARENT) {
#ifdef _SYSCALL32_IMPL
if ((iocp->ioc_flag & IOC_MODELS) != IOC_NATIVE) {
error = miocpullup(mp,
sizeof (struct timeval32));
if (error != 0) {
miocnak(wq, mp, 0, error);
break;
}
} else
#endif /* _SYSCALL32_IMPL */
error = miocpullup(mp, sizeof (struct timeval));
if (error != 0) {
miocnak(wq, mp, 0, error);
break;
}
}
/*
* If infinite timeout, return range error
* for the ioctl.
*/
if (sbp->sb_ticks < 0) {
miocnak(wq, mp, 0, ERANGE);
break;
}
#ifdef _SYSCALL32_IMPL
if ((iocp->ioc_flag & IOC_MODELS) != IOC_NATIVE) {
struct timeval32 *t32;
if (transparent == TRANSPARENT) {
datamp = allocb(sizeof (*t32), BPRI_MED);
if (datamp == NULL) {
miocnak(wq, mp, 0, EAGAIN);
break;
}
mcopyout(mp, NULL, sizeof (*t32), NULL, datamp);
}
t32 = (struct timeval32 *)mp->b_cont->b_rptr;
TICK_TO_TIMEVAL32(sbp->sb_ticks, t32);
if (transparent == TRANSPARENT)
qreply(wq, mp);
else
miocack(wq, mp, sizeof (*t32), 0);
} else
#endif /* _SYSCALL32_IMPL */
{
if (transparent == TRANSPARENT) {
datamp = allocb(sizeof (*t), BPRI_MED);
if (datamp == NULL) {
miocnak(wq, mp, 0, EAGAIN);
break;
}
mcopyout(mp, NULL, sizeof (*t), NULL, datamp);
}
t = (struct timeval *)mp->b_cont->b_rptr;
TICK_TO_TIMEVAL(sbp->sb_ticks, t);
if (transparent == TRANSPARENT)
qreply(wq, mp);
else
miocack(wq, mp, sizeof (*t), 0);
}
break;
}
case SBIOCCTIME:
sbp->sb_ticks = -1;
miocack(wq, mp, 0, 0);
break;
case SBIOCSCHUNK:
if (iocp->ioc_count == TRANSPARENT) {
mcopyin(mp, NULL, sizeof (uint_t), NULL);
qreply(wq, mp);
} else {
/*
* Verify argument length.
*/
error = miocpullup(mp, sizeof (uint_t));
if (error != 0) {
miocnak(wq, mp, 0, error);
break;
}
/*
			 * Set up hi/lo water marks on the stream head read
			 * queue. We are unlikely to run out of resources
			 * here; fix at a later date.
*/
if ((mop = allocb(sizeof (struct stroptions),
BPRI_MED)) != NULL) {
struct stroptions *sop;
uint_t chunk;
chunk = *(uint_t *)mp->b_cont->b_rptr;
mop->b_datap->db_type = M_SETOPTS;
mop->b_wptr += sizeof (struct stroptions);
sop = (struct stroptions *)mop->b_rptr;
sop->so_flags = SO_HIWAT | SO_LOWAT;
sop->so_hiwat = SNIT_HIWAT(chunk, 1);
sop->so_lowat = SNIT_LOWAT(chunk, 1);
qreply(wq, mop);
}
sbp->sb_chunk = *(uint_t *)mp->b_cont->b_rptr;
miocack(wq, mp, 0, 0);
sbclosechunk(sbp);
}
break;
case SBIOCGCHUNK:
/*
* Verify argument length.
*/
if (transparent != TRANSPARENT) {
error = miocpullup(mp, sizeof (uint_t));
if (error != 0) {
miocnak(wq, mp, 0, error);
break;
}
}
if (transparent == TRANSPARENT) {
datamp = allocb(sizeof (uint_t), BPRI_MED);
if (datamp == NULL) {
miocnak(wq, mp, 0, EAGAIN);
break;
}
mcopyout(mp, NULL, sizeof (uint_t), NULL, datamp);
}
*(uint_t *)mp->b_cont->b_rptr = sbp->sb_chunk;
if (transparent == TRANSPARENT)
qreply(wq, mp);
else
miocack(wq, mp, sizeof (uint_t), 0);
break;
case SBIOCSSNAP:
if (iocp->ioc_count == TRANSPARENT) {
mcopyin(mp, NULL, sizeof (uint_t), NULL);
qreply(wq, mp);
} else {
/*
* Verify argument length.
*/
error = miocpullup(mp, sizeof (uint_t));
if (error != 0) {
miocnak(wq, mp, 0, error);
break;
}
/*
			 * If chunking, don't worry about the effect of
			 * snapshot truncation on stream head flow control,
			 * since it has relatively little bearing on the
			 * data rate onto the stream head.
*/
if (!sbp->sb_chunk) {
/*
				 * Set up hi/lo water marks on the stream
				 * head read queue. We are unlikely to run
				 * out of resources here; fix at a later
				 * date.
*/
if ((mop = allocb(sizeof (struct stroptions),
BPRI_MED)) != NULL) {
struct stroptions *sop;
uint_t snap;
int fudge;
snap = *(uint_t *)mp->b_cont->b_rptr;
mop->b_datap->db_type = M_SETOPTS;
mop->b_wptr += sizeof (*sop);
sop = (struct stroptions *)mop->b_rptr;
sop->so_flags = SO_HIWAT | SO_LOWAT;
fudge = (snap <= 100) ? 4 :
(snap <= 400) ? 2 : 1;
sop->so_hiwat = SNIT_HIWAT(snap, fudge);
sop->so_lowat = SNIT_LOWAT(snap, fudge);
qreply(wq, mop);
}
}
sbp->sb_snap = *(uint_t *)mp->b_cont->b_rptr;
miocack(wq, mp, 0, 0);
}
break;
case SBIOCGSNAP:
/*
* Verify argument length
*/
if (transparent != TRANSPARENT) {
error = miocpullup(mp, sizeof (uint_t));
if (error != 0) {
miocnak(wq, mp, 0, error);
break;
}
}
if (transparent == TRANSPARENT) {
datamp = allocb(sizeof (uint_t), BPRI_MED);
if (datamp == NULL) {
miocnak(wq, mp, 0, EAGAIN);
break;
}
mcopyout(mp, NULL, sizeof (uint_t), NULL, datamp);
}
*(uint_t *)mp->b_cont->b_rptr = sbp->sb_snap;
if (transparent == TRANSPARENT)
qreply(wq, mp);
else
miocack(wq, mp, sizeof (uint_t), 0);
break;
case SBIOCSFLAGS:
/*
* set the flags.
*/
if (iocp->ioc_count == TRANSPARENT) {
mcopyin(mp, NULL, sizeof (uint_t), NULL);
qreply(wq, mp);
} else {
error = miocpullup(mp, sizeof (uint_t));
if (error != 0) {
miocnak(wq, mp, 0, error);
break;
}
sbp->sb_flags = *(uint_t *)mp->b_cont->b_rptr;
miocack(wq, mp, 0, 0);
}
break;
case SBIOCGFLAGS:
/*
* Verify argument length
*/
if (transparent != TRANSPARENT) {
error = miocpullup(mp, sizeof (uint_t));
if (error != 0) {
miocnak(wq, mp, 0, error);
break;
}
}
if (transparent == TRANSPARENT) {
datamp = allocb(sizeof (uint_t), BPRI_MED);
if (datamp == NULL) {
miocnak(wq, mp, 0, EAGAIN);
break;
}
mcopyout(mp, NULL, sizeof (uint_t), NULL, datamp);
}
*(uint_t *)mp->b_cont->b_rptr = sbp->sb_flags;
if (transparent == TRANSPARENT)
qreply(wq, mp);
else
miocack(wq, mp, sizeof (uint_t), 0);
break;
default:
putnext(wq, mp);
break;
}
}
/*
* Given a length l, calculate the amount of extra storage
* required to round it up to the next multiple of the alignment a.
*/
#define RoundUpAmt(l, a) ((l) % (a) ? (a) - ((l) % (a)) : 0)
/*
* Calculate additional amount of space required for alignment.
*/
#define Align(l) RoundUpAmt(l, sizeof (ulong_t))
/*
* Smallest possible message size when headers are enabled.
* This is used to calculate whether a chunk is nearly full.
*/
#define	SMALLEST_MESSAGE	(sizeof (struct sb_hdr) + _POINTER_ALIGNMENT)
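/*
 * For example, on a kernel where sizeof (ulong_t) is 8, Align(13) is 3
 * (a 13-byte message rounds up to 16 bytes) and Align(16) is 0, since
 * 16 is already a multiple of 8.
 */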
/*
* Process a read-side M_DATA message.
*
* If the currently accumulating chunk doesn't have enough room
* for the message, close off the chunk, pass it upward, and start
* a new one. Then add the message to the current chunk, taking
* account of the possibility that the message's size exceeds the
* chunk size.
*
* If headers are enabled add an sb_hdr header and trailing alignment padding.
*
 * To optimise performance, the total number of msgbs should be kept
 * to a minimum. This is achieved by using any remaining space in message N
 * for both its own padding and the header of message N+1 if possible.
* If there's insufficient space we allocate one message to hold this 'wrapper'.
* (there's likely to be space beyond message N, since allocb would have
* rounded up the required size to one of the dblk_sizes).
*
*/
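/*
 * With headers enabled, a chunk delivered to the user looks roughly
 * like this (a sketch; sbh_totlen counts the header, data and padding
 * of one message):
 *
 *	+--------+------------+-----+--------+------------+-----+--
 *	| sb_hdr | msg 1 data | pad | sb_hdr | msg 2 data | pad | ...
 *	+--------+------------+-----+--------+------------+-----+--
 */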
static void
sbaddmsg(queue_t *rq, mblk_t *mp)
{
struct sb *sbp;
struct timeval t;
struct sb_hdr hp;
mblk_t *wrapper; /* padding for msg N, header for msg N+1 */
mblk_t *last; /* last mblk of current message */
size_t wrapperlen; /* length of header + padding */
size_t origlen; /* data length before truncation */
size_t pad; /* bytes required to align header */
sbp = (struct sb *)rq->q_ptr;
origlen = msgdsize(mp);
/*
* Truncate the message.
*/
if ((sbp->sb_snap > 0) && (origlen > sbp->sb_snap) &&
(adjmsg(mp, -(origlen - sbp->sb_snap)) == 1))
hp.sbh_totlen = hp.sbh_msglen = sbp->sb_snap;
else
hp.sbh_totlen = hp.sbh_msglen = origlen;
if (sbp->sb_flags & SB_NO_HEADER) {
/*
* Would the inclusion of this message overflow the current
* chunk? If so close the chunk off and start a new one.
*/
if ((hp.sbh_totlen + sbp->sb_mlen) > sbp->sb_chunk)
sbclosechunk(sbp);
/*
* First message too big for chunk - just send it up.
* This will always be true when we're not chunking.
*/
if (hp.sbh_totlen > sbp->sb_chunk) {
sbsendit(rq, mp);
return;
}
/*
* We now know that the msg will fit in the chunk.
* Link it onto the end of the chunk.
		 * Since linkb() walks the entire chain, we keep a pointer
		 * to the first mblk of the last msgb added and call
		 * linkb() on that last message, rather than performing
		 * the O(n) linkb() operation on the whole chain.
* sb_head isn't needed in this SB_NO_HEADER mode.
*/
if (sbp->sb_mp)
linkb(sbp->sb_tail, mp);
else
sbp->sb_mp = mp;
sbp->sb_tail = mp;
sbp->sb_mlen += hp.sbh_totlen;
sbp->sb_mcount++;
} else {
/* Timestamp must be done immediately */
uniqtime(&t);
TIMEVAL_TO_TIMEVAL32(&hp.sbh_timestamp, &t);
pad = Align(hp.sbh_totlen);
hp.sbh_totlen += sizeof (hp);
hp.sbh_totlen += pad;
/*
* Would the inclusion of this message overflow the current
* chunk? If so close the chunk off and start a new one.
*/
if ((hp.sbh_totlen + sbp->sb_mlen) > sbp->sb_chunk)
sbclosechunk(sbp);
if (sbp->sb_head == NULL) {
/* Allocate leading header of new chunk */
sbp->sb_head = allocb(sizeof (hp), BPRI_MED);
if (sbp->sb_head == NULL) {
/*
* Memory allocation failure.
* This will need to be revisited
* since using certain flag combinations
* can result in messages being dropped
* silently.
*/
freemsg(mp);
sbp->sb_drops++;
return;
}
sbp->sb_mp = sbp->sb_head;
}
/*
* Copy header into message
*/
hp.sbh_drops = sbp->sb_drops;
hp.sbh_origlen = origlen;
(void) memcpy(sbp->sb_head->b_wptr, (char *)&hp, sizeof (hp));
sbp->sb_head->b_wptr += sizeof (hp);
ASSERT(sbp->sb_head->b_wptr <= sbp->sb_head->b_datap->db_lim);
/*
* Join message to the chunk
*/
linkb(sbp->sb_head, mp);
sbp->sb_mcount++;
sbp->sb_mlen += hp.sbh_totlen;
/*
* If the first message alone is too big for the chunk close
* the chunk now.
* If the next message would immediately cause the chunk to
* overflow we may as well close the chunk now. The next
* message is certain to be at least SMALLEST_MESSAGE size.
*/
if (hp.sbh_totlen + SMALLEST_MESSAGE > sbp->sb_chunk) {
sbclosechunk(sbp);
return;
}
/*
* Find space for the wrapper. The wrapper consists of:
*
* 1) Padding for this message (this is to ensure each header
* begins on an 8 byte boundary in the userland buffer).
*
		 * 2) Space for the next message's header, in case the
		 *    next message will fit in this chunk.
*
* It may be possible to append the wrapper to the last mblk
* of the message, but only if we 'own' the data. If the dblk
* has been shared through dupmsg() we mustn't alter it.
*/
wrapperlen = (sizeof (hp) + pad);
/* Is there space for the wrapper beyond the message's data ? */
for (last = mp; last->b_cont; last = last->b_cont)
;
if ((wrapperlen <= MBLKTAIL(last)) &&
(last->b_datap->db_ref == 1)) {
if (pad > 0) {
/*
* Pad with zeroes to the next pointer boundary
* (we don't want to disclose kernel data to
* users), then advance wptr.
*/
(void) memset(last->b_wptr, 0, pad);
last->b_wptr += pad;
}
/* Remember where to write the header information */
sbp->sb_head = last;
} else {
/* Have to allocate additional space for the wrapper */
wrapper = allocb(wrapperlen, BPRI_MED);
if (wrapper == NULL) {
sbclosechunk(sbp);
return;
}
if (pad > 0) {
/*
* Pad with zeroes (we don't want to disclose
* kernel data to users).
*/
(void) memset(wrapper->b_wptr, 0, pad);
wrapper->b_wptr += pad;
}
/* Link the wrapper msg onto the end of the chunk */
linkb(mp, wrapper);
/* Remember to write the next header in this wrapper */
sbp->sb_head = wrapper;
}
}
}
/*
* Called from timeout().
 * Signal a timeout by passing a zero-length M_CTL msg up the read side
 * to synchronize with any active module threads (open, close, wput,
 * rput).
*/
static void
sbtick(void *arg)
{
struct sb *sbp = arg;
queue_t *rq;
ASSERT(sbp);
rq = sbp->sb_rq;
sbp->sb_timeoutid = 0; /* timeout has fired */
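	/*
	 * If the zero-length M_CTL can't be sent (allocation failure),
	 * rearm the timeout so the chunk is eventually closed.
	 */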
if (putctl(rq, M_CTL) == 0) /* failure */
sbp->sb_timeoutid = qtimeout(rq, sbtick, sbp, sbp->sb_ticks);
}
/*
* Close off the currently accumulating chunk and pass
* it upward. Takes care of resetting timers as well.
*
* This routine is called both directly and as a result
* of the chunk timeout expiring.
*/
static void
sbclosechunk(struct sb *sbp)
{
mblk_t *mp;
queue_t *rq;
ASSERT(sbp);
if (sbp->sb_timeoutid) {
(void) quntimeout(sbp->sb_rq, sbp->sb_timeoutid);
sbp->sb_timeoutid = 0;
}
mp = sbp->sb_mp;
rq = sbp->sb_rq;
/*
* If there's currently a chunk in progress, close it off
* and try to send it up.
*/
if (mp) {
sbsendit(rq, mp);
}
/*
* Clear old chunk. Ready for new msgs.
*/
sbp->sb_tail = sbp->sb_mp = sbp->sb_head = NULL;
sbp->sb_mlen = 0;
sbp->sb_mcount = 0;
if (sbp->sb_flags & SB_DEFER_CHUNK)
sbp->sb_state &= ~SB_FRCVD;
}
static void
sbsendit(queue_t *rq, mblk_t *mp)
{
struct sb *sbp = (struct sb *)rq->q_ptr;
if (!canputnext(rq)) {
if (sbp->sb_flags & SB_NO_DROPS)
(void) putq(rq, mp);
else {
freemsg(mp);
sbp->sb_drops += sbp->sb_mcount;
}
return;
}
/*
* If there are messages on the q already, keep
* queueing them since they need to be processed in order.
*/
if (qsize(rq) > 0) {
/* should only get here if SB_NO_DROPS */
(void) putq(rq, mp);
	} else
		putnext(rq, mp);
}
}