common/rpc/rpcmod.c

	rpcmod.c revision 07e751310052c13869cb1cb2013faf5da656f9f8
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

/*  Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/*    All Rights Reserved   */

#pragma ident   "%Z%%M% %I% %E% SMI"

/*
 * Kernel RPC filtering module
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/tihdr.h>
#include <sys/timod.h>
#include <sys/tiuser.h>
#include <sys/debug.h>
#include <sys/signal.h>
#include <sys/pcb.h>
#include <sys/user.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/policy.h>
#include <sys/inline.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/file.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/t_lock.h>
#include <sys/ddi.h>
#include <sys/vtrace.h>
#include <sys/callb.h>

#include <sys/strlog.h>
#include <rpc/rpc_com.h>
#include <inet/common.h>
#include <rpc/types.h>
#include <sys/time.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>
#include <rpc/rpc_msg.h>
#include <rpc/clnt.h>
#include <rpc/svc.h>
#include <rpc/rpcsys.h>
#include <rpc/rpc_rdma.h>

/*
 * This is the loadable module wrapper.
 */
#include <sys/conf.h>
#include <sys/modctl.h>
#include <sys/syscall.h>

/* STREAMS entry-point table for this module; defined further down. */
extern struct streamtab rpcinfo;

/*
 * STREAMS module switch entry: registers the module under the name
 * "rpcmod" with the rpcinfo streamtab.
 */
static struct fmodsw fsw = {
    "rpcmod",
    &rpcinfo,
    D_NEW|D_MP,
};

/*
 * Module linkage information for the kernel.
 */

/* Linkage element describing rpcmod as a STREAMS module. */
static struct modlstrmod modlstrmod = {
    &mod_strmodops, "rpc interface str mod", &fsw
};

/*
 * For the RPC system call.
 * NOTE(review): the leading 2 is presumably the argument count of
 * rpcsys() -- confirm against struct sysent.
 */
static struct sysent rpcsysent = {
    2,
    SE_32RVAL1 | SE_ARGC | SE_NOUNLOAD,
    rpcsys
};

/* Linkage element that installs rpcsysent as a system call. */
static struct modlsys modlsys = {
    &mod_syscallops,
    "RPC syscall",
    &rpcsysent
};

#ifdef _SYSCALL32_IMPL
/* Same system call entry, installed through the 32-bit syscall ops. */
static struct modlsys modlsys32 = {
    &mod_syscallops32,
    "32-bit RPC syscall",
    &rpcsysent
};
#endif /* _SYSCALL32_IMPL */

/*
 * Everything this loadable module provides: the system call (plus its
 * 32-bit variant when built with _SYSCALL32_IMPL) and the STREAMS module.
 * The list is NULL-terminated.
 */
static struct modlinkage modlinkage = {
    MODREV_1,
    {
        &modlsys,
#ifdef _SYSCALL32_IMPL
        &modlsys32,
#endif
        &modlstrmod,
        NULL
    }
};

int
_init(void)
{
    int error = 0;
    callb_id_t cid;
    int status;

    svc_init();
    clnt_init();
    cid = callb_add(connmgr_cpr_reset, 0, CB_CL_CPR_RPC, "rpc");

    if (error = mod_install(&modlinkage)) {
        /*
         * Could not install module, cleanup previous
         * initialization work.
         */
        clnt_fini();
        if (cid != NULL)
            (void) callb_delete(cid);

        return (error);
    }

    /*
     * Load up the RDMA plugins and initialize the stats. Even if the
     * plugins loadup fails, but rpcmod was successfully installed the
     * counters still get initialized.
     */
    rw_init(&rdma_lock, NULL, RW_DEFAULT, NULL);
    mutex_init(&rdma_modload_lock, NULL, MUTEX_DEFAULT, NULL);
    mt_kstat_init();

    /*
     * Get our identification into ldi.  This is used for loading
     * other modules, e.g. rpcib.
     */
    status = ldi_ident_from_mod(&modlinkage, &rpcmod_li);
    if (status != 0) {
        cmn_err(CE_WARN, "ldi_ident_from_mod fails with %d", status);
        rpcmod_li = NULL;
    }

    return (error);
}

/*
 * Module unload entry point.
 *
 * Unloading is always refused with EBUSY, because we advertise entry
 * points into rpcmod from the rest of kRPC: rpcmod_release().
 */
int
_fini(void)
{
    return (EBUSY);
}

/*
 * Module information entry point: hands the request to mod_info() with
 * this module's linkage structure and returns its result.
 */
int
_info(struct modinfo *modinfop)
{
    return (mod_info(&modlinkage, modinfop));
}

extern int nulldev();

/* Module id number placed in rpcmod_info below. */
#define RPCMOD_ID   2049

/*
 * The rmm_* routines are the entry points installed in the qinit
 * structures.  Each one forwards to the transport-style specific
 * routine found through the ops pointer stored in q->q_ptr.
 */
int rmm_open(), rmm_close();

/*
 * To save instructions, since STREAMS ignores the return value
 * from these functions, they are defined as void here. Kind of icky, but...
 */
void rmm_rput(queue_t *, mblk_t *);
void rmm_wput(queue_t *, mblk_t *);
void rmm_rsrv(queue_t *);
void rmm_wsrv(queue_t *);

/* Connectionless (T_CLTS) implementation. */
int rpcmodopen(), rpcmodclose();
void rpcmodrput(), rpcmodwput();
void rpcmodrsrv(), rpcmodwsrv();

/* Connection-oriented implementation (the mir_* routines). */
static  void    rpcmodwput_other(queue_t *, mblk_t *);
static  int mir_close(queue_t *q);
static  int mir_open(queue_t *q, dev_t *devp, int flag, int sflag,
            cred_t *credp);
static  void    mir_rput(queue_t *q, mblk_t *mp);
static  void    mir_rsrv(queue_t *q);
static  void    mir_wput(queue_t *q, mblk_t *mp);
static  void    mir_wsrv(queue_t *q);

/*
 * Module information shared by the read and write qinit structures.
 * NOTE(review): the trailing values are presumably min/max packet size
 * and high/low water marks, per struct module_info -- confirm.
 */
static struct module_info rpcmod_info =
    {RPCMOD_ID, "rpcmod", 0, INFPSZ, 256*1024, 1024};

/*
 * Read side.  Both a put procedure (rmm_rput) and a service procedure
 * (rmm_rsrv) are installed; each dispatches through the per-stream ops
 * table.  Note that the connectionless ops table supplies no read
 * service routine (its xo_rsrv slot is NULL).
 */
static struct qinit rpcmodrinit = {
    (int (*)())rmm_rput,
    (int (*)())rmm_rsrv,
    rmm_open,
    rmm_close,
    nulldev,
    &rpcmod_info,
    NULL
};

/*
 * Write side.  The put (rmm_wput) and service (rmm_wsrv) procedures
 * likewise dispatch through the per-stream ops table.  While the
 * temporary open-time ops are installed, the write put procedure is
 * plain putnext.
 */
static struct qinit rpcmodwinit = {
    (int (*)())rmm_wput,
    (int (*)())rmm_wsrv,
    rmm_open,
    rmm_close,
    nulldev,
    &rpcmod_info,
    NULL
};
/* Stream table referenced from the fmodsw entry. */
struct streamtab rpcinfo = { &rpcmodrinit, &rpcmodwinit, NULL, NULL };

/*
 * Per transport-style operations vector.  The rmm_* entry points fetch
 * this table from the structure hanging off q->q_ptr and call through
 * the matching slot.  A NULL slot must never be invoked.
 */
struct xprt_style_ops {
    int (*xo_open)();
    int (*xo_close)();
    void (*xo_wput)();
    void (*xo_wsrv)();
    void (*xo_rput)();
    void (*xo_rsrv)();
};

/*
 * Operations for connectionless transports; selected by rmm_open() when
 * the transport reports T_CLTS.  There is no read service routine.
 */
static struct xprt_style_ops xprt_clts_ops = {
    rpcmodopen,
    rpcmodclose,
    rpcmodwput,
    rpcmodwsrv,
    rpcmodrput,
    NULL
};

/*
 * Operations for every other service type (connection-oriented);
 * implemented by the mir_* routines.
 */
static struct xprt_style_ops xprt_cots_ops = {
    mir_open,
    mir_close,
    mir_wput,
    mir_wsrv,
    mir_rput,
    mir_rsrv
};

/*
 * Per rpcmod "slot" data structure. q->q_ptr points to one of these
 * on connectionless streams.
 *
 * The rmm_* dispatch routines read the ops pointer by casting q->q_ptr
 * to struct temp_slot, so the first two members (cell pointer, then ops
 * pointer) must stay in the same order as in struct temp_slot and mir_t.
 */
struct rpcm {
    void        *rm_krpc_cell;  /* Reserved for use by KRPC */
    struct      xprt_style_ops  *rm_ops;
    int     rm_type;    /* Client or server side stream */
#define RM_CLOSING  0x1     /* somebody is trying to close slot */
    uint_t      rm_state;   /* state of the slot. see above */
    uint_t      rm_ref;     /* cnt of external references to slot */
    kmutex_t    rm_lock;    /* mutex protecting above fields */
    kcondvar_t  rm_cwait;   /* condition for closing */
    zoneid_t    rm_zoneid;  /* zone which pushed rpcmod */
};

/*
 * Temporary per-queue state used only while rmm_open() is waiting for
 * the transport's T_INFO_ACK.  An instance lives on rmm_open()'s stack.
 * The first two members must match the layout of struct rpcm and mir_t,
 * because the rmm_* routines use this type to reach the ops pointer.
 */
struct temp_slot {
    void *cell;                     /* placeholder; mirrors the krpc cell */
    struct xprt_style_ops *ops;     /* ops table currently in effect */
    int type;
    mblk_t *info_ack;               /* T_INFO_ACK stored by tmp_rput() */
    kmutex_t lock;                  /* protects info_ack */
    kcondvar_t wait;                /* signalled when info_ack is set */
};

/*
 * Per-stream state for the connection-oriented half of rpcmod.
 * q->q_ptr points to one of these on such streams.  The first two
 * members must keep the same order as in struct rpcm and struct
 * temp_slot, since the rmm_* routines locate the ops pointer through
 * a struct temp_slot cast.
 */
typedef struct mir_s {
    void    *mir_krpc_cell; /* Reserved for KRPC use. This field */
                    /* must be first in the structure. */
    struct xprt_style_ops   *rm_ops;
    int mir_type;       /* Client or server side stream */

    mblk_t  *mir_head_mp;       /* RPC msg in progress */
        /*
         * mir_head_mp points to the first mblk being collected in
         * the current RPC message.  Record headers are removed
         * before data is linked into mir_head_mp.
         */
    mblk_t  *mir_tail_mp;       /* Last mblk in mir_head_mp */
        /*
         * mir_tail_mp points to the last mblk in the message
         * chain starting at mir_head_mp.  It is only valid
         * if mir_head_mp is non-NULL and is used to add new
         * data blocks to the end of chain quickly.
         */

    int32_t mir_frag_len;       /* Bytes seen in the current frag */
        /*
         * mir_frag_len starts at -4 for beginning of each fragment.
         * When this length is negative, it indicates the number of
         * bytes that rpcmod needs to complete the record marker
         * header.  When it is positive or zero, it holds the number
         * of bytes that have arrived for the current fragment and
         * are held in mir_head_mp.
         */

    int32_t mir_frag_header;
        /*
         * Fragment header as collected for the current fragment.
         * It holds the last-fragment indicator and the number
         * of bytes in the fragment.
         */

    unsigned int
        mir_ordrel_pending : 1, /* Sent T_ORDREL_REQ */
        mir_hold_inbound : 1,   /* Hold inbound messages on server */
                    /* side until outbound flow control */
                    /* is relieved. */
        mir_closing : 1,    /* The stream is being closed */
        mir_inrservice : 1, /* data queued or rd srv proc running */
        mir_inwservice : 1, /* data queued or wr srv proc running */
        mir_inwflushdata : 1,   /* flush M_DATAs when srv runs */
        /*
         * On client streams, mir_clntreq is 0 or 1; it is set
         * to 1 whenever a new request is sent out (mir_wput)
         * and cleared when the timer fires (mir_timer).  If
         * the timer fires with this value equal to 0, then the
         * stream is considered idle and KRPC is notified.
         */
        mir_clntreq : 1,
        /*
         * On server streams, stop accepting messages
         */
        mir_svc_no_more_msgs : 1,
        mir_listen_stream : 1,  /* listen end point */
        mir_unused : 1, /* no longer used */
        mir_timer_call : 1, /* a timer start/stop is in progress */
        mir_junk_fill_thru_bit_31 : 21;

    int mir_setup_complete; /* server has initialized everything */
    timeout_id_t mir_timer_id;  /* Timer for idle checks */
    clock_t mir_idle_timeout;   /* Allowed idle time before shutdown */
        /*
         * This value is copied from clnt_idle_timeout or
         * svc_idle_timeout during the appropriate ioctl.
         * Kept in milliseconds
         */
    clock_t mir_use_timestamp;  /* updated on client with each use */
        /*
         * This value is set to lbolt
         * every time a client stream sends or receives data.
         * Even if the timer message arrives, we don't shutdown
         * client unless:
         *    lbolt >= MSEC_TO_TICK(mir_idle_timeout)+mir_use_timestamp.
         * This value is kept in HZ.
         */

    uint_t  *mir_max_msg_sizep; /* Reference to sanity check size */
        /*
         * This pointer is set to &clnt_max_msg_size or
         * &svc_max_msg_size during the appropriate ioctl.
         */
    zoneid_t mir_zoneid;    /* zone which pushed rpcmod */
    /* Server-side fields. */
    int mir_ref_cnt;        /* Reference count: server side only */
                    /* counts the number of references */
                    /* that a kernel RPC server thread */
                    /* (see svc_run()) has on this rpcmod */
                    /* slot. Effectively, it is the */
                    /* number of unprocessed messages */
                    /* that have been passed up to the */
                    /* KRPC layer */

    mblk_t  *mir_svc_pend_mp;   /* Pending T_ORDREL_IND or */
                    /* T_DISCON_IND */

    /*
     * these fields are for both client and server, but for debugging,
     * it is easier to have these last in the structure.
     */
    kmutex_t    mir_mutex;  /* Mutex and condvar for close */
    kcondvar_t  mir_condvar;    /* synchronization. */
    kcondvar_t  mir_timer_cv;   /* Timer routine sync. */
} mir_t;

void tmp_rput(queue_t *q, mblk_t *mp);

/*
 * Ops table in effect while rmm_open() is probing the transport.  Only
 * the two put procedures are populated: writes go straight downstream
 * via putnext and reads are examined by tmp_rput().  The open, close
 * and service slots are NULL.
 */
struct xprt_style_ops tmpops = {
    NULL,
    NULL,
    putnext,
    NULL,
    tmp_rput,
    NULL
};

/*
 * Read put procedure used while rmm_open() waits for the transport's
 * answer to T_INFO_REQ.
 *
 * An M_PCPROTO message whose primitive is T_INFO_ACK is stored in the
 * temporary slot and the opener is woken up; ownership of the message
 * passes to the slot.  Every other message is freed.
 */
void
tmp_rput(queue_t *q, mblk_t *mp)
{
    struct temp_slot *slot = (struct temp_slot *)(q->q_ptr);

    if (mp->b_datap->db_type == M_PCPROTO) {
        struct T_info_ack *ack = (struct T_info_ack *)mp->b_rptr;

        if (ack->PRIM_type == T_INFO_ACK) {
            mutex_enter(&slot->lock);
            slot->info_ack = mp;
            cv_signal(&slot->wait);
            mutex_exit(&slot->lock);
            return;
        }
    }

    /*
     * Anything other than the info-ack is discarded.  No data should
     * be arriving before the open completes, since rpcmod is pushed
     * well before the end-point is bound to an address.
     */
    freemsg(mp);
}

/*
 * rmm_open - STREAMS open routine for rpcmod.
 *
 * Asks the transport below for its service type (T_INFO_REQ) and, based
 * on the answer, completes the open with either the connectionless
 * (rpcmodopen) or the connection-oriented (mir_open) implementation,
 * recording the matching ops table in the per-stream structure.
 *
 * While waiting for the T_INFO_ACK, q->q_ptr points at a temp_slot that
 * lives on this function's stack.  That pointer must never outlive this
 * call, so every failure path resets the queue pointers to NULL.
 *
 * Returns 0 on success (including a re-open of an already opened
 * stream), ENOBUFS if the request message cannot be allocated, EINTR if
 * the wait for the T_INFO_ACK is interrupted by a signal, or the error
 * returned by the style-specific open routine.
 */
int
rmm_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *crp)
{
    mblk_t *bp;
    struct temp_slot ts, *t;
    struct T_info_ack *pptr;
    int error = 0;

    ASSERT(q != NULL);
    /*
     * Check for re-opens.
     */
    if (q->q_ptr) {
        TRACE_1(TR_FAC_KRPC, TR_RPCMODOPEN_END,
            "rpcmodopen_end:(%s)", "q->qptr");
        return (0);
    }

    /*
     * Allocate the required messages upfront, before the stack slot
     * is published through q->q_ptr, so that an allocation failure
     * leaves the queue untouched.
     */
    if ((bp = allocb(sizeof (struct T_info_req) +
        sizeof (struct T_info_ack), BPRI_LO)) == (mblk_t *)NULL) {
        return (ENOBUFS);
    }

    t = &ts;
    bzero(t, sizeof (*t));
    q->q_ptr = (void *)t;
    WR(q)->q_ptr = (void *)t;

    mutex_init(&t->lock, NULL, MUTEX_DEFAULT, NULL);
    cv_init(&t->wait, NULL, CV_DEFAULT, NULL);

    t->ops = &tmpops;

    qprocson(q);
    bp->b_datap->db_type = M_PCPROTO;
    *(int32_t *)bp->b_wptr = (int32_t)T_INFO_REQ;
    bp->b_wptr += sizeof (struct T_info_req);
    putnext(WR(q), bp);

    /* tmp_rput() stores the reply in t->info_ack and signals us. */
    mutex_enter(&t->lock);
    while (t->info_ack == NULL) {
        if (cv_wait_sig(&t->wait, &t->lock) == 0) {
            error = EINTR;
            break;
        }
    }
    mutex_exit(&t->lock);

    if (error)
        goto out;

    pptr = (struct T_info_ack *)t->info_ack->b_rptr;

    /*
     * On success the style-specific open replaces q->q_ptr with its
     * own structure; only then is the ops pointer stored there.
     */
    if (pptr->SERV_type == T_CLTS) {
        if ((error = rpcmodopen(q, devp, flag, sflag, crp)) == 0)
            ((struct rpcm *)q->q_ptr)->rm_ops = &xprt_clts_ops;
    } else {
        if ((error = mir_open(q, devp, flag, sflag, crp)) == 0)
            ((mir_t *)q->q_ptr)->rm_ops = &xprt_cots_ops;
    }

out:
    if (error) {
        qprocsoff(q);

        /*
         * The put procedures are off now.  Do not leave a pointer
         * to our stack frame behind in the queue pair.
         */
        if (q->q_ptr == (void *)t)
            q->q_ptr = NULL;
        if (WR(q)->q_ptr == (void *)t)
            WR(q)->q_ptr = NULL;
    }

    /* info_ack is still NULL on the EINTR path. */
    freemsg(t->info_ack);
    mutex_destroy(&t->lock);
    cv_destroy(&t->wait);

    return (error);
}

/*
 * Read put entry point: forwards the message to the read put routine
 * of whichever ops table is attached to this queue.
 */
void
rmm_rput(queue_t *q, mblk_t  *mp)
{
    struct xprt_style_ops *ops = ((struct temp_slot *)q->q_ptr)->ops;

    (*ops->xo_rput)(q, mp);
}

/*
 * Read service entry point: runs the read service routine of the ops
 * table attached to this queue.
 */
void
rmm_rsrv(queue_t *q)
{
    struct xprt_style_ops *ops = ((struct temp_slot *)q->q_ptr)->ops;

    (*ops->xo_rsrv)(q);
}

/*
 * Write put entry point: forwards the message to the write put routine
 * of the ops table attached to this queue.
 */
void
rmm_wput(queue_t *q, mblk_t *mp)
{
    struct xprt_style_ops *ops = ((struct temp_slot *)q->q_ptr)->ops;

    (*ops->xo_wput)(q, mp);
}

/*
 * Write service entry point: runs the write service routine of the ops
 * table attached to this queue.
 */
void
rmm_wsrv(queue_t *q)
{
    struct xprt_style_ops *ops = ((struct temp_slot *)q->q_ptr)->ops;

    (*ops->xo_wsrv)(q);
}

/*
 * Close entry point: hands the close to the ops table attached to this
 * queue and returns whatever that routine returns.
 */
int
rmm_close(queue_t *q, int flag, cred_t *crp)
{
    struct xprt_style_ops *ops = ((struct temp_slot *)q->q_ptr)->ops;

    return ((*ops->xo_close)(q, flag, crp));
}

/*
 * rpcmodopen - connectionless open; reached from rmm_open() when the
 *      module gets pushed onto a T_CLTS stream.
 *
 * Registers rpcmod_release() as the kRPC release hook if none is set,
 * checks the caller's privilege, then allocates and initializes the
 * per-stream slot and attaches it to both queues.
 *
 * Returns 0 on success or EPERM if the caller lacks the privilege.
 */
/*ARGSUSED*/
int
rpcmodopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *crp)
{
    struct rpcm *slot;

    extern void (*rpc_rele)(queue_t *, mblk_t *);
    static void rpcmod_release(queue_t *, mblk_t *);

    TRACE_0(TR_FAC_KRPC, TR_RPCMODOPEN_START, "rpcmodopen_start:");

    /*
     * Publish the entry point kRPC uses to release a rpcmod slot
     * (and an input message, if one is supplied).
     */
    if (rpc_rele == NULL) {
        rpc_rele = rpcmod_release;
    }

    /*
     * This module is restricted to sufficiently privileged users,
     * who are trusted to use it properly and NOT send bulk data
     * from downstream.
     */
    if (secpolicy_rpcmod_open(crp) != 0) {
        return (EPERM);
    }

    /*
     * Build the slot.  The slot type stays 0 until a kRPC client
     * or server ioctl sets it.
     */
    slot = kmem_zalloc(sizeof (*slot), KM_SLEEP);
    mutex_init(&slot->rm_lock, NULL, MUTEX_DEFAULT, NULL);
    cv_init(&slot->rm_cwait, NULL, CV_DEFAULT, NULL);
    slot->rm_zoneid = rpc_zoneid();
    slot->rm_type = 0;

    /* Attach the slot to both halves of the queue pair. */
    q->q_ptr = (void *)slot;
    WR(q)->q_ptr = (void *)slot;

    TRACE_1(TR_FAC_KRPC, TR_RPCMODOPEN_END, "rpcmodopen_end:(%s)", "end");
    return (0);
}

/*
 * rpcmodclose - connectionless close; called when the module gets
 * popped off of the stream.
 *
 * Marks the slot as closing.  On a server stream the read queue is
 * flushed, the write queue is enabled, and the routine blocks until
 * kRPC has dropped every reference to the slot before the queue is
 * taken out of the stream and kRPC is told it is gone.  Finally the
 * slot is detached from the queue pair and freed.  Always returns 0.
 */
/*ARGSUSED*/
int
rpcmodclose(queue_t *q, int flag, cred_t *crp)
{
    struct rpcm *slot;

    ASSERT(q != NULL);
    slot = (struct rpcm *)q->q_ptr;

    /*
     * From here on the slot is closing.
     */
    mutex_enter(&slot->rm_lock);
    slot->rm_state |= RM_CLOSING;

    if (slot->rm_type != RPC_SERVER) {
        /* Nothing to wait for; just remove the queue. */
        mutex_exit(&slot->rm_lock);
        qprocsoff(q);
    } else {
        /*
         * Discard pending inbound data and schedule the write
         * service procedure, which sends queued messages
         * regardless of downstream flow control.
         */
        flushq(q, FLUSHDATA);
        qenable(WR(q));

        if (slot->rm_ref != 0) {
            /*
             * Have SVC clean the queue; the lock is dropped
             * around the call.
             */
            mutex_exit(&slot->rm_lock);
            svc_queueclean(q);
            mutex_enter(&slot->rm_lock);

            /*
             * Wait for kRPC threads to release their
             * references to this slot.
             */
            while (slot->rm_ref != 0) {
                cv_wait(&slot->rm_cwait, &slot->rm_lock);
            }
        }
        mutex_exit(&slot->rm_lock);

        /*
         * No kRPC thread references the stream any more, and
         * none will because RM_CLOSING is set, so the queue can
         * be removed safely.
         */
        qprocsoff(q);

        /* Tell kRPC that this stream is going away. */
        svc_queueclose(q);
    }

    /* Detach the slot and release it. */
    q->q_ptr = NULL;
    WR(q)->q_ptr = NULL;
    mutex_destroy(&slot->rm_lock);
    cv_destroy(&slot->rm_cwait);
    kmem_free(slot, sizeof (*slot));
    return (0);
}

#ifdef  DEBUG
/*
 * Fault-injection knobs consulted by rpcmodrput() in DEBUG kernels.
 * All default to 0 (disabled).
 */
/* While > 0: a copy of each inbound message is also sent upstream. */
int rpcmod_send_msg_up = 0;
/* While > 0: each T_UNITDATA_IND also yields a fake T_UDERROR_IND. */
int rpcmod_send_uderr = 0;
/*
 * If non-zero: a server request is duplicated whenever
 * rpcmod_send_dup_cnt % rpcmod_send_dup is non-zero.
 */
int rpcmod_send_dup = 0;
/* Running counter of server requests examined for duplication. */
int rpcmod_send_dup_cnt = 0;
#endif

/*
 * rpcmodrput - Module read put procedure.  This is called from
 *      the module, driver, or stream head downstream.
 *
 * Messages are dropped until the slot type has been set by ioctl.
 * After that:
 *   - T_UNITDATA_IND is handed to kRPC: to the client dispatcher on
 *     client streams, or queued as a request on server streams (which
 *     takes a reference on the slot).  If the slot is closing, the
 *     message is passed upstream instead.
 *   - T_UDERROR_IND is discarded (and logged).
 *   - Everything else is passed upstream unchanged.
 *
 * The routine consumes mp on every path.
 */
void
rpcmodrput(queue_t *q, mblk_t *mp)
{
    struct rpcm *rmp;
    union T_primitives *pptr;
    int hdrsz;

    TRACE_0(TR_FAC_KRPC, TR_RPCMODRPUT_START, "rpcmodrput_start:");

    ASSERT(q != NULL);
    rmp = (struct rpcm *)q->q_ptr;

    /* Slot type not configured yet: nobody can take the message. */
    if (rmp->rm_type == 0) {
        freemsg(mp);
        return;
    }

#ifdef DEBUG
    /* Fault injection: also push a copy of the message upstream. */
    if (rpcmod_send_msg_up > 0) {
        mblk_t *nmp = copymsg(mp);
        if (nmp) {
            putnext(q, nmp);
            rpcmod_send_msg_up--;
        }
    }
    /*
     * Fault injection: build a T_UDERROR_IND (ENETDOWN) from the
     * address and options of an incoming T_UNITDATA_IND and send it
     * upstream.  The offsets are shifted by the size difference
     * between the two headers.
     */
    if ((rpcmod_send_uderr > 0) && mp->b_datap->db_type == M_PROTO) {
        mblk_t *nmp;
        struct T_unitdata_ind *data;
        struct T_uderror_ind *ud;
        int d;
        data = (struct T_unitdata_ind *)mp->b_rptr;
        if (data->PRIM_type == T_UNITDATA_IND) {
            d = sizeof (*ud) - sizeof (*data);
            nmp = allocb(mp->b_wptr - mp->b_rptr + d, BPRI_HI);
            if (nmp) {
                ud = (struct T_uderror_ind *)nmp->b_rptr;
                ud->PRIM_type = T_UDERROR_IND;
                ud->DEST_length = data->SRC_length;
                ud->DEST_offset = data->SRC_offset + d;
                ud->OPT_length = data->OPT_length;
                ud->OPT_offset = data->OPT_offset + d;
                ud->ERROR_type = ENETDOWN;
                if (data->SRC_length) {
                    bcopy(mp->b_rptr +
                        data->SRC_offset,
                        nmp->b_rptr +
                        ud->DEST_offset,
                        data->SRC_length);
                }
                if (data->OPT_length) {
                    bcopy(mp->b_rptr +
                        data->OPT_offset,
                        nmp->b_rptr +
                        ud->OPT_offset,
                        data->OPT_length);
                }
                nmp->b_wptr += d;
                nmp->b_wptr += (mp->b_wptr - mp->b_rptr);
                nmp->b_datap->db_type = M_PROTO;
                putnext(q, nmp);
                rpcmod_send_uderr--;
            }
        }
    }
#endif
    switch (mp->b_datap->db_type) {
    default:
        putnext(q, mp);
        break;

    case M_PROTO:
    case M_PCPROTO:
        /* The primitive type field must be present. */
        ASSERT((mp->b_wptr - mp->b_rptr) >= sizeof (int32_t));
        pptr = (union T_primitives *)mp->b_rptr;

        /*
         * Forward this message to krpc if it is data.
         */
        if (pptr->type == T_UNITDATA_IND) {
            mblk_t *nmp;

            /*
             * Check if the module is being popped.
             */
            mutex_enter(&rmp->rm_lock);
            if (rmp->rm_state & RM_CLOSING) {
                mutex_exit(&rmp->rm_lock);
                putnext(q, mp);
                break;
            }

            switch (rmp->rm_type) {
            case RPC_CLIENT:
                mutex_exit(&rmp->rm_lock);
                hdrsz = mp->b_wptr - mp->b_rptr;

                /*
                 * Make sure the header is sane.
                 */
                if (hdrsz < TUNITDATAINDSZ ||
                    hdrsz < (pptr->unitdata_ind.OPT_length +
                    pptr->unitdata_ind.OPT_offset) ||
                    hdrsz < (pptr->unitdata_ind.SRC_length +
                    pptr->unitdata_ind.SRC_offset)) {
                    freemsg(mp);
                    return;
                }

                /*
                 * Call clnt_clts_dispatch_notify, so that it
                 * can pass the message to the proper caller.
                 * Don't discard the header just yet since the
                 * client may need the sender's address.
                 */
                clnt_clts_dispatch_notify(mp, hdrsz,
                    rmp->rm_zoneid);
                return;
            case RPC_SERVER:
                /*
                 * rm_krpc_cell is exclusively used by the kRPC
                 * CLTS server
                 */
                if (rmp->rm_krpc_cell) {
#ifdef DEBUG
                    /*
                     * Test duplicate request cache and
                     * rm_ref count handling by sending a
                     * duplicate every so often, if
                     * desired.
                     */
                    if (rpcmod_send_dup &&
                        rpcmod_send_dup_cnt++ %
                        rpcmod_send_dup)
                        nmp = copymsg(mp);
                    else
                        nmp = NULL;
#endif
                    /*
                     * Raise the reference count on this
                     * module to prevent it from being
                     * popped before krpc generates the
                     * reply.
                     */
                    rmp->rm_ref++;
                    mutex_exit(&rmp->rm_lock);

                    /*
                     * Submit the message to krpc.
                     */
                    svc_queuereq(q, mp);
#ifdef DEBUG
                    /*
                     * Send duplicate if we created one.
                     */
                    if (nmp) {
                        mutex_enter(&rmp->rm_lock);
                        rmp->rm_ref++;
                        mutex_exit(&rmp->rm_lock);
                        svc_queuereq(q, nmp);
                    }
#endif
                } else {
                    /* No kRPC cell set; drop the request. */
                    mutex_exit(&rmp->rm_lock);
                    freemsg(mp);
                }
                return;
            default:
                /* Neither client nor server: drop the message. */
                mutex_exit(&rmp->rm_lock);
                freemsg(mp);
                return;
            } /* end switch(rmp->rm_type) */
        } else if (pptr->type == T_UDERROR_IND) {
            mutex_enter(&rmp->rm_lock);
            hdrsz = mp->b_wptr - mp->b_rptr;

            /*
             * Make sure the header is sane
             */
            if (hdrsz < TUDERRORINDSZ ||
                hdrsz < (pptr->uderror_ind.OPT_length +
                pptr->uderror_ind.OPT_offset) ||
                hdrsz < (pptr->uderror_ind.DEST_length +
                pptr->uderror_ind.DEST_offset)) {
                mutex_exit(&rmp->rm_lock);
                freemsg(mp);
                return;
            }

            /*
             * In the case where a unit data error has been
             * received, all we need to do is clear the message from
             * the queue.
             */
            mutex_exit(&rmp->rm_lock);
            freemsg(mp);
            RPCLOG(32, "rpcmodrput: unitdata error received at "
                "%ld\n", gethrestime_sec());
            return;
        } /* end else if (pptr->type == T_UDERROR_IND) */

        /* Any other TPI primitive goes upstream untouched. */
        putnext(q, mp);
        break;
    } /* end switch (mp->b_datap->db_type) */

    /* Only the paths that passed the message upstream reach this. */
    TRACE_0(TR_FAC_KRPC, TR_RPCMODRPUT_END,
        "rpcmodrput_end:");
    /*
     * Return codes are not looked at by the STREAMS framework.
     */
}

/*
 * rpcmodwput - connectionless write put procedure.
 *
 * Anything that is not M_PROTO/M_PCPROTO is handed to
 * rpcmodwput_other().  Protocol messages are sent downstream right away
 * when flow control permits or when the slot is closing; otherwise they
 * are queued for the write service procedure.
 */
void
rpcmodwput(queue_t *q, mblk_t *mp)
{
    struct rpcm *slot;
    int send_now;

    ASSERT(q != NULL);

    if (mp->b_datap->db_type != M_PROTO &&
        mp->b_datap->db_type != M_PCPROTO) {
        rpcmodwput_other(q, mp);
        return;
    }

    /*
     * Fast path: downstream can take the message.
     */
    if (canputnext(q)) {
        putnext(q, mp);
        return;
    }

    slot = (struct rpcm *)q->q_ptr;
    ASSERT(slot != NULL);

    /*
     * The first canputnext failed.  Look again with the lock held so
     * the closing state can be examined too; if either test passes
     * the message is sent regardless.
     */
    mutex_enter(&slot->rm_lock);
    send_now = (canputnext(q) || (slot->rm_state & RM_CLOSING) != 0);
    mutex_exit(&slot->rm_lock);

    if (send_now) {
        putnext(q, mp);
        return;
    }

    /*
     * Still flow controlled and not closing: park the message on the
     * queue for the service procedure.
     */
    (void) putq(q, mp);
}

/*
 * Write-side handling of non-protocol messages.
 *
 * The RPC_CLIENT and RPC_SERVER ioctls set the slot type and are
 * acknowledged here.  Every other ioctl, and every other message type,
 * is passed downstream unchanged.
 */
static void
rpcmodwput_other(queue_t *q, mblk_t *mp)
{
    struct rpcm *slot = (struct rpcm *)q->q_ptr;
    struct iocblk *ioc;

    ASSERT(slot != NULL);

    if (mp->b_datap->db_type == M_IOCTL) {
        ioc = (struct iocblk *)mp->b_rptr;
        ASSERT(ioc != NULL);

        if (ioc->ioc_cmd == RPC_CLIENT || ioc->ioc_cmd == RPC_SERVER) {
            /* Record the stream's role and acknowledge. */
            mutex_enter(&slot->rm_lock);
            slot->rm_type = ioc->ioc_cmd;
            mutex_exit(&slot->rm_lock);

            mp->b_datap->db_type = M_IOCACK;
            qreply(q, mp);
            return;
        }
    }

    /*
     * Not ours: pass it downstream and hope someone down there knows
     * how to handle it.
     */
    putnext(q, mp);
}

/*
 * Connectionless write service procedure; runs when downstream modules
 * back-enable the queue during flow control.
 *
 * Drains queued messages downstream.  Server streams send without
 * consulting flow control; on other streams a message that cannot be
 * sent is put back and draining stops.
 */
void
rpcmodwsrv(queue_t *q)
{
    struct rpcm *slot = (struct rpcm *)q->q_ptr;
    mblk_t *mp = NULL;

    ASSERT(slot != NULL);

    for (;;) {
        mp = getq(q);
        if (mp == NULL)
            break;

        /*
         * The server-side check comes first so that canputnext()
         * is not called at all for server streams.
         */
        if (slot->rm_type != RPC_SERVER && !canputnext(q)) {
            (void) putbq(q, mp);
            return;
        }
        putnext(q, mp);
    }
}

/*
 * Release hook installed in rpc_rele by rpcmodopen().
 *
 * Frees the message, if one is supplied, and drops one reference on the
 * slot.  When the last reference goes away while the slot is closing,
 * the thread waiting in rpcmodclose() is woken up.
 */
static void
rpcmod_release(queue_t *q, mblk_t *bp)
{
    struct rpcm *slot;

    /*
     * For now, just free the message.
     */
    if (bp != NULL) {
        freemsg(bp);
    }

    slot = (struct rpcm *)q->q_ptr;

    mutex_enter(&slot->rm_lock);
    slot->rm_ref--;
    if ((slot->rm_state & RM_CLOSING) && slot->rm_ref == 0)
        cv_broadcast(&slot->rm_cwait);
    mutex_exit(&slot->rm_lock);
}

/*
 * This part of rpcmod is pushed on a connection-oriented transport for use
 * by RPC.  It serves to bypass the Stream head, implements
 * the record marking protocol, and dispatches incoming RPC messages.
 */

/* Default idle timer values (milliseconds) */
#define MIR_CLNT_IDLE_TIMEOUT   (5 * (60 * 1000L))  /* 5 minutes */
#define MIR_SVC_IDLE_TIMEOUT    (6 * (60 * 1000L))  /* 6 minutes */
#define MIR_SVC_ORDREL_TIMEOUT  (10 * (60 * 1000L)) /* 10 minutes */
#define MIR_LASTFRAG    0x80000000  /* Record marker */

/*
 * Number of data bytes in a message: msgdsize() for a chain, otherwise
 * the length of the single block.  The argument is evaluated more than
 * once, so it must be free of side effects.
 */
#define DLEN(mp)    \
    ((mp)->b_cont ? msgdsize(mp) : ((mp)->b_wptr - (mp)->b_rptr))

/*
 * True when no kRPC reference is outstanding and the read service
 * procedure is neither running nor scheduled.
 */
#define MIR_SVC_QUIESCED(mir)   \
    ((mir)->mir_ref_cnt == 0 && (mir)->mir_inrservice == 0)

/*
 * Clear the "in read service" flag and, on a closing server stream,
 * wake up the thread waiting on mir_condvar.
 *
 * This expands to a brace block rather than do { } while (0); it is
 * left in that form so that existing call sites keep compiling.
 */
#define MIR_CLEAR_INRSRV(mir_ptr)   {   \
    (mir_ptr)->mir_inrservice = 0;  \
    if ((mir_ptr)->mir_type == RPC_SERVER &&    \
        (mir_ptr)->mir_closing) \
        cv_signal(&(mir_ptr)->mir_condvar); \
}

/*
 * Don't block service procedure (and mir_close) if
 * we are in the process of closing.
 */
#define MIR_WCANPUTNEXT(mir_ptr, write_q)   \
    (canputnext(write_q) || ((mir_ptr)->mir_svc_no_more_msgs == 1))

/* Forward declarations for the connection-oriented (mir_*) routines. */
static int  mir_clnt_dup_request(queue_t *q, mblk_t *mp);
static void mir_rput_proto(queue_t *q, mblk_t *mp);
static int  mir_svc_policy_notify(queue_t *q, int event);
static void mir_svc_release(queue_t *wq, mblk_t *mp);
static void mir_svc_start(queue_t *wq);
static void mir_svc_idle_start(queue_t *, mir_t *);
static void mir_svc_idle_stop(queue_t *, mir_t *);
static void mir_svc_start_close(queue_t *, mir_t *);
static void mir_clnt_idle_do_stop(queue_t *);
static void mir_clnt_idle_stop(queue_t *, mir_t *);
static void mir_clnt_idle_start(queue_t *, mir_t *);
static void mir_wput(queue_t *q, mblk_t *mp);
static void mir_wput_other(queue_t *q, mblk_t *mp);
static void mir_wsrv(queue_t *q);
static  void    mir_disconnect(queue_t *, mir_t *ir);
static  int mir_check_len(queue_t *, int32_t, mblk_t *);
static  void    mir_timer(void *);

/*
 * Function-pointer hooks defined elsewhere.
 * NOTE(review): presumably filled in by this module so the rest of kRPC
 * can call back into it, as rpc_rele is in rpcmodopen() -- confirm.
 */
extern void (*mir_rele)(queue_t *, mblk_t *);
extern void (*mir_start)(queue_t *);
extern void (*clnt_stop_idle)(queue_t *);

/* Idle timeouts in milliseconds; copied into mir_idle_timeout. */
clock_t clnt_idle_timeout = MIR_CLNT_IDLE_TIMEOUT;
clock_t svc_idle_timeout = MIR_SVC_IDLE_TIMEOUT;

/*
 * Timeout for subsequent notifications of idle connection.  This is
 * typically used to clean up after a wedged orderly release.
 */
clock_t svc_ordrel_timeout = MIR_SVC_ORDREL_TIMEOUT; /* milliseconds */

extern  uint_t  *clnt_max_msg_sizep;
extern  uint_t  *svc_max_msg_sizep;
/* Message size limits; mir_max_msg_sizep points at one of these. */
uint_t  clnt_max_msg_size = RPC_MAXDATASIZE;
uint_t  svc_max_msg_size = RPC_MAXDATASIZE;
uint_t  mir_krpc_cell_null;

/*
 * Cancel the idle timer of a stream, if one is pending.
 *
 * Must be called with mir_mutex held.  The mutex is dropped around the
 * untimeout() call and held again on return.
 */
static void
mir_timer_stop(mir_t *mir)
{
    timeout_id_t pending;

    ASSERT(MUTEX_HELD(&mir->mir_mutex));

    /*
     * mir_mutex has to be released to call untimeout(), so other
     * threads must be kept from starting or stopping the timer (and
     * thereby changing mir_timer_id) in the meantime.  The
     * mir_timer_call bit together with mir_timer_cv serializes such
     * callers.  A set mir_timer_call bit also tells mir_timer() (see
     * the comments there) that it has nothing to do.
     */
    while (mir->mir_timer_call) {
        cv_wait(&mir->mir_timer_cv, &mir->mir_mutex);
    }
    mir->mir_timer_call = B_TRUE;

    pending = mir->mir_timer_id;
    if (pending != 0) {
        mir->mir_timer_id = 0;
        mutex_exit(&mir->mir_mutex);
        (void) untimeout(pending);
        mutex_enter(&mir->mir_mutex);
    }

    /* Let the next starter or stopper proceed. */
    mir->mir_timer_call = B_FALSE;
    cv_broadcast(&mir->mir_timer_cv);
}

/*
 * (Re)arm the timer for this stream.
 *
 * Any timer currently recorded in mir->mir_timer_id is cancelled, and,
 * unless the stream is closing, a new timeout that calls mir_timer(q)
 * is scheduled "intrvl" milliseconds from now.
 *
 * The caller must hold mir_mutex.  The lock is dropped around the
 * untimeout() call and re-acquired before returning.  Exclusion
 * against concurrent start/stop requests is provided by the
 * mir_timer_call flag and mir_timer_cv, exactly as in mir_timer_stop().
 */
static void
mir_timer_start(queue_t *q, mir_t *mir, clock_t intrvl)
{
    timeout_id_t tid;

    ASSERT(MUTEX_HELD(&mir->mir_mutex));

    while (mir->mir_timer_call)
        cv_wait(&mir->mir_timer_cv, &mir->mir_mutex);
    mir->mir_timer_call = B_TRUE;

    if ((tid = mir->mir_timer_id) != 0) {
        /*
         * Clear the recorded id before cancelling it, as
         * mir_timer_stop() does.  Otherwise, when the stream is
         * closing and no replacement timer is armed below, the
         * cancelled id would be left behind in mir_timer_id and
         * handed to untimeout() again by a later stop request.
         */
        mir->mir_timer_id = 0;
        mutex_exit(&mir->mir_mutex);
        (void) untimeout(tid);
        mutex_enter(&mir->mir_mutex);
    }
    /* Only start the timer when it is not closing. */
    if (!mir->mir_closing) {
        mir->mir_timer_id = timeout(mir_timer, q,
            MSEC_TO_TICK(intrvl));
    }
    mir->mir_timer_call = B_FALSE;
    cv_broadcast(&mir->mir_timer_cv);
}

/*
 * Report whether a message with the same transaction id as "mp" is
 * already sitting on queue "q".
 *
 * The id is the big-endian 32-bit value at byte offset 4 from b_rptr
 * of each message (presumably just past the record mark -- confirm
 * against the sender).  Returns 1 when a match is found, 0 otherwise.
 * The caller must hold mir_mutex.
 */
static int
mir_clnt_dup_request(queue_t *q, mblk_t *mp)
{
    mblk_t  *queued;
    uint32_t  wanted_xid;
    uint32_t  queued_xid;

    ASSERT(MUTEX_HELD(&((mir_t *)q->q_ptr)->mir_mutex));
    wanted_xid = BE32_TO_U32(&mp->b_rptr[4]);

    /*
     * Not pretty: this walks the STREAMS list of flow-controlled
     * messages directly through q_first/b_next.
     */
    for (queued = q->q_first; queued != NULL; queued = queued->b_next) {
        queued_xid = BE32_TO_U32(&queued->b_rptr[4]);
        if (queued_xid == wanted_xid)
            return (1);
    }
    return (0);
}

/*
 * Close routine for the module.
 *
 * Discards any partially assembled inbound message, marks the stream
 * as closing, stops the timer and, for a server stream, waits until
 * all outstanding requests have drained before turning the queue
 * procedures off and notifying KRPC.  Finally the per-stream mir_t is
 * torn down and freed.  Always returns 0.
 *
 * Must be entered without mir_mutex held.
 */
static int
mir_close(queue_t *q)
{
    mir_t   *mir;
    mblk_t  *mp;
    bool_t queue_cleaned = FALSE;

    RPCLOG(32, "rpcmod: mir_close of q 0x%p\n", (void *)q);
    mir = (mir_t *)q->q_ptr;
    ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
    mutex_enter(&mir->mir_mutex);
    if ((mp = mir->mir_head_mp) != NULL) {
        /*
         * Drop the partial message.  mir_tail_mp points into the
         * same chain, so it must be cleared together with
         * mir_head_mp rather than left referring to freed memory.
         */
        mir->mir_head_mp = NULL;
        mir->mir_tail_mp = NULL;
        freemsg(mp);
    }
    /*
     * Set mir_closing so we get notified when MIR_SVC_QUIESCED()
     * is TRUE.  And mir_timer_start() won't start the timer again.
     */
    mir->mir_closing = B_TRUE;
    mir_timer_stop(mir);

    if (mir->mir_type == RPC_SERVER) {
        flushq(q, FLUSHDATA);   /* Ditch anything waiting on read q */

        /*
         * This will prevent more requests from arriving and
         * will force rpcmod to ignore flow control.
         */
        mir_svc_start_close(WR(q), mir);

        /*
         * Wait for the stream to quiesce and for the write service
         * procedure to finish.  mir_svc_idle_start() signals
         * mir_condvar once that is the case.
         */
        while ((!MIR_SVC_QUIESCED(mir)) || mir->mir_inwservice == 1) {

            if (mir->mir_ref_cnt && !mir->mir_inrservice &&
                (queue_cleaned == FALSE)) {
                /*
                 * call into SVC to clean the queue; this is
                 * done at most once and without mir_mutex.
                 */
                mutex_exit(&mir->mir_mutex);
                svc_queueclean(q);
                queue_cleaned = TRUE;
                mutex_enter(&mir->mir_mutex);
                continue;
            }

            /*
             * Bugid 1253810 - Force the write service
             * procedure to send its messages, regardless
             * whether the downstream  module is ready
             * to accept data.
             */
            if (mir->mir_inwservice == 1)
                qenable(WR(q));

            cv_wait(&mir->mir_condvar, &mir->mir_mutex);
        }

        mutex_exit(&mir->mir_mutex);
        qprocsoff(q);

        /* Notify KRPC that this stream is going away. */
        svc_queueclose(q);
    } else {
        mutex_exit(&mir->mir_mutex);
        qprocsoff(q);
    }

    /* Queue procedures are off; release the per-stream state. */
    mutex_destroy(&mir->mir_mutex);
    cv_destroy(&mir->mir_condvar);
    cv_destroy(&mir->mir_timer_cv);
    kmem_free(mir, sizeof (mir_t));
    return (0);
}

/*
 * This is server side only (RPC_SERVER).
 *
 * Exit idle mode by stopping the timer for this stream.
 *
 * "q" must be the write-side queue and the caller must hold mir_mutex.
 * Note that mir_timer_stop() may drop and re-acquire mir_mutex while
 * it cancels the timeout.
 */
static void
mir_svc_idle_stop(queue_t *q, mir_t *mir)
{
    ASSERT(MUTEX_HELD(&mir->mir_mutex));
    ASSERT((q->q_flag & QREADR) == 0);
    ASSERT(mir->mir_type == RPC_SERVER);
    RPCLOG(16, "rpcmod: mir_svc_idle_stop of q 0x%p\n", (void *)q);

    mir_timer_stop(mir);
}

/*
 * This is server side only (RPC_SERVER).
 *
 * Enter idle mode.  For a stream that is being closed this only wakes
 * up the thread waiting in mir_close(); otherwise the idle timer is
 * (re)started.
 *
 * "q" must be the write-side queue and the caller must hold mir_mutex.
 */
static void
mir_svc_idle_start(queue_t *q, mir_t *mir)
{
    clock_t interval;

    ASSERT(MUTEX_HELD(&mir->mir_mutex));
    ASSERT((q->q_flag & QREADR) == 0);
    ASSERT(mir->mir_type == RPC_SERVER);
    RPCLOG(16, "rpcmod: mir_svc_idle_start q 0x%p\n", (void *)q);

    if (mir->mir_closing) {
        /* The queues are being closed: never re-arm the idle timer. */
        RPCLOG(16, "mir_svc_idle_start - closing: 0x%p\n",
            (void *)q);

        /*
         * This routine is called whenever MIR_SVC_QUIESCED() is
         * true.  If the stream is being closed at that point, let
         * any thread sleeping in mir_close() know -- but only once
         * the write service procedure is no longer pending.
         */
        if (mir->mir_inwservice == 0)
            cv_signal(&mir->mir_condvar);
        return;
    }

    RPCLOG(16, "mir_svc_idle_start - reset %s timer\n",
        mir->mir_ordrel_pending ? "ordrel" : "normal");

    /*
     * Normal case.  Once an orderly release has been sent we only
     * wait svc_ordrel_timeout for the client to close its side of
     * the connection; otherwise the regular idle timeout applies.
     */
    if (mir->mir_ordrel_pending)
        interval = svc_ordrel_timeout;
    else
        interval = mir->mir_idle_timeout;

    mir_timer_start(q, mir, interval);
}

/*
 * Open routine for the module: allocate and initialize the per-stream
 * mir_t, attach it to both queues of the pair and turn the queue
 * procedures on.  Always returns 0.
 */
/* ARGSUSED */
static int
mir_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
{
    mir_t   *mir;

    RPCLOG(32, "rpcmod: mir_open of q 0x%p\n", (void *)q);

    /* Fill in any of the hooks used directly by KRPC that are unset. */
    if (mir_rele == NULL)
        mir_rele = mir_svc_release;
    if (mir_start == NULL)
        mir_start = mir_svc_start;
    if (clnt_stop_idle == NULL)
        clnt_stop_idle = mir_clnt_idle_do_stop;
    if (clnt_max_msg_sizep == NULL)
        clnt_max_msg_sizep = &clnt_max_msg_size;
    if (svc_max_msg_sizep == NULL)
        svc_max_msg_sizep = &svc_max_msg_size;

    /* The per-stream state starts out zero-filled. */
    mir = kmem_zalloc(sizeof (mir_t), KM_SLEEP);

    /*
     * Hold inbound messages on the read-side queue until the stream
     * has been fully initialized by an RPC_CLIENT or RPC_SERVER
     * ioctl.  The ioctl processing clears the flag and delivers to
     * KRPC whatever arrived between the open and the ioctl.
     *
     * A client stream should never see early data, because servers
     * only answer our requests and we send none until the stream is
     * initialized.  On a server stream early data is very common:
     * the client starts sending as soon as the connection is made
     * (especially with TCP, where the protocol accepts the
     * connection before nfsd or KRPC hears about it).
     */
    mir->mir_hold_inbound = 1;

    /*
     * Prime the record marker to look for a 4-byte header.  A
     * negative length means that many header bytes are still to be
     * consumed; a positive length is the number of bytes that have
     * arrived for the current fragment and are being held in
     * mir_header_mp.
     */
    mir->mir_frag_len = -(int32_t)sizeof (uint32_t);

    mir->mir_zoneid = rpc_zoneid();
    mutex_init(&mir->mir_mutex, NULL, MUTEX_DEFAULT, NULL);
    cv_init(&mir->mir_condvar, NULL, CV_DRIVER, NULL);
    cv_init(&mir->mir_timer_cv, NULL, CV_DRIVER, NULL);

    /* Both halves of the queue pair share the same state. */
    q->q_ptr = (char *)mir;
    WR(q)->q_ptr = (char *)mir;

    /*
     * putq must not schedule the read-side queue automatically, so
     * it is noenable'd here and enabled explicitly from mir_wsrv
     * when appropriate.  (See the flow control comments at the
     * beginning of mir_rsrv.)
     */
    noenable(q);

    qprocson(q);
    return (0);
}

/*
 * Read-side put routine for both the client and server side.  Does the
 * record marking for incoming RPC messages, and when complete, dispatches
 * the message to either the client or server.
 *
 *  q   - read-side queue of the stream
 *  mp  - inbound STREAMS message; ownership passes to this routine
 *  srv - when a completed server message has to be held because
 *        mir_hold_inbound is set, a non-zero value puts it back on
 *        the queue with putbq(), zero appends it with putq()
 *
 * Must be entered without mir_mutex held.  The mutex is taken for the
 * record-marking work and released before returning.  The early
 * returns that directly follow mir_disconnect() or a non-zero return
 * from mir_check_len() do not release it themselves; presumably those
 * routines drop mir_mutex on that path (their bodies are not part of
 * this function).
 */
static void
mir_do_rput(queue_t *q, mblk_t *mp, int srv)
{
    mblk_t  *cont_mp;
    int excess;
    int32_t frag_len;
    int32_t frag_header;
    mblk_t  *head_mp;
    int len;
    mir_t   *mir;
    mblk_t  *mp1;
    unsigned char   *rptr;
    mblk_t  *tail_mp;
    unsigned char   *wptr;
    boolean_t   stop_timer = B_FALSE;

    mir = (mir_t *)q->q_ptr;
    ASSERT(mir != NULL);

    /*
     * If the stream has not been set up as a RPC_CLIENT or RPC_SERVER
     * with the corresponding ioctl, then don't accept
     * any inbound data.  This should never happen for streams
     * created by nfsd or client-side KRPC because they are careful
     * to set the mode of the stream before doing anything else.
     */
    if (mir->mir_type == 0) {
        freemsg(mp);
        return;
    }

    ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));

    switch (mp->b_datap->db_type) {
    case M_DATA:
        break;
    case M_PROTO:
    case M_PCPROTO:
        rptr = mp->b_rptr;
        if (mp->b_wptr - rptr < sizeof (uint32_t)) {
            RPCLOG(1, "mir_rput: runt TPI message (%d bytes)\n",
                (int)(mp->b_wptr - rptr));
            freemsg(mp);
            return;
        }
        if (((union T_primitives *)rptr)->type != T_DATA_IND) {
            mir_rput_proto(q, mp);
            return;
        }

        /* Throw away the T_DATA_IND block and continue with data. */
        mp1 = mp;
        mp = mp->b_cont;
        freeb(mp1);

        /*
         * A T_DATA_IND that carried no data block leaves nothing
         * to process; the loop below would dereference a NULL mp.
         */
        if (mp == NULL)
            return;
        break;
    case M_SETOPTS:
        /*
         * If a module on the stream is trying to set the Stream
         * head's high water mark, then set our hiwater to the
         * requested value.  We are the "stream head" for all inbound
         * data messages since messages are passed directly to KRPC.
         */
        if ((mp->b_wptr - mp->b_rptr) >= sizeof (struct stroptions)) {
            struct stroptions   *stropts;

            stropts = (struct stroptions *)mp->b_rptr;
            if ((stropts->so_flags & SO_HIWAT) &&
                !(stropts->so_flags & SO_BAND)) {
                (void) strqset(q, QHIWAT, 0, stropts->so_hiwat);
            }
        }
        putnext(q, mp);
        return;
    case M_FLUSH:
        RPCLOG(32, "mir_do_rput: ignoring M_FLUSH on q 0x%p. ",
            (void *)q);
        RPCLOG(32, "M_FLUSH is %x\n", (uint_t)*mp->b_rptr);

        putnext(q, mp);
        return;
    default:
        putnext(q, mp);
        return;
    }

    mutex_enter(&mir->mir_mutex);

    /*
     * If this connection is closing, don't accept any new messages.
     */
    if (mir->mir_svc_no_more_msgs) {
        ASSERT(mir->mir_type == RPC_SERVER);
        mutex_exit(&mir->mir_mutex);
        freemsg(mp);
        return;
    }

    /* Get local copies for quicker access. */
    frag_len = mir->mir_frag_len;
    frag_header = mir->mir_frag_header;
    head_mp = mir->mir_head_mp;
    tail_mp = mir->mir_tail_mp;

    /* Loop, processing each message block in the mp chain separately. */
    do {
        /*
         * cont_mp is used in the do/while condition below to
         * walk to the next block in the STREAMS message.
         * mp->b_cont may be nil'ed during processing so we
         * can't rely on it to find the next block.
         */
        cont_mp = mp->b_cont;

        /*
         * Get local copies of rptr and wptr for our processing.
         * These always point into "mp" (the current block being
         * processed), but rptr is updated as we consume any
         * record header in this message, and wptr is updated to
         * point to the end of the data for the current fragment,
         * if it ends in this block.  The main point is that
         * they are not always the same as b_rptr and b_wptr.
         * b_rptr and b_wptr will be updated when appropriate.
         */
        rptr = mp->b_rptr;
        wptr = mp->b_wptr;
same_mblk:;
        len = (int)(wptr - rptr);
        if (len <= 0) {
            /*
             * If we have processed all of the data in the message
             * or the block is empty to begin with, then we're
             * done with this block and can go on to cont_mp,
             * if there is one.
             *
             * First, we check to see if the current block is
             * now zero-length and, if so, we free it.
             * This happens when either the block was empty
             * to begin with or we consumed all of the data
             * for the record marking header.
             */
            if (rptr <= mp->b_rptr) {
                /*
                 * If head_mp is non-NULL, add cont_mp to the
                 * mblk list. XXX But there is a possibility
                 * that tail_mp = mp or even head_mp = mp XXX
                 */
                if (head_mp) {
                    if (head_mp == mp)
                        head_mp = NULL;
                    else if (tail_mp != mp) {
                        ASSERT((tail_mp->b_cont == NULL) ||
                            (tail_mp->b_cont == mp));
                        tail_mp->b_cont = cont_mp;
                        /*
                         * It's possible that, because
                         * of a very short mblk (0-3
                         * bytes), we've ended up here
                         * and that cont_mp could be
                         * NULL (if we're at the end
                         * of an mblk chain). If so,
                         * don't set tail_mp to
                         * cont_mp, because the next
                         * time we access it, we'll
                         * dereference a NULL pointer
                         * and crash. Just leave
                         * tail_mp pointing at the
                         * current end of chain.
                         */
                        if (cont_mp)
                            tail_mp = cont_mp;
                    } else {
                        mblk_t *smp = head_mp;

                        while ((smp->b_cont != NULL) &&
                            (smp->b_cont != mp))
                            smp = smp->b_cont;
                        smp->b_cont = cont_mp;
                        /*
                         * Don't set tail_mp to cont_mp
                         * if it's NULL. Instead, set
                         * tail_mp to smp, which is the
                         * end of the chain starting
                         * at head_mp.
                         */
                        if (cont_mp)
                            tail_mp = cont_mp;
                        else
                            tail_mp = smp;
                    }
                }
                freeb(mp);
            }
            continue;
        }

        /*
         * frag_len starts at -4 and is incremented past the record
         * marking header to 0, and then becomes positive as real data
         * bytes are received for the message.  While frag_len is less
         * than zero, we need more bytes for the record marking
         * header.
         */
        if (frag_len < 0) {
            uchar_t *up = rptr;
            /*
             * Collect as many bytes as we need for the record
             * marking header and that are available in this block.
             */
            do {
                --len;
                frag_len++;
                frag_header <<= 8;
                frag_header += (*up++ & 0xFF);
            } while (len > 0 && frag_len < 0);

            if (rptr == mp->b_rptr) {
                /*
                 * The record header is located at the
                 * beginning of the block, so just walk
                 * b_rptr past it.
                 */
                mp->b_rptr = rptr = up;
            } else {
                /*
                 * The record header is located in the middle
                 * of a block, so copy any remaining data up.
                 * This happens when an RPC message is
                 * fragmented into multiple pieces and
                 * a middle (or end) fragment immediately
                 * follows a previous fragment in the same
                 * message block.
                 */
                wptr = &rptr[len];
                mp->b_wptr = wptr;
                if (len) {
                    RPCLOG(32, "mir_do_rput: copying %d "
                        "bytes of data up", len);
                    RPCLOG(32, " db_ref %d\n",
                        (uint_t)mp->b_datap->db_ref);
                    bcopy(up, rptr, len);
                }
            }

            /*
             * If we haven't received the complete record header
             * yet, then loop around to get the next block in the
             * STREAMS message. The logic at same_mblk label will
             * free the current block if it has become empty.
             */
            if (frag_len < 0) {
                RPCLOG(32, "mir_do_rput: frag_len is still < 0 "
                "(%d)", len);
                goto same_mblk;
            }

#ifdef  RPCDEBUG
            if ((frag_header & MIR_LASTFRAG) == 0) {
                RPCLOG0(32, "mir_do_rput: multi-fragment "
                    "record\n");
            }
            {
                uint_t l = frag_header & ~MIR_LASTFRAG;

                if (l != 0 && mir->mir_max_msg_sizep &&
                    l >= *mir->mir_max_msg_sizep) {
                    RPCLOG(32, "mir_do_rput: fragment size"
                        " (%d) > maximum", l);
                    RPCLOG(32, " (%u)\n",
                        *mir->mir_max_msg_sizep);
                }
            }
#endif
            /*
             * At this point we have retrieved the complete record
             * header for this fragment.  If the current block is
             * empty, then we need to free it and walk to the next
             * block.
             */
            if (mp->b_rptr >= wptr) {
                /*
                 * If this is not the last fragment or if we
                 * have not received all the data for this
                 * RPC message, then loop around to the next
                 * block.
                 */
                if (!(frag_header & MIR_LASTFRAG) ||
                    (frag_len -
                    (frag_header & ~MIR_LASTFRAG)) ||
                    !head_mp)
                    goto same_mblk;

                /*
                 * Quick walk to next block in the
                 * STREAMS message.
                 */
                freeb(mp);
                continue;
            }
        }

        /*
         * We've collected the complete record header.  The data
         * in the current block is added to the end of the RPC
         * message.  Note that tail_mp is the same as mp after
         * this linkage.
         */
        if (!head_mp)
            head_mp = mp;
        else if (tail_mp != mp) {
            ASSERT((tail_mp->b_cont == NULL) ||
                (tail_mp->b_cont == mp));
            tail_mp->b_cont = mp;
        }
        tail_mp = mp;

        /*
         * Add the length of this block to the accumulated
         * fragment length.
         */
        frag_len += len;
        excess = frag_len - (frag_header & ~MIR_LASTFRAG);
        /*
         * If we have not received all the data for this fragment,
         * then walk to the next block.
         */
        if (excess < 0)
            continue;

        /*
         * We've received a complete fragment, so reset frag_len
         * for the next one.
         */
        frag_len = -(int32_t)sizeof (uint32_t);

        /*
         * Update rptr to point to the beginning of the next
         * fragment in this block.  If there are no more bytes
         * in the block (excess is 0), then rptr will be equal
         * to wptr.
         */
        rptr = wptr - excess;

        /*
         * Now we check to see if this fragment is the last one in
         * the RPC message.
         */
        if (!(frag_header & MIR_LASTFRAG)) {
            /*
             * This isn't the last one, so start processing the
             * next fragment.
             */
            frag_header = 0;

            /*
             * If excess is 0, the next fragment
             * starts at the beginning of the next block --
             * we "continue" to the end of the while loop and
             * walk to cont_mp.
             */
            if (excess == 0)
                continue;
            RPCLOG0(32, "mir_do_rput: multi-fragment message with "
                "two or more fragments in one mblk\n");

            /*
             * If excess is non-0, then the next fragment starts
             * in this block.  rptr points to the beginning
             * of the next fragment and we "goto same_mblk"
             * to continue processing.
             */
            goto same_mblk;
        }

        /*
         * We've got a complete RPC message.  Before passing it
         * upstream, check to see if there is extra data in this
         * message block. If so, then we separate the excess
         * from the complete message. The excess data is processed
         * after the current message goes upstream.
         */
        if (excess > 0) {
            RPCLOG(32, "mir_do_rput: end of record, but excess "
                "data (%d bytes) in this mblk. dupb/copyb "
                "needed\n", excess);

            /* Duplicate only the overlapping block. */
            mp1 = dupb(tail_mp);

            /*
             * dupb() might have failed due to ref count wrap around
             * so try a copyb().
             */
            if (mp1 == NULL)
                mp1 = copyb(tail_mp);

            /*
             * Do not use bufcall() to schedule a "buffer
             * availability event."  The reason is that
             * bufcall() has problems.  For example, if memory
             * runs out, bufcall() itself will fail since it
             * needs to allocate memory.  The most appropriate
             * action right now is to disconnect this connection
             * as the system is under stress.  We should try to
             * free up resources.
             */
            if (mp1 == NULL) {
                /*
                 * tail_mp->b_cont has not been cut yet, so
                 * this also frees the rest of the chain.
                 */
                freemsg(head_mp);
                RPCLOG0(1, "mir_do_rput: dupb/copyb failed\n");
                mir->mir_frag_header = 0;
                mir->mir_frag_len = -(int)sizeof (uint32_t);
                mir->mir_head_mp = NULL;
                mir->mir_tail_mp = NULL;

                mir_disconnect(q, mir);
                return;
            }

            /*
             * The new message block is linked with the
             * continuation block in cont_mp.  We then point
             * cont_mp to the new block so that we will
             * process it next.
             */
            mp1->b_cont = cont_mp;
            cont_mp = mp1;
            /*
             * Data in the new block begins at the
             * next fragment (rptr).
             */
            cont_mp->b_rptr += (rptr - tail_mp->b_rptr);
            ASSERT(cont_mp->b_rptr >= cont_mp->b_datap->db_base);
            ASSERT(cont_mp->b_rptr <= cont_mp->b_wptr);

            /* Data in the current fragment ends at rptr. */
            tail_mp->b_wptr = rptr;
            ASSERT(tail_mp->b_wptr <= tail_mp->b_datap->db_lim);
            ASSERT(tail_mp->b_wptr >= tail_mp->b_rptr);

        }

        /* tail_mp is the last block with data for this RPC message. */
        tail_mp->b_cont = NULL;

        /* Pass the RPC message to the current consumer. */
        switch (mir->mir_type) {
        case RPC_CLIENT:
            if (clnt_dispatch_notify(head_mp, mir->mir_zoneid)) {
                /*
                 * Mark this stream as active.  This marker
                 * is used in mir_timer().
                 */

                mir->mir_clntreq = 1;
                mir->mir_use_timestamp = lbolt;
            } else
                freemsg(head_mp);
            break;

        case RPC_SERVER:
            /*
             * Check for flow control before passing the
             * message to KRPC.
             */

            if (!mir->mir_hold_inbound) {
                if (mir->mir_krpc_cell) {
                    /*
                     * Sanity check the size of the
                     * complete message, starting from
                     * head_mp (mp is only its last
                     * block).  This is done before the
                     * reference count is raised so that
                     * a rejected message does not leave
                     * a reference behind.  The blocks
                     * in cont_mp were detached from the
                     * message above, so they have to be
                     * released here.
                     */
                    if (mir_check_len(q,
                        (int32_t)msgdsize(head_mp),
                        head_mp)) {
                        if (cont_mp != NULL)
                            freemsg(cont_mp);
                        return;
                    }
                    /*
                     * If the reference count is 0
                     * (not including this request),
                     * then the stream is transitioning
                     * from idle to non-idle.  In this case,
                     * we cancel the idle timer.
                     */
                    if (mir->mir_ref_cnt++ == 0)
                        stop_timer = B_TRUE;
                    svc_queuereq(q, head_mp); /* to KRPC */
                } else {
                    /*
                     * Count # of times this happens. Should
                     * be never, but experience shows
                     * otherwise.
                     */
                    mir_krpc_cell_null++;
                    freemsg(head_mp);
                }

            } else {
                /*
                 * If the outbound side of the stream is
                 * flow controlled, then hold this message
                 * until client catches up. mir_hold_inbound
                 * is set in mir_wput and cleared in mir_wsrv.
                 */
                if (srv)
                    (void) putbq(q, head_mp);
                else
                    (void) putq(q, head_mp);
                mir->mir_inrservice = B_TRUE;
            }
            break;
        default:
            RPCLOG(1, "mir_rput: unknown mir_type %d\n",
                mir->mir_type);
            freemsg(head_mp);
            break;
        }

        /*
         * Reset head_mp and frag_header since we're starting on a
         * new RPC fragment and message.
         */
        head_mp = NULL;
        tail_mp = NULL;
        frag_header = 0;
    } while ((mp = cont_mp) != NULL);

    /*
     * Do a sanity check on the message length.  If this message is
     * getting excessively large, shut down the connection.
     */
    if (head_mp != NULL && mir->mir_setup_complete &&
        mir_check_len(q, frag_len, head_mp))
        return;

    /* Save our local copies back in the mir structure. */
    mir->mir_frag_header = frag_header;
    mir->mir_frag_len = frag_len;
    mir->mir_head_mp = head_mp;
    mir->mir_tail_mp = tail_mp;

    /*
     * The timer is stopped after the whole message chain is processed.
     * The reason is that stopping the timer releases the mir_mutex
     * lock temporarily.  This means that the request can be serviced
     * while we are still processing the message chain.  This is not
     * good.  So we stop the timer here instead.
     *
     * Note that if the timer fires before we stop it, it will not
     * do any harm as MIR_SVC_QUIESCED() is false and mir_timer()
     * will just return;
     */
    if (stop_timer) {
        RPCLOG(16, "mir_do_rput stopping idle timer on 0x%p because "
            "ref cnt going to non zero\n", (void *) WR(q));
        mir_svc_idle_stop(WR(q), mir);
    }
    mutex_exit(&mir->mir_mutex);
}

/*
 * Read-side put entry point: hands the message to mir_do_rput() with
 * srv == 0, so a completed server message that has to be held is
 * appended to the read queue with putq() rather than putbq().
 */
static void
mir_rput(queue_t *q, mblk_t *mp)
{
    mir_do_rput(q, mp, 0);
}

/*
 * Handle an inbound TPI message other than T_DATA_IND.
 *
 * Depending on the stream type and the primitive, the message is
 * consumed here, handed to the client-side KRPC notification routines,
 * held until the server stream is idle, or passed on with putnext().
 * "q" is the read-side queue; the caller has already checked that the
 * message is long enough to hold the primitive type.  Must be entered
 * without mir_mutex held.
 */
static void
mir_rput_proto(queue_t *q, mblk_t *mp)
{
    mir_t   *mir = (mir_t *)q->q_ptr;
    uint32_t    type;
    uint32_t reason = 0;

    ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));

    type = ((union T_primitives *)mp->b_rptr)->type;
    switch (mir->mir_type) {
    case RPC_CLIENT:
        switch (type) {
        case T_DISCON_IND:
            reason = ((struct T_discon_ind *)
                (mp->b_rptr))->DISCON_reason;
            /*FALLTHROUGH*/
        case T_ORDREL_IND:
            mutex_enter(&mir->mir_mutex);

            /* Any partially assembled reply is now useless. */
            if (mir->mir_head_mp != NULL) {
                freemsg(mir->mir_head_mp);
                mir->mir_head_mp = NULL;
                mir->mir_tail_mp = NULL;
            }

            /*
             * The stream is being disconnected but not
             * necessarily closed.  Without a close, a changed
             * global timeout value would never be picked up,
             * so latch the current value now.
             */
            mir->mir_idle_timeout = clnt_idle_timeout;
            mir_clnt_idle_stop(WR(q), mir);

            /*
             * The idle timer keeps running on the client even
             * though the stream is unconnected.  After a
             * server-side disconnect, reset, or connection
             * timeout the client may retry the RPC request,
             * and that retry has to use the same bound address
             * for the server to recognize it as a retry.  We do
             * not want to wait for that forever, though: if the
             * end-point stays unconnected for mir_idle_timeout
             * units of time, the connection manager takes that
             * as the signal to stop waiting for the application
             * (eg. NFS) to send a retry.
             */
            mir_clnt_idle_start(WR(q), mir);
            mutex_exit(&mir->mir_mutex);

            clnt_dispatch_notifyall(WR(q), type, reason);
            freemsg(mp);
            return;

        case T_ERROR_ACK:
        {
            struct T_error_ack  *eack;

            eack = (struct T_error_ack *)mp->b_rptr;
            RPCLOG(1, "mir_rput_proto T_ERROR_ACK for queue 0x%p",
                (void *)q);
            RPCLOG(1, " ERROR_prim: %s,",
                rpc_tpiprim2name(eack->ERROR_prim));
            RPCLOG(1, " TLI_error: %s,",
                rpc_tpierr2name(eack->TLI_error));
            RPCLOG(1, " UNIX_error: %d\n", eack->UNIX_error);

            /* A failed disconnect request is broadcast. */
            if (eack->ERROR_prim == T_DISCON_REQ)  {
                clnt_dispatch_notifyall(WR(q), type, reason);
                freemsg(mp);
                return;
            }

            /* Otherwise offer it to the connection waiter. */
            if (clnt_dispatch_notifyconn(WR(q), mp))
                return;
            break;
        }

        case T_OK_ACK:
        {
            struct T_ok_ack *oack = (struct T_ok_ack *)mp->b_rptr;

            /* An acknowledged disconnect request is broadcast. */
            if (oack->CORRECT_prim == T_DISCON_REQ) {
                clnt_dispatch_notifyall(WR(q), type, reason);
                freemsg(mp);
                return;
            }

            /* Otherwise offer it to the connection waiter. */
            if (clnt_dispatch_notifyconn(WR(q), mp))
                return;
            break;
        }

        case T_CONN_CON:
        case T_INFO_ACK:
        case T_OPTMGMT_ACK:
            if (clnt_dispatch_notifyconn(WR(q), mp))
                return;
            break;

        case T_BIND_ACK:
            break;

        default:
            RPCLOG(1, "mir_rput: unexpected message %d "
                "for KRPC client\n",
                ((union T_primitives *)mp->b_rptr)->type);
            break;
        }
        break;

    case RPC_SERVER:
        switch (type) {
        case T_BIND_ACK:
        {
            struct T_bind_ack   *back;

            /*
             * A bind ack with a non-zero CONIND_number means
             * this is a listening stream: shut off its idle
             * timer.
             */
            back = (struct T_bind_ack *)mp->b_rptr;
            if (back->CONIND_number > 0) {
                mutex_enter(&mir->mir_mutex);
                mir_svc_idle_stop(WR(q), mir);

                /*
                 * Remember that this is a listen endpoint;
                 * it gets special handling below.
                 */
                mir->mir_listen_stream = 1;
                mutex_exit(&mir->mir_mutex);
            }
            break;
        }

        case T_DISCON_IND:
        case T_ORDREL_IND:
            RPCLOG(16, "mir_rput_proto: got %s indication\n",
                type == T_DISCON_IND ? "disconnect"
                : "orderly release");

            /* A listen endpoint simply passes the message on. */
            if (mir->mir_listen_stream)
                break;

            mutex_enter(&mir->mir_mutex);

            /*
             * Record the fact that the client wants to break
             * off the connection.
             */
            mir_svc_start_close(WR(q), mir);

            /*
             * When idle, the orderly release or disconnect
             * indication goes straight up to nfsd.
             */
            if (MIR_SVC_QUIESCED(mir)) {
                mutex_exit(&mir->mir_mutex);
                break;
            }

            RPCLOG(16, "mir_rput_proto: not idle, so "
                "disconnect/ord rel indication not passed "
                "upstream on 0x%p\n", (void *)q);

            /*
             * Not idle yet: hold the indication until we are.
             * If one is already being held, a new disconnect
             * replaces it, on the reasoning that a disconnect
             * takes less time to process and that once a
             * client has decided to disconnect we should do
             * so.  A new orderly release is simply dropped.
             */
            if (mir->mir_svc_pend_mp == NULL) {
                mir->mir_svc_pend_mp = mp;
            } else if (type == T_DISCON_IND) {
                RPCLOG(16, "mir_rput_proto: replacing"
                    " held disconnect/ord rel"
                    " indication with disconnect on"
                    " 0x%p\n", (void *)q);

                freemsg(mir->mir_svc_pend_mp);
                mir->mir_svc_pend_mp = mp;
            } else {
                RPCLOG(16, "mir_rput_proto: already "
                    "held a disconnect/ord rel "
                    "indication. freeing ord rel "
                    "ind on 0x%p\n", (void *)q);
                freemsg(mp);
            }

            mutex_exit(&mir->mir_mutex);
            return;

        default:
            /* nfsd handles server-side non-data messages. */
            break;
        }
        break;

    default:
        break;
    }

    /* Everything that was not consumed above continues upstream. */
    putnext(q, mp);
}

/*
 * The server-side read queues are used to hold inbound messages while
 * outbound flow control is exerted.  When outbound flow control is
 * relieved, mir_wsrv qenables the read-side queue.  Read-side queues
 * are not enabled by STREAMS and are explicitly noenable'ed in mir_open.
 *
 * For the server side,  we have two types of messages queued. The first type
 * are messages that are ready to be XDR decoded and then sent to the
 * RPC program's dispatch routine. The second type are "raw" messages that
 * haven't been processed, i.e. assembled from rpc record fragments into
 * full requests. The only time we will see the second type of message
 * queued is if we have a memory allocation failure while processing
 * a raw message. The field mir_first_non_processed_mblk will mark the
 * first such raw message. So the flow for server side is:
 *
 *  - send processed queued messages to kRPC until we run out or find
 *    one that needs additional processing because we were short on memory
 *    earlier
 *  - process a message that was deferred because of lack of
 *    memory
 *  - continue processing messages until the queue empties or we
 *    have to stop because of lack of memory
 *  - during each of the above phase, if the queue is empty and
 *    there are no pending messages that were passed to the RPC
 *    layer, send upstream the pending disconnect/ordrel indication if
 *    there is one
 *
 * The read-side queue is also enabled by a bufcall callback if dupmsg
 * fails in mir_rput.
 */
static void
mir_rsrv(queue_t *q)
{
    mir_t   *mir;
    mblk_t  *mp;
    mblk_t  *cmp = NULL;
    boolean_t stop_timer = B_FALSE;

    mir = (mir_t *)q->q_ptr;
    mutex_enter(&mir->mir_mutex);

    mp = NULL;
    switch (mir->mir_type) {
    case RPC_SERVER:
        if (mir->mir_ref_cnt == 0)
            mir->mir_hold_inbound = 0;
        if (mir->mir_hold_inbound) {

            ASSERT(cmp == NULL);
            if (q->q_first == NULL) {

                MIR_CLEAR_INRSRV(mir);

                if (MIR_SVC_QUIESCED(mir)) {
                    cmp = mir->mir_svc_pend_mp;
                    mir->mir_svc_pend_mp = NULL;
                }
            }

            mutex_exit(&mir->mir_mutex);

            if (cmp != NULL) {
                RPCLOG(16, "mir_rsrv: line %d: sending a held "
                    "disconnect/ord rel indication upstream\n",
                    __LINE__);
                putnext(q, cmp);
            }

            return;
        }
        while (mp = getq(q)) {
            if (mir->mir_krpc_cell &&
                (mir->mir_svc_no_more_msgs == 0)) {
                /*
                 * If we were idle, turn off idle timer since
                 * we aren't idle any more.
                 */
                if (mir->mir_ref_cnt++ == 0)
                    stop_timer = B_TRUE;
                if (mir_check_len(q,
                    (int32_t)msgdsize(mp), mp))
                    return;
                svc_queuereq(q, mp);
            } else {
                /*
                 * Count # of times this happens. Should be
                 * never, but experience shows otherwise.
                 */
                if (mir->mir_krpc_cell == NULL)
                    mir_krpc_cell_null++;
                freemsg(mp);
            }
        }
        break;
    case RPC_CLIENT:
        break;
    default:
        RPCLOG(1, "mir_rsrv: unexpected mir_type %d\n", mir->mir_type);

        if (q->q_first == NULL)
            MIR_CLEAR_INRSRV(mir);

        mutex_exit(&mir->mir_mutex);

        return;
    }

    /*
     * The timer is stopped after all the messages are processed.
     * The reason is that stopping the timer releases the mir_mutex
     * lock temporarily.  This means that the request can be serviced
     * while we are still processing the message queue.  This is not
     * good.  So we stop the timer here instead.
     */
    if (stop_timer)  {
        RPCLOG(16, "mir_rsrv stopping idle timer on 0x%p because ref "
            "cnt going to non zero\n", (void *)WR(q));
        mir_svc_idle_stop(WR(q), mir);
    }

    if (q->q_first == NULL) {

        MIR_CLEAR_INRSRV(mir);

        ASSERT(cmp == NULL);
        if (mir->mir_type == RPC_SERVER && MIR_SVC_QUIESCED(mir)) {
            cmp = mir->mir_svc_pend_mp;
            mir->mir_svc_pend_mp = NULL;
        }

        mutex_exit(&mir->mir_mutex);

        if (cmp != NULL) {
            RPCLOG(16, "mir_rsrv: line %d: sending a held "
                "disconnect/ord rel indication upstream\n",
                __LINE__);
            putnext(q, cmp);
        }

        return;
    }
    mutex_exit(&mir->mir_mutex);
}

/* Number of event notifications dropped because allocb() failed. */
static int mir_svc_policy_fails;

/*
 * Deliver an event code to the stream head consumer (nfsd/lockd) so that
 * it initiates connection close.  Callers in this file pass 1 for an idle
 * timeout (mir_timer) and 2 for a forced disconnect (mir_disconnect).
 *
 * q must be the read queue and mir_mutex must not be held.
 * Returns 0 once the event has been passed upstream, or ENOMEM if no
 * message block could be allocated for it.
 */
static int
mir_svc_policy_notify(queue_t *q, int event)
{
    mblk_t  *event_mp;
#ifdef DEBUG
    mir_t *mir = (mir_t *)q->q_ptr;
    ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
#endif
    ASSERT(q->q_flag & QREADR);

    /*
     * The event travels as an M_DATA message whose payload is the
     * big-endian event code.
     */
    event_mp = allocb(sizeof (int), BPRI_HI);
    if (event_mp != NULL) {
        U32_TO_BE32(event, event_mp->b_rptr);
        event_mp->b_wptr = event_mp->b_rptr + sizeof (int);
        putnext(q, event_mp);
        return (0);
    }

    /* Out of memory: account for the lost notification and report it. */
    mir_svc_policy_fails++;
    RPCLOG(16, "mir_svc_policy_notify: could not allocate event "
        "%d\n", event);
    return (ENOMEM);
}

/*
 * Server side: start the close phase. We want to get this rpcmod slot in an
 * idle state before mir_close() is called.
 *
 * wq is the write queue of a server stream; the caller holds mir_mutex.
 * Marks the slot as refusing new requests, lifts the inbound hold, and
 * enables both service procedures so queued work is drained.
 */
static void
mir_svc_start_close(queue_t *wq, mir_t *mir)
{
    ASSERT(MUTEX_HELD(&mir->mir_mutex));
    ASSERT((wq->q_flag & QREADR) == 0);
    ASSERT(mir->mir_type == RPC_SERVER);


    /*
     * Do not accept any more messages.
     */
    mir->mir_svc_no_more_msgs = 1;

    /*
     * Next two statements will make the read service procedure invoke
     * svc_queuereq() on everything stuck in the streams read queue.
     * It's not necessary because enabling the write queue will
     * have the same effect, but why not speed the process along?
     *
     * NOTE(review): with mir_svc_no_more_msgs set just above, mir_rsrv
     * frees queued messages rather than calling svc_queuereq(); confirm
     * which behavior is intended.
     */
    mir->mir_hold_inbound = 0;
    qenable(RD(wq));

    /*
     * Meanwhile force the write service procedure to send the
     * responses downstream, regardless of flow control.
     */
    qenable(wq);
}

/*
 * Called directly by KRPC once it has finished with a request, whether
 * a reply was sent or the request was dropped.
 *
 * wq is the write queue of the stream; mp, if non-NULL, is a message the
 * caller no longer needs and is freed here.  Drops one reference on the
 * slot.  When that was the last reference and the read service routine is
 * not active, idle processing is started and any held disconnect/ordrel
 * indication is passed upstream after mir_mutex has been released.
 */
static void
mir_svc_release(queue_t *wq, mblk_t *mp)
{
    mir_t   *mir = (mir_t *)wq->q_ptr;
    mblk_t  *held_ind = NULL;

    ASSERT((wq->q_flag & QREADR) == 0);
    if (mp != NULL)
        freemsg(mp);

    mutex_enter(&mir->mir_mutex);

    /*
     * We are about to drop the final reference: claim the held
     * indication and kick off idle processing before the count is
     * decremented.
     */
    if (mir->mir_ref_cnt == 1 && !mir->mir_inrservice) {

        RPCLOG(16, "mir_svc_release starting idle timer on 0x%p "
            "because ref cnt is zero\n", (void *) wq);

        held_ind = mir->mir_svc_pend_mp;
        mir->mir_svc_pend_mp = NULL;
        mir_svc_idle_start(wq, mir);
    }

    mir->mir_ref_cnt--;
    ASSERT(mir->mir_ref_cnt >= 0);

    /* A closing thread is waiting for the count to reach zero. */
    if (mir->mir_closing && mir->mir_ref_cnt == 0)
        cv_signal(&mir->mir_condvar);

    mutex_exit(&mir->mir_mutex);

    if (held_ind == NULL)
        return;

    RPCLOG(16, "mir_svc_release: sending a held "
        "disconnect/ord rel indication upstream on queue 0x%p\n",
        (void *)RD(wq));

    putnext(RD(wq), held_ind);
}

/*
 * This routine is called by server-side KRPC when it is ready to
 * handle inbound messages on the stream.
 *
 * wq is the write queue of the stream.  Records that setup is complete
 * and enables the read-side service procedure.
 */
static void
mir_svc_start(queue_t *wq)
{
    mir_t   *mir = (mir_t *)wq->q_ptr;

    /*
     * no longer need to take the mir_mutex because the
     * mir_setup_complete field has been moved out of
     * the binary field protected by the mir_mutex.
     */

    mir->mir_setup_complete = 1;
    /* Let mir_rsrv run on the read queue. */
    qenable(RD(wq));
}

/*
 * Client-side wrapper for stopping the idle timer.
 *
 * wq must be the write queue of a client stream and the caller must hold
 * mir_mutex.
 */
static void
mir_clnt_idle_stop(queue_t *wq, mir_t *mir)
{
    ASSERT(MUTEX_HELD(&mir->mir_mutex));
    ASSERT((wq->q_flag & QREADR) == 0);
    ASSERT(mir->mir_type == RPC_CLIENT);

    mir_timer_stop(mir);
}

/*
 * Client-side wrapper for starting the timer with the stream's current
 * idle timeout (mir_idle_timeout).
 *
 * wq must be the write queue of a client stream and the caller must hold
 * mir_mutex.
 */
static void
mir_clnt_idle_start(queue_t *wq, mir_t *mir)
{
    ASSERT(MUTEX_HELD(&mir->mir_mutex));
    ASSERT((wq->q_flag & QREADR) == 0);
    ASSERT(mir->mir_type == RPC_CLIENT);

    mir_timer_start(wq, mir, mir->mir_idle_timeout);
}

/*
 * Client side only. Forces rpcmod to stop sending T_ORDREL_REQs on
 * end-points that aren't connected.
 *
 * wq is the write queue of a client stream.  The caller must not hold
 * mir_mutex; it is taken here around the call that stops the idle timer.
 */
static void
mir_clnt_idle_do_stop(queue_t *wq)
{
    mir_t   *mir = (mir_t *)wq->q_ptr;

    RPCLOG(1, "mir_clnt_idle_do_stop: wq 0x%p\n", (void *)wq);
    ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
    mutex_enter(&mir->mir_mutex);
    mir_clnt_idle_stop(wq, mir);
    mutex_exit(&mir->mir_mutex);
}

/*
 * Timer handler.  It handles idle timeout and memory shortage problem.
 *
 * arg is the write queue of the stream whose timer fired.  mir_mutex is
 * taken here and is always released before anything is sent upstream or
 * reported to KRPC.
 */
static void
mir_timer(void *arg)
{
    queue_t *wq = (queue_t *)arg;
    mir_t *mir = (mir_t *)wq->q_ptr;
    boolean_t notify;

    mutex_enter(&mir->mir_mutex);

    /*
     * mir_timer_call is set only while mir_timer_start or
     * mir_timer_stop is in progress, and this handler can only run
     * during that window if the timer is being stopped.  Nothing to do.
     */
    if (mir->mir_timer_call) {
        mutex_exit(&mir->mir_mutex);
        return;
    }
    mir->mir_timer_id = 0;

    switch (mir->mir_type) {
    case RPC_CLIENT:

        /*
         * Client timers fire at idle-timeout intervals.  mir_wput
         * sets the activity marker mir_clntreq and records
         * mir_use_timestamp for every request it handles.  If the
         * marker is clear, or the last use is older than the idle
         * timeout, the stream is treated as idle and KRPC is told.
         * Otherwise the marker is cleared and the timer is re-armed
         * for the time remaining.
         *
         * If this was a memory shortage timer the idle timeout is
         * reset regardless; mir_clntreq is not a valid indicator
         * in that case.
         *
         * The timer is first started by the RPC_CLIENT ioctl
         * handling in mir_wput_other.  The interval can be changed
         * for individual streams with the ND variable
         * "mir_idle_timeout".
         */
        if (mir->mir_clntreq <= 0 || mir->mir_use_timestamp +
            MSEC_TO_TICK(mir->mir_idle_timeout) - lbolt < 0) {
            /*
             * We are disconnecting, but not necessarily
             * closing.  Without a close we would never pick up
             * a changed global timeout value, so store it now.
             */
            mir->mir_idle_timeout = clnt_idle_timeout;
            mir_clnt_idle_start(wq, mir);

            mutex_exit(&mir->mir_mutex);
            /*
             * T_ORDREL_REQ is passed to KRPC as a plain integer
             * meaning "the stream is idle".  It is not a
             * T_ORDREL_REQ message; it is just a convenient
             * value because the same KRPC routine is called for
             * T_ORDREL_INDs and T_DISCON_INDs.
             */
            clnt_dispatch_notifyall(wq, T_ORDREL_REQ, 0);
            return;
        } else {
            clock_t remaining;

            /* Still active: wait out the rest of the interval. */
            remaining = mir->mir_idle_timeout -
                TICK_TO_MSEC(lbolt - mir->mir_use_timestamp);
            if (remaining < 0)
                remaining = 1000;
            mir->mir_clntreq = 0;
            mir_timer_start(wq, mir, remaining);
            mutex_exit(&mir->mir_mutex);
            return;
        }

    case RPC_SERVER:

        /*
         * Server timers only run while the stream is really idle
         * or memory is short.  They are started by the RPC_SERVER
         * ioctl handling in mir_wput_other and by
         * mir_svc_idle_start whenever the stream goes idle, and
         * are stopped again when a new inbound request is handed
         * to KRPC on a previously idle stream.
         *
         * The interval can be changed for individual streams with
         * the ND variable "mir_idle_timeout".
         *
         * The listener is only told to close when the slot is
         * quiesced and no packet is waiting on the read queue.
         */
        notify = MIR_SVC_QUIESCED(mir) && !mir->mir_inrservice;
        mutex_exit(&mir->mir_mutex);

        if (notify) {
            RPCLOG(16, "mir_timer: telling stream head listener "
                "to close stream (0x%p)\n", (void *) RD(wq));
            (void) mir_svc_policy_notify(RD(wq), 1);
        }
        return;

    default:
        RPCLOG(1, "mir_timer: unexpected mir_type %d\n",
            mir->mir_type);
        mutex_exit(&mir->mir_mutex);
        return;
    }
}

/*
 * Write put procedure: called by the RPC package to send either a call
 * or a return, or a transport connection request.  M_DATA messages get
 * the 4 byte record marking header prepended; every other message type
 * is handed to mir_wput_other().
 *
 * The message is passed straight downstream when nothing is already
 * queued and flow control permits; otherwise it is placed on the write
 * queue for mir_wsrv().
 */
static void
mir_wput(queue_t *q, mblk_t *mp)
{
    mir_t   *mir = (mir_t *)q->q_ptr;
    uchar_t *hdr = mp->b_rptr;
    uint_t  marker;

    if (mir == NULL) {
        freemsg(mp);
        return;
    }

    if (mp->b_datap->db_type != M_DATA) {
        mir_wput_other(q, mp);
        return;
    }

    /* No data may follow an orderly release request. */
    if (mir->mir_ordrel_pending == 1) {
        freemsg(mp);
        RPCLOG(16, "mir_wput wq 0x%p: got data after T_ORDREL_REQ\n",
            (void *)q);
        return;
    }

    /* The whole message goes out as a single, final fragment. */
    marker = (uint_t)DLEN(mp);
    marker |= MIR_LASTFRAG;

    /*
     * M_DATA messages are created exclusively by KRPC, which is
     * expected to leave room in front of the data for our header and
     * to keep the data 4 byte aligned (normal for XDR).  If KRPC (or
     * someone else) does not cooperate, the message is thrown away.
     */
    if ((hdr - mp->b_datap->db_base) < sizeof (uint32_t) ||
        !IS_P2ALIGNED(mp->b_rptr, sizeof (uint32_t))) {
        RPCLOG(1, "mir_wput: KRPC did not leave space for record "
            "fragment header (%d bytes left)\n",
            (int)(hdr - mp->b_datap->db_base));
        freemsg(mp);
        return;
    }

    /* Back up over the reserved space and write the header there. */
    hdr -= sizeof (uint32_t);
    *(uint32_t *)hdr = htonl(marker);
    mp->b_rptr = hdr;

    mutex_enter(&mir->mir_mutex);

    /* Client only: note the activity for the idle timer. */
    if (mir->mir_type == RPC_CLIENT) {
        mir->mir_clntreq = 1;
        mir->mir_use_timestamp = lbolt;
    }

    /*
     * Fast path: nothing is queued ahead of us and the downstream
     * module can take more data, so send the RPC message on now.
     */
    if (!mir->mir_inwservice && MIR_WCANPUTNEXT(mir, q)) {
        mutex_exit(&mir->mir_mutex);
        putnext(q, mp);
        return;
    }

    /* Slow path: the message has to be queued. */
    if (mir->mir_type == RPC_CLIENT) {
        /*
         * If a duplicate of this request is already on the queue,
         * throw the new copy away and let the earlier one go
         * through.  This is an effort to reduce traffic and
         * processing requirements under load.
         */
        if (mir_clnt_dup_request(q, mp)) {
            mutex_exit(&mir->mir_mutex);
            freemsg(mp);
            return;
        }
    } else if (mir->mir_type == RPC_SERVER) {
        /*
         * Hold new inbound RPC messages until the client catches
         * up on the earlier replies.  mir_wsrv clears this flag
         * and enables the read-side queue once flow control is
         * relieved.
         */
        mir->mir_hold_inbound = 1;
    } else {
        RPCLOG(1, "mir_wput: unexpected mir_type %d\n", mir->mir_type);
    }

    mir->mir_inwservice = 1;
    (void) putq(q, mp);
    mutex_exit(&mir->mir_mutex);
}

/*
 * Write-side handling for every message that is not M_DATA; called from
 * mir_wput().  The caller must not hold mir_mutex.
 *
 *  - M_IOCTL with RPC_CLIENT or RPC_SERVER sets the stream type, starts
 *    the matching idle timer and acknowledges the ioctl.  An attempt to
 *    change an already-set type is answered with ioc_error = EPERM.
 *    Other ioctls are passed downstream.
 *  - M_PROTO messages are examined as TPI primitives: T_DATA_REQ is
 *    dropped, T_ORDREL_REQ and T_CONN_REQ get special handling, and the
 *    rest fall into the generic handling below.
 *  - Generic handling: high-priority messages (including M_FLUSH) go
 *    straight downstream; anything else goes downstream only if nothing
 *    is queued and flow control allows, and is queued for mir_wsrv()
 *    otherwise.
 */
static void
mir_wput_other(queue_t *q, mblk_t *mp)
{
    mir_t   *mir = (mir_t *)q->q_ptr;
    struct iocblk   *iocp;
    uchar_t *rptr = mp->b_rptr;
    bool_t  flush_in_svc = FALSE;

    ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
    switch (mp->b_datap->db_type) {
    case M_IOCTL:
        iocp = (struct iocblk *)rptr;
        switch (iocp->ioc_cmd) {
        case RPC_CLIENT:
            mutex_enter(&mir->mir_mutex);
            if (mir->mir_type != 0 &&
                mir->mir_type != iocp->ioc_cmd) {
                /*
                 * Shared failure path; the RPC_SERVER case jumps
                 * here too, with mir_mutex held.
                 */
ioc_eperm:
                mutex_exit(&mir->mir_mutex);
                iocp->ioc_error = EPERM;
                iocp->ioc_count = 0;
                mp->b_datap->db_type = M_IOCACK;
                qreply(q, mp);
                return;
            }

            mir->mir_type = iocp->ioc_cmd;

            /*
             * Clear mir_hold_inbound which was set to 1 by
             * mir_open.  This flag is not used on client
             * streams.
             */
            mir->mir_hold_inbound = 0;
            mir->mir_max_msg_sizep = &clnt_max_msg_size;

            /*
             * Start the idle timer.  See mir_timer() for more
             * information on how client timers work.
             */
            mir->mir_idle_timeout = clnt_idle_timeout;
            mir_clnt_idle_start(q, mir);
            mutex_exit(&mir->mir_mutex);

            mp->b_datap->db_type = M_IOCACK;
            qreply(q, mp);
            return;
        case RPC_SERVER:
            mutex_enter(&mir->mir_mutex);
            if (mir->mir_type != 0 &&
                mir->mir_type != iocp->ioc_cmd)
                goto ioc_eperm;

            /*
             * We don't clear mir_hold_inbound here because
             * mir_hold_inbound is used in the flow control
             * model. If we cleared it here, then we'd commit
             * a small violation to the model where the transport
             * might immediately block downstream flow.
             */

            mir->mir_type = iocp->ioc_cmd;
            mir->mir_max_msg_sizep = &svc_max_msg_size;

            /*
             * Start the idle timer.  See mir_timer() for more
             * information on how server timers work.
             *
             * Note that it is important to start the idle timer
             * here so that connections time out even if we
             * never receive any data on them.
             */
            mir->mir_idle_timeout = svc_idle_timeout;
            RPCLOG(16, "mir_wput_other starting idle timer on 0x%p "
                "because we got RPC_SERVER ioctl\n", (void *)q);
            mir_svc_idle_start(q, mir);
            mutex_exit(&mir->mir_mutex);

            mp->b_datap->db_type = M_IOCACK;
            qreply(q, mp);
            return;
        default:
            break;
        }
        /* Any other ioctl is passed downstream unchanged. */
        break;

    case M_PROTO:
        if (mir->mir_type == RPC_CLIENT) {
            /*
             * We are likely being called from the context of a
             * service procedure. So we need to enqueue. However
             * enqueuing may put our message behind data messages.
             * So flush the data first.
             */
            flush_in_svc = TRUE;
        }
        /*
         * Too short or misaligned to read a TPI primitive type:
         * pass the message downstream untouched.
         */
        if ((mp->b_wptr - rptr) < sizeof (uint32_t) ||
            !IS_P2ALIGNED(rptr, sizeof (uint32_t)))
            break;

        switch (((union T_primitives *)rptr)->type) {
        case T_DATA_REQ:
            /* Don't pass T_DATA_REQ messages downstream. */
            freemsg(mp);
            return;
        case T_ORDREL_REQ:
            RPCLOG(8, "mir_wput_other wq 0x%p: got T_ORDREL_REQ\n",
                (void *)q);
            mutex_enter(&mir->mir_mutex);
            if (mir->mir_type != RPC_SERVER) {
                /*
                 * We are likely being called from
                 * clnt_dispatch_notifyall(). Sending
                 * a T_ORDREL_REQ will result in some kind
                 * of _IND message being sent, which will
                 * lead to another call to
                 * clnt_dispatch_notifyall(). To keep the stack
                 * lean, queue this message.
                 */
                mir->mir_inwservice = 1;
                (void) putq(q, mp);
                mutex_exit(&mir->mir_mutex);
                return;
            }

            /*
             * Mark the structure such that we don't accept any
             * more requests from client. We could defer this
             * until we actually send the orderly release
             * request downstream, but all that does is delay
             * the closing of this stream.
             */
            RPCLOG(16, "mir_wput_other wq 0x%p: got T_ORDREL_REQ "
                " so calling mir_svc_start_close\n", (void *)q);

            mir_svc_start_close(q, mir);

            /*
             * If we have sent down a T_ORDREL_REQ, don't send
             * any more.
             */
            if (mir->mir_ordrel_pending) {
                freemsg(mp);
                mutex_exit(&mir->mir_mutex);
                return;
            }

            /*
             * If the stream is not idle, then we hold the
             * orderly release until it becomes idle.  This
             * ensures that KRPC will be able to reply to
             * all requests that we have passed to it.
             *
             * We also queue the request if there is data already
             * queued, because we cannot allow the T_ORDREL_REQ
             * to go before data. When we had a separate reply
             * count, this was not a problem, because the
             * reply count was reconciled when mir_wsrv()
             * completed.
             */
            if (!MIR_SVC_QUIESCED(mir) ||
                mir->mir_inwservice == 1) {
                mir->mir_inwservice = 1;
                (void) putq(q, mp);

                RPCLOG(16, "mir_wput_other: queuing "
                    "T_ORDREL_REQ on 0x%p\n", (void *)q);

                mutex_exit(&mir->mir_mutex);
                return;
            }

            /*
             * Mark the structure so that we know we sent
             * an orderly release request, and reset the idle timer.
             */
            mir->mir_ordrel_pending = 1;

            RPCLOG(16, "mir_wput_other: calling mir_svc_idle_start"
                " on 0x%p because we got T_ORDREL_REQ\n",
                (void *)q);

            mir_svc_idle_start(q, mir);
            mutex_exit(&mir->mir_mutex);

            /*
             * When we break, we will putnext the T_ORDREL_REQ.
             */
            break;

        case T_CONN_REQ:
            mutex_enter(&mir->mir_mutex);
            /* Discard any partially assembled inbound record. */
            if (mir->mir_head_mp != NULL) {
                freemsg(mir->mir_head_mp);
                mir->mir_head_mp = NULL;
                mir->mir_tail_mp = NULL;
            }
            mir->mir_frag_len = -(int32_t)sizeof (uint32_t);
            /*
             * Restart timer in case mir_clnt_idle_do_stop() was
             * called.
             */
            mir->mir_idle_timeout = clnt_idle_timeout;
            mir_clnt_idle_stop(q, mir);
            mir_clnt_idle_start(q, mir);
            mutex_exit(&mir->mir_mutex);
            break;

        default:
            /*
             * T_DISCON_REQ is one of the interesting default
             * cases here. Ideally, an M_FLUSH is done before
             * T_DISCON_REQ is done. However, that is somewhat
             * cumbersome for clnt_cots.c to do. So we queue
             * T_DISCON_REQ, and let the service procedure
             * flush all M_DATA.
             */
            break;
        }
        /* fallthru */;
    default:
        /*
         * High-priority messages are never queued.  An M_FLUSH with
         * FLUSHW on a client stream also flushes our write queue
         * before it is passed on.
         */
        if (mp->b_datap->db_type >= QPCTL) {
            if (mp->b_datap->db_type == M_FLUSH) {
                if (mir->mir_type == RPC_CLIENT &&
                    *mp->b_rptr & FLUSHW) {
                    RPCLOG(32, "mir_wput_other: flushing "
                        "wq 0x%p\n", (void *)q);
                    if (*mp->b_rptr & FLUSHBAND) {
                        flushband(q, *(mp->b_rptr + 1),
                            FLUSHDATA);
                    } else {
                        flushq(q, FLUSHDATA);
                    }
                } else {
                    RPCLOG(32, "mir_wput_other: ignoring "
                        "M_FLUSH on wq 0x%p\n", (void *)q);
                }
            }
            break;
        }

        /*
         * Ordinary message: send it now if nothing is queued ahead of
         * it and flow control allows, otherwise queue it and let
         * mir_wsrv() deliver it.
         */
        mutex_enter(&mir->mir_mutex);
        if (mir->mir_inwservice == 0 && MIR_WCANPUTNEXT(mir, q)) {
            mutex_exit(&mir->mir_mutex);
            break;
        }
        mir->mir_inwservice = 1;
        /* Tells mir_wsrv() whether to discard queued M_DATA. */
        mir->mir_inwflushdata = flush_in_svc;
        (void) putq(q, mp);
        mutex_exit(&mir->mir_mutex);
        qenable(q);

        return;
    }
    /* Every path that breaks out of the switch passes mp downstream. */
    putnext(q, mp);
}

/*
 * Write service procedure.  Drains the write queue that mir_wput() and
 * mir_wput_other() fill when a message cannot be sent immediately.
 *
 * M_DATA is sent downstream subject to flow control, or discarded if a
 * flush was requested or an orderly release has already been sent.
 * Other messages are sent downstream ignoring flow control, except that
 * a server stream sends at most one T_ORDREL_REQ.  For server streams
 * the routine finishes by starting idle processing if the slot is
 * quiesced and by releasing the inbound hold.
 *
 * mir_mutex is held while the queue and mir state are examined and is
 * dropped around every putnext().
 */
static void
mir_wsrv(queue_t *q)
{
    mblk_t  *mp;
    mir_t   *mir;
    bool_t flushdata;

    mir = (mir_t *)q->q_ptr;
    mutex_enter(&mir->mir_mutex);

    /* Consume the flush request left by mir_wput_other(), if any. */
    flushdata = mir->mir_inwflushdata;
    mir->mir_inwflushdata = 0;

    while (mp = getq(q)) {
        if (mp->b_datap->db_type == M_DATA) {
            /*
             * Do not send any more data if we have sent
             * a T_ORDREL_REQ.
             */
            if (flushdata || mir->mir_ordrel_pending == 1) {
                freemsg(mp);
                continue;
            }

            /*
             * Make sure that the stream can really handle more
             * data.  If not, put the message back and stop; the
             * rest of the queue is left for a later run.
             */
            if (!MIR_WCANPUTNEXT(mir, q)) {
                (void) putbq(q, mp);
                mutex_exit(&mir->mir_mutex);
                return;
            }

            /*
             * Now we pass the RPC message downstream.
             */
            mutex_exit(&mir->mir_mutex);
            putnext(q, mp);
            mutex_enter(&mir->mir_mutex);
            continue;
        }

        /*
         * This is not an RPC message, pass it downstream
         * (ignoring flow control) if the server side is not sending a
         * T_ORDREL_REQ downstream.
         *
         * NOTE(review): the primitive type is read without checking
         * db_type or the message length; confirm that every
         * non-M_DATA message queued on a server stream is safe to
         * interpret this way.
         */
        if (mir->mir_type != RPC_SERVER ||
            ((union T_primitives *)mp->b_rptr)->type !=
            T_ORDREL_REQ) {
            mutex_exit(&mir->mir_mutex);
            putnext(q, mp);
            mutex_enter(&mir->mir_mutex);
            continue;
        }

        if (mir->mir_ordrel_pending == 1) {
            /*
             * Don't send two T_ORDRELs
             */
            freemsg(mp);
            continue;
        }

        /*
         * Mark the structure so that we know we sent an orderly
         * release request.  We will check to see slot is idle at the
         * end of this routine, and if so, reset the idle timer to
         * handle orderly release timeouts.
         */
        mir->mir_ordrel_pending = 1;
        RPCLOG(16, "mir_wsrv: sending ordrel req on q 0x%p\n",
            (void *)q);
        /*
         * Send the orderly release downstream. If there are other
         * pending replies we won't be able to send them.  However,
         * the only reason we should send the orderly release is if
         * we were idle, or if an unusual event occurred.
         */
        mutex_exit(&mir->mir_mutex);
        putnext(q, mp);
        mutex_enter(&mir->mir_mutex);
    }

    if (q->q_first == NULL)
        /*
         * If we call mir_svc_idle_start() below, then
         * clearing mir_inwservice here will also result in
         * any thread waiting in mir_close() to be signaled.
         */
        mir->mir_inwservice = 0;

    /* The remaining work applies to server streams only. */
    if (mir->mir_type != RPC_SERVER) {
        mutex_exit(&mir->mir_mutex);
        return;
    }

    /*
     * If idle we call mir_svc_idle_start to start the timer (or wakeup
     * a close). Also make sure not to start the idle timer on the
     * listener stream. This can cause nfsd to send an orderly release
     * command on the listener stream.
     */
    if (MIR_SVC_QUIESCED(mir) && !(mir->mir_listen_stream)) {
        RPCLOG(16, "mir_wsrv: calling mir_svc_idle_start on 0x%p "
            "because mir slot is idle\n", (void *)q);
        mir_svc_idle_start(q, mir);
    }

    /*
     * If outbound flow control has been relieved, then allow new
     * inbound requests to be processed.
     */
    if (mir->mir_hold_inbound) {
        mir->mir_hold_inbound = 0;
        qenable(RD(q));
    }
    mutex_exit(&mir->mir_mutex);
}

/*
 * Begin tearing down the connection on a stream after a sanity check
 * failure.  q is the read queue of the stream.
 *
 * Must be entered with mir_mutex held; the mutex is released on every
 * path before this routine returns, so the caller must not release it
 * again.
 */
static void
mir_disconnect(queue_t *q, mir_t *mir)
{
    ASSERT(MUTEX_HELD(&mir->mir_mutex));

    if (mir->mir_type == RPC_CLIENT) {
        /*
         * We are disconnecting, but not necessarily closing.
         * Without a close we would never pick up a changed global
         * timeout value, so store it now.
         */
        mir->mir_idle_timeout = clnt_idle_timeout;
        mir_clnt_idle_start(WR(q), mir);
        mutex_exit(&mir->mir_mutex);

        /*
         * T_DISCON_REQ is handed to KRPC as a plain integer, not
         * as a TPI message.  It is a convenient value for
         * reporting a sanity check failure, because the same KRPC
         * routine is also called for T_DISCON_INDs and
         * T_ORDREL_INDs.
         */
        clnt_dispatch_notifyall(WR(q), T_DISCON_REQ, 0);
        return;
    }

    if (mir->mir_type == RPC_SERVER) {
        /* Refuse further requests, then ask the listener to close. */
        mir->mir_svc_no_more_msgs = 1;
        mir_svc_idle_stop(WR(q), mir);
        mutex_exit(&mir->mir_mutex);
        RPCLOG(16, "mir_disconnect: telling "
            "stream head listener to disconnect stream "
            "(0x%p)\n", (void *) q);
        (void) mir_svc_policy_notify(q, 2);
        return;
    }

    /* Stream type not set yet: there is nobody to notify. */
    mutex_exit(&mir->mir_mutex);
}

/*
 * Do a sanity check on the length of a fragment or message.
 *
 * q is the read queue, frag_len the length to validate and head_mp the
 * message the length belongs to.
 *
 * Returns 0 if the length is acceptable: it is not positive, no limit
 * has been configured for the stream, or it does not exceed the limit.
 * In that case nothing is modified.
 *
 * Returns 1 if the length exceeds the limit.  In that case head_mp has
 * been freed, the reassembly state in the mir has been reset, and
 * mir_disconnect() has been called, which releases mir_mutex.  The
 * caller must therefore not touch head_mp or release the mutex again.
 */
static int
mir_check_len(queue_t *q, int32_t frag_len,
    mblk_t *head_mp)
{
    mir_t   *mir;

    mir = (mir_t *)q->q_ptr;

    /*
     * Do a sanity check on the message length.  If this message is
     * getting excessively large, shut down the connection.
     */

    if ((frag_len <= 0) || (mir->mir_max_msg_sizep == NULL) ||
        (frag_len <= *mir->mir_max_msg_sizep)) {
        return (0);
    }

    freemsg(head_mp);
    /*
     * Reset both ends of the reassembly chain, as the T_CONN_REQ
     * handling in mir_wput_other() does.  Clearing only the head would
     * leave mir_tail_mp referring to a message that may just have been
     * freed.
     */
    mir->mir_head_mp = NULL;
    mir->mir_tail_mp = NULL;
    mir->mir_frag_len = -(int32_t)sizeof (uint32_t);

    /* Stay quiet for server streams that KRPC has not started yet. */
    if (mir->mir_type != RPC_SERVER || mir->mir_setup_complete) {
        cmn_err(CE_NOTE,
            "KRPC: record fragment from %s of size(%d) exceeds "
            "maximum (%u). Disconnecting",
            (mir->mir_type == RPC_CLIENT) ? "server" :
            (mir->mir_type == RPC_SERVER) ? "client" :
            "test tool",
            frag_len, *mir->mir_max_msg_sizep);
    }

    mir_disconnect(q, mir);
    return (1);
}