rpcmod.c revision 108322fb1c3ed341aba9c80c9774df0ed9e35768
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* Kernel RPC filtering module
*/
#include <sys/param.h>
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/tihdr.h>
#include <sys/timod.h>
#include <sys/tiuser.h>
#include <sys/debug.h>
#include <sys/signal.h>
#include <sys/pcb.h>
#include <sys/user.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/policy.h>
#include <sys/inline.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/file.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/t_lock.h>
#include <sys/ddi.h>
#include <sys/vtrace.h>
#include <sys/callb.h>
#include <sys/strlog.h>
#include <rpc/rpc_com.h>
#include <inet/common.h>
#include <rpc/types.h>
#include <sys/time.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>
#include <rpc/rpc_msg.h>
#include <rpc/svc.h>
#include <rpc/rpcsys.h>
#include <rpc/rpc_rdma.h>
/*
* This is the loadable module wrapper.
*/
#include <sys/conf.h>
#include <sys/modctl.h>
#include <sys/syscall.h>
extern struct streamtab rpcinfo;
static struct fmodsw fsw = {
"rpcmod",
&rpcinfo,
D_NEW|D_MP,
};
/*
* Module linkage information for the kernel.
*/
static struct modlstrmod modlstrmod = {
&mod_strmodops, "rpc interface str mod", &fsw
};
/*
* For the RPC system call.
*/
static struct sysent rpcsysent = {
2,
SE_32RVAL1 | SE_ARGC | SE_NOUNLOAD,
rpcsys
};
static struct modlsys modlsys = {
&mod_syscallops,
"RPC syscall",
&rpcsysent
};
#ifdef _SYSCALL32_IMPL
static struct modlsys modlsys32 = {
&mod_syscallops32,
"32-bit RPC syscall",
&rpcsysent
};
#endif /* _SYSCALL32_IMPL */
static struct modlinkage modlinkage = {
MODREV_1,
{
&modlsys,
#ifdef _SYSCALL32_IMPL
&modlsys32,
#endif
&modlstrmod,
NULL
}
};
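/*
 * rpcmod thus links into the kernel in three ways: as the native RPC
 * system call, as the 32-bit system call entry when _SYSCALL32_IMPL is
 * defined, and as a STREAMS module. One modlinkage covers all of them,
 * so a single modload brings in everything.
 */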
int
_init(void)
{
int error = 0;
callb_id_t cid;
int status;
svc_init();
clnt_init();
cid = callb_add(connmgr_cpr_reset, 0, CB_CL_CPR_RPC, "rpc");
if ((error = mod_install(&modlinkage)) != 0) {
/*
* Could not install module, cleanup previous
* initialization work.
*/
clnt_fini();
if (cid != NULL)
(void) callb_delete(cid);
return (error);
}
/*
* Load up the RDMA plugins and initialize the stats. Even if the
* plugin load fails, the counters still get initialized as long as
* rpcmod itself was installed successfully.
*/
rw_init(&rdma_lock, NULL, RW_DEFAULT, NULL);
mutex_init(&rdma_modload_lock, NULL, MUTEX_DEFAULT, NULL);
mt_kstat_init();
/*
* Get our identification into ldi. This is used for loading
* other modules, e.g. rpcib.
*/
status = ldi_ident_from_mod(&modlinkage, &rpcmod_li);
if (status != 0) {
cmn_err(CE_WARN, "ldi_ident_from_mod fails with %d", status);
rpcmod_li = NULL;
}
return (error);
}
/*
* The unload entry point always fails because we advertise entry points
* into rpcmod from the rest of kRPC, e.g. rpcmod_release().
*/
int
_fini(void)
{
return (EBUSY);
}
int
_info(struct modinfo *modinfop)
{
return (mod_info(&modlinkage, modinfop));
}
extern int nulldev();
#define RPCMOD_ID 2049
int rmm_open(), rmm_close();
/*
* To save instructions, since STREAMS ignores the return value
* from these functions, they are defined as void here. Kind of icky, but...
*/
void rmm_rput(queue_t *, mblk_t *);
void rmm_wput(queue_t *, mblk_t *);
void rmm_rsrv(queue_t *);
void rmm_wsrv(queue_t *);
int rpcmodopen(), rpcmodclose();
void rpcmodrput(), rpcmodwput();
void rpcmodrsrv(), rpcmodwsrv();
static void rpcmodwput_other(queue_t *, mblk_t *);
static int mir_close(queue_t *q);
static int mir_open(queue_t *q, dev_t *devp, int flag, int sflag,
cred_t *credp);
static void mir_rput(queue_t *q, mblk_t *mp);
static void mir_rsrv(queue_t *q);
static void mir_wput(queue_t *q, mblk_t *mp);
static void mir_wsrv(queue_t *q);
static struct module_info rpcmod_info =
{RPCMOD_ID, "rpcmod", 0, INFPSZ, 256*1024, 1024};
/*
 * The read side dispatches through rmm_rsrv(); CLTS streams have no
 * read service procedure of their own (xprt_clts_ops.xo_rsrv is NULL),
 * while COTS streams use mir_rsrv().
 */
static struct qinit rpcmodrinit = {
(int (*)())rmm_rput,
(int (*)())rmm_rsrv,
rmm_open,
rmm_close,
nulldev,
&rpcmod_info,
NULL
};
/*
* The write put procedure is simply putnext to conserve stack space.
* The write service procedure is not used to queue data, but instead to
* synchronize with flow control.
*/
static struct qinit rpcmodwinit = {
(int (*)())rmm_wput,
(int (*)())rmm_wsrv,
rmm_open,
rmm_close,
nulldev,
&rpcmod_info,
NULL
};
struct streamtab rpcinfo = { &rpcmodrinit, &rpcmodwinit, NULL, NULL };
struct xprt_style_ops {
int (*xo_open)();
int (*xo_close)();
void (*xo_wput)();
void (*xo_wsrv)();
void (*xo_rput)();
void (*xo_rsrv)();
};
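/*
 * Every STREAMS entry point on a stream with rpcmod pushed is funneled
 * through the rmm_* wrappers below, which indirect through the ops
 * vector hanging off q_ptr. This lets one module body serve both
 * connectionless (CLTS) and connection-oriented (COTS) transports,
 * chosen at open time.
 */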
static struct xprt_style_ops xprt_clts_ops = {
rpcmodopen,
rpcmodclose,
rpcmodwput,
rpcmodwsrv,
rpcmodrput,
NULL
};
static struct xprt_style_ops xprt_cots_ops = {
mir_open,
mir_close,
mir_wput,
mir_wsrv,
mir_rput,
mir_rsrv
};
/*
* Per rpcmod "slot" data structure. q->q_ptr points to one of these.
*/
struct rpcm {
void *rm_krpc_cell; /* Reserved for use by KRPC */
struct xprt_style_ops *rm_ops;
int rm_type; /* Client or server side stream */
#define RM_CLOSING 0x1 /* somebody is trying to close slot */
uint_t rm_state; /* state of the slot. see above */
uint_t rm_ref; /* cnt of external references to slot */
kmutex_t rm_lock; /* mutex protecting above fields */
kcondvar_t rm_cwait; /* condition for closing */
zoneid_t rm_zoneid; /* zone which pushed rpcmod */
};
struct temp_slot {
void *cell;
struct xprt_style_ops *ops;
int type;
mblk_t *info_ack;
kmutex_t lock;
kcondvar_t wait;
};
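/*
 * A temp_slot parks a freshly pushed stream while rmm_open() probes the
 * transport below it; q_ptr points here only until the CLTS/COTS
 * decision is made and the permanent per-stream structure takes over.
 */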
void tmp_rput(queue_t *q, mblk_t *mp);
struct xprt_style_ops tmpops = {
NULL,
NULL,
putnext,
NULL,
tmp_rput,
NULL
};
void
tmp_rput(queue_t *q, mblk_t *mp)
{
struct temp_slot *t = (struct temp_slot *)(q->q_ptr);
struct T_info_ack *pptr;
switch (mp->b_datap->db_type) {
case M_PCPROTO:
pptr = (struct T_info_ack *)mp->b_rptr;
switch (pptr->PRIM_type) {
case T_INFO_ACK:
mutex_enter(&t->lock);
t->info_ack = mp;
cv_signal(&t->wait);
mutex_exit(&t->lock);
return;
default:
break;
}
default:
break;
}
/*
* Not an info-ack, so free it. This is ok because we should
* not be receiving data until the open finishes: rpcmod
* is pushed well before the end-point is bound to an address.
*/
freemsg(mp);
}
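/*
 * rmm_open() determines the transport semantics at push time: it sends
 * a T_INFO_REQ downstream and waits, interruptibly, for the T_INFO_ACK
 * that tmp_rput() above captures into the temp_slot. If the transport
 * reports T_CLTS, the slot is wired to the connectionless (rpcmod*)
 * entry points; otherwise to the connection-oriented (mir_*) ones.
 */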
int
rmm_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *crp)
{
mblk_t *bp;
struct temp_slot ts, *t;
struct T_info_ack *pptr;
int error = 0;
int procson = 0;
ASSERT(q != NULL);
/*
* Check for re-opens.
*/
if (q->q_ptr) {
TRACE_1(TR_FAC_KRPC, TR_RPCMODOPEN_END,
"rpcmodopen_end:(%s)", "q->qptr");
return (0);
}
t = &ts;
bzero(t, sizeof (*t));
q->q_ptr = (void *)t;
/* WR(q)->q_ptr = (void *)t; */
/*
* Allocate the required messages upfront.
*/
if ((bp = allocb(sizeof (struct T_info_req) +
sizeof (struct T_info_ack), BPRI_LO)) == (mblk_t *)NULL) {
/* Don't leave q_ptr referencing our on-stack temp_slot. */
q->q_ptr = NULL;
return (ENOBUFS);
}
mutex_init(&t->lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&t->wait, NULL, CV_DEFAULT, NULL);
t->ops = &tmpops;
qprocson(q);
procson = 1;
bp->b_datap->db_type = M_PCPROTO;
*(int32_t *)bp->b_wptr = (int32_t)T_INFO_REQ;
bp->b_wptr += sizeof (struct T_info_req);
putnext(WR(q), bp);
mutex_enter(&t->lock);
while ((bp = t->info_ack) == NULL) {
if (cv_wait_sig(&t->wait, &t->lock) == 0) {
error = EINTR;
break;
}
}
mutex_exit(&t->lock);
mutex_destroy(&t->lock);
cv_destroy(&t->wait);
if (error)
goto out;
pptr = (struct T_info_ack *)t->info_ack->b_rptr;
if (pptr->SERV_type == T_CLTS) {
error = rpcmodopen(q, devp, flag, sflag, crp);
if (error == 0) {
t = (struct temp_slot *)q->q_ptr;
t->ops = &xprt_clts_ops;
}
} else {
error = mir_open(q, devp, flag, sflag, crp);
if (error == 0) {
t = (struct temp_slot *)q->q_ptr;
t->ops = &xprt_cots_ops;
}
}
out:
freemsg(bp);
if (error) {
if (procson)
qprocsoff(q);
/* Don't leave q_ptr referencing our on-stack temp_slot. */
q->q_ptr = NULL;
}
return (error);
}
void
rmm_rput(queue_t *q, mblk_t *mp)
{
(*((struct temp_slot *)q->q_ptr)->ops->xo_rput)(q, mp);
}
void
rmm_rsrv(queue_t *q)
{
(*((struct temp_slot *)q->q_ptr)->ops->xo_rsrv)(q);
}
void
rmm_wput(queue_t *q, mblk_t *mp)
{
(*((struct temp_slot *)q->q_ptr)->ops->xo_wput)(q, mp);
}
void
rmm_wsrv(queue_t *q)
{
(*((struct temp_slot *)q->q_ptr)->ops->xo_wsrv)(q);
}
int
rmm_close(queue_t *q, int flag, cred_t *crp)
{
return ((*((struct temp_slot *)q->q_ptr)->ops->xo_close)(q, flag, crp));
}
/*
* rpcmodopen - open routine gets called when the module gets pushed
* onto the stream.
*/
/*ARGSUSED*/
int
rpcmodopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *crp)
{
struct rpcm *rmp;
extern void (*rpc_rele)(queue_t *, mblk_t *);
static void rpcmod_release(queue_t *, mblk_t *);
TRACE_0(TR_FAC_KRPC, TR_RPCMODOPEN_START, "rpcmodopen_start:");
/*
* Initialize entry points to release a rpcmod slot (and an input
* message if supplied) and to send an output message to the module
* below rpcmod.
*/
if (rpc_rele == NULL)
rpc_rele = rpcmod_release;
/*
* Only sufficiently privileged users can use this module, and it
* is assumed that they will use this module properly, and NOT send
* bulk data from downstream.
*/
if (secpolicy_rpcmod_open(crp) != 0)
return (EPERM);
/*
* Allocate slot data structure.
*/
rmp = kmem_zalloc(sizeof (*rmp), KM_SLEEP);
mutex_init(&rmp->rm_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&rmp->rm_cwait, NULL, CV_DEFAULT, NULL);
rmp->rm_zoneid = rpc_zoneid();
/*
* The slot type will be set by the kRPC client and server ioctls.
*/
rmp->rm_type = 0;
q->q_ptr = (void *)rmp;
WR(q)->q_ptr = (void *)rmp;
TRACE_1(TR_FAC_KRPC, TR_RPCMODOPEN_END, "rpcmodopen_end:(%s)", "end");
return (0);
}
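/*
 * Note that a freshly opened slot is unusable until kRPC issues the
 * RPC_CLIENT or RPC_SERVER M_IOCTL (handled in rpcmodwput_other());
 * until then rpcmodrput() discards inbound data because rm_type is
 * still 0.
 */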
/*
* rpcmodclose - This routine gets called when the module gets popped
* off of the stream.
*/
/*ARGSUSED*/
int
rpcmodclose(queue_t *q, int flag, cred_t *crp)
{
struct rpcm *rmp;
ASSERT(q != NULL);
rmp = (struct rpcm *)q->q_ptr;
/*
* Mark our state as closing.
*/
mutex_enter(&rmp->rm_lock);
rmp->rm_state |= RM_CLOSING;
/*
* Check and see if there are any messages on the queue. If so, send
* the messages, regardless of whether the downstream module is ready to
* accept data.
*/
if (rmp->rm_type == RPC_SERVER) {
flushq(q, FLUSHDATA);
qenable(WR(q));
if (rmp->rm_ref) {
mutex_exit(&rmp->rm_lock);
/*
* call into SVC to clean the queue
*/
svc_queueclean(q);
mutex_enter(&rmp->rm_lock);
/*
* Block while there are kRPC threads with a reference
* to this message.
*/
while (rmp->rm_ref)
cv_wait(&rmp->rm_cwait, &rmp->rm_lock);
}
mutex_exit(&rmp->rm_lock);
/*
* It is now safe to remove this queue from the stream. No kRPC
* threads have a reference to the stream, and none ever will,
* because RM_CLOSING is set.
*/
qprocsoff(q);
/* Notify kRPC that this stream is going away. */
svc_queueclose(q);
} else {
mutex_exit(&rmp->rm_lock);
qprocsoff(q);
}
q->q_ptr = NULL;
WR(q)->q_ptr = NULL;
mutex_destroy(&rmp->rm_lock);
cv_destroy(&rmp->rm_cwait);
kmem_free(rmp, sizeof (*rmp));
return (0);
}
#ifdef DEBUG
int rpcmod_send_msg_up = 0;
int rpcmod_send_uderr = 0;
int rpcmod_send_dup = 0;
int rpcmod_send_dup_cnt = 0;
#endif
/*
* rpcmodrput - Module read put procedure. This is called by the
* module or driver downstream.
*/
void
rpcmodrput(queue_t *q, mblk_t *mp)
{
struct rpcm *rmp;
union T_primitives *pptr;
int hdrsz;
TRACE_0(TR_FAC_KRPC, TR_RPCMODRPUT_START, "rpcmodrput_start:");
ASSERT(q != NULL);
rmp = (struct rpcm *)q->q_ptr;
if (rmp->rm_type == 0) {
freemsg(mp);
return;
}
#ifdef DEBUG
if (rpcmod_send_msg_up > 0) {
mblk_t *nmp = copymsg(mp);
if (nmp) {
putnext(q, nmp);
rpcmod_send_msg_up--;
}
}
if ((rpcmod_send_uderr > 0) && mp->b_datap->db_type == M_PROTO) {
mblk_t *nmp;
struct T_unitdata_ind *data;
struct T_uderror_ind *ud;
int d;
data = (struct T_unitdata_ind *)mp->b_rptr;
if (data->PRIM_type == T_UNITDATA_IND) {
d = sizeof (*ud) - sizeof (*data);
nmp = allocb(mp->b_wptr - mp->b_rptr + d, BPRI_HI);
if (nmp) {
ud = (struct T_uderror_ind *)nmp->b_rptr;
ud->PRIM_type = T_UDERROR_IND;
ud->DEST_length = data->SRC_length;
ud->DEST_offset = data->SRC_offset + d;
ud->OPT_length = data->OPT_length;
ud->OPT_offset = data->OPT_offset + d;
ud->ERROR_type = ENETDOWN;
if (data->SRC_length) {
bcopy(mp->b_rptr +
data->SRC_offset,
nmp->b_rptr +
ud->DEST_offset,
data->SRC_length);
}
if (data->OPT_length) {
bcopy(mp->b_rptr +
data->OPT_offset,
nmp->b_rptr +
ud->OPT_offset,
data->OPT_length);
}
nmp->b_wptr += d;
nmp->b_wptr += (mp->b_wptr - mp->b_rptr);
nmp->b_datap->db_type = M_PROTO;
putnext(q, nmp);
rpcmod_send_uderr--;
}
}
}
#endif
switch (mp->b_datap->db_type) {
default:
putnext(q, mp);
break;
case M_PROTO:
case M_PCPROTO:
ASSERT((mp->b_wptr - mp->b_rptr) >= sizeof (int32_t));
pptr = (union T_primitives *)mp->b_rptr;
/*
* Forward this message to krpc if it is data.
*/
if (pptr->type == T_UNITDATA_IND) {
mblk_t *nmp;
/*
* Check if the module is being popped.
*/
mutex_enter(&rmp->rm_lock);
if (rmp->rm_state & RM_CLOSING) {
mutex_exit(&rmp->rm_lock);
putnext(q, mp);
break;
}
switch (rmp->rm_type) {
case RPC_CLIENT:
mutex_exit(&rmp->rm_lock);
hdrsz = mp->b_wptr - mp->b_rptr;
/*
* Make sure the header is sane.
*/
if (hdrsz < TUNITDATAINDSZ ||
hdrsz < (pptr->unitdata_ind.OPT_length +
pptr->unitdata_ind.OPT_offset) ||
hdrsz < (pptr->unitdata_ind.SRC_length +
pptr->unitdata_ind.SRC_offset)) {
freemsg(mp);
return;
}
/*
* Call clnt_clts_dispatch_notify, so that it can
* pass the message to the proper caller. Don't
* discard the header just yet since the client may
* need the sender's address.
*/
clnt_clts_dispatch_notify(mp, hdrsz, rmp->rm_zoneid);
return;
case RPC_SERVER:
/*
* rm_krpc_cell is exclusively used by the kRPC
* CLTS server
*/
if (rmp->rm_krpc_cell) {
#ifdef DEBUG
/*
* Test duplicate request cache and
* rm_ref count handling by sending a
* duplicate every so often, if
* desired.
*/
if (rpcmod_send_dup &&
rpcmod_send_dup_cnt++ %
rpcmod_send_dup)
nmp = copymsg(mp);
else
nmp = NULL;
#endif
/*
* Raise the reference count on this
* module to prevent it from being
* popped before krpc generates the
* reply.
*/
rmp->rm_ref++;
mutex_exit(&rmp->rm_lock);
/*
* Submit the message to krpc.
*/
svc_queuereq(q, mp);
#ifdef DEBUG
/*
* Send duplicate if we created one.
*/
if (nmp) {
mutex_enter(&rmp->rm_lock);
rmp->rm_ref++;
mutex_exit(&rmp->rm_lock);
svc_queuereq(q, nmp);
}
#endif
} else {
mutex_exit(&rmp->rm_lock);
freemsg(mp);
}
return;
default:
mutex_exit(&rmp->rm_lock);
freemsg(mp);
return;
} /* end switch(rmp->rm_type) */
} else if (pptr->type == T_UDERROR_IND) {
mutex_enter(&rmp->rm_lock);
hdrsz = mp->b_wptr - mp->b_rptr;
/*
* Make sure the header is sane
*/
if (hdrsz < TUDERRORINDSZ ||
hdrsz < (pptr->uderror_ind.OPT_length +
pptr->uderror_ind.OPT_offset) ||
hdrsz < (pptr->uderror_ind.DEST_length +
pptr->uderror_ind.DEST_offset)) {
mutex_exit(&rmp->rm_lock);
freemsg(mp);
return;
}
/*
* In the case where a unit data error has been
* received, all we need to do is clear the message from
* the queue.
*/
mutex_exit(&rmp->rm_lock);
freemsg(mp);
RPCLOG(32, "rpcmodrput: unitdata error received at "
"%ld\n", gethrestime_sec());
return;
} /* end else if (pptr->type == T_UDERROR_IND) */
putnext(q, mp);
break;
} /* end switch (mp->b_datap->db_type) */
TRACE_0(TR_FAC_KRPC, TR_RPCMODRPUT_END,
"rpcmodrput_end:");
/*
* Return codes are not looked at by the STREAMS framework.
*/
}
/*
* write put procedure
*/
void
rpcmodwput(queue_t *q, mblk_t *mp)
{
struct rpcm *rmp;
ASSERT(q != NULL);
switch (mp->b_datap->db_type) {
case M_PROTO:
case M_PCPROTO:
break;
default:
rpcmodwput_other(q, mp);
return;
}
/*
* Check to see if we can send the message downstream.
*/
if (canputnext(q)) {
putnext(q, mp);
return;
}
rmp = (struct rpcm *)q->q_ptr;
ASSERT(rmp != NULL);
/*
* The first canputnext failed. Try again, this time with the
* lock held, so that we can check the state of the stream to see if
* it is closing. If either of these conditions evaluates to true
* then send the message.
*/
mutex_enter(&rmp->rm_lock);
if (canputnext(q) || (rmp->rm_state & RM_CLOSING)) {
mutex_exit(&rmp->rm_lock);
putnext(q, mp);
} else {
/*
* canputnext failed again and the stream is not closing.
* Place the message on the queue and let the service
* procedure handle the message.
*/
mutex_exit(&rmp->rm_lock);
(void) putq(q, mp);
}
}
static void
rpcmodwput_other(queue_t *q, mblk_t *mp)
{
struct rpcm *rmp;
struct iocblk *iocp;
rmp = (struct rpcm *)q->q_ptr;
ASSERT(rmp != NULL);
switch (mp->b_datap->db_type) {
case M_IOCTL:
iocp = (struct iocblk *)mp->b_rptr;
ASSERT(iocp != NULL);
switch (iocp->ioc_cmd) {
case RPC_CLIENT:
case RPC_SERVER:
mutex_enter(&rmp->rm_lock);
rmp->rm_type = iocp->ioc_cmd;
mutex_exit(&rmp->rm_lock);
mp->b_datap->db_type = M_IOCACK;
qreply(q, mp);
return;
default:
/*
* pass the ioctl downstream and hope someone
* down there knows how to handle it.
*/
putnext(q, mp);
return;
}
default:
break;
}
/*
* This is something we definitely do not know how to handle; just
* pass the message downstream.
*/
putnext(q, mp);
}
/*
* Module write service procedure. This is called by downstream modules
* for back enabling during flow control.
*/
void
rpcmodwsrv(queue_t *q)
{
struct rpcm *rmp;
mblk_t *mp = NULL;
rmp = (struct rpcm *)q->q_ptr;
ASSERT(rmp != NULL);
/*
* Get messages that may be queued and send them downstream.
*/
while ((mp = getq(q)) != NULL) {
/*
* Optimize the service procedure for the server-side, by
* avoiding a call to canputnext().
*/
if (rmp->rm_type == RPC_SERVER || canputnext(q)) {
putnext(q, mp);
continue;
}
(void) putbq(q, mp);
return;
}
}
static void
rpcmod_release(queue_t *q, mblk_t *bp)
{
struct rpcm *rmp;
/*
* For now, just free the message.
*/
if (bp)
freemsg(bp);
rmp = (struct rpcm *)q->q_ptr;
mutex_enter(&rmp->rm_lock);
rmp->rm_ref--;
if (rmp->rm_ref == 0 && (rmp->rm_state & RM_CLOSING)) {
cv_broadcast(&rmp->rm_cwait);
}
mutex_exit(&rmp->rm_lock);
}
/*
* This part of rpcmod is pushed on a connection-oriented transport for use
* by RPC. It bypasses the stream head, implements the record marking
* protocol, and dispatches incoming RPC messages.
*/
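/*
 * RPC record marking (RFC 1831): each fragment on the wire is preceded
 * by a four-byte, big-endian marker. Bit 31 (MIR_LASTFRAG below) is set
 * on the final fragment of a record, and the low 31 bits give the
 * fragment length in bytes. For example, a marker of 0x80000064
 * announces the last fragment of its record, carrying 0x64 (100) bytes
 * of data.
 */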
/* Default idle timer values */
#define MIR_CLNT_IDLE_TIMEOUT (5 * (60 * 1000L)) /* 5 minutes */
#define MIR_SVC_IDLE_TIMEOUT (6 * (60 * 1000L)) /* 6 minutes */
#define MIR_SVC_ORDREL_TIMEOUT (10 * (60 * 1000L)) /* 10 minutes */
#define MIR_LASTFRAG 0x80000000 /* Record marker */
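/*
 * DLEN() spares the full msgdsize() walk when the message is a single
 * mblk, the common case for outbound RPC records.
 */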
#define DLEN(mp) ((mp)->b_cont ? msgdsize(mp) : ((mp)->b_wptr - (mp)->b_rptr))
typedef struct mir_s {
void *mir_krpc_cell; /* Reserved for KRPC use. This field */
/* must be first in the structure. */
struct xprt_style_ops *rm_ops;
int mir_type; /* Client or server side stream */
mblk_t *mir_head_mp; /* RPC msg in progress */
/*
* mir_head_mp points the first mblk being collected in
* the current RPC message. Record headers are removed
* before data is linked into mir_head_mp.
*/
mblk_t *mir_tail_mp; /* Last mblk in mir_head_mp */
/*
* mir_tail_mp points to the last mblk in the message
* chain starting at mir_head_mp. It is only valid
* if mir_head_mp is non-NULL and is used to add new
* data blocks to the end of chain quickly.
*/
int32_t mir_frag_len; /* Bytes seen in the current frag */
/*
* mir_frag_len starts at -4 at the beginning of each fragment.
* When this length is negative, it indicates the number of
* bytes that rpcmod needs to complete the record marker
* header. When it is positive or zero, it holds the number
* of bytes that have arrived for the current fragment and
* are held in mir_head_mp.
*/
int32_t mir_frag_header;
/*
* Fragment header as collected for the current fragment.
* It holds the last-fragment indicator and the number
* of bytes in the fragment.
*/
unsigned int
mir_ordrel_pending : 1, /* Sent T_ORDREL_REQ */
mir_hold_inbound : 1, /* Hold inbound messages on server */
/* side until outbound flow control */
/* is relieved. */
mir_closing : 1, /* The stream is being closed */
mir_inrservice : 1, /* data queued or rd srv proc running */
mir_inwservice : 1, /* data queued or wr srv proc running */
mir_inwflushdata : 1, /* flush M_DATAs when srv runs */
/*
* On client streams, mir_clntreq is 0 or 1; it is set
* to 1 whenever a new request is sent out (mir_wput)
* and cleared when the timer fires (mir_timer). If
* the timer fires with this value equal to 0, then the
* stream is considered idle and KRPC is notified.
*/
mir_clntreq : 1,
/*
* On server streams, stop accepting messages
*/
mir_svc_no_more_msgs : 1,
mir_listen_stream : 1, /* listen end point */
mir_setup_complete : 1, /* server has initialized everything */
mir_timer_call : 1,
mir_junk_fill_thru_bit_31 : 21;
timeout_id_t mir_timer_id; /* Timer for idle checks */
clock_t mir_idle_timeout; /* Allowed idle time before shutdown */
/*
* This value is copied from clnt_idle_timeout or
* svc_idle_timeout during the appropriate ioctl.
* Kept in milliseconds
*/
clock_t mir_use_timestamp; /* updated on client with each use */
/*
* This value is set to lbolt
* every time a client stream sends or receives data.
* Even if the timer message arrives, we don't shut down
* the client unless:
* lbolt >= MSEC_TO_TICK(mir_idle_timeout) + mir_use_timestamp.
* This value is kept in clock ticks.
*/
uint_t *mir_max_msg_sizep; /* Reference to sanity check size */
/*
* This pointer is set to &clnt_max_msg_size or
* &svc_max_msg_size during the appropriate ioctl.
*/
zoneid_t mir_zoneid; /* zone which pushed rpcmod */
/* Server-side fields. */
int mir_ref_cnt; /* Reference count: server side only */
/* counts the number of references */
/* that a kernel RPC server thread */
/* (see svc_run()) has on this rpcmod */
/* slot. Effectively, it is the */
/* number of unprocessed messages */
/* that have been passed up to the */
/* KRPC layer */
mblk_t *mir_svc_pend_mp; /* Pending T_ORDREL_IND or */
/* T_DISCON_IND */
/*
* these fields are for both client and server, but for debugging,
* it is easier to have these last in the structure.
*/
kmutex_t mir_mutex; /* Mutex and condvar for close */
kcondvar_t mir_condvar; /* synchronization. */
kcondvar_t mir_timer_cv; /* Timer routine sync. */
} mir_t;
#define MIR_SVC_QUIESCED(mir) \
((mir)->mir_ref_cnt == 0 && (mir)->mir_inrservice == 0)
#define MIR_CLEAR_INRSRV(mir_ptr) { \
(mir_ptr)->mir_inrservice = 0; \
if ((mir_ptr)->mir_type == RPC_SERVER && \
(mir_ptr)->mir_closing) \
cv_signal(&(mir_ptr)->mir_condvar); \
}
/*
* Don't block service procedure (and mir_close) if
* we are in the process of closing.
*/
#define MIR_WCANPUTNEXT(mir_ptr, write_q) \
(canputnext(write_q) || ((mir_ptr)->mir_svc_no_more_msgs == 1))
static int mir_clnt_dup_request(queue_t *q, mblk_t *mp);
static void mir_rput_proto(queue_t *q, mblk_t *mp);
static int mir_svc_policy_notify(queue_t *q, int event);
static void mir_svc_release(queue_t *wq, mblk_t *mp);
static void mir_svc_start(queue_t *wq);
static void mir_svc_idle_start(queue_t *, mir_t *);
static void mir_svc_idle_stop(queue_t *, mir_t *);
static void mir_svc_start_close(queue_t *, mir_t *);
static void mir_clnt_idle_do_stop(queue_t *);
static void mir_clnt_idle_stop(queue_t *, mir_t *);
static void mir_clnt_idle_start(queue_t *, mir_t *);
static void mir_wput(queue_t *q, mblk_t *mp);
static void mir_wput_other(queue_t *q, mblk_t *mp);
static void mir_wsrv(queue_t *q);
static void mir_disconnect(queue_t *, mir_t *ir);
static int mir_check_len(queue_t *, int32_t, mblk_t *);
static void mir_timer(void *);
extern void (*mir_rele)(queue_t *, mblk_t *);
extern void (*mir_start)(queue_t *);
extern void (*clnt_stop_idle)(queue_t *);
clock_t clnt_idle_timeout = MIR_CLNT_IDLE_TIMEOUT;
clock_t svc_idle_timeout = MIR_SVC_IDLE_TIMEOUT;
/*
* Timeout for subsequent notifications of idle connection. This is
* typically used to clean up after a wedged orderly release.
*/
clock_t svc_ordrel_timeout = MIR_SVC_ORDREL_TIMEOUT; /* milliseconds */
extern uint_t *clnt_max_msg_sizep;
extern uint_t *svc_max_msg_sizep;
uint_t clnt_max_msg_size = RPC_MAXDATASIZE;
uint_t svc_max_msg_size = RPC_MAXDATASIZE;
uint_t mir_krpc_cell_null;
static void
mir_timer_stop(mir_t *mir)
{
timeout_id_t tid;
ASSERT(MUTEX_HELD(&mir->mir_mutex));
/*
* Since the mir_mutex lock needs to be released to call
* untimeout(), we need to make sure that no other thread
* can start/stop the timer (changing mir_timer_id) during
* that time. The mir_timer_call bit and the mir_timer_cv
* condition variable are used to synchronize this. Setting
* mir_timer_call also tells mir_timer() (refer to the comments
* in mir_timer()) that it does not need to do anything.
*/
while (mir->mir_timer_call)
cv_wait(&mir->mir_timer_cv, &mir->mir_mutex);
mir->mir_timer_call = B_TRUE;
if ((tid = mir->mir_timer_id) != 0) {
mir->mir_timer_id = 0;
mutex_exit(&mir->mir_mutex);
(void) untimeout(tid);
mutex_enter(&mir->mir_mutex);
}
mir->mir_timer_call = B_FALSE;
cv_broadcast(&mir->mir_timer_cv);
}
static void
mir_timer_start(queue_t *q, mir_t *mir, clock_t intrvl)
{
timeout_id_t tid;
ASSERT(MUTEX_HELD(&mir->mir_mutex));
while (mir->mir_timer_call)
cv_wait(&mir->mir_timer_cv, &mir->mir_mutex);
mir->mir_timer_call = B_TRUE;
if ((tid = mir->mir_timer_id) != 0) {
mutex_exit(&mir->mir_mutex);
(void) untimeout(tid);
mutex_enter(&mir->mir_mutex);
}
/* Only start the timer when it is not closing. */
if (!mir->mir_closing) {
mir->mir_timer_id = timeout(mir_timer, q,
MSEC_TO_TICK(intrvl));
}
mir->mir_timer_call = B_FALSE;
cv_broadcast(&mir->mir_timer_cv);
}
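/*
 * Return nonzero if a request with the same XID as the one in mp is
 * already sitting on the write queue awaiting transmission. The XID is
 * read four bytes into the buffer, just past the record marking header.
 */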
static int
mir_clnt_dup_request(queue_t *q, mblk_t *mp)
{
mblk_t *mp1;
uint32_t new_xid;
uint32_t old_xid;
ASSERT(MUTEX_HELD(&((mir_t *)q->q_ptr)->mir_mutex));
new_xid = BE32_TO_U32(&mp->b_rptr[4]);
/*
* This loop is a bit tacky -- it walks the STREAMS list of
* flow-controlled messages.
*/
if ((mp1 = q->q_first) != NULL) {
do {
old_xid = BE32_TO_U32(&mp1->b_rptr[4]);
if (new_xid == old_xid)
return (1);
} while ((mp1 = mp1->b_next) != NULL);
}
return (0);
}
static int
mir_close(queue_t *q)
{
mir_t *mir;
mblk_t *mp;
bool_t queue_cleaned = FALSE;
RPCLOG(32, "rpcmod: mir_close of q 0x%p\n", (void *)q);
mir = (mir_t *)q->q_ptr;
ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
mutex_enter(&mir->mir_mutex);
if ((mp = mir->mir_head_mp) != NULL) {
mir->mir_head_mp = (mblk_t *)0;
freemsg(mp);
}
/*
* Set mir_closing so that we get notified when MIR_SVC_QUIESCED()
* becomes TRUE, and so that mir_timer_start() won't restart the timer.
*/
mir->mir_closing = B_TRUE;
mir_timer_stop(mir);
if (mir->mir_type == RPC_SERVER) {
flushq(q, FLUSHDATA); /* Ditch anything waiting on read q */
/*
* This will prevent more requests from arriving and
* will force rpcmod to ignore flow control.
*/
mir_svc_start_close(WR(q), mir);
while ((!MIR_SVC_QUIESCED(mir)) || mir->mir_inwservice == 1) {
if (mir->mir_ref_cnt && !mir->mir_inrservice &&
(queue_cleaned == FALSE)) {
/*
* call into SVC to clean the queue
*/
mutex_exit(&mir->mir_mutex);
svc_queueclean(q);
queue_cleaned = TRUE;
mutex_enter(&mir->mir_mutex);
continue;
}
/*
* Bugid 1253810 - Force the write service
* procedure to send its messages, regardless
* of whether the downstream module is ready
* to accept data.
*/
if (mir->mir_inwservice == 1)
qenable(WR(q));
cv_wait(&mir->mir_condvar, &mir->mir_mutex);
}
mutex_exit(&mir->mir_mutex);
qprocsoff(q);
/* Notify KRPC that this stream is going away. */
svc_queueclose(q);
} else {
mutex_exit(&mir->mir_mutex);
qprocsoff(q);
}
mutex_destroy(&mir->mir_mutex);
cv_destroy(&mir->mir_condvar);
cv_destroy(&mir->mir_timer_cv);
kmem_free(mir, sizeof (mir_t));
return (0);
}
/*
* This is server side only (RPC_SERVER).
*
* Exit idle mode.
*/
static void
mir_svc_idle_stop(queue_t *q, mir_t *mir)
{
ASSERT(MUTEX_HELD(&mir->mir_mutex));
ASSERT((q->q_flag & QREADR) == 0);
ASSERT(mir->mir_type == RPC_SERVER);
RPCLOG(16, "rpcmod: mir_svc_idle_stop of q 0x%p\n", (void *)q);
mir_timer_stop(mir);
}
/*
* This is server side only (RPC_SERVER).
*
* Start idle processing, which will include setting idle timer if the
* stream is not being closed.
*/
static void
mir_svc_idle_start(queue_t *q, mir_t *mir)
{
ASSERT(MUTEX_HELD(&mir->mir_mutex));
ASSERT((q->q_flag & QREADR) == 0);
ASSERT(mir->mir_type == RPC_SERVER);
RPCLOG(16, "rpcmod: mir_svc_idle_start q 0x%p\n", (void *)q);
/*
* Don't re-start idle timer if we are closing queues.
*/
if (mir->mir_closing) {
RPCLOG(16, "mir_svc_idle_start - closing: 0x%p\n",
(void *)q);
/*
* We will call mir_svc_idle_start() whenever MIR_SVC_QUIESCED()
* is true. When it is true, and we are in the process of
* closing the stream, signal any thread waiting in
* mir_close().
*/
if (mir->mir_inwservice == 0)
cv_signal(&mir->mir_condvar);
} else {
RPCLOG(16, "mir_svc_idle_start - reset %s timer\n",
mir->mir_ordrel_pending ? "ordrel" : "normal");
/*
* Normal condition, start the idle timer. If an orderly
* release has been sent, set the timeout to wait for the
* client to close its side of the connection. Otherwise,
* use the normal idle timeout.
*/
mir_timer_start(q, mir, mir->mir_ordrel_pending ?
svc_ordrel_timeout : mir->mir_idle_timeout);
}
}
/* ARGSUSED */
static int
mir_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
{
mir_t *mir;
RPCLOG(32, "rpcmod: mir_open of q 0x%p\n", (void *)q);
/* Set variables used directly by KRPC. */
if (!mir_rele)
mir_rele = mir_svc_release;
if (!mir_start)
mir_start = mir_svc_start;
if (!clnt_stop_idle)
clnt_stop_idle = mir_clnt_idle_do_stop;
if (!clnt_max_msg_sizep)
clnt_max_msg_sizep = &clnt_max_msg_size;
if (!svc_max_msg_sizep)
svc_max_msg_sizep = &svc_max_msg_size;
/* Allocate a zero'ed out mir structure for this stream. */
mir = kmem_zalloc(sizeof (mir_t), KM_SLEEP);
/*
* We set hold inbound here so that incoming messages will
* be held on the read-side queue until the stream is completely
* initialized with an RPC_CLIENT or RPC_SERVER ioctl. During
* the ioctl processing, the flag is cleared and any messages that
* arrived between the open and the ioctl are delivered to KRPC.
*
* Early data should never arrive on a client stream since
* servers only respond to our requests and we do not send any
* until after the stream is initialized. Early data is
* very common on a server stream where the client will start
* sending data as soon as the connection is made (and this
* is especially true with TCP where the protocol accepts the
* connection before nfsd or KRPC is notified about it).
*/
mir->mir_hold_inbound = 1;
/*
* Start the record marker looking for a 4-byte header. When
* this length is negative, it indicates that rpcmod is looking
* for bytes to consume for the record marker header. When it
* is positive, it holds the number of bytes that have arrived
* for the current fragment and are being held in mir_head_mp.
*/
mir->mir_frag_len = -(int32_t)sizeof (uint32_t);
mir->mir_zoneid = rpc_zoneid();
mutex_init(&mir->mir_mutex, NULL, MUTEX_DEFAULT, NULL);
cv_init(&mir->mir_condvar, NULL, CV_DRIVER, NULL);
cv_init(&mir->mir_timer_cv, NULL, CV_DRIVER, NULL);
q->q_ptr = (char *)mir;
WR(q)->q_ptr = (char *)mir;
/*
* We noenable the read-side queue because we don't want it
* automatically enabled by putq. We enable it explicitly
* in mir_wsrv when appropriate. (See additional comments on
* flow control at the beginning of mir_rsrv.)
*/
noenable(q);
qprocson(q);
return (0);
}
/*
* Read-side put routine for both the client and server side. Does the
* record marking for incoming RPC messages, and when complete, dispatches
* the message to either the client or server.
*/
static void
mir_do_rput(queue_t *q, mblk_t *mp, int srv)
{
mblk_t *cont_mp;
int excess;
int32_t frag_len;
int32_t frag_header;
mblk_t *head_mp;
int len;
mir_t *mir;
mblk_t *mp1;
unsigned char *rptr;
mblk_t *tail_mp;
unsigned char *wptr;
boolean_t stop_timer = B_FALSE;
mir = (mir_t *)q->q_ptr;
ASSERT(mir != NULL);
/*
* If the stream has not been set up as a RPC_CLIENT or RPC_SERVER
* with the corresponding ioctl, then don't accept
* any inbound data. This should never happen for streams
* created by nfsd or client-side KRPC because they are careful
* to set the mode of the stream before doing anything else.
*/
if (mir->mir_type == 0) {
freemsg(mp);
return;
}
ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
switch (mp->b_datap->db_type) {
case M_DATA:
break;
case M_PROTO:
case M_PCPROTO:
rptr = mp->b_rptr;
if (mp->b_wptr - rptr < sizeof (uint32_t)) {
RPCLOG(1, "mir_rput: runt TPI message (%d bytes)\n",
(int)(mp->b_wptr - rptr));
freemsg(mp);
return;
}
if (((union T_primitives *)rptr)->type != T_DATA_IND) {
mir_rput_proto(q, mp);
return;
}
/* Throw away the T_DATA_IND block and continue with data. */
mp1 = mp;
mp = mp->b_cont;
freeb(mp1);
break;
case M_SETOPTS:
/*
* If a module on the stream is trying to set the Stream head's
* high water mark, then set our hiwater to the requested
* value. We are the "stream head" for all inbound
* data messages since messages are passed directly to KRPC.
*/
if ((mp->b_wptr - mp->b_rptr) >= sizeof (struct stroptions)) {
struct stroptions *stropts;
stropts = (struct stroptions *)mp->b_rptr;
if ((stropts->so_flags & SO_HIWAT) &&
!(stropts->so_flags & SO_BAND)) {
(void) strqset(q, QHIWAT, 0, stropts->so_hiwat);
}
}
putnext(q, mp);
return;
case M_FLUSH:
RPCLOG(32, "mir_do_rput: ignoring M_FLUSH on q 0x%p. ",
(void *)q);
RPCLOG(32, "M_FLUSH is %x\n", (uint_t)*mp->b_rptr);
putnext(q, mp);
return;
default:
putnext(q, mp);
return;
}
mutex_enter(&mir->mir_mutex);
/*
* If this connection is closing, don't accept any new messages.
*/
if (mir->mir_svc_no_more_msgs) {
ASSERT(mir->mir_type == RPC_SERVER);
mutex_exit(&mir->mir_mutex);
freemsg(mp);
return;
}
/* Get local copies for quicker access. */
frag_len = mir->mir_frag_len;
frag_header = mir->mir_frag_header;
head_mp = mir->mir_head_mp;
tail_mp = mir->mir_tail_mp;
/* Loop, processing each message block in the mp chain separately. */
do {
/*
* cont_mp is used in the do/while condition below to
* walk to the next block in the STREAMS message.
* mp->b_cont may be nil'ed during processing so we
* can't rely on it to find the next block.
*/
cont_mp = mp->b_cont;
/*
* Get local copies of rptr and wptr for our processing.
* These always point into "mp" (the current block being
* processed), but rptr is updated as we consume any
* record header in this message, and wptr is updated to
* point to the end of the data for the current fragment,
* if it ends in this block. The main point is that
* they are not always the same as b_rptr and b_wptr.
* b_rptr and b_wptr will be updated when appropriate.
*/
rptr = mp->b_rptr;
wptr = mp->b_wptr;
same_mblk:;
len = (int)(wptr - rptr);
if (len <= 0) {
/*
* If we have processed all of the data in the message
* or the block is empty to begin with, then we're
* done with this block and can go on to cont_mp,
* if there is one.
*
* First, we check to see if the current block is
* now zero-length and, if so, we free it.
* This happens when either the block was empty
* to begin with or we consumed all of the data
* for the record marking header.
*/
if (rptr <= mp->b_rptr) {
/*
* If head_mp is non-NULL, add cont_mp to the
* mblk list. XXX But there is a possibility
* that tail_mp = mp or even head_mp = mp XXX
*/
if (head_mp) {
if (head_mp == mp)
head_mp = NULL;
else if (tail_mp != mp) {
ASSERT((tail_mp->b_cont == NULL) || (tail_mp->b_cont == mp));
tail_mp->b_cont = cont_mp;
/*
* It's possible that, because
* of a very short mblk (0-3
* bytes), we've ended up here
* and that cont_mp could be
* NULL (if we're at the end
* of an mblk chain). If so,
* don't set tail_mp to
* cont_mp, because the next
* time we access it, we'll
* dereference a NULL pointer
* and crash. Just leave
* tail_mp pointing at the
* current end of chain.
*/
if (cont_mp)
tail_mp = cont_mp;
} else {
mblk_t *smp = head_mp;
while ((smp->b_cont != NULL) &&
(smp->b_cont != mp))
smp = smp->b_cont;
smp->b_cont = cont_mp;
/*
* Don't set tail_mp to cont_mp
* if it's NULL. Instead, set
* tail_mp to smp, which is the
* end of the chain starting
* at head_mp.
*/
if (cont_mp)
tail_mp = cont_mp;
else
tail_mp = smp;
}
}
freeb(mp);
}
continue;
}
/*
* frag_len starts at -4 and is incremented past the record
* marking header to 0, and then becomes positive as real data
* bytes are received for the message. While frag_len is less
* than zero, we need more bytes for the record marking
* header.
*/
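/*
 * For example, if the first mblk of a record holds only two bytes, the
 * loop below consumes both for the marker, leaving frag_len at -2; the
 * remaining two header bytes are taken from the next mblk in the chain.
 */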
if (frag_len < 0) {
uchar_t *up = rptr;
/*
* Collect as many bytes as we need for the record
* marking header and that are available in this block.
*/
do {
--len;
frag_len++;
frag_header <<= 8;
frag_header += (*up++ & 0xFF);
} while (len > 0 && frag_len < 0);
if (rptr == mp->b_rptr) {
/*
* The record header is located at the
* beginning of the block, so just walk
* b_rptr past it.
*/
mp->b_rptr = rptr = up;
} else {
/*
* The record header is located in the middle
* of a block, so copy any remaining data up.
* This happens when an RPC message is
* fragmented into multiple pieces and
* a middle (or end) fragment immediately
* follows a previous fragment in the same
* message block.
*/
wptr = &rptr[len];
mp->b_wptr = wptr;
if (len) {
RPCLOG(32, "mir_do_rput: copying %d "
"bytes of data up", len);
RPCLOG(32, " db_ref %d\n",
(uint_t)mp->b_datap->db_ref);
bcopy(up, rptr, len);
}
}
/*
* If we haven't received the complete record header
* yet, then loop around to get the next block in the
* STREAMS message. The logic at same_mblk label will
* free the current block if it has become empty.
*/
if (frag_len < 0) {
RPCLOG(32, "mir_do_rput: frag_len is still < 0 "
"(%d)", len);
goto same_mblk;
}
#ifdef RPCDEBUG
if ((frag_header & MIR_LASTFRAG) == 0) {
RPCLOG0(32, "mir_do_rput: multi-fragment "
"record\n");
}
{
uint_t l = frag_header & ~MIR_LASTFRAG;
if (l != 0 && mir->mir_max_msg_sizep &&
l >= *mir->mir_max_msg_sizep) {
RPCLOG(32, "mir_do_rput: fragment size"
" (%d) > maximum", l);
RPCLOG(32, " (%u)\n",
*mir->mir_max_msg_sizep);
}
}
#endif
/*
* At this point we have retrieved the complete record
* header for this fragment. If the current block is
* empty, then we need to free it and walk to the next
* block.
*/
if (mp->b_rptr >= wptr) {
/*
* If this is not the last fragment or if we
* have not received all the data for this
* RPC message, then loop around to the next
* block.
*/
if (!(frag_header & MIR_LASTFRAG) ||
(frag_len -
(frag_header & ~MIR_LASTFRAG)) ||
!head_mp)
goto same_mblk;
/*
* Quick walk to next block in the
* STREAMS message.
*/
freeb(mp);
continue;
}
}
/*
* We've collected the complete record header. The data
* in the current block is added to the end of the RPC
* message. Note that tail_mp is the same as mp after
* this linkage.
*/
if (!head_mp)
head_mp = mp;
else if (tail_mp != mp) {
ASSERT((tail_mp->b_cont == NULL) ||
(tail_mp->b_cont == mp));
tail_mp->b_cont = mp;
}
tail_mp = mp;
/*
* Add the length of this block to the accumulated
* fragment length.
*/
frag_len += len;
excess = frag_len - (frag_header & ~MIR_LASTFRAG);
/*
* If we have not received all the data for this fragment,
* then walk to the next block.
*/
if (excess < 0)
continue;
/*
* We've received a complete fragment, so reset frag_len
* for the next one.
*/
frag_len = -(int32_t)sizeof (uint32_t);
/*
* Update rptr to point to the beginning of the next
* fragment in this block. If there are no more bytes
* in the block (excess is 0), then rptr will be equal
* to wptr.
*/
rptr = wptr - excess;
/*
* Now we check to see if this fragment is the last one in
* the RPC message.
*/
if (!(frag_header & MIR_LASTFRAG)) {
/*
* This isn't the last one, so start processing the
* next fragment.
*/
frag_header = 0;
/*
* If excess is 0, the next fragment
* starts at the beginning of the next block --
* we "continue" to the end of the while loop and
* walk to cont_mp.
*/
if (excess == 0)
continue;
RPCLOG0(32, "mir_do_rput: multi-fragment message with "
"two or more fragments in one mblk\n");
/*
* If excess is non-0, then the next fragment starts
* in this block. rptr points to the beginning
* of the next fragment and we "goto same_mblk"
* to continue processing.
*/
goto same_mblk;
}
/*
* We've got a complete RPC message. Before passing it
* upstream, check to see if there is extra data in this
* message block. If so, then we separate the excess
* from the complete message. The excess data is processed
* after the current message goes upstream.
*/
if (excess > 0) {
RPCLOG(32, "mir_do_rput: end of record, but excess "
"data (%d bytes) in this mblk. dupb/copyb "
"needed\n", excess);
/* Duplicate only the overlapping block. */
mp1 = dupb(tail_mp);
/*
* dupb() might have failed due to ref count wraparound,
* so try a copyb().
*/
if (mp1 == NULL)
mp1 = copyb(tail_mp);
/*
* Do not use bufcall() to schedule a "buffer
* availability event." The reason is that
* bufcall() has problems. For example, if memory
* runs out, bufcall() itself will fail since it
* needs to allocate memory. The most appropriate
* action right now is to disconnect this connection
* as the system is under stress. We should try to
* free up resources.
*/
if (mp1 == NULL) {
freemsg(head_mp);
RPCLOG0(1, "mir_do_rput: dupb/copyb failed\n");
mir->mir_frag_header = 0;
mir->mir_frag_len = -(int)sizeof (uint32_t);
mir->mir_head_mp = NULL;
mir->mir_tail_mp = NULL;
mir_disconnect(q, mir);
return;
}
/*
* The new message block is linked with the
* continuation block in cont_mp. We then point
* cont_mp to the new block so that we will
* process it next.
*/
mp1->b_cont = cont_mp;
cont_mp = mp1;
/*
* Data in the new block begins at the
* next fragment (rptr).
*/
cont_mp->b_rptr += (rptr - tail_mp->b_rptr);
ASSERT(cont_mp->b_rptr >= cont_mp->b_datap->db_base);
ASSERT(cont_mp->b_rptr <= cont_mp->b_wptr);
/* Data in the current fragment ends at rptr. */
tail_mp->b_wptr = rptr;
ASSERT(tail_mp->b_wptr <= tail_mp->b_datap->db_lim);
ASSERT(tail_mp->b_wptr >= tail_mp->b_rptr);
}
/* tail_mp is the last block with data for this RPC message. */
tail_mp->b_cont = NULL;
/* Pass the RPC message to the current consumer. */
switch (mir->mir_type) {
case RPC_CLIENT:
if (clnt_dispatch_notify(head_mp, mir->mir_zoneid)) {
/*
* Mark this stream as active. This marker
* is used in mir_timer().
*/
mir->mir_clntreq = 1;
mir->mir_use_timestamp = lbolt;
} else
freemsg(head_mp);
break;
case RPC_SERVER:
/*
* Check for flow control before passing the
* message to KRPC.
*/
if (!mir->mir_hold_inbound) {
if (mir->mir_krpc_cell) {
/*
* If the reference count is 0
* (not including this request),
* then the stream is transitioning
* from idle to non-idle. In this case,
* we cancel the idle timer.
*/
if (mir->mir_ref_cnt++ == 0)
stop_timer = B_TRUE;
/*
* Length-check the complete message (head_mp),
* not just the current tail mblk.
*/
if (mir_check_len(q,
(int32_t)msgdsize(head_mp), head_mp))
return;
svc_queuereq(q, head_mp); /* to KRPC */
} else {
/*
* Count # of times this happens. Should be
* never, but experience shows otherwise.
*/
mir_krpc_cell_null++;
freemsg(head_mp);
}
} else {
/*
* If the outbound side of the stream is
* flow controlled, then hold this message
* until client catches up. mir_hold_inbound
* is set in mir_wput and cleared in mir_wsrv.
*/
if (srv)
(void) putbq(q, head_mp);
else
(void) putq(q, head_mp);
mir->mir_inrservice = B_TRUE;
}
break;
default:
RPCLOG(1, "mir_rput: unknown mir_type %d\n",
mir->mir_type);
freemsg(head_mp);
break;
}
/*
* Reset head_mp and frag_header since we're starting on a
* new RPC fragment and message.
*/
head_mp = NULL;
tail_mp = NULL;
frag_header = 0;
} while ((mp = cont_mp) != NULL);
/*
* Do a sanity check on the message length. If this message is
* getting excessively large, shut down the connection.
*/
if (head_mp != NULL && mir->mir_setup_complete &&
mir_check_len(q, frag_len, head_mp))
return;
/* Save our local copies back in the mir structure. */
mir->mir_frag_header = frag_header;
mir->mir_frag_len = frag_len;
mir->mir_head_mp = head_mp;
mir->mir_tail_mp = tail_mp;
/*
* The timer is stopped only after the whole message chain is
* processed, because stopping the timer releases the mir_mutex
* lock temporarily, which would let the request be serviced while
* we are still processing the message chain. So we defer stopping
* the timer until here.
*
* Note that if the timer fires before we stop it, it will not
* do any harm, as MIR_SVC_QUIESCED() is false and mir_timer()
* will just return.
*/
if (stop_timer) {
RPCLOG(16, "mir_do_rput stopping idle timer on 0x%p because "
"ref cnt going to non zero\n", (void *) WR(q));
mir_svc_idle_stop(WR(q), mir);
}
mutex_exit(&mir->mir_mutex);
}
static void
mir_rput(queue_t *q, mblk_t *mp)
{
mir_do_rput(q, mp, 0);
}
static void
mir_rput_proto(queue_t *q, mblk_t *mp)
{
mir_t *mir = (mir_t *)q->q_ptr;
uint32_t type;
uint32_t reason = 0;
ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
type = ((union T_primitives *)mp->b_rptr)->type;
switch (mir->mir_type) {
case RPC_CLIENT:
switch (type) {
case T_DISCON_IND:
reason =
((struct T_discon_ind *)(mp->b_rptr))->DISCON_reason;
/*FALLTHROUGH*/
case T_ORDREL_IND:
mutex_enter(&mir->mir_mutex);
if (mir->mir_head_mp) {
freemsg(mir->mir_head_mp);
mir->mir_head_mp = (mblk_t *)0;
mir->mir_tail_mp = (mblk_t *)0;
}
/*
* We are disconnecting, but not necessarily
* closing. By not closing, we will fail to
* pick up a possibly changed global timeout value,
* unless we store it now.
*/
mir->mir_idle_timeout = clnt_idle_timeout;
mir_clnt_idle_stop(WR(q), mir);
/*
* Even though we are unconnected, we still
* leave the idle timer going on the client. The
* reason is that if we've disconnected due
* to a server-side disconnect, reset, or connection
* timeout, there is a possibility the client may
* retry the RPC request. This retry needs to be done on
* the same bound address for the server to interpret
* it as such. However, we don't want
* to wait forever for that possibility. If the
* end-point stays unconnected for mir_idle_timeout
* units of time, then that is a signal to the
* connection manager to give up waiting for the
* application (eg. NFS) to send a retry.
*/
mir_clnt_idle_start(WR(q), mir);
mutex_exit(&mir->mir_mutex);
clnt_dispatch_notifyall(WR(q), type, reason);
freemsg(mp);
return;
case T_ERROR_ACK:
{
struct T_error_ack *terror;
terror = (struct T_error_ack *)mp->b_rptr;
RPCLOG(1, "mir_rput_proto T_ERROR_ACK for queue 0x%p",
(void *)q);
RPCLOG(1, " ERROR_prim: %s,",
rpc_tpiprim2name(terror->ERROR_prim));
RPCLOG(1, " TLI_error: %s,",
rpc_tpierr2name(terror->TLI_error));
RPCLOG(1, " UNIX_error: %d\n", terror->UNIX_error);
if (terror->ERROR_prim == T_DISCON_REQ) {
clnt_dispatch_notifyall(WR(q), type, reason);
freemsg(mp);
return;
} else {
if (clnt_dispatch_notifyconn(WR(q), mp))
return;
}
break;
}
case T_OK_ACK:
{
struct T_ok_ack *tok = (struct T_ok_ack *)mp->b_rptr;
if (tok->CORRECT_prim == T_DISCON_REQ) {
clnt_dispatch_notifyall(WR(q), type, reason);
freemsg(mp);
return;
} else {
if (clnt_dispatch_notifyconn(WR(q), mp))
return;
}
break;
}
case T_CONN_CON:
case T_INFO_ACK:
case T_OPTMGMT_ACK:
if (clnt_dispatch_notifyconn(WR(q), mp))
return;
break;
case T_BIND_ACK:
break;
default:
RPCLOG(1, "mir_rput: unexpected message %d "
"for KRPC client\n",
((union T_primitives *)mp->b_rptr)->type);
break;
}
break;
case RPC_SERVER:
switch (type) {
case T_BIND_ACK:
{
struct T_bind_ack *tbind;
/*
* If this is a listening stream, then shut
* off the idle timer.
*/
tbind = (struct T_bind_ack *)mp->b_rptr;
if (tbind->CONIND_number > 0) {
mutex_enter(&mir->mir_mutex);
mir_svc_idle_stop(WR(q), mir);
/*
* mark this as a listen endpoint
* for special handling.
*/
mir->mir_listen_stream = 1;
mutex_exit(&mir->mir_mutex);
}
break;
}
case T_DISCON_IND:
case T_ORDREL_IND:
RPCLOG(16, "mir_rput_proto: got %s indication\n",
type == T_DISCON_IND ? "disconnect"
: "orderly release");
/*
* For a listen endpoint, just pass
* the message on.
*/
if (mir->mir_listen_stream)
break;
mutex_enter(&mir->mir_mutex);
/*
* If client wants to break off connection, record
* that fact.
*/
mir_svc_start_close(WR(q), mir);
/*
* If we are idle, then send the orderly release
* or disconnect indication to nfsd.
*/
if (MIR_SVC_QUIESCED(mir)) {
mutex_exit(&mir->mir_mutex);
break;
}
RPCLOG(16, "mir_rput_proto: not idle, so "
"disconnect/ord rel indication not passed "
"upstream on 0x%p\n", (void *)q);
/*
* Hold the indication until we go idle.
* If there already is an indication stored,
* replace it if the new one is a disconnect. The
* reasoning is that disconnection takes less time
* to process, and once a client decides to
* disconnect, we should do that.
*/
if (mir->mir_svc_pend_mp) {
if (type == T_DISCON_IND) {
RPCLOG(16, "mir_rput_proto: replacing"
" held disconnect/ord rel"
" indication with disconnect on"
" 0x%p\n", (void *)q);
freemsg(mir->mir_svc_pend_mp);
mir->mir_svc_pend_mp = mp;
} else {
RPCLOG(16, "mir_rput_proto: already "
"held a disconnect/ord rel "
"indication. freeing ord rel "
"ind on 0x%p\n", (void *)q);
freemsg(mp);
}
} else
mir->mir_svc_pend_mp = mp;
mutex_exit(&mir->mir_mutex);
return;
default:
/* nfsd handles server-side non-data messages. */
break;
}
break;
default:
break;
}
putnext(q, mp);
}
/*
* The server-side read queues are used to hold inbound messages while
* outbound flow control is exerted. When outbound flow control is
* relieved, mir_wsrv qenables the read-side queue. Read-side queues
* are not enabled by STREAMS and are explicitly noenable'ed in mir_open.
*
* For the server side, we have two types of messages queued. The first type
* are messages that are ready to be XDR decoded and then sent to the
* RPC program's dispatch routine. The second type are "raw" messages that
* haven't been processed, i.e. assembled from RPC record fragments into
* full requests. The only time we will see the second type of message
* queued is if we have a memory allocation failure while processing
* a raw message. The field mir_first_non_processed_mblk will mark the
* first such raw message. So the flow for server side is:
*
* - send processed queued messages to kRPC until we run out or find
* one that needs additional processing because we were short on memory
* earlier
* - process a message that was deferred because of lack of
* memory
* - continue processing messages until the queue empties or we
* have to stop because of lack of memory
* - during each of the above phases, if the queue is empty and
* there are no pending messages that were passed to the RPC
* layer, send upstream the pending disconnect/ordrel indication if
* there is one
*
* The read-side queue is also enabled by a bufcall callback if dupmsg
* fails in mir_rput.
*/
static void
mir_rsrv(queue_t *q)
{
mir_t *mir;
mblk_t *mp;
mblk_t *cmp = NULL;
boolean_t stop_timer = B_FALSE;
mir = (mir_t *)q->q_ptr;
mutex_enter(&mir->mir_mutex);
mp = NULL;
switch (mir->mir_type) {
case RPC_SERVER:
if (mir->mir_ref_cnt == 0)
mir->mir_hold_inbound = 0;
if (mir->mir_hold_inbound) {
ASSERT(cmp == NULL);
if (q->q_first == NULL) {
MIR_CLEAR_INRSRV(mir);
if (MIR_SVC_QUIESCED(mir)) {
cmp = mir->mir_svc_pend_mp;
mir->mir_svc_pend_mp = NULL;
}
}
mutex_exit(&mir->mir_mutex);
if (cmp != NULL) {
RPCLOG(16, "mir_rsrv: line %d: sending a held "
"disconnect/ord rel indication upstream\n",
__LINE__);
putnext(q, cmp);
}
return;
}
while ((mp = getq(q)) != NULL) {
if (mir->mir_krpc_cell) {
/*
* If we were idle, turn off idle timer since
* we aren't idle any more.
*/
if (mir->mir_ref_cnt++ == 0)
stop_timer = B_TRUE;
if (mir_check_len(q,
(int32_t)msgdsize(mp), mp))
return;
svc_queuereq(q, mp);
} else {
/*
* Count # of times this happens. Should be
* never, but experience shows otherwise.
*/
mir_krpc_cell_null++;
freemsg(mp);
}
}
break;
case RPC_CLIENT:
break;
default:
RPCLOG(1, "mir_rsrv: unexpected mir_type %d\n", mir->mir_type);
if (q->q_first == NULL)
MIR_CLEAR_INRSRV(mir);
mutex_exit(&mir->mir_mutex);
return;
}
/*
* The timer is stopped only after all the messages are processed,
* because stopping the timer releases the mir_mutex lock
* temporarily, which would let a request be serviced while we are
* still processing the message queue. So we defer stopping the
* timer until here.
*/
if (stop_timer) {
RPCLOG(16, "mir_rsrv stopping idle timer on 0x%p because ref "
"cnt going to non zero\n", (void *)WR(q));
mir_svc_idle_stop(WR(q), mir);
}
if (q->q_first == NULL) {
MIR_CLEAR_INRSRV(mir);
ASSERT(cmp == NULL);
if (mir->mir_type == RPC_SERVER && MIR_SVC_QUIESCED(mir)) {
cmp = mir->mir_svc_pend_mp;
mir->mir_svc_pend_mp = NULL;
}
mutex_exit(&mir->mir_mutex);
if (cmp != NULL) {
RPCLOG(16, "mir_rsrv: line %d: sending a held "
"disconnect/ord rel indication upstream\n",
__LINE__);
putnext(q, cmp);
}
return;
}
mutex_exit(&mir->mir_mutex);
}
static int mir_svc_policy_fails;
/*
* Called to send an event code to nfsd/lockd so that it initiates
* connection close.
*/
static int
mir_svc_policy_notify(queue_t *q, int event)
{
mblk_t *mp;
#ifdef DEBUG
mir_t *mir = (mir_t *)q->q_ptr;
ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
#endif
ASSERT(q->q_flag & QREADR);
/*
* Create an M_DATA message with the event code and pass it to the
* Stream head (nfsd or whoever created the stream will consume it).
*/
mp = allocb(sizeof (int), BPRI_HI);
if (!mp) {
mir_svc_policy_fails++;
RPCLOG(16, "mir_svc_policy_notify: could not allocate event "
"%d\n", event);
return (ENOMEM);
}
U32_TO_BE32(event, mp->b_rptr);
mp->b_wptr = mp->b_rptr + sizeof (int);
putnext(q, mp);
return (0);
}
/*
* Server side: start the close phase. We want to get this rpcmod slot in an
* idle state before mir_close() is called.
*/
static void
mir_svc_start_close(queue_t *wq, mir_t *mir)
{
ASSERT(MUTEX_HELD(&mir->mir_mutex));
ASSERT((wq->q_flag & QREADR) == 0);
ASSERT(mir->mir_type == RPC_SERVER);
/*
* Do not accept any more messages.
*/
mir->mir_svc_no_more_msgs = 1;
/*
* The next two statements will make the read service procedure invoke
* svc_queuereq() on everything stuck in the streams read queue.
* It's not necessary because enabling the write queue will
* have the same effect, but why not speed the process along?
*/
mir->mir_hold_inbound = 0;
qenable(RD(wq));
/*
* Meanwhile force the write service procedure to send the
* responses downstream, regardless of flow control.
*/
qenable(wq);
}
/*
* This routine is called directly by KRPC after a request is completed,
* whether a reply was sent or the request was dropped.
*/
static void
mir_svc_release(queue_t *wq, mblk_t *mp)
{
mir_t *mir = (mir_t *)wq->q_ptr;
mblk_t *cmp = NULL;
ASSERT((wq->q_flag & QREADR) == 0);
if (mp)
freemsg(mp);
mutex_enter(&mir->mir_mutex);
mir->mir_ref_cnt--;
ASSERT(mir->mir_ref_cnt >= 0);
/*
* Start idle processing if this is the last reference.
*/
if (MIR_SVC_QUIESCED(mir)) {
RPCLOG(16, "mir_svc_release starting idle timer on 0x%p "
"because ref cnt is zero\n", (void *) wq);
cmp = mir->mir_svc_pend_mp;
mir->mir_svc_pend_mp = NULL;
mir_svc_idle_start(wq, mir);
}
mutex_exit(&mir->mir_mutex);
if (cmp) {
RPCLOG(16, "mir_svc_release: sending a held "
"disconnect/ord rel indication upstream on queue 0x%p\n",
(void *)RD(wq));
putnext(RD(wq), cmp);
}
}
/*
* This routine is called by server-side KRPC when it is ready to
* handle inbound messages on the stream.
*/
static void
mir_svc_start(queue_t *wq)
{
mir_t *mir = (mir_t *)wq->q_ptr;
mutex_enter(&mir->mir_mutex);
mir->mir_setup_complete = 1;
mutex_exit(&mir->mir_mutex);
qenable(RD(wq));
}
/*
* client side wrapper for stopping timer with normal idle timeout.
*/
static void
mir_clnt_idle_stop(queue_t *wq, mir_t *mir)
{
ASSERT(MUTEX_HELD(&mir->mir_mutex));
ASSERT((wq->q_flag & QREADR) == 0);
ASSERT(mir->mir_type == RPC_CLIENT);
mir_timer_stop(mir);
}
/*
* client side wrapper for starting timer with normal idle timeout.
*/
static void
mir_clnt_idle_start(queue_t *wq, mir_t *mir)
{
ASSERT(MUTEX_HELD(&mir->mir_mutex));
ASSERT((wq->q_flag & QREADR) == 0);
ASSERT(mir->mir_type == RPC_CLIENT);
mir_timer_start(wq, mir, mir->mir_idle_timeout);
}
/*
* client side only. Forces rpcmod to stop sending T_ORDREL_REQs on
* end-points that aren't connected.
*/
static void
mir_clnt_idle_do_stop(queue_t *wq)
{
mir_t *mir = (mir_t *)wq->q_ptr;
RPCLOG(1, "mir_clnt_idle_do_stop: wq 0x%p\n", (void *)wq);
ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
mutex_enter(&mir->mir_mutex);
mir_clnt_idle_stop(wq, mir);
mutex_exit(&mir->mir_mutex);
}
/*
* Timer handler. It handles idle timeouts and memory shortage problems.
*/
static void
mir_timer(void *arg)
{
queue_t *wq = (queue_t *)arg;
mir_t *mir = (mir_t *)wq->q_ptr;
boolean_t notify;
mutex_enter(&mir->mir_mutex);
/*
* mir_timer_call is set only while mir_timer_start() or
* mir_timer_stop() is in progress, and mir_timer() can only run
* concurrently with them when the timer is being stopped. So just
* return.
*/
if (mir->mir_timer_call) {
mutex_exit(&mir->mir_mutex);
return;
}
mir->mir_timer_id = 0;
switch (mir->mir_type) {
case RPC_CLIENT:
/*
* For clients, the timer fires at clnt_idle_timeout
* intervals. If the activity marker (mir_clntreq) is
* zero, then the stream has been idle since the last
* timer event and we notify KRPC. If mir_clntreq is
* non-zero, then the stream is active and we just
* restart the timer for another interval. mir_clntreq
* is set to 1 in mir_wput for every request passed
* downstream.
*
* If this was a memory-shortage timer, reset the idle
* timeout regardless; mir_clntreq will not be a valid
* indicator in that case.
*
* The timer is initially started in mir_wput during
* RPC_CLIENT ioctl processing.
*
* The timer interval can be changed for individual
* streams with the ND variable "mir_idle_timeout".
*/
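/*
* If a request has been passed down since the last timer event
* and the time of last use plus the idle timeout still lies in
* the future, rearm the timer for just the remaining interval.
*/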
if (mir->mir_clntreq > 0 && mir->mir_use_timestamp +
MSEC_TO_TICK(mir->mir_idle_timeout) - lbolt >= 0) {
clock_t tout;
tout = mir->mir_idle_timeout -
TICK_TO_MSEC(lbolt - mir->mir_use_timestamp);
if (tout < 0)
tout = 1000;
#if 0
printf("mir_timer[%d < %d + %d]: reset client timer to %d (ms)\n",
TICK_TO_MSEC(lbolt), TICK_TO_MSEC(mir->mir_use_timestamp),
mir->mir_idle_timeout, tout);
#endif
mir->mir_clntreq = 0;
mir_timer_start(wq, mir, tout);
mutex_exit(&mir->mir_mutex);
return;
}
#if 0
printf("mir_timer[%d]: doing client timeout\n", lbolt / hz);
#endif
/*
* We are disconnecting, but not necessarily
* closing. By not closing, we will fail to
* pick up a possibly changed global timeout value,
* unless we store it now.
*/
mir->mir_idle_timeout = clnt_idle_timeout;
mir_clnt_idle_start(wq, mir);
mutex_exit(&mir->mir_mutex);
/*
* We pass T_ORDREL_REQ as an integer value
* to KRPC as the indication that the stream
* is idle. This is not a T_ORDREL_REQ message,
* it is just a convenient value since we call
* the same KRPC routine for T_ORDREL_INDs and
* T_DISCON_INDs.
*/
clnt_dispatch_notifyall(wq, T_ORDREL_REQ, 0);
return;
case RPC_SERVER:
/*
* For servers, the timer is only running when the stream
* is really idle or memory is short. The timer is started
* by mir_wput when mir_type is set to RPC_SERVER and
* by mir_svc_idle_start whenever the stream goes idle
* (mir_ref_cnt == 0). The timer is cancelled in
* mir_rput whenever a new inbound request is passed to KRPC
* and the stream was previously idle.
*
* The timer interval can be changed for individual
* streams with the ND variable "mir_idle_timeout".
*
* If the stream is not idle do nothing.
*/
if (!MIR_SVC_QUIESCED(mir)) {
mutex_exit(&mir->mir_mutex);
return;
}
notify = !mir->mir_inrservice;
mutex_exit(&mir->mir_mutex);
/*
* If no packets are queued on the read queue, the stream is
* truly idle, so notify nfsd to close it.
*/
if (notify) {
RPCLOG(16, "mir_timer: telling stream head listener "
"to close stream (0x%p)\n", (void *) RD(wq));
(void) mir_svc_policy_notify(RD(wq), 1);
}
return;
default:
RPCLOG(1, "mir_timer: unexpected mir_type %d\n",
mir->mir_type);
mutex_exit(&mir->mir_mutex);
return;
}
}
/*
* Called by the RPC package to send a call, a reply, or a
* transport connection request. Adds the record-marking header.
*/
static void
mir_wput(queue_t *q, mblk_t *mp)
{
uint_t frag_header;
mir_t *mir = (mir_t *)q->q_ptr;
uchar_t *rptr = mp->b_rptr;
if (!mir) {
freemsg(mp);
return;
}
if (mp->b_datap->db_type != M_DATA) {
mir_wput_other(q, mp);
return;
}
if (mir->mir_ordrel_pending == 1) {
freemsg(mp);
RPCLOG(16, "mir_wput wq 0x%p: got data after T_ORDREL_REQ\n",
(void *)q);
return;
}
frag_header = (uint_t)DLEN(mp);
frag_header |= MIR_LASTFRAG;
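/*
* The record mark is one 32-bit word: the high-order bit
* (MIR_LASTFRAG) flags the final fragment of a record, and the
* low 31 bits carry the fragment length. For example, a
* 100-byte message sent as a single fragment is marked
* 0x80000064.
*/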
/* Stick in the 4 byte record marking header. */
if ((rptr - mp->b_datap->db_base) < sizeof (uint32_t) ||
!IS_P2ALIGNED(mp->b_rptr, sizeof (uint32_t))) {
/*
* Since M_DATA messages are created exclusively by KRPC,
* we expect KRPC to leave room for our header and to
* 4-byte align the data, as is normal for XDR.
* If KRPC (or someone else) does not cooperate, we
* simply throw the message away.
*/
RPCLOG(1, "mir_wput: KRPC did not leave space for record "
"fragment header (%d bytes left)\n",
(int)(rptr - mp->b_datap->db_base));
freemsg(mp);
return;
}
rptr -= sizeof (uint32_t);
*(uint32_t *)rptr = htonl(frag_header);
mp->b_rptr = rptr;
mutex_enter(&mir->mir_mutex);
if (mir->mir_type == RPC_CLIENT) {
/*
* For the client, set mir_clntreq to indicate that the
* connection is active, and record the time of use so that
* mir_timer() can compute how long the stream has been idle.
*/
mir->mir_clntreq = 1;
mir->mir_use_timestamp = lbolt;
}
/*
* If we haven't already queued some data and the downstream module
* can accept more data, send it on, otherwise we queue the message
* and take other actions depending on mir_type.
*/
if (!mir->mir_inwservice && MIR_WCANPUTNEXT(mir, q)) {
mutex_exit(&mir->mir_mutex);
/*
* Now we pass the RPC message downstream.
*/
putnext(q, mp);
return;
}
switch (mir->mir_type) {
case RPC_CLIENT:
/*
* Check for a previous duplicate request on the
* queue. If there is one, then we throw away
* the current message and let the previous one
* go through. If we can't find a duplicate, then
* send this one. This tap dance is an effort
* to reduce traffic and processing requirements
* under load conditions.
*/
if (mir_clnt_dup_request(q, mp)) {
mutex_exit(&mir->mir_mutex);
freemsg(mp);
return;
}
break;
case RPC_SERVER:
/*
* Set mir_hold_inbound so that new inbound RPC
* messages will be held until the client catches
* up on the earlier replies. This flag is cleared
* in mir_wsrv after flow control is relieved;
* the read-side queue is also enabled at that time.
*/
mir->mir_hold_inbound = 1;
break;
default:
RPCLOG(1, "mir_wput: unexpected mir_type %d\n", mir->mir_type);
break;
}
mir->mir_inwservice = 1;
(void) putq(q, mp);
mutex_exit(&mir->mir_mutex);
}
static void
mir_wput_other(queue_t *q, mblk_t *mp)
{
mir_t *mir = (mir_t *)q->q_ptr;
struct iocblk *iocp;
uchar_t *rptr = mp->b_rptr;
bool_t flush_in_svc = FALSE;
ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
switch (mp->b_datap->db_type) {
case M_IOCTL:
iocp = (struct iocblk *)rptr;
switch (iocp->ioc_cmd) {
case RPC_CLIENT:
mutex_enter(&mir->mir_mutex);
if (mir->mir_type != 0 &&
mir->mir_type != iocp->ioc_cmd) {
ioc_eperm:
mutex_exit(&mir->mir_mutex);
iocp->ioc_error = EPERM;
iocp->ioc_count = 0;
mp->b_datap->db_type = M_IOCACK;
qreply(q, mp);
return;
}
mir->mir_type = iocp->ioc_cmd;
/*
* Clear mir_hold_inbound, which was set to 1 by
* mir_open. This flag is not used on client
* streams.
*/
mir->mir_hold_inbound = 0;
mir->mir_max_msg_sizep = &clnt_max_msg_size;
/*
* Start the idle timer. See mir_timer() for more
* information on how client timers work.
*/
mir->mir_idle_timeout = clnt_idle_timeout;
mir_clnt_idle_start(q, mir);
mutex_exit(&mir->mir_mutex);
mp->b_datap->db_type = M_IOCACK;
qreply(q, mp);
return;
case RPC_SERVER:
mutex_enter(&mir->mir_mutex);
if (mir->mir_type != 0 &&
mir->mir_type != iocp->ioc_cmd)
goto ioc_eperm;
/*
* We don't clear mir_hold_inbound here because it is
* part of the flow control model. Clearing it here
* would slightly violate that model, since the
* transport might immediately block downstream flow.
*/
mir->mir_type = iocp->ioc_cmd;
mir->mir_max_msg_sizep = &svc_max_msg_size;
/*
* Start the idle timer. See mir_timer() for more
* information on how server timers work.
*
* Note that it is important to start the idle timer
* here so that connections time out even if we
* never receive any data on them.
*/
mir->mir_idle_timeout = svc_idle_timeout;
RPCLOG(16, "mir_wput_other starting idle timer on 0x%p "
"because we got RPC_SERVER ioctl\n", (void *)q);
mir_svc_idle_start(q, mir);
mutex_exit(&mir->mir_mutex);
mp->b_datap->db_type = M_IOCACK;
qreply(q, mp);
return;
default:
break;
}
break;
case M_PROTO:
if (mir->mir_type == RPC_CLIENT) {
/*
* We are likely being called from the context of a
* service procedure, so we need to enqueue. However,
* enqueuing may put our message behind data messages,
* so flush the data first.
*/
flush_in_svc = TRUE;
}
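/*
* A TPI primitive carries at least a 4-byte-aligned 32-bit type
* field; anything shorter or misaligned is simply passed
* downstream untouched.
*/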
if ((mp->b_wptr - rptr) < sizeof (uint32_t) ||
!IS_P2ALIGNED(rptr, sizeof (uint32_t)))
break;
switch (((union T_primitives *)rptr)->type) {
case T_DATA_REQ:
/* Don't pass T_DATA_REQ messages downstream. */
freemsg(mp);
return;
case T_ORDREL_REQ:
RPCLOG(8, "mir_wput_other wq 0x%p: got T_ORDREL_REQ\n",
(void *)q);
mutex_enter(&mir->mir_mutex);
if (mir->mir_type != RPC_SERVER) {
/*
* We are likely being called from
* clnt_dispatch_notifyall(). Sending a
* T_ORDREL_REQ will result in some kind of
* _IND message coming back, which in turn
* triggers another call to
* clnt_dispatch_notifyall(). To keep the
* stack lean, queue this message.
*/
mir->mir_inwservice = 1;
(void) putq(q, mp);
mutex_exit(&mir->mir_mutex);
return;
}
/*
* Mark the structure so that we don't accept any
* more requests from the client. We could defer this
* until we actually send the orderly release
* request downstream, but all that does is delay
* the closing of this stream.
*/
RPCLOG(16, "mir_wput_other wq 0x%p: got T_ORDREL_REQ "
"so calling mir_svc_start_close\n", (void *)q);
mir_svc_start_close(q, mir);
/*
* If we have sent down a T_ORDREL_REQ, don't send
* any more.
*/
if (mir->mir_ordrel_pending) {
freemsg(mp);
mutex_exit(&mir->mir_mutex);
return;
}
/*
* If the stream is not idle, then we hold the
* orderly release until it becomes idle. This
* ensures that KRPC will be able to reply to
* all requests that we have passed to it.
*
* We also queue the request if there is data already
* queued, because we cannot allow the T_ORDREL_REQ
* to go before data. When we had a separate reply
* count, this was not a problem, because the
* reply count was reconciled when mir_wsrv()
* completed.
*/
if (!MIR_SVC_QUIESCED(mir) ||
mir->mir_inwservice == 1) {
mir->mir_inwservice = 1;
(void) putq(q, mp);
RPCLOG(16, "mir_wput_other: queuing "
"T_ORDREL_REQ on 0x%p\n", (void *)q);
mutex_exit(&mir->mir_mutex);
return;
}
/*
* Mark the structure so that we know we sent
* an orderly release request, and reset the idle timer.
*/
mir->mir_ordrel_pending = 1;
RPCLOG(16, "mir_wput_other: calling mir_svc_idle_start"
" on 0x%p because we got T_ORDREL_REQ\n",
(void *)q);
mir_svc_idle_start(q, mir);
mutex_exit(&mir->mir_mutex);
/*
* When we break, we will putnext the T_ORDREL_REQ.
*/
break;
case T_CONN_REQ:
mutex_enter(&mir->mir_mutex);
if (mir->mir_head_mp != NULL) {
freemsg(mir->mir_head_mp);
mir->mir_head_mp = NULL;
mir->mir_tail_mp = NULL;
}
mir->mir_frag_len = -(int32_t)sizeof (uint32_t);
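/*
* Resetting mir_frag_len along with the head/tail pointers
* above discards any partially assembled record before the
* new connection is attempted.
*/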
/*
* Restart timer in case mir_clnt_idle_do_stop() was
* called.
*/
mir->mir_idle_timeout = clnt_idle_timeout;
mir_clnt_idle_stop(q, mir);
mir_clnt_idle_start(q, mir);
mutex_exit(&mir->mir_mutex);
break;
default:
/*
* T_DISCON_REQ is one of the interesting default
* cases here. Ideally, an M_FLUSH would be sent
* down before the T_DISCON_REQ, but that is somewhat
* cumbersome for clnt_cots.c to do. So we queue the
* T_DISCON_REQ and let the service procedure
* flush all M_DATA.
*/
break;
}
/*
* Any M_PROTO message not consumed above falls through to the
* default case, which queues it or passes it downstream.
*/
/* FALLTHROUGH */
default:
if (mp->b_datap->db_type >= QPCTL) {
if (mp->b_datap->db_type == M_FLUSH) {
if (mir->mir_type == RPC_CLIENT &&
*mp->b_rptr & FLUSHW) {
RPCLOG(32, "mir_wput_other: flushing "
"wq 0x%p\n", (void *)q);
if (*mp->b_rptr & FLUSHBAND) {
flushband(q, *(mp->b_rptr + 1),
FLUSHDATA);
} else {
flushq(q, FLUSHDATA);
}
} else {
RPCLOG(32, "mir_wput_other: ignoring "
"M_FLUSH on wq 0x%p\n", (void *)q);
}
}
break;
}
mutex_enter(&mir->mir_mutex);
if (mir->mir_inwservice == 0 && MIR_WCANPUTNEXT(mir, q)) {
mutex_exit(&mir->mir_mutex);
break;
}
mir->mir_inwservice = 1;
mir->mir_inwflushdata = flush_in_svc;
(void) putq(q, mp);
mutex_exit(&mir->mir_mutex);
qenable(q);
return;
}
putnext(q, mp);
}
static void
mir_wsrv(queue_t *q)
{
mblk_t *mp;
mir_t *mir;
bool_t flushdata;
mir = (mir_t *)q->q_ptr;
mutex_enter(&mir->mir_mutex);
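/*
* Snapshot and clear the flush request up front so that it
* applies only to the data queued before this service run.
*/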
flushdata = mir->mir_inwflushdata;
mir->mir_inwflushdata = 0;
while (mp = getq(q)) {
if (mp->b_datap->db_type == M_DATA) {
/*
* Do not send any more data if we have sent
* a T_ORDREL_REQ.
*/
if (flushdata || mir->mir_ordrel_pending == 1) {
freemsg(mp);
continue;
}
/*
* Make sure that the stream can really handle more
* data.
*/
if (!MIR_WCANPUTNEXT(mir, q)) {
(void) putbq(q, mp);
mutex_exit(&mir->mir_mutex);
return;
}
/*
* Now we pass the RPC message downstream.
*/
mutex_exit(&mir->mir_mutex);
putnext(q, mp);
mutex_enter(&mir->mir_mutex);
continue;
}
/*
* This is not an RPC message; pass it downstream
* (ignoring flow control) unless it is a T_ORDREL_REQ
* on a server-side stream.
*/
if (mir->mir_type != RPC_SERVER ||
((union T_primitives *)mp->b_rptr)->type !=
T_ORDREL_REQ) {
mutex_exit(&mir->mir_mutex);
putnext(q, mp);
mutex_enter(&mir->mir_mutex);
continue;
}
if (mir->mir_ordrel_pending == 1) {
/*
* Don't send two T_ORDRELs
*/
freemsg(mp);
continue;
}
/*
* Mark the structure so that we know we sent an orderly
* release request. We will check to see if the slot is idle
* at the end of this routine, and if so, reset the idle timer
* to handle orderly-release timeouts.
*/
mir->mir_ordrel_pending = 1;
RPCLOG(16, "mir_wsrv: sending ordrel req on q 0x%p\n",
(void *)q);
/*
* Send the orderly release downstream. If there are other
* pending replies we won't be able to send them. However,
* the only reason we should send the orderly release is if
* we were idle, or if an unusual event occurred.
*/
mutex_exit(&mir->mir_mutex);
putnext(q, mp);
mutex_enter(&mir->mir_mutex);
}
if (q->q_first == NULL)
/*
* If we call mir_svc_idle_start() below, then
* clearing mir_inwservice here will also result in
* any thread waiting in mir_close() to be signaled.
*/
mir->mir_inwservice = 0;
if (mir->mir_type != RPC_SERVER) {
mutex_exit(&mir->mir_mutex);
return;
}
/*
* If idle, call mir_svc_idle_start to start the timer (or wake
* up a close). Take care not to start the idle timer on the
* listener stream, since doing so could cause nfsd to send an
* orderly release on the listener stream.
*/
if (MIR_SVC_QUIESCED(mir) && !(mir->mir_listen_stream)) {
RPCLOG(16, "mir_wsrv: calling mir_svc_idle_start on 0x%p "
"because mir slot is idle\n", (void *)q);
mir_svc_idle_start(q, mir);
}
/*
* If outbound flow control has been relieved, then allow new
* inbound requests to be processed.
*/
if (mir->mir_hold_inbound) {
mir->mir_hold_inbound = 0;
qenable(RD(q));
}
mutex_exit(&mir->mir_mutex);
}
static void
mir_disconnect(queue_t *q, mir_t *mir)
{
ASSERT(MUTEX_HELD(&mir->mir_mutex));
switch (mir->mir_type) {
case RPC_CLIENT:
/*
* We are disconnecting, but not necessarily
* closing. By not closing, we will fail to
* pick up a possibly changed global timeout value,
* unless we store it now.
*/
mir->mir_idle_timeout = clnt_idle_timeout;
mir_clnt_idle_start(WR(q), mir);
mutex_exit(&mir->mir_mutex);
/*
* T_DISCON_REQ is passed to KRPC as an integer value
* (this is not a TPI message). It is used as a
* convenient value to indicate a sanity check
* failure -- the same KRPC routine is also called
* for T_DISCON_INDs and T_ORDREL_INDs.
*/
clnt_dispatch_notifyall(WR(q), T_DISCON_REQ, 0);
break;
case RPC_SERVER:
mir->mir_svc_no_more_msgs = 1;
mir_svc_idle_stop(WR(q), mir);
mutex_exit(&mir->mir_mutex);
RPCLOG(16, "mir_disconnect: telling "
"stream head listener to disconnect stream "
"(0x%p)\n", (void *) q);
(void) mir_svc_policy_notify(q, 2);
break;
default:
mutex_exit(&mir->mir_mutex);
break;
}
}
/*
* Do a sanity check on the length of the fragment.
* Returns 1 if bad, else 0.
*/
static int
mir_check_len(queue_t *q, int32_t frag_len, mblk_t *head_mp)
{
mir_t *mir;
mir = (mir_t *)q->q_ptr;
/*
* Do a sanity check on the message length. If this message is
* getting excessively large, shut down the connection.
*/
if ((frag_len <= 0) || (mir->mir_max_msg_sizep == NULL) ||
(frag_len <= *mir->mir_max_msg_sizep)) {
return (0);
}
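/*
* The fragment is too large: discard the partially assembled
* message and reset the reassembly state before disconnecting.
*/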
freemsg(head_mp);
mir->mir_head_mp = NULL;
mir->mir_frag_len = -(int32_t)sizeof (uint32_t);
if (mir->mir_type != RPC_SERVER || mir->mir_setup_complete) {
cmn_err(CE_NOTE,
"KRPC: record fragment from %s of size(%d) exceeds "
"maximum (%u). Disconnecting",
(mir->mir_type == RPC_CLIENT) ? "server" :
(mir->mir_type == RPC_SERVER) ? "client" :
"test tool",
frag_len, *mir->mir_max_msg_sizep);
}
mir_disconnect(q, mir);
return (1);
}