/* svc_rdma.c, revision 7c478bd95313f5f23a4c958a745db2134aa03244 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2004 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* Portions of this source code were derived from Berkeley
* 4.3 BSD under license from the Regents of the University of
* California.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* Server side of RPC over RDMA in the kernel.
*/
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/stream.h>
#include <sys/kstat.h>
#include <rpc/svc.h>
#include <rpc/rpc_rdma.h>
/*
* RDMA transport specific data associated with SVCMASTERXPRT
*/
struct rdma_data {
};
/*
* Plugin connection specific data stashed away in clone SVCXPRT
*/
struct clone_rdma_data {
};
#ifdef DEBUG
int rdma_svc_debug = 0;
#endif
/*
* Routines exported through ops vector.
*/
static bool_t	svc_rdma_krecv(SVCXPRT *, mblk_t *, struct rpc_msg *);
static bool_t	svc_rdma_ksend(SVCXPRT *, struct rpc_msg *);
static bool_t	svc_rdma_kgetargs(SVCXPRT *, xdrproc_t, caddr_t);
static bool_t	svc_rdma_kfreeargs(SVCXPRT *, xdrproc_t, caddr_t);
void		svc_rdma_kdestroy(SVCMASTERXPRT *);
static int	svc_rdma_kdup(struct svc_req *, caddr_t, int, struct dupreq **, bool_t *);
static void	svc_rdma_kdupdone(struct dupreq *, caddr_t, void (*)(), int, int);
static int32_t	*svc_rdma_kgetres(SVCXPRT *, int);
static void	svc_rdma_kfreeres(SVCXPRT *);
static void	svc_rdma_kclone_destroy(SVCXPRT *);
static void	svc_rdma_kstart(SVCMASTERXPRT *);
void		svc_rdma_kstop(SVCMASTERXPRT *);
/*
* Server transport operations vector.
*/
struct svc_ops rdma_svc_ops = {
svc_rdma_krecv, /* Get requests */
svc_rdma_kgetargs, /* Deserialize arguments */
svc_rdma_ksend, /* Send reply */
svc_rdma_kfreeargs, /* Free argument data space */
svc_rdma_kdestroy, /* Destroy transport handle */
svc_rdma_kdup, /* Check entry in dup req cache */
svc_rdma_kdupdone, /* Mark entry in dup req cache as done */
svc_rdma_kgetres, /* Get pointer to response buffer */
svc_rdma_kfreeres, /* Destroy pre-serialized response header */
svc_rdma_kclone_destroy, /* Destroy a clone xprt */
svc_rdma_kstart /* Tell `ready-to-receive' to rpcmod */
};
/*
* Server statistics
* NOTE: This structure type is duplicated in the NFS fast path.
*/
struct {
	kstat_named_t	rscalls;
	kstat_named_t	rsbadcalls;
	kstat_named_t	rsnullrecv;
	kstat_named_t	rsbadlen;
	kstat_named_t	rsxdrcall;
	kstat_named_t	rsdupchecks;
	kstat_named_t	rsdupreqs;
	kstat_named_t	rslongrpcs;
} rdmarsstat = {
{ "calls", KSTAT_DATA_UINT64 },
{ "badcalls", KSTAT_DATA_UINT64 },
{ "nullrecv", KSTAT_DATA_UINT64 },
{ "badlen", KSTAT_DATA_UINT64 },
{ "xdrcall", KSTAT_DATA_UINT64 },
{ "dupchecks", KSTAT_DATA_UINT64 },
{ "dupreqs", KSTAT_DATA_UINT64 },
{ "longrpcs", KSTAT_DATA_UINT64 }
};
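
/*
 * Counters in rdmarsstat are typically bumped through a small helper
 * macro of the following shape.  The macro name RSSTAT_INCR is an
 * assumption made here for illustration; e.g. RSSTAT_INCR(rscalls)
 * would be issued for every request received.
 */
#define	RSSTAT_INCR(x)	rdmarsstat.x.value.ui64++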
/*
* Create a transport record.
* The transport record, output buffer, and private data structure
* are allocated. The output buffer is serialized into using xdrmem.
* There is one transport record per user process which implements a
* set of services.
*/
/* ARGSUSED */
int
svc_rdma_kcreate(char *netid, SVC_CALLOUT_TABLE *sct, int id,
	rdma_xprt_group_t *started_xprts)
{
int error;
queue_t *q;
	/*
	 * modload the RDMA plugins if not already done.
	 */
	if (!rdma_modloaded) {
		error = rdma_modload();
		if (error)
			return (error);
	}
	/*
	 * master_xprt_count is the count of master transport handles
	 * that were successfully created and are ready to receive
	 * RDMA based access.
	 */
error = 0;
if (rdma_mod_head == NULL) {
started_xprts->rtg_count = 0;
if (rdma_dev_available)
return (EPROTONOSUPPORT);
else
return (ENODEV);
}
	/*
	 * If we have reached here, then at least one RDMA plugin has loaded.
	 * Create a master_xprt and make it start listening on the device;
	 * if an error is generated, record it, since we might need to shut
	 * the master_xprt down.
	 * SVC_START() calls svc_rdma_kstart which calls plugin binding
	 * routines.
	 */
/*
* One SVCMASTERXPRT per RDMA plugin.
*/
xprt->xp_threads = 0;
xprt->xp_detached_threads = 0;
KM_SLEEP);
}
(uint32_t)~0;
(ushort_t)~0;
/*
* Each of the plugins will have their own Service ID
* to listener specific mapping, like port number for VI
* and service name for IB.
*/
if (error) {
"failed");
goto cleanup;
}
goto cleanup;
}
	/*
	 * This is done only when at least one transport has been
	 * successfully created.  We insert the pointer to the created
	 * RDMA master xprt into a separately maintained list.  This way
	 * we can easily reference it later for cleanup, when the NFS
	 * kRPC service pool is going away/unregistered.
	 */
	started_xprts->rtg_count++;
continue;
if (error == RDMA_FAILED)
}
	/*
	 * Don't return an error if even a single plugin was started
	 * successfully.
	 */
if (started_xprts->rtg_count == 0)
return (error);
return (0);
}
/*
* Cleanup routine for freeing up memory allocated by
* svc_rdma_kcreate()
*/
void
svc_rdma_kdestroy(SVCMASTERXPRT *xprt)
{
}
static void
svc_rdma_kstart(SVCMASTERXPRT *xprt)
{
struct rdma_svc_data *svcdata;
/*
* Create a listener for module at this port
*/
}
void
svc_rdma_kstop(SVCMASTERXPRT *xprt)
{
struct rdma_svc_data *svcdata;
/*
* Call the stop listener routine for each plugin.
*/
" listener");
}
/* ARGSUSED */
static void
svc_rdma_kclone_destroy(SVCXPRT *clone_xprt)
{
}
static bool_t
svc_rdma_krecv(SVCXPRT *clone_xprt, mblk_t *mp, struct rpc_msg *msg)
{
	struct clone_rdma_data *vd;
/*
* Post a receive descriptor on this
* endpoint to ensure all packets are received.
*/
if (status != RDMA_SUCCESS) {
"svc_rdma_krecv: rdma_svc_postrecv failed %d", status);
}
return (FALSE);
}
/*
* Decode rpc message
*/
/*
* Get the XID
*/
/*
* Treat xid as opaque (xid is the first entity
* in the rpc rdma message).
*/
/* Skip xid and set the xdr position accordingly. */
return (FALSE);
}
/*
* Should not get RDMA_DONE
*/
return (FALSE); /* no response */
}
#ifdef DEBUG
if (rdma_svc_debug)
#endif
/*
* Now decode the chunk list
*/
}
/*
* A chunk at 0 offset indicates that the RPC call message
* is in a chunk. Get the RPC call message chunk.
*/
/* Remove RPC call message chunk from chunklist */
/* Allocate and register memory for the RPC call msg chunk */
"svc_rdma_krecv: no memory for rpc call");
clist_free(cl);
return (FALSE);
}
if (status) {
"svc_rdma_krecv: clist_register failed");
clist_free(cl);
return (FALSE);
}
/*
* Now read the RPC call message in
*/
if (status) {
"svc_rdma_krecv: rdma_read failed %d", status);
clist_free(cl);
return (FALSE);
}
/*
* Sync memory for CPU after DMA
*/
/*
* Deregister the chunk
*/
/*
* Setup the XDR for the RPC call message
*/
/*
* Free the chunk element with the Long RPC details and
* the message received.
*/
} else {
/*
* Now the RPC call message header
*/
}
clist_free(cl);
return (FALSE);
}
/*
* Point the remote transport address in the service_transport
* handle at the address in the request.
*/
#ifdef DEBUG
if (rdma_svc_debug) {
struct sockaddr_in *sin4;
char print_addr[INET_ADDRSTRLEN];
"svc_rdma_krecv: remote clnt_addr: %s", print_addr);
}
#endif
return (TRUE);
}
/*
* Send rpc reply.
*/
static bool_t
svc_rdma_ksend(SVCXPRT *clone_xprt, struct rpc_msg *msg)
{
	struct clone_rdma_data *vd;
int status;
int msglen;
/*
* If there is a result procedure specified in the reply message,
* it will be processed in the xdr_replymsg and SVCAUTH_WRAP.
* We need to make sure it won't be processed twice, so we null
* it for xdr_replymsg here.
*/
}
}
	/*
	 * Get the size of the rpc reply message.  We need this
	 * to determine if the rpc reply message will fit in
	 * the pre-allocated RDMA buffers.  If the rpc reply
	 * message length is greater than the pre-allocated
	 * buffers then a one-time-use buffer is allocated
	 * and registered for this rpc reply.
	 */
if (msglen > RPC_MSG_SZ) {
/*
* Allocate chunk buffer for rpc reply
*/
op = RDMA_NOMSG;
} else {
/*
* Get a pre-allocated buffer for rpc reply
*/
"svc_rdma_ksend: no free buffers!");
return (retval);
}
}
/*
* Initialize the XDR encode stream.
*/
xdr_results, xdr_location)))) {
if (cle)
"svc_rdma_ksend: xdr_replymsg/SVCAUTH_WRAP "
"failed");
goto out;
}
}
	/*
	 * For RPCSEC_GSS, since we cannot accurately presize the
	 * buffer required for encoding, we assume that it is going
	 * to be a Long RPC to start with.  We also create the
	 * XDR stream with min_chunk set to 0, which instructs the
	 * XDR layer to not chunk the incoming byte stream.
	 */
	/*
	 * Long RPC.  Allocate a one-time-use custom buffer.
	 */
XDR_ENCODE, NULL);
op = RDMA_NOMSG;
/*
* Initialize the XDR encode stream.
*/
xdr_results, xdr_location)))) {
}
if (cle)
"svc_rdma_ksend: xdr_replymsg/SVCAUTH_WRAP "
"failed");
goto out;
}
/*
* If we had to allocate a new buffer while encoding
* then update the addr and len.
*/
}
	/*
	 * If it turns out that the encoded message is after all
	 * not long enough to be a Long RPC, then allocate a
	 * SEND_BUFFER and copy the encoded message into it.
	 */
if (len > RPC_MSG_SZ) {
} else {
/*
* Get a pre-allocated buffer for rpc reply
*/
"svc_rdma_ksend: no free buffers!");
return (retval);
}
XDR_ENCODE, NULL);
}
}
if (msglen > RPC_MSG_SZ) {
/*
* Allocate chunk buffer for rpc reply
*/
op = RDMA_NOMSG;
} else {
/*
* Get a pre-allocated buffer for rpc reply
*/
"svc_rdma_ksend: no free buffers!");
return (retval);
}
}
/*
* Initialize the XDR encode stream.
*/
if (cle)
"svc_rdma_ksend: xdr_replymsg/SVCAUTH_WRAP "
"failed");
goto out;
}
}
/*
* Get clist and a buffer for sending it across
*/
goto out;
}
/*
* Now register the chunks in the list
*/
if (status != RDMA_SUCCESS) {
"svc_rdma_ksend: clist register failed");
goto out;
}
}
/*
* XDR the XID, vers, and op
*/
/*
* Treat xid as opaque (xid is the first entity
* in the rpc rdma message).
*/
vers = RPCRDMA_VERS;
/* Skip xid and set the xdr position accordingly. */
goto out;
}
/*
* Now XDR the chunk list
*/
} else {
}
/*
* Send the reply message to the client
*/
if (status != RDMA_SUCCESS) {
goto out;
}
#ifdef DEBUG
if (rdma_svc_debug)
printf("svc_rdma_ksend: chunk response len %d xid %u\n",
#endif
/*
* Post a receive buffer because we expect a RDMA_DONE
* message.
*/
/*
* Send the RPC reply message and wait for RDMA_DONE
*/
if (status != RDMA_SUCCESS) {
#ifdef DEBUG
if (rdma_svc_debug)
"rdma_send_resp failed %d", status);
#endif
goto out;
}
#ifdef DEBUG
if (rdma_svc_debug)
#endif
} else {
#ifdef DEBUG
if (rdma_svc_debug)
#endif
if (status != RDMA_SUCCESS) {
#ifdef DEBUG
if (rdma_svc_debug)
"rdma_send failed %d", status);
#endif
goto out;
}
}
out:
/*
* Deregister the chunks
*/
if (reg)
if (op == RDMA_NOMSG) {
/*
* Long RPC reply in chunk. Free it up.
*/
}
clist_free(cl);
}
/*
* Free up sendlist chunks
*/
/*
* Destroy private data for xdr rdma
*/
/*
* This is completely disgusting. If public is set it is
* a pointer to a structure whose first field is the address
* of the function to free that structure and any related
* stuff. (see rrokfree in nfs_xdr.c).
*/
/* LINTED pointer alignment */
}
return (retval);
}
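
/*
 * Illustrative sketch, not part of the original file: the "self-freeing"
 * convention mentioned near the end of svc_rdma_ksend() above, where
 * x_public points at a structure whose first member is the routine that
 * frees it (see rrokfree in nfs_xdr.c).  The type and function names
 * below are assumptions for illustration only.
 */
#ifdef notdef
struct self_freeing {
	void	(*free_fn)(void *);	/* first field: how to free this */
	/* ... producer-specific data follows ... */
};

static void
free_public(caddr_t public)
{
	if (public != NULL)
		((struct self_freeing *)(void *)public)->free_fn(public);
}
#endif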
/*
* Deserialize arguments.
*/
static bool_t
svc_rdma_kgetargs(SVCXPRT *clone_xprt, xdrproc_t xdr_args, caddr_t args_ptr)
{
return (FALSE);
return (TRUE);
}
static bool_t
svc_rdma_kfreeargs(SVCXPRT *clone_xprt, xdrproc_t xdr_args, caddr_t args_ptr)
{
	struct clone_rdma_data *vd;
if (args_ptr) {
clist_free(cl);
}
return (retval);
}
/* ARGSUSED */
static int32_t *
svc_rdma_kgetres(SVCXPRT *clone_xprt, int size)
{
return (NULL);
}
/* ARGSUSED */
static void
svc_rdma_kfreeres(SVCXPRT *clone_xprt)
{
}
/*
 * The dup caching routines below provide a cache of non-failure
 * transaction ids.  RPC service routines can use this to detect
 * retransmissions and re-send a non-failure response.
 */
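
/*
 * Illustrative sketch, not part of the original file: how a service
 * routine would typically consult this cache.  The helper name
 * example_service and the direct calls to svc_rdma_kdup() and
 * svc_rdma_kdupdone() (rather than going through the transport ops
 * wrappers) are assumptions made purely for illustration.
 */
#ifdef notdef
static void
example_service(struct svc_req *req, caddr_t res, int ressz)
{
	struct dupreq *dr;
	bool_t cached;

	switch (svc_rdma_kdup(req, res, ressz, &dr, &cached)) {
	case DUP_NEW:
		/* First time this xid is seen: perform the operation, */
		/* fill in res, then mark the cache entry done. */
		svc_rdma_kdupdone(dr, res, NULL, ressz, DUP_DONE);
		break;
	case DUP_DONE:
		/* Retransmission of a completed request: res already */
		/* holds the cached reply, so just re-send it. */
		break;
	case DUP_INPROGRESS:
		/* Another thread is servicing this xid: drop it. */
		break;
	default:
		/* DUP_ERROR: cache unavailable, service the request anyway. */
		break;
	}
}
#endif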
/*
* MAXDUPREQS is the number of cached items. It should be adjusted
* to the service load so that there is likely to be a response entry
* when the first retransmission comes in.
*/
#define MAXDUPREQS 1024
/*
* This should be appropriately scaled to MAXDUPREQS.
*/
#define	DRHASHSZ	257
#if ((DRHASHSZ & (DRHASHSZ - 1)) == 0)
#define	XIDHASH(xid)	((xid) & (DRHASHSZ - 1))
#else
#define	XIDHASH(xid)	((xid) % DRHASHSZ)
#endif
static int rdmandupreqs = 0;
static int rdmamaxdupreqs = MAXDUPREQS;
static kmutex_t rdmadupreq_lock;
static int rdmadrhashstat[DRHASHSZ];
/*
 * rdmadrmru points to the head of a circular linked list in lru order.
 * rdmadrmru->dr_next == drlru
 */
static struct dupreq *rdmadrmru;
/*
* svc_rdma_kdup searches the request cache and returns 0 if the
* request is not found in the cache. If it is found, then it
* returns the state of the request (in progress or done) and
* the status or attributes that were part of the original reply.
*/
static int
svc_rdma_kdup(struct svc_req *req, caddr_t res, int size, struct dupreq **drpp,
	bool_t *dupcachedp)
{
int status;
/*
* Check to see whether an entry already exists in the cache.
*/
if (dupcachedp != NULL)
} else {
}
return (status);
}
}
/*
* There wasn't an entry, either allocate a new one or recycle
* an old one.
*/
if (rdmandupreqs < rdmamaxdupreqs) {
return (DUP_ERROR);
}
if (rdmadrmru) {
} else {
}
rdmandupreqs++;
} else {
return (DUP_ERROR);
}
}
if (dr->dr_resfree) {
}
}
return (DUP_ERROR);
}
}
return (DUP_ERROR);
}
}
rdmadrhashstat[drhash]++;
return (DUP_NEW);
}
/*
* svc_rdma_kdupdone marks the request done (DUP_DONE or DUP_DROP)
* and stores the response.
*/
static void
svc_rdma_kdupdone(struct dupreq *dr, caddr_t res, void (*dis_resfree)(),
	int size, int status)
{
}
}
/*
* This routine expects that the mutex, rdmadupreq_lock, is already held.
*/
static void
unhash(struct dupreq *dr)
{
rdmadrhashstat[drhash]--;
} else {
}
return;
}
}
}
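
/*
 * Illustrative sketch, not part of the original file: as noted above,
 * unhash() expects rdmadupreq_lock to be held, so a caller would
 * bracket it like this ("dr" being the entry to remove).
 */
#ifdef notdef
	mutex_enter(&rdmadupreq_lock);
	unhash(dr);
	mutex_exit(&rdmadupreq_lock);
#endif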