svc_rdma.c revision 0a701b1ec2b55bddc48b62124df936152ff820f7
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* Portions of this source code were derived from Berkeley
* 4.3 BSD under license from the Regents of the University of
* California.
*/
/*
* Server side of RPC over RDMA in the kernel.
*/
#include <sys/sysmacros.h>
#include <rpc/rpc_rdma.h>
#define SVC_RDMA_SUCCESS 0
#define SVC_RDMA_FAIL -1
#define SVC_CREDIT_FACTOR (0.5)
#define MSG_IS_RPCSEC_GSS(msg) \
/*
* RDMA transport specific data associated with SVCMASTERXPRT
*/
struct rdma_data {
};
/*
* Plugin connection specific data stashed away in clone SVCXPRT
*/
struct clone_rdma_data {
};
/*
* Routines exported through ops vector.
*/
void svc_rdma_kdestroy(SVCMASTERXPRT *);
void (*)(), int, int);
static void svc_rdma_kfreeres(SVCXPRT *);
static void svc_rdma_kclone_destroy(SVCXPRT *);
static void svc_rdma_kstart(SVCMASTERXPRT *);
void svc_rdma_kstop(SVCMASTERXPRT *);
int *, int *, unsigned int *);
/*
* Server transport operations vector.
*/
struct svc_ops rdma_svc_ops = {
/*
 * NOTE(review): these are positional initializers, so the entry order
 * must match the member order of struct svc_ops (declared elsewhere in
 * the kRPC headers) -- confirm against that definition before reordering.
 */
svc_rdma_krecv, /* Get requests */
svc_rdma_kgetargs, /* Deserialize arguments */
svc_rdma_ksend, /* Send reply */
svc_rdma_kfreeargs, /* Free argument data space */
svc_rdma_kdestroy, /* Destroy transport handle */
svc_rdma_kdup, /* Check entry in dup req cache */
svc_rdma_kdupdone, /* Mark entry in dup req cache as done */
svc_rdma_kgetres, /* Get pointer to response buffer */
svc_rdma_kfreeres, /* Destroy pre-serialized response header */
svc_rdma_kclone_destroy, /* Destroy a clone xprt */
svc_rdma_kstart /* Tell `ready-to-receive' to rpcmod */
};
/*
* Server statistics
* NOTE: This structure type is duplicated in the NFS fast path.
*/
struct {
} rdmarsstat = {
{ "calls", KSTAT_DATA_UINT64 },
{ "badcalls", KSTAT_DATA_UINT64 },
{ "nullrecv", KSTAT_DATA_UINT64 },
{ "badlen", KSTAT_DATA_UINT64 },
{ "xdrcall", KSTAT_DATA_UINT64 },
{ "dupchecks", KSTAT_DATA_UINT64 },
{ "dupreqs", KSTAT_DATA_UINT64 },
{ "longrpcs", KSTAT_DATA_UINT64 },
{ "totalreplies", KSTAT_DATA_UINT64 },
{ "totallongreplies", KSTAT_DATA_UINT64 },
{ "totalinlinereplies", KSTAT_DATA_UINT64 },
};
/*
* Create a transport record.
* The transport record, output buffer, and private data structure
* are allocated. The output buffer is serialized into using xdrmem.
* There is one transport record per user process which implements a
* set of services.
*/
/* ARGSUSED */
int
{
int error;
queue_t *q;
/*
* modload the RDMA plugins if not already done.
*/
if (!rdma_modloaded) {
/*CONSTANTCONDITION*/
if (!rdma_modloaded) {
error = rdma_modload();
}
if (error)
return (error);
}
/*
* master_xprt_count is the count of master transport handles
* that were successfully created and are ready to receive for
* RDMA based access.
*/
error = 0;
if (rdma_mod_head == NULL) {
started_xprts->rtg_count = 0;
if (rdma_dev_available)
return (EPROTONOSUPPORT);
else
return (ENODEV);
}
/*
* If we have reached here, then at least one RDMA plugin has loaded.
* Create a master_xprt, make it start listening on the device;
* if an error is generated, record it, we might need to shut
* the master_xprt.
* SVC_START() calls svc_rdma_kstart which calls plugin binding
* routines.
*/
/*
* One SVCMASTERXPRT per RDMA plugin.
*/
xprt->xp_threads = 0;
xprt->xp_detached_threads = 0;
KM_SLEEP);
}
(uint32_t)~0;
(ushort_t)~0;
/*
* Each of the plugins will have their own Service ID
* to listener specific mapping, like port number for VI
* and service name for IB.
*/
if (error) {
goto cleanup;
}
goto cleanup;
}
/*
* This is set only when at least one transport has been
* successfully created. We insert the pointer
* to the created RDMA master xprt into a separately maintained
* list. This way we can easily reference it later to cleanup,
* when NFS kRPC service pool is going away/unregistered.
*/
started_xprts->rtg_count ++;
continue;
if (error == RDMA_FAILED)
}
/*
* Don't return any error even if a single plugin was started
* successfully.
*/
if (started_xprts->rtg_count == 0)
return (error);
return (0);
}
/*
* Cleanup routine for freeing up memory allocated by
* svc_rdma_kcreate()
*/
void
{
}
static void
{
struct rdma_svc_data *svcdata;
/*
* Create a listener for module at this port
*/
}
void
{
struct rdma_svc_data *svcdata;
/*
* Call the stop listener routine for each plugin.
*/
}
/* ARGSUSED */
static void
{
}
static bool_t
{
struct clone_rdma_data *crdp;
uint32_t wcl_total_length = 0;
if (status != RDMA_SUCCESS) {
goto badrpc_call;
}
goto xdr_err;
}
/* Checking if the status of the recv operation was normal */
goto badrpc_call;
}
goto xdr_err;
}
if (cl)
clist_free(cl);
goto xdr_err;
}
/*
* A chunk at 0 offset indicates that the RPC call message
* is in a chunk. Get the RPC call message chunk.
*/
/* Remove RPC call message chunk from chunklist */
/* Allocate and register memory for the RPC call msg chunk */
goto cll_malloc_err;
}
goto cll_malloc_err;
}
if (status) {
goto cll_malloc_err;
}
/*
* Now read the RPC call message in
*/
if (status) {
goto cll_malloc_err;
}
} else {
/* Use xdrrdmablk_ops to indicate there is a read chunk list */
}
}
}
goto callmsg_err;
}
/*
* Point the remote transport address in the service_transport
* handle at the address in the request.
*/
return (TRUE);
if (cl)
clist_free(cl);
return (FALSE);
}
static int
{
int status;
int count = 0;
int alloc_len;
char *memp;
rdma_buf_t long_rpc = {0};
struct clone_rdma_data *crdp;
/* Choose a size for the long rpc response */
if (MSG_IS_RPCSEC_GSS(msg)) {
} else {
}
} else {
} else {
}
}
}
return (SVC_RDMA_FAIL);
}
xdr_results, xdr_location)))) {
return (SVC_RDMA_FAIL);
}
*numchunks = 0;
*freelen = 0;
break;
}
*numchunks += 1;
if (count == 0)
break;
}
/*
* MUST fail if there are still more data
*/
if (count > 0) {
return (SVC_RDMA_FAIL);
}
return (SVC_RDMA_FAIL);
}
if (status) {
return (SVC_RDMA_FAIL);
}
if (status != RDMA_SUCCESS) {
return (SVC_RDMA_FAIL);
}
return (SVC_RDMA_SUCCESS);
}
static int
{
/*
* Get a pre-allocated buffer for rpc reply
*/
return (SVC_RDMA_FAIL);
}
if (has_args) {
(!has_args ||
xdr_results, xdr_location)))) {
return (SVC_RDMA_FAIL);
}
} else {
return (SVC_RDMA_FAIL);
}
}
return (SVC_RDMA_SUCCESS);
}
/*
* Send rpc reply.
*/
static bool_t
{
struct clone_rdma_data *crdp;
uint32_t rdma_credit = 0;
int freelen = 0;
/*
* If there is a result procedure specified in the reply message,
* it will be processed in the xdr_replymsg and SVCAUTH_WRAP.
* We need to make sure it won't be processed twice, so we null
* it for xdr_replymsg here.
*/
}
}
/*
* Given the limit on the inline response size (RPC_MSG_SZ),
* there is a need to make a guess as to the overall size of
* the response. If the resultant size is beyond the inline
* size, then the server needs to use the "reply chunk list"
* provided by the client (if the client provided one). An
* example of this type of response would be a READDIR
* response (e.g. a small directory read would fit in RPC_MSG_SZ
* and that is the preference but it may not fit)
*
* Combine the encoded size and the size of the true results
* and then make the decision about where to encode and send results.
*
* One important note, this calculation is ignoring the size
* of the encoding of the authentication overhead. The reason
* for this is rooted in the complexities of access to the
* encoded size of RPCSEC_GSS related authentication,
* integrity, and privacy.
*
* If it turns out that the encoded authentication bumps the
* response over the RPC_MSG_SZ limit, then it may need to
* attempt to encode for the reply chunk list.
*/
/*
* Calculating the "sizeof" the RPC response header and the
* encoded results.
*/
if (msglen > 0) {
}
if (has_args)
if (msglen < RPC_MSG_SZ) {
/*
* Looks like the response will fit in the inline
* response; let's try
*/
int, status);
int, final_resp_len);
}
}
/*
* If the encode failed (size?) or the message really is
* larger than what is allowed, try the response chunk list.
*/
/*
* attempting to use a reply chunk list when there
* isn't one won't get very far...
*/
goto out;
}
int, final_resp_len);
if (status != SVC_RDMA_SUCCESS) {
goto out;
}
}
int, final_resp_len);
goto out;
}
vers = RPCRDMA_VERS;
/* Skip xid and set the xdr position accordingly. */
goto out;
}
/*
* Now XDR the read chunk list, actually always NULL
*/
/*
* encode write list -- we already drove RDMA_WRITEs
*/
goto out;
}
/*
* XDR encode the RDMA_REPLY write chunk
*/
goto out;
}
if (rdma_response_op == RDMA_MSG) {
}
if (status == RDMA_SUCCESS) {
}
out:
/*
* Free up sendlist chunks
*/
/*
* Destroy private data for xdr rdma
*/
}
}
/*
* This is completely disgusting. If public is set it is
* a pointer to a structure whose first field is the address
* of the function to free that structure and any related
* stuff. (see rrokfree in nfs_xdr.c).
*/
/* LINTED pointer alignment */
}
}
return (retval);
}
/*
* Deserialize arguments.
*/
static bool_t
{
return (FALSE);
return (TRUE);
}
static bool_t
{
struct clone_rdma_data *crdp;
/*
* Free the args if needed then XDR_DESTROY
*/
if (args_ptr) {
}
}
return (retval);
}
/* ARGSUSED */
static int32_t *
{
return (NULL);
}
/* ARGSUSED */
static void
{
}
/*
* the dup caching routines below provide a cache of non-failure
* transaction IDs. RPC service routines can use this to detect
* retransmissions and re-send a non-failure response.
*/
/*
* MAXDUPREQS is the number of cached items. It should be adjusted
* to the service load so that there is likely to be a response entry
* when the first retransmission comes in.
*/
#define MAXDUPREQS 1024
/*
* This should be appropriately scaled to MAXDUPREQS.
*/
#define DRHASHSZ 257
#else
#endif
static int rdmandupreqs = 0;
static int rdmamaxdupreqs = MAXDUPREQS;
static kmutex_t rdmadupreq_lock;
static int rdmadrhashstat[DRHASHSZ];
/*
* rdmadrmru points to the head of a circular linked list in lru order.
* rdmadrmru->dr_next == drlru
*/
/*
* svc_rdma_kdup searches the request cache and returns 0 if the
* request is not found in the cache. If it is found, then it
* returns the state of the request (in progress or done) and
* the status or attributes that were part of the original reply.
*/
static int
{
int status;
/*
* Check to see whether an entry already exists in the cache.
*/
if (dupcachedp != NULL)
} else {
}
return (status);
}
}
/*
* There wasn't an entry, either allocate a new one or recycle
* an old one.
*/
if (rdmandupreqs < rdmamaxdupreqs) {
return (DUP_ERROR);
}
if (rdmadrmru) {
} else {
}
rdmandupreqs++;
} else {
return (DUP_ERROR);
}
}
if (dr->dr_resfree) {
}
}
return (DUP_ERROR);
}
}
return (DUP_ERROR);
}
}
rdmadrhashstat[drhash]++;
return (DUP_NEW);
}
/*
* svc_rdma_kdupdone marks the request done (DUP_DONE or DUP_DROP)
* and stores the response.
*/
static void
{
}
}
/*
* This routine expects that the mutex, rdmadupreq_lock, is already held.
*/
static void
{
rdmadrhashstat[drhash]--;
} else {
}
return;
}
}
}
{
return (FALSE);
}
tlen = 0;
while (clist) {
}
/*
* set iov to addr+len of first segment of first wchunk of
* wlist sent by client. krecv() already malloc'd a buffer
* large enough, but registration is deferred until we write
* the buffer back to (NFS) client using RDMA_WRITE.
*/
return (TRUE);
}