xdr_rdma.c revision ed629aef897f4494e9359e52811ca81d4b278489
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 2007, The Ohio State University. All rights reserved.
*
* Portions of this source code is developed by the team members of
* The Ohio State University's Network-Based Computing Laboratory (NBCL),
* headed by Professor Dhabaleswar K. (DK) Panda.
*
* Acknowledgements to contributions from developers:
* Ranjit Noronha: noronha@cse.ohio-state.edu
* Lei Chai : chail@cse.ohio-state.edu
* Weikuan Yu : yuw@cse.ohio-state.edu
*
*/
/*
* xdr_rdma.c, XDR implementation using RDMA to move large chunks
*/
#include <rpc/rpc_sztypes.h>
#include <rpc/rpc_rdma.h>
#include <sys/sysmacros.h>
void xdrrdma_destroy(XDR *);
struct xdr_ops xdrrdmablk_ops = {
};
struct xdr_ops xdrrdma_ops = {
};
/*
* A chunk list entry identifies a chunk of opaque data to be moved
* separately from the rest of the RPC message. xp_min_chunk = 0 is a
* special case for ENCODING; it means do not chunk the incoming stream of
* data.
*
* A read chunk can contain part of the RPC message in addition to the
* inline message. In such a case, (xp_offp - x_base) will not provide
* the correct xdr offset of the entire message. xp_off is used in such
* a case to denote the offset or current position in the overall message
* covering both the inline and the chunk. This is used only in the case
* of decoding and useful to compare read chunk 'c_xdroff' offsets.
*
* An example for a read chunk containing an XDR message:
* An NFSv4 compound as follows:
*
* PUTFH
* WRITE [4109 bytes]
* GETATTR
*
* Solaris Encoding is:
* -------------------
*
* <Inline message>: [PUTFH WRITE4args GETATTR]
* |
* v
* [RDMA_READ chunks]: [write data]
*
*
* Linux encoding is:
* -----------------
*
* <Inline message>: [PUTFH WRITE4args]
* |
* v
* [RDMA_READ chunks]: [Write data] [Write data2] [Getattr chunk]
* chunk1 chunk2 chunk3
*
* where the READ chunks are as:
*
* - chunk1 - 4k
* write data |
* - chunk2 - 13 bytes(4109 - 4k)
* getattr op - chunk3 - 19 bytes
* (getattr op starts at byte 4 after 3 bytes of roundup)
*
*/
typedef struct {
int xp_min_chunk;
int xp_buf_size; /* size of xdr buffer */
int xp_off; /* overall offset */
extern kmem_cache_t *clist_cache;
{
uint32_t cur_offset = 0;
uint32_t total_segments = 0;
uint32_t actual_segments = 0;
/*
* first deal with the length since xdr bytes are counted
*/
return (FALSE);
}
return (FALSE);
}
/*
* if no data we are done
*/
if (total_len == 0)
return (TRUE);
while (cle) {
}
/*
* If there was a chunk at the current offset, then setup a read
* chunk list which records the destination address and length
* and will RDMA READ the data in later.
*/
return (FALSE);
return (FALSE);
/*
* Setup the chunk list with appropriate
* address (offset) and length
*/
for (actual_segments = 0;
if (total_len <= 0)
break;
/*
* not the first time in the loop
*/
if (actual_segments > 0)
alen = 0;
}
if (!alen)
total_len > 0) {
}
total_len == 0) {
int, total_segments, int, actual_segments);
}
rdclist = clist_alloc();
else {
}
}
out:
/*
* Adjust the chunk length, if we read only a part of
* a chunk.
*/
if (alen) {
}
return (retval);
}
/*
* The procedure xdrrdma_create initializes a stream descriptor for a memory
* buffer.
*/
void
{
KM_SLEEP);
xdrp->xp_reply_chunk_len = 0;
xdrp->xp_reply_chunk_len_alt = 0;
/* Find last element in chunk list and set xp_rcl_next */
continue;
} else {
}
if (xdrp->xp_min_chunk != 0)
}
/* ARGSUSED */
void
{
return;
}
}
}
}
if (xdrp->xp_rcl_xdr)
}
static bool_t
{
int chunked = 0;
/*
* check if rest of the rpc message is in a chunk
*/
return (FALSE);
}
chunked = 1;
}
/* LINTED pointer alignment */
if (chunked)
}
return (TRUE);
}
static bool_t
{
return (FALSE);
/* LINTED pointer alignment */
return (TRUE);
}
/*
* DECODE bytes from XDR stream for rdma.
* If the XDR stream contains a read chunk list,
* it will go through xdrrdma_getrdmablk instead.
*/
static bool_t
{
uint32_t cur_offset = 0;
uint32_t total_segments = 0;
uint32_t actual_segments = 0;
while (cle) {
}
} else {
}
/*
* If there was a chunk at the current offset, then setup a read
* chunk list which records the destination address and length
* and will RDMA READ the data in later.
*/
for (actual_segments = 0;
if (total_len <= 0)
break;
if (status != RDMA_SUCCESS)
goto out;
alen = 0;
}
if (!alen)
total_len > 0) {
}
total_len == 0) {
int, total_segments, int, actual_segments);
}
/*
* RDMA READ the chunk data from the remote end.
* First prep the destination buffer by registering
* it, then RDMA READ the chunk data. Since we are
* doing streaming memory, sync the destination
* buffer to CPU and deregister the buffer.
*/
return (FALSE);
}
if (status != RDMA_SUCCESS) {
/*
* Deregister the previous chunks
* before return
*/
goto out;
}
/*
* Now read the chunk in
*/
total_len == 0) {
} else {
}
if (status != RDMA_SUCCESS) {
int, status);
}
}
/*
* sync the memory for cpu
*/
if (clist_syncmem(
}
out:
/*
* Deregister the chunks
*/
while (actual_segments != 0) {
}
if (alen) {
}
return (retval);
}
return (FALSE);
return (TRUE);
}
/*
* ENCODE some bytes into an XDR stream. xp_min_chunk = 0 means the stream of
* bytes contains no chunks to separate out, and if the bytes do not fit in
* the supplied buffer, grow the buffer and free the old buffer.
*/
static bool_t
{
/*
* Is this stream accepting chunks?
* If so, does either of the following two conditions exist?
* - length of bytes to encode is greater than the min chunk size?
* - remaining space in this stream is shorter than length of
* bytes to encode?
*
* If the above exists, then create a chunk for this encoding
* and save the addresses, etc.
*/
((xdrp->xp_min_chunk != 0 &&
cle = clist_alloc();
return (TRUE);
}
/* Is there enough space to encode what is left? */
return (FALSE);
}
return (TRUE);
}
{
}
{
return (FALSE);
return (TRUE);
}
/* ARGSUSED */
static rpc_inline_t *
{
/*
* Since chunks aren't in-line, check to see whether there is
* a chunk in the inline range.
*/
return (NULL);
}
/* LINTED pointer alignment */
return (NULL);
return (NULL);
} else {
return (buf);
}
}
static bool_t
{
int len, i;
switch (request) {
case XDR_PEEK:
/*
* Return the next 4 byte unit in the XDR stream.
*/
return (FALSE);
return (TRUE);
case XDR_SKIPBYTES:
/*
* Skip the next N bytes in the XDR stream.
*/
return (FALSE);
return (TRUE);
case XDR_RDMA_SET_FLAGS:
/*
* Set the flags provided in the *info in xp_flags for rdma
* xdr stream control.
*/
return (TRUE);
case XDR_RDMA_GET_FLAGS:
/*
* Get the flags in xp_flags and return them through *info
*/
return (TRUE);
case XDR_RDMA_GET_CHUNK_LEN:
return (TRUE);
case XDR_RDMA_ADD_CHUNK:
/*
* Store wlist information
*/
case RCI_WRITE_UIO_CHUNK:
return (TRUE);
}
for (i = 0; i < uiop->uio_iovcnt; i++) {
rwl = clist_alloc();
/*
* if userspace address, put adspace ptr in
* clist. If not, then do nothing since it's
* already set to NULL (from kmem_zalloc)
*/
}
else {
}
}
break;
case RCI_WRITE_ADDR_CHUNK:
rwl = clist_alloc();
break;
case RCI_REPLY_CHUNK:
break;
}
return (TRUE);
case XDR_RDMA_GET_WLIST:
return (TRUE);
case XDR_RDMA_SET_WLIST:
return (TRUE);
case XDR_RDMA_GET_RLIST:
return (TRUE);
case XDR_RDMA_GET_WCINFO:
return (TRUE);
default:
return (FALSE);
}
}
/*
* Not all fields in struct clist are interesting to the RPC over RDMA
* protocol. Only XDR the interesting fields.
*/
{
return (FALSE);
return (FALSE);
return (FALSE);
return (FALSE);
return (FALSE);
return (TRUE);
}
/*
* The following two functions are forms of xdr_pointer()
* and xdr_reference(). Since the generic versions just
* kmem_alloc() a new clist, we actually want to use the
* rdma_clist kmem_cache.
*/
/*
* Generate or free a clist structure from the
* kmem_cache "rdma_clist"
*/
{
case XDR_FREE:
return (TRUE);
case XDR_DECODE:
break;
case XDR_ENCODE:
break;
}
}
}
return (stat);
}
/*
* XDR a pointer to a possibly recursive clist. This differs
* from xdr_reference in that it can serialize/deserialize
* trees correctly.
*
* What is sent is actually a union:
*
* union object_pointer switch (boolean b) {
* case TRUE: object_data data;
* case FALSE: void nothing;
* }
*
* > objpp: Pointer to the pointer to the object.
*
*/
{
return (FALSE);
if (!more_data) {
return (TRUE);
}
}
{
}
/* ARGSUSED */
{
}
{
int i;
uint_t num_segment = 0;
/* does a wlist exist? */
if (w == NULL) {
}
/* Encode N consecutive segments, 1, N, HLOO, ..., HLOO, 0 */
return (FALSE);
num_segment++;
}
return (FALSE);
for (i = 0; i < num_segment; i++) {
return (FALSE);
return (FALSE);
return (FALSE);
w = w->c_next;
}
return (FALSE);
return (TRUE);
}
/*
* Conditionally decode a RDMA WRITE chunk list from XDR stream.
*
* If the next boolean in the XDR stream is false there is no
* RDMA WRITE chunk list present. Otherwise iterate over the
* array and for each entry: allocate a struct clist and decode.
* Pass back an indication via wlist_exists if we have seen a
* RDMA WRITE chunk list.
*/
{
uint32_t i;
return (FALSE);
/* is there a wlist? */
*wlist_exists = FALSE;
return (TRUE);
}
*wlist_exists = TRUE;
return (FALSE);
tmp = *w = clist_alloc();
for (i = 0; i < seg_array_len; i++) {
return (FALSE);
return (FALSE);
return (FALSE);
if (i < seg_array_len - 1) {
} else {
}
}
return (FALSE);
return (TRUE);
}
/*
* Server side RDMA WRITE list decode.
* XDR context is memory ops
*/
{
char *memp;
uint32_t wcl_length = 0;
uint32_t i;
*total_length = 0;
return (FALSE);
}
return (TRUE);
}
return (FALSE);
}
for (i = 0; i < num_wclist; i++) {
goto err_out;
goto err_out;
goto err_out;
}
if (i < num_wclist - 1) {
}
}
goto err_out;
return (FALSE);
}
for (i = 0; i < num_wclist; i++) {
}
return (TRUE);
return (FALSE);
}
/*
* XDR decode the long reply write chunk.
*/
{
uint32_t i;
return (FALSE);
if (have_rchunk == FALSE)
return (TRUE);
return (FALSE);
}
if (num_wclist == 0) {
return (FALSE);
}
for (i = 0; i < num_wclist; i++) {
if (i > 0) {
}
goto err_out;
goto err_out;
goto err_out;
}
}
return (TRUE);
return (FALSE);
}
{
int i;
if (seg_array_len > 0) {
return (FALSE);
return (FALSE);
for (i = 0; i < seg_array_len; i++) {
if (!cl_longreply)
return (FALSE);
if (!xdr_uint32(xdrs,
return (FALSE);
return (FALSE);
return (FALSE);
}
} else {
return (FALSE);
}
return (TRUE);
}
{
return (FALSE);
}
/*
* The entire buffer is registered with the first chunk.
* Later chunks will use the same registered memory handle.
*/
return (FALSE);
}
#else
#endif
/*
* Use the same memory handle for all the chunks
*/
/*
* Now read the chunk in
*/
} else {
}
if (status != RDMA_SUCCESS) {
return (FALSE);
}
}
}
return (retval);
}
{
return (TRUE);
}
{
int status;
int rndup_present, rnduplen;
rndup_present = 0;
/* caller is doing a sizeof */
return (TRUE);
/* copy of the first chunk */
/*
* The entire buffer is registered with the first chunk.
* Later chunks will use the same registered memory handle.
*/
if (status != RDMA_SUCCESS) {
return (FALSE);
}
/*
* Only transfer the read data ignoring any trailing
* roundup chunks. A bit of work, but it saves an
* unnecessary extra RDMA_WRITE containing only
* roundup bytes.
*/
if (rnduplen) {
/*
* Check if there is a trailing roundup chunk
*/
while (tcl) {
rndup_present = 1;
break;
}
}
/*
* Make a copy chunk list skipping the last chunk
*/
if (rndup_present) {
while (cl) {
tcl = clist_alloc();
} else {
}
/* last chunk */
break;
}
}
}
/* No roundup chunks */
}
/*
* Set the registered memory handles for the
* rest of the chunks same as the first chunk.
*/
while (tcl) {
}
/*
* Sync the total len beginning from the first chunk.
*/
if (status != RDMA_SUCCESS) {
return (FALSE);
}
if (rndup_present)
if (status != RDMA_SUCCESS) {
return (FALSE);
}
return (TRUE);
}
/*
* Reads one chunk at a time
*/
static bool_t
{
int status;
/*
* len is used later to decide xdr offset in
* the chunk factoring any 4-byte XDR alignment
* (See the read chunk example at the top of this file)
*/
}
/*
* If this is the first chunk to contain the RPC
* message set xp_off to the xdr offset of the
* inline message.
*/
return (FALSE);
/*
* Make a copy of the chunk to read from client.
* Chunks are read on demand, so read only one
* for now.
*/
rclp = clist_alloc();
/*
* If there is a roundup present, then skip those
* bytes when reading.
*/
if (len) {
}
return (status);
}
/*
* This copy of read chunks containing the XDR
* message is freed later in xdrrdma_destroy()
*/
if (xdrp->xp_rcl_xdr) {
/* Add the chunk to end of the list */
} else {
}
return (TRUE);
}
static void
{
/*
* Read chunks containing parts of the XDR message are
* special: in case of multiple chunks each has
* its own buffer.
*/
while (cl) {
}
}