vsw_rxdring.c revision 7bd3a2e26cc8569257b88c1691d559138e1d32d0
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/mach_descrip.h>
#include <sys/vio_mailbox.h>
#include <sys/vio_common.h>
#include <sys/vnet_common.h>
#include <sys/vnet_mailbox.h>
#include <sys/vio_util.h>
/*
* This file contains the implementation of RxDringData transfer mode of VIO
* Protocol in vsw. The functions in this file are invoked from vsw_ldc.c
* after RxDringData mode is negotiated with the peer during attribute phase of
* handshake. This file contains functions that setup the transmit and receive
* descriptor rings, and associated resources in RxDringData mode. It also
* contains the transmit and receive data processing functions that are invoked
* in RxDringData mode. The data processing routines in this file have the
* suffix '_shm' to indicate the shared memory mechanism used in RxDringData
* mode.
*/
/* Functions exported to vsw_ldc.c */
void vsw_ldc_rcv_worker(void *arg);
void vsw_process_dringdata_shm(void *, void *);
/* Internal functions */
/* Functions imported from vsw_ldc.c */
extern void vsw_process_pkt(void *);
extern void vsw_destroy_rxpools(void *);
/* Tunables */
extern int vsw_wretries;
extern int vsw_recv_delay;
extern int vsw_recv_retries;
extern uint32_t vsw_chain_len;
extern uint32_t vsw_num_descriptors;
extern uint32_t vsw_nrbufs_factor;
{ \
(count) = 0; \
}
{
/*
* If we can't create a dring, obviously no point sending
* a message.
*/
return (NULL);
KM_SLEEP);
/* payload */
sizeof (ldc_mem_cookie_t));
mp->dring_ident = 0;
/* skip over dring cookies */
/* copy data_ncookies in the msg */
/* copy data area size in the msg */
/* copy data area cookies in the msg */
return (mp);
}
/*
* Allocate receive resources for the channel. The resources consist of a
* receive descriptor ring and an associated receive buffer area.
*/
static dring_info_t *
{
/* Create the receive descriptor ring */
goto fail;
}
/* Get the addr of descriptor ring */
goto fail;
} else {
}
/*
* Allocate a table that maps descriptor to its associated buffer;
* used while receiving to validate that the peer has not changed the
* buffer offset provided in the descriptor.
*/
KM_SLEEP);
/* Setup the descriptor ring */
goto fail;
}
/*
* The descriptors and the associated buffers are all ready;
* now bind descriptor ring to the channel.
*/
goto fail;
}
/* haven't used any descriptors yet */
return (dp);
fail:
return (NULL);
}
/*
* Setup the descriptors in the rx dring.
* Returns 0 on success, 1 on failure.
*/
static int
{
int i;
int rv;
static char *name = "vsw_setup_rx_dring";
/*
* Allocate a single large buffer that serves as the rx buffer area.
* We allocate a ldc memory handle and export the buffer area as shared
* memory. We send the ldc memcookie for this buffer space to the peer,
* as part of dring registration phase during handshake. We manage this
* buffer area as individual buffers of max_frame_size and provide
* specific buffer offsets in each descriptor to the peer. Note that
* the factor used to compute the # of buffers (above) must be > 1 to
* ensure that there are more buffers than the # of descriptors. This
* is needed because, while the shared memory buffers are sent up our
* stack during receive, the sender needs additional buffers that can
* be used for further transmits. This also means there is no one to
* one correspondence between the descriptor index and buffer offset.
* The sender has to read the buffer offset in the descriptor and use
* the specified offset to copy the tx data into the shared buffer. We
* (receiver) manage the individual buffers and their state (see
* VIO_MBLK_STATEs in vio_util.h).
*/
/* Allocate a ldc memhandle for the entire rx data area */
if (rv != 0) {
goto fail;
}
/* Allocate memory for the data cookies */
sizeof (ldc_mem_cookie_t), KM_SLEEP);
/*
* Bind ldc memhandle to the corresponding rx data area.
*/
if (rv != 0) {
goto fail;
}
goto fail;
}
/*
* Successful in binding the handle to rx data area. Now setup mblks
* around each data buffer and setup the descriptors to point to these
* rx data buffers. We associate each descriptor with a buffer
* by specifying the buffer offset in the descriptor. When the peer
* needs to transmit data, this offset is read by the peer to determine
* the buffer in the mapped buffer area where the data to be
* transmitted should be copied, for a specific descriptor.
*/
if (rv != 0) {
goto fail;
}
for (i = 0; i < dp->num_descriptors; i++) {
/* allocate an mblk around this data buffer */
rxdp_to_vmp[i] = vmp;
}
return (0);
fail:
/* return failure; caller will cleanup */
return (1);
}
/*
* Free receive resources for the channel.
*/
void
{
return;
}
/*
* If we can't destroy the rx pool for this channel, dispatch a
* task to retry and clean up those rx pools. Note that we
* don't need to wait for the task to complete. If the vsw
* device itself gets detached (vsw_detach()), it will wait for
* the task to complete implicitly in ddi_taskq_destroy().
*/
}
}
/* Free rx data area cookies */
sizeof (ldc_mem_cookie_t));
}
/* Unbind rx data area memhandle */
if (dp->data_ncookies != 0) {
dp->data_ncookies = 0;
}
/* Free rx data area memhandle */
if (dp->data_handle) {
dp->data_handle = 0;
}
/* Now free the rx data area itself */
}
/* Finally, free the receive descriptor ring */
}
}
}
/*
* Map the receive descriptor ring exported by the peer, as our transmit
* descriptor ring.
*/
{
int i;
int rv;
return (NULL);
}
/* RxDringData mode specific initializations */
ldcp->dringdata_msgid = 0;
/*
* Mark the descriptor state as 'done'. This is implementation specific
* and not required by the protocol. In our implementation, we only
* need the descriptor to be in 'done' state to be used by the transmit
* function and the peer is not aware of it. As the protocol requires
* that during initial registration the exporting end point mark the
* dstate as 'free', we change it 'done' here. After this, the dstate
* in our implementation will keep moving between 'ready', set by our
* transmit function; and 'done', set by the peer (per protocol)
* after receiving data.
* Setup on_trap() protection before accessing dring shared memory area.
*/
if (rv != 0) {
/*
* Data access fault occurred down the code path below while
* accessing the descriptors. Return failure.
*/
goto fail;
}
for (i = 0; i < dp->num_descriptors; i++) {
}
(void) LDC_NO_TRAP();
return (dp);
fail:
}
return (NULL);
}
/*
* Unmap the transmit descriptor ring.
*/
void
{
return;
}
/* Unmap tx data area and free data handle */
}
/* Free tx data area cookies */
sizeof (ldc_mem_cookie_t));
dp->data_ncookies = 0;
}
/* Unmap peer's dring */
}
}
/*
* A per LDC worker thread to process the rx dring and receive packets. This
* thread is woken up by the LDC interrupt handler when a dring data info
* message is received.
*/
void
vsw_ldc_rcv_worker(void *arg)
{
"vsw_rcv_thread");
/*
* Wait until the data is received or a stop
* request is received.
*/
while (!(ldcp->rcv_thr_flags &
(VSW_WTHR_DATARCVD | VSW_WTHR_STOP))) {
}
/*
* First process the stop request.
*/
break;
}
}
/*
* Update the run status and wakeup the thread that
* has sent the stop request.
*/
thread_exit();
}
/*
* Process the rx descriptor ring in the context of receive worker
* thread and switch the received packets to their destinations.
*/
static void
{
int rv;
int count = 0;
int total_count = 0;
do {
if (rv != 0) {
/* Invalid descriptor error; get next */
continue;
}
break;
}
/* Descriptor not ready for processing */
if (retries == vsw_recv_retries) {
break;
}
/* Switch packets received so far before retrying */
}
retries++;
goto again;
}
retries = 0;
/* Build a chain of received packets */
/* first pkt */
} else {
}
total_count++;
count++;
/*
* If we have gathered vsw_chain_len (tunable)
* # of packets in the chain, switch them.
*/
if (count == vsw_chain_len) {
}
/*
* Stop further processing if we processed the entire dring
* once; otherwise continue.
*/
int, (total_count));
}
/* Send stopped signal to peer (sender) */
sizeof (vio_dring_msg_t), B_TRUE);
}
/*
* Process the next index in the rx dring and receive the associated packet.
*
* Returns:
* bp: Success: The received packet.
* Failure: NULL
* retval:
* Success: 0
* Failure: EAGAIN: Descriptor not ready
* EIO: Descriptor contents invalid.
*/
static int
{
struct ether_header *ehp;
int err = 0;
/*
* Descriptor is not ready.
*/
return (EAGAIN);
}
/*
* Ensure load ordering of dstate and nbytes.
*/
(rxdp->data_buf_offset !=
/*
* Descriptor contents invalid.
*/
goto done;
}
/*
* Now allocate a new buffer for this descriptor before sending up the
* buffer being processed. If that fails, stop processing; as we are
* out of receive buffers.
*/
/*
* Process the current buffer being received.
*/
/*
* We failed to get a new mapped buffer that is needed to
* refill the descriptor. In that case, leave the current
* buffer bound to the descriptor; allocate an mblk dynamically
* and copy the contents of the buffer to the mblk. Then send
* up this mblk. This way the sender has the same buffer as
* before that can be used to send new data.
*/
} else {
/* Mark the status of the current rbuf */
/* Set the offset of the new buffer in the descriptor */
}
/*
* Ensure store ordering of data_buf_offset and dstate; so that the
* peer sees the right data_buf_offset after it checks that the dstate
* is DONE.
*/
/* Now mark the descriptor 'done' */
/* Update stats */
if (IS_BROADCAST(ehp))
else if (IS_MULTICAST(ehp))
done:
/* Update the next index to be processed */
/* Save the new recv index */
/* Return the packet received */
return (err);
}
void
{
/*
* Send a stop request by setting the stop flag and
* wait until the rcv process thread stops.
*/
}
if (tid != 0) {
}
}
int
{
struct ether_header *ehp;
int rv = 0;
return (LDC_TX_FAILURE);
}
return (LDC_TX_FAILURE);
}
/*
* Note: In RxDringData mode, lane_in is associated with transmit and
* lane_out is associated with receive. However, we still keep the
* negotiated mtu in lane_out (our exported attributes).
*/
return (LDC_TX_FAILURE);
}
/*
* Setup on_trap() protection before accessing shared memory areas
* (descriptor and data buffer). Note that we enable this protection a
* little early and turn it off slightly later, rather than keeping it
* strictly at the points in code below where the descriptor and data
* buffer are accessed. This is done for performance reasons:
* (a) to avoid calling the trap protection code while holding mutex.
*/
if (rv != 0) {
/*
* Data access fault occurred down the code path below while
* accessing either the descriptor or the data buffer. Release
* any locks that we might have acquired in the code below and
* return failure.
*/
}
}
goto dringsend_shm_exit;
}
/*
* Allocate a descriptor
*/
statsp->tx_no_desc++;
(void) LDC_NO_TRAP();
return (LDC_TX_NORESOURCES);
}
/* Update descriptor ring index */
/* Access the descriptor */
/* Ensure load ordering of dstate (above) and data_buf_offset. */
/* Get the offset of the buffer to be used */
/* Access the buffer using the offset */
/* Copy data into mapped transmit buffer */
}
/* Set the size of data in the descriptor */
/*
* Ensure store ordering of nbytes and dstate (below); so that the peer
* sees the right nbytes value after it checks that the dstate is READY.
*/
/* Mark the descriptor ready */
/* Check if peer needs wake up (handled below) */
}
/* Update tx stats */
if (is_bcast)
else if (is_mcast)
/*
* We are done accessing shared memory; clear trap protection.
*/
(void) LDC_NO_TRAP();
/*
* Need to wake up the peer ?
*/
if (resched_peer == B_TRUE) {
B_FALSE);
if (rv != 0) {
/* error: drop the packet */
}
}
}
return (LDC_TX_SUCCESS);
}
void
{
case VIO_SUBTYPE_INFO:
break;
case VIO_SUBTYPE_ACK:
break;
case VIO_SUBTYPE_NACK:
/*
* Something is badly wrong if we are getting NACK's
* for our data pkts. So reset the channel.
*/
break;
default:
}
}
static void
{
/* drop the message */
return;
}
/*
* Wake up the rcv worker thread to process the rx dring.
*/
}
}
static void
{
int rv;
/*
* In RxDringData mode (v1.6), start index of -1 can be used by the
* peer to indicate that it is unspecified. However, the end index
* must be set correctly indicating the last descriptor index processed.
*/
/* drop the message if invalid index */
return;
}
/* Validate dring_ident */
/* invalid dring_ident, drop the msg */
return;
}
/*
* Receiver continued processing
* dring after sending us the ack.
*/
return;
}
/*
* Setup on_trap() protection before accessing dring shared memory area.
*/
if (rv != 0) {
/*
* Data access fault occurred down the code path below while
* accessing the descriptors. Release any locks that we might
* have acquired in the code below and return failure.
*/
}
return;
}
/*
* Determine if there are any pending tx descriptors ready to be
* processed by the receiver(peer) and if so, send a message to the
* peer to restart receiving.
*/
}
/*
* We are done accessing shared memory; clear trap protection.
*/
(void) LDC_NO_TRAP();
/*
* No ready tx descriptors. Set the flag to send a message to
* the peer when tx descriptors are ready in transmit routine.
*/
return;
}
/*
* We have some tx descriptors ready to be processed by the receiver.
* Send a dring data message to the peer to restart processing.
*/
sizeof (vio_dring_msg_t), B_FALSE);
if (rv != 0) {
}
if (rv == ECONNRESET) {
}
}
/*
*/
int
{
int rv;
int retries = vsw_wretries;
do {
"chan(%lld) rv(%d) size (%d) msglen(%d)\n",
}
/*
* If channel has been reset we either handle it here or
* simply report back that it has been reset and let caller
* decide what to do.
*/
if (rv == ECONNRESET) {
if (handle_reset) {
}
}
return (rv);
}