vnet_rxdring.c revision 8e51227711fb29b69b8f42a3953e759963432065
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
*/
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/ethernet.h>
#include <sys/mach_descrip.h>
#include <sys/vio_mailbox.h>
#include <sys/vio_common.h>
#include <sys/vnet_common.h>
#include <sys/vnet_mailbox.h>
#include <sys/vio_util.h>
#include <sys/vnet_gen.h>
/*
* This file contains the implementation of RxDringData transfer mode of VIO
* Protocol in vnet. The functions in this file are invoked from vnet_gen.c
* after RxDringData mode is negotiated with the peer during attribute phase of
* handshake. This file contains functions that set up the transmit and receive
* descriptor rings, and associated resources in RxDringData mode. It also
* contains the transmit and receive data processing functions that are invoked
* in RxDringData mode. The data processing routines in this file have the
* suffix '_shm' to indicate the shared memory mechanism used in RxDringData
* mode.
*/
/* Functions exported to vnet_gen.c */
/* Internal functions */
/* Functions imported from vnet_gen.c */
extern void vgen_destroy_rxpools(void *arg);
/* Tunables */
extern uint32_t vnet_num_descriptors;
extern uint32_t vgen_chain_len;
extern uint32_t vgen_ldcwr_retries;
extern uint32_t vgen_recv_delay;
extern uint32_t vgen_recv_retries;
extern uint32_t vgen_nrbufs_factor;
#ifdef DEBUG
#define DEBUG_PRINTF vgen_debug_printf
extern int vnet_dbglevel;
extern int vgen_inject_err_flag;
#endif
/*
* Allocate receive resources for the channel. The resources consist of a
* receive descriptor ring and an associated receive buffer area.
*/
int
{
int i, j;
int rv;
rxdsize = sizeof (vnet_rx_dringdata_desc_t);
/* Create the receive descriptor ring */
&ldcp->rx_dring_handle);
if (rv != 0) {
goto fail;
}
/* Get the addr of descriptor ring */
if (rv != 0) {
goto fail;
}
/*
* Allocate a table that maps each descriptor to its associated buffer;
* used while receiving to validate that the peer has not changed the
* buffer offset provided in the descriptor.
*/
KM_SLEEP);
/*
* Allocate a single large buffer that serves as the rx buffer area.
* We allocate an ldc memory handle and export the buffer area as shared
* memory. We send the ldc memcookie for this buffer space to the peer,
* as part of dring registration phase during handshake. We manage this
* buffer area as individual buffers of max_frame_size and provide
* specific buffer offsets in each descriptor to the peer. Note that
* the factor used to compute the # of buffers (above) must be > 1 to
* ensure that there are more buffers than the # of descriptors. This
* is needed because, while the shared memory buffers are sent up our
* stack during receive, the sender needs additional buffers that can
* be used for further transmits. This also means there is no one-to-one
* correspondence between the descriptor index and buffer offset.
* The sender has to read the buffer offset in the descriptor and use
* the specified offset to copy the tx data into the shared buffer. We
* (receiver) manage the individual buffers and their state (see
* VIO_MBLK_STATEs in vio_util.h).
*/
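/*
 * A minimal sketch of the sizing described above; 'nrbufs', 'bufsz' and
 * 'data_sz' are illustrative names (the actual computation is in the
 * elided code), while vgen_nrbufs_factor and vnet_num_descriptors are the
 * tunables declared at the top of this file:
 *
 *     nrbufs  = vnet_num_descriptors * vgen_nrbufs_factor;
 *     bufsz   = max_frame_size, rounded up to a convenient alignment;
 *     data_sz = nrbufs * bufsz;
 *     buffer i starts at offset (i * bufsz) within the exported area, and
 *     that offset is what a descriptor's data_buf_offset field carries.
 */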
/* Allocate an ldc memhandle for the entire rx data area */
if (rv) {
ldcp->rx_data_handle = 0;
goto fail;
}
/* Allocate memory for the data cookies */
sizeof (ldc_mem_cookie_t), KM_SLEEP);
/*
* Bind ldc memhandle to the corresponding rx data area.
*/
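/*
 * Schematic sketch of the binding step (the actual calls and arguments
 * are in the elided code; see sys/ldc.h for the LDC memory interfaces):
 *
 *     bind rx_data_handle to the rx data area
 *         -> returns the first cookie and the total cookie count
 *     for each remaining cookie j = 1 .. ncookies - 1
 *         fetch it via ldc_mem_nextcookie() into rx_data_cookie[j]
 */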
ncookies = 0;
if (rv != 0) {
goto fail;
}
goto fail;
}
for (j = 1; j < ncookies; j++) {
&(ldcp->rx_data_cookie[j]));
if (rv != 0) {
"failed rv (%d)", rv);
goto fail;
}
}
/*
* Successfully bound the handle to the rx data area. Now set up mblks
* around each data buffer and setup the descriptors to point to these
* rx data buffers. We associate each descriptor with a buffer
* by specifying the buffer offset in the descriptor. When the peer
* needs to transmit data, this offset is read by the peer to determine
* the buffer in the mapped buffer area where the data to be
* transmitted should be copied, for a specific descriptor.
*/
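/*
 * A sketch of the per-descriptor setup described above; 'rxdp' and
 * 'buf_offset(i)' are placeholders for the elided loop, the field names
 * are those of vnet_rx_dringdata_desc_t, and the dstate constant comes
 * from vio_common.h:
 *
 *     rxdp = descriptor i of the rx dring;
 *     rxdp->dstate = VIO_DESC_FREE;           initial state per protocol
 *     rxdp->nbytes = 0;
 *     rxdp->data_buf_offset = buf_offset(i);  offset of buffer i in the
 *                                             exported rx data area
 */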
if (rv != 0) {
goto fail;
}
/* allocate an mblk around this data buffer */
rxdp_to_vmp[i] = vmp;
}
/*
* The descriptors and the associated buffers are all ready;
* now bind descriptor ring to the channel.
*/
if (rv != 0) {
"rv(%x)\n", rv);
goto fail;
}
/* initialize rx seqnum and index */
return (VGEN_SUCCESS);
fail:
return (VGEN_FAILURE);
}
/*
* Free receive resources for the channel.
*/
void
{
/* We first unbind the descriptor ring */
if (ldcp->rx_dring_ncookies != 0) {
ldcp->rx_dring_ncookies = 0;
}
/* Destroy the mblks that are wrapped around the rx data buffers */
/*
* If we can't destroy the rx pool for this channel,
* dispatch a task to retry and clean up. Note that we
* don't need to wait for the task to complete. If the
* vnet device itself gets detached, it will wait for
* the task to complete implicitly in
* ddi_taskq_destroy().
*/
}
}
/* Free rx data area cookies */
sizeof (ldc_mem_cookie_t));
}
/* Unbind rx data area memhandle */
if (ldcp->rx_data_ncookies != 0) {
ldcp->rx_data_ncookies = 0;
}
/* Free rx data area memhandle */
if (ldcp->rx_data_handle != 0) {
ldcp->rx_data_handle = 0;
}
/* Now free the rx data area itself */
/* prealloc'd rx data buffer */
ldcp->rx_data_sz = 0;
}
/* Finally, free the receive descriptor ring */
if (ldcp->rx_dring_handle != 0) {
ldcp->rx_dring_handle = 0;
}
}
/* Reset rx index and seqnum */
}
/*
* Map the receive descriptor ring exported
* by the peer, as our transmit descriptor ring.
*/
int
{
int i;
int rv;
/*
* Sanity check.
*/
if (num_desc < VGEN_NUM_DESCRIPTORS_MIN ||
desc_size < sizeof (vnet_rx_dringdata_desc_t) ||
ncookies > 1) {
goto fail;
}
/* Map the remote dring */
if (rv != 0) {
goto fail;
}
/*
* Successfully mapped; now try to get info about the mapped dring
*/
if (rv != 0) {
goto fail;
}
/*
* Save ring address, number of descriptors.
*/
/* Initialize tx dring indexes and seqnum */
ldcp->dringdata_msgid = 0;
/* Save peer's dring_info values */
sizeof (ldc_mem_cookie_t));
/* Set dring_ident for the peer */
/* Return the dring_ident in ack msg */
/*
* Mark the descriptor state as 'done'. This is implementation specific
* and not required by the protocol. In our implementation, we only
* need the descriptor to be in 'done' state to be used by the transmit
* function and the peer is not aware of it. As the protocol requires
* that during initial registration the exporting end point mark the
* dstate as 'free', we change it to 'done' here. After this, the dstate
* in our implementation will keep moving between 'ready', set by our
* transmit function, and 'done', set by the peer (per protocol) after
* receiving data.
*/
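/*
 * Summarizing the dstate transitions described above (constants from
 * vio_common.h):
 *
 *     VIO_DESC_FREE    as exported by the peer at registration
 *     -> VIO_DESC_DONE    changed once below; usable by our transmit function
 *     -> VIO_DESC_READY   set by our transmit function when data is queued
 *     -> VIO_DESC_DONE    set back by the peer after it receives the data
 */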
/*
* Setup on_trap() protection before accessing dring shared memory area.
*/
if (rv != 0) {
/*
* Data access fault occurred down the code path below while
* accessing the descriptors. Return failure.
*/
goto fail;
}
for (i = 0; i < num_desc; i++) {
}
(void) LDC_NO_TRAP();
return (VGEN_SUCCESS);
fail:
if (ldcp->tx_dring_handle != 0) {
ldcp->tx_dring_handle = 0;
}
return (VGEN_FAILURE);
}
/*
* Unmap the transmit descriptor ring.
*/
void
{
/* Unmap mapped tx data area */
}
/* Free tx data area handle */
if (ldcp->tx_data_handle != 0) {
ldcp->tx_data_handle = 0;
}
/* Free tx data area cookies */
sizeof (ldc_mem_cookie_t));
ldcp->tx_data_ncookies = 0;
}
/* Unmap peer's dring */
if (ldcp->tx_dring_handle != 0) {
ldcp->tx_dring_handle = 0;
}
/* clobber tx ring members */
}
/*
* Map the shared memory data buffer area exported by the peer.
*/
int
{
int rv;
/* skip over dring cookies */
return (VGEN_FAILURE);
}
/* save # of data area cookies */
/* save data area size */
/* allocate ldc mem handle for data area */
if (rv != 0) {
return (VGEN_FAILURE);
}
/* map the data area */
if (rv != 0) {
ldcp->tx_data_handle = 0;
return (VGEN_FAILURE);
}
/* allocate memory for data area cookies */
sizeof (ldc_mem_cookie_t), KM_SLEEP);
/* save data area cookies */
return (VGEN_SUCCESS);
}
/*
* This function transmits normal data frames (non-priority) over the channel.
* It queues the frame into the transmit descriptor ring and sends a
* VIO_DRING_DATA message if needed, to wake up the peer to (re)start
* processing.
*/
int
{
struct ether_header *ehp;
int rv = 0;
/* Drop the packet if ldc is not up or handshake is not done */
ldcp->ldc_status);
goto dringsend_shm_exit;
}
goto dringsend_shm_exit;
}
goto dringsend_shm_exit;
}
/*
* Setup on_trap() protection before accessing shared memory areas
* (descriptor and data buffer). Note that we enable this protection a
* little early and turn it off slightly later, rather than keeping it enabled
* strictly at the points in code below where the descriptor and data
* buffer are accessed. This is done for performance reasons:
* (a) to avoid calling the trap protection code while holding a mutex.
*/
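/*
 * The protection bracket used here (and around the other shared memory
 * accesses in this file) follows the pattern sketched below; the
 * LDC_NO_TRAP() calls are visible in this file, while the arming macro
 * and its argument are assumptions about the elided code:
 *
 *     rv = LDC_ON_TRAP(...);           arm protection against data access
 *     if (rv != 0) {                   faults on the shared memory
 *         drop any locks taken below and bail out;
 *     }
 *     ... access descriptors and mapped data buffers ...
 *     (void) LDC_NO_TRAP();            disarm protection
 */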
if (rv != 0) {
/*
* Data access fault occurred down the code path below while
* accessing either the descriptor or the data buffer. Release
* any locks that we might have acquired in the code below and
* return failure.
*/
}
}
goto dringsend_shm_exit;
}
/*
* Allocate a descriptor
*/
}
statsp->tx_no_desc++;
(void) LDC_NO_TRAP();
return (VGEN_TX_NORESOURCES);
}
}
/* Update descriptor ring index */
}
/* Access the descriptor */
/* Ensure load ordering of dstate (above) and data_buf_offset. */
/* Get the offset of the buffer to be used */
/* Access the buffer using the offset */
/* Copy data into mapped transmit buffer */
}
/* Set the size of data in the descriptor */
/*
* Ensure store ordering of nbytes and dstate (below), so that the peer
* sees the right nbytes value after it checks that the dstate is READY.
*/
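/*
 * A sketch of the ordering described above, assuming the standard
 * membar_producer()/membar_consumer() primitives are the barriers used
 * by the elided code ('txdp' and 'size' are placeholders):
 *
 *     txdp->nbytes = size;             publish the payload length first
 *     membar_producer();               order the nbytes store before dstate
 *     txdp->dstate = VIO_DESC_READY;   then mark the descriptor ready
 *
 * The peer pairs this with the reverse: it checks that dstate is READY,
 * issues a read barrier (membar_consumer()), and only then trusts nbytes.
 */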
/* Mark the descriptor ready */
/* Check if peer needs wake up (handled below) */
}
/* Update tx stats */
if (is_bcast)
else if (is_mcast)
/*
* We are done accessing shared memory; clear trap protection.
*/
(void) LDC_NO_TRAP();
/*
* Need to wake up the peer ?
*/
if (resched_peer == B_TRUE) {
if (rv != 0) {
/* error: drop the packet */
}
}
}
return (VGEN_TX_SUCCESS);
}
/*
* Handle a dring data message (info/ack/nack) received from the peer.
*/
int
{
int rv = 0;
switch (tagp->vio_subtype) {
case VIO_SUBTYPE_INFO:
/*
* To reduce the locking contention, release the
* cblock here and re-acquire it once we are done
* receiving packets.
*/
if (rv != 0) {
}
break;
case VIO_SUBTYPE_ACK:
if (rv != 0) {
}
break;
case VIO_SUBTYPE_NACK:
if (rv != 0) {
}
break;
}
return (rv);
}
static int
{
int rv = 0;
/* drop the message if invalid index */
return (0);
}
/* validate dring_ident */
/* invalid dring_ident, drop the msg */
return (0);
}
/*
* If we are in polling mode, return from here without processing the
* dring. We will process the dring in the context of polling thread.
*/
return (0);
}
/*
* Process the dring and receive packets in intr context.
*/
if (rv != 0) {
}
return (rv);
}
/*
* Process the rx descriptor ring in the context of interrupt thread
* (vgen_ldc_cb() callback) and send the received packets up the stack.
*/
static int
{
int rv;
int count = 0;
int total_count = 0;
do {
if (rv != 0) {
/* Invalid descriptor error; get next */
continue;
}
break;
}
/* Build a chain of received packets */
/* first pkt */
} else {
}
total_count++;
count++;
/*
* We are receiving the packets in interrupt context. If we
* have gathered vgen_chain_len (tunable) # of packets in the
* chain, send them up. (See vgen_poll_rcv_shm() for receiving
* in polling thread context).
*/
if (count == vgen_chain_len) {
int, count);
count = 0;
}
/*
* Stop further processing if we processed the entire dring
* once; otherwise continue.
*/
}
/*
* We send a stopped message to the peer (sender) only while we are in
* intr mode; this allows the peer to send further data interrupts
* (dring data msgs) to us.
*/
return (rv);
}
return (0);
}
/*
* Process the rx descriptor ring in the context of mac polling thread. Receive
* packets up to the limit specified by bytes_to_pickup or until there are no
* more packets, whichever occurs first. Return the chain of received packets.
*/
mblk_t *
{
int count = 0;
int rv;
/* Channel is being reset and handshake not complete */
return (NULL);
}
do {
if (rv != 0) {
/* Invalid descriptor error; get next */
continue;
}
break;
}
/* Build a chain of received packets */
/* first pkt */
} else {
}
/* Compute total size accumulated */
count++;
/* Reached the bytes limit; we are done. */
if (sz >= bytes_to_pickup) {
break;
}
} while (1);
/*
* We prepend any high priority packets to the chain of packets; note
* that if we are already at the bytes_to_pickup limit, we might
* slightly exceed that in such cases. That should be ok, as these pkts
* are expected to be small in size and arrive at an interval on the
* order of a few seconds.
*/
}
return (bp);
}
/*
* Process the next index in the rx dring and receive the associated packet.
*
* Returns:
* bp: Success: The received packet.
* Failure: NULL
* size: Success: Size of received packet.
* Failure: 0
* retval:
* Success: 0
* Failure: EAGAIN: Descriptor not ready
* EIO: Descriptor contents invalid.
*/
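/*
 * Callers (the interrupt and polling receive loops above) use these
 * return values roughly as sketched here; the names are illustrative:
 *
 *     rv = <this routine>(ldcp, &bp, &size);
 *     if (rv == EIO)
 *         continue;       descriptor contents invalid; try the next one
 *     if (rv != 0)
 *         break;          EAGAIN (not ready) or ENOMEM: stop for now
 *     link bp into the chain being built;
 */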
static int
{
struct ether_header *ehp;
int err = 0;
/*
* Descriptor is not ready.
*/
return (EAGAIN);
}
/*
* Ensure load ordering of dstate and nbytes.
*/
(rxdp->data_buf_offset !=
/*
* Descriptor contents invalid.
*/
goto done;
}
/*
* Now allocate a new buffer for this descriptor before sending up the
* buffer being processed. If that fails, stop processing, as we are
* out of receive buffers.
*/
/*
* Process the current buffer being received.
*/
/*
* We failed to get a new mapped buffer that is needed to
* refill the descriptor. In that case, leave the current
* buffer bound to the descriptor; allocate an mblk dynamically
* and copy the contents of the buffer to the mblk. Then send
* up this mblk. This way the sender has the same buffer as
* before that can be used to send new data.
*/
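/*
 * A sketch of this fallback path using standard STREAMS calls
 * (allocb(9F), bcopy(9F)); 'buf' and 'nbytes' stand for the shared
 * buffer address and the length taken from the descriptor:
 *
 *     mp = allocb(nbytes, BPRI_MED);
 *     if (mp == NULL)
 *         return (ENOMEM);             no memory for the copy either
 *     bcopy(buf, mp->b_rptr, nbytes);
 *     mp->b_wptr = mp->b_rptr + nbytes;
 */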
return (ENOMEM);
}
} else {
/* Mark the status of the current rbuf */
/* Set the offset of the new buffer in the descriptor */
}
/*
* Ensure store ordering of data_buf_offset and dstate, so that the
* peer sees the right data_buf_offset after it checks that the dstate
* is DONE.
*/
/* Now mark the descriptor 'done' */
/* Update stats */
if (IS_BROADCAST(ehp))
else if (IS_MULTICAST(ehp))
done:
/* Update the next index to be processed */
/* Save the new recv index */
/* Return the packet received */
return (err);
}
static int
{
int rv = 0;
/*
* Received an ack for our transmits up to a certain dring index. This
* enables us to reclaim descriptors. We also send a new dring data msg
* to the peer to restart processing if there are pending transmit pkts.
*/
/*
* In RxDringData mode (v1.6), start index of -1 can be used by the
* peer to indicate that it is unspecified. However, the end index
* must be set correctly indicating the last descriptor index processed.
*/
/* drop the message if invalid index */
return (rv);
}
/* Validate dring_ident */
/* invalid dring_ident, drop the msg */
return (rv);
}
/*
* Clear transmit flow control condition
* as some descriptors should be free now.
*/
}
/*
* Receiver continued processing
* dring after sending us the ack.
*/
return (rv);
}
/*
* Receiver stopped processing descriptors.
*/
/*
* Setup on_trap() protection before accessing dring shared memory area.
*/
if (rv != 0) {
/*
* Data access fault occurred down the code path below while
* accessing the descriptors. Release any locks that we might
* have acquired in the code below and return failure.
*/
}
return (ECONNRESET);
}
/*
* Determine if there are any pending tx descriptors ready to be
* processed by the receiver(peer) and if so, send a message to the
* peer to restart receiving.
*/
}
/*
* We are done accessing shared memory; clear trap protection.
*/
(void) LDC_NO_TRAP();
/*
* No ready tx descriptors. Set the flag to send a message to
* the peer when tx descriptors are ready in transmit routine.
*/
return (rv);
}
/*
* We have some tx descriptors ready to be processed by the receiver.
* Send a dring data message to the peer to restart processing.
*/
if (rv != VGEN_SUCCESS) {
}
return (rv);
}
static int
{
int rv = 0;
/*
* Peer sent a NACK msg (to indicate bad descriptors ?). The start and
* end correspond to the range of descriptors which are being nack'd.
*/
/*
* In RxDringData mode (v1.6), start index of -1 can be used by
* the peer to indicate that it is unspecified. However, the end index
* must be set correctly indicating the last descriptor index processed.
*/
/* drop the message if invalid index */
return (rv);
}
/* Validate dring_ident */
/* invalid dring_ident, drop the msg */
return (rv);
}
/*
* Setup on_trap() protection before accessing dring shared memory area.
*/
if (rv != 0) {
/*
* Data access fault occurred down the code path below while
* accessing the descriptors. Release any locks that we might
* have acquired in the code below and return failure.
*/
return (ECONNRESET);
}
/* We just mark the descriptors as free so they can be reused */
}
/*
* We are done accessing shared memory; clear trap protection.
*/
(void) LDC_NO_TRAP();
return (rv);
}
/*
* Send descriptor ring data message to the peer over LDC.
*/
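/*
 * The message constructed by the function below is a vio_dring_msg_t
 * (vio_mailbox.h). A sketch of the fields involved; the ident, indexes
 * and sequence number shown are placeholders for the elided assignments:
 *
 *     msg.tag.vio_msgtype     = VIO_TYPE_DATA;
 *     msg.tag.vio_subtype     = VIO_SUBTYPE_INFO;
 *     msg.tag.vio_subtype_env = VIO_DRING_DATA;
 *     msg.dring_ident         = ident of the peer's exported dring;
 *     msg.start_idx           = start;
 *     msg.end_idx             = end;
 *     msg.seq_num             = next dring data msg id;
 */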
static int
{
int rv;
#ifdef DEBUG
return (VGEN_SUCCESS);
}
#endif
if (rv != VGEN_SUCCESS) {
return (rv);
}
return (VGEN_SUCCESS);
}
/*
* Send dring data ack message.
*/
int
{
int rv = 0;
if (rv != VGEN_SUCCESS) {
}
if (pstate == VIO_DP_STOPPED) {
}
return (rv);
}
/*
* Send the given message over LDC; retry the write if the channel is
* temporarily busy (EWOULDBLOCK), up to vgen_ldcwr_retries times.
*/
static int
{
int rv;
return (VGEN_FAILURE);
do {
if (retries++ >= vgen_ldcwr_retries)
break;
} while (rv == EWOULDBLOCK);
if (rv != 0) {
return (rv);
}
return (VGEN_FAILURE);
}
return (VGEN_SUCCESS);
}