vsw_txdring.c revision 7bd3a2e26cc8569257b88c1691d559138e1d32d0
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/ethernet.h>
#include <sys/sunddi.h>
#include <sys/callb.h>
#include <sys/machsystm.h>
#include <sys/mach_descrip.h>
#include <sys/ldc.h>
#include <sys/vio_mailbox.h>
#include <sys/vio_common.h>
#include <sys/vnet_common.h>
#include <sys/vnet_mailbox.h>
#include <sys/vio_util.h>
#include <sys/vsw.h>
#include <sys/vsw_ldc.h>
/*
 * This file contains the implementation of the TxDring data transfer mode
 * of the VIO Protocol in vsw. The functions in this file are invoked from
 * vsw_ldc.c after TxDring mode is negotiated with the peer during the
 * attribute phase of the handshake. This file contains functions that set
 * up the transmit and receive descriptor rings and associated resources in
 * TxDring mode. It also contains the transmit and receive data processing
 * functions that are invoked in TxDring mode.
 */
/* Functions exported to vsw_ldc.c */
vio_dring_reg_msg_t *vsw_create_tx_dring_info(vsw_ldc_t *);
int vsw_setup_tx_dring(vsw_ldc_t *ldcp, dring_info_t *dp);
void vsw_destroy_tx_dring(vsw_ldc_t *ldcp);
dring_info_t *vsw_map_rx_dring(vsw_ldc_t *ldcp, void *pkt);
void vsw_unmap_rx_dring(vsw_ldc_t *ldcp);
int vsw_dringsend(vsw_ldc_t *, mblk_t *);
int vsw_reclaim_dring(dring_info_t *dp, int start);
int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **, int *);
int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
void vsw_ldc_msg_worker(void *arg);
void vsw_stop_msg_thread(vsw_ldc_t *ldcp);
void vsw_process_dringdata(void *, void *);

/* Internal functions */
static dring_info_t *vsw_create_tx_dring(vsw_ldc_t *);
static int vsw_init_multipools(vsw_ldc_t *ldcp, vsw_t *vswp);

/* Functions imported from vsw_ldc.c */
extern void vsw_process_pkt(void *);
extern void vsw_destroy_rxpools(void *);
extern dring_info_t *vsw_map_dring_cmn(vsw_ldc_t *ldcp,
    vio_dring_reg_msg_t *dring_pkt);
extern void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
/* Tunables */
extern int vsw_wretries;
extern int vsw_recv_delay;
extern int vsw_recv_retries;
extern boolean_t vsw_jumbo_rxpools;
extern uint32_t vsw_chain_len;
extern uint32_t vsw_num_descriptors;
extern uint32_t vsw_mblk_size1;
extern uint32_t vsw_mblk_size2;
extern uint32_t vsw_mblk_size3;
extern uint32_t vsw_mblk_size4;
extern uint32_t vsw_num_mblks1;
extern uint32_t vsw_num_mblks2;
extern uint32_t vsw_num_mblks3;
extern uint32_t vsw_num_mblks4;
/*
 * Create a dring registration info message, to be sent to the peer
 * during the handshake.
 */
vio_dring_reg_msg_t *
vsw_create_tx_dring_info(vsw_ldc_t *ldcp)
{
	vio_dring_reg_msg_t	*mp;
	dring_info_t		*dp;

	/*
	 * If we can't create a dring, obviously no point sending
	 * a message.
	 */
	if ((dp = vsw_create_tx_dring(ldcp)) == NULL)
		return (NULL);

	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);

	/* payload */
	mp->num_descriptors = dp->num_descriptors;
	mp->descriptor_size = dp->descriptor_size;
	mp->options = dp->options;
	mp->ncookies = dp->dring_ncookies;
	mp->dring_ident = 0;

	return (mp);
}
/*
* Allocate transmit resources for the channel. The resources consist of a
* transmit descriptor ring and an associated transmit buffer area.
*/
static dring_info_t *
vsw_create_tx_dring(vsw_ldc_t *ldcp)
{
	ldc_mem_info_t	minfo;
	dring_info_t	*dp;

	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
	ldcp->lane_out.dringp = dp;

	dp->num_descriptors = vsw_num_descriptors;
	dp->descriptor_size = sizeof (vnet_public_desc_t);
	dp->options = VIO_TX_DRING;

	/* create public section of ring */
	if ((ldc_mem_dring_create(dp->num_descriptors,
	    dp->descriptor_size, &dp->dring_handle)) != 0) {
		goto fail;
	}

	/*
	 * Get the base address of the public section of the ring.
	 */
	if ((ldc_mem_dring_info(dp->dring_handle, &minfo)) != 0) {
		goto fail;
	} else {
		/* store the address of the public part of the ring */
		dp->pub_addr = minfo.vaddr;
	}

	/*
	 * create private portion of ring
	 */
	dp->priv_addr = kmem_zalloc(
	    (sizeof (vsw_private_desc_t) * vsw_num_descriptors), KM_SLEEP);
	if (vsw_setup_tx_dring(ldcp, dp)) {
		goto fail;
	}

	/* bind dring to the channel */
	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->dring_handle,
	    LDC_SHADOW_MAP, LDC_MEM_RW, &dp->dring_cookie[0],
	    &dp->dring_ncookies)) != 0) {
		goto fail;
	}

	/* haven't used any descriptors yet */
	dp->end_idx = 0;
	dp->last_ack_recv = -1;
	dp->restart_reqd = B_TRUE;

	return (dp);

fail:
	vsw_destroy_tx_dring(ldcp);
	return (NULL);
}
/*
 * Set up the descriptors in the tx dring.
 * Returns 0 on success, 1 on failure.
 */
int
vsw_setup_tx_dring(vsw_ldc_t *ldcp, dring_info_t *dp)
{
	vnet_public_desc_t	*pub_addr = NULL;
	vsw_private_desc_t	*priv_addr = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	static char		*name = "vsw_setup_ring";
	int			i, j, rv;
	uint32_t		ncookies = 0;
	void			*data_addr = NULL;

	pub_addr = dp->pub_addr;
	priv_addr = dp->priv_addr;

	/* public section may be null but private should never be */
	ASSERT(priv_addr != NULL);
/*
* Allocate the region of memory which will be used to hold
* the data the descriptors will refer to.
*/
/*
* In order to ensure that the number of ldc cookies per descriptor is
* limited to be within the default MAX_COOKIES (2), we take the steps
* outlined below:
*
* Align the entire data buffer area to 8K and carve out per descriptor
* data buffers starting from this 8K aligned base address.
*
* We round up the mtu specified to be a multiple of 2K or 4K.
* For sizes up to 12K we round up the size to the next 2K.
 * For sizes > 12K we round up to the next 4K (otherwise sizes such as
 * 14K could end up needing 3 cookies, with the buffer spread across
 * 3 8K pages: 8K+6K, 2K+8K+4K, 4K+8K+2K, 6K+8K, ...). A worked sketch
 * of this rounding rule follows this function.
 */
	dp->data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
	if (dp->data_sz <= VNET_12K) {
		dp->data_sz = VNET_ROUNDUP_2K(dp->data_sz);
	} else {
		dp->data_sz = VNET_ROUNDUP_4K(dp->data_sz);
	}

	/* allocate extra 8K bytes for alignment */
	dp->data_len = (dp->data_sz * vsw_num_descriptors) + VNET_8K;
	dp->data_addr = kmem_alloc(dp->data_len, KM_SLEEP);

	/* align the starting address of the data area to 8K */
	data_addr = (void *)VNET_ROUNDUP_8K((uintptr_t)dp->data_addr);
/*
* Initialise some of the private and public (if they exist)
* descriptor fields.
*/
for (i = 0; i < vsw_num_descriptors; i++) {
goto fail;
}
if (rv != 0) {
goto fail;
}
"invalid num of cookies (%d) for size 0x%llx",
goto fail;
} else {
for (j = 1; j < ncookies; j++) {
if (rv != 0) {
goto fail;
}
"size 0x%llx", name, j,
}
}
			/* link pub and private sides */
			priv_addr->descp = pub_addr;
			bcopy(&priv_addr->memcookie[0], &pub_addr->memcookie,
			    sizeof (ldc_mem_cookie_t));
}
pub_addr++;
}
/*
* move to next element in the dring and the next
* position in the data buffer.
*/
priv_addr++;
}
return (0);
fail:
/* return failure; caller will cleanup */
return (1);
}
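/*
 * Illustrative sketch (not part of the original driver): the buffer-size
 * rounding rule described in vsw_setup_tx_dring() above, written out with
 * plain arithmetic in place of the VNET_ROUNDUP_* macros. The helper name
 * is hypothetical. For example, a 9K buffer rounds to 10K (next 2K
 * multiple) while a 14K buffer rounds to 16K (next 4K multiple), so that
 * a buffer carved from an 8K-aligned area never spans more than two 8K
 * pages, and therefore never needs more than two ldc cookies.
 */
static inline uint32_t
vsw_sketch_round_bufsz(uint32_t sz)
{
	if (sz <= 12 * 1024)
		return ((sz + 0x7ff) & ~(uint32_t)0x7ff);	/* next 2K */
	else
		return ((sz + 0xfff) & ~(uint32_t)0xfff);	/* next 4K */
}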
/*
* Free transmit resources for the channel.
*/
void
vsw_destroy_tx_dring(vsw_ldc_t *ldcp)
{
	vsw_private_desc_t	*priv_addr;
	static char		*name = "vsw_destroy_tx_dring";
	vsw_t			*vswp = ldcp->ldc_vswp;
	lane_t			*lp = &ldcp->lane_out;
	dring_info_t		*dp;
	int			i;

	if ((dp = lp->dringp) == NULL) {
		return;
	}

	if (dp->priv_addr != NULL) {
		/*
		 * First unbind and free the memory handles
		 * stored in each descriptor within the ring.
		 */
		for (i = 0; i < vsw_num_descriptors; i++) {
			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;

			if (priv_addr->memhandle == NULL)
				continue;

			if (priv_addr->bound == 1) {
				if (ldc_mem_unbind_handle(
				    priv_addr->memhandle) != 0) {
					DERR(vswp, "%s: error "
					    "unbinding handle for "
					    "ring 0x%llx at pos %d",
					    name, dp, i);
					continue;
				}
			}

			if (ldc_mem_free_handle(
			    priv_addr->memhandle) != 0) {
				DERR(vswp, "%s: error freeing "
				    "handle for ring 0x%llx "
				    "at pos %d", name, dp, i);
				continue;
			}
		}
		kmem_free(dp->priv_addr,
		    (sizeof (vsw_private_desc_t) * vsw_num_descriptors));
	}

	/* free the data area */
	if (dp->data_addr != NULL) {
		kmem_free(dp->data_addr, dp->data_len);
	}

	/*
	 * Now unbind and destroy the ring itself.
	 */
	if (dp->dring_handle != NULL) {
		(void) ldc_mem_dring_unbind(dp->dring_handle);
		(void) ldc_mem_dring_destroy(dp->dring_handle);
	}

	kmem_free(dp, sizeof (dring_info_t));
	lp->dringp = NULL;
}
/*
* Map the transmit descriptor ring exported
* by the peer, as our receive descriptor ring.
*/
dring_info_t *
vsw_map_rx_dring(vsw_ldc_t *ldcp, void *pkt)
{
	int			rv;
	dring_info_t		*dp;
	vio_dring_reg_msg_t	*dring_pkt = pkt;
	vsw_t			*vswp = ldcp->ldc_vswp;

	dp = vsw_map_dring_cmn(ldcp, dring_pkt);
	if (dp == NULL) {
		return (NULL);
	}

	/* TxDring mode specific initializations */
	ldcp->lane_in.dringp = dp;

	/* Allocate pools of receive mblks */
	rv = vsw_init_multipools(ldcp, vswp);
	if (rv != 0) {
		/*
		 * We do not return failure if receive mblk pools can't
		 * be allocated; instead allocb(9F) will be used to
		 * dynamically allocate buffers during receive.
		 */
		DWARN(vswp, "%s: unable to create free mblk pools for"
		    " channel %ld (rv %d)", __func__, ldcp->ldc_id, rv);
	}

	return (dp);
}
/*
* Unmap the receive descriptor ring.
*/
void
vsw_unmap_rx_dring(vsw_ldc_t *ldcp)
{
	vio_mblk_pool_t	*fvmp = NULL;
	vsw_t		*vswp = ldcp->ldc_vswp;
	lane_t		*lp = &ldcp->lane_in;
	dring_info_t	*dp;

	if ((dp = lp->dringp) == NULL) {
		return;
	}

	/*
	 * If we can't destroy all the rx pools for this channel,
	 * dispatch a task to retry and clean up those rx pools. Note
	 * that we don't need to wait for the task to complete. If the
	 * vsw device itself gets detached (vsw_detach()), it will wait
	 * for the task to complete implicitly in ddi_taskq_destroy().
	 */
	vio_destroy_multipools(&ldcp->vmp, &fvmp);
	if (fvmp != NULL) {
		(void) ddi_taskq_dispatch(vswp->rxp_taskq,
		    vsw_destroy_rxpools, fvmp, DDI_SLEEP);
	}

	/* unmap the dring itself */
	if (dp->dring_handle != NULL) {
		(void) ldc_mem_dring_unmap(dp->dring_handle);
	}

	kmem_free(dp, sizeof (dring_info_t));
	lp->dringp = NULL;
}
static int
vsw_init_multipools(vsw_ldc_t *ldcp, vsw_t *vswp)
{
	size_t		data_sz;
	int		rv;
	uint32_t	sz1 = 0;
	uint32_t	sz2 = 0;
	uint32_t	sz3 = 0;
	uint32_t	sz4 = 0;

	/*
	 * We round up the mtu specified to be a multiple of 2K to limit the
	 * number of rx buffer pools created for a given mtu.
	 */
	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
	data_sz = VNET_ROUNDUP_2K(data_sz);

	/*
	 * If pool sizes are specified, use them. Note that the presence of
	 * the first tunable will be used as a hint.
	 */
	if (vsw_mblk_size1 != 0) {
		sz1 = vsw_mblk_size1;
		sz2 = vsw_mblk_size2;
		sz3 = vsw_mblk_size3;
		sz4 = vsw_mblk_size4;

		if (sz4 == 0) { /* need 3 pools */
			ldcp->max_rxpool_size = sz3;
			rv = vio_init_multipools(&ldcp->vmp,
			    VSW_NUM_VMPOOLS, sz1, sz2, sz3,
			    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
		} else {
			ldcp->max_rxpool_size = sz4;
			rv = vio_init_multipools(&ldcp->vmp,
			    VSW_NUM_VMPOOLS + 1, sz1, sz2, sz3, sz4,
			    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
			    vsw_num_mblks4);
		}
		return (rv);
	}

	/*
	 * Pool sizes are not specified. We select the pool sizes based on the
	 * mtu if vsw_jumbo_rxpools is enabled.
	 */
	if (vsw_jumbo_rxpools == B_FALSE || data_sz == VNET_2K) {
		/*
		 * Receive buffer pool allocation based on mtu is disabled.
		 * Use the default mechanism of standard size pool allocation.
		 */
		sz1 = 128;
		sz2 = 256;
		sz3 = 2048;
		ldcp->max_rxpool_size = sz3;

		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS,
		    sz1, sz2, sz3,
		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
		return (rv);
	}

	switch (data_sz) {
	case VNET_4K:
		sz1 = 128;
		sz2 = 256;
		sz3 = 2048;
		sz4 = 4096;			/* data_sz */
		ldcp->max_rxpool_size = sz4;
		break;
	default:	/* data_sz: 4K+ to 16K */
		sz1 = 256;
		sz2 = 2048;
		sz3 = data_sz >> 1;		/* mtu / 2 */
		sz4 = data_sz;			/* mtu */
		ldcp->max_rxpool_size = sz4;
		break;
	}

	rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS + 1,
	    sz1, sz2, sz3, sz4, vsw_num_mblks1, vsw_num_mblks2,
	    vsw_num_mblks3, vsw_num_mblks4);

	return (rv);
}
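/*
 * Worked example (illustrative note, not original source): with the
 * default ~1500-byte mtu, data_sz rounds up to 2K and the three standard
 * pools (128, 256 and 2048 bytes) are used. With vsw_jumbo_rxpools
 * enabled and a 9000-byte mtu, data_sz rounds up to 10K and four pools of
 * 256, 2048, 5K (data_sz / 2) and 10K (data_sz) bytes are created, so
 * most inbound frames land in a pool close to their actual size.
 */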
/*
* Generic routine to send message out over ldc channel.
*
* It is possible that when we attempt to write over the ldc channel
* that we get notified that it has been reset. Depending on the value
* of the handle_reset flag we either handle that event here or simply
* notify the caller that the channel was reset.
*/
int
vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
{
	vio_msg_tag_t	*tag = (vio_msg_tag_t *)msgp;
	size_t		msglen = size;
	boolean_t	data_msg = B_FALSE;
	int		retries = vsw_wretries;
	int		rv;

	mutex_enter(&ldcp->ldc_txlock);

	/* stamp dring data messages with the next tx sequence number */
	if (tag->vio_subtype == VIO_SUBTYPE_INFO) {
		if (tag->vio_subtype_env == VIO_DRING_DATA) {
			((vio_dring_msg_t *)tag)->seq_num =
			    ldcp->lane_out.seq_num;
			data_msg = B_TRUE;
		}
	}

	do {
		msglen = size;
		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
	} while (rv == EWOULDBLOCK && --retries > 0);

	if (data_msg && rv == 0) {
		ldcp->lane_out.seq_num++;
	}

	if ((rv != 0) || (msglen != size)) {
		ldcp->ldc_stats.oerrors++;
	}

	mutex_exit(&ldcp->ldc_txlock);

	/*
	 * If channel has been reset we either handle it here or
	 * simply report back that it has been reset and let caller
	 * decide what to do.
	 */
	if (rv == ECONNRESET) {
		if (handle_reset) {
			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
		}
	}

	return (rv);
}
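/*
 * Illustrative usage sketch (an assumption, not original code): how the
 * handle_reset flag of vsw_send_msg() is used by callers. Data-path code
 * such as vsw_process_dringdata() passes B_FALSE and checks the return
 * value for ECONNRESET itself, while callers with no recovery logic of
 * their own pass B_TRUE so that vsw_send_msg() initiates connection reset
 * handling on their behalf. The helper name is hypothetical.
 */
static int
vsw_sketch_send_stopped_ack(vsw_ldc_t *ldcp, vio_dring_msg_t *dmsg)
{
	dmsg->tag.vio_subtype = VIO_SUBTYPE_ACK;
	dmsg->dring_process_state = VIO_DP_STOPPED;

	/* B_FALSE: the caller deals with a channel reset itself */
	return (vsw_send_msg(ldcp, (void *)dmsg,
	    sizeof (vio_dring_msg_t), B_FALSE));
}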
/*
* A per LDC worker thread to process ldc messages. This thread is woken up by
* the LDC interrupt handler to process LDC packets and receive data.
*/
void
vsw_ldc_msg_worker(void *arg)
{
"vsw_msg_thread");
/*
* Wait until the data is received or a stop
* request is received.
*/
while (!(ldcp->msg_thr_flags &
(VSW_WTHR_DATARCVD | VSW_WTHR_STOP))) {
}
/*
* First process the stop request.
*/
break;
}
}
/*
* Update the run status and wakeup the thread that
* has sent the stop request.
*/
thread_exit();
}
/* Co-ordinate with msg processing thread to stop it */
void
vsw_stop_msg_thread(vsw_ldc_t *ldcp)
{
	kt_did_t	tid = 0;

	/*
	 * Send a stop request by setting the stop flag and
	 * wait until the msg process thread stops.
	 */
	mutex_enter(&ldcp->msg_thr_lock);
	if (ldcp->msg_thread != NULL) {
		tid = ldcp->msg_thread->t_did;
		ldcp->msg_thr_flags |= VSW_WTHR_STOP;
		cv_signal(&ldcp->msg_thr_cv);
	}
	mutex_exit(&ldcp->msg_thr_lock);

	if (tid != 0) {
		thread_join(tid);
	}
}
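/*
 * Illustrative sketch (an assumption, not original code): the wakeup side
 * of the worker-thread protocol above. The LDC interrupt handler in
 * vsw_ldc.c sets VSW_WTHR_DATARCVD under msg_thr_lock and signals
 * msg_thr_cv, which wakes vsw_ldc_msg_worker() to call vsw_process_pkt().
 * The helper name is hypothetical; the field names follow the
 * msg_thr_flags convention used above.
 */
static void
vsw_sketch_wake_msg_thread(vsw_ldc_t *ldcp)
{
	mutex_enter(&ldcp->msg_thr_lock);
	ldcp->msg_thr_flags |= VSW_WTHR_DATARCVD;
	cv_signal(&ldcp->msg_thr_cv);
	mutex_exit(&ldcp->msg_thr_lock);
}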
/*
* Send packet out via descriptor ring to a logical device.
*/
int
vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
{
	vio_dring_msg_t		dring_pkt;
	dring_info_t		*dp = NULL;
	vsw_private_desc_t	*priv_desc = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
	mblk_t			*bp;
	caddr_t			bufp;
	size_t			n;
	size_t			size = msgsize(mp);
	int			idx;
	int			status = LDC_TX_SUCCESS;
/* TODO: make test a macro */
return (LDC_TX_FAILURE);
}
return (LDC_TX_FAILURE);
}
return (LDC_TX_FAILURE);
}
/*
* Find a free descriptor
*
* Note: for the moment we are assuming that we will only
* have one dring going from the switch to each of its
* peers. This may change in the future.
*/
		/* nothing more we can do */
		status = LDC_TX_NORESOURCES;
		ldcp->ldc_stats.tx_no_desc++;
		goto vsw_dringsend_free_exit;
	} else {
		/* copy data into the descriptor */
		bufp = priv_desc->datap;
		bufp += VNET_IPALIGN;
		for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
			n = MBLKL(bp);
			bcopy(bp->b_rptr, bufp, n);
			bufp += n;
		}
		priv_desc->datalen = (size < (size_t)ETHERMIN) ?
		    ETHERMIN : size;
	}
	/* update statistics */
	if (IS_BROADCAST(ehp))
		ldcp->ldc_stats.brdcstxmt++;
	else if (IS_MULTICAST(ehp))
		ldcp->ldc_stats.multixmt++;
/*
* Determine whether or not we need to send a message to our
* peer prompting them to read our newly updated descriptor(s).
*/
	mutex_enter(&dp->restart_lock);
	if (dp->restart_reqd) {
		dp->restart_reqd = B_FALSE;
		mutex_exit(&dp->restart_lock);

		/*
		 * Send a vio_dring_msg to peer to prompt them to read
		 * the updated descriptor ring.
		 */
		bzero(&dring_pkt, sizeof (dring_pkt));

		/* Note - for now using first ring */
		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
		dring_pkt.tag.vio_sid = ldcp->local_session;
		dring_pkt.dring_ident = dp->ident;

		/*
		 * If last_ack_recv is -1 then we know we've not
		 * received any acks yet, so this must be the first
		 * msg sent, so set the start to the beginning of the ring.
		 */
		if (dp->last_ack_recv == -1) {
			dring_pkt.start_idx = 0;
		} else {
			dring_pkt.start_idx =
			    (dp->last_ack_recv + 1) % dp->num_descriptors;
		}
		dring_pkt.end_idx = -1;

		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
		    sizeof (vio_dring_msg_t), B_TRUE);
		return (status);
	} else {
		mutex_exit(&dp->restart_lock);
	}

vsw_dringsend_free_exit:
	return (status);
}
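/*
 * Note (illustrative, derived from the code above and the ACK handling in
 * vsw_process_dringdata() below): restart_reqd is the transmit-side half
 * of a simple flow-control handshake. It is cleared above when we prompt
 * the peer with a dring-data INFO message, and set again when the peer's
 * STOPPED ACK indicates it has drained the ring and will need another
 * prompt before it processes new descriptors.
 */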
/*
* Searches the private section of a ring for a free descriptor,
* starting at the location of the last free descriptor found
* previously.
*
* Returns 0 if free descriptor is available, and updates state
* of private descriptor to VIO_DESC_READY, otherwise returns 1.
*
* FUTURE: might need to return contiguous range of descriptors
* as dring info msg assumes all will be contiguous.
*/
int
vsw_dring_find_free_desc(dring_info_t *dringp,
    vsw_private_desc_t **priv_p, int *idx)
{
	vsw_private_desc_t	*addr;
	int			num = vsw_num_descriptors;
	int			ret = 1;

	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;

	mutex_enter(&addr->dstate_lock);
	if (addr->dstate == VIO_DESC_FREE) {
		addr->dstate = VIO_DESC_READY;
		*priv_p = addr;
		*idx = dringp->end_idx;
		dringp->end_idx = (dringp->end_idx + 1) % num;
		ret = 0;
	}
	mutex_exit(&addr->dstate_lock);

	/* ring full */
	if (ret == 1) {
		D2(NULL, "%s: no free descriptor found", __func__);
	}

	return (ret);
}
/* vsw_reclaim_dring -- reclaim descriptors marked DONE by the peer */
int
vsw_reclaim_dring(dring_info_t *dp, int start)
{
	int			i, j, len;
	vsw_private_desc_t	*priv_addr;
	vnet_public_desc_t	*pub_addr;

	len = dp->num_descriptors;

	j = 0;
	for (i = start; j < len; i = (i + 1) % len, j++) {
		pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
		priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;

		mutex_enter(&priv_addr->dstate_lock);
		if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
			mutex_exit(&priv_addr->dstate_lock);
			break;
		}

		/* clear all the fields */
		pub_addr->hdr.dstate = VIO_DESC_FREE;
		priv_addr->dstate = VIO_DESC_FREE;
		priv_addr->datalen = 0;
		pub_addr->hdr.ack = 0;
		mutex_exit(&priv_addr->dstate_lock);
	}
	return (j);
}
void
vsw_process_dringdata(void *arg, void *dpkt)
{
	vsw_ldc_t		*ldcp = (vsw_ldc_t *)arg;
	vsw_t			*vswp = ldcp->ldc_vswp;
	vio_dring_msg_t		*dring_pkt;
	vnet_public_desc_t	desc, *pub_addr = NULL;
	dring_info_t		*dp = NULL;
	mblk_t			*mp = NULL;
	mblk_t			*bp = NULL;
	mblk_t			*bpt = NULL;
	size_t			nbytes = 0;
	uint64_t		chain = 0;
	uint64_t		len;
	uint32_t		pos, range_start, range_end;
	int32_t			end, num, cnt = 0;
	int			rv, rng_rv = 0, msg_rv = 0;
	boolean_t		prev_desc_ack = B_FALSE;
	int			read_attempts = 0;
	struct ether_header	*ehp;

	/*
	 * We know this is a data/dring packet so
	 * cast it into the correct structure.
	 */
	dring_pkt = (vio_dring_msg_t *)dpkt;
	/*
	 * Switch on the vio_subtype. If it's INFO then we need to
	 * process the data. If it's an ACK we need to make sure
	 * it makes sense (i.e. that we sent an earlier data/info msg),
	 * and if it's a NACK then we maybe attempt a retry.
	 */
	switch (dring_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
return;
}
if (end == -1) {
num = -1;
} else if (end >= 0) {
/* basic sanity check */
"ring length %lld", __func__,
return;
}
} else {
return;
}
"descriptor at pos %d: err %d",
break;
}
			/*
			 * When given a bounded range of descriptors
			 * to process, it's an error to hit a descriptor
			 * which is not ready. In the non-bounded case
			 * (end_idx == -1) this simply indicates we have
			 * reached the end of the current active range.
			 */
			/* unbound - no error */
			if (end == -1) {
				if (read_attempts == vsw_recv_retries)
					break;

				/* give the peer a chance to mark it ready */
				delay(drv_usectohz(vsw_recv_delay));
				read_attempts++;
				goto vsw_recheck_desc;
			}
/* bounded - error - so NACK back */
return;
}
/*
* If we ACK'd the previous descriptor then now
* record the new range start position for later
* ACK's.
*/
		if (prev_desc_ack) {
			range_start = pos;
			prev_desc_ack = B_FALSE;
		}
" 0x%llx : dstate 0x%lx : datalen 0x%lx",
/* invalid size; drop the packet */
goto vsw_process_desc_done;
}
		/*
		 * Ensure that we ask ldc for an aligned
		 * number of bytes. Data is padded to align on 8
		 * byte boundary, desc.nbytes is actual data length,
		 * i.e. minus that padding. (A sketch of this rounding
		 * follows at the end of this file.)
		 */
BPRI_MED);
} else {
/*
* No free receive buffers available,
* so fallback onto allocb(9F). Make
* sure that we get a data buffer which
* is a multiple of 8 as this is
* required by ldc_mem_copy.
*/
} else {
}
}
break;
}
if (rv != 0) {
"from %d cookies in desc %d (rv %d)",
break;
} else {
" using %d cookies", __func__,
}
		/* adjust the read pointer to skip over the padding */
		mp->b_rptr += VNET_IPALIGN;

		/* point to the actual end of data */
		mp->b_wptr = mp->b_rptr + desc.nbytes;
}
		/* update statistics */
		ehp = (struct ether_header *)mp->b_rptr;
		if (IS_BROADCAST(ehp))
			ldcp->ldc_stats.brdcstrcv++;
		else if (IS_MULTICAST(ehp))
			ldcp->ldc_stats.multircv++;
		/*
		 * IPALIGN space can be used for VLAN_TAG
		 */
		(void) vsw_vlan_frame_pretag(ldcp->ldc_port,
		    VSW_VNETPORT, mp);
		/* build a chain of received packets */
		if (bp == NULL) {
			/* first pkt */
			bp = mp;
			bp->b_next = bp->b_prev = NULL;
			bpt = bp;
			chain = 1;
		} else {
			mp->b_next = mp->b_prev = NULL;
			bpt->b_next = mp;
			bpt = mp;
			chain++;
		}
/* mark we are finished with this descriptor */
VIO_DESC_DONE)) != 0) {
"dstate at pos %d: err %d",
break;
}
/*
* Send an ACK back to peer if requested.
*/
			msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
			    sizeof (vio_dring_msg_t), B_FALSE);
/*
* Check if ACK was successfully sent. If not
* we break and deal with that below.
*/
if (msg_rv != 0)
break;
range_start = pos;
}
/* next descriptor */
cnt++;
/*
* Break out of loop here and stop processing to
* allow some other network device (or disk) to
* get access to the cpu.
*/
if (chain > vsw_chain_len) {
break;
}
}
		/* send the chain of packets to be switched */
		if (bp != NULL) {
			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
			    ldcp->ldc_port, NULL);
		}
		/*
		 * If we encountered an error when attempting to
		 * access the imported dring, initiate a connection reset.
		 */
		if (rng_rv != 0) {
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			break;
		}

		/*
		 * If, when we attempted to send the ACK, we found that
		 * the channel had been reset, handle it now.
		 */
		if (msg_rv == ECONNRESET) {
			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
			break;
		}
		/*
		 * We are now finished so ACK back with the state
		 * set to STOPPING so our peer knows we are finished.
		 */
		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
		dring_pkt->dring_process_state = VIO_DP_STOPPED;

		/*
		 * We have not processed any more descriptors beyond
		 * the last one we ACK'd.
		 */
		if (prev_desc_ack)
			range_start = range_end;

		dring_pkt->start_idx = range_start;
		dring_pkt->end_idx = range_end;

		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
		    sizeof (vio_dring_msg_t), B_TRUE);
		break;
case VIO_SUBTYPE_ACK:
/*
* Verify that the relevant descriptors are all
* marked as DONE
*/
return;
}
/*
* If our peer is stopping processing descriptors then
* we check to make sure it has processed all the descriptors
* we have updated. If not then we send it a new message
* to prompt it to restart.
*/
		/*
		 * Check next descriptor in public section of ring.
		 * If it's marked as READY then we need to prompt our
		 * peer to start processing the ring again.
		 */
		/*
		 * Hold the restart lock across all of this to
		 * make sure that it's not possible for us to
		 * decide that a msg needs to be sent in the future,
		 * while the sending code, having already checked,
		 * is about to exit.
		 */
		mutex_enter(&dp->restart_lock);
		if (pub_addr->hdr.dstate == VIO_DESC_READY) {
			msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
			    sizeof (vio_dring_msg_t), B_FALSE);
		} else {
			dp->restart_reqd = B_TRUE;
		}
		mutex_exit(&dp->restart_lock);
	}
		if (msg_rv == ECONNRESET)
			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);

		break;
	case VIO_SUBTYPE_NACK:
		/*
		 * Something is badly wrong if we are getting NACKs
		 * for our data pkts. So reset the channel.
		 */
		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
		break;
	default:
		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
	}
}
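/*
 * Illustrative sketch (an assumption, not original code): the receive-side
 * length rounding described in vsw_process_dringdata() above. desc.nbytes
 * is the actual data length, excluding the pad bytes the sender added to
 * align the end of the data on an 8 byte boundary; ldc_mem_copy() must be
 * asked for the padded (8-byte aligned) length, after which the mblk's
 * read/write pointers are adjusted to expose only the real data. The
 * helper name is hypothetical.
 */
static inline uint64_t
vsw_sketch_ldc_copy_len(uint64_t nbytes)
{
	/* round up to the next multiple of 8 for ldc_mem_copy() */
	return ((nbytes + 7) & ~(uint64_t)7);
}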