ixgbe_tx.c revision 185c5677613512bc5a906decb5034a5135f67fb1
/*
* CDDL HEADER START
*
* Copyright(c) 2007-2009 Intel Corporation. All rights reserved.
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include "ixgbe_sw.h"
/*
* Forward declarations of file-local tx helpers, followed by the
* inline pragmas, which are applied only when IXGBE_DEBUG is not defined.
*
* NOTE(review): only the tails of the prototypes are present below;
* the leading part of each declaration needs to be restored.
*/
uint32_t);
ixgbe_tx_context_t *, size_t);
static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
#ifndef IXGBE_DEBUG
#pragma inline(ixgbe_save_desc)
#pragma inline(ixgbe_get_context)
#pragma inline(ixgbe_check_context)
#pragma inline(ixgbe_fill_context)
#endif
/*
* ixgbe_ring_tx
*
* To transmit one mblk through one specified ring.
*
* One mblk can consist of several fragments, each fragment
* will be processed with different methods based on the size.
* For the fragments with size less than the bcopy threshold,
* they will be processed by using bcopy; otherwise, they will
* be processed by using DMA binding.
*
* To process the mblk, a tx control block is taken from the
* free list. One tx control block contains one tx buffer, which
* is used to copy mblk fragments' data; and one tx DMA handle,
* which is used to bind a mblk fragment with DMA resource.
*
* Several small mblk fragments can be copied into one tx control
* block's buffer, and then the buffer will be transmitted with
* one tx descriptor.
*
* A large fragment only binds with one tx control block's DMA
* handle, and it can span several tx descriptors for transmitting.
*
* So to transmit a packet (mblk), several tx control blocks can
* be used. After the processing, those tx control blocks will
* be put to the work list.
*
* Returns NULL when the mblk has been consumed (transmitted or
* discarded); returns mp when the packet could not be queued and
* the transmit has to be re-scheduled.
*/
mblk_t *
{
int desc_num;
/* Get the mblk size */
mbsize = 0;
}
if (ixgbe->tx_hcksum_enable) {
/*
* Retrieve checksum context information from the mblk
* context descriptor.
*/
ctx = &tx_context;
return (NULL);
}
/*
* If the mblk size exceeds the max size ixgbe could
* process, then discard this mblk, and return NULL.
*/
return (NULL);
}
} else {
}
/*
* Check and recycle tx descriptors.
* The recycle threshold here should be selected carefully
*/
}
/*
* After the recycling, if the tbd_free is less than the
* overload_threshold, assert overload, return mp;
* and we need to re-schedule the tx again.
*/
return (mp);
}
/*
* The pending_list is a linked list that is used to save
* the tx control blocks that have packet data processed
* but have not put the data to the tx descriptor ring.
* It is used to reduce the lock contention of the tx_lock.
*/
desc_num = 0;
desc_total = 0;
/*
* The software should guarantee LSO packet header(MAC+IP+TCP)
* to be within one descriptor. Here we reallocate and refill the
* header if it is not contiguous in physical memory.
*/
/* find the last fragment of the header */
}
/*
* If the header and the payload are in different mblks,
* we simply force the header to be copied into pre-allocated
* page-aligned buffer.
*/
goto adjust_threshold;
/*
* There are two cases we need to reallocate a mblk for the
* last header fragment:
* 1. the header is in multiple mblks and the last fragment
* share the same mblk with the payload
* 2. the header is in a single mblk shared with the payload
* and the header is physical memory non-contiguous
*/
< hdr_len)) {
/*
* reallocate the mblk for the last header fragment,
* expect to bcopy into pre-allocated page-aligned
* buffer
*/
if (!new_mp)
return (mp);
/* link the new header fragment with the other parts */
if (pre_mp)
if (hdr_frag_len == hdr_len)
}
/*
* adjust the bcopy threshold to guarantee
* that the header is processed with bcopy
*/
if (copy_thresh < hdr_len)
}
current_mp = mp;
/*
* Decide which method to use for the first fragment
*/
/*
* If the mblk includes several contiguous small fragments,
* they may be copied into one buffer. This flag is used to
* indicate whether there are pending fragments that need to
* be copied to the current tx buffer.
*
* If this flag is B_TRUE, it indicates that a new tx control
* block is needed to process the next fragment using either
* copy or DMA binding.
*
* Otherwise, it indicates that the next fragment will be
* copied to the current tx buffer that is maintained by the
* current tx control block. No new tx control block is needed.
*/
while (current_mp) {
/*
* When the current fragment is an empty fragment, if
* the next fragment will still be copied to the current
* tx buffer, we cannot skip this fragment here. Because
* the copy processing is pending for completion. We have
* to process this empty fragment in the tx_copy routine.
*
* If the copy processing is completed or a DMA binding
* processing is just completed, we can just skip this
* empty fragment.
*/
if ((current_len == 0) && (copy_done)) {
continue;
}
if (copy_done) {
/*
* Get a new tx control block from the free list
*/
goto tx_failure;
}
/*
* Push the tx control block to the pending list
* to avoid using lock too early
*/
}
if (current_flag == USE_COPY) {
/*
* Check whether to use bcopy or DMA binding to process
* the next fragment, and if using bcopy, whether we
* need to continue copying the next fragment into the
* current tx buffer.
*/
if (eop) {
/*
* This is the last fragment of the packet, so
* the copy processing will be completed with
* this fragment.
*/
/*
* If the next fragment is too large to be
* copied to the current tx buffer, we need
* to complete the current copy processing.
*/
} else if (next_len > copy_thresh) {
/*
* The next fragment needs to be processed with
* DMA binding. So the copy processing will be
* completed with the current fragment.
*/
} else {
/*
* Continue to copy the next fragment to the
* current tx buffer.
*/
}
} else {
/*
* Check whether to use bcopy or DMA binding to process
* the next fragment.
*/
}
/*
* A positive desc_num is added to the running descriptor total;
* a negative desc_num is treated as a failure.
*/
if (desc_num > 0)
desc_total += desc_num;
else if (desc_num < 0)
goto tx_failure;
}
/*
* Attach the mblk to the last tx control block
*/
/*
* 82598/82599 chipset has a limitation that no more than 32 tx
* descriptors can be transmitted out at one time.
*
* Here is a workaround for it: pull up the mblk then send it
* out with bind way. By doing so, no more than MAX_COOKIE (18)
* descriptors are needed.
*/
/*
* Discard the mblk and free the used resources
*/
while (tcb) {
tcb = (tx_control_block_t *)
}
/*
* Return the tx control blocks in the pending list to
* the free list.
*/
/*
* pull up the mblk and send it out with bind way
*/
return (mp);
}
desc_total = 0;
/*
* if the packet is a LSO packet, we simply
* transmit the header in one descriptor using the copy way
*/
goto tx_failure;
}
desc_total += desc_num;
}
goto tx_failure;
}
} else {
mbsize);
}
if (desc_num < 0) {
goto tx_failure;
}
desc_total += desc_num;
}
/*
* Before filling the tx descriptor ring with the data, we need to
* ensure there are adequate free descriptors for transmit
* (including one context descriptor).
*/
}
/*
* If the number of free tx descriptors is not enough for transmit
* then return mp.
*
* Note: we must put this check under the mutex protection to
* ensure the correctness when multiple threads access it in
* parallel.
*/
goto tx_failure;
}
mbsize);
/*
* now that the transmission succeeds, need to free the original
* mp if we used the pulling up mblk for transmission.
*/
if (pull_mp) {
}
return (NULL);
/*
* If transmission fails, need to free the pulling up mblk.
*/
if (pull_mp) {
}
/*
* Discard the mblk and free the used resources
*/
while (tcb) {
tcb = (tx_control_block_t *)
}
/*
* Return the tx control blocks in the pending list to the free list.
*/
/* Transmit failed, do not drop the mblk, reschedule the transmit */
return (mp);
}
/*
* ixgbe_tx_copy
*
* Copy the mblk fragment to the pre-allocated tx buffer
*
* Returns the number of tx descriptors accounted for by this call:
* 1 when copy_done is set (the buffer is complete), otherwise 0.
*/
static int
{
/*
* Copy the packet data of the mblk fragment into the
* pre-allocated tx buffer, which is maintained by the
* tx control block.
*
* Several mblk fragments can be copied into one tx buffer.
* The destination address of the current copied fragment in
* the tx buffer is next to the end of the previous copied
* fragment.
*/
if (len > 0) {
}
desc_num = 0;
/*
* If it is the last fragment copied to the current tx buffer,
* in other words, if there's no remaining fragment or the remaining
* fragment requires a new tx control block to process, we need to
* complete the current copy processing by syncing up the current
* DMA buffer and saving the descriptor data.
*/
if (copy_done) {
/*
* Sync the DMA buffer of the packet data
*/
/*
* Save the address and length to the private data structure
* of the tx control block, which will be used to fill the
* tx descriptor ring after all the fragments are processed.
*/
desc_num++;
}
return (desc_num);
}
/*
* ixgbe_tx_bind
*
* Bind the mblk fragment with DMA
*
* Returns the number of tx descriptors needed for the fragment (one
* per DMA cookie), or -1 if the bind status is not DDI_DMA_MAPPED.
*/
static int
{
int status, i;
int desc_num;
/*
* Use DMA binding to process the mblk fragment
*/
0, &dma_cookie, &ncookies);
if (status != DDI_DMA_MAPPED) {
return (-1);
}
/*
* Each fragment can span several cookies. One cookie will have
* one tx descriptor to transmit.
*/
desc_num = 0;
for (i = ncookies; i > 0; i--) {
/*
* Save the address and length to the private data structure
* of the tx control block, which will be used to fill the
* tx descriptor ring after all the fragments are processed.
*/
desc_num++;
/*
* NOTE(review): the statement guarded by the test below is absent
* here; presumably it fetches the next DMA cookie while more than
* one cookie remains - confirm.
*/
if (i > 1)
}
return (desc_num);
}
/*
* ixgbe_get_context
*
* Get the context information from the mblk
*
* Return values visible in this routine:
* 0 - hckflags is 0 (nothing to retrieve), or the context was parsed
* -1 - LSO was requested without the checksum flags
* -2 - the ether type is neither ETHERTYPE_IP nor ETHERTYPE_IPV6
*/
static int
{
unsigned char *pos;
if (hckflags == 0) {
return (0);
}
/*
* LSO relies on tx h/w checksum, so the packet is dropped here
* if the h/w checksum flag is not declared.
*/
"checksum flags are not specified when doing LSO");
return (-1);
}
}
etype = 0;
mac_hdr_len = 0;
l4_proto = 0;
/*
* Firstly get the position of the ether_type/ether_tpid.
* Here we don't assume the ether (VLAN) header is fully included
* in one mblk fragment, so we go through the fragments to parse
* the ether type.
*/
}
if (etype == ETHERTYPE_VLAN) {
/*
* Get the position of the ether_type in VLAN header
*/
}
mac_hdr_len = sizeof (struct ether_vlan_header);
} else {
mac_hdr_len = sizeof (struct ether_header);
}
/*
* Here we don't assume the IP(V6) header is fully included in
* one mblk fragment.
*/
switch (etype) {
case ETHERTYPE_IP:
}
}
/*
* To perform ixgbe LSO, here also need to fill
* the tcp checksum field of the packet with the
* following pseudo-header checksum:
* (ip_source_addr, ip_destination_addr, l4_proto)
*/
}
}
break;
case ETHERTYPE_IPV6:
}
break;
default:
/* Unrecoverable error */
return (-2);
}
}
} else {
/*
* l4 header length is only required for LSO
*/
l4_hdr_len = 0;
}
return (0);
}
/*
* ixgbe_check_context
*
* Check if a new context descriptor is needed
*
* Returns B_TRUE when a new context descriptor has to be loaded,
* B_FALSE otherwise.
*/
static boolean_t
{
return (B_FALSE);
/*
* Compare the context data retrieved from the mblk and the
* stored data of the last context descriptor. The data need
* to be checked are:
* hcksum_flags
* l4_proto
* mac_hdr_len
* ip_hdr_len
* lso_flag
* mss (only checked for LSO)
* l4_hdr_len (only checked for LSO)
* If any one of the above data has changed, a new context
* descriptor is needed.
*/
return (B_TRUE);
}
return (B_FALSE);
}
/*
* ixgbe_fill_context
*
* Fill the context descriptor with hardware checksum information
*/
static void
{
/*
* Fill the context descriptor with the checksum
* context information we've got.
*/
case IPPROTO_TCP:
break;
case IPPROTO_UDP:
/*
* We don't have to explicitly set:
* ctx_tbd->type_tucmd_mlhl |=
* IXGBE_ADVTXD_TUCMD_L4T_UDP;
* Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
*/
break;
default:
/* Unrecoverable error */
break;
}
}
ctx_tbd->seqnum_seed = 0;
} else {
/* this branch clears the mss/l4len field */
ctx_tbd->mss_l4len_idx = 0;
}
}
/*
* ixgbe_tx_fill_ring
*
* Fill the tx descriptor ring with the data
*
* Returns the number of tx descriptors used; the count includes the
* context descriptor when a new one is loaded.
*/
static int
{
int i;
desc_num = 0;
hcksum_flags = 0;
/*
* Get the index of the first tx descriptor that will be filled,
* and the index of the first work list item that will be attached
* with the first used tx control block in the pending list.
* Note: the two indexes are the same.
*/
/*
* Check if a new context descriptor is needed for this packet
*/
if (load_context) {
/*
* Fill the context descriptor with the
* hardware checksum offload information.
*/
desc_num++;
/*
* Store the checksum context data if
* a new context descriptor is added
*/
}
}
/*
* Fill tx data descriptors with the data saved in the pending list.
* The tx control blocks in the pending list are added to the work list
* at the same time.
*
* The work list is strictly 1:1 corresponding to the descriptor ring.
* One item of the work list corresponds to one tx descriptor. Because
* one tx control block can span multiple tx descriptors, the tx
* control block will be added to the first work list item that
* corresponds to the first tx descriptor generated from that tx
* control block.
*/
desc_num++;
}
/*
* Add the tx control block to the work list
*/
}
if (load_context) {
/*
* Count the context descriptor for
* the first tx control block.
*/
}
/*
* The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
* valid in the first descriptor of the packet.
* The paylen field is set in first_tbd for all parts (MAC types).
* 82599 requires the packet length in paylen field with or without
* LSO and 82598 will ignore it in non-LSO mode.
*/
case ixgbe_mac_82599EB:
} else {
}
break;
case ixgbe_mac_82598EB:
}
break;
default:
break;
}
/* Set hardware checksum bits */
if (hcksum_flags != 0) {
if (hcksum_flags & HCK_IPV4_HDRCKSUM)
if (hcksum_flags & HCK_PARTIALCKSUM)
}
/*
* The last descriptor of packet needs End Of Packet (EOP),
* and Report Status (RS) bits set
*/
/*
* Sync the DMA buffer of the tx descriptor ring
*/
}
/*
* Update the number of the free tx descriptors.
* The mutual exclusion between the transmission and the recycling
* (for the tx descriptor ring and the work list) is implemented
* with the atomic operation on the number of the free tx descriptors.
*
* Note: we should always decrement the counter tbd_free before
* advancing the hardware TDT pointer to avoid the race condition -
* before the counter tbd_free is decremented, the transmit of the
* tx descriptors has done and the counter tbd_free is increased by
* the tx recycling.
*/
ASSERT(i >= 0);
/*
* Advance the hardware TDT pointer of the tx descriptor ring
*/
DDI_FM_OK) {
}
return (desc_num);
}
/*
* ixgbe_save_desc
*
* Save the descriptor data (presumably the buffer address and length
* recorded by the copy and bind paths - confirm) so that it
* will be filled into the tx descriptor ring later.
*/
static void
{
}
/*
* ixgbe_tx_recycle_legacy
*
* Recycle the tx descriptors and tx control blocks.
*
* The work list is traversed to check if the corresponding
* tx descriptors have been transmitted. If so, the resources
* bound to the tx control blocks will be freed, and those
* tx control blocks will be returned to the free list.
*
* Returns the number of tx descriptors recycled; 0 when nothing
* was recycled.
*/
{
int desc_num;
tx_ring->recycle_fail = 0;
tx_ring->stall_watchdog = 0;
if (tx_ring->reschedule) {
}
return (0);
}
/*
* Sync the DMA buffer of the tx descriptor ring
*/
}
desc_num = 0;
/*
* Get the last tx descriptor of this packet.
* If the last tx descriptor is done, then
* we can recycle all descriptors of a packet
* which usually includes several tx control blocks.
* For 82599, LSO descriptors can not be recycled
* unless the whole packet's transmission is done.
* That's why packet level recycling is used here.
* For 82598, there is no such limit.
*/
/*
* MAX_TX_RING_SIZE is used to judge whether
* the index is a valid value or not.
*/
if (last_index == MAX_TX_RING_SIZE)
break;
/*
* Check if the Descriptor Done bit is set
*/
if (desc_done) {
/*
* recycle all descriptors of the packet
*/
/*
* Strip off the tx control block from
* the work list, and add it to the
* pending list.
*/
/*
* Count the total number of the tx
* descriptors recycled
*/
if (prev_index == last_index)
break;
}
} else {
break;
}
}
/*
* If no tx descriptors are recycled, no need to do more processing
*/
if (desc_num == 0) {
/* record the unsuccessful recycle attempt */
tx_ring->recycle_fail++;
return (0);
}
tx_ring->recycle_fail = 0;
tx_ring->stall_watchdog = 0;
/*
* Update the head index of the tx descriptor ring
*/
/*
* Update the number of the free tx descriptors with atomic operations
*/
(tx_ring->reschedule)) {
}
/*
* Free the resources used by the tx control blocks
* in the pending list
*/
/*
* Release the resources occupied by the tx control block
*/
tcb = (tx_control_block_t *)
}
/*
* Add the tx control blocks in the pending list to the free list.
*/
return (desc_num);
}
/*
* ixgbe_tx_recycle_head_wb
*
* Check the head write-back, and recycle all the transmitted
* tx descriptors and tx control blocks.
*
* Returns the number of tx descriptors recycled; 0 when nothing
* was recycled.
*/
{
int desc_num;
/*
* The mutex_tryenter() is used to avoid unnecessary
* lock contention.
*/
tx_ring->recycle_fail = 0;
tx_ring->stall_watchdog = 0;
if (tx_ring->reschedule) {
}
return (0);
}
/*
* Sync the DMA buffer of the tx descriptor ring
*
* Note: For head write-back mode, the tx descriptors will not
* be written back, but the head write-back value is stored at
* the last extra tbd at the end of the DMA area, we still need
* to sync the head write-back value for kernel.
*
* DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
*/
sizeof (uint32_t),
}
desc_num = 0;
/*
* Get the value of head write-back
*/
/*
* The current tx control block is not
* completely transmitted, stop recycling
*/
break;
}
/*
* Strip off the tx control block from the work list,
* and add it to the pending list.
*/
/*
* Advance the index of the tx descriptor ring
*/
/*
* Count the total number of the tx descriptors recycled
*/
}
/*
* If no tx descriptors are recycled, no need to do more processing
*/
if (desc_num == 0) {
/* record the unsuccessful recycle attempt */
tx_ring->recycle_fail++;
return (0);
}
tx_ring->recycle_fail = 0;
tx_ring->stall_watchdog = 0;
/*
* Update the head index of the tx descriptor ring
*/
/*
* Update the number of the free tx descriptors with atomic operations
*/
(tx_ring->reschedule)) {
}
/*
* Free the resources used by the tx control blocks
* in the pending list
*/
while (tcb) {
/*
* Release the resources occupied by the tx control block
*/
tcb = (tx_control_block_t *)
}
/*
* Add the tx control blocks in the pending list to the free list.
*/
return (desc_num);
}
/*
* ixgbe_free_tcb - free up the tx control block
*
* Free the resources of the tx control block, including
* unbinding the previously bound DMA handle, and reset the other
* control fields.
*/
void
{
case USE_COPY:
/*
* Reset the buffer length that is used for copy
*/
break;
case USE_DMA:
/*
* Release the DMA resource that is used for
* DMA binding.
*/
break;
default:
break;
}
/*
* Free the mblk
*/
}
}
/*
* ixgbe_get_free_list - Get a free tx control block from the free list
*
* The atomic operation on the number of the available tx control blocks
* in the free list is used to keep this routine mutually exclusive with
* the routine ixgbe_put_free_list.
*
* Returns the tx control block taken from the free list, or NULL
* (presumably when no free tx control block is available - confirm).
*/
static tx_control_block_t *
{
/*
* Check and update the number of the free tx control blocks
* in the free list.
*/
return (NULL);
return (tcb);
}
/*
* ixgbe_put_free_list
*
* Put a list of used tx control blocks back to the free list
*
* A mutex is used here to ensure the serialization. The mutual exclusion
* between ixgbe_get_free_list and ixgbe_put_free_list is implemented with
* the atomic operation on the counter tcb_free.
*/
void
{
/*
* tcb_num presumably counts the tx control blocks being returned,
* for the free-count update below - confirm
*/
int tcb_num;
tcb_num = 0;
tcb_num++;
}
/*
* Update the number of the free tx control blocks
* in the free list. This operation must be placed
* under the protection of the lock.
*/
}