/*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*/
/*
* Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
* Copyright 2016 Joyent, Inc.
*/
/*
* i40e - Intel 10/40 Gb Ethernet driver
*
* The i40e driver is the main software device driver for the Intel 40 Gb family
* of devices. Note that these devices come in many flavors with both 40 GbE
* ports and 10 GbE ports. This device is the successor to the 82599 family of
* devices (ixgbe).
*
* Unlike previous generations of Intel 1 GbE and 10 GbE devices, the 40 GbE
* devices defined in the XL710 controller (previously known as Fortville) are a
* rather different beast and have a small switch embedded inside of them. In
* addition, the way that most of the programming is done has been overhauled.
* As opposed to just using PCIe memory mapped registers, it also has an
* administrative queue which is used to communicate with firmware running on
* the chip.
*
* Each physical function in the hardware shows up as a device that this driver
* will bind to. The hardware splits many resources evenly across all of the
* physical functions present on the device, while other resources are instead
* shared across the entire card and it's up to the device driver to
* intelligently partition them.
*
* ------------
* Organization
* ------------
*
* This driver is made up of several files which have their own theory
* statements spread across them. We'll touch on the high level purpose of each
* file here, and then we'll get into more discussion on how the device is
* generally modelled with respect to the interfaces in illumos.
*
* i40e_gld.c: This file contains all of the bindings to MAC and the networking
* stack.
*
* i40e_intr.c: This file contains all of the interrupt service routines and
* contains logic to enable and disable interrupts on the hardware.
* It also contains the logic to map hardware resources such as the
* rings to and from interrupts and controls their ability to fire.
*
* There is a big theory statement on interrupts present there.
*
* i40e_main.c: The file that you're currently in. It interfaces with the
* traditional OS DDI interfaces and is in charge of configuring
* the device.
*
* i40e_osdep.[ch]: These files contain interfaces and definitions needed to
* work with Intel's common code for the device.
*
* i40e_stats.c: This file contains the general work and logic around our
* kstats. A theory statement on their organization and use of the
* hardware exists there.
*
* i40e_sw.h: This header file contains all of the primary structure definitions
* and constants that are used across the entire driver.
*
* i40e_transceiver.c: This file contains all of the logic for sending and
* receiving data. It contains all of the ring and DMA
* allocation logic, as well as, the actual interfaces to
* send and receive data.
*
* A big theory statement on ring management, descriptors,
* and how it ties into the OS is present there.
*
* --------------
* General Design
* --------------
*
* Before we go too far into the general way we've laid out data structures and
* the like, it's worth taking some time to explain how the hardware is
* organized. This organization informs a lot of how we do things at this time
* in the driver.
*
* Each physical device consists of one or more ports, which are
* considered physical functions in the PCI sense and thus each get enumerated
* by the system, resulting in an instance being created and attached to. While
* there are many resources that are unique to each physical function, e.g.
* instance of the device, there are many that are shared across all of them.
* Several resources have an amount reserved for each Virtual Station Interface
* (VSI) and then a static pool of resources, available for all functions on the
* card.
*
* The most important resource in hardware are its transmit and receive queue
* pairs (i40e_trqpair_t). These should be thought of as rings in GLDv3
* parlance. There are a set number of these on each device; however, they are
* statically partitioned among all of the different physical functions.
*
* 'Fortville' (the code name for this device family) is basically a switch. To
* map MAC addresses and other things to queues, we end up having to create
* Virtual Station Interfaces (VSIs) and establish forwarding rules that direct
* traffic to a queue. A VSI owns a collection of queues and has a series of
* forwarding rules that point to it. One way to think of this is to treat it
* like MAC does a VNIC. When MAC refers to a group, a collection of rings and
* classification resources, that is a VSI in i40e.
*
* The set of VSIs is shared across the entire device, though there may be some
* amount that are reserved to each PF. Because the GLDv3 does not let us change
* the number of groups dynamically, we instead statically divide this amount
* evenly between all the functions that exist. In addition, we have the same
* problem with the mac address forwarding rules. There are a static number that
* exist shared across all the functions.
*
* To handle both of these resources, what we end up doing is going through and
* determining which functions belong to the same device. Nominally one might do
* this by having a nexus driver; however, a prime requirement for a nexus
* driver is identifying the various children and activating them. While it is
* possible to get this information from NVRAM, we would end up duplicating a
* lot of the PCI enumeration logic. Really, at the end of the day, the device
* doesn't give us the traditional identification properties we want from a
* nexus driver.
*
* Instead, we rely on some properties that are guaranteed to be unique. While
* it might be tempting to leverage the PBA or serial number of the device from
* NVRAM, there is nothing that says that two devices can't be mis-programmed to
* have the same values in NVRAM. Instead, we uniquely identify a group of
* functions based on their parent in the /devices tree, their PCI bus and PCI
* function identifiers. Using either on their own may not be sufficient.
*
* For each unique PCI device that we encounter, we'll create a i40e_device_t.
* From there, because we don't have a good way to tell the GLDv3 about sharing
* resources between everything, we'll end up just dividing the resources
* evenly between all of the functions. Longer term, if we don't have to declare
* to the GLDv3 that these resources are shared, then we'll maintain a pool and
* have each PF allocate from the pool in the device, thus if only two of four
* ports are being used, for example, then all of the resources can still be
* used.
*
* -------------------------------------------
* Transmit and Receive Queue Pair Allocations
* -------------------------------------------
*
* NVRAM ends up assigning each PF its own share of the transmit and receive LAN
* queue pairs, we have no way of modifying it, only observing it. From there,
* it's up to us to map these queues to VSIs and VFs. Since we don't support any
* VFs at this time, we only focus on assignments to VSIs.
*
* We currently statically assign a fixed number of queue pairs to a
* given VSI (e.g. rings to a group). Though in the fullness of time, we want to
* make this something which is fully dynamic and take advantage of documented,
* but not yet available functionality for adding filters based on VXLAN and
* other encapsulation technologies.
*
* -------------------------------------
* Broadcast, Multicast, and Promiscuous
* -------------------------------------
*
* As part of the GLDv3, we need to make sure that we can handle receiving
* broadcast and multicast traffic, as well as enabling promiscuous mode when
* requested. GLDv3 requires that all broadcast and multicast traffic be
* retrieved by the default group, eg. the first one. This is the same thing as
* the default VSI.
*
* To receive broadcast traffic, we enable it through the admin queue, rather
* than use one of our filters for it. For multicast traffic, we reserve a
* certain number of the hash filters and assign them to a given PF. When we
* exceed those, we then switch to using promiscuous mode for multicast traffic.
*
* More specifically, once we exceed the number of filters (indicated because
* the i40e_t`i40e_resources.ifr_nmcastfilt ==
* i40e_t`i40e_resources.ifr_nmcastfilt_used), we then instead need to toggle
* promiscuous mode. If promiscuous mode is toggled then we keep track of the
* number of MACs added to it by incrementing i40e_t`i40e_mcast_promisc_count.
* That will stay enabled until that count reaches zero indicating that we have
* only added multicast addresses that we have a corresponding entry for.
*
* Because MAC itself wants to toggle promiscuous mode, which includes both
* unicast and multicast traffic, we go through and keep track of that
* ourselves. That is maintained through the use of the i40e_t`i40e_promisc_on
* member.
*
* --------------
* VSI Management
* --------------
*
* At this time, we currently only support a single MAC group, and thus a single
* VSI. This VSI is considered the default VSI and should be the only one that
* exists after a reset. Currently it is stored as the member
* i40e_t`i40e_vsi_id. While this works for the moment and for an initial
* driver, it's not sufficient for the longer-term path of the driver. Instead,
* we'll want to actually have a unique i40e_vsi_t structure which is used
* everywhere. Note that this means that every place that uses the
* i40e_t`i40e_vsi_id will need to be refactored.
*
* ----------------
* Structure Layout
* ----------------
*
* The following image relates the core data structures together. The primary
* structure in the system is the i40e_t. It itself contains multiple rings,
* i40e_trqpair_t's which contain the various transmit and receive data. The
* receive data is stored outside of the i40e_trqpair_t and instead in the
* i40e_rx_data_t. The i40e_t has a corresponding i40e_device_t which keeps
* track of per-physical device state. Finally, for every active descriptor,
* there is a corresponding control block, which is where the
* i40e_rx_control_block_t and the i40e_tx_control_block_t come from.
*
* +-----------------------+ +-----------------------+
* | Global i40e_t list | | Global Device list |
* | | +--| |
* | i40e_glist | | | i40e_dlist |
* +-----------------------+ | +-----------------------+
* | v
* | +------------------------+ +-----------------------+
* | | Device-wide Structure |----->| Device-wide Structure |--> ...
* | | i40e_device_t | | i40e_device_t |
* | | | +-----------------------+
* | | dev_info_t * ------+--> Parent in devices tree.
* | | uint_t ------+--> PCI bus number
* | | uint_t ------+--> PCI device number
* | | uint_t ------+--> Number of functions
* | | i40e_switch_rsrcs_t ---+--> Captured total switch resources
* | | list_t ------+-------------+
* | +------------------------+ |
* | ^ |
* | +--------+ |
* | | v
* | +---------------------------+ | +-------------------+
* +->| GLDv3 Device, per PF |-----|-->| GLDv3 Device (PF) |--> ...
* | i40e_t | | | i40e_t |
* | **Primary Structure** | | +-------------------+
* | | |
* | i40e_device_t * --+-----+
* | i40e_state_t --+---> Device State
* | i40e_hw_t --+---> Intel common code structure
* | mac_handle_t --+---> GLDv3 handle to MAC
* | ddi_periodic_t --+---> Link activity timer
* | int (vsi_id) --+---> VSI ID, main identifier
* | i40e_func_rsrc_t --+---> Available hardware resources
* | i40e_switch_rsrc_t * --+---> Switch resource snapshot
* | i40e_sdu --+---> Current MTU
* | i40e_frame_max --+---> Current HW frame size
* | i40e_uaddr_t * --+---> Array of assigned unicast MACs
* | i40e_maddr_t * --+---> Array of assigned multicast MACs
* | i40e_mcast_promisccount --+---> Active multicast state
* | i40e_promisc_on --+---> Current promiscuous mode state
* | kstat_t * --+---> PF kstats
* | kstat_t * --+---> VSI kstats
* | i40e_pf_stats_t --+---> PF kstat backing data
* | i40e_vsi_stats_t --+---> VSI kstat backing data
* | i40e_trqpair_t * --+---------+
* +---------------------------+ |
* |
* v
* +-------------------------------+ +-----------------------------+
* | i40e_trqpair_t | | i40e_trqpair_t |
* + Ring Data Structure | +-----------------------------+
* | |
* | mac_ring_handle_t +--> MAC RX ring handle
* | mac_ring_handle_t +--> MAC TX ring handle
* | i40e_rxq_stat_t --+--> RX Queue stats
* | i40e_txq_stat_t --+--> TX Queue stats
* | uint32_t (tx ring size) +--> TX Ring Size
* | uint32_t (tx free list size) +--> TX Free List Size
* | i40e_dma_buffer_t --------+--> TX Descriptor ring DMA
* | i40e_tx_desc_t * --------+--> TX descriptor ring
* | volatile uint32_t *          +--> TX Write back head
* | uint32_t -------+--> TX ring head
* | uint32_t -------+--> TX ring tail
* | uint32_t -------+--> Num TX desc free
* | i40e_tx_control_block_t * --+--> TX control block array ---+
* | i40e_tx_control_block_t ** --+--> TCB work list ----+
* | i40e_tx_control_block_t ** --+--> TCB free list ---+
* | uint32_t -------+--> Free TCB count |
* | i40e_rx_data_t * -------+--+ v
* +-------------------------------+ | +---------------------------+
* | | Per-TX Frame Metadata |
* | | i40e_tx_control_block_t |
* +--------------------+ | |
* | mblk to transmit <--+--- mblk_t * |
* | type of transmit <--+--- i40e_tx_type_t |
* | TX DMA handle <--+--- ddi_dma_handle_t |
* v TX DMA buffer <--+--- i40e_dma_buffer_t |
* +------------------------------+ +---------------------------+
* | Core Receive Data |
* | i40e_rx_data_t |
* | |
* | i40e_dma_buffer_t --+--> RX descriptor DMA Data
* | i40e_rx_desc_t --+--> RX descriptor ring
* | uint32_t --+--> Next free desc.
* | i40e_rx_control_block_t * --+--> RX Control Block Array ---+
* | i40e_rx_control_block_t ** --+--> RCB work list ---+
* | i40e_rx_control_block_t ** --+--> RCB free list ---+
* +------------------------------+ |
* ^ |
* | +---------------------------+ |
* | | Per-RX Frame Metadata |<---------------+
* | | i40e_rx_control_block_t |
* | | |
* | | mblk_t * ----+--> Received mblk_t data
* | | uint32_t ----+--> Reference count
* | | i40e_dma_buffer_t ----+--> Receive data DMA info
* | | frtn_t ----+--> mblk free function info
* +-----+-- i40e_rx_data_t * |
* +---------------------------+
*
* -------------
* Lock Ordering
* -------------
*
* In order to ensure that we don't deadlock, the following represents the
* lock order being used. When grabbing locks, follow the following order. Lower
* numbers are more important. Thus, the i40e_glock which is number 0, must be
* taken before any other locks in the driver. On the other hand, the
* i40e_t`i40e_stat_lock, has the highest number because it's the least
* important lock. Note, that just because one lock is higher than another does
* not mean that all intermediary locks are required.
*
* 0) i40e_glock
* 1) i40e_t`i40e_general_lock
*
* 2) i40e_trqpair_t`itrq_rx_lock
* 3) i40e_trqpair_t`itrq_tx_lock
* 4) i40e_t`i40e_rx_pending_lock
* 5) i40e_trqpair_t`itrq_tcb_lock
*
* 6) i40e_t`i40e_stat_lock
*
* Rules and expectations:
*
* 1) A thread holding locks belonging to one PF should not hold locks belonging
* a second. If for some reason this becomes necessary, locks should be grabbed
* based on the list order in the i40e_device_t, which implies that the
* i40e_glock is held.
*
* 2) When grabbing locks between multiple transmit and receive queues, the
* locks for the lower-numbered queue should be grabbed first.
*
* 3) When grabbing both the transmit and receive lock for a given queue, always
* grab i40e_trqpair_t`itrq_rx_lock before the i40e_trqpair_t`itrq_tx_lock.
*
* 4) The following pairs of locks are not expected to be held at the same time:
*
* o i40e_t`i40e_rx_pending_lock and i40e_trqpair_t`itrq_tcb_lock
*
* -----------
* Future Work
* -----------
*
* At the moment the i40e_t driver is rather bare bones, allowing us to start
* getting data flowing and folks using it while we develop additional features.
* While bugs have been filed to cover this future work, the following gives an
* overview of expected work:
*
* o TSO support
* o RSS / multiple ring support
* o Multiple group support
* o DMA binding and breaking up the locking in ring recycling.
* o Enhanced detection of device errors
* o Participation in IRM
* o FMA device reset
* o Stall detection, temperature error detection, etc.
* o More dynamic resource pools
*/
#include "i40e_sw.h"
/*
* The i40e_glock primarily protects the lists below and the i40e_device_t
* structures.
*/
/*
* Access attributes for register mapping.
*/
};
/*
* Logging function for this driver.
*/
static void
{
} else {
buf);
}
}
/*
* Because there's the stupid trailing-comma problem with the C preprocessor
* and variable arguments, I need to instantiate these. Pardon the redundant
* code.
*/
/*PRINTFLIKE2*/
void
{
}
/*PRINTFLIKE2*/
void
{
}
/*PRINTFLIKE2*/
void
{
}
static void
{
return;
}
}
static i40e_device_t *
{
break;
}
}
} else {
/*
* The Intel common code doesn't exactly keep the number of PCI
* functions. But it calculates it during discovery of
* partitions and ports. So what we do is undo the calculation
* that it does originally, as functions are evenly spread
* across ports in the rare case of partitions.
*/
}
return (idp);
}
static void
{
return;
}
/*
* This is a basic link check routine. Mostly we're using this just to see
* if we can get any accurate information about the state of the link being
* up or down, as well as updating the link state, speed, etc. information.
*/
void
{
int ret;
return;
}
/*
* Firmware abstracts all of the mac and phy information for us, so we
* can use i40e_get_link_status to determine the current state.
*/
/*
* Translate from an i40e value to a value in Mbits/s.
*/
switch (speed) {
case I40E_LINK_SPEED_100MB:
break;
case I40E_LINK_SPEED_1GB:
break;
case I40E_LINK_SPEED_10GB:
break;
case I40E_LINK_SPEED_20GB:
break;
case I40E_LINK_SPEED_40GB:
break;
default:
i40e->i40e_link_speed = 0;
break;
}
/*
* At this time, hardware does not support half-duplex
* operation, hence why we don't ask the hardware about our
* current speed.
*/
} else {
i40e->i40e_link_speed = 0;
i40e->i40e_link_duplex = 0;
}
}
static void
{
int i, rc;
for (i = 0; i < i40e->i40e_intr_count; i++) {
if (rc != DDI_SUCCESS) {
i, rc);
}
}
}
static void
{
int i, rc;
for (i = 0; i < i40e->i40e_intr_count; i++) {
if (rc != DDI_SUCCESS) {
i, rc);
}
}
}
/*
* illumos Fault Management Architecture (FMA) support.
*/
int
{
return (de.fme_status);
}
int
{
return (de.fme_status);
}
/*
* Fault service error handling callback function.
*/
/* ARGSUSED */
static int
{
return (err->fme_status);
}
static void
{
if (i40e->i40e_fm_capabilities < 0) {
i40e->i40e_fm_capabilities = 0;
}
/*
* Only register with IO Fault Services if we have some capability
*/
} else {
}
if (i40e->i40e_fm_capabilities) {
}
i40e_fm_error_cb, (void*)i40e);
}
}
} else {
}
}
static void
{
if (i40e->i40e_fm_capabilities) {
}
}
void
{
}
}
/*
* Here we're trying to get the ID of the default VSI. In general, when we come
* through and look at this shortly after attach, we expect there to only be a
* single element present, which is the default VSI. Importantly, each PF seems
* to not see any other devices, in part because of the simple switch mode that
* we're using. If for some reason, we see more artifact, we'll need to revisit
* what we're doing here.
*/
static int
{
int rc;
/* LINTED: E_BAD_PTR_CAST_ALIGN */
NULL);
if (rc != I40E_SUCCESS) {
return (-1);
}
"during attach, not proceeding",
return (-1);
}
}
/*
* We need to fill the i40e_hw_t structure with the capabilities of this PF. We
* must also provide the memory for it; however, we don't need to keep it around
* to the call to the common code. It takes it and parses it into an internal
* structure.
*/
static boolean_t
{
int rc;
for (;;) {
nelems == I40E_HW_CAP_DEFAULT) {
"due to byzantine common code");
return (B_FALSE);
}
continue;
} else if (rc != I40E_SUCCESS ||
return (B_FALSE);
}
break;
}
return (B_TRUE);
}
/*
* Obtain the switch's capabilities as seen by this PF and keep it around for
* our later use.
*/
static boolean_t
{
for (;;) {
if (size > UINT16_MAX)
return (B_FALSE);
if (ret == I40E_ERR_ADMIN_QUEUE_ERROR &&
continue;
} else if (ret != I40E_SUCCESS) {
"failed to retrieve switch statistics: %d", ret);
return (B_FALSE);
}
break;
}
return (B_TRUE);
}
static void
{
}
}
}
}
static boolean_t
{
int *regs, i;
return (B_FALSE);
}
if (nregs < 1) {
return (B_FALSE);
}
return (B_FALSE);
}
/*
* To calculate the total amount of a resource we have available, we
* need to add how many our i40e_t thinks it has guaranteed, if any, and
* then we need to go through and divide the number of available on the
* device, which was snapshotted before anyone should have allocated
* anything, and use that to derive how many are available from the
* pool. Longer term, we may want to turn this into something that's
* more of a pool-like resource that everything can share (though that
* may require some more assistance from MAC).
*
* Though for transmit and receive queue pairs, we just have to ask
* firmware instead.
*/
for (i = 0; i < i40e->i40e_switch_rsrc_actual; i++) {
switch (srp->resource_type) {
break;
break;
break;
default:
break;
}
}
for (i = 0; i < idp->id_rsrcs_act; i++) {
switch (srp->resource_type) {
break;
break;
default:
break;
}
}
/*
* Initialize these as multicast addresses to indicate it's invalid for
* sanity purposes. Think of it like 0xdeadbeef.
*/
return (B_TRUE);
}
static boolean_t
{
int i, rc;
if (rc != DDI_SUCCESS) {
rc);
return (B_FALSE);
}
} else {
for (i = 0; i < i40e->i40e_intr_count; i++) {
if (rc != DDI_SUCCESS) {
"Failed to enable interrupt %d: %d", i, rc);
while (--i >= 0) {
(void) ddi_intr_disable(
i40e->i40e_intr_handles[i]);
}
return (B_FALSE);
}
}
}
return (B_TRUE);
}
static boolean_t
{
int i, rc;
if (rc != DDI_SUCCESS) {
"Interrupt block-disabled failed: %d", rc);
return (B_FALSE);
}
} else {
for (i = 0; i < i40e->i40e_intr_count; i++) {
if (rc != DDI_SUCCESS) {
"Failed to disable interrupt %d: %d",
i, rc);
return (B_FALSE);
}
}
}
return (B_TRUE);
}
/*
* Free receive & transmit rings.
*/
static void
{
int i;
for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
/*
* etc.
*/
}
}
}
/*
* Allocate transmit and receive rings, as well as other data structures that we
* need.
*/
static boolean_t
{
int i;
/*
* Now that we have the priority for the interrupts, initialize
* all relevant locks.
*/
for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
itrq->itrq_index = i;
}
return (B_TRUE);
}
/*
* Unless a .conf file already overrode i40e_t structure values, they will
* be 0, and need to be set in conjunction with the now-available HW report.
*
* However, at the moment, we cap all of these resources as we only support a
* single receive ring and a single group.
*/
/* ARGSUSED */
static void
{
if (i40e->i40e_num_trqpairs == 0) {
}
if (i40e->i40e_num_rx_groups == 0) {
}
}
/*
* Free any resources required by, or setup by, the Intel common code.
*/
static void
{
int rc;
if (rc != I40E_SUCCESS)
if (rc != I40E_SUCCESS)
}
/*
* Initialize and call Intel common-code routines, includes some setup
* the common code expects from the driver. Also prints on failure, so
* the caller doesn't have to.
*/
static boolean_t
{
int rc;
if (rc != 0) {
return (B_FALSE);
}
if (rc != 0) {
return (B_FALSE);
}
if (rc != 0) {
"%d, potential firmware version mismatch", rc);
return (B_FALSE);
}
"version of the NVM image (%d.%d) than expected (%d.%d).\n"
"Please install the most recent version of the network "
" version of the NVM image (%d.%d) than expected (%d.%d)."
"\nPlease update the NVM image.\n",
}
/*
* We need to call this so that the common code can discover
* capabilities of the hardware, which it uses throughout the rest.
*/
return (B_FALSE);
}
return (B_FALSE);
}
if (rc != 0) {
"%d", rc);
return (B_FALSE);
}
if (rc != 0) {
"%d", rc);
return (B_FALSE);
}
if (rc != I40E_SUCCESS) {
rc);
return (B_FALSE);
}
if (rc != 0) {
"%d", rc);
return (B_FALSE);
}
I40E_SUCCESS) {
rc);
return (B_FALSE);
}
/*
* We need to obtain the Virtual Station ID (VSI) before we can
* perform other operations on the device.
*/
return (B_FALSE);
}
return (B_TRUE);
}
static void
{
int rc;
(void) i40e_disable_interrupts(i40e);
i40e->i40e_periodic_id != 0) {
i40e->i40e_periodic_id = 0;
}
if (rc != 0) {
rc);
}
}
}
(void) ddi_prop_remove_all(devinfo);
}
}
}
static boolean_t
{
pbanum[0] = '\0';
if (irc != I40E_SUCCESS) {
} else {
"printed-board-assembly", (char *)pbanum);
}
#ifdef DEBUG
#endif
"firmware-version", buf);
"firmware-build", buf);
"api-version", buf);
if (!i40e_set_hw_bus_info(hw))
return (B_FALSE);
return (B_FALSE);
}
return (B_TRUE);
}
static boolean_t
{
/*
* Note that we set the hardware's bus information later on, in
* i40e_get_available_resources(). The common code doesn't seem to
* require that it be set in any ways, it seems to be mostly for
* book-keeping.
*/
/* Call common code to set the MAC type for this adapter. */
return (B_FALSE);
return (B_TRUE);
}
static boolean_t
{
int ret;
DDI_SUCCESS) {
return (B_FALSE);
}
return (B_FALSE);
}
return (B_TRUE);
}
/*
* Update parameters required when a new MTU has been configured. Calculate the
* maximum frame size, as well as, size our DMA buffers which we size in
* increments of 1K.
*/
void
{
sizeof (struct ether_vlan_header) + ETHERFCSL;
}
static int
{
int val;
return (val);
}
static void
{
}
}
if (!i40e->i40e_mr_enable) {
}
}
/*
* There are a few constraints on interrupts that we're currently imposing, some
* of which are restrictions from hardware. For a fuller treatment, see
* i40e_intr.c.
*
* Currently, to use MSI-X we require two interrupts be available though in
* theory we should participate in IRM and happily use more interrupts.
*
* Hardware only supports a single MSI being programmed and therefore if we
* don't have MSI-X interrupts available at this time, then we ratchet down the
* number of rings and groups available. Obviously, we only bother with a single
* fixed interrupt.
*/
static boolean_t
{
switch (intr_type) {
case DDI_INTR_TYPE_FIXED:
case DDI_INTR_TYPE_MSI:
request = 1;
min = 1;
break;
case DDI_INTR_TYPE_MSIX:
/*
* At the moment, we always request two MSI-X while we still
* only support a single interrupt. The upper bound on what's
* supported by a given device is defined by MSI_X_PF_N in
* GLPCI_CNF2. When we evolve, we should read it to determine
* what the real max is.
*/
request = 2;
min = 2;
break;
default:
panic("bad interrupt type passed to i40e_alloc_intr_handles: "
"%d", intr_type);
return (B_FALSE);
}
return (B_FALSE);
}
return (B_FALSE);
}
actual = 0;
i40e->i40e_intr_count = 0;
i40e->i40e_intr_count_max = 0;
i40e->i40e_intr_count_min = 0;
if (rc != DDI_SUCCESS) {
goto alloc_handle_fail;
}
goto alloc_handle_fail;
}
/*
* Record the priority and capabilities for our first vector. Once
* we have it, that's our priority until detach time. Even if we
* eventually participate in IRM, our priority shouldn't change.
*/
if (rc != DDI_SUCCESS) {
"Getting interrupt priority failed with %d.", rc);
goto alloc_handle_fail;
}
if (rc != DDI_SUCCESS) {
"Getting interrupt capabilities failed with %d.", rc);
goto alloc_handle_fail;
}
return (B_TRUE);
return (B_FALSE);
}
static boolean_t
{
if (rc != DDI_SUCCESS) {
rc);
return (B_FALSE);
}
i40e->i40e_intr_type = 0;
if ((intr_types & DDI_INTR_TYPE_MSIX) &&
return (B_TRUE);
}
/*
* available due to the fact that the device basically only supports a
* single MSI interrupt.
*/
if ((intr_types & DDI_INTR_TYPE_MSI) &&
return (B_TRUE);
}
if (intr_types & DDI_INTR_TYPE_FIXED) {
return (B_TRUE);
}
return (B_FALSE);
}
/*
* Map different interrupts to MSI-X vectors.
*/
static boolean_t
{
return (B_TRUE);
}
/*
* At the moment, we only have one queue and one interrupt thus both are
* on that one interrupt. However, longer term we need to go back to
* using the ixgbe style map of queues to vectors or walk the linked
* list from the device to know what to go handle. Therefore for the
* moment, since we need to map our single set of rings to the one
* I/O interrupt that exists for MSI-X.
*/
return (B_TRUE);
}
static boolean_t
{
switch (i40e->i40e_intr_type) {
case DDI_INTR_TYPE_MSIX:
if (rc != DDI_SUCCESS) {
(void) ddi_intr_remove_handler(
}
return (B_FALSE);
}
}
break;
case DDI_INTR_TYPE_MSI:
if (rc != DDI_SUCCESS) {
"return %d", rc);
return (B_FALSE);
}
break;
case DDI_INTR_TYPE_FIXED:
if (rc != DDI_SUCCESS) {
" return %d", rc);
return (B_FALSE);
}
break;
default:
/* Cast to pacify lint */
panic("i40e_intr_type %p contains an unknown type: %d",
}
return (B_TRUE);
}
/*
* Perform periodic checks. Longer term, we should be thinking about additional
* things here:
*
* o Stall Detection
* o Temperature sensor detection
* o Device resetting
* o Statistics updating to avoid wraparound
*/
static void
{
}
/*
* Get the hardware state, and scribble away anything that needs scribbling.
*/
static void
{
int rc;
/*
* Try and determine our PHY. Note that we may have to retry and
* delay to detect fiber correctly.
*/
*/
NULL);
if (rc == I40E_ERR_UNKNOWN_PHY) {
i40e_msec_delay(200);
}
if (rc != I40E_SUCCESS) {
if (rc == I40E_ERR_UNKNOWN_PHY) {
"not attaching.");
} else {
}
}
if (rc != I40E_SUCCESS) {
}
/*
* In general, we don't want to mask off (as in stop from being a cause)
* any of the interrupts that the phy might be able to generate.
*/
if (rc != I40E_SUCCESS) {
}
}
/*
* Go through and re-initialize any existing filters that we may have set up for
* this device. Note that we would only expect them to exist if hardware had
* already been initialized and we had just reset it. While we're not
* implementing this yet, we're keeping this around for when we add reset
* capabilities, so this isn't forgotten.
*/
/* ARGSUSED */
static void
{
}
/*
* Configure the hardware for the Virtual Station Interface (VSI). Currently
* we only support one, but in the future we could instantiate more than one
* per attach-point.
*/
static boolean_t
{
int err;
if (err != I40E_SUCCESS) {
return (B_FALSE);
}
/*
* Set the queue and traffic class bits. Keep it simple for now.
*/
return (B_FALSE);
if (err != I40E_SUCCESS) {
return (B_FALSE);
}
return (B_TRUE);
}
/*
* Wrapper to kick the chipset on.
*/
static boolean_t
{
int rc;
i40e_msec_delay(75);
I40E_SUCCESS) {
return (B_FALSE);
}
}
/* Determine hardware state */
/* Initialize mac addresses. */
/*
* Set up the filter control.
*/
if (rc != I40E_SUCCESS) {
return (B_FALSE);
}
return (B_FALSE);
i40e_flush(hw);
return (B_TRUE);
}
/*
* Take care of tearing down the rx ring. See 8.3.3.1.2 for more information.
*/
static void
{
int i;
/*
* Step 1. The interrupt linked list (see i40e_intr.c for more
* information) should have already been cleared before calling this
* function.
*/
#ifdef DEBUG
}
} else {
}
#endif /* DEBUG */
for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
/*
* Step 1. Request the queue by clearing QENA_REQ. It may not be
* set due to unwinding from failures and a partially enabled
* ring set.
*/
if (!(reg & I40E_QRX_ENA_QENA_REQ_MASK))
continue;
}
/*
* Step 2. Wait for the disable to take, by having QENA_STAT in the FPM
* be cleared. Note that we could still receive data in the queue during
* this time. We don't actually wait for this now and instead defer this
* to i40e_shutdown_rings_wait(), after we've interleaved disabling the
* TX queues as well.
*/
}
/*
 * NOTE(review): this is the TX counterpart to the RX teardown above
 * (presumably i40e_shutdown_tx_rings).  The declarator and the register
 * accesses that set SET_QDIS, read QTX_ENA, and clear QENA_REQ are elided;
 * the dangling braces under #ifdef DEBUG are the tail of an elided
 * assertion block.
 */
static void
{
int i;
/*
 * Step 1. The interrupt linked list should already have been cleared.
 */
#ifdef DEBUG
}
} else {
}
#endif /* DEBUG */
for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
/*
 * Step 2. Set the SET_QDIS flag for every queue.
 */
}
/*
 * Step 3. Wait at least 400 usec (can be done once for all queues).
 */
drv_usecwait(500);
for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
/*
 * Step 4. Clear the QENA_REQ flag which tells hardware to
 * quiesce. If QENA_REQ is not already set then that means that
 * we likely already tried to disable this queue.
 */
/* 'reg' is presumably read from the QTX_ENA register -- read elided. */
if (!(reg & I40E_QTX_ENA_QENA_REQ_MASK))
continue;
}
/*
 * Step 5. Wait for all drains to finish. This will be done by the
 * hardware removing the QENA_STAT flag from the queue. Rather than
 * waiting here, we interleave it with all the others in
 * i40e_shutdown_rings_wait().
 */
}
/*
 * Wait for all the rings to be shut down. e.g. Steps 2 and 5 from the above
 * functions.
 */
/*
 * NOTE(review): the declarator, the inner 'try' retry loops, the register
 * reads of QRX_ENA/QTX_ENA into 'reg', and the error-message calls (the
 * stray "i);" lines are their argument tails) are all elided from this
 * fragment.  Returns B_FALSE if any RX or TX queue fails to drop
 * QENA_STAT within the (elided) retry budget.
 */
static boolean_t
{
int i, try;
for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
if ((reg & I40E_QRX_ENA_QENA_STAT_MASK) == 0)
break;
}
if ((reg & I40E_QRX_ENA_QENA_STAT_MASK) != 0) {
i);
return (B_FALSE);
}
if ((reg & I40E_QTX_ENA_QENA_STAT_MASK) == 0)
break;
}
if ((reg & I40E_QTX_ENA_QENA_STAT_MASK) != 0) {
i);
return (B_FALSE);
}
}
return (B_TRUE);
}
/*
 * NOTE(review): declarator elided; presumably i40e_shutdown_rings(i40e_t
 * *i40e), the wrapper that disables the RX and TX rings (those calls are
 * elided here) and then waits for both to quiesce.  'i40e' is used below
 * without a visible declaration.
 */
static boolean_t
{
return (i40e_shutdown_rings_wait(i40e));
}
/*
 * NOTE(review): declarator elided; 'rxd' (an RX data structure with a
 * rxd_ring_size field) is used without a visible declaration, and the
 * per-descriptor programming inside the loop body has been elided.
 * Presumably this initializes every receive descriptor in the ring.
 */
static void
{
int i;
for (i = 0; i < rxd->rxd_ring_size; i++) {
}
}
/*
 * NOTE(review): declarator elided; presumably this programs the RX queue
 * HMC context (clear then set, hence the two 'err' checks).  The calls
 * that assign 'err' and the context-field assignments (including the
 * field that "must be set to 0x1" per Table 8-12, section 8.3.3.2.2) are
 * not visible in this fragment.
 */
static boolean_t
{
int err;
/*
 * This must be set to 0x1, see Table 8-12 in section 8.3.3.2.2.
 */
if (err != I40E_SUCCESS) {
return (B_FALSE);
}
if (err != I40E_SUCCESS) {
return (B_FALSE);
}
return (B_TRUE);
}
/*
 * Take care of setting up the descriptor rings and actually programming the
 * device. See 8.3.3.1.1 for the full list of steps we need to do to enable the
 * rx rings.
 */
/*
 * NOTE(review): declarator elided; 'j', 'reg', and 'hw' are used without
 * visible declarations, the bare "return (B_FALSE);" in the first loop is
 * presumably inside an elided failure branch for Step 2, and the register
 * reads/writes and per-iteration delay in the Step 5 poll loop are elided.
 */
static boolean_t
{
int i;
for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
/*
 * Step 1. Program all receive ring descriptors.
 */
/*
 * Step 2 (comment text elided in this view; presumably programming
 * the queue's HMC context) -- confirm against the complete source.
 */
return (B_FALSE);
/*
 * Step 3. Clear the queue's tail pointer and set it to the end
 * of the space.
 */
/*
 * Step 4. Enable the queue via the QENA_REQ.
 */
}
/*
 * Note, we wait for every queue to be enabled before we start checking.
 * This will hopefully cause most queues to be enabled at this point.
 */
for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
/*
 * Step 5. Verify that QENA_STAT has been set. It's promised
 * that this should occur within about 10 us, but like other
 * systems, we give the card a bit more time.
 */
for (j = 0; j < I40E_RING_WAIT_NTRIES; j++) {
if (reg & I40E_QRX_ENA_QENA_STAT_MASK)
break;
}
if ((reg & I40E_QRX_ENA_QENA_STAT_MASK) == 0) {
"out.", i);
return (B_FALSE);
}
}
return (B_TRUE);
}
/*
 * NOTE(review): declarator elided; presumably this programs the TX queue
 * HMC context.  'tctx' (the TX queue context structure) is used without a
 * visible declaration, and the admin-queue / HMC calls that assign 'err'
 * for the three checks below (VSI parameter query, context clear, context
 * set) are not visible in this fragment.
 */
static boolean_t
{
int err;
/*
 * This field isn't actually documented, like crc, but it suggests that
 * it should be zeroed. We leave both of these here because of that for
 * now. We should check with Intel on why these are here even.
 */
tctx.rdylist_act = 0;
/*
 * We're supposed to assign the rdylist field with the value of the
 * traffic class index for the first device. We query the VSI parameters
 * again to get what the handle is. Note that every queue is always
 * assigned to traffic class zero, because we don't actually use them.
 */
if (err != I40E_SUCCESS) {
return (B_FALSE);
}
if (err != I40E_SUCCESS) {
return (B_FALSE);
}
if (err != I40E_SUCCESS) {
return (B_FALSE);
}
return (B_TRUE);
}
/*
 * Take care of setting up the descriptor rings and actually programming the
 * device. See 8.4.3.1.1 for what we need to do here.
 */
/*
 * NOTE(review): declarator elided; 'j', 'reg', and 'hw' are used without
 * visible declarations, the bare "return (B_FALSE);" in the first loop is
 * presumably inside an elided failure branch for Step 2, and the register
 * reads/writes and per-iteration delay in the Step 5 poll loop are elided.
 */
static boolean_t
{
int i;
for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
/*
 * Step 1. Clear the queue disable flag and verify that the
 * index is set correctly.
 */
/*
 * Step 2 (comment text elided in this view; presumably programming
 * the queue's HMC context) -- confirm against the complete source.
 */
return (B_FALSE);
/*
 * Step 3. Verify that it's clear that this PF owns this queue.
 */
i40e_flush(hw);
/*
 * Step 4. Set the QENA_REQ flag.
 */
}
/*
 * Note, we wait for every queue to be enabled before we start checking.
 * This will hopefully cause most queues to be enabled at this point.
 */
for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
/*
 * Step 5. Verify that QENA_STAT has been set. It's promised
 * that this should occur within about 10 us, but like BSD,
 * we'll try for up to 100 ms for this queue.
 */
for (j = 0; j < I40E_RING_WAIT_NTRIES; j++) {
if (reg & I40E_QTX_ENA_QENA_STAT_MASK)
break;
}
if ((reg & I40E_QTX_ENA_QENA_STAT_MASK) == 0) {
"out", i);
return (B_FALSE);
}
}
return (B_TRUE);
}
/*
 * NOTE(review): declarator elided; presumably i40e_stop(i40e_t *i40e,
 * boolean_t free_allocations) given the 'free_allocations' use near the
 * end.  The interrupt-disable, ring-shutdown, cleanup, FM-check
 * (".. DDI_FM_OK) {" tail), and resource-free calls referenced by the
 * big comment below have all been elided, as have the bodies of every
 * loop.  The stray "}" immediately after the comment is the tail of an
 * elided block.  The step numbering in the comment has been corrected
 * (the original repeated "2)" for the third step).
 */
void
{
int i;
/*
 * Shutdown and drain the tx and rx pipeline. We do this using the
 * following steps.
 *
 * 1) Shutdown interrupts to all the queues (trying to keep the admin
 * queue alive).
 *
 * 2) Remove all of the interrupt tx and rx causes by setting the
 * interrupt linked lists to zero.
 *
 * 3) Shutdown the tx and rx rings. Because i40e_shutdown_rings() should
 * wait for all the queues to be disabled, once we reach that point
 * it should be safe to free associated data.
 *
 * 4) Wait 50ms after all that is done. This ensures that the rings are
 * ready for programming again and we don't have to think about this
 * in other parts of the driver.
 *
 * 5) Disable remaining chip interrupts, (admin queue, etc.)
 *
 * 6) Verify that FM is happy with all the register accesses we
 * performed.
 */
}
for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
}
/*
 * We should consider refactoring this to be part of the ring start /
 * stop routines at some point.
 */
for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
}
DDI_FM_OK) {
}
for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
}
for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
}
i40e->i40e_link_speed = 0;
i40e->i40e_link_duplex = 0;
if (free_allocations) {
}
}
/*
 * NOTE(review): the entire declarator is elided; presumably
 * i40e_start(i40e_t *i40e, boolean_t alloc), the counterpart of the stop
 * routine above.  'rc' is returned but never visibly declared or
 * assigned, the ring-memory allocation, ring setup calls inside the
 * loops, the err-producing broadcast/promiscuous admin-queue calls, the
 * FM check tail ("DDI_FM_OK) {"), and the cleanup under the 'done:' label
 * have all been elided.  The typo "handle" in the broadcast comment has
 * been corrected to "handled".
 */
{
int i, err;
if (alloc) {
"Failed to allocate ring memory");
return (B_FALSE);
}
}
/*
 * This should get refactored to be part of ring start and stop at
 * some point, along with most of the logic here.
 */
for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
B_FALSE) {
int j;
/* Unwind the rings that were already set up before the failure. */
for (j = 0; j < i; j++) {
}
return (B_FALSE);
}
}
if (!i40e_chip_start(i40e)) {
goto done;
}
goto done;
}
goto done;
}
/*
 * Enable broadcast traffic; however, do not enable multicast traffic.
 * That's handled exclusively through MAC's mc_multicst routines.
 */
if (err != I40E_SUCCESS) {
goto done;
}
if (err != I40E_SUCCESS) {
goto done;
}
/*
 * Finally, make sure that we're happy from an FM perspective.
 */
DDI_FM_OK) {
goto done;
}
/* Clear state bits prior to final interrupt enabling. */
done:
}
}
return (rc);
}
/*
 * We may have loaned up descriptors to the stack. As such, if we still have
 * them outstanding, then we will not continue with detach.
 */
/*
 * NOTE(review): declarator elided; presumably this waits (with an elided
 * timed cv_wait or delay inside the loop, whose failure path is the
 * visible "return (B_FALSE);") for the stack to return all loaned RX
 * descriptors, giving up after some bound.  Returns B_TRUE once
 * i40e_rx_pending reaches zero.
 */
static boolean_t
{
while (i40e->i40e_rx_pending > 0) {
return (B_FALSE);
}
}
return (B_TRUE);
}
/*
 * NOTE(review): this is i40e_attach (referenced from the dev_ops
 * initializer below).  The declarator line, soft-state allocation, 'i40e'
 * declaration, FM/interrupt/common-code init steps, the "attach_fail:"
 * label and its teardown code (the final "return (DDI_FAILURE);" is its
 * visible remnant), and most error messages have been elided from this
 * fragment.  The visible skeleton is the standard DDI attach(9E)
 * sequence: identify hardware, map registers, allocate rings, wire up
 * interrupts, register with MAC, then enable interrupts.
 */
static int
{
int instance;
if (cmd != DDI_ATTACH)
return (DDI_FAILURE);
goto attach_fail;
}
if (!i40e_identify_hardware(i40e)) {
goto attach_fail;
}
if (!i40e_regs_map(i40e)) {
goto attach_fail;
}
goto attach_fail;
/*
 * When we participate in IRM, we should make sure that we register
 * ourselves with it before callbacks.
 */
goto attach_fail;
}
if (!i40e_alloc_trqpairs(i40e)) {
"Failed to allocate receive & transmit rings.");
goto attach_fail;
}
if (!i40e_map_intrs_to_vectors(i40e)) {
goto attach_fail;
}
if (!i40e_add_intr_handlers(i40e)) {
goto attach_fail;
}
if (!i40e_final_init(i40e)) {
goto attach_fail;
}
DDI_FM_OK) {
goto attach_fail;
}
if (!i40e_stats_init(i40e)) {
goto attach_fail;
}
if (!i40e_register_mac(i40e)) {
goto attach_fail;
}
if (i40e->i40e_periodic_id == 0) {
goto attach_fail;
}
if (!i40e_enable_interrupts(i40e)) {
goto attach_fail;
}
return (DDI_SUCCESS);
return (DDI_FAILURE);
}
/*
 * NOTE(review): this is i40e_detach (referenced from the dev_ops
 * initializer below).  The declarator line, 'i40e' lookup, and the
 * guarded calls whose failure paths produce the two interior
 * "return (DDI_FAILURE);" statements (presumably the RX drain wait and
 * mac_unregister, plus the final unconfigure) have been elided.
 */
static int
{
if (cmd != DDI_DETACH)
return (DDI_FAILURE);
return (DDI_FAILURE);
}
return (DDI_FAILURE);
}
return (DDI_SUCCESS);
}
/*
 * NOTE(review): character/block ops table for the driver; this is the
 * i40e_cb_ops initializer referenced from the dev_ops below, but its
 * opening "static struct cb_ops i40e_cb_ops = {" line and a few interior
 * members (e.g. cb_flag, cb_str) are elided from this fragment.  All
 * entry points are stubbed (nulldev/nodev) since this is a pure
 * MAC-framework NIC driver with no direct device nodes of interest.
 */
nulldev, /* cb_open */
nulldev, /* cb_close */
nodev, /* cb_strategy */
nodev, /* cb_print */
nodev, /* cb_dump */
nodev, /* cb_read */
nodev, /* cb_write */
nodev, /* cb_ioctl */
nodev, /* cb_devmap */
nodev, /* cb_mmap */
nodev, /* cb_segmap */
nochpoll, /* cb_chpoll */
ddi_prop_op, /* cb_prop_op */
NULL, /* cb_stream */
CB_REV, /* cb_rev */
nodev, /* cb_aread */
nodev /* cb_awrite */
};
/*
 * NOTE(review): device operations table; the opening
 * "static struct dev_ops ... = {" line is elided from this fragment.
 * The two stray "};" lines following it are the remnants of the
 * modldrv/modlinkage initializers whose contents were elided -- confirm
 * against the complete source.
 */
DEVO_REV, /* devo_rev */
0, /* devo_refcnt */
NULL, /* devo_getinfo */
nulldev, /* devo_identify */
nulldev, /* devo_probe */
i40e_attach, /* devo_attach */
i40e_detach, /* devo_detach */
nodev, /* devo_reset */
&i40e_cb_ops, /* devo_cb_ops */
NULL, /* devo_bus_ops */
ddi_power, /* devo_power */
ddi_quiesce_not_supported /* devo_quiesce */
};
};
};
/*
 * Module Initialization Functions.
 */
/*
 * NOTE(review): the mac_init_ops()/mod_install() calls that assign
 * 'status' and the failure-path cleanup inside the if-block have been
 * elided from this fragment.
 */
int
_init(void)
{
int status;
if (status != DDI_SUCCESS) {
}
return (status);
}
/*
 * NOTE(review): declarator and body elided; presumably
 * _info(struct modinfo *modinfop) returning mod_info() on the module
 * linkage -- confirm against the complete source.
 */
int
{
}
/*
 * NOTE(review): the mod_remove() call that assigns 'status' and the
 * success-path cleanup (presumably mac_fini_ops()) inside the if-block
 * have been elided from this fragment.
 */
int
_fini(void)
{
int status;
if (status == DDI_SUCCESS) {
}
return (status);
}