dld_str.c revision 789e0dbbcdddab55f064dbca13950cb068a30efe
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* Data-Link Driver
*/
#include <sys/dld_impl.h>
#include <sys/dls_impl.h>
static int str_constructor(void *, void *, int);
static void str_destructor(void *, void *);
static void str_notify_promisc_on_phys(dld_str_t *);
static void str_notify_promisc_off_phys(dld_str_t *);
static void str_notify_link_up(dld_str_t *);
static void str_notify_link_down(dld_str_t *);
static void str_notify_capab_reneg(dld_str_t *);
static void str_notify(void *, mac_notify_type_t);
static kmem_cache_t *str_cachep;
static uint32_t minor_count;
static mod_hash_t *str_hashp;
#define STR_HASHSZ 64
/*
* Some notes on entry points, flow-control, queueing and locking:
*
* This driver exports the traditional STREAMS put entry point as well as
* the non-STREAMS fast-path transmit routine which is provided to IP via
* the DL_CAPAB_POLL negotiation. The put procedure handles all control
* and data operations, while the fast-path routine deals only with M_DATA
* fast-path packets. Regardless of the entry point, all outbound packets
* will end up in dld_tx_single(), where they will be delivered to the MAC
* driver.
*
* The transmit logic operates in two modes: a "not busy" mode where the
* packets will be delivered to the MAC for a send attempt, or "busy" mode
* where they will be enqueued in the internal queue because of flow-control.
* Flow-control happens when the MAC driver indicates the packets couldn't
* be transmitted due to lack of resources (e.g. running out of descriptors).
* In such case, the driver will place a dummy message on its write-side
* STREAMS queue so that the queue is marked as "full". Any subsequent
* packets arriving at the driver will be enqueued in the internal queue,
* which is drained in the context of the service thread that gets scheduled
* whenever the driver is in the "busy" mode. When all packets have been
* successfully delivered by MAC and the internal queue is empty, it will
* transition to the "not busy" mode by removing the dummy message from the
* write-side STREAMS queue; in effect this will trigger backenabling.
* The sizes of q_hiwat and q_lowat are set to 1 and 0, respectively, due
* to the above reasons.
*
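* The driver implements an internal transmit queue independent of STREAMS.
* This allows for flexibility and provides a fast enqueue/dequeue mechanism
* compared to the putq() and getq() STREAMS interfaces. The only putq() and
* getq() operations done by the driver are those related to placing and
* removing the dummy message to/from the write-side STREAMS queue for flow-
* control purposes.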
*
* Locking is done independent of STREAMS due to the driver being fully MT.
* Threads entering the driver (either from put or service entry points)
* will most likely be readers, with the exception of a few writer cases
* such as those handling DLPI attach/detach/bind/unbind/etc. or any of the
* DLD-related ioctl requests. The DLPI detach case is special, because
* it involves freeing resources and therefore must be single-threaded.
* Unfortunately the readers/writers lock can't be used to protect against
* it, because the lock is dropped prior to the driver calling places where
* putnext() may be invoked, and such places may depend on those resources
* to exist. Because of this, the driver always completes the DLPI detach
* process when there are no other threads running in the driver. This is
* done by keeping track of the number of threads, such that the last
* thread leaving the driver will finish the pending DLPI detach operation.
*/
/*
* dld_max_q_count is the queue depth threshold used to limit the number of
* outstanding packets or bytes allowed in the queue; once this limit is
* reached the driver will free any incoming ones until the queue depth
* drops below the threshold.
*
* This buffering is provided to accommodate clients which do not employ
* their own buffering scheme, and to handle occasional packet bursts.
* Clients which handle their own buffering will receive positive feedback
* from this driver as soon as it transitions into the "busy" state, i.e.
* when the queue is initially filled up; they will get backenabled once
* the queue is empty.
*
* The value chosen here is rather arbitrary; in future some intelligent
* heuristics may be involved which could take into account the hardware's
* transmit ring size, etc.
*/
/*
* dld_finddevinfo() returns the dev_info_t * corresponding to a particular
* dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
* match dev_t. If a stream is found and it is attached, its dev_info_t *
* is returned.
*/
typedef struct i_dld_str_state_s {
/* ARGSUSED */
static uint_t
{
return (MH_WALK_CONTINUE);
/*
* Access to ds_ppa and ds_mh need to be protected by ds_lock.
*/
/*
* Style 1: minor can be derived from the ppa. we
* continue to walk until we find a matching stream
* in attached state.
*/
return (MH_WALK_TERMINATE);
}
} else {
/*
* Clone: a clone minor is unique. we can terminate the
* walk if we find a matching stream -- even if we fail
* to obtain the devinfo.
*/
return (MH_WALK_TERMINATE);
}
}
return (MH_WALK_CONTINUE);
}
static dev_info_t *
{
return (NULL);
/* See if it's a minor node of a VLAN */
return (dls_finddevinfo(dev));
}
/*
* devo_getinfo: getinfo(9e)
*/
/*ARGSUSED*/
int
{
int rc = DDI_FAILURE;
switch (cmd) {
case DDI_INFO_DEVT2DEVINFO:
rc = DDI_SUCCESS;
}
break;
case DDI_INFO_DEVT2INSTANCE:
rc = DDI_SUCCESS;
} else if (minor > DLD_MAX_MINOR &&
rc = DDI_SUCCESS;
}
break;
}
return (rc);
}
/*
* qi_qopen: open(9e)
*/
/*ARGSUSED*/
int
{
int err;
return (ENOTSUP);
/*
* This is a cloning driver and therefore each queue should only
* ever get opened once.
*/
return (EBUSY);
/*
* Create a new dld_str_t for the stream. This will grab a new minor
* number that will be handed back in the cloned dev_t. Creation may
* fail if we can't allocate the dummy mblk used for flow-control.
*/
return (ENOSR);
if (minor != 0) {
/*
* Style 1 open
*/
goto failed;
goto failed;
} else {
}
/*
* Enable the queue srv(9e) routine.
*/
/*
* Construct a cloned dev_t to hand back.
*/
return (0);
return (err);
}
/*
* qi_qclose: close(9e)
*/
int
{
/*
* Wait until pending requests are processed.
*/
while (dsp->ds_pending_cnt > 0)
/*
* Disable the queue srv(9e) routine.
*/
/*
* At this point we can not be entered by any threads via STREAMS
* or the direct call interface, which is available only to IP.
* After the interface is unplumbed, IP wouldn't have any reference
* to this instance, and therefore we are now effectively single
* threaded and don't require any lock protection. Flush all
* pending packets which are sitting in the transmit queue.
*/
/*
* This stream was open to a provider node. Check to see
* if it has been cleanly shut down.
*/
/*
* The stream is either open to a style 1 provider or
* this is not clean shutdown. Detach from the PPA.
* (This is still ok even in the style 1 case).
*/
}
return (0);
}
/*
* qi_qputp: put(9e)
*/
void
{
case M_DATA:
}
break;
case M_PROTO:
case M_PCPROTO:
break;
case M_IOCTL:
break;
case M_FLUSH:
}
} else {
}
break;
default:
break;
}
}
/*
* qi_srvp: srv(9e)
*/
void
{
/*
* Grab all packets (chained via b_next) off our transmit queue
* and try to send them all to the MAC layer. Since the queue
* is independent of streams, we are able to dequeue all messages
* at once without looping through getq() and manually chaining
* them. Note that the queue size parameters (byte and message
* counts) are cleared as well, but we postpone the backenabling
* until after the MAC transmit since some packets may end up
* back at our transmit queue.
*/
return;
}
/*
* Discard packets unless we are attached and bound; note that
* the driver mode (fastpath/raw/unitdata) is irrelevant here,
* because regardless of the mode all transmit will end up in
* dld_tx_single() where the packets may be queued.
*/
goto done;
}
/*
* Attempt to transmit one or more packets. If the MAC can't
* send them all, re-queue the packet(s) at the beginning of
* the transmit queue to avoid any re-ordering.
*/
done:
/*
* Grab the list lock again and check if the transmit queue is
* really empty; if so, lift up flow-control and backenable any
* writer queues. If the queue is not empty, schedule service
* thread to drain it.
*/
}
}
void
{
struct module_info *modinfo;
}
void
{
struct module_info *modinfo;
}
/*
* Initialize this module's data structures.
*
* Takes no arguments and returns nothing. The comments in the body name
* the two structures involved: the dld_str_t object cache and the hash
* table of dld_str_t's keyed by ds_minor.
*
* NOTE(review): no statements are visible inside this body in the text
* under review; confirm the cache and hash table creation calls against
* the complete file.
*/
void
dld_str_init(void)
{
/*
* Create dld_str_t object cache.
*/
/*
* Create a hash table for maintaining dld_str_t's.
* The ds_minor field (the clone minor number) of a dld_str_t
* is used as a key for this hash table because this number is
* globally unique (allocated from "dls_minor_arena").
*/
}
/*
 * Tear down this module's data structures.
 *
 * Returns EBUSY, leaving everything in place, while either str_count or
 * minor_count is non-zero; returns 0 otherwise.
 */
int
dld_str_fini(void)
{
	/*
	 * Refuse to proceed while any object is still in use or any
	 * minor number is still in use.
	 */
	if (str_count != 0 || minor_count != 0)
		return (EBUSY);

	/*
	 * Destroy object cache.
	 * NOTE(review): no teardown statements are visible at this point
	 * in the text under review; confirm against the complete file.
	 */
	return (0);
}
/*
* Create a new dld_str_t object.
*/
{
int err;
/*
* Allocate an object from the cache.
*/
/*
* Allocate the dummy mblk for flow-control.
*/
return (NULL);
}
/*
* Initialize the queue pointers.
*/
/*
* We want explicit control over our write-side STREAMS queue
*/
return (dsp);
}
/*
* Destroy a dld_str_t object.
*/
void
{
/*
* Clear the queue pointers.
*/
/*
* Reinitialize all the flags.
*/
dsp->ds_notifications = 0;
/*
* Free the dummy mblk if exists.
*/
}
/*
* Free the object back to the cache.
*/
}
/*
* kmem_cache constructor function: see kmem_cache_create(9f).
*/
/*ARGSUSED*/
static int
{
/*
* Allocate a new minor number.
*/
return (-1);
}
/*
* Initialize the DLPI state machine.
*/
return (0);
}
/*
* kmem_cache destructor function.
*/
/*ARGSUSED*/
static void
{
/*
* Make sure the DLPI state machine was reset.
*/
/*
* Make sure the data-link interface was closed.
*/
/*
* Make sure enabled notifications are cleared.
*/
/*
* Make sure polling is disabled.
*/
/*
* Release the minor number.
*/
}
/*
* M_DATA put. Note that mp is a single message, not a chained message.
*/
void
{
/*
* This function can be called from within dld or from an upper
* layer protocol (currently only tcp). If we are in the busy
* mode enqueue the packet(s) and return. Otherwise hand them
* over to the MAC driver for transmission; any remaining one(s)
* which didn't get sent will be queued.
*
* Note here that we don't grab the list lock prior to checking
* the busy flag. This is okay, because a missed transition
* will not cause any packet reordering for any particular TCP
* connection (which is single-threaded). The enqueue routine
* will atomically set the busy flag and schedule the service
* thread to run; the flag is only cleared by the service thread
* when there are no more packets to be transmitted.
*/
}
/*
* Update the priority bits and VID (may need to insert tag if mp points
* to an untagged packet).
* If vid is VLAN_ID_NONE, use the VID encoded in the packet.
*/
static mblk_t *
{
struct ether_vlan_header *evhp;
struct ether_header *ehp;
/*
* Tagged packet, update the priority bits.
*/
len = sizeof (struct ether_vlan_header);
/*
* In case some drivers only check the db_ref
* count of the first mblk, we pullup the
* message into a single mblk.
*/
return (NULL);
} else {
}
}
} else {
/*
* Untagged packet. Insert the special priority tag.
* First allocate a header mblk.
*/
return (NULL);
/*
* Copy the MAC addresses and typelen
*/
/*
* Free the original message if it's now empty. Link the
* rest of messages to the header message.
*/
} else {
}
}
if (pri == 0)
if (vid == VLAN_ID_NONE)
return (mp);
}
/*
* M_DATA put (IP fast-path mode)
*/
void
{
if (is_ethernet) {
/*
* Update the priority bits to the assigned priority.
*/
if (pri != 0) {
goto discard;
}
}
return;
/* TODO: bump kstat? */
}
/*
* M_DATA put (DLIOCRAW mode)
*/
static void
{
/*
* Certain MAC type plugins provide an illusion for raw DLPI
* consumers. They pretend that the MAC layer is something that
* it's not for the benefit of observability tools. For example,
* mac_wifi pretends that it's Ethernet for such consumers.
* Here, unless native mode is enabled, we call into the MAC layer so
* that this illusion can be maintained. The plugin will optionally
* transform the MAC header here into something that can be passed
* down. The header goes from raw mode to "cooked" mode.
*/
goto discard;
}
/*
* Check the packet is not too big and that any remaining
* fragment list is composed entirely of M_DATA messages. (We
* know the first fragment was M_DATA otherwise we could not
* have got here).
*/
goto discard;
}
goto discard;
/*
* If LSO is enabled, check the size against lso_max. Otherwise,
* compare the packet size with sdu_max.
*/
+ mhi.mhi_hdrsize)
goto discard;
if (is_ethernet) {
/*
* Discard the packet if this is a VLAN stream but the VID in
* the packet is not correct.
*/
goto discard;
/*
* Discard the packet if this packet is a tagged packet
* but both pri and VID are 0.
*/
goto discard;
/*
* Update the priority bits to the per-stream priority if
* priority is not set in the packet. Update the VID for
* packets on a VLAN stream.
*/
goto discard;
}
}
}
return;
/* TODO: bump kstat? */
}
/*
* Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
*/
int
{
int err;
const char *drvname;
char name[MAXNAMELEN];
return (EINVAL);
return (EINVAL);
/*
* Open a channel.
*/
return (err);
}
/*
* Cache the MAC interface handle, a pointer to the immutable MAC
* information and the current and 'factory' MAC address.
*/
/*
* Cache the interface VLAN identifier. (This will be VLAN_ID_NONE for
* a non-VLAN interface).
*/
/*
* Set the default packet priority.
*/
/*
* Add a notify function so that we get updates from the MAC.
*/
return (0);
}
/*
* Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
* from close(2) for style 2.
*/
void
{
/*
* Remove the notify function.
*/
/*
* Clear the polling and promisc flags.
*/
dsp->ds_promisc = 0;
/*
* Clear LSO flags.
*/
dsp->ds_lso_max = 0;
/*
* Close the channel.
*/
/*
* Re-initialize the DLPI state machine.
*/
}
/*
* This function is only called for VLAN streams. In raw mode, we strip VLAN
* tags before sending packets up to the DLS clients, with the exception of
* special priority tagged packets, in that case, we set the VID to 0.
* mp must be a VLAN tagged packet.
*/
static mblk_t *
{
struct ether_vlan_header *evhp;
return (NULL);
}
/*
* Priority is 0, strip the tag.
*/
} else {
/*
* Priority is not 0, update the VID to 0.
*/
}
return (mp);
}
/*
* Raw mode receive function.
*/
/*ARGSUSED*/
void
{
do {
/*
* Get the pointer to the next packet in the chain and then
* clear b_next before the packet gets passed on.
*/
/*
* Wind back b_rptr to point at the MAC header.
*/
/*
* Certain MAC type plugins provide an illusion for raw
* DLPI consumers. They pretend that the MAC layer is
* something that it's not for the benefit of observability
* tools. For example, mac_wifi pretends that it's Ethernet
* for such consumers. Here, unless native mode is enabled,
* we call into the MAC layer so that this illusion can be
* maintained. The plugin will optionally transform the MAC
* header here into something that can be passed up to raw
* consumers. The header goes from "cooked" mode to raw mode.
*/
goto next;
}
}
/*
* Strip the VLAN tag for VLAN streams.
*/
goto next;
}
}
/*
* Pass the packet on.
*/
else
next:
/*
* Move on to the next packet in the chain.
*/
}
/*
* Fast-path receive function.
*/
/*ARGSUSED*/
void
{
/*
* MAC header stripping rules:
* - Tagged packets:
* a. VLAN streams. Strip the whole VLAN header including the tag.
* b. Physical streams
* - VLAN packets (non-zero VID). The stream must be either a
* DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
* Strip the Ethernet header but keep the VLAN header.
* - Special tagged packets (zero VID)
* * The stream is either a DL_PROMISC_SAP listener or a
* ETHERTYPE_VLAN listener, strip the Ethernet header but
* keep the VLAN header.
* * Otherwise, strip the whole VLAN header.
* - Untagged packets. Strip the whole MAC header.
*/
offset = VLAN_TAGSZ;
}
do {
/*
* Get the pointer to the next packet in the chain and then
* clear b_next before the packet gets passed on.
*/
/*
* Wind back b_rptr to point at the VLAN header.
*/
/*
* Pass the packet on.
*/
else
/*
* Move on to the next packet in the chain.
*/
}
/*
* Default receive function (send DL_UNITDATA_IND messages).
*/
/*ARGSUSED*/
void
{
/*
* See MAC header stripping rules in the dld_str_rx_fastpath() function.
*/
offset = VLAN_TAGSZ;
}
do {
/*
* Get the pointer to the next packet in the chain and then
* clear b_next before the packet gets passed on.
*/
/*
* Wind back b_rptr to point at the MAC header.
*/
/*
* Create the DL_UNITDATA_IND M_PROTO.
*/
return;
}
/*
* Advance b_rptr to point at the payload (or the VLAN header).
*/
/*
* Prepend the DL_UNITDATA_IND.
*/
/*
* Send the message.
*/
else
/*
* Move on to the next packet in the chain.
*/
}
/*
* Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
* current state of the interface.
*/
void
{
}
typedef struct dl_unitdata_ind_wrapper {
/*
* Create a DL_UNITDATA_IND M_PROTO message.
*/
static mblk_t *
{
/*
* Get the packet header information.
*/
return (NULL);
/*
* Allocate a message large enough to contain the wrapper structure
* defined above.
*/
sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
DL_UNITDATA_IND)) == NULL)
return (NULL);
/*
* Copy in the destination address.
*/
/*
* Set the destination DLSAP to the SAP value encoded in the packet.
*/
else
/*
* If the destination address was multicast or broadcast then the
* dl_group_address field should be non-zero.
*/
/*
* Copy in the source address if one exists. Some MAC types (DL_IB
* for example) may not have access to source information.
*/
} else {
/*
* Set the source DLSAP to the packet ethertype.
*/
}
return (nmp);
}
/*
* DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
*/
static void
{
return;
return;
}
/*
* DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
*/
static void
{
return;
return;
}
/*
* DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
*/
static void
{
return;
return;
}
/*
* DL_NOTIFY_IND: DL_NOTE_LINK_UP
*/
static void
{
return;
return;
}
/*
* DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
*/
static void
{
return;
return;
}
/*
* DL_NOTIFY_IND: DL_NOTE_SPEED
*/
static void
{
return;
return;
}
/*
* DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
*/
static void
{
return;
return;
}
/*
* DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
*/
static void
{
return;
return;
}
/*
* MAC notification callback.
*/
static void
{
switch (type) {
case MAC_NOTE_TX:
qenable(q);
break;
case MAC_NOTE_DEVPROMISC:
/*
* Send the appropriate DL_NOTIFY_IND.
*/
else
break;
case MAC_NOTE_PROMISC:
break;
case MAC_NOTE_UNICST:
/*
* This notification is sent whenever the MAC unicast address
* changes. We need to re-cache the address.
*/
/*
* Send the appropriate DL_NOTIFY_IND.
*/
break;
case MAC_NOTE_LINK:
/*
* This notification is sent every time the MAC driver
* updates the link state.
*/
case LINK_STATE_UP: {
/*
* The link is up so send the appropriate
* DL_NOTIFY_IND.
*/
break;
}
case LINK_STATE_DOWN:
/*
* The link is down so send the appropriate
* DL_NOTIFY_IND.
*/
break;
default:
break;
}
break;
case MAC_NOTE_RESOURCE:
/*
* This notification is sent whenever the MAC resources
* change. We need to renegotiate the capabilities.
* Send the appropriate DL_NOTIFY_IND.
*/
break;
case MAC_NOTE_FASTPATH_FLUSH:
break;
default:
break;
}
}
/*
* Enqueue one or more messages to the transmit queue.
*/
void
{
/* Calculate total size and count of the packet(s) */
msgcnt++;
}
/*
* If the queue depth would exceed the allowed threshold, drop
* new packet(s) and drain those already in the queue.
*/
if (!head_insert &&
goto done;
}
/* Update the queue size parameters */
/*
* If the transmit queue is currently empty and we are
* about to deposit the packet(s) there, switch mode to
* "busy" and raise flow-control condition.
*/
if (!dsp->ds_tx_qbusy) {
}
if (!head_insert) {
/* Tail insertion */
else
} else {
/* Head insertion */
}
done:
/* Schedule service thread to drain the transmit queue */
if (!head_insert)
qenable(q);
}
void
{
if (dsp->ds_tx_qbusy) {
}
}
}
/*
* Process an M_IOCTL message.
*/
static void
{
switch (cmd) {
case DLIOCNATIVE:
break;
case DLIOCRAW:
break;
case DLIOCHDRINFO:
break;
default:
}
}
/*
* DLIOCNATIVE
*/
static void
{
/*
* Native mode can be enabled if it's disabled and if the
* native media type is different.
*/
else
}
/*
* DLIOCRAW
*/
static void
{
return;
}
/*
* Set the receive callback.
*/
}
/*
* Note that raw mode is enabled.
*/
}
/*
* DLIOCHDRINFO
*/
static void
{
int err;
if (dld_opt & DLD_OPT_NO_FASTPATH) {
goto failed;
}
/*
* DLIOCHDRINFO should only come from IP. The one initiated from
* user-land should not be allowed.
*/
goto failed;
}
goto failed;
}
goto failed;
}
goto failed;
}
goto failed;
}
goto failed;
}
/*
* This is a performance optimization. We originally entered
* as reader and only become writer upon transitioning into
* the DLD_FASTPATH mode for the first time. Otherwise we
* stay as reader and return the fast-path header to IP.
*/
/*
* State may have changed before we re-acquired
* the writer lock in case the upgrade failed.
*/
goto failed;
}
}
/*
* Set the receive callback (unless polling is enabled).
*/
/*
* Note that fast-path mode is enabled.
*/
}
return;
}
/*
* Catch-all handler.
*/
static void
{
return;
}
}