/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
*/
/*
* Data-Link Driver
*/
#include <sys/dld_impl.h>
#include <sys/mac_client.h>
#include <sys/mac_client_priv.h>
#include <sys/mac_flow.h>
static int str_constructor(void *, void *, int);
static void str_destructor(void *, void *);
static void str_notify_promisc_on_phys(dld_str_t *);
static void str_notify_promisc_off_phys(dld_str_t *);
static void str_notify_link_up(dld_str_t *);
static void str_notify_link_down(dld_str_t *);
static void str_notify_capab_reneg(dld_str_t *);
static void dld_taskq_dispatch(void);
/*
* Some notes on entry points, flow-control, queueing.
*
* This driver exports the traditional STREAMS put entry point as well as
* the non-STREAMS fast-path transmit routine which is provided to IP via
* the DL_CAPAB_POLL negotiation. The put procedure handles all control
* and data operations, while the fast-path routine deals only with M_DATA
* fast-path packets. Regardless of the entry point, all outbound packets
* will end up in DLD_TX(), where they will be delivered to the MAC layer.
*
* The transmit logic operates in the following way: All packets coming
* into DLD will be sent to the MAC layer through DLD_TX(). Flow-control
* happens when the MAC layer indicates the packets couldn't be
* transmitted due to 1) lack of resources (e.g. running out of
* descriptors), or 2) reaching the allowed bandwidth limit for this
* particular flow. The indication comes in the form of a Tx cookie that
* identifies the blocked ring. In such case, DLD will place a
* dummy message on its write-side STREAMS queue so that the queue is
* marked as "full". Any subsequent packets arriving at the driver will
* still be sent to the MAC layer, where they are either queued in the Tx
* SRS or discarded if the queue limit is exceeded. The write-side STREAMS
* queue gets enabled when the MAC layer notifies DLD through MAC_NOTE_TX.
* When the write service procedure runs, it will remove the dummy
* message from the write-side STREAMS queue; in effect this will trigger
* backenabling. The sizes of q_hiwat and q_lowat are set to 1 and 0,
* respectively, due to the above reasons.
*
* All non-data operations, both DLPI and ioctls are single threaded on a per
* dld_str_t endpoint. This is done using a taskq so that the control operation
* has kernel context and can cv_wait for resources. In addition all set type
* operations that involve mac level state modification are serialized on a
* per mac end point using the perimeter mechanism provided by the mac layer.
* This serializes all mac clients trying to modify a single mac end point over
* the entire sequence of mac calls made by that client as an atomic unit. The
* mac framework locking is described in mac.c. A critical element is that
* DLD/DLS does not hold any locks across the mac perimeter.
*
* dld_finddevinfo() returns the dev_info_t * corresponding to a particular
* dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
* match dev_t. If a stream is found and it is attached, its dev_info_t *
* is returned. If the mac handle is non-null, it can be safely accessed
* below. The mac handle won't be freed until the mac_unregister which
* won't happen until the driver detaches. The DDI framework ensures that
* the detach won't happen while a getinfo is in progress.
*/
typedef struct i_dld_str_state_s {
int ds_instance;
/* ARGSUSED */
static uint_t
{
return (MH_WALK_CONTINUE);
/*
* Clone: a clone minor is unique. we can terminate the
* walk if we find a matching stream -- even if we fail
* to obtain the devinfo.
*/
}
return (MH_WALK_TERMINATE);
}
return (MH_WALK_CONTINUE);
}
static dev_info_t *
{
return (NULL);
/*
* See if it's a minor node of a link
*/
return (dip);
}
int
{
/*
* GLDv3 numbers DLPI style 1 node as the instance number + 1.
* Minor number 0 is reserved for the DLPI style 2 unattached
* node.
*/
return (-1);
/*
* Check for unopened style 1 node.
* Note that this doesn't *necessarily* work for legacy
* devices, but this code is only called within the
* getinfo(9e) implementation for true GLDv3 devices, so it
* doesn't matter.
*/
return (DLS_MINOR2INST(minor));
}
return (state.ds_instance);
}
/*
* devo_getinfo: getinfo(9e)
*
* NB: This may be called for a provider before the provider's
* instances are attached. Hence, if a particular provider needs a
* special mapping (the mac instance != ddi_get_instance()), then it
* may need to provide its own implementation using the
* mac_devt_to_instance() function, and translating the returned mac
* instance to a devinfo instance. For dev_t's where the minor number
* is too large (i.e. > MAC_MAX_MINOR), the provider can call this
* function indirectly via the mac_getinfo() function.
*/
/*ARGSUSED*/
int
{
switch (cmd) {
case DDI_INFO_DEVT2DEVINFO:
rc = DDI_SUCCESS;
}
break;
case DDI_INFO_DEVT2INSTANCE:
rc = DDI_SUCCESS;
} else if (minor > DLS_MAX_MINOR &&
rc = DDI_SUCCESS;
}
break;
}
return (rc);
}
void *
{
}
int
{
int err;
/*
* Create a new dld_str_t for the stream. This will grab a new minor
* number that will be handed back in the cloned dev_t. Creation may
* fail if we can't allocate the dummy mblk used for flow-control.
*/
return (ENOSR);
if (minor != 0) {
/*
* Style 1 open
*/
goto failed;
} else {
}
/*
* Enable the queue srv(9e) routine.
*/
/*
* Construct a cloned dev_t to hand back.
*/
return (0);
return (err);
}
int
{
/*
* All modules on top have been popped off. So there can't be any
* threads from the top.
*/
/*
* Wait until pending DLPI requests are processed.
*/
while (dsp->ds_dlpi_pending)
/*
* This stream was open to a provider node. Check to see
* if it has been cleanly shut down.
*/
/*
* The stream is either open to a style 1 provider or
* this is not clean shutdown. Detach from the PPA.
* (This is still ok even in the style 1 case).
*/
}
return (0);
}
/*
* qi_qopen: open(9e)
*/
/*ARGSUSED*/
int
{
return (ENOTSUP);
/*
* This is a cloning driver and therefore each queue should only
* ever get opened once.
*/
return (EBUSY);
}
/*
* qi_qclose: close(9e)
*/
int
{
/*
* Disable the queue srv(9e) routine.
*/
return (dld_str_close(rq));
}
/*
* qi_qputp: put(9e)
*/
void
{
case M_DATA:
break;
}
if (mode == DLD_FASTPATH) {
} else {
}
} else {
}
break;
case M_PROTO:
case M_PCPROTO: {
break;
if (prim == DL_UNITDATA_REQ) {
} else {
}
break;
}
case M_IOCTL:
break;
case M_FLUSH:
}
} else {
}
break;
default:
break;
}
}
/*
* qi_srvp: srv(9e)
*/
void
{
}
void
{
}
void
{
}
/*
* Initialize this module's data structures.
*
* Takes no arguments and returns nothing. The intended setup steps are
* described by the comments in the body.
* NOTE(review): no statements are visible between the braces in this view
* of the file, so the object cache and hash table creation calls cannot be
* confirmed from here.
*/
void
dld_str_init(void)
{
/*
* Create dld_str_t object cache.
*/
/*
* Create a hash table for maintaining dld_str_t's.
* The ds_minor field (the clone minor number) of a dld_str_t
* is used as a key for this hash table because this number is
* globally unique (allocated from "dls_minor_arena").
*/
}
/*
* Tear down this module's data structures.
*
* Returns EBUSY when str_count is non-zero (presumably the number of live
* dld_str_t objects -- confirm against its definition). The only other
* return visible here is 0.
*/
int
dld_str_fini(void)
{
/*
* Make sure that there are no objects in use.
*/
if (str_count != 0)
return (EBUSY);
/*
* Ask the dld_taskq thread to quit and wait for it to be done.
* NOTE(review): no loop body is visible here, so as written the
* return statement below is the body of this while loop, and the
* function yields no return value once dld_taskq_done is set --
* confirm the quit request and the wait against the complete file.
*/
while (!dld_taskq_done)
/*
* Destroy object cache.
*/
return (0);
}
/*
* Create a new dld_str_t object.
*/
{
int err;
/*
* Allocate an object from the cache.
*/
/*
* Allocate the dummy mblk for flow-control.
*/
return (NULL);
}
/*
* Initialize the queue pointers.
*/
/*
* We want explicit control over our write-side STREAMS queue
*/
return (dsp);
}
/*
* Destroy a dld_str_t object.
*/
void
{
/*
* Clear the queue pointers.
*/
/*
* Reinitialize all the flags.
*/
dsp->ds_notifications = 0;
/*
* Free the dummy mblk if it exists.
*/
}
/*
* Free the object back to the cache.
*/
}
/*
* kmem_cache constructor function: see kmem_cache_create(9f).
*/
/*ARGSUSED*/
static int
{
/*
* Allocate a new minor number.
*/
return (-1);
/*
* Initialize the DLPI state machine.
*/
return (0);
}
/*
* kmem_cache destructor function.
*/
/*ARGSUSED*/
static void
{
/*
* Release the minor number.
*/
}
/*
* Update the priority bits and VID (may need to insert a tag if mp points
* to an untagged packet).
* If vid is VLAN_ID_NONE, use the VID encoded in the packet.
*/
static mblk_t *
{
/*
* Tagged packet, update the priority bits.
*/
len = sizeof (struct ether_vlan_header);
/*
* In case some drivers only check the db_ref
* count of the first mblk, we pullup the
* message into a single mblk.
*/
return (NULL);
} else {
}
}
} else {
/*
* Untagged packet. Two factors will cause us to insert a
* VLAN header:
* - This is a VLAN link (vid is specified)
* - The link supports user priority tagging and the priority
* is non-zero.
*/
return (mp);
return (NULL);
/*
* Copy the MAC addresses and typelen
*/
/*
* Free the original message if it's now empty. Link the
* rest of the messages to the header message.
*/
} else {
}
}
if (pri == 0)
if (vid == VLAN_ID_NONE)
return (mp);
}
/*
* M_DATA put (IP fast-path mode)
*/
{
if (is_ethernet) {
/*
* Update the priority bits to the assigned priority.
*/
if (pri != 0) {
goto discard;
}
}
}
return (cookie);
/* TODO: bump kstat? */
return (NULL);
}
/*
* M_DATA put (DLIOCRAW mode)
*/
static void
{
/*
* Certain MAC type plugins provide an illusion for raw DLPI
* consumers. They pretend that the MAC layer is something that
* it's not for the benefit of observability tools. For example,
* mac_wifi pretends that it's Ethernet for such consumers.
* Here, unless native mode is enabled, we call into the MAC layer so
* that this illusion can be maintained. The plugin will optionally
* transform the MAC header here into something that can be passed
* down. The header goes from raw mode to "cooked" mode.
*/
goto discard;
}
/*
* Check the packet is not too big and that any remaining
* fragment list is composed entirely of M_DATA messages. (We
* know the first fragment was M_DATA otherwise we could not
* have got here).
*/
goto discard;
}
goto discard;
/*
* If LSO is enabled, check the size against lso_max. Otherwise,
* compare the packet size with max_sdu.
*/
goto discard;
if (is_ethernet) {
/*
* Discard the packet if this is a VLAN stream but the VID in
* the packet is not correct.
*/
goto discard;
/*
* Discard the packet if this packet is a tagged packet
* but both pri and VID are 0.
*/
vid == VLAN_ID_NONE)
goto discard;
/*
* Update the priority bits to the per-stream priority if
* priority is not set in the packet. Update the VID for
* packets on a VLAN stream.
*/
goto discard;
}
}
}
/* Turn on flow-control for dld */
}
return;
/* TODO: bump kstat? */
}
/*
* Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
*/
int
{
int err;
const char *drvname;
return (EINVAL);
return (ENOTSUP);
/*
* /dev node access. This will still be supported for backward
* compatibility reason.
*/
return (EINVAL);
}
goto failed;
goto failed;
/*
* Open a channel.
*/
goto failed;
goto failed;
/*
* Set the default packet priority.
*/
/*
* Add a notify function so that we get updates from the MAC.
*/
return (0);
if (qassociated)
return (err);
}
/*
* Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
* from close(2) for style 2.
*/
void
{
int err;
/*
* Remove the notify function.
*
* Note that we cannot wait for the notification callback to be removed
* since it could cause a deadlock with str_notify(), as they both
* need the mac perimeter. Continue if we cannot remove the
* notification callback right now and wait after we leave the
* perimeter.
*/
/*
* Disable the capabilities
*/
/*
* Clear LSO flags.
*/
dsp->ds_lso_max = 0;
/*
* Now we leave the mac perimeter. If mac_notify_remove() failed
* because the notification callback was in progress, wait for
* it to finish before we proceed.
*/
if (err != 0)
/*
* An unreferenced tagged (non-persistent) vlan gets destroyed
* automatically in the call to dls_devnet_rele.
*/
/*
* Re-initialize the DLPI state machine.
*/
}
/*
* This function is only called for VLAN streams. In raw mode, we strip VLAN
* tags before sending packets up to the DLS clients, with the exception of
* special priority tagged packets, in that case, we set the VID to 0.
* mp must be a VLAN tagged packet.
*/
static mblk_t *
{
return (NULL);
}
/*
* Priority is 0, strip the tag.
*/
} else {
/*
* Priority is not 0, update the VID to 0.
*/
}
return (mp);
}
/*
* Raw mode receive function.
*/
/*ARGSUSED*/
void
{
do {
/*
* Get the pointer to the next packet in the chain and then
* clear b_next before the packet gets passed on.
*/
/*
* Wind back b_rptr to point at the MAC header.
*/
/*
* Certain MAC type plugins provide an illusion for raw
* DLPI consumers. They pretend that the MAC layer is
* something that it's not for the benefit of observability
* tools. For example, mac_wifi pretends that it's Ethernet
* for such consumers. Here, unless native mode is enabled,
* we call into the MAC layer so that this illusion can be
* maintained. The plugin will optionally transform the MAC
* header here into something that can be passed up to raw
* consumers. The header goes from "cooked" mode to raw mode.
*/
goto next;
}
}
/*
* Strip the VLAN tag for VLAN streams.
*/
if (is_ethernet &&
/*
* The priority should be kept only for VLAN
* data-links.
*/
goto next;
}
}
/*
* Pass the packet on.
*/
else
next:
/*
* Move on to the next packet in the chain.
*/
}
/*
* Fast-path receive function.
*/
/*ARGSUSED*/
void
{
/*
* MAC header stripping rules:
* - Tagged packets:
* a. VLAN streams. Strip the whole VLAN header including the tag.
* b. Physical streams
* - VLAN packets (non-zero VID). The stream must be either a
* DL_PROMISC_SAP listener or an ETHERTYPE_VLAN listener.
* Strip the Ethernet header but keep the VLAN header.
* - Special tagged packets (zero VID)
* * The stream is either a DL_PROMISC_SAP listener or an
* ETHERTYPE_VLAN listener, strip the Ethernet header but
* keep the VLAN header.
* * Otherwise, strip the whole VLAN header.
* - Untagged packets. Strip the whole MAC header.
*/
if (mhip->mhi_istagged &&
offset = VLAN_TAGSZ;
}
do {
/*
* Get the pointer to the next packet in the chain and then
* clear b_next before the packet gets passed on.
*/
/*
* Wind back b_rptr to point at the VLAN header.
*/
/*
* Pass the packet on.
*/
else
/*
* Move on to the next packet in the chain.
*/
}
/*
* Default receive function (send DL_UNITDATA_IND messages).
*/
/*ARGSUSED*/
void
{
/*
* See MAC header stripping rules in the dld_str_rx_fastpath() function.
*/
if (mhip->mhi_istagged &&
offset = VLAN_TAGSZ;
}
do {
/*
* Get the pointer to the next packet in the chain and then
* clear b_next before the packet gets passed on.
*/
/*
* Wind back b_rptr to point at the MAC header.
*/
/*
* Create the DL_UNITDATA_IND M_PROTO.
*/
return;
}
/*
* Advance b_rptr to point at the payload (or the VLAN header).
*/
/*
* Prepend the DL_UNITDATA_IND.
*/
/*
* Send the message.
*/
else
/*
* Move on to the next packet in the chain.
*/
}
/*
* DL_NOTIFY_IND: DL_NOTE_SDU_SIZE
*/
static void
{
return;
return;
} else {
}
}
/*
* Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
* current state of the interface.
*/
void
{
}
typedef struct dl_unitdata_ind_wrapper {
/*
* Create a DL_UNITDATA_IND M_PROTO message.
*/
static mblk_t *
{
/*
* Get the packet header information.
*/
return (NULL);
/*
* Allocate a message large enough to contain the wrapper structure
* defined above.
*/
sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
DL_UNITDATA_IND)) == NULL)
return (NULL);
/*
* Copy in the destination address.
*/
/*
* Set the destination DLSAP to the SAP value encoded in the packet.
*/
else
/*
* If the destination address was multicast or broadcast then the
* dl_group_address field should be non-zero.
*/
/*
* Copy in the source address if one exists. Some MAC types (DL_IB
* for example) may not have access to source information.
*/
} else {
/*
* Set the source DLSAP to the packet ethertype.
*/
}
return (nmp);
}
/*
* DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
*/
static void
{
return;
return;
}
/*
* DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
*/
static void
{
return;
return;
}
/*
* DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
*/
static void
{
return;
return;
}
/*
* DL_NOTIFY_IND: DL_NOTE_LINK_UP
*/
static void
{
return;
return;
}
/*
* DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
*/
static void
{
return;
return;
}
/*
* DL_NOTIFY_IND: DL_NOTE_SPEED
*/
static void
{
return;
return;
}
/*
* DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
*/
static void
{
return;
return;
}
/*
* DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
*/
static void
{
return;
return;
}
static void
{
return;
return;
sizeof (mac_protect_t));
}
/*
* MAC notification callback.
*/
void
{
switch (type) {
case MAC_NOTE_TX:
qenable(q);
break;
case MAC_NOTE_DEVPROMISC:
/*
* Send the appropriate DL_NOTIFY_IND.
*/
if (mac_promisc_get(mh))
else
break;
case MAC_NOTE_UNICST:
/*
* This notification is sent whenever the MAC unicast
* address changes.
*/
/*
* Send the appropriate DL_NOTIFY_IND.
*/
break;
case MAC_NOTE_DEST:
/*
* Only send up DL_NOTE_DEST_ADDR if the link has a
* destination address.
*/
break;
case MAC_NOTE_LOWLINK:
case MAC_NOTE_LINK:
/*
* LOWLINK refers to the actual link status. For links that
* are not part of a bridge instance LOWLINK and LINK state
* are the same. But for a link part of a bridge instance
* LINK state refers to the aggregate link status: "up" when
* at least one link part of the bridge is up and is "down"
* when all links part of the bridge are down.
*
* Clients can request to be notified of the LOWLINK state
* using the DLIOCLOWLINK ioctl. Clients such as the bridge
* daemon request lowlink state changes and upper layer clients
* receive notifications of the aggregate link state changes.
*/
/*
* Check that the notification type matches the one that we
* want. If we want lower-level link notifications, and this
* is upper, or if we want upper and this is lower, then
* ignore.
*/
break;
/*
* This notification is sent every time the MAC driver
* updates the link state.
*/
case LINK_STATE_UP: {
/*
* The link is up so send the appropriate
* DL_NOTIFY_IND.
*/
break;
}
case LINK_STATE_DOWN:
/*
* The link is down so send the appropriate
* DL_NOTIFY_IND.
*/
break;
default:
break;
}
break;
case MAC_NOTE_CAPAB_CHG:
/*
* This notification is sent whenever the MAC resources
* change or capabilities change. We need to renegotiate
* the capabilities. Send the appropriate DL_NOTIFY_IND.
*/
break;
case MAC_NOTE_SDU_SIZE: {
break;
}
case MAC_NOTE_FASTPATH_FLUSH:
break;
/* Unused notifications */
case MAC_NOTE_MARGIN:
break;
case MAC_NOTE_ALLOWED_IPS:
break;
default:
break;
}
}
/*
* This function is called via a taskq mechanism to process all control
* messages on a per 'dsp' end point.
*/
static void
{
case M_PROTO:
case M_PCPROTO:
break;
case M_IOCTL:
break;
default:
ASSERT(0);
}
}
dsp->ds_dlpi_pending = 0;
}
/*
* Kernel thread to handle taskq dispatch failures in dld_wput_data. This
* thread is started at boot time.
*
* The thread loops until dld_taskq_quit is set and then calls thread_exit().
* NOTE(review): the statements inside the loop are not visible in this view,
* and the braces below do not balance as shown -- check the complete file
* before editing this function.
*/
static void
dld_taskq_dispatch(void)
{
"dld_taskq_dispatch");
while (!dld_taskq_quit) {
}
}
thread_exit();
}
/*
* All control operations are serialized on the 'dsp' and are also funneled
* through a taskq mechanism to ensure that subsequent processing has kernel
* context and can safely use cv_wait.
*
* Mechanisms to handle taskq dispatch failures
*
* The only way to be sure that taskq dispatch does not fail is to either
* specify TQ_SLEEP or to use a static taskq and prepopulate it with
* some number of entries and make sure that the number of outstanding requests
* are less than that number. We can't use TQ_SLEEP since we don't know the
* context. Nor can we bound the total number of 'dsp' end points. So we are
* unable to use either of the above schemes, and are forced to deal with
* taskq dispatch failures. Note that even dynamic taskq could fail in
* dispatch if TQ_NOSLEEP is specified, since this flag is translated
* eventually to KM_NOSLEEP and kmem allocations could fail in the taskq
* framework.
*
* We maintain a queue of 'dsp's that encountered taskq dispatch failure.
* We also have a single global thread to retry the taskq dispatch. This
* thread loops in 'dld_taskq_dispatch' and retries the taskq dispatch, but
* uses TQ_SLEEP to ensure eventual success of the dispatch operation.
*/
static void
{
return;
}
/*
* At this point if ds_dlpi_pending is set, it implies that the taskq
* thread is still active and is processing the last message, though
* the pending queue has been emptied.
*/
if (dsp->ds_dlpi_pending) {
return;
}
TQ_NOSLEEP) != 0)
return;
}
/*
* Process an M_IOCTL message.
*/
static void
{
switch (cmd) {
case DLIOCNATIVE:
break;
case DLIOCMARGININFO:
break;
case DLIOCRAW:
break;
case DLIOCHDRINFO:
break;
case DLIOCLOWLINK:
break;
default:
}
}
/*
* DLIOCNATIVE
*/
static void
{
/*
* Native mode can be enabled if it's disabled and if the
* native media type is different.
*/
else
}
/*
* DLIOCMARGININFO
*/
static void
{
int err;
goto failed;
}
goto failed;
return;
}
/*
* DLIOCRAW
*/
static void
{
return;
}
return;
}
/*
* Set the receive callback.
*/
}
/*
* Note that raw mode is enabled.
*/
}
/*
* DLIOCHDRINFO
*/
static void
{
int err;
if (dld_opt & DLD_OPT_NO_FASTPATH) {
goto failed;
}
/*
* DLIOCHDRINFO should only come from IP. The one initiated from
* user-land should not be allowed.
*/
goto failed;
}
goto failed;
}
goto failed;
}
goto failed;
}
goto failed;
}
goto failed;
}
/*
* This ioctl might happen concurrently with a direct call to dld_capab
* that tries to enable direct and/or poll capabilities. Since the
* stack does not serialize them, we do so here to avoid mixing
* the callbacks.
*/
/*
* Set the receive callback (unless polling is enabled).
*/
/*
* Note that fast-path mode is enabled.
*/
}
return;
}
/*
* DLIOCLOWLINK: request actual link state changes. When the
* link is part of a bridge instance the client receives actual
* link state changes and not the aggregate link status. Used by
* the bridging daemon (bridged) for proper RSTP operation.
*/
static void
{
int err;
} else {
/* LINTED: alignment */
}
}
/*
* Catch-all handler.
*/
static void
{
return;
}
}