/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Declarations private to this file
*/
eib_ether_hdr_t *);
uint64_t *);
static void eib_data_rx_comp_intr(ibt_cq_hdl_t, void *);
static void eib_data_tx_comp_intr(ibt_cq_hdl_t, void *);
int
{
/*
* Allocate an eib_chan_t to store stuff about this vnic's data qp
* and initialize it with the default admin qp pkey parameters. We'll
* re-associate this with the pkey we receive from the gw once we
* receive the login ack.
*/
/*
* Set up tx/rx CQs and completion handlers
*/
"eib_data_setup_cqs(vn_inst=0x%x) failed",
vnic->vn_instance);
goto data_create_qp_fail;
}
/*
* Setup UD channel
*/
"eib_data_setup_ud_channel(vn_inst=0x%x) failed",
vnic->vn_instance);
goto data_create_qp_fail;
}
return (EIB_E_SUCCESS);
return (EIB_E_FAILURE);
}
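/*
 * The flow described above, reduced to a minimal illustrative sketch
 * (not the driver's actual code): allocate the per-vnic channel state,
 * seed it with the admin qp's pkey parameters, then set up the CQs and
 * the UD channel, unwinding on failure. eib_chan_init(),
 * eib_rb_data_create_qp() and the exact ch_ and vn_ field names are
 * assumptions based on the surrounding comments and error messages.
 */
#if 0	/* illustrative sketch only */
	eib_chan_t *chan;

	if ((chan = eib_chan_init()) == NULL)
		return (EIB_E_FAILURE);

	/* Borrow the admin channel's pkey until the gw login ack arrives */
	chan->ch_pkey = ss->ei_admin_chan->ch_pkey;
	chan->ch_pkey_ix = ss->ei_admin_chan->ch_pkey_ix;
	chan->ch_vnic_inst = vnic->vn_instance;
	vnic->vn_data_chan = chan;

	if (eib_data_setup_cqs(ss, vnic) != EIB_E_SUCCESS)
		goto data_create_qp_fail;

	if (eib_data_setup_ud_channel(ss, vnic) != EIB_E_SUCCESS)
		goto data_create_qp_fail;

	return (EIB_E_SUCCESS);

data_create_qp_fail:
	eib_rb_data_create_qp(ss, vnic);
	return (EIB_E_FAILURE);
#endif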
/*ARGSUSED*/
{
int i;
/*
* Re-arm the rx notification callback before we start polling
* the completion queue. There's nothing much we can do if the
* enable_cq_notify fails - we issue a warning and move on.
*/
if (ret != IBT_SUCCESS) {
"ibt_enable_cq_notify() failed, ret=%d", ret);
}
/*
* We don't want to be stuck in receive processing for too long without
* giving others a chance.
*/
/*
* Handle rx completions
*/
/*
* Clear the posted-to-hca flag and decrement the
* posted-rwqes count
*/
} else {
ipkts++;
continue;
} else {
/*
* Add this mp to the list to
* send it to the nw layer. Note
* that the wqe could've been
* returned to the pool if we're
* running low, so don't access
* the wqe after this point.
*/
if (head)
else
}
}
}
/*
* We reduce the number of atomic updates to key statistics
* by pooling them here, once per ibt_poll_cq(). The accuracy
* and consistency of the published statistics within a cq
* polling cycle will be compromised a little bit, but that
* should be ok, given that we probably gain a little bit by
* not having to do these atomic operations per packet.
*/
pkts_per_call += ipkts;
if (head) {
}
/*
* If we have processed too many packets in one attempt, we'll
* have to come back here later.
*/
if (pkts_per_call >= EIB_MAX_RX_PKTS_ONINTR) {
NULL);
break;
}
}
return (DDI_INTR_CLAIMED);
}
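/*
 * A minimal sketch of the "re-arm, then drain" pattern the rx handler
 * above follows: re-enable CQ notification before polling so that no
 * completion can slip by unnoticed, drain the CQ in small batches, and
 * bail out (re-triggering the softint) once enough packets have been
 * handled in one call. The ibt_ and ddi_ calls are real DDI/IBTF
 * interfaces; the batch size, the packet cap and the wc-processing
 * callback are placeholders.
 */
#define	SKETCH_WC_BATCH_SZ	16
#define	SKETCH_MAX_PKTS		100	/* stand-in for EIB_MAX_RX_PKTS_ONINTR */

static uint_t
sketch_rx_comp_handler(ibt_cq_hdl_t cq_hdl, ddi_softint_handle_t si_hdl,
    void (*process_wc)(ibt_wc_t *, void *), void *arg)
{
	ibt_wc_t wcs[SKETCH_WC_BATCH_SZ];
	uint_t polled, i, pkts = 0;
	ibt_status_t ret;

	/* Re-arm first; a completion arriving from now on will notify us */
	ret = ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS)
		cmn_err(CE_WARN, "ibt_enable_cq_notify() failed, ret=%d", ret);

	/* Drain the CQ, but don't hold on to the cpu for too long */
	while (ibt_poll_cq(cq_hdl, wcs, SKETCH_WC_BATCH_SZ,
	    &polled) == IBT_SUCCESS) {
		for (i = 0; i < polled; i++)
			process_wc(&wcs[i], arg);

		pkts += polled;
		if (pkts >= SKETCH_MAX_PKTS) {
			/* Too much work for one call; come back later */
			(void) ddi_intr_trigger_softint(si_hdl, NULL);
			break;
		}
	}

	return (DDI_INTR_CLAIMED);
}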
/*ARGSUSED*/
{
int i;
/*
* Re-arm the tx notification callback before we start polling
* the completion queue. There's nothing much we can do if the
* enable_cq_notify fails - we issue a warning and move on.
*/
if (ret != IBT_SUCCESS) {
"ibt_enable_cq_notify() failed, ret=%d", ret);
}
/*
* Handle tx completions
*/
&polled)) == IBT_SUCCESS) {
} else {
}
}
}
return (DDI_INTR_CLAIMED);
}
void
{
int ret;
/*
* We come here from three places - (a) from the nw layer, when it
* is done with the rx mblk we handed to it and calls freemsg(),
* (b) from eib_data_rx_comp() if the rx completion processing
* discovers that the received EoIB packet has a problem and (c)
* from eib_data_err_comp() if we're tearing down this channel. We
* only need to repost the rwqe if we're being called back from the
* nw layer. For the other two cases, we'll simply return the rwqe
* to the pool. Also, since we would've already updated the
* ch_rx_posted counters in the rx completion handler, we don't
* pass the chan pointer to eib_rsrc_return_rwqe from within this
* routine.
*/
return;
}
/*
* If the buffers are being returned by the nw layer after a long
* time, this eoib instance could've even been stopped by now.
* If so, simply return the rwqe to the pool.
*/
return;
}
/*
* Or it could've taken even longer and the nic could've been
* restarted by now. The only thing we can do is to make sure that
* the original channel pointer we passed corresponds to what's
* currently in this vnic instance.
*/
return;
}
/*
* Try to repost the rwqe if we're not tearing down this channel
*/
if (vn_chan->ch_tear_down) {
} else {
if (ret != EIB_E_SUCCESS) {
else
}
}
}
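/*
 * A minimal sketch of the decision the recycle path above makes. The
 * flag and helper names (qe_with_nw, eib_is_started(),
 * eib_rsrc_return_rwqe(), eib_chan_post_recv()) are assumptions drawn
 * from the comments in this file, not verified driver interfaces.
 */
#if 0	/* illustrative sketch only */
	if (!rwqe->qe_with_nw) {
		/* Cases (b) and (c): simply hand the rwqe back to the pool */
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
		return;
	}

	if (!eib_is_started(ss) || vnic->vn_data_chan != vn_chan) {
		/* The nw layer held on too long; the instance moved on */
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
		return;
	}

	if (vn_chan->ch_tear_down) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
	} else if (eib_chan_post_recv(ss, vn_chan, rwqe) != EIB_E_SUCCESS) {
		/* Repost failed; return the rwqe to the pool instead */
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
	}
#endif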
void
{
uint_t i;
/*
* See if we have room for this wqe and then add it to the
* list of tx wrs to post in this channel.
*/
"too many swqes posted already, posted=0x%lx, "
return;
}
} else {
}
/*
* If someone's already posting tx wqes in this channel, let
* them post ours as well.
*/
return;
}
/*
* Post EIB_MAX_POST_MULTIPLE wrs at a time
*/
}
}
/*
* If posting multiple wrs at a time fails for some reason, we'll
* try posting the unposted ones one by one. If even that fails,
* we'll give up on this wqe and return it to the pool.
*/
if (ret != IBT_SUCCESS) {
"ibt_post_send(n_wrs=0x%lx, n_posted=0x%lx) "
1, NULL);
if (ret != IBT_SUCCESS) {
n_failed++;
"eib_data_post_tx: "
"ibt_post_send(n_wrs=1) failed, "
"ret=%d", ret);
}
}
}
total_failed += n_failed;
}
/*
* If we failed to post something, update error stats
*/
if (total_failed) {
}
}
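/*
 * A minimal sketch of the multi-wr posting strategy described above:
 * hand the HCA a batch of work requests at a time and, if a batch post
 * fails, retry the unposted ones one by one so a single bad wr doesn't
 * sink the whole batch. ibt_post_send() is the real IBTF call; the
 * batch size and the failure accounting are placeholders.
 */
#define	SKETCH_POST_MULTIPLE	4	/* stand-in for EIB_MAX_POST_MULTIPLE */

static uint_t
sketch_post_tx_wrs(ibt_channel_hdl_t chan_hdl, ibt_send_wr_t *wrs,
    uint_t n_wrs)
{
	uint_t off, chunk, posted, i, n_failed = 0;
	ibt_status_t ret;

	for (off = 0; off < n_wrs; off += chunk) {
		chunk = n_wrs - off;
		if (chunk > SKETCH_POST_MULTIPLE)
			chunk = SKETCH_POST_MULTIPLE;

		posted = 0;
		ret = ibt_post_send(chan_hdl, &wrs[off], chunk, &posted);
		if (ret == IBT_SUCCESS)
			continue;

		/* The batch post failed; retry the leftovers one by one */
		for (i = off + posted; i < off + chunk; i++) {
			if (ibt_post_send(chan_hdl, &wrs[i], 1,
			    NULL) != IBT_SUCCESS)
				n_failed++;
		}
	}

	return (n_failed);	/* caller updates error stats if non-zero */
}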
void
{
/*
* Assume that the ether header (with or without vlan tag) is
* contained in one fragment
*/
} else {
evh->eh_tagless = 0;
}
}
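/*
 * A minimal sketch of the parsing described above, assuming the whole
 * ethernet header (tagged or untagged) sits in the first mblk fragment.
 * The eib_ether_hdr_t fields other than eh_tagless (eh_vlan,
 * eh_ether_type, eh_dmac, eh_smac) are assumptions, as are mp and evh
 * themselves, which stand in for the function's arguments.
 */
#if 0	/* illustrative sketch only */
	struct ether_vlan_header *evl =
	    (struct ether_vlan_header *)(void *)mp->b_rptr;

	if (ntohs(evl->ether_tpid) == ETHERTYPE_VLAN) {
		/* Tagged frame: pick the vlan id out of the TCI */
		evh->eh_tagless = 0;
		evh->eh_vlan = VLAN_ID(ntohs(evl->ether_tci));
		evh->eh_ether_type = ntohs(evl->ether_type);
	} else {
		struct ether_header *eh =
		    (struct ether_header *)(void *)mp->b_rptr;

		evh->eh_tagless = 1;
		evh->eh_vlan = 0;
		evh->eh_ether_type = ntohs(eh->ether_type);
	}

	/* dmac/smac sit at the same offsets whether or not there's a tag */
	bcopy(&evl->ether_dhost, evh->eh_dmac, ETHERADDRL);
	bcopy(&evl->ether_shost, evh->eh_smac, ETHERADDRL);
#endif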
int
{
int inst = 0;
return (EIB_E_FAILURE);
/*
* For now, do a simple search (but only over what we've allocated).
* Note that if we're in the process of creating a vnic, the instance
* might've been allocated, but the vnic entry would still be NULL.
*/
if (vnicp) {
}
return (EIB_E_SUCCESS);
}
}
}
/*
* If we haven't been able to locate a vnic for this {mac,vlan} tuple,
* see if we've already failed a creation request for this vnic, and
* return that information.
*/
if (failed) {
}
}
}
return (EIB_E_FAILURE);
}
int
{
/*
* The swqe defaults are set to use the regular ud work request
* member and the IBT_WRC_SEND opcode, so we don't need to do
* anything here if this isn't an LSO packet.
*/
} else {
}
return (EIB_E_FAILURE);
return (EIB_E_SUCCESS);
}
void
{
}
static int
{
int rv;
/*
* Allocate send completion queue. Note that we've already verified
* that cp_max_swqe and cp_max_rwqe meet the max cq size requirements
* of the hca.
*/
if (ret != IBT_SUCCESS) {
"ibt_alloc_cq(snd_cq_sz=0x%lx) failed, ret=%d",
goto setup_data_cqs_fail;
}
EIB_TX_COMP_USEC, 0);
if (ret != IBT_SUCCESS) {
"ibt_modify_cq(snd_comp_count=0x%lx, snd_comp_usec=0x%lx) "
"failed, ret=%d",
}
/*
* Allocate receive completion queue
*/
&rcv_sz);
if (ret != IBT_SUCCESS) {
"ibt_alloc_cq(rcv_cq_sz=0x%lx) failed, ret=%d",
goto setup_data_cqs_fail;
}
EIB_RX_COMP_USEC, 0);
if (ret != IBT_SUCCESS) {
"ibt_modify_cq(rcv_comp_count=0x%lx, rcv_comp_usec=0x%lx) "
"failed, ret=%d",
}
/*
* Set up parameters for collecting tx and rx completion information
*/
/*
* Set up the vnic's data tx completion queue handler and allocate
* a softint for it as well.
*/
"ddi_intr_add_softint() failed for data tx qp, ret=%d", rv);
goto setup_data_cqs_fail;
}
if (ret != IBT_SUCCESS) {
"ibt_enable_cq_notify() failed for tx cq, ret=%d", ret);
goto setup_data_cqs_fail;
}
/*
* And then the data rx completion queue handler
*/
"ddi_intr_add_softint() failed for data rx qp, ret=%d", rv);
goto setup_data_cqs_fail;
}
if (ret != IBT_SUCCESS) {
"ibt_enable_cq_notify() failed for rx cq, ret=%d", ret);
goto setup_data_cqs_fail;
}
return (EIB_E_SUCCESS);
return (EIB_E_FAILURE);
}
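/*
 * A minimal sketch of the per-CQ setup sequence the function above goes
 * through for each of the tx and rx CQs: allocate the CQ, request
 * interrupt moderation, attach a softint plus a completion handler, and
 * arm the CQ. The moderation values and the error unwinding are
 * abbreviated; the ibt_ and ddi_ calls are real DDI/IBTF interfaces.
 */
static int
sketch_setup_one_cq(ibt_hca_hdl_t hca_hdl, dev_info_t *dip, uint_t cq_sz,
    ibt_cq_handler_t comp_handler, ddi_intr_handler_t *softint_func,
    void *arg, ibt_cq_hdl_t *cq_hdl_p, ddi_softint_handle_t *si_hdl_p)
{
	ibt_cq_attr_t cq_attr;
	uint_t real_sz;
	ibt_status_t ret;

	bzero(&cq_attr, sizeof (cq_attr));
	cq_attr.cq_size = cq_sz;
	cq_attr.cq_sched = NULL;
	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;

	if ((ret = ibt_alloc_cq(hca_hdl, &cq_attr, cq_hdl_p,
	    &real_sz)) != IBT_SUCCESS) {
		cmn_err(CE_WARN, "ibt_alloc_cq(sz=0x%x) failed, ret=%d",
		    cq_sz, ret);
		return (DDI_FAILURE);
	}

	/* Interrupt moderation is best-effort: warn and carry on if it fails */
	if ((ret = ibt_modify_cq(*cq_hdl_p, 10, 10, 0)) != IBT_SUCCESS)
		cmn_err(CE_NOTE, "ibt_modify_cq() failed, ret=%d", ret);

	/* The softint does the actual completion processing */
	if (ddi_intr_add_softint(dip, si_hdl_p, DDI_INTR_SOFTPRI_MIN,
	    softint_func, arg) != DDI_SUCCESS) {
		(void) ibt_free_cq(*cq_hdl_p);
		return (DDI_FAILURE);
	}

	/* The hard completion handler only triggers the softint */
	ibt_set_cq_handler(*cq_hdl_p, comp_handler, arg);

	if ((ret = ibt_enable_cq_notify(*cq_hdl_p,
	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
		cmn_err(CE_WARN, "ibt_enable_cq_notify() failed, ret=%d", ret);
		return (DDI_FAILURE);	/* cq/softint cleanup omitted here */
	}

	return (DDI_SUCCESS);
}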
static int
{
if (ret != IBT_SUCCESS) {
"ibt_alloc_ud_channel(port=0x%x, pkey_ix=0x%x, "
"cs_sq=0x%lx, cs_rq=0x%lx, sq_sgl=0x%lx) failed, ret=%d",
}
if (ret != IBT_SUCCESS) {
"ibt_query_ud_channel() failed, ret=%d", ret);
}
return (EIB_E_SUCCESS);
return (EIB_E_FAILURE);
}
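/*
 * A rough sketch of the UD channel allocation the function above wraps.
 * The local variables (port_num, data_qkey, pd_hdl, max_swqes,
 * max_rwqes, max_sgl) and the ch_ fields are placeholders; the
 * ibt_alloc_ud_channel() and ibt_query_ud_channel() calls are real IBTF
 * interfaces, though the exact arg-structure field spellings should be
 * checked against the ibtl headers.
 */
#if 0	/* illustrative sketch only */
	ibt_ud_chan_alloc_args_t args;
	ibt_ud_chan_query_attr_t query;
	ibt_chan_sizes_t sizes;
	ibt_channel_hdl_t chan_hdl;

	bzero(&args, sizeof (args));
	args.ud_flags = IBT_ALL_SIGNALED;	/* signal every wr */
	args.ud_hca_port_num = port_num;
	args.ud_pkey_ix = chan->ch_pkey_ix;
	args.ud_qkey = data_qkey;		/* EoIB data qkey */
	args.ud_scq = chan->ch_cq_hdl;		/* send CQ */
	args.ud_rcq = chan->ch_rcv_cq_hdl;	/* receive CQ */
	args.ud_pd = pd_hdl;
	args.ud_sizes.cs_sq = max_swqes;
	args.ud_sizes.cs_rq = max_rwqes;
	args.ud_sizes.cs_sq_sgl = max_sgl;

	if (ibt_alloc_ud_channel(hca_hdl, IBT_ACHAN_NO_FLAGS, &args,
	    &chan_hdl, &sizes) != IBT_SUCCESS)
		return (EIB_E_FAILURE);

	/* Remember the qp number; the gateway needs it during login */
	if (ibt_query_ud_channel(chan_hdl, &query) != IBT_SUCCESS) {
		(void) ibt_free_channel(chan_hdl);
		return (EIB_E_FAILURE);
	}
	chan->ch_chan = chan_hdl;
	chan->ch_qpn = query.ud_qpn;

	return (EIB_E_SUCCESS);
#endif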
static void
{
/*
* When the swqe was grabbed, it would've had its wr_opcode and
* wr.ud.udwr_dest set to default values. Since we're now going
* to use LSO, we need to change these.
*/
/*
* Details of the ethernet header in the mp are already known to us
*/
(sizeof (struct ether_vlan_header));
/*
* Calculate the LSO header size and set it in the UD LSO structure.
* Note that the only assumption we make is that each of the Ethernet,
* IP and TCP headers will be contained in a single mblk fragment;
* together, the headers may span multiple mblk fragments. Note also
* that since the EoIB encapsulation header is not part of the message
* block we receive, we'll need to account for the space to insert it
* later.
*/
}
}
/*
* Since the passed mp fragment never contains the EoIB encapsulation
* header, we always have to copy the lso header. Sigh.
*/
/*
* We already have the EoIB encapsulation header written at the
* start of wqe->qe_payload_hdr during swqe acquisition. Only
* copy the remaining headers.
*/
} else {
break;
}
}
}
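/*
 * A minimal sketch of what switching a UD send wr over to LSO involves,
 * per the comments above: change the opcode to IBT_WRC_SEND_LSO, point
 * the wr at the LSO variant of the destination, and hand the HCA a
 * contiguous copy of all the headers (EoIB encapsulation + ethernet +
 * IP + TCP) along with the MSS. The header-length computation is
 * elided; the wqe field names, dest, mss and the header-size locals
 * are placeholders.
 */
#if 0	/* illustrative sketch only */
	ibt_wr_lso_t *lso = &wqe->qe_wr.send.wr.ud_lso;

	wqe->qe_wr.send.wr_opcode = IBT_WRC_SEND_LSO;
	lso->lso_ud_dest = dest;	/* same address handle as non-LSO */
	lso->lso_mss = mss;		/* as handed down by the nw layer */

	/*
	 * The EoIB encapsulation header is never part of the mblk, so the
	 * full header always has to be assembled in a private buffer.
	 */
	lso->lso_hdr = wqe->qe_payload_hdr;
	lso->lso_hdr_sz = EIB_ENCAP_HDR_SZ + eth_hdr_sz + ip_hdr_sz +
	    tcp_hdr_sz;
#endif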
static int
{
uint_t i;
/*
* Let's skip ahead to the TCP data if this is LSO. Note that while
* the lso header size in the swqe includes the EoIB encapsulation
* header size, that encapsulation header itself won't be found in
* the mblk.
*/
pending_hdr = 0;
if (lsohdr_sz) {
frag_len =
if (frag_len > pending_hdr)
break;
pending_hdr -= frag_len;
}
}
/*
* If this is an LSO packet, we want pktsz to hold the size of the
* data following all the headers; if this is a non-LSO packet, we
* want pktsz to refer to the size of the entire packet with all the
* headers. Either way, nblks should hold the number of mappings
* we'll need to iov map this (for reserved lkey request).
*/
if (lsohdr_sz == 0) {
nblks = 1;
} else {
nblks = 0;
pktsz = 0;
}
nblks++;
}
pktsz -= pending_hdr;
/*
* We only do ibt_map_mem_iov() if the pktsz is above the tx copy
* threshold and if the number of mp fragments is less than the
* maximum acceptable.
*/
i = 0;
if (lsohdr_sz == 0) {
i++;
}
}
}
if (ret != IBT_SUCCESS) {
"eib_data_prepare_sgl: "
"ibt_map_mem_iov(nblks=0x%lx) failed, ret=%d ",
goto prepare_sgl_copy_path;
}
return (EIB_E_SUCCESS);
}
/*
* Even though this is the copy path for transfers less than
* qe_bufsz, it could still be an LSO packet. If so, we only
* have to write the data following all the headers into the
* work request buffer, since we'll be sending the lso header
* itself separately. If this is not an LSO send (but the pkt size
* is greater than the mtu, say for a jumbo frame), then we need
* to write all the headers, including the EoIB encapsulation
* header, into the work request buffer.
*/
if (lsohdr_sz == 0) {
bufp += EIB_ENCAP_HDR_SZ;
}
pending_hdr = 0;
}
/*
* If the ethernet frame we're going to send is less than
* ETHERMIN, pad up the buffer to ETHERMIN (with zeros)
*/
}
return (EIB_E_SUCCESS);
}
/*
* Copy path for transfers greater than swqe->qe_bufsz
*/
"eib_rsrc_grab_lsobufs() failed");
return (EIB_E_FAILURE);
}
/*
* Copy the larger-than-qe_bufsz packet into a set of fixed-size,
* pre-mapped LSO buffers. Note that we might need to skip part of
* the LSO header in the first fragment, as before.
*/
skip = pending_hdr;
/*
* If this is a non-LSO packet (perhaps a jumbo frame?)
* we may still need to prefix the EoIB header in the
* wr buffer.
*/
if ((i == 0) && (lsohdr_sz == 0)) {
bufp += EIB_ENCAP_HDR_SZ;
}
avail = 0;
} else {
skip = 0;
}
}
}
return (EIB_E_SUCCESS);
}
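/*
 * A minimal sketch of the small-packet copy path described above: walk
 * the mblk chain, copy everything after any LSO headers (which are sent
 * separately) into the swqe's pre-registered buffer, and pad runt
 * frames out to ETHERMIN. bufp and pending_hdr stand in for the swqe's
 * buffer pointer and the LSO header bytes to skip.
 */
#if 0	/* illustrative sketch only */
	size_t skip = pending_hdr;
	size_t blksize, copied = 0;
	mblk_t *nmp;

	for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
		blksize = MBLKL(nmp);
		if (blksize <= skip) {
			skip -= blksize;	/* fragment is all header */
			continue;
		}
		bcopy(nmp->b_rptr + skip, bufp + copied, blksize - skip);
		copied += blksize - skip;
		skip = 0;
	}

	/* Pad short ethernet frames with zeros up to ETHERMIN */
	if (copied < ETHERMIN) {
		bzero(bufp + copied, ETHERMIN - copied);
		copied = ETHERMIN;
	}
#endif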
/*ARGSUSED*/
static int
{
/*
* If the dmac is the broadcast address, let the packet through.
* Otherwise, either we should be in promiscuous mode or the dmac
* should be in our list of joined multicast addresses. Currently we
* only update the stat counters and always let things through.
*/
else
return (1);
}
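/*
 * A minimal sketch of the acceptance check described above: broadcasts
 * always pass; otherwise the packet passes if we're in promiscuous mode
 * or the dmac is one of the multicast groups we've joined. The
 * vn_promisc flag and the group-lookup helper are placeholders; as the
 * comment above notes, the function currently lets everything through
 * and only maintains the counters.
 */
#if 0	/* illustrative sketch only */
	static uint8_t bcast[ETHERADDRL] =
	    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };

	if (bcmp(dmac, bcast, ETHERADDRL) == 0)
		return (1);			/* broadcast: always ok */

	if (vnic->vn_promisc)
		return (1);			/* promiscuous mode */

	return (eib_vnic_mcg_lookup(vnic, dmac) != NULL);
#endif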
static void
{
"cq_hdl(0x%llx) != chan->ch_cq_hdl(0x%llx), "
return;
}
}
static void
{
"cq_hdl(0x%llx) != chan->ch_cq_hdl(0x%llx), "
return;
}
}
static mblk_t *
{
/*
* Before we process this mblk and send it up to network layer, see
* if we're running low on rwqes in the wqe pool. If so, allocate a
* new mblk, copy the received data into it and send it up (and return
* the current rwqe back to the pool immediately by calling freemsg()
* on the original mblk).
*/
if (!eib_rsrc_rxpool_low(wqe)) {
} else {
wc->wc_bytes_xfer);
} else {
"wqe level below watermark, dropping rx pkt");
return (NULL);
}
}
/*
* Adjust write pointer depending on how much data came in. Note that
* since the nw layer will expect us to hand over the mp with the
* ethernet header starting at mp->b_rptr, update the b_rptr as well.
*/
/*
* We have a problem if this really happens!
*/
"received packet's b_next not NULL, possible dup from cq");
}
/*
* Drop loopback packets?
*/
goto data_rx_comp_fail;
}
/*
* Since the recv buffer has been aligned for IP header to start on
* a word boundary, it is safe to say that the EoIB and ethernet
* headers won't start on a word boundary.
*/
/*
* Check EoIB signature and version
*/
if (ec_sign != EIB_EH_SIGNATURE) {
"EoIB encapsulation header signature (0x%lx) unknown",
ec_sign);
goto data_rx_comp_fail;
}
if (ec_ver != EIB_EH_VERSION) {
"EoIB encapsulation header version (0x%lx) unknown",
ec_ver);
goto data_rx_comp_fail;
}
/*
* If the EoIB encapsulation header indicates that the IP and TCP/UDP
* checksums have already been verified, mark the mblk's hardware
* checksum status accordingly.
*/
(ec_ip_cs == EIB_EH_IPCSUM_OK)) {
}
/*
* Update the message block's b_rptr to the start of ethernet header
* and parse the header information
*/
/*
* If the incoming packet is vlan-tagged, but the tag doesn't match
* this vnic's vlan, drop it.
*/
"received packet's vlan unknown, expected=0x%x, got=0x%x",
goto data_rx_comp_fail;
}
/*
* Final checks to see if the unicast destination is indeed correct
* and to see if the multicast address is ok for us.
*/
"received packet's macaddr mismatch, "
"expected=%x:%x:%x:%x:%x:%x, got=%x:%x:%x:%x:%x:%x",
goto data_rx_comp_fail;
}
} else {
"multicast packet not ok");
goto data_rx_comp_fail;
}
}
/*
* Strip ethernet FCS if present in the packet. ConnectX-2 doesn't
* support ethernet FCS, so this shouldn't happen anyway.
*/
"ethernet FCS present (ec_hdr=0%lx), ignoring",
ec_hdr);
}
/*
* If this is the same mp as was in the original rwqe (i.e. we didn't
* do any allocb()), then mark the rwqe flag so we know that its mblk
* is with the network layer.
*/
if (!allocd_mp) {
}
return (mp);
return (NULL);
}
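/*
 * A minimal sketch of the "loan vs. copy" decision made at the top of
 * the function above: normally the rwqe's own mblk is loaned up to the
 * network layer, but when the wqe pool is running low the data is
 * copied into a freshly allocated mblk and the original is freed right
 * away so the rwqe goes straight back to the pool. pool_is_low and
 * rwqe_mp are placeholders for the pool-level check and the rwqe's
 * mblk.
 */
#if 0	/* illustrative sketch only */
	mblk_t *mp;

	if (!pool_is_low) {
		mp = rwqe_mp;			/* loan the rwqe's mblk */
	} else if ((mp = allocb(wc->wc_bytes_xfer, BPRI_HI)) != NULL) {
		bcopy(rwqe_mp->b_rptr, mp->b_rptr, wc->wc_bytes_xfer);
		freemsg(rwqe_mp);		/* rwqe returns to the pool */
	} else {
		freemsg(rwqe_mp);		/* no memory, drop the packet */
		return (NULL);
	}

	/* The nw layer expects the frame to span b_rptr..b_wptr */
	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;
#endif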
static void
{
wqe->qe_iov_hdl);
if (ret != IBT_SUCCESS) {
"eib_data_tx_comp: "
"ibt_unmap_mem_iov() failed, ret=%d", ret);
}
}
}
}
static void
{
/*
* Currently, all we do is report the error
*/
case IBT_WC_WR_FLUSHED_ERR:
break;
case IBT_WC_LOCAL_CHAN_OP_ERR:
"IBT_WC_LOCAL_CHAN_OP_ERR seen, wqe_info=0x%lx ",
break;
case IBT_WC_LOCAL_PROTECT_ERR:
"IBT_WC_LOCAL_PROTECT_ERR seen, wqe_info=0x%lx ",
break;
}
/*
* When a wc indicates error, we do not attempt to repost the
* rwqe but simply return it to the wqe pool. Also for rwqes,
* attempting to free the mblk in the wqe invokes the
* eib_data_rx_recycle() callback. For tx wqes, error handling
* is the same as successful completion handling: we still need
* to release any mappings or buffers held by the swqe and return
* the swqe to the pool.
*/
} else {
}
}
/*ARGSUSED*/
static void
{
return;
/*
* Reset any completion handlers we may have set up
*/
if (chan->ch_rcv_cq_hdl) {
}
}
/*
* Remove any softints that were added
*/
if (vnic->vn_data_rx_si_hdl) {
}
if (vnic->vn_data_tx_si_hdl) {
}
/*
* Release any work completion buffers we may have allocated
*/
}
chan->ch_rcv_cq_sz = 0;
}
/*
* Free any completion queues we may have allocated
*/
if (chan->ch_rcv_cq_hdl) {
if (ret != IBT_SUCCESS) {
"eib_rb_data_setup_cqs: "
"ibt_free_cq(rcv_cq) failed, ret=%d", ret);
}
}
if (ret != IBT_SUCCESS) {
"eib_rb_data_setup_cqs: "
"ibt_free_cq(snd_cq) failed, ret=%d", ret);
}
}
}
/*ARGSUSED*/
static void
{
return;
/*
* We're trying to tear down this UD channel. Make sure that
* we don't attempt to refill (repost) at any point from now on.
*/
"eib_rb_data_setup_ud_channel: "
"ibt_flush_channel() failed, ret=%d", ret);
}
/*
* Wait until all posted tx wqes on this channel are back with
* the wqe pool.
*/
while (chan->ch_tx_posted > 0)
/*
* Wait until all posted rx wqes on this channel are back with
* the wqe pool.
*/
while (chan->ch_rx_posted > 0)
/*
* Now we're ready to free this channel
*/
"eib_rb_data_setup_ud_channel: "
"ibt_free_channel() failed, ret=%d", ret);
}
chan->ch_ip_hdr_align = 0;
chan->ch_rwqe_bktsz = 0;
chan->ch_lwm_rwqes = 0;
chan->ch_max_rwqes = 0;
chan->ch_max_swqes = 0;
}
}
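/*
 * A minimal sketch of the teardown sequence described above: mark the
 * channel so nothing more gets reposted, flush the qp so every
 * outstanding wqe comes back (with a FLUSHED error) through the
 * completion handlers, wait for the posted counters to drain to zero,
 * and only then free the channel. The mutex and cv names are
 * placeholders for whatever the chan structure actually uses.
 */
#if 0	/* illustrative sketch only */
	chan->ch_tear_down = B_TRUE;

	if ((ret = ibt_flush_channel(chan->ch_chan)) != IBT_SUCCESS)
		cmn_err(CE_WARN, "ibt_flush_channel() failed, ret=%d", ret);

	mutex_enter(&chan->ch_tx_lock);
	while (chan->ch_tx_posted > 0)
		cv_wait(&chan->ch_tx_cv, &chan->ch_tx_lock);
	mutex_exit(&chan->ch_tx_lock);

	mutex_enter(&chan->ch_rx_lock);
	while (chan->ch_rx_posted > 0)
		cv_wait(&chan->ch_rx_cv, &chan->ch_rx_lock);
	mutex_exit(&chan->ch_rx_lock);

	if ((ret = ibt_free_channel(chan->ch_chan)) != IBT_SUCCESS)
		cmn_err(CE_WARN, "ibt_free_channel() failed, ret=%d", ret);
	chan->ch_chan = NULL;
#endif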