/* xnb.c, revision 843e19887f64dde75055cf8842fc4db2171eff45 */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#ifdef DEBUG
#define XNB_DEBUG 1
#endif /* DEBUG */
#include "xnb.h"
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/mac.h>
#include <sys/dlpi.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/pattr.h>
#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>
#include <xen/sys/xenbus_impl.h>
#include <xen/sys/xendev.h>
#include <sys/balloon_impl.h>
#include <sys/evtchn_impl.h>
#include <sys/gnttab.h>
#include <sys/gld.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <sys/vnic_impl.h> /* blech. */
/*
* The terms "transmit" and "receive" are used in their traditional
* sense here - packets from other parts of this system are
* "transmitted" to the peer domain and those originating from the
* peer are "received".
*
* In some cases this can be confusing, because various data
* structures are shared with the domU driver, which has the opposite
* view of what constitutes "transmit" and "receive". In naming the
* shared structures the domU driver always wins.
*/
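/*
 * Concretely: packets sent to the peer are placed on the shared "rx"
 * ring (x_rx_ring, netif_rx_* structures), while packets received
 * from the peer are taken from the shared "tx" ring (x_tx_ring,
 * netif_tx_* structures).
 */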
/*
* XXPV dme: things to do, as well as various things indicated
* throughout the source:
* - copy avoidance outbound.
* - copy avoidance inbound.
* - transfer credit limiting.
* - MAC address based filtering.
*/
/*
* Linux expects to have some headroom in received buffers. The Linux
* frontend driver (netfront) checks to see if the headroom is
* available and will re-allocate the buffer to make room if
* necessary. To avoid this we add TX_BUFFER_HEADROOM bytes of
* headroom to each packet we pass to the peer.
*/
#define TX_BUFFER_HEADROOM 16
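/*
 * Should checksum offload be negotiated with the peer?  This value
 * determines what is written to "feature-no-csum-offload" in
 * xnb_attach() and may be forced to B_FALSE in xnb_connect_rings()
 * if the peer asks for no offload.
 */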
static boolean_t xnb_cksum_offload = B_TRUE;
static boolean_t xnb_connect_rings(dev_info_t *);
static void xnb_disconnect_rings(dev_info_t *);
static void xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
void *, void *);
static void xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
void *, void *);
static int xnb_rxbuf_constructor(void *, void *, int);
static void xnb_rxbuf_destructor(void *, void *);
static xnb_rxbuf_t *xnb_rxbuf_get(xnb_t *, int);
static void xnb_rxbuf_put(xnb_t *, xnb_rxbuf_t *);
static void xnb_rx_notify_peer(xnb_t *);
static void xnb_rx_complete(xnb_rxbuf_t *);
static void xnb_rx_mark_complete(xnb_t *, RING_IDX, int16_t);
static void xnb_rx_schedule_unmop(xnb_t *, gnttab_map_grant_ref_t *);
static void xnb_rx_perform_pending_unmop(xnb_t *);
#ifdef XNB_DEBUG
#define NR_GRANT_ENTRIES \
(NR_GRANT_FRAMES * PAGESIZE / sizeof (grant_entry_t))
#endif /* XNB_DEBUG */
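/*
 * NR_GRANT_ENTRIES (debug builds only) is the total number of grant
 * table entries available to the domain; it is used solely to
 * sanity-check grant references supplied by the peer.
 */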
/* XXPV dme: are these really invalid? */
#define INVALID_GRANT_HANDLE ((grant_handle_t)-1)
#define INVALID_GRANT_REF ((grant_ref_t)-1)
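/*
 * xnb_rxbuf_cachep caches the xnb_rxbuf_t wrappers used on the
 * receive (from peer) path; xnb_alloc_page_lock protects the static
 * batch of ballooned pages shared by all instances in
 * xnb_alloc_page().
 */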
static kmem_cache_t *xnb_rxbuf_cachep;
static kmutex_t xnb_alloc_page_lock;
/*
* Statistics.
*/
static char *aux_statistics[] = {
"tx_cksum_deferred",
"rx_cksum_no_need",
"tx_notify_deferred",
"tx_notify_sent",
"rx_notify_deferred",
"rx_notify_sent",
"tx_too_early",
"rx_too_early",
"rx_allocb_failed",
"mac_full",
"spurious_intr",
"allocation_success",
"allocation_failure",
"small_allocation_success",
"small_allocation_failure",
"csum_hardware",
"csum_software",
};
static int
xnb_ks_aux_update(kstat_t *ksp, int flag)
{
xnb_t *xnbp;
kstat_named_t *knp;
if (flag != KSTAT_READ)
return (EACCES);
xnbp = ksp->ks_private;
knp = ksp->ks_data;
/*
* Assignment order should match that of the names in
* aux_statistics.
*/
(knp++)->value.ui64 = xnbp->x_stat_tx_cksum_deferred;
(knp++)->value.ui64 = xnbp->x_stat_rx_cksum_no_need;
(knp++)->value.ui64 = xnbp->x_stat_tx_notify_deferred;
(knp++)->value.ui64 = xnbp->x_stat_tx_notify_sent;
(knp++)->value.ui64 = xnbp->x_stat_rx_notify_deferred;
(knp++)->value.ui64 = xnbp->x_stat_rx_notify_sent;
(knp++)->value.ui64 = xnbp->x_stat_tx_too_early;
(knp++)->value.ui64 = xnbp->x_stat_rx_too_early;
(knp++)->value.ui64 = xnbp->x_stat_rx_allocb_failed;
(knp++)->value.ui64 = xnbp->x_stat_mac_full;
(knp++)->value.ui64 = xnbp->x_stat_spurious_intr;
(knp++)->value.ui64 = xnbp->x_stat_allocation_success;
(knp++)->value.ui64 = xnbp->x_stat_allocation_failure;
(knp++)->value.ui64 = xnbp->x_stat_small_allocation_success;
(knp++)->value.ui64 = xnbp->x_stat_small_allocation_failure;
(knp++)->value.ui64 = xnbp->x_stat_csum_hardware;
(knp++)->value.ui64 = xnbp->x_stat_csum_software;
return (0);
}
static boolean_t
xnb_ks_init(xnb_t *xnbp)
{
int nstat = sizeof (aux_statistics) /
sizeof (aux_statistics[0]);
char **cp = aux_statistics;
kstat_named_t *knp;
/*
* Create and initialise kstats.
*/
xnbp->x_kstat_aux = kstat_create(ddi_driver_name(xnbp->x_devinfo),
ddi_get_instance(xnbp->x_devinfo), "aux_statistics", "net",
KSTAT_TYPE_NAMED, nstat, 0);
if (xnbp->x_kstat_aux == NULL)
return (B_FALSE);
xnbp->x_kstat_aux->ks_private = xnbp;
xnbp->x_kstat_aux->ks_update = xnb_ks_aux_update;
knp = xnbp->x_kstat_aux->ks_data;
while (nstat > 0) {
kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
knp++;
cp++;
nstat--;
}
kstat_install(xnbp->x_kstat_aux);
return (B_TRUE);
}
static void
xnb_ks_free(xnb_t *xnbp)
{
kstat_delete(xnbp->x_kstat_aux);
}
/*
* Software checksum calculation and insertion for an arbitrary packet.
*/
/*ARGSUSED*/
static mblk_t *
xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
{
/*
* XXPV dme: shouldn't rely on vnic_fix_cksum(), not least
* because it doesn't cover all of the interesting cases :-(
*/
(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
HCK_FULLCKSUM, KM_NOSLEEP);
return (vnic_fix_cksum(mp));
}
mblk_t *
xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
{
struct ether_header *ehp;
uint16_t sap;
uint32_t offset;
ipha_t *ipha;
ASSERT(mp->b_next == NULL);
/*
* Check that the packet is contained in a single mblk. In
* the "from peer" path this is true today, but will change
* when scatter gather support is added. In the "to peer"
* path we cannot be sure, but in most cases it will be true
* (in the xnbo case the packet has come from a MAC device
* which is unlikely to split packets).
*/
if (mp->b_cont != NULL)
goto software;
/*
* If the MAC has no hardware capability don't do any further
* checking.
*/
if (capab == 0)
goto software;
ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
ehp = (struct ether_header *)mp->b_rptr;
if (ntohs(ehp->ether_type) == VLAN_TPID) {
struct ether_vlan_header *evhp;
ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
evhp = (struct ether_vlan_header *)mp->b_rptr;
sap = ntohs(evhp->ether_type);
offset = sizeof (struct ether_vlan_header);
} else {
sap = ntohs(ehp->ether_type);
offset = sizeof (struct ether_header);
}
/*
* We only attempt to do IPv4 packets in hardware.
*/
if (sap != ETHERTYPE_IP)
goto software;
/*
* We know that this is an IPv4 packet.
*/
ipha = (ipha_t *)(mp->b_rptr + offset);
switch (ipha->ipha_protocol) {
case IPPROTO_TCP:
case IPPROTO_UDP:
/*
* This is a TCP/IPv4 or UDP/IPv4 packet.
*
* If the capabilities indicate that full checksum
* offload is available, use it.
*/
if ((capab & HCKSUM_INET_FULL_V4) != 0) {
(void) hcksum_assoc(mp, NULL, NULL,
0, 0, 0, 0,
HCK_FULLCKSUM, KM_NOSLEEP);
xnbp->x_stat_csum_hardware++;
return (mp);
}
/*
* XXPV dme: If the capabilities indicate that partial
* checksum offload is available, we should use it.
*/
break;
default:
/* Use software. */
break;
}
software:
/*
* We are not able to use any offload so do the whole thing in
* software.
*/
xnbp->x_stat_csum_software++;
return (xnb_software_csum(xnbp, mp));
}
int
xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
{
xnb_t *xnbp;
char *xsname, mac[ETHERADDRL * 3];
xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
xnbp->x_flavour = flavour;
xnbp->x_flavour_data = flavour_data;
xnbp->x_devinfo = dip;
xnbp->x_evtchn = INVALID_EVTCHN;
xnbp->x_irq = B_FALSE;
xnbp->x_tx_ring_handle = INVALID_GRANT_HANDLE;
xnbp->x_rx_ring_handle = INVALID_GRANT_HANDLE;
xnbp->x_cksum_offload = xnb_cksum_offload;
xnbp->x_connected = B_FALSE;
xnbp->x_hotplugged = B_FALSE;
xnbp->x_detachable = B_FALSE;
xnbp->x_peer = xvdi_get_oeid(dip);
xnbp->x_rx_pages_writable = B_FALSE;
xnbp->x_rx_buf_count = 0;
xnbp->x_rx_unmop_count = 0;
xnbp->x_tx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
ASSERT(xnbp->x_tx_va != NULL);
if (ddi_get_iblock_cookie(dip, 0, &xnbp->x_icookie)
!= DDI_SUCCESS)
goto failure;
mutex_init(&xnbp->x_tx_lock, NULL, MUTEX_DRIVER, xnbp->x_icookie);
mutex_init(&xnbp->x_rx_lock, NULL, MUTEX_DRIVER, xnbp->x_icookie);
/* set driver private pointer now */
ddi_set_driver_private(dip, xnbp);
if (!xnb_ks_init(xnbp))
goto late_failure;
/*
* Receive notification of changes in the state of the
* driver in the guest domain.
*/
if (xvdi_add_event_handler(dip, XS_OE_STATE,
xnb_oe_state_change) != DDI_SUCCESS)
goto very_late_failure;
/*
* Receive notification of hotplug events.
*/
if (xvdi_add_event_handler(dip, XS_HP_STATE,
xnb_hp_state_change) != DDI_SUCCESS)
goto very_late_failure;
xsname = xvdi_get_xsname(dip);
if (xenbus_printf(XBT_NULL, xsname,
"feature-no-csum-offload", "%d",
xnbp->x_cksum_offload ? 0 : 1) != 0)
goto very_very_late_failure;
if (xenbus_scanf(XBT_NULL, xsname,
"mac", "%s", mac) != 0) {
cmn_err(CE_WARN, "xnb_attach: "
"cannot read mac address from %s",
xsname);
goto very_very_late_failure;
}
if (ether_aton(mac, xnbp->x_mac_addr) != ETHERADDRL) {
cmn_err(CE_WARN,
"xnb_attach: cannot parse mac address %s",
mac);
goto very_very_late_failure;
}
(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
(void) xvdi_post_event(dip, XEN_HP_ADD);
return (DDI_SUCCESS);
very_very_late_failure: /* not that the naming is getting silly or anything */
xvdi_remove_event_handler(dip, NULL);
very_late_failure:
xnb_ks_free(xnbp);
late_failure:
mutex_destroy(&xnbp->x_rx_lock);
mutex_destroy(&xnbp->x_tx_lock);
failure:
vmem_free(heap_arena, xnbp->x_tx_va, PAGESIZE);
kmem_free(xnbp, sizeof (*xnbp));
return (DDI_FAILURE);
}
/*ARGSUSED*/
void
xnb_detach(dev_info_t *dip)
{
xnb_t *xnbp = ddi_get_driver_private(dip);
ASSERT(xnbp != NULL);
ASSERT(!xnbp->x_connected);
ASSERT(xnbp->x_rx_buf_count == 0);
xnb_disconnect_rings(dip);
xvdi_remove_event_handler(dip, NULL);
xnb_ks_free(xnbp);
ddi_set_driver_private(dip, NULL);
mutex_destroy(&xnbp->x_tx_lock);
mutex_destroy(&xnbp->x_rx_lock);
ASSERT(xnbp->x_tx_va != NULL);
vmem_free(heap_arena, xnbp->x_tx_va, PAGESIZE);
kmem_free(xnbp, sizeof (*xnbp));
}
static mfn_t
xnb_alloc_page(xnb_t *xnbp)
{
#define WARNING_RATE_LIMIT 100
#define BATCH_SIZE 256
static mfn_t mfns[BATCH_SIZE]; /* common across all instances */
static int nth = BATCH_SIZE;
mfn_t mfn;
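	/*
	 * Pages for transfer to the peer are taken from the balloon in
	 * batches of BATCH_SIZE to reduce the number of hypercalls.
	 * The static mfns[]/nth pair is shared by every xnb instance
	 * and is protected by xnb_alloc_page_lock.
	 */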
mutex_enter(&xnb_alloc_page_lock);
if (nth == BATCH_SIZE) {
if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
xnbp->x_stat_allocation_failure++;
mutex_exit(&xnb_alloc_page_lock);
/*
* Try for a single page in low memory situations.
*/
if (balloon_alloc_pages(1, &mfn) != 1) {
xnbp->x_stat_small_allocation_failure++;
if ((xnbp->x_stat_small_allocation_failure
% WARNING_RATE_LIMIT) == 0) {
cmn_err(CE_WARN, "xnb_alloc_page: "
"Cannot allocate memory to "
"transfer packets to peer.");
}
return (0);
} else {
xnbp->x_stat_small_allocation_success++;
return (mfn);
}
}
nth = 0;
xnbp->x_stat_allocation_success++;
}
mfn = mfns[nth++];
mutex_exit(&xnb_alloc_page_lock);
ASSERT(mfn != 0);
return (mfn);
#undef BATCH_SIZE
#undef WARNING_RATE_LIMIT
}
/*ARGSUSED*/
static void
xnb_free_page(xnb_t *xnbp, mfn_t mfn)
{
int r;
/*
* This happens only in the error path, so batching is
* not worth the complication.
*/
if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
cmn_err(CE_WARN, "free_page: cannot decrease memory "
"reservation (%d): page kept but unusable (mfn = 0x%lx).",
r, mfn);
}
}
mblk_t *
xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
{
mblk_t *free = mp, *prev = NULL;
size_t len;
gnttab_transfer_t *gop;
boolean_t notify;
RING_IDX loop, prod, end;
/*
* For each packet the sequence of operations is:
*
* 1. get a new page from the hypervisor.
* 2. get a request slot from the ring.
* 3. copy the data into the new page.
* 4. transfer the page to the peer.
* 5. update the request slot.
* 6. kick the peer.
* 7. free mp.
*
* In order to reduce the number of hypercalls, we prepare
* several packets for the peer and perform a single hypercall
* to transfer them.
*/
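	/*
	 * The steps above are tagged with "N" comments below.  Steps
	 * 1-3 and the first half of step 5 happen per packet inside
	 * the loop; the actual GNTTABOP_transfer hypercall (step 4) is
	 * issued once for the whole batch, after which the response
	 * status (5.2) is filled in and the peer is notified (step 6).
	 */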
mutex_enter(&xnbp->x_tx_lock);
/*
* If we are not connected to the peer or have not yet
* finished hotplug it is too early to pass packets to the
* peer.
*/
if (!(xnbp->x_connected && xnbp->x_hotplugged)) {
mutex_exit(&xnbp->x_tx_lock);
xnbp->x_stat_tx_too_early++;
return (mp);
}
loop = xnbp->x_rx_ring.req_cons;
prod = xnbp->x_rx_ring.rsp_prod_pvt;
gop = xnbp->x_tx_top;
/*
* Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->x_rx_ring) but
* using local variables.
*/
#define XNB_RING_HAS_UNCONSUMED_REQUESTS(_r) \
((((_r)->sring->req_prod - loop) < \
(RING_SIZE(_r) - (loop - prod))) ? \
((_r)->sring->req_prod - loop) : \
(RING_SIZE(_r) - (loop - prod)))
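	/*
	 * The macro evaluates to the smaller of the number of requests
	 * the peer has posted that we have not yet consumed
	 * (req_prod - loop) and the number of free response slots
	 * remaining (RING_SIZE - (loop - prod)), using the local
	 * copies of req_cons and rsp_prod_pvt.
	 */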
while ((mp != NULL) &&
XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->x_rx_ring)) {
mfn_t mfn;
pfn_t pfn;
netif_rx_request_t *rxreq;
netif_rx_response_t *rxresp;
char *valoop;
size_t offset;
mblk_t *ml;
uint16_t cksum_flags;
/* 1 */
if ((mfn = xnb_alloc_page(xnbp)) == 0) {
xnbp->x_stat_xmit_defer++;
break;
}
/* 2 */
rxreq = RING_GET_REQUEST(&xnbp->x_rx_ring, loop);
#ifdef XNB_DEBUG
if (!(rxreq->id < NET_RX_RING_SIZE))
cmn_err(CE_PANIC, "xnb_to_peer: "
"id %d out of range in request 0x%p",
rxreq->id, (void *)rxreq);
if (rxreq->gref >= NR_GRANT_ENTRIES)
cmn_err(CE_PANIC, "xnb_to_peer: "
"grant ref %d out of range in request 0x%p",
rxreq->gref, (void *)rxreq);
#endif /* XNB_DEBUG */
/* Assign a pfn and map the new page at the allocated va. */
pfn = xen_assign_pfn(mfn);
hat_devload(kas.a_hat, xnbp->x_tx_va, PAGESIZE,
pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
offset = TX_BUFFER_HEADROOM;
/* 3 */
len = 0;
valoop = xnbp->x_tx_va + offset;
for (ml = mp; ml != NULL; ml = ml->b_cont) {
size_t chunk = ml->b_wptr - ml->b_rptr;
bcopy(ml->b_rptr, valoop, chunk);
valoop += chunk;
len += chunk;
}
ASSERT(len + offset < PAGESIZE);
/* Release the pfn. */
hat_unload(kas.a_hat, xnbp->x_tx_va, PAGESIZE,
HAT_UNLOAD_UNMAP);
xen_release_pfn(pfn);
/* 4 */
gop->mfn = mfn;
gop->domid = xnbp->x_peer;
gop->ref = rxreq->gref;
/* 5.1 */
rxresp = RING_GET_RESPONSE(&xnbp->x_rx_ring, prod);
rxresp->offset = offset;
rxresp->flags = 0;
cksum_flags = xnbp->x_flavour->xf_cksum_to_peer(xnbp, mp);
if (cksum_flags != 0)
xnbp->x_stat_tx_cksum_deferred++;
rxresp->flags |= cksum_flags;
rxresp->id = RING_GET_REQUEST(&xnbp->x_rx_ring, prod)->id;
rxresp->status = len;
loop++;
prod++;
gop++;
prev = mp;
mp = mp->b_next;
}
/*
* Did we actually do anything?
*/
if (loop == xnbp->x_rx_ring.req_cons) {
mutex_exit(&xnbp->x_tx_lock);
return (mp);
}
end = loop;
/*
* Unlink the end of the 'done' list from the remainder.
*/
ASSERT(prev != NULL);
prev->b_next = NULL;
if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->x_tx_top,
loop - xnbp->x_rx_ring.req_cons) != 0) {
cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
}
loop = xnbp->x_rx_ring.req_cons;
prod = xnbp->x_rx_ring.rsp_prod_pvt;
gop = xnbp->x_tx_top;
while (loop < end) {
int16_t status = NETIF_RSP_OKAY;
if (gop->status != 0) {
status = NETIF_RSP_ERROR;
/*
* If the status is anything other than
* GNTST_bad_page then we don't own the page
* any more, so don't try to give it back.
*/
if (gop->status != GNTST_bad_page)
gop->mfn = 0;
} else {
/* The page is no longer ours. */
gop->mfn = 0;
}
if (gop->mfn != 0)
/*
* Give back the page, as we won't be using
* it.
*/
xnb_free_page(xnbp, gop->mfn);
else
/*
* We gave away a page, update our accounting
* now.
*/
balloon_drv_subtracted(1);
/* 5.2 */
if (status != NETIF_RSP_OKAY) {
RING_GET_RESPONSE(&xnbp->x_rx_ring, prod)->status =
status;
} else {
xnbp->x_stat_opackets++;
xnbp->x_stat_obytes += len;
}
loop++;
prod++;
gop++;
}
xnbp->x_rx_ring.req_cons = loop;
xnbp->x_rx_ring.rsp_prod_pvt = prod;
/* 6 */
/*LINTED: constant in conditional context*/
RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->x_rx_ring, notify);
if (notify) {
ec_notify_via_evtchn(xnbp->x_evtchn);
xnbp->x_stat_tx_notify_sent++;
} else {
xnbp->x_stat_tx_notify_deferred++;
}
if (mp != NULL)
xnbp->x_stat_xmit_defer++;
mutex_exit(&xnbp->x_tx_lock);
/* Free mblk_t's that we consumed. */
freemsgchain(free);
return (mp);
}
/*ARGSUSED*/
static int
xnb_rxbuf_constructor(void *buf, void *arg, int kmflag)
{
xnb_rxbuf_t *rxp = buf;
bzero(rxp, sizeof (*rxp));
rxp->xr_free_rtn.free_func = xnb_rx_complete;
rxp->xr_free_rtn.free_arg = (caddr_t)rxp;
rxp->xr_mop.host_addr =
(uint64_t)(uintptr_t)vmem_alloc(heap_arena, PAGESIZE,
((kmflag & KM_NOSLEEP) == KM_NOSLEEP) ?
VM_NOSLEEP : VM_SLEEP);
if (rxp->xr_mop.host_addr == NULL) {
cmn_err(CE_WARN, "xnb_rxbuf_constructor: "
"cannot get address space");
return (-1);
}
/*
 * Have the hat ensure that a page table exists for the VA.
*/
hat_prepare_mapping(kas.a_hat,
(caddr_t)(uintptr_t)rxp->xr_mop.host_addr);
return (0);
}
/*ARGSUSED*/
static void
xnb_rxbuf_destructor(void *buf, void *arg)
{
xnb_rxbuf_t *rxp = buf;
ASSERT(rxp->xr_mop.host_addr != NULL);
ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == 0);
hat_release_mapping(kas.a_hat,
(caddr_t)(uintptr_t)rxp->xr_mop.host_addr);
vmem_free(heap_arena,
(caddr_t)(uintptr_t)rxp->xr_mop.host_addr, PAGESIZE);
}
static void
xnb_rx_notify_peer(xnb_t *xnbp)
{
boolean_t notify;
ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
/*LINTED: constant in conditional context*/
RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->x_tx_ring, notify);
if (notify) {
ec_notify_via_evtchn(xnbp->x_evtchn);
xnbp->x_stat_rx_notify_sent++;
} else {
xnbp->x_stat_rx_notify_deferred++;
}
}
static void
xnb_rx_complete(xnb_rxbuf_t *rxp)
{
xnb_t *xnbp = rxp->xr_xnbp;
ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE);
mutex_enter(&xnbp->x_rx_lock);
xnb_rx_schedule_unmop(xnbp, &rxp->xr_mop);
xnb_rx_perform_pending_unmop(xnbp);
if (xnbp->x_connected) {
xnb_rx_mark_complete(xnbp, rxp->xr_id, rxp->xr_status);
xnb_rx_notify_peer(xnbp);
}
xnb_rxbuf_put(xnbp, rxp);
mutex_exit(&xnbp->x_rx_lock);
}
static void
xnb_rx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
{
RING_IDX i;
netif_tx_response_t *txresp;
ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
i = xnbp->x_tx_ring.rsp_prod_pvt;
txresp = RING_GET_RESPONSE(&xnbp->x_tx_ring, i);
txresp->id = id;
txresp->status = status;
xnbp->x_tx_ring.rsp_prod_pvt = i + 1;
/*
* Note that we don't push the change to the peer here - that
 * is the caller's responsibility.
*/
}
/*
* XXPV dme: currently pending unmap operations are stored on a
* per-instance basis. Should they be per-driver? The locking would
* have to change (obviously), but there might be an improvement from
* batching more together. Right now they are all 'done' either at
* the tail of each receive operation (copy case) or on each
* completion (non-copy case). Should that be changed to some
* interval (watermark?) to improve the chance of batching?
*/
static void
xnb_rx_schedule_unmop(xnb_t *xnbp, gnttab_map_grant_ref_t *mop)
{
gnttab_unmap_grant_ref_t *unmop;
ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
ASSERT(xnbp->x_rx_unmop_count <= NET_TX_RING_SIZE);
unmop = &xnbp->x_rx_unmop[xnbp->x_rx_unmop_count];
xnbp->x_rx_unmop_count++;
unmop->host_addr = mop->host_addr;
unmop->dev_bus_addr = mop->dev_bus_addr;
unmop->handle = mop->handle;
#ifdef XNB_DEBUG
if (xnbp->x_rx_unmop_count <= NET_TX_RING_SIZE)
ASSERT(xnbp->x_rx_unmop[xnbp->x_rx_unmop_count].host_addr
== NULL);
#endif /* XNB_DEBUG */
}
static void
xnb_rx_perform_pending_unmop(xnb_t *xnbp)
{
#ifdef XNB_DEBUG
RING_IDX loop;
gnttab_unmap_grant_ref_t *unmop;
#endif /* XNB_DEBUG */
ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
if (xnbp->x_rx_unmop_count == 0)
return;
if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
xnbp->x_rx_unmop, xnbp->x_rx_unmop_count) < 0) {
cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: "
"unmap grant operation failed, "
"%d pages lost", xnbp->x_rx_unmop_count);
}
#ifdef XNB_DEBUG
for (loop = 0, unmop = xnbp->x_rx_unmop;
loop < xnbp->x_rx_unmop_count;
loop++, unmop++) {
if (unmop->status != 0) {
cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: "
"unmap grant reference failed (%d)",
unmop->status);
}
}
#endif /* XNB_DEBUG */
xnbp->x_rx_unmop_count = 0;
#ifdef XNB_DEBUG
bzero(xnbp->x_rx_unmop, sizeof (xnbp->x_rx_unmop));
#endif /* XNB_DEBUG */
}
static xnb_rxbuf_t *
xnb_rxbuf_get(xnb_t *xnbp, int flags)
{
xnb_rxbuf_t *rxp;
ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
rxp = kmem_cache_alloc(xnb_rxbuf_cachep, flags);
if (rxp != NULL) {
ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == 0);
rxp->xr_flags |= XNB_RXBUF_INUSE;
rxp->xr_xnbp = xnbp;
rxp->xr_mop.dom = xnbp->x_peer;
rxp->xr_mop.flags = GNTMAP_host_map;
if (!xnbp->x_rx_pages_writable)
rxp->xr_mop.flags |= GNTMAP_readonly;
xnbp->x_rx_buf_count++;
}
return (rxp);
}
static void
xnb_rxbuf_put(xnb_t *xnbp, xnb_rxbuf_t *rxp)
{
ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE);
rxp->xr_flags &= ~XNB_RXBUF_INUSE;
xnbp->x_rx_buf_count--;
kmem_cache_free(xnb_rxbuf_cachep, rxp);
}
static mblk_t *
xnb_recv(xnb_t *xnbp)
{
RING_IDX start, end, loop;
gnttab_map_grant_ref_t *mop;
xnb_rxbuf_t **rxpp;
netif_tx_request_t *txreq;
boolean_t work_to_do;
mblk_t *head, *tail;
/*
* If the peer granted a read-only mapping to the page then we
* must copy the data, as the local protocol stack (should the
* packet be destined for this host) will modify the packet
* 'in place'.
*/
boolean_t copy = !xnbp->x_rx_pages_writable;
/*
* For each individual request, the sequence of actions is:
*
* 1. get the request.
* 2. map the page based on the grant ref.
* 3. allocate an mblk, copy the data to it.
* 4. release the grant.
* 5. update the ring.
* 6. pass the packet upward.
* 7. kick the peer.
*
* In fact, we try to perform the grant operations in batches,
* so there are two loops.
*/
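	/*
	 * Note that unmapping of granted pages is deferred: in the
	 * copy case the unmap operations are queued as each request is
	 * completed and performed at the end of each pass; in the
	 * loaned (desballoc) case they are queued from
	 * xnb_rx_complete() when the upper layers free the mblk.
	 */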
head = tail = NULL;
around:
ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
/*LINTED: constant in conditional context*/
RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->x_tx_ring, work_to_do);
if (!work_to_do) {
finished:
xnb_rx_notify_peer(xnbp);
return (head);
}
start = xnbp->x_tx_ring.req_cons;
end = xnbp->x_tx_ring.sring->req_prod;
for (loop = start, mop = xnbp->x_rx_mop, rxpp = xnbp->x_rx_bufp;
loop != end;
loop++, mop++, rxpp++) {
xnb_rxbuf_t *rxp;
rxp = xnb_rxbuf_get(xnbp, KM_NOSLEEP);
if (rxp == NULL)
break;
ASSERT(xnbp->x_rx_pages_writable ||
((rxp->xr_mop.flags & GNTMAP_readonly)
== GNTMAP_readonly));
rxp->xr_mop.ref =
RING_GET_REQUEST(&xnbp->x_tx_ring, loop)->gref;
ASSERT(rxp->xr_mop.ref < NR_GRANT_ENTRIES);
*mop = rxp->xr_mop;
*rxpp = rxp;
}
if ((loop - start) == 0)
goto finished;
end = loop;
if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
xnbp->x_rx_mop, end - start) != 0) {
cmn_err(CE_WARN, "xnb_recv: map grant operation failed");
loop = start;
rxpp = xnbp->x_rx_bufp;
while (loop != end) {
xnb_rxbuf_put(xnbp, *rxpp);
loop++;
rxpp++;
}
goto finished;
}
for (loop = start, mop = xnbp->x_rx_mop, rxpp = xnbp->x_rx_bufp;
loop != end;
loop++, mop++, rxpp++) {
mblk_t *mp = NULL;
int16_t status = NETIF_RSP_OKAY;
xnb_rxbuf_t *rxp = *rxpp;
if (mop->status != 0) {
cmn_err(CE_WARN, "xnb_recv: "
"failed to map buffer: %d",
mop->status);
status = NETIF_RSP_ERROR;
}
txreq = RING_GET_REQUEST(&xnbp->x_tx_ring, loop);
if (status == NETIF_RSP_OKAY) {
if (copy) {
mp = allocb(txreq->size, BPRI_MED);
if (mp == NULL) {
status = NETIF_RSP_ERROR;
xnbp->x_stat_rx_allocb_failed++;
} else {
bcopy((caddr_t)(uintptr_t)
mop->host_addr + txreq->offset,
mp->b_wptr, txreq->size);
mp->b_wptr += txreq->size;
}
} else {
mp = desballoc((unsigned char *)(uintptr_t)
mop->host_addr + txreq->offset,
txreq->size, 0, &rxp->xr_free_rtn);
if (mp == NULL) {
status = NETIF_RSP_ERROR;
xnbp->x_stat_rx_allocb_failed++;
} else {
rxp->xr_id = txreq->id;
rxp->xr_status = status;
rxp->xr_mop = *mop;
mp->b_wptr += txreq->size;
}
}
/*
* If we have a buffer and there are checksum
* flags, process them appropriately.
*/
if ((mp != NULL) &&
((txreq->flags &
(NETTXF_csum_blank | NETTXF_data_validated))
!= 0)) {
mp = xnbp->x_flavour->xf_cksum_from_peer(xnbp,
mp, txreq->flags);
xnbp->x_stat_rx_cksum_no_need++;
}
}
if (copy || (mp == NULL)) {
xnb_rx_mark_complete(xnbp, txreq->id, status);
xnb_rx_schedule_unmop(xnbp, mop);
}
if (mp != NULL) {
xnbp->x_stat_ipackets++;
xnbp->x_stat_rbytes += txreq->size;
mp->b_next = NULL;
if (head == NULL) {
ASSERT(tail == NULL);
head = mp;
} else {
ASSERT(tail != NULL);
tail->b_next = mp;
}
tail = mp;
}
}
/*
* This has to be here rather than in the 'finished' code
* because we can only handle NET_TX_RING_SIZE pending unmap
* operations, which may be exceeded by multiple trips around
* the receive loop during heavy load (one trip around the
* loop cannot generate more than NET_TX_RING_SIZE unmap
* operations).
*/
xnb_rx_perform_pending_unmop(xnbp);
if (copy) {
for (loop = start, rxpp = xnbp->x_rx_bufp;
loop != end;
loop++, rxpp++)
xnb_rxbuf_put(xnbp, *rxpp);
}
xnbp->x_tx_ring.req_cons = loop;
goto around;
/* NOTREACHED */
}
/*
* intr() -- ring interrupt service routine
*/
static uint_t
xnb_intr(caddr_t arg)
{
xnb_t *xnbp = (xnb_t *)arg;
mblk_t *mp;
xnbp->x_stat_intr++;
mutex_enter(&xnbp->x_rx_lock);
ASSERT(xnbp->x_connected);
mp = xnb_recv(xnbp);
mutex_exit(&xnbp->x_rx_lock);
if (!xnbp->x_hotplugged) {
xnbp->x_stat_rx_too_early++;
goto fail;
}
if (mp == NULL) {
xnbp->x_stat_spurious_intr++;
goto fail;
}
xnbp->x_flavour->xf_recv(xnbp, mp);
return (DDI_INTR_CLAIMED);
fail:
freemsgchain(mp);
return (DDI_INTR_CLAIMED);
}
static boolean_t
xnb_connect_rings(dev_info_t *dip)
{
xnb_t *xnbp = ddi_get_driver_private(dip);
char *oename;
struct gnttab_map_grant_ref map_op;
evtchn_port_t evtchn;
int i;
/*
* Cannot attempt to connect the rings if already connected.
*/
ASSERT(!xnbp->x_connected);
oename = xvdi_get_oename(dip);
if (xenbus_gather(XBT_NULL, oename,
"event-channel", "%u", &evtchn,
"tx-ring-ref", "%lu", &xnbp->x_tx_ring_ref,
"rx-ring-ref", "%lu", &xnbp->x_rx_ring_ref,
NULL) != 0) {
cmn_err(CE_WARN, "xnb_connect_rings: "
"cannot read other-end details from %s",
oename);
goto fail;
}
if (xenbus_scanf(XBT_NULL, oename,
"feature-tx-writable", "%d", &i) != 0)
i = 0;
if (i != 0)
xnbp->x_rx_pages_writable = B_TRUE;
if (xenbus_scanf(XBT_NULL, oename,
"feature-no-csum-offload", "%d", &i) != 0)
i = 0;
if ((i == 1) || !xnbp->x_cksum_offload)
xnbp->x_cksum_offload = B_FALSE;
/*
* 1. allocate a vaddr for the tx page, one for the rx page.
* 2. call GNTTABOP_map_grant_ref to map the relevant pages
* into the allocated vaddr (one for tx, one for rx).
* 3. call EVTCHNOP_bind_interdomain to have the event channel
* bound to this domain.
* 4. associate the event channel with an interrupt.
* 5. declare ourselves connected.
* 6. enable the interrupt.
*/
/* 1.tx */
xnbp->x_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
0, 0, 0, 0, VM_SLEEP);
ASSERT(xnbp->x_tx_ring_addr != NULL);
/* 2.tx */
map_op.host_addr = (uint64_t)((long)xnbp->x_tx_ring_addr);
map_op.flags = GNTMAP_host_map;
map_op.ref = xnbp->x_tx_ring_ref;
map_op.dom = xnbp->x_peer;
hat_prepare_mapping(kas.a_hat, xnbp->x_tx_ring_addr);
if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
&map_op, 1) != 0 || map_op.status != 0) {
cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
goto fail;
}
xnbp->x_tx_ring_handle = map_op.handle;
/*LINTED: constant in conditional context*/
BACK_RING_INIT(&xnbp->x_tx_ring,
(netif_tx_sring_t *)xnbp->x_tx_ring_addr, PAGESIZE);
/* 1.rx */
xnbp->x_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
0, 0, 0, 0, VM_SLEEP);
ASSERT(xnbp->x_rx_ring_addr != NULL);
/* 2.rx */
map_op.host_addr = (uint64_t)((long)xnbp->x_rx_ring_addr);
map_op.flags = GNTMAP_host_map;
map_op.ref = xnbp->x_rx_ring_ref;
map_op.dom = xnbp->x_peer;
hat_prepare_mapping(kas.a_hat, xnbp->x_rx_ring_addr);
if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
&map_op, 1) != 0 || map_op.status != 0) {
cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
goto fail;
}
xnbp->x_rx_ring_handle = map_op.handle;
/*LINTED: constant in conditional context*/
BACK_RING_INIT(&xnbp->x_rx_ring,
(netif_rx_sring_t *)xnbp->x_rx_ring_addr, PAGESIZE);
/* 3 */
if (xvdi_bind_evtchn(dip, evtchn) != DDI_SUCCESS) {
cmn_err(CE_WARN, "xnb_connect_rings: "
"cannot bind event channel %d", xnbp->x_evtchn);
xnbp->x_evtchn = INVALID_EVTCHN;
goto fail;
}
xnbp->x_evtchn = xvdi_get_evtchn(dip);
/*
* It would be good to set the state to XenbusStateConnected
* here as well, but then what if ddi_add_intr() failed?
* Changing the state in the store will be noticed by the peer
* and cannot be "taken back".
*/
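	/*
	 * x_connected must be set before the interrupt is added,
	 * because xnb_intr() ASSERTs that it is set; if adding the
	 * interrupt fails it is cleared again (under both locks) in
	 * the failure path below.
	 */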
mutex_enter(&xnbp->x_tx_lock);
mutex_enter(&xnbp->x_rx_lock);
/* 5.1 */
xnbp->x_connected = B_TRUE;
mutex_exit(&xnbp->x_rx_lock);
mutex_exit(&xnbp->x_tx_lock);
/* 4, 6 */
if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
!= DDI_SUCCESS) {
cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
goto fail;
}
xnbp->x_irq = B_TRUE;
/* 5.2 */
(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
return (B_TRUE);
fail:
mutex_enter(&xnbp->x_tx_lock);
mutex_enter(&xnbp->x_rx_lock);
xnbp->x_connected = B_FALSE;
mutex_exit(&xnbp->x_rx_lock);
mutex_exit(&xnbp->x_tx_lock);
return (B_FALSE);
}
static void
xnb_disconnect_rings(dev_info_t *dip)
{
xnb_t *xnbp = ddi_get_driver_private(dip);
if (xnbp->x_irq) {
ddi_remove_intr(dip, 0, NULL);
xnbp->x_irq = B_FALSE;
}
if (xnbp->x_evtchn != INVALID_EVTCHN) {
xvdi_free_evtchn(dip);
xnbp->x_evtchn = INVALID_EVTCHN;
}
if (xnbp->x_rx_ring_handle != INVALID_GRANT_HANDLE) {
struct gnttab_unmap_grant_ref unmap_op;
unmap_op.host_addr = (uint64_t)(uintptr_t)xnbp->x_rx_ring_addr;
unmap_op.dev_bus_addr = 0;
unmap_op.handle = xnbp->x_rx_ring_handle;
if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
&unmap_op, 1) != 0)
cmn_err(CE_WARN, "xnb_disconnect_rings: "
"cannot unmap rx-ring page (%d)",
unmap_op.status);
xnbp->x_rx_ring_handle = INVALID_GRANT_HANDLE;
}
if (xnbp->x_rx_ring_addr != NULL) {
hat_release_mapping(kas.a_hat, xnbp->x_rx_ring_addr);
vmem_free(heap_arena, xnbp->x_rx_ring_addr, PAGESIZE);
xnbp->x_rx_ring_addr = NULL;
}
if (xnbp->x_tx_ring_handle != INVALID_GRANT_HANDLE) {
struct gnttab_unmap_grant_ref unmap_op;
unmap_op.host_addr = (uint64_t)(uintptr_t)xnbp->x_tx_ring_addr;
unmap_op.dev_bus_addr = 0;
unmap_op.handle = xnbp->x_tx_ring_handle;
if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
&unmap_op, 1) != 0)
cmn_err(CE_WARN, "xnb_disconnect_rings: "
"cannot unmap tx-ring page (%d)",
unmap_op.status);
xnbp->x_tx_ring_handle = INVALID_GRANT_HANDLE;
}
if (xnbp->x_tx_ring_addr != NULL) {
hat_release_mapping(kas.a_hat, xnbp->x_tx_ring_addr);
vmem_free(heap_arena, xnbp->x_tx_ring_addr, PAGESIZE);
xnbp->x_tx_ring_addr = NULL;
}
}
/*ARGSUSED*/
static void
xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
void *arg, void *impl_data)
{
xnb_t *xnbp = ddi_get_driver_private(dip);
XenbusState new_state = *(XenbusState *)impl_data;
ASSERT(xnbp != NULL);
switch (new_state) {
case XenbusStateConnected:
if (xnb_connect_rings(dip)) {
xnbp->x_flavour->xf_peer_connected(xnbp);
} else {
xnbp->x_flavour->xf_peer_disconnected(xnbp);
xnb_disconnect_rings(dip);
(void) xvdi_switch_state(dip, XBT_NULL,
XenbusStateClosed);
(void) xvdi_post_event(dip, XEN_HP_REMOVE);
}
/*
 * Now that we've attempted to connect, it's reasonable
* to allow an attempt to detach.
*/
xnbp->x_detachable = B_TRUE;
break;
case XenbusStateClosing:
(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
break;
case XenbusStateClosed:
xnbp->x_flavour->xf_peer_disconnected(xnbp);
mutex_enter(&xnbp->x_tx_lock);
mutex_enter(&xnbp->x_rx_lock);
xnb_disconnect_rings(dip);
xnbp->x_connected = B_FALSE;
mutex_exit(&xnbp->x_rx_lock);
mutex_exit(&xnbp->x_tx_lock);
(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
(void) xvdi_post_event(dip, XEN_HP_REMOVE);
/*
 * In all likelihood this is already set (in the above
* case), but if the peer never attempted to connect
* and the domain is destroyed we get here without
* having been through the case above, so we set it to
* be sure.
*/
xnbp->x_detachable = B_TRUE;
break;
default:
break;
}
}
/*ARGSUSED*/
static void
xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
void *arg, void *impl_data)
{
xnb_t *xnbp = ddi_get_driver_private(dip);
xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
boolean_t success;
ASSERT(xnbp != NULL);
switch (state) {
case Connected:
success = xnbp->x_flavour->xf_hotplug_connected(xnbp);
mutex_enter(&xnbp->x_tx_lock);
mutex_enter(&xnbp->x_rx_lock);
xnbp->x_hotplugged = success;
mutex_exit(&xnbp->x_rx_lock);
mutex_exit(&xnbp->x_tx_lock);
break;
default:
break;
}
}
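/*
 * Although declared as a struct modldrv, this module is installed
 * with mod_miscops: the flavour-specific drivers supply an
 * xnb_flavour_t and call xnb_attach()/xnb_detach() themselves.
 */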
static struct modldrv modldrv = {
&mod_miscops, "xnb module %I%",
};
static struct modlinkage modlinkage = {
MODREV_1, &modldrv, NULL
};
int
_init(void)
{
int i;
mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
xnb_rxbuf_cachep = kmem_cache_create("xnb_rxbuf_cachep",
sizeof (xnb_rxbuf_t), 0, xnb_rxbuf_constructor,
xnb_rxbuf_destructor, NULL, NULL, NULL, 0);
ASSERT(xnb_rxbuf_cachep != NULL);
i = mod_install(&modlinkage);
if (i != DDI_SUCCESS) {
kmem_cache_destroy(xnb_rxbuf_cachep);
mutex_destroy(&xnb_alloc_page_lock);
}
return (i);
}
int
_info(struct modinfo *modinfop)
{
return (mod_info(&modlinkage, modinfop));
}
int
_fini(void)
{
int i;
i = mod_remove(&modlinkage);
if (i == DDI_SUCCESS) {
kmem_cache_destroy(xnb_rxbuf_cachep);
mutex_destroy(&xnb_alloc_page_lock);
}
return (i);
}