/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#ifdef DEBUG
#define XNB_DEBUG 1
#endif /* DEBUG */
#include "xnb.h"
#include <vm/seg_kmem.h>
#include <sys/balloon_impl.h>
#include <sys/evtchn_impl.h>
/*
* The terms "transmit" and "receive" are used in their traditional
* sense here - packets from other parts of this system are
* "transmitted" to the peer domain and those originating from the
* peer are "received".
*
* In some cases this can be confusing, because various data
* structures are shared with the domU driver, which has the opposite
* view of what constitutes "transmit" and "receive". In naming the
* shared structures the domU driver always wins.
*/
/*
* XXPV dme: things to do, as well as various things indicated
* throughout the source:
* - copy avoidance outbound.
* - copy avoidance inbound.
* - transfer credit limiting.
* - MAC address based filtering.
*/
/*
* Linux expects to have some headroom in received buffers. The Linux
* frontend driver (netfront) checks to see if the headroom is
* available and will re-allocate the buffer to make room if
* necessary. To avoid this we add TX_BUFFER_HEADROOM bytes of
* headroom to each packet we pass to the peer.
*/
#define TX_BUFFER_HEADROOM 16
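
/*
 * Illustrative sketch only, not part of the original driver: copying
 * an mblk chain into a transfer page while honouring
 * TX_BUFFER_HEADROOM.  The helper name and its arguments are
 * hypothetical.
 */
static void
xnb_example_fill_page(char *va, mblk_t *mp)
{
	char *dst = va + TX_BUFFER_HEADROOM;	/* leave netfront headroom */
	mblk_t *ml;

	for (ml = mp; ml != NULL; ml = ml->b_cont) {
		size_t chunk = ml->b_wptr - ml->b_rptr;

		bcopy(ml->b_rptr, dst, chunk);	/* copy this fragment */
		dst += chunk;
	}
}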
static void xnb_disconnect_rings(dev_info_t *);
static void xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);
static void xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);
static int xnb_rxbuf_constructor(void *, void *, int);
static void xnb_rxbuf_destructor(void *, void *);
static void xnb_rx_notify_peer(xnb_t *);
static void xnb_rx_complete(xnb_rxbuf_t *);
static void xnb_rx_perform_pending_unmop(xnb_t *);
#ifdef XNB_DEBUG
#define	NR_GRANT_ENTRIES \
	(NR_GRANT_FRAMES * PAGESIZE / sizeof (grant_entry_t))
#endif /* XNB_DEBUG */
/* XXPV dme: are these really invalid? */
#define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
#define	INVALID_GRANT_REF	((grant_ref_t)-1)
static kmem_cache_t *xnb_rxbuf_cachep;
static kmutex_t xnb_alloc_page_lock;
/*
* Statistics.
*/
static char *aux_statistics[] = {
"tx_cksum_deferred",
"rx_cksum_no_need",
"tx_notify_deferred",
"tx_notify_sent",
"rx_notify_deferred",
"rx_notify_sent",
"tx_too_early",
"rx_too_early",
"rx_allocb_failed",
"mac_full",
"spurious_intr",
"allocation_success",
"allocation_failure",
"small_allocation_success",
"small_allocation_failure",
"csum_hardware",
"csum_software",
};
static int
xnb_ks_aux_update(kstat_t *ksp, int flag)
{
if (flag != KSTAT_READ)
return (EACCES);
/*
* Assignment order should match that of the names in
* aux_statistics.
*/
return (0);
}
static boolean_t
{
	int nstat = sizeof (aux_statistics) /
	    sizeof (aux_statistics[0]);
	char **cp = aux_statistics;
	kstat_named_t *knp;

	/*
	 * Create and initialise kstats.
	 */
	xnbp->x_kstat_aux = kstat_create(ddi_driver_name(xnbp->x_devinfo),
	    ddi_get_instance(xnbp->x_devinfo), "aux_statistics", "net",
	    KSTAT_TYPE_NAMED, nstat, 0);
	if (xnbp->x_kstat_aux == NULL)
		return (B_FALSE);

	xnbp->x_kstat_aux->ks_private = xnbp;
	xnbp->x_kstat_aux->ks_update = xnb_ks_aux_update;

	knp = xnbp->x_kstat_aux->ks_data;
	while (nstat > 0) {
		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);

		knp++;
		cp++;
		nstat--;
	}

	kstat_install(xnbp->x_kstat_aux);
return (B_TRUE);
}
static void
xnb_ks_free(xnb_t *xnbp)
{
	kstat_delete(xnbp->x_kstat_aux);
}
/*
* Software checksum calculation and insertion for an arbitrary packet.
*/
/*ARGSUSED*/
static mblk_t *
xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
{
/*
* XXPV dme: shouldn't rely on vnic_fix_cksum(), not least
* because it doesn't cover all of the interesting cases :-(
*/
return (vnic_fix_cksum(mp));
}
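
/*
 * Illustrative sketch only: the classic 16-bit ones-complement sum
 * that any software checksum ultimately computes.  This helper is an
 * assumption for illustration and is not part of the driver.
 */
static uint16_t
xnb_example_cksum(const uchar_t *buf, size_t len)
{
	uint32_t sum = 0;

	while (len > 1) {			/* sum 16-bit words */
		sum += (buf[0] << 8) | buf[1];
		buf += 2;
		len -= 2;
	}
	if (len == 1)				/* trailing odd byte */
		sum += buf[0] << 8;
	while ((sum >> 16) != 0)		/* fold carries */
		sum = (sum & 0xffff) + (sum >> 16);

	return (~sum & 0xffff);			/* ones-complement result */
}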
mblk_t *
xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
{
	uint16_t sap;
	uint32_t offset;
	ipha_t *ipha;
struct ether_header *ehp;
/*
* Check that the packet is contained in a single mblk. In
* the "from peer" path this is true today, but will change
* when scatter gather support is added. In the "to peer"
* path we cannot be sure, but in most cases it will be true
* (in the xnbo case the packet has come from a MAC device
* which is unlikely to split packets).
*/
	if (mp->b_cont != NULL)
		goto software;
/*
* If the MAC has no hardware capability don't do any further
* checking.
*/
if (capab == 0)
goto software;
	ehp = (struct ether_header *)mp->b_rptr;
	if (ntohs(ehp->ether_type) == VLAN_TPID) {
		struct ether_vlan_header *evhp;

		evhp = (struct ether_vlan_header *)mp->b_rptr;
		sap = ntohs(evhp->ether_type);
		offset = sizeof (struct ether_vlan_header);
	} else {
		sap = ntohs(ehp->ether_type);
		offset = sizeof (struct ether_header);
	}
/*
* We only attempt to do IPv4 packets in hardware.
*/
if (sap != ETHERTYPE_IP)
goto software;
/*
* We know that this is an IPv4 packet.
*/
	ipha = (ipha_t *)(mp->b_rptr + offset);

	switch (ipha->ipha_protocol) {
case IPPROTO_TCP:
case IPPROTO_UDP:
		/*
		 * If the capabilities indicate that full checksum
		 * offload is available, use it.
		 */
		if ((capab & HCKSUM_INET_FULL_V4) != 0) {
			(void) hcksum_assoc(mp, NULL, NULL,
			    0, 0, 0, 0,
			    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, KM_NOSLEEP);

			xnbp->x_stat_csum_hardware++;

			return (mp);
		}
/*
* XXPV dme: If the capabilities indicate that partial
	 * checksum offload is available, we should use it
	 * (see the sketch after this function).
*/
break;
default:
/* Use software. */
break;
}
/*
* We are not able to use any offload so do the whole thing in
* software.
	 */
software:
	return (xnb_software_csum(xnbp, mp));
}
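
/*
 * Illustrative sketch only, prompted by the XXPV above: attaching
 * partial checksum metadata would use hcksum_assoc() with
 * HCK_PARTIALCKSUM and explicit start/stuff offsets.  The helper and
 * the offsets shown are assumptions, not the driver's code.
 */
static void
xnb_example_partial_cksum(mblk_t *mp, uint32_t ip_hdr_len)
{
	uint32_t start = ip_hdr_len;	/* checksummed data starts here */
	uint32_t stuff = start + 16;	/* TCP checksum field offset */
	uint32_t end = mp->b_wptr - mp->b_rptr;	/* end of summed data */

	(void) hcksum_assoc(mp, NULL, NULL, start, stuff, end, 0,
	    HCK_PARTIALCKSUM, KM_NOSLEEP);
}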
int
xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
{
xnbp->x_rx_buf_count = 0;
xnbp->x_rx_unmop_count = 0;
!= DDI_SUCCESS)
goto failure;
/* set driver private pointer now */
if (!xnb_ks_init(xnbp))
goto late_failure;
/*
* Receive notification of changes in the state of the
* driver in the guest domain.
*/
	if (xvdi_add_event_handler(dip, XS_OE_STATE, xnb_oe_state_change)
	    != DDI_SUCCESS)
		goto very_late_failure;
/*
* Receive notification of hotplug events.
*/
	if (xvdi_add_event_handler(dip, XS_HP_STATE, xnb_hp_state_change)
	    != DDI_SUCCESS)
		goto very_late_failure;
"feature-no-csum-offload", "%d",
goto very_very_late_failure;
"cannot read mac address from %s",
xsname);
goto very_very_late_failure;
}
"xnb_attach: cannot parse mac address %s",
mac);
goto very_very_late_failure;
}
return (DDI_SUCCESS);
very_very_late_failure: /* not that the naming is getting silly or anything */
return (DDI_FAILURE);
}
/*ARGSUSED*/
void
xnb_detach(dev_info_t *dip)
{
}
static mfn_t
xnb_alloc_page(xnb_t *xnbp)
{
#define	WARNING_RATE_LIMIT 100
#define	BATCH_SIZE 256
	static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
	static int nth = BATCH_SIZE;
	mfn_t mfn;

	mutex_enter(&xnb_alloc_page_lock);
	if (nth == BATCH_SIZE) {
		if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
			xnbp->x_stat_allocation_failure++;

			/*
			 * Try for a single page in low memory situations.
			 */
			if (balloon_alloc_pages(1, &mfn) != 1) {
				if ((xnbp->x_stat_small_allocation_failure++
				    % WARNING_RATE_LIMIT) == 0) {
					cmn_err(CE_WARN, "xnb_alloc_page: "
					    "Cannot allocate memory to "
					    "transfer packets to peer.");
				}
				mutex_exit(&xnb_alloc_page_lock);
				return (0);
			} else {
				xnbp->x_stat_small_allocation_success++;
				mutex_exit(&xnb_alloc_page_lock);
				return (mfn);
			}
		}
		nth = 0;
		xnbp->x_stat_allocation_success++;
	}

	mfn = mfns[nth++];
	mutex_exit(&xnb_alloc_page_lock);

	ASSERT(mfn != 0);

	return (mfn);
#undef BATCH_SIZE
#undef WARNING_RATE_LIMIT
}
/*ARGSUSED*/
static void
xnb_free_page(xnb_t *xnbp, mfn_t mfn)
{
	int r;
/*
* This happens only in the error path, so batching is
* not worth the complication.
*/
"reservation (%d): page kept but unusable (mfn = 0x%lx).",
r, mfn);
}
}
mblk_t *
xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
{
/*
* For each packet the sequence of operations is:
*
* 1. get a new page from the hypervisor.
* 2. get a request slot from the ring.
* 3. copy the data into the new page.
* 4. transfer the page to the peer.
* 5. update the request slot.
* 6. kick the peer.
* 7. free mp.
*
* In order to reduce the number of hypercalls, we prepare
* several packets for the peer and perform a single hypercall
* to transfer them.
*/
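	/*
	 * Illustrative sketch only, not original driver code: the
	 * batched transfer of steps 4-6 amounts to filling one
	 * gnttab_transfer_t per packet and issuing a single
	 * hypercall, along these lines (hypothetical names):
	 *
	 *	for (i = 0; i < n; i++) {
	 *		ops[i].mfn = copied_page_mfn[i];
	 *		ops[i].domid = xnbp->x_peer;
	 *		ops[i].ref = request_gref[i];
	 *	}
	 *	(void) HYPERVISOR_grant_table_op(GNTTABOP_transfer,
	 *	    ops, n);
	 *
	 * Each ops[i].status is then checked before the corresponding
	 * ring slot is marked complete.
	 */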
/*
* If we are not connected to the peer or have not yet
* finished hotplug it is too early to pass packets to the
* peer.
*/
	if (!(xnbp->x_connected && xnbp->x_hotplugged)) {
		xnbp->x_stat_tx_too_early++;
		return (mp);
	}
	/*
	 * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->x_rx_ring) but
	 * using local variables.
	 */
#define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)		\
	((((_r)->sring->req_prod - loop) <		\
	    (RING_SIZE(_r) - (loop - prod))) ?		\
	    ((_r)->sring->req_prod - loop) :		\
	    (RING_SIZE(_r) - (loop - prod)))
char *valoop;
/* 1 */
break;
}
/* 2 */
#ifdef XNB_DEBUG
"id %d out of range in request 0x%p",
"grant ref %d out of range in request 0x%p",
#endif /* XNB_DEBUG */
/* Assign a pfn and map the new page at the allocated va. */
/* 3 */
len = 0;
}
/* Release the pfn. */
/* 4 */
/* 5.1 */
if (cksum_flags != 0)
loop++;
prod++;
gop++;
}
/*
* Did we actually do anything?
*/
	if (loop == xnbp->x_rx_ring.req_cons) {
		return (mp);
	}
/*
* Unlink the end of the 'done' list from the remainder.
*/
}
/*
* If the status is anything other than
* GNTST_bad_page then we don't own the page
* any more, so don't try to give it back.
*/
} else {
/* The page is no longer ours. */
}
/*
* Give back the page, as we won't be using
* it.
*/
else
/*
* We gave away a page, update our accounting
* now.
*/
/* 5.2 */
if (status != NETIF_RSP_OKAY) {
} else {
xnbp->x_stat_opackets++;
}
loop++;
prod++;
gop++;
}
	/* 6 */
	/*LINTED: constant in conditional context*/
	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->x_rx_ring, notify);
	if (notify) {
		ec_notify_via_evtchn(xnbp->x_evtchn);
		xnbp->x_stat_tx_notify_sent++;
	} else {
		xnbp->x_stat_tx_notify_deferred++;
	}
/* Free mblk_t's that we consumed. */
return (mp);
}
/*ARGSUSED*/
static int
xnb_rxbuf_constructor(void *buf, void *arg, int kmflag)
{
	xnb_rxbuf_t *rxp = buf;

	rxp->xr_mop.host_addr =
	    (uint64_t)(uintptr_t)vmem_alloc(heap_arena, PAGESIZE,
	    ((kmflag & KM_NOSLEEP) == KM_NOSLEEP) ?
	    VM_NOSLEEP : VM_SLEEP);
	if (rxp->xr_mop.host_addr == NULL) {
		cmn_err(CE_WARN, "xnb_rxbuf_constructor: "
"cannot get address space");
return (-1);
}
/*
* Have the hat ensure that page table exists for the VA.
*/
return (0);
}
/*ARGSUSED*/
static void
xnb_rxbuf_destructor(void *buf, void *arg)
{
}
static void
xnb_rx_notify_peer(xnb_t *xnbp)
{
	boolean_t notify;

	/*LINTED: constant in conditional context*/
	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->x_tx_ring, notify);
	if (notify) {
		ec_notify_via_evtchn(xnbp->x_evtchn);
		xnbp->x_stat_rx_notify_sent++;
	} else {
		xnbp->x_stat_rx_notify_deferred++;
	}
}
static void
xnb_rx_complete(xnb_rxbuf_t *rxp)
{
	xnb_t *xnbp = rxp->xr_xnbp;
if (xnbp->x_connected) {
}
}
static void
xnb_rx_mark_complete(xnb_t *xnbp, RING_IDX id, int status)
{
RING_IDX i;
/*
* Note that we don't push the change to the peer here - that
 * is the caller's responsibility.
*/
}
/*
* XXPV dme: currently pending unmap operations are stored on a
* per-instance basis. Should they be per-driver? The locking would
* have to change (obviously), but there might be an improvement from
* batching more together. Right now they are all 'done' either at
* the tail of each receive operation (copy case) or on each
* completion (non-copy case). Should that be changed to some
* interval (watermark?) to improve the chance of batching?
*/
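
/*
 * Illustrative sketch only: the batching described above reduces, at
 * flush time, to a single GNTTABOP_unmap_grant_ref hypercall over the
 * accumulated operations.  The helper and its arguments are
 * hypothetical.
 */
static void
xnb_example_unmop_flush(gnttab_unmap_grant_ref_t *unmops, int count)
{
	if (count == 0)
		return;

	/* one hypercall unmaps every scheduled grant */
	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
	    unmops, count) < 0)
		cmn_err(CE_WARN, "xnb_example_unmop_flush: "
		    "unmap grant operation failed");
}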
static void
xnb_rx_schedule_unmop(xnb_t *xnbp, gnttab_map_grant_ref_t *mop)
{
xnbp->x_rx_unmop_count++;
#ifdef XNB_DEBUG
== NULL);
#endif /* XNB_DEBUG */
}
static void
xnb_rx_perform_pending_unmop(xnb_t *xnbp)
{
#ifdef XNB_DEBUG
#endif /* XNB_DEBUG */
if (xnbp->x_rx_unmop_count == 0)
return;
"unmap grant operation failed, "
}
#ifdef XNB_DEBUG
"unmap grant reference failed (%d)",
}
}
#endif /* XNB_DEBUG */
xnbp->x_rx_unmop_count = 0;
#ifdef XNB_DEBUG
#endif /* XNB_DEBUG */
}
static xnb_rxbuf_t *
xnb_rxbuf_get(xnb_t *xnbp, int flags)
{
	xnb_rxbuf_t *rxp;

	rxp = kmem_cache_alloc(xnb_rxbuf_cachep, flags);
	if (rxp != NULL) {
		rxp->xr_mop.flags = GNTMAP_host_map;
		if (!xnbp->x_rx_pages_writable)
			rxp->xr_mop.flags |= GNTMAP_readonly;

		xnbp->x_rx_buf_count++;
}
return (rxp);
}
static void
xnb_rxbuf_put(xnb_t *xnbp, xnb_rxbuf_t *rxp)
{
	xnbp->x_rx_buf_count--;

	kmem_cache_free(xnb_rxbuf_cachep, rxp);
}
static mblk_t *
xnb_recv(xnb_t *xnbp)
{
	xnb_rxbuf_t **rxpp;
/*
* If the peer granted a read-only mapping to the page then we
* must copy the data, as the local protocol stack (should the
* packet be destined for this host) will modify the packet
* 'in place'.
*/
/*
* For each individual request, the sequence of actions is:
*
* 1. get the request.
* 2. map the page based on the grant ref.
* 3. allocate an mblk, copy the data to it.
* 4. release the grant.
* 5. update the ring.
* 6. pass the packet upward.
* 7. kick the peer.
*
* In fact, we try to perform the grant operations in batches,
* so there are two loops.
*/
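	/*
	 * Illustrative sketch only: one entry of the batch built in
	 * step 2 would be filled along these lines (hypothetical
	 * names):
	 *
	 *	mop->host_addr = (uint64_t)(uintptr_t)va;
	 *	mop->dom = xnbp->x_peer;
	 *	mop->ref = txreq->gref;
	 *	mop->flags = GNTMAP_host_map |
	 *	    (xnbp->x_rx_pages_writable ? 0 : GNTMAP_readonly);
	 *
	 * with the whole batch later handed to a single
	 * HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, ...) call.
	 */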
around:
	/*LINTED: constant in conditional context*/
	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->x_tx_ring, work_to_do);
	if (!work_to_do) {
finished:
return (head);
}
break;
== GNTMAP_readonly));
}
goto finished;
loop++;
rxpp++;
}
goto finished;
}
"failed to map buffer: %d",
}
if (status == NETIF_RSP_OKAY) {
if (copy) {
} else {
}
} else {
} else {
}
}
/*
* If we have a buffer and there are checksum
* flags, process them appropriately.
*/
!= 0)) {
}
}
}
xnbp->x_stat_ipackets++;
} else {
}
}
}
/*
* This has to be here rather than in the 'finished' code
* because we can only handle NET_TX_RING_SIZE pending unmap
* operations, which may be exceeded by multiple trips around
* the receive loop during heavy load (one trip around the
* loop cannot generate more than NET_TX_RING_SIZE unmap
* operations).
*/
if (copy) {
}
goto around;
/* NOTREACHED */
}
/*
* intr() -- ring interrupt service routine
*/
static uint_t
xnb_intr(caddr_t arg)
{
	xnb_t *xnbp = (xnb_t *)arg;
	mblk_t *mp;
xnbp->x_stat_intr++;
	mp = xnb_recv(xnbp);

	if (!xnbp->x_hotplugged) {
		xnbp->x_stat_rx_too_early++;
		goto fail;
	}
	if (mp == NULL) {
		xnbp->x_stat_spurious_intr++;
		goto fail;
	}

	xnbp->x_flavour->xf_recv(xnbp, mp);
return (DDI_INTR_CLAIMED);
fail:
	freemsgchain(mp);
	return (DDI_INTR_CLAIMED);
}
static boolean_t
xnb_connect_rings(dev_info_t *dip)
{
	xnb_t *xnbp = ddi_get_driver_private(dip);
char *oename;
struct gnttab_map_grant_ref map_op;
int i;
/*
* Cannot attempt to connect the rings if already connected.
*/
	if (xenbus_gather(XBT_NULL, oename,
	    "event-channel", "%u", &xnbp->x_evtchn,
	    "tx-ring-ref", "%lu", &xnbp->x_tx_ring_ref,
	    "rx-ring-ref", "%lu", &xnbp->x_rx_ring_ref,
	    NULL) != 0) {
		cmn_err(CE_WARN, "xnb_connect_rings: "
		    "cannot read other-end details from %s",
		    oename);
		goto fail;
	}
"feature-tx-writable", "%d", &i) != 0)
i = 0;
if (i != 0)
"feature-no-csum-offload", "%d", &i) != 0)
i = 0;
/*
* 1. allocate a vaddr for the tx page, one for the rx page.
* 2. call GNTTABOP_map_grant_ref to map the relevant pages
* into the allocated vaddr (one for tx, one for rx).
* 3. call EVTCHNOP_bind_interdomain to have the event channel
* bound to this domain.
* 4. associate the event channel with an interrupt.
* 5. declare ourselves connected.
* 6. enable the interrupt.
*/
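	/*
	 * Illustrative sketch only (hypothetical names): step 3 binds
	 * the peer's event channel into this domain, e.g.
	 *
	 *	if (xvdi_bind_evtchn(dip, evtchn) != DDI_SUCCESS)
	 *		goto fail;
	 *
	 * after which step 4 attaches xnb_intr() to the resulting
	 * interrupt via ddi_add_intr().
	 */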
/* 1.tx */
	xnbp->x_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
	    0, 0, 0, 0, VM_SLEEP);
	/* 2.tx */
	map_op.host_addr = (uint64_t)(uintptr_t)xnbp->x_tx_ring_addr;
	map_op.flags = GNTMAP_host_map;
	map_op.ref = xnbp->x_tx_ring_ref;
	map_op.dom = xnbp->x_peer;
	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
	    &map_op, 1) != 0 || map_op.status != GNTST_okay) {
		cmn_err(CE_WARN,
		    "xnb_connect_rings: cannot map tx-ring page.");
		goto fail;
	}
	xnbp->x_tx_ring_handle = map_op.handle;
	/*LINTED: constant in conditional context*/
	BACK_RING_INIT(&xnbp->x_tx_ring,
	    (netif_tx_sring_t *)xnbp->x_tx_ring_addr, PAGESIZE);
/* 1.rx */
	xnbp->x_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
	    0, 0, 0, 0, VM_SLEEP);
	/* 2.rx */
	map_op.host_addr = (uint64_t)(uintptr_t)xnbp->x_rx_ring_addr;
	map_op.flags = GNTMAP_host_map;
	map_op.ref = xnbp->x_rx_ring_ref;
	map_op.dom = xnbp->x_peer;
	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
	    &map_op, 1) != 0 || map_op.status != GNTST_okay) {
		cmn_err(CE_WARN,
		    "xnb_connect_rings: cannot map rx-ring page.");
		goto fail;
	}
	xnbp->x_rx_ring_handle = map_op.handle;
	/*LINTED: constant in conditional context*/
	BACK_RING_INIT(&xnbp->x_rx_ring,
	    (netif_rx_sring_t *)xnbp->x_rx_ring_addr, PAGESIZE);
/* 3 */
goto fail;
}
/*
* It would be good to set the state to XenbusStateConnected
* here as well, but then what if ddi_add_intr() failed?
* Changing the state in the store will be noticed by the peer
* and cannot be "taken back".
*/
	/* 5.1 */
	xnbp->x_connected = B_TRUE;

	/* 4, 6 */
	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
	    != DDI_SUCCESS) {
		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
		goto fail;
	}
	xnbp->x_irq = B_TRUE;
/* 5.2 */
return (B_TRUE);
fail:
return (B_FALSE);
}
static void
xnb_disconnect_rings(dev_info_t *dip)
{
	xnb_t *xnbp = ddi_get_driver_private(dip);
	if (xnbp->x_irq) {
		ddi_remove_intr(dip, 0, NULL);
		xnbp->x_irq = B_FALSE;
	}

	if (xnbp->x_evtchn != INVALID_EVTCHN) {
		xvdi_free_evtchn(dip);
		xnbp->x_evtchn = INVALID_EVTCHN;
	}
	if (xnbp->x_rx_ring_handle != INVALID_GRANT_HANDLE) {
		struct gnttab_unmap_grant_ref unmap_op;

		unmap_op.host_addr = (uint64_t)(uintptr_t)
		    xnbp->x_rx_ring_addr;
		unmap_op.dev_bus_addr = 0;
		unmap_op.handle = xnbp->x_rx_ring_handle;

		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
		    &unmap_op, 1) != 0)
			cmn_err(CE_WARN, "xnb_disconnect_rings: "
			    "cannot unmap rx-ring page (%d)",
			    unmap_op.status);

		xnbp->x_rx_ring_handle = INVALID_GRANT_HANDLE;
	}
	if (xnbp->x_tx_ring_handle != INVALID_GRANT_HANDLE) {
		struct gnttab_unmap_grant_ref unmap_op;

		unmap_op.host_addr = (uint64_t)(uintptr_t)
		    xnbp->x_tx_ring_addr;
		unmap_op.dev_bus_addr = 0;
		unmap_op.handle = xnbp->x_tx_ring_handle;

		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
		    &unmap_op, 1) != 0)
			cmn_err(CE_WARN, "xnb_disconnect_rings: "
			    "cannot unmap tx-ring page (%d)",
			    unmap_op.status);

		xnbp->x_tx_ring_handle = INVALID_GRANT_HANDLE;
	}
}
/*ARGSUSED*/
static void
xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
    void *arg, void *impl_data)
{
	XenbusState new_state = *(XenbusState *)impl_data;
switch (new_state) {
case XenbusStateConnected:
if (xnb_connect_rings(dip)) {
} else {
}
/*
* Now that we've attempted to connect it's reasonable
* to allow an attempt to detach.
*/
break;
case XenbusStateClosing:
break;
case XenbusStateClosed:
/*
	 * In all likelihood this is already set (in the above
* case), but if the peer never attempted to connect
* and the domain is destroyed we get here without
* having been through the case above, so we set it to
* be sure.
*/
break;
default:
break;
}
}
/*ARGSUSED*/
static void
xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
    void *arg, void *impl_data)
{
	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
switch (state) {
case Connected:
break;
default:
break;
}
}
static struct modlmisc modlmisc = {
	&mod_miscops, "xnb module %I%",
};

static struct modlinkage modlinkage = {
	MODREV_1, &modlmisc, NULL
};
int
_init(void)
{
	int i;

	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);

	xnb_rxbuf_cachep = kmem_cache_create("xnb_rxbuf_cachep",
	    sizeof (xnb_rxbuf_t), 0, xnb_rxbuf_constructor,
	    xnb_rxbuf_destructor, NULL, NULL, NULL, 0);
	ASSERT(xnb_rxbuf_cachep != NULL);

	i = mod_install(&modlinkage);
	if (i != DDI_SUCCESS) {
		kmem_cache_destroy(xnb_rxbuf_cachep);
		mutex_destroy(&xnb_alloc_page_lock);
	}

	return (i);
}
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
int
_fini(void)
{
	int i;

	i = mod_remove(&modlinkage);
	if (i == DDI_SUCCESS) {
		kmem_cache_destroy(xnb_rxbuf_cachep);
		mutex_destroy(&xnb_alloc_page_lock);
	}

	return (i);
}