/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifdef DEBUG
#endif /* DEBUG */
#include "xnb.h"
#include <vm/seg_kmem.h>
#include <sys/balloon_impl.h>
#include <sys/evtchn_impl.h>
/*
* The terms "transmit" and "receive" are used in alignment with domU,
* which means that packets originating from the peer domU are "transmitted"
* to other parts of the system and packets are "received" from them.
*/
/*
* Should we allow guests to manipulate multicast group membership?
*/
static void xnb_disconnect_rings(dev_info_t *);
void *, void *);
void *, void *);
static int xnb_txbuf_constructor(void *, void *, int);
static void xnb_txbuf_destructor(void *, void *);
#pragma inline(setup_gop)
static boolean_t is_foreign(void *);
#pragma inline(is_foreign)
/*
* On a 32 bit PAE system physical and machine addresses are larger
* than 32 bits. ddi_btop() on such systems takes an unsigned long
* argument, and so addresses above 4G are truncated before ddi_btop()
* gets to see them. To avoid this, code the shift operation here.
*/
/* DMA attributes for transmit and receive data */
DMA_ATTR_V0, /* version of this structure */
0, /* lowest usable address */
0xffffffffffffffffULL, /* highest usable address */
0x7fffffff, /* maximum DMAable byte count */
MMU_PAGESIZE, /* alignment in bytes */
0x7ff, /* bitmap of burst sizes */
1, /* minimum transfer */
0xffffffffU, /* maximum transfer */
0xffffffffffffffffULL, /* maximum segment length */
1, /* maximum number of segments */
1, /* granularity */
0, /* flags (reserved) */
};
/* DMA access attributes for data: NOT to be byte swapped. */
};
/*
* Statistics.
*/
static const char * const aux_statistics[] = {
"rx_cksum_deferred",
"tx_cksum_no_need",
"rx_rsp_notok",
"tx_notify_deferred",
"tx_notify_sent",
"rx_notify_deferred",
"rx_notify_sent",
"tx_too_early",
"rx_too_early",
"rx_allocb_failed",
"tx_allocb_failed",
"rx_foreign_page",
"mac_full",
"spurious_intr",
"allocation_success",
"allocation_failure",
"small_allocation_success",
"small_allocation_failure",
"other_allocation_failure",
"rx_pageboundary_crossed",
"rx_cpoparea_grown",
"csum_hardware",
"csum_software",
"tx_overflow_page",
"tx_unexpected_flags",
};
/*
 * NOTE(review): the name/parameter line is missing from this extract —
 * the KSTAT_READ guard and the comment below identify it as the kstat
 * update callback for the auxiliary statistics.  The per-statistic
 * assignments have been lost; confirm against the full source.
 */
static int
{
if (flag != KSTAT_READ)
return (EACCES);
/*
 * Assignment order should match that of the names in
 * aux_statistics.
 */
return (0);
}
/*
 * NOTE(review): the name/parameter line and several statements (the
 * kstat_create() call and the loop-body assignments) are missing from
 * this extract.  The surviving skeleton sizes the statistic count from
 * aux_statistics, returns B_FALSE when creation fails, walks the name
 * table to initialise each named kstat, and returns B_TRUE on success.
 */
static boolean_t
{
sizeof (aux_statistics[0]);
/*
 * Create and initialise kstats.
 */
KSTAT_TYPE_NAMED, nstat, 0);
return (B_FALSE);
while (nstat > 0) {
knp++;
cp++;
nstat--;
}
return (B_TRUE);
}
/*
 * NOTE(review): the name/parameter line and body are missing from this
 * extract (only the return type and an empty brace pair survive); from
 * its position directly after the kstat setup routine it is presumably
 * the kstat teardown — confirm against the full source.
 */
static void
{
}
/*
* Calculate and insert the transport checksum for an arbitrary packet.
*/
/*
 * NOTE(review): the name/parameter line is missing from this extract;
 * what remains simply delegates to mac_fix_cksum() to compute the
 * transport checksum in software, per the block comment above.
 */
static mblk_t *
{
/*
 * XXPV dme: shouldn't rely on mac_fix_cksum(), not least
 * because it doesn't cover all of the interesting cases :-(
 */
return (mac_fix_cksum(mp));
}
mblk_t *
{
/*
* Check that the packet is contained in a single mblk. In
* the "from peer" path this is true today, but may change
* when scatter gather support is added. In the "to peer"
* path we cannot be sure, but in most cases it will be true
* (in the xnbo case the packet has come from a MAC device
* which is unlikely to split packets).
*/
goto software;
/*
* If the MAC has no hardware capability don't do any further
* checking.
*/
if (capab == 0)
goto software;
offset = sizeof (struct ether_vlan_header);
} else {
offset = sizeof (struct ether_header);
}
/*
* We only attempt to do IPv4 packets in hardware.
*/
if (sap != ETHERTYPE_IP)
goto software;
/*
* We know that this is an IPv4 packet.
*/
switch (ipha->ipha_protocol) {
case IPPROTO_TCP:
case IPPROTO_UDP: {
/*
* can use full IPv4 and partial checksum offload.
*/
break;
} else {
}
if (capab & HCKSUM_INET_FULL_V4) {
/*
* Some devices require that the checksum
* field of the packet is zero for full
* offload.
*/
*stuffp = 0;
return (mp);
}
if (capab & HCKSUM_INET_PARTIAL) {
if (*stuffp == 0) {
/*
* Older Solaris guests don't insert
* the pseudo-header checksum, so we
* calculate it here.
*/
}
return (mp);
}
/* NOTREACHED */
break;
}
default:
/* Use software. */
break;
}
/*
* We are not able to use any offload so do the whole thing in
* software.
*/
}
int
{
char *xsname;
xnbp->xnb_tx_buf_count = 0;
!= DDI_SUCCESS)
goto failure;
xnbp->xnb_rx_cpop_count = 0;
xnbp->xnb_icookie);
xnbp->xnb_icookie);
xnbp->xnb_icookie);
/* Set driver private pointer now. */
sizeof (xnb_txbuf_t), 0,
goto failure_0;
if (!xnb_ks_init(xnbp))
goto failure_1;
/*
* Receive notification of changes in the state of the
* driver in the guest domain.
*/
NULL) != DDI_SUCCESS)
goto failure_2;
/*
* Receive notification of hotplug events.
*/
NULL) != DDI_SUCCESS)
goto failure_2;
"feature-multicast-control", "%d",
xnb_multicast_control ? 1 : 0) != 0)
goto failure_3;
"feature-rx-copy", "%d", 1) != 0)
goto failure_3;
/*
* Linux domUs seem to depend on "feature-rx-flip" being 0
* in addition to "feature-rx-copy" being 1. It seems strange
* to use four possible states to describe a binary decision,
* but we might as well play nice.
*/
"feature-rx-flip", "%d", 0) != 0)
goto failure_3;
return (DDI_SUCCESS);
return (DDI_FAILURE);
}
void
{
if (xnbp->xnb_rx_cpop_count > 0)
* xnbp->xnb_rx_cpop_count);
}
/*
* Allocate a page from the hypervisor to be flipped to the peer.
*
* Try to get pages in batches to reduce the overhead of calls into
* the balloon driver.
*/
/*
 * NOTE(review): this extract has lost the function's name/parameter
 * line and the statements that actually perform the balloon
 * allocation; only the batching skeleton survives.  Visible behaviour:
 * on allocation failure a rate-limited warning is logged and 0 is
 * returned; otherwise an mfn is returned and the batch counter (nth)
 * is reset after a refill.  Confirm details against the full source.
 */
static mfn_t
{
if (nth == BATCH_SIZE) {
/*
 * Try for a single page in low memory situations.
 */
% WARNING_RATE_LIMIT) == 0)
"Cannot allocate memory to "
"transfer packets to peer.");
return (0);
} else {
return (mfn);
}
}
nth = 0;
}
return (mfn);
}
/*
* Free a page back to the hypervisor.
*
* This happens only in the error path, so batching is not worth the
* complication.
*/
/*
 * NOTE(review): truncated — the hypercall that returns the page to the
 * hypervisor is missing from this extract; only the error-reporting
 * tail (warning that the page is kept but unusable) is visible.
 */
static void
{
int r;
"reservation (%d): page kept but unusable (mfn = 0x%lx).",
r, mfn);
}
}
/*
* Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but using
* local variables. Used in both xnb_to_peer() and xnb_copy_to_peer().
*/
/*
* Pass packets to the peer using page flipping.
*/
mblk_t *
{
/*
* For each packet the sequence of operations is:
*
* 1. get a new page from the hypervisor.
* 2. get a request slot from the ring.
* 3. copy the data into the new page.
* 4. transfer the page to the peer.
* 5. update the request slot.
* 6. kick the peer.
* 7. free mp.
*
* In order to reduce the number of hypercalls, we prepare
* several packets for the peer and perform a single hypercall
* to transfer them.
*/
/*
* If we are not connected to the peer or have not yet
* finished hotplug it is too early to pass packets to the
* peer.
*/
return (mp);
}
char *valoop;
/* 1 */
break;
}
/* 2 */
#ifdef XNB_DEBUG
"id %d out of range in request 0x%p",
#endif /* XNB_DEBUG */
/* Assign a pfn and map the new page at the allocated va. */
/* 3 */
len = 0;
}
/* Release the pfn. */
/* 4 */
/* 5.1 */
if (cksum_flags != 0)
loop++;
prod++;
gop++;
}
/*
* Did we actually do anything?
*/
return (mp);
}
/*
* Unlink the end of the 'done' list from the remainder.
*/
}
/*
* If the status is anything other than
* GNTST_bad_page then we don't own the page
* any more, so don't try to give it back.
*/
} else {
/* The page is no longer ours. */
}
/*
* Give back the page, as we won't be using
* it.
*/
else
/*
* We gave away a page, update our accounting
* now.
*/
/* 5.2 */
if (status != NETIF_RSP_OKAY) {
} else {
}
loop++;
prod++;
gop++;
}
/* 6 */
/* LINTED: constant in conditional context */
if (notify) {
} else {
}
/* Free mblk_t's that we consumed. */
return (mp);
}
/* Helper functions for xnb_copy_to_peer(). */
/*
* Grow the array of copy operation descriptors.
*/
/*
 * NOTE(review): body truncated — the reallocation of the copy-
 * operation descriptor array described by the comment above is
 * missing; only the B_FALSE (failure) and B_TRUE (success) result
 * paths remain visible.
 */
static boolean_t
{
return (B_FALSE);
}
return (B_TRUE);
}
/*
* Check whether an address is on a page that's foreign to this domain.
*/
/*
 * NOTE(review): per the forward declaration and inline pragma earlier
 * in the file this is is_foreign(), which checks whether an address
 * lies on a page belonging to another domain; its parameter line and
 * body have been lost from this extract.
 */
static boolean_t
{
}
/*
* Insert a newly allocated mblk into a chain, replacing the old one.
*/
/*
 * NOTE(review): heavily truncated — the allocation/copy of the
 * replacement mblk and the chain re-linking statements are missing,
 * and the brace structure below is no longer balanced as a result.
 * Visible contract (from the comment above and the return): inserts a
 * newly allocated mblk into a chain and returns it.
 */
static mblk_t *
{
}
/* Make sure we only overwrite pointers to the mblk being replaced. */
return (new_mp);
}
/*
* Set all the fields in a gnttab_copy_t.
*/
/*
 * NOTE(review): per the forward declaration with #pragma inline and
 * the comment above, this is setup_gop(), which populates a
 * gnttab_copy_t descriptor; its parameters and body are missing from
 * this extract.
 */
static void
{
}
/*
* Pass packets to the peer using hypervisor copy operations.
*/
mblk_t *
{
int i;
/*
* If the peer does not pre-post buffers for received packets,
* use page flipping to pass packets to it.
*/
if (!xnbp->xnb_rx_hv_copy)
/*
* For each packet the sequence of operations is:
*
* 1. get a request slot from the ring.
* 2. set up data for hypercall (see NOTE below)
* 3. have the hypervisor copy the data
* 4. update the request slot.
* 5. kick the peer.
*
* NOTE ad 2.
* In order to reduce the number of hypercalls, we prepare
* several mblks (mp->b_cont != NULL) for the peer and
* perform a single hypercall to transfer them. We also have
* to set up a separate copy operation for every page.
*
* If we have more than one packet (mp->b_next != NULL), we do
* this whole dance repeatedly.
*/
return (mp);
}
int item_count;
/* 1 */
#ifdef XNB_DEBUG
"id %d out of range in request 0x%p",
#endif /* XNB_DEBUG */
/* 2 */
d_offset = 0;
len = 0;
item_count = 0;
/*
* We walk the b_cont pointers and set up a
* gnttab_copy_t for each sub-page chunk in each data
* block.
*/
/* 2a */
/*
* The hypervisor will not allow us to
* reference a foreign page (e.g. one
* belonging to another domain) by mfn in the
* copy operation. If the data in this mblk is
* on such a page we must copy the data into a
* local page before initiating the hypervisor
* copy operation.
*/
/* We can still use old ml, but not *ml! */
}
"(svd: %p), ml %p,rpt_alg. %p, d_offset "
"(%lu) + chunk (%lu) > PAGESIZE %d!",
(void *)rpt_align,
while (chunk > 0) {
if (!grow_cpop_area(xnbp))
goto failure;
}
/*
* If our mblk crosses a page boundary, we need
* to do a seperate copy for each page.
*/
(int)r_offset);
} else {
}
/*
* The 2nd, 3rd ... last copies will always
* start at r_tmp, therefore r_offset is 0.
*/
r_offset = 0;
gop_cp++;
item_count++;
}
}
/* 3 */
item_count) != 0) {
}
/* 4 */
if (cksum_flags != 0)
for (i = 0; i < item_count; i++) {
int, i);
}
}
/* 5.2 */
if (status != NETIF_RSP_OKAY) {
} else {
}
loop++;
prod++;
}
/*
* Did we actually do anything?
*/
return (mp);
}
/*
* Unlink the end of the 'done' list from the remainder.
*/
/* 6 */
/* LINTED: constant in conditional context */
if (notify) {
} else {
}
/* Free mblk_t structs we have consumed. */
return (mp);
}
/*
 * NOTE(review): the name line, the "if" half of the conditional and
 * its contents are missing from this extract; only the LINTED
 * annotation and an empty "else" arm survive.  Likely a peer
 * notification helper — confirm against the full source.
 */
static void
{
/* LINTED: constant in conditional context */
} else {
}
}
static void
{
RING_IDX i;
/*
* Note that we don't push the change to the peer here - that
* is the caller's responsibility.
*/
}
/*
 * NOTE(review): name, parameters and entire body are missing from
 * this extract; only the return type and an empty brace pair remain.
 */
static void
{
}
/*
 * NOTE(review): probably xnb_txbuf_constructor(), declared above as a
 * kmem-cache constructor, but the name line and most statements are
 * missing from this extract.  Visible skeleton: staged goto-based
 * cleanup (failure, failure_1, failure_2) around a DDI DMA handle
 * bind, returning 0 on success and -1 on failure.
 */
static int
{
goto failure;
goto failure_1;
&dma_cookie, &ncookies)
!= DDI_DMA_MAPPED)
goto failure_2;
return (0);
return (-1);
}
/*
 * NOTE(review): probably xnb_txbuf_destructor(), declared above as
 * the matching kmem-cache destructor; its name line and body are
 * missing from this extract.
 */
static void
{
}
/*
* Take packets from the peer and deliver them onward.
*/
static mblk_t *
{
int n_data_req, i;
/* LINTED: constant in conditional context */
if (!work_to_do) {
return (head);
}
/*
* This usually indicates that the frontend driver is
* misbehaving, as it's not possible to have more than
* NET_TX_RING_SIZE ring elements in play at any one
* time.
*
* We reset the ring pointers to the state declared by
* the frontend and try to carry on.
*/
"items in the ring, resetting and trying to recover.",
/* LINTED: constant in conditional context */
goto around;
}
n_data_req = 0;
if (unexpected_flags != 0) {
/*
* The peer used flag bits that we do not
* recognize.
*/
"unexpected flag bits (0x%x) from peer "
"in transmit request",
/* Mark this entry as failed. */
loop++; /* Consume another slot in the ring. */
erp = (struct netif_extra_info *)
break;
break;
default:
break;
}
/*
* Peer attempted to refer to data beyond the
* end of the granted page.
*/
"attempt to refer beyond the end of granted "
"page in txreq (offset %d, size %d).",
/* Mark this entry as failed. */
} else {
break;
break;
}
txpp++;
cop++;
n_data_req++;
}
loop++;
}
if (n_data_req == 0)
goto around;
i = n_data_req;
while (i > 0) {
txpp++;
i--;
}
goto finished;
}
i = n_data_req;
while (i > 0) {
#ifdef XNB_DEBUG
"txpp 0x%p failed (%d)",
#endif /* XNB_DEBUG */
} else {
/*
* If there are checksum flags, process them
* appropriately.
*/
!= 0) {
}
} else {
}
}
txpp++;
cop++;
i--;
}
goto around;
/* NOTREACHED */
}
/*
 * NOTE(review): the name/parameter line is missing, but the shape —
 * returns uint_t, bumps xnb_stat_intr, checks xnb_hotplugged and
 * returns DDI_INTR_CLAIMED on every visible path — marks this as the
 * event-channel interrupt handler.  The ring-processing statements
 * between the guards and the returns have been lost from this
 * extract.
 */
static uint_t
{
xnbp->xnb_stat_intr++;
if (!xnbp->xnb_hotplugged) {
goto fail;
}
goto fail;
}
return (DDI_INTR_CLAIMED);
fail:
return (DDI_INTR_CLAIMED);
}
/*
* Read our configuration from xenstore.
*/
{
char *xsname;
"cannot read mac address from %s",
xsname);
return (B_FALSE);
}
"xnb_attach: cannot parse mac address %s",
mac);
return (B_FALSE);
}
return (B_TRUE);
}
/*
* Read the configuration of the peer from xenstore.
*/
{
char *oename;
int i;
NULL) != 0) {
"cannot read other-end details from %s",
oename);
return (B_FALSE);
}
/*
* Check whether our peer requests receive side hypervisor
* copy.
*/
"request-rx-copy", "%d", &i) != 0)
i = 0;
if (i != 0)
/*
* Check whether our peer requests multicast_control.
*/
"request-multicast-control", "%d", &i) != 0)
i = 0;
if (i != 0)
/*
* The Linux backend driver here checks to see if the peer has
* set 'feature-no-csum-offload'. This is used to indicate
* that the guest cannot handle receiving packets without a
* valid checksum. We don't check here, because packets passed
* to the peer _always_ have a valid checksum.
*
* There are three cases:
*
* - the NIC is dedicated: packets from the wire should always
* have a valid checksum. If the hardware validates the
* checksum then the relevant bit will be set in the packet
* attributes and we will inform the peer. It can choose to
* ignore the hardware verification.
*
* - the NIC is shared (VNIC) and a packet originates from the
* wire: this is the same as the case above - the packets
* will have a valid checksum.
*
* - the NIC is shared (VNIC) and a packet originates from the
* host: the MAC layer ensures that all such packets have a
* valid checksum by calculating one if the stack did not.
*/
return (B_TRUE);
}
void
{
if (!xnb_connect_rings(dip)) {
"cannot connect rings");
goto failed;
}
"flavour failed to connect");
goto failed;
}
return;
}
static boolean_t
{
/*
* Cannot attempt to connect the rings if already connected.
*/
/*
* 1. allocate a vaddr for the tx page, one for the rx page.
* 2. call GNTTABOP_map_grant_ref to map the relevant pages
* into the allocated vaddr (one for tx, one for rx).
* 3. call EVTCHNOP_bind_interdomain to have the event channel
* bound to this domain.
* 4. associate the event channel with an interrupt.
* 5. enable the interrupt.
*/
/* 1.tx */
0, 0, 0, 0, VM_SLEEP);
/* 2.tx */
goto fail;
}
/* LINTED: constant in conditional context */
/* 1.rx */
0, 0, 0, 0, VM_SLEEP);
/* 2.rx */
goto fail;
}
/* LINTED: constant in conditional context */
/* 3 */
goto fail;
}
/*
* It would be good to set the state to XenbusStateConnected
* here as well, but then what if ddi_add_intr() failed?
* Changing the state in the store will be noticed by the peer
* and cannot be "taken back".
*/
/* 4, 5 */
!= DDI_SUCCESS) {
goto fail;
}
return (B_TRUE);
fail:
return (B_FALSE);
}
static void
{
}
}
unmap_op.dev_bus_addr = 0;
&unmap_op, 1) != 0)
"cannot unmap rx-ring page (%d)",
}
}
unmap_op.dev_bus_addr = 0;
&unmap_op, 1) != 0)
"cannot unmap tx-ring page (%d)",
}
}
}
static void
{
switch (new_state) {
case XenbusStateConnected:
/* spurious state change */
if (xnbp->xnb_connected)
return;
if (!xnb_read_oe_config(xnbp) ||
"read otherend config error");
break;
}
/*
* Now that we've attempted to connect it's reasonable
* to allow an attempt to detach.
*/
break;
case XenbusStateClosing:
break;
case XenbusStateClosed:
/*
* In all likelihood this is already set (in the above
* case), but if the peer never attempted to connect
* and the domain is destroyed we get here without
* having been through the case above, so we set it to
* be sure.
*/
break;
default:
break;
}
}
static void
{
switch (state) {
case Connected:
/* spurious hotplug event */
if (xnbp->xnb_hotplugged)
break;
if (!xnb_read_xs_config(xnbp))
break;
break;
break;
default:
break;
}
}
&mod_miscops, "xnb",
};
};
/*
 * Module entry point: install the module into the system.
 *
 * Returns the mod_install() result (DDI_SUCCESS or an error code).
 */
int
_init(void)
{
	int i;

	i = mod_install(&modlinkage);

	/*
	 * Return the mod_install() result unconditionally.  The
	 * previous code returned only when mod_install() failed and
	 * fell off the end of this non-void function on success,
	 * which is undefined behaviour.
	 */
	return (i);
}
/*
 * NOTE(review): the name/parameter line is missing; sitting between
 * _init() and _fini() this is presumably _info(struct modinfo *),
 * whose body (normally a mod_info() call) has been lost from this
 * extract.
 */
int
{
}
/*
 * Module exit point: remove the module from the system.
 *
 * Returns the mod_remove() result (DDI_SUCCESS or an error code).
 */
int
_fini(void)
{
	int i;

	i = mod_remove(&modlinkage);

	/*
	 * Return the mod_remove() result unconditionally.  The
	 * previous code returned only on DDI_SUCCESS and ran off the
	 * end of this non-void function when the remove failed,
	 * which is undefined behaviour.
	 */
	return (i);
}