/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
 * - Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following
 * disclaimer in the documentation and/or other materials
 * provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
static struct kmem_cache *rdsv3_ib_incoming_slab;
static struct kmem_cache *rdsv3_ib_frag_slab;
static void
rdsv3_ib_frag_drop_page(struct rdsv3_page_frag *frag)
{
	RDSV3_DPRINTF5("rdsv3_ib_frag_drop_page",
	    "frag: %p, page: %p", frag, frag->f_page);
	kmem_free(frag->f_page, PAGESIZE);
	frag->f_page = NULL;
}
static void
rdsv3_ib_frag_free(struct rdsv3_page_frag *frag)
{
	RDSV3_DPRINTF5("rdsv3_ib_frag_free", "frag: %p", frag);
	ASSERT(frag->f_page == NULL);
	kmem_cache_free(rdsv3_ib_frag_slab, frag);
}
/*
 * We map a page at a time. Its fragments are posted in order. This is
 * called in fragment order as the fragments get receive completion
 * events. Only the last frag in the page performs the unmapping.
*
* It's OK for ring cleanup to call this in whatever order it likes because
* DMA is not in flight and so we can unmap while other ring entries still
* hold page references in their frags.
*/
static void
rdsv3_ib_recv_unmap_page(struct rdsv3_ib_connection *ic,
    struct rdsv3_ib_recv_work *recv)
{
	struct rdsv3_page_frag *frag = recv->r_frag;

#if 0
	RDSV3_DPRINTF5("rdsv3_ib_recv_unmap_page",
	    "ic: %p, recv: %p, frag: %p", ic, recv, frag);
#endif
	/* only the frag that carries the page mapping unmaps it */
	if (frag->f_mapped != NULL) {
		(void) ibt_unmap_mem_iov(
		    ic->i_hca_hdl, frag->f_mapped);
		frag->f_mapped = NULL;
	}
}
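/*
 * A minimal sketch (not from the original source) of the rule above:
 * fragments of a page complete in post order, so only the fragment
 * covering the tail of the page tears down the shared DMA mapping.
 * All names here (sketch_frag, SKETCH_*) are hypothetical.
 */
#if 0
#define	SKETCH_FRAG_SIZE	4096
#define	SKETCH_PAGE_SIZE	16384

struct sketch_frag {
	size_t	f_offset;	/* offset of this frag within its page */
	void	*f_mapped;	/* per-page DMA handle, shared by frags */
};

static void
sketch_frag_recv_done(struct sketch_frag *frag)
{
	/* the frag ending at the page boundary was posted last */
	if (frag->f_offset + SKETCH_FRAG_SIZE == SKETCH_PAGE_SIZE &&
	    frag->f_mapped != NULL) {
		/* unmap here; earlier frags left the mapping alone */
		frag->f_mapped = NULL;
	}
}
#endif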
void
rdsv3_ib_recv_init_ring(struct rdsv3_ib_connection *ic)
{
	struct rdsv3_ib_recv_work *recv;
	struct rdsv3_header *hdrp;
	uint32_t i;

	hdrp = ic->i_recv_hdrs;
	for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr;
	    i++, recv++) {
		/* initialize the hdr sgl permanently */
		recv->r_sge[0].ds_va = (ib_vaddr_t)(uintptr_t)hdrp++;
		recv->r_sge[0].ds_len = sizeof (struct rdsv3_header);
		recv->r_sge[0].ds_key = ic->i_mr->lkey;
	}
}
static void
rdsv3_ib_recv_clear_one(struct rdsv3_connection *conn,
    struct rdsv3_ib_recv_work *recv)
{
	if (recv->r_ibinc != NULL) {
		rdsv3_inc_put(&recv->r_ibinc->ii_inc);
		recv->r_ibinc = NULL;
	}
	if (recv->r_frag != NULL) {
		rdsv3_ib_recv_unmap_page(conn->c_transport_data, recv);
		rdsv3_ib_frag_free(recv->r_frag);
		recv->r_frag = NULL;
	}
}
void
rdsv3_ib_recv_clear_ring(struct rdsv3_ib_connection *ic)
{
	uint32_t i;

	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
		rdsv3_ib_recv_clear_one(ic->conn, &ic->i_recvs[i]);
}
static int
rdsv3_ib_recv_refill_one(struct rdsv3_connection *conn,
    struct rdsv3_ib_recv_work *recv,
    int kptr_gfp, int page_gfp)
{
goto out;
}
kptr_gfp);
goto out;
}
sizeof (struct rdsv3_page_frag),
}
goto out;
}
goto out;
}
iov_attr.iov_lso_hdr_sz = 0;
/* Data */
/*
* Header comes from pre-registered buffer, so don't map it.
* Map the data only and stick in the header sgl quietly after
* the call.
*/
if (ret != IBT_SUCCESS) {
RDSV3_DPRINTF2("rdsv3_ib_recv_refill_one",
"ibt_map_mem_iov failed: %d", ret);
goto out;
}
/* stick in the header */
/*
* Once we get the RDSV3_PAGE_LAST_OFF frag then rdsv3_ib_frag_unmap()
* must be called on this recv. This happens as completions hit
* in order or on connection shutdown.
*/
} else {
}
ret = 0;
out:
return (ret);
}
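/*
 * A sketch of the trick described inside refill_one above, with a
 * hypothetical helper name: only the data buffer goes through
 * ibt_map_mem_iov(), and the pre-registered header sge is then placed
 * in slot 0 of the recv WR's sgl by hand after the call.
 */
#if 0
static void
sketch_stick_in_header(ibt_recv_wr_t *wr, const ibt_wr_ds_t *hdr_sge)
{
	/* the mapping call filled wr_sgl[1..]; slot 0 stays ours */
	wr->wr_sgl[0] = *hdr_sge;
}
#endif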
/*
* This tries to allocate and post unused work requests after making sure that
* they have all the allocations they need to queue received fragments into
* sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
* pairs don't go unmatched.
*
* -1 is returned if posting fails due to temporary resource exhaustion.
*/
int
rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int kptr_gfp,
    int page_gfp, int prefill)
{
struct rdsv3_ib_recv_work *recv;
unsigned int succ_wr;
unsigned int posted = 0;
int ret = 0;
RDSV3_DPRINTF2("rdsv3_ib_recv_refill",
"Argh - ring alloc returned pos=%u",
pos);
break;
}
if (ret) {
ret = -1;
break;
}
/* XXX when can this fail? */
RDSV3_DPRINTF5("rdsv3_ib_recv_refill",
"recv %p ibinc %p frag %p ret %d\n", recv,
if (ret) {
RDSV3_DPRINTF2("rdsv3_ib_recv_refill",
"recv post on %u.%u.%u.%u returned %d, "
"disconnecting and reconnecting\n",
ret = -1;
break;
}
posted++;
}
/* We're doing flow control - update the window. */
if (ret)
return (ret);
}
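/*
 * A sketch (hypothetical helpers ring_alloc/ring_unalloc/post_one) of
 * the pairing rule above: every slot taken from the ring under
 * i_recv_mutex is either posted successfully or handed back, so the
 * alloc/unalloc accounting never goes unmatched.
 */
#if 0
static int
sketch_refill(struct sketch_ring *ring)
{
	uint32_t pos;
	int ret = 0;

	while (ring_alloc(ring, 1, &pos)) {
		if (post_one(ring, pos) != 0) {
			ring_unalloc(ring, 1);	/* matched give-back */
			ret = -1;	/* temporary resource exhaustion */
			break;
		}
	}
	return (ret);
}
#endif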
void
rdsv3_ib_inc_purge(struct rdsv3_incoming *inc)
{
	struct rdsv3_ib_incoming *ibinc;
	struct rdsv3_page_frag *frag;
	struct rdsv3_page_frag *pos;

	RDSV3_DPRINTF5("rdsv3_ib_inc_purge", "inc: %p", inc);

	ibinc = container_of(inc, struct rdsv3_ib_incoming, ii_inc);
	RDSV3_FOR_EACH_LIST_NODE_SAFE(frag, pos, &ibinc->ii_frags, f_item) {
		list_remove_node(&frag->f_item);
		rdsv3_ib_frag_drop_page(frag);
		rdsv3_ib_frag_free(frag);
	}
}
void
rdsv3_ib_inc_free(struct rdsv3_incoming *inc)
{
	struct rdsv3_ib_incoming *ibinc;

	ibinc = container_of(inc, struct rdsv3_ib_incoming, ii_inc);

	rdsv3_ib_inc_purge(inc);
	kmem_cache_free(rdsv3_ib_incoming_slab, ibinc);
}
int
rdsv3_ib_inc_copy_to_user(struct rdsv3_incoming *inc, uio_t *uiop,
    size_t size)
{
struct rdsv3_ib_incoming *ibinc;
struct rdsv3_page_frag *frag;
unsigned long to_copy;
unsigned long frag_off = 0;
int copied = 0;
int ret;
if (frag_off == RDSV3_FRAG_SIZE) {
frag_off = 0;
}
RDSV3_DPRINTF5("rdsv3_ib_inc_copy_to_user",
"%lu bytes to user %p from frag [%p, %u] + %lu",
if (ret) {
RDSV3_DPRINTF2("rdsv3_ib_inc_copy_to_user",
break;
}
}
RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user",
return (copied);
}
/* ic starts out kmem_zalloc()ed */
void
rdsv3_ib_recv_init_ack(struct rdsv3_ib_connection *ic)
{
}
/*
* You'd think that with reliable IB connections you wouldn't need to ack
* messages that have been received. The problem is that IB hardware generates
* an ack message before it has DMAed the message into memory. This creates a
* potential message loss if the HCA is disabled for any reason between when it
* sends the ack and before the message is DMAed and processed. This is only a
* potential issue if another HCA is available for fail-over.
*
* When the remote host receives our ack they'll free the sent message from
* their send queue. To decrease the latency of this we always send an ack
* immediately after we've received messages.
*
* For simplicity, we only have one ack in flight at a time. This puts
* pressure on senders to have deep enough send queues to absorb the latency of
* a single ack frame being in flight. This might not be good enough.
*
 * This is implemented by having a long-lived send_wr and sge which point to a
* statically allocated ack frame. This ack wr does not fall under the ring
* accounting that the tx and rx wrs do. The QP attribute specifically makes
* room for it beyond the ring size. Send completion notices its special
* wr_id and avoids working with the ring in that case.
*/
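/*
 * A sketch of the long-lived ACK WR described above, using IBTF types
 * but hypothetical names and values; the reserved wr_id is what lets
 * the send completion path recognize the ACK WR and bypass the ring
 * accounting.
 */
#if 0
#define	SKETCH_ACK_WR_ID	(~(uint64_t)0)	/* outside ring id range */

static void
sketch_init_ack_wr(ibt_send_wr_t *wr, ibt_wr_ds_t *sge,
    ib_vaddr_t ack_frame_va, ibt_lkey_t lkey)
{
	sge->ds_va = ack_frame_va;		/* static ack frame */
	sge->ds_len = sizeof (struct rdsv3_header);
	sge->ds_key = lkey;

	wr->wr_id = SKETCH_ACK_WR_ID;		/* special wr_id */
	wr->wr_opcode = IBT_WRC_SEND;
	wr->wr_flags = IBT_WR_SEND_SIGNAL;
	wr->wr_nds = 1;
	wr->wr_sgl = sge;
}
#endif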
static void
rdsv3_ib_set_ack(struct rdsv3_ib_connection *ic, uint64_t seq,
    int ack_required)
{
	mutex_enter(&ic->i_ack_lock);
	ic->i_ack_next = seq;
	if (ack_required)
		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
	mutex_exit(&ic->i_ack_lock);
}
static uint64_t
rdsv3_ib_get_ack(struct rdsv3_ib_connection *ic)
{
	uint64_t seq;

	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
	mutex_enter(&ic->i_ack_lock);
	seq = ic->i_ack_next;
	mutex_exit(&ic->i_ack_lock);
	return (seq);
}
static void
rdsv3_ib_send_ack(struct rdsv3_ib_connection *ic, unsigned int adv_credits)
{
int ret;
ic, adv_credits);
rdsv3_message_populate_header(hdr, 0, 0, 0);
NULL);
if (ret) {
/*
* Failed to send. Release the WR, and
* force another ACK.
*/
} else {
}
ic, adv_credits);
}
/*
* There are 3 ways of getting acknowledgements to the peer:
* 1. We call rdsv3_ib_attempt_ack from the recv completion handler
* to send an ACK-only frame.
* However, there can be only one such frame in the send queue
* at any time, so we may have to postpone it.
* 2. When another (data) packet is transmitted while there's
* an ACK in the queue, we piggyback the ACK sequence number
* on the data packet.
* 3. If the ACK WR is done sending, we get called from the
* send queue completion handler, and check whether there's
* another ACK pending (postponed because the WR was on the
* queue). If so, we transmit it.
*
* We maintain 2 variables:
* - i_ack_flags, which keeps track of whether the ACK WR
* is currently in the send queue or not (IB_ACK_IN_FLIGHT)
* - i_ack_next, which is the last sequence number we received
*
* Potentially, send queue and receive queue handlers can run concurrently.
* It would be nice to not have to use a spinlock to synchronize things,
* but the one problem that rules this out is that 64bit updates are
* not atomic on all platforms. Things would be a lot simpler if
* we had atomic64 or maybe cmpxchg64 everywhere.
*
* Reconnecting complicates this picture just slightly. When we
* reconnect, we may be seeing duplicate packets. The peer
* is retransmitting them, because it hasn't seen an ACK for
* them. It is important that we ACK these.
*
* ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
* this flag set *MUST* be acknowledged immediately.
*/
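/*
 * A sketch of the three ACK paths enumerated above, with hypothetical
 * names; a mutex stands in for the missing 64-bit atomics, and the
 * REQUESTED/IN_FLIGHT bits keep at most one ACK WR in the send queue.
 */
#if 0
#define	SK_ACK_REQUESTED	0x1
#define	SK_ACK_IN_FLIGHT	0x2

struct sketch_ack {
	kmutex_t	sa_lock;
	uint64_t	sa_next;	/* last sequence received */
	uint_t		sa_flags;
};

/* path 1: recv handler tries to post an ACK-only frame */
static void
sketch_attempt_ack(struct sketch_ack *sa)
{
	mutex_enter(&sa->sa_lock);
	if ((sa->sa_flags & SK_ACK_REQUESTED) &&
	    !(sa->sa_flags & SK_ACK_IN_FLIGHT)) {
		sa->sa_flags &= ~SK_ACK_REQUESTED;
		sa->sa_flags |= SK_ACK_IN_FLIGHT;
		/* post the long-lived ACK WR here */
	}
	mutex_exit(&sa->sa_lock);
}

/* path 2: xmit path folds the ACK onto an outgoing data packet */
static uint64_t
sketch_piggyb_ack(struct sketch_ack *sa)
{
	uint64_t seq;

	mutex_enter(&sa->sa_lock);
	sa->sa_flags &= ~SK_ACK_REQUESTED;
	seq = sa->sa_next;
	mutex_exit(&sa->sa_lock);
	return (seq);
}

/* path 3: send CQ handler retires the WR and retries a postponed ACK */
static void
sketch_ack_send_complete(struct sketch_ack *sa)
{
	mutex_enter(&sa->sa_lock);
	sa->sa_flags &= ~SK_ACK_IN_FLIGHT;
	mutex_exit(&sa->sa_lock);
	sketch_attempt_ack(sa);
}
#endif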
/*
* When we get here, we're called from the recv queue handler.
* Check whether we ought to transmit an ACK.
*/
void
rdsv3_ib_attempt_ack(struct rdsv3_ib_connection *ic)
{
	unsigned int adv_credits;

	if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
		return;
	if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
		rdsv3_ib_stats_inc(s_ib_ack_send_delayed);
		return;
	}
	/* Can we get a send credit? */
	if (!rdsv3_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
		rdsv3_ib_stats_inc(s_ib_tx_throttle);
		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
		return;
	}
	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
	rdsv3_ib_send_ack(ic, adv_credits);
}
/*
* We get here from the send completion handler, when the
* adapter tells us the ACK frame was sent.
*/
void
rdsv3_ib_ack_send_complete(struct rdsv3_ib_connection *ic)
{
	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
	rdsv3_ib_attempt_ack(ic);
}
/*
* This is called by the regular xmit code when it wants to piggyback
* an ACK on an outgoing frame.
*/
uint64_t
rdsv3_ib_piggyb_ack(struct rdsv3_ib_connection *ic)
{
	if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) {
		rdsv3_ib_stats_inc(s_ib_ack_send_piggybacked);
	}
	return (rdsv3_ib_get_ack(ic));
}
static struct rdsv3_header *
rdsv3_ib_get_header(struct rdsv3_connection *conn,
    struct rdsv3_ib_recv_work *recv,
    uint32_t data_len)
{
/*
* Support header at the front (RDS 3.1+) as well as header-at-end.
*
* Cases:
* 1) header all in header buff (great!)
* 2) header all in data page (copy all to header buff)
* 3) header split across hdr buf + data page
* (move bit in hdr buff to end before copying other bit from
* data page)
*/
return (hdr_buff);
/*
* XXX - Need to discuss the support for version < RDS_PROTOCOL_3_1.
*/
return (hdr_buff);
	/* version < RDS_PROTOCOL_3_0 */
	RDSV3_DPRINTF2("rdsv3_ib_get_header",
	    "unsupported version, data_len: %d",
	    data_len);
	return (NULL);
}
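/*
 * A sketch of case 3 above with hypothetical names: the front of a
 * header-at-end frame's header lands in the tail of the posted header
 * buffer and the rest at the end of the data page, so the two pieces
 * are stitched back together in the header buffer.
 */
#if 0
#include <string.h>

static void
sketch_reassemble_split_header(uint8_t *hdr_buff, uint32_t hdr_len,
    uint32_t bytes_in_hdr_buff, const uint8_t *page_tail)
{
	uint32_t rest = hdr_len - bytes_in_hdr_buff;

	/* move the bit in the hdr buff so the two pieces can abut */
	memmove(hdr_buff, hdr_buff + rest, bytes_in_hdr_buff);
	/* then copy the other bit from the data page */
	memcpy(hdr_buff + bytes_in_hdr_buff, page_tail, rest);
}
#endif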
/*
* It's kind of lame that we're copying from the posted receive pages into
* long-lived bitmaps. We could have posted the bitmaps and rdma written into
* them. But receiving new congestion bitmaps should be a *rare* event, so
* hopefully we won't need to invest that complexity in making it more
* efficient. By copying we can share a simpler core with TCP which has to
* copy.
*/
static void
rdsv3_ib_cong_recv(struct rdsv3_connection *conn,
    struct rdsv3_ib_incoming *ibinc)
{
struct rdsv3_cong_map *map;
unsigned int map_off;
unsigned int map_page;
struct rdsv3_page_frag *frag;
unsigned long frag_off;
unsigned long to_copy;
unsigned long copied;
uint64_t uncongested = 0;
	/* catch completely corrupt packets */
	if (ntohl(ibinc->ii_inc.i_hdr.h_len) != RDSV3_CONG_MAP_BYTES)
		return;
map_page = 0;
map_off = 0;
frag_off = 0;
copied = 0;
while (copied < RDSV3_CONG_MAP_BYTES) {
unsigned int k;
RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
for (k = 0; k < to_copy; k += 8) {
			/*
			 * Record ports that became uncongested, i.e.
			 * bits that changed from 0 to 1.
			 */
}
RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
map_off = 0;
map_page++;
}
if (frag_off == RDSV3_FRAG_SIZE) {
frag_off = 0;
}
}
#if 0
/* the congestion map is in little endian order */
#endif
}
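/*
 * A sketch of the per-word scan in the copy loop above; names are
 * hypothetical. XOR picks out ports whose congestion bit changed, and
 * the real loop keeps only the transitions that mean "became
 * uncongested" so those ports can be woken.
 */
#if 0
static uint64_t
sketch_changed_ports(const uint64_t *oldmap, const uint64_t *newmap,
    size_t words)
{
	uint64_t changed = 0;
	size_t i;

	for (i = 0; i < words; i++)
		changed |= oldmap[i] ^ newmap[i];
	return (changed);
}
#endif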
/*
* Rings are posted with all the allocations they'll need to queue the
* incoming message to the receiving socket so this can't fail.
* All fragments start with a header, so we can make sure we're not receiving
* garbage, and we can tell a small 8 byte fragment from an ACK frame.
*/
struct rdsv3_ib_ack_state {
unsigned int ack_required:1;
unsigned int ack_next_valid:1;
unsigned int ack_recv_valid:1;
};
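/*
 * A sketch (hypothetical helper) of how an ACK-only frame is told
 * apart below: every fragment starts with a header, so a frame naming
 * no ports and carrying no payload can only be an ACK.
 */
#if 0
static int
sketch_is_ack_only(uint16_t h_sport, uint16_t h_dport, uint32_t data_len)
{
	return (h_sport == 0 && h_dport == 0 && data_len == 0);
}
#endif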
static void
rdsv3_ib_process_recv(struct rdsv3_connection *conn,
    struct rdsv3_ib_recv_work *recv, uint32_t data_len,
    struct rdsv3_ib_ack_state *state)
{
/* XXX shut down the connection if port 0,0 are seen? */
RDSV3_DPRINTF5("rdsv3_ib_process_recv",
if (data_len < sizeof (struct rdsv3_header)) {
RDSV3_DPRINTF2("rdsv3_ib_process_recv",
"incoming message from %u.%u.%u.%u didn't include a "
"header, disconnecting and reconnecting",
return;
}
data_len -= sizeof (struct rdsv3_header);
"from %u.%u.%u.%u didn't have a proper version (0x%x) or"
"data_len (0x%x), disconnecting and "
"reconnecting",
return;
}
/* Validate the checksum. */
if (!rdsv3_message_verify_checksum(ihdr)) {
"from %u.%u.%u.%u has corrupted header - "
"forcing a reconnect",
return;
}
/* Process the ACK sequence which comes with every packet */
/* Process the credits update if there was one */
	/*
	 * This is an ACK-only packet. It gets special treatment
	 * here only because, historically, ACKs were rather
	 * special beasts.
	 */
	/*
	 * Usually the frags make their way on to incs and are then freed
	 * as the inc is freed. We don't go that route, so we have to drop
	 * the page ref ourselves. We can't just leave the page on the recv
	 * because that confuses the dma mapping of pages and each recv's
	 * use of a partial page. We can leave the frag, though, it will be
	 * reused.
	 *
	 * FIXME: Fold this into the code path below.
	 */
return;
}
	/*
	 * If we don't already have an inc on the connection then this
	 * fragment has a header and starts a message; copy its header
	 * into the inc and save the inc so we can hang upcoming fragments
	 * off its list.
	 */
RDSV3_DPRINTF5("rdsv3_ib_process_recv",
} else {
/*
* We can't just use memcmp here; fragments of a
* single message may carry different ACKs
*/
RDSV3_DPRINTF2("rdsv3_ib_process_recv",
"fragment header mismatch; forcing reconnect");
return;
}
}
	if (ic->i_recv_data_rem > RDSV3_FRAG_SIZE)
		ic->i_recv_data_rem -= RDSV3_FRAG_SIZE;
	else {
		ic->i_recv_data_rem = 0;
		ic->i_ibinc = NULL;

		if (ibinc->ii_inc.i_hdr.h_flags == RDSV3_FLAG_CONG_BITMAP)
			rdsv3_ib_cong_recv(conn, ibinc);
		else {
			rdsv3_recv_incoming(conn, conn->c_faddr,
			    conn->c_laddr, &ibinc->ii_inc, KM_NOSLEEP);
			state->ack_next = ntohll(
			    ibinc->ii_inc.i_hdr.h_sequence);
			state->ack_next_valid = 1;
		}
/*
* Evaluate the ACK_REQUIRED flag *after* we received
* the complete frame, and after bumping the next_rx
* sequence.
*/
		if (ibinc->ii_inc.i_hdr.h_flags & RDSV3_FLAG_ACK_REQUIRED)
			state->ack_required = 1;

		rdsv3_inc_put(&ibinc->ii_inc);
	}
RDSV3_DPRINTF4("rdsv3_ib_process_recv",
"Return: conn: %p recv: %p len: %d state: %p",
}
/*
* Plucking the oldest entry from the ring can be done concurrently with
* the thread refilling the ring. Each ring operation is protected by
* spinlocks and the transient state of refilling doesn't change the
* recording of which entry is oldest.
*
* This relies on IB only calling one cq comp_handler for each cq so that
* there will only be one caller of rdsv3_recv_incoming() per RDS connection.
*/
void
rdsv3_ib_recv_cq_comp_handler(ibt_cq_hdl_t cq, void *arg)
{
	RDSV3_DPRINTF4("rdsv3_ib_recv_cq_comp_handler",
	    "cq: %p, arg: %p", cq, arg);
}
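/*
 * A sketch of the single-consumer claim above, with hypothetical
 * names: the refill path only ever advances the producer counter, so
 * the completion path can read the oldest entry without coordinating
 * with the producer's transient state.
 */
#if 0
struct sketch_work_ring {
	uint32_t	w_nr;		/* ring size */
	uint32_t	w_alloc_ctr;	/* producer: slots handed out */
	uint32_t	w_free_ctr;	/* consumer: slots retired */
};

static uint32_t
sketch_ring_oldest(const struct sketch_work_ring *ring)
{
	/* refill never touches w_free_ctr, so this is stable here */
	return (ring->w_free_ctr % ring->w_nr);
}
#endif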
static inline void
{
struct rdsv3_ib_recv_work *recv;
IBT_SUCCESS) {
RDSV3_DPRINTF5("rdsv3_ib_recv_cq_comp_handler",
"rwc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
/*
* Also process recvs in connecting state because it is possible
* to get a recv completion _before_ the rdmacm ESTABLISHED
* event is processed.
*/
/*
* We expect errors as the qp is drained during
* shutdown
*/
} else {
RDSV3_DPRINTF2("rdsv3_ib_recv_cq_comp_handler",
"recv completion on "
"%u.%u.%u.%u had status %u, "
"disconnecting and reconnecting\n",
}
}
}
}
static processorid_t rdsv3_taskq_bind_cpuid = 0;
void
rdsv3_ib_recv_tasklet_fn(void *data)
{
struct rdsv3_ib_ack_state state = { 0, };
/* If not already bound, bind this thread to a CPU */
if (cpu_is_online(cp)) {
if (ic->i_recv_tasklet_cpuid >= 0)
}
}
if (state.ack_next_valid)
}
if (rdsv3_conn_up(conn))
/*
* If we ever end up with a really empty receive ring, we're
* in deep trouble, as the sender will definitely see RNR
* timeouts.
*/
/*
* If the ring is running low, then schedule the thread to refill.
*/
}
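/*
 * A sketch of the refill policy described above, with hypothetical
 * helpers (sketch_refill_now, sketch_schedule_refill): an empty ring
 * guarantees RNR timeouts at the sender, so that case is refilled
 * inline; a merely low ring is handed to the worker thread instead.
 */
#if 0
static void
sketch_check_ring(uint32_t posted, uint32_t ring_size)
{
	if (posted == 0)
		sketch_refill_now();		/* deep trouble: do it here */
	else if (posted < ring_size / 2)
		sketch_schedule_refill();	/* running low: defer */
}
#endif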
int
rdsv3_ib_recv(struct rdsv3_connection *conn)
{
int ret = 0;
/*
* If we get a temporary posting failure in this context then
* we're really low and we want the caller to back off for a bit.
*/
else
if (rdsv3_conn_up(conn))
return (ret);
}
int
rdsv3_ib_recv_init(void)
{
	int ret = -ENOMEM;

	/* XXX - hard code it to 128 MB */
	rdsv3_ib_sysctl_max_recv_allocation = (128 * 1024 * 1024) /
	    RDSV3_FRAG_SIZE;

	rdsv3_ib_incoming_slab = kmem_cache_create("rdsv3_ib_incoming",
	    sizeof (struct rdsv3_ib_incoming), 0, NULL, NULL, NULL,
	    NULL, NULL, 0);
	if (rdsv3_ib_incoming_slab == NULL)
		goto out;

	rdsv3_ib_frag_slab = kmem_cache_create("rdsv3_ib_frag",
	    sizeof (struct rdsv3_page_frag), 0, NULL, NULL, NULL,
	    NULL, NULL, 0);
	if (rdsv3_ib_frag_slab == NULL)
		kmem_cache_destroy(rdsv3_ib_incoming_slab);
	else
		ret = 0;
out:
	return (ret);
}
void
rdsv3_ib_recv_exit(void)
{
	kmem_cache_destroy(rdsv3_ib_incoming_slab);
	kmem_cache_destroy(rdsv3_ib_frag_slab);
}