/* ib_recv.c revision 1a5e258f5471356ca102c7176637cdce45bac147 */
/*
*/
/*
* This file contains code imported from the OFED rds source file ib_recv.c
* Oracle elects to have and use the contents of ib_recv.c under and governed
* by the OpenIB.org BSD license (see below for full license text). However,
* the following notice accompanied the original version of this file:
*/
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
 * - Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following
 * disclaimer in the documentation and/or other materials
 * provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
/*
 * Slab cache backing struct rdsv3_ib_incoming allocations; created in
 * rdsv3_ib_recv_init() and expected to be destroyed in rdsv3_ib_recv_exit().
 */
static struct kmem_cache *rdsv3_ib_incoming_slab;
void
{
struct rdsv3_ib_recv_work *recv;
struct rdsv3_header *hdrp;
uint32_t i;
/* initialize the hdr sgl permanently */
}
}
static void
struct rdsv3_ib_recv_work *recv)
{
}
}
}
void
{
uint32_t i;
}
static int
struct rdsv3_ib_recv_work *recv)
{
ic->i_max_recv_alloc)) {
goto out;
}
goto out;
}
}
goto out;
}
/* Data sge, structure copy */
return (0);
out:
}
return (-ENOMEM);
}
/*
* This tries to allocate and post unused work requests after making sure that
* they have all the allocations they need to queue received fragments into
* sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
* pairs don't go unmatched.
*
* -1 is returned if posting fails due to temporary resource exhaustion.
*/
int
{
struct rdsv3_ib_recv_work *recv;
unsigned int posted = 0;
RDSV3_DPRINTF2("rdsv3_ib_recv_refill",
"Argh - ring alloc returned pos=%u, avail: %d",
return (-EINVAL);
}
/* populate the WRs */
for (i = 0; i < avail; i++) {
if (ret) {
avail - i);
break;
}
}
if (i) {
/* post the WRs at one shot */
RDSV3_DPRINTF3("rdsv3_ib_recv_refill",
"attempted: %d posted: %d WRs ret %d",
if (ret) {
RDSV3_DPRINTF2("rdsv3_ib_recv_refill",
"disconnecting and reconnecting\n",
i - posted);
}
}
}
/* We're doing flow control - update the window. */
return (ret);
}
/*
* delayed freed incoming's
*/
/*
 * Pool of incoming buffers whose freeing is deferred (see the
 * "delayed freed incoming's" note above).
 * NOTE(review): the member list appears truncated in this view of the
 * file -- confirm the full definition against the complete source.
 */
struct rdsv3_inc_pool {
};
void
{
if (pool) {
}
}
int
{
struct rdsv3_inc_pool *pool;
return (-ENOMEM);
}
return (0);
}
static void
{
struct rdsv3_page_frag *frag;
struct rdsv3_page_frag *pos;
}
}
void
rdsv3_ib_drain_inclist(void *data)
{
struct rdsv3_ib_incoming *ibinc;
int i = 0;
for (;;) {
if (ibinc)
if (!ibinc)
break;
i++;
}
}
void
{
struct rdsv3_ib_incoming *ibinc;
/* save af_thr in a local as ib_inc might be freed at mutex_exit */
}
int
{
struct rdsv3_ib_incoming *ibinc;
struct rdsv3_page_frag *frag;
unsigned long to_copy;
unsigned long frag_off = 0;
int copied = 0;
int ret;
if (frag_off == RDSV3_FRAG_SIZE) {
frag_off = 0;
}
RDSV3_DPRINTF5("rdsv3_ib_inc_copy_to_user",
"%lu bytes to user %p from frag [%p, %u] + %lu",
if (ret) {
RDSV3_DPRINTF2("rdsv3_ib_inc_copy_to_user",
break;
}
}
RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user",
return (copied);
}
/* ic starts out kmem_zalloc()ed */
void
{
}
/*
* You'd think that with reliable IB connections you wouldn't need to ack
* messages that have been received. The problem is that IB hardware generates
* an ack message before it has DMAed the message into memory. This creates a
* potential message loss if the HCA is disabled for any reason between when it
* sends the ack and before the message is DMAed and processed. This is only a
* potential issue if another HCA is available for fail-over.
*
* When the remote host receives our ack they'll free the sent message from
* their send queue. To decrease the latency of this we always send an ack
* immediately after we've received messages.
*
* For simplicity, we only have one ack in flight at a time. This puts
* pressure on senders to have deep enough send queues to absorb the latency of
* a single ack frame being in flight. This might not be good enough.
*
 * This is implemented by having a long-lived send_wr and sge which point to a
* statically allocated ack frame. This ack wr does not fall under the ring
* accounting that the tx and rx wrs do. The QP attribute specifically makes
* room for it beyond the ring size. Send completion notices its special
* wr_id and avoids working with the ring in that case.
*/
void
int ack_required)
{
if (ack_required)
}
static uint64_t
{
return (seq);
}
static void
{
int ret;
ic, adv_credits);
rdsv3_message_populate_header(hdr, 0, 0, 0);
NULL);
if (ret) {
/*
* Failed to send. Release the WR, and
* force another ACK.
*/
} else {
}
ic, adv_credits);
}
/*
* There are 3 ways of getting acknowledgements to the peer:
* 1. We call rdsv3_ib_attempt_ack from the recv completion handler
* to send an ACK-only frame.
* However, there can be only one such frame in the send queue
* at any time, so we may have to postpone it.
* 2. When another (data) packet is transmitted while there's
* an ACK in the queue, we piggyback the ACK sequence number
* on the data packet.
* 3. If the ACK WR is done sending, we get called from the
* send queue completion handler, and check whether there's
* another ACK pending (postponed because the WR was on the
* queue). If so, we transmit it.
*
* We maintain 2 variables:
* - i_ack_flags, which keeps track of whether the ACK WR
* is currently in the send queue or not (IB_ACK_IN_FLIGHT)
* - i_ack_next, which is the last sequence number we received
*
* Potentially, send queue and receive queue handlers can run concurrently.
* It would be nice to not have to use a spinlock to synchronize things,
* but the one problem that rules this out is that 64bit updates are
* not atomic on all platforms. Things would be a lot simpler if
* we had atomic64 or maybe cmpxchg64 everywhere.
*
* Reconnecting complicates this picture just slightly. When we
* reconnect, we may be seeing duplicate packets. The peer
* is retransmitting them, because it hasn't seen an ACK for
* them. It is important that we ACK these.
*
* ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
* this flag set *MUST* be acknowledged immediately.
*/
/*
* When we get here, we're called from the recv queue handler.
* Check whether we ought to transmit an ACK.
*/
void
{
unsigned int adv_credits;
return;
return;
}
/* Can we get a send credit? */
return;
}
}
/*
* We get here from the send completion handler, when the
* adapter tells us the ACK frame was sent.
*/
void
{
}
/*
* This is called by the regular xmit code when it wants to piggyback
* an ACK on an outgoing frame.
*/
{
}
return (rdsv3_ib_get_ack(ic));
}
/*
* It's kind of lame that we're copying from the posted receive pages into
* long-lived bitmaps. We could have posted the bitmaps and rdma written into
* them. But receiving new congestion bitmaps should be a *rare* event, so
* hopefully we won't need to invest that complexity in making it more
* efficient. By copying we can share a simpler core with TCP which has to
* copy.
*/
static void
struct rdsv3_ib_incoming *ibinc)
{
struct rdsv3_cong_map *map;
unsigned int map_off;
unsigned int map_page;
struct rdsv3_page_frag *frag;
unsigned long frag_off;
unsigned long to_copy;
unsigned long copied;
uint64_t uncongested = 0;
/* catch completely corrupt packets */
return;
map_page = 0;
map_off = 0;
frag_off = 0;
copied = 0;
while (copied < RDSV3_CONG_MAP_BYTES) {
unsigned int k;
RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
for (k = 0; k < to_copy; k += 8) {
/*
* Record ports that became uncongested, ie
* bits that changed from 0 to 1.
*/
}
RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
map_off = 0;
map_page++;
}
if (frag_off == RDSV3_FRAG_SIZE) {
frag_off = 0;
}
}
#if 0
/* the congestion map is in little endian order */
#endif
}
static void
struct rdsv3_ib_ack_state *state)
{
/* XXX shut down the connection if port 0,0 are seen? */
RDSV3_DPRINTF5("rdsv3_ib_process_recv",
if (data_len < sizeof (struct rdsv3_header)) {
RDSV3_DPRINTF2("rdsv3_ib_process_recv",
"incoming message from %u.%u.%u.%u didn't include a "
"header, disconnecting and reconnecting",
return;
}
data_len -= sizeof (struct rdsv3_header);
/* Validate the checksum. */
if (!rdsv3_message_verify_checksum(ihdr)) {
"from %u.%u.%u.%u has corrupted header - "
"forcing a reconnect",
return;
}
/* Process the ACK sequence which comes with every packet */
/* Process the credits update if there was one */
/*
* This is an ACK-only packet. The fact that it gets
* special treatment here is that historically, ACKs
* were rather special beasts.
*/
return;
}
/*
* If we don't already have an inc on the connection then this
* fragment has a header and starts a message.. copy its header
* into the inc and save the inc so we can hang upcoming fragments
* off its list.
*/
if (!ibinc) {
RDSV3_DPRINTF5("rdsv3_ib_process_recv",
} else {
/*
* We can't just use memcmp here; fragments of a
* single message may carry different ACKs
*/
RDSV3_DPRINTF2("rdsv3_ib_process_recv",
"fragment header mismatch; forcing reconnect");
return;
}
}
else {
ic->i_recv_data_rem = 0;
else {
}
/*
* Evaluate the ACK_REQUIRED flag *after* we received
* the complete frame, and after bumping the next_rx
* sequence.
*/
}
}
RDSV3_DPRINTF4("rdsv3_ib_process_recv",
"Return: conn: %p recv: %p len: %d state: %p",
}
void
struct rdsv3_ib_ack_state *state)
{
struct rdsv3_ib_recv_work *recv;
RDSV3_DPRINTF4("rdsv3_ib_recv_cqe_handler",
"rwc wc_id 0x%llx status %u byte_len %u imm_data %u\n",
/*
* Also process recvs in connecting state because it is possible
* to get a recv completion _before_ the rdmacm ESTABLISHED
* event is processed.
*/
/* We expect errors as the qp is drained during shutdown */
} else {
RDSV3_DPRINTF2("rdsv3_ib_recv_cqe_handler",
"recv completion on "
"%u.%u.%u.%u had status %u, "
"disconnecting and reconnecting\n",
}
}
/*
* If we ever end up with a really empty receive ring, we're
* in deep trouble, as the sender will definitely see RNR
* timeouts.
*/
if (rdsv3_ib_ring_low(recv_ringp)) {
}
}
int
{
int ret = 0;
if (rdsv3_conn_up(conn))
return (ret);
}
int
rdsv3_ib_recv_init(void)
{
sizeof (struct rdsv3_ib_incoming), 0, rdsv3_ib_inc_constructor,
if (!rdsv3_ib_incoming_slab) {
"failed");
return (-ENOMEM);
}
return (0);
}
void
rdsv3_ib_recv_exit(void)
{
}