send.c revision fe817b6022080da0a98b5d2d8cd179f594d6ca5e
03831d35f7499c87d51205817c93e9a8d42c4baestevel * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
03831d35f7499c87d51205817c93e9a8d42c4baestevel * This file contains code imported from the OFED rds source file send.c
03831d35f7499c87d51205817c93e9a8d42c4baestevel * Oracle elects to have and use the contents of send.c under and governed
03831d35f7499c87d51205817c93e9a8d42c4baestevel * by the OpenIB.org BSD license (see below for full license text). However,
03831d35f7499c87d51205817c93e9a8d42c4baestevel * the following notice accompanied the original version of this file:
03831d35f7499c87d51205817c93e9a8d42c4baestevel * Copyright (c) 2006 Oracle. All rights reserved.
03831d35f7499c87d51205817c93e9a8d42c4baestevel * This software is available to you under a choice of one of two
03831d35f7499c87d51205817c93e9a8d42c4baestevel * licenses. You may choose to be licensed under the terms of the GNU
03831d35f7499c87d51205817c93e9a8d42c4baestevel * General Public License (GPL) Version 2, available from the file
03831d35f7499c87d51205817c93e9a8d42c4baestevel * COPYING in the main directory of this source tree, or the
03831d35f7499c87d51205817c93e9a8d42c4baestevel * OpenIB.org BSD license below:
03831d35f7499c87d51205817c93e9a8d42c4baestevel * Redistribution and use in source and binary forms, with or
03831d35f7499c87d51205817c93e9a8d42c4baestevel * without modification, are permitted provided that the following
03831d35f7499c87d51205817c93e9a8d42c4baestevel * conditions are met:
03831d35f7499c87d51205817c93e9a8d42c4baestevel * - Redistributions of source code must retain the above
03831d35f7499c87d51205817c93e9a8d42c4baestevel * copyright notice, this list of conditions and the following
03831d35f7499c87d51205817c93e9a8d42c4baestevel * disclaimer.
03831d35f7499c87d51205817c93e9a8d42c4baestevel * - Redistributions in binary form must reproduce the above
03831d35f7499c87d51205817c93e9a8d42c4baestevel * copyright notice, this list of conditions and the following
03831d35f7499c87d51205817c93e9a8d42c4baestevel * disclaimer in the documentation and/or other materials
03831d35f7499c87d51205817c93e9a8d42c4baestevel * provided with the distribution.
03831d35f7499c87d51205817c93e9a8d42c4baestevel * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
03831d35f7499c87d51205817c93e9a8d42c4baestevel * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
03831d35f7499c87d51205817c93e9a8d42c4baestevel * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
03831d35f7499c87d51205817c93e9a8d42c4baestevel * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
03831d35f7499c87d51205817c93e9a8d42c4baestevel * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
03831d35f7499c87d51205817c93e9a8d42c4baestevel * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
03831d35f7499c87d51205817c93e9a8d42c4baestevel * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
03831d35f7499c87d51205817c93e9a8d42c4baestevel * SOFTWARE.
03831d35f7499c87d51205817c93e9a8d42c4baestevel * When transmitting messages in rdsv3_send_xmit, we need to emerge from
03831d35f7499c87d51205817c93e9a8d42c4baestevel * time to time and briefly release the CPU. Otherwise the softlock watchdog
03831d35f7499c87d51205817c93e9a8d42c4baestevel * will kick our shin.
03831d35f7499c87d51205817c93e9a8d42c4baestevel * Also, it seems fairer to not let one busy connection stall all the others.
03831d35f7499c87d51205817c93e9a8d42c4baestevel * send_batch_count is the number of times we'll loop in send_xmit. Setting
03831d35f7499c87d51205817c93e9a8d42c4baestevel * it to 0 will restore the old behavior (where we looped until we had
03831d35f7499c87d51205817c93e9a8d42c4baestevel * drained the queue).
03831d35f7499c87d51205817c93e9a8d42c4baestevelextern void rdsv3_ib_send_unmap_rdma(void *ic, struct rdsv3_rdma_op *op);
03831d35f7499c87d51205817c93e9a8d42c4baestevel * Reset the send state. Caller must hold c_send_lock when calling here.
03831d35f7499c87d51205817c93e9a8d42c4baestevel RDSV3_DPRINTF4("rdsv3_send_reset", "Enter(conn: %p)", conn);
03831d35f7499c87d51205817c93e9a8d42c4baestevel "rm %p mflg 0x%x map %d mihdl %p sgl %p",
03831d35f7499c87d51205817c93e9a8d42c4baestevel rdsv3_ib_send_unmap_rdma(conn->c_transport_data, ro);
03831d35f7499c87d51205817c93e9a8d42c4baestevel * Tell the user the RDMA op is no longer mapped by the
03831d35f7499c87d51205817c93e9a8d42c4baestevel * transport. This isn't entirely true (it's flushed out
03831d35f7499c87d51205817c93e9a8d42c4baestevel * independently) but as the connection is down, there's
03831d35f7499c87d51205817c93e9a8d42c4baestevel * no ongoing RDMA to/from that memory
03831d35f7499c87d51205817c93e9a8d42c4baestevel conn->c_unacked_packets = rdsv3_sysctl_max_unacked_packets;
03831d35f7499c87d51205817c93e9a8d42c4baestevel conn->c_unacked_bytes = rdsv3_sysctl_max_unacked_bytes;
03831d35f7499c87d51205817c93e9a8d42c4baestevel /* Mark messages as retransmissions, and move them to the send q */
03831d35f7499c87d51205817c93e9a8d42c4baestevel RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &conn->c_retrans, m_conn_item) {
03831d35f7499c87d51205817c93e9a8d42c4baestevel "RT rm %p mflg 0x%x sgl %p",
03831d35f7499c87d51205817c93e9a8d42c4baestevel list_move_tail(&conn->c_send_queue, &conn->c_retrans);
03831d35f7499c87d51205817c93e9a8d42c4baestevel RDSV3_DPRINTF4("rdsv3_send_reset", "Return(conn: %p)", conn);
03831d35f7499c87d51205817c93e9a8d42c4baestevel * We're making the conscious trade-off here to only send one message
03831d35f7499c87d51205817c93e9a8d42c4baestevel * down the connection at a time.
03831d35f7499c87d51205817c93e9a8d42c4baestevel * - tx queueing is a simple fifo list
03831d35f7499c87d51205817c93e9a8d42c4baestevel * - reassembly is optional and easily done by transports per conn
03831d35f7499c87d51205817c93e9a8d42c4baestevel * - no per flow rx lookup at all, straight to the socket
03831d35f7499c87d51205817c93e9a8d42c4baestevel * - less per-frag memory and wire overhead
03831d35f7499c87d51205817c93e9a8d42c4baestevel * - queued acks can be delayed behind large messages
03831d35f7499c87d51205817c93e9a8d42c4baestevel * - small message latency is higher behind queued large messages
03831d35f7499c87d51205817c93e9a8d42c4baestevel * - large message latency isn't starved by intervening small sends
03831d35f7499c87d51205817c93e9a8d42c4baestevel unsigned int tmp;
03831d35f7499c87d51205817c93e9a8d42c4baestevel RDSV3_DPRINTF4("rdsv3_send_xmit", "Enter(conn: %p)", conn);
03831d35f7499c87d51205817c93e9a8d42c4baestevel list_create(&to_be_dropped, sizeof (struct rdsv3_message),
03831d35f7499c87d51205817c93e9a8d42c4baestevel * sendmsg calls here after having queued its message on the send
03831d35f7499c87d51205817c93e9a8d42c4baestevel * queue. We only have one task feeding the connection at a time. If
03831d35f7499c87d51205817c93e9a8d42c4baestevel * another thread is already feeding the queue then we back off. This
03831d35f7499c87d51205817c93e9a8d42c4baestevel * avoids blocking the caller and trading per-connection data between
03831d35f7499c87d51205817c93e9a8d42c4baestevel * caches per message.
03831d35f7499c87d51205817c93e9a8d42c4baestevel * spin trying to push headers and data down the connection until
03831d35f7499c87d51205817c93e9a8d42c4baestevel * the connection doesn't make forward progress.
03831d35f7499c87d51205817c93e9a8d42c4baestevel * See if need to send a congestion map update if we're
03831d35f7499c87d51205817c93e9a8d42c4baestevel * between sending messages. The send_sem protects our sole
03831d35f7499c87d51205817c93e9a8d42c4baestevel * use of c_map_offset and _bytes.
03831d35f7499c87d51205817c93e9a8d42c4baestevel * Note this is used only by transports that define a special
03831d35f7499c87d51205817c93e9a8d42c4baestevel * xmit_cong_map function. For all others, we allocate
03831d35f7499c87d51205817c93e9a8d42c4baestevel * a cong_map message and treat it just like any other send.
03831d35f7499c87d51205817c93e9a8d42c4baestevel ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
03831d35f7499c87d51205817c93e9a8d42c4baestevel * If we're done sending the current message, clear the
03831d35f7499c87d51205817c93e9a8d42c4baestevel * offset and S/G temporaries.
03831d35f7499c87d51205817c93e9a8d42c4baestevel conn->c_xmit_hdr_off == sizeof (struct rdsv3_header) &&
03831d35f7499c87d51205817c93e9a8d42c4baestevel /* Release the reference to the previous message. */
03831d35f7499c87d51205817c93e9a8d42c4baestevel /* If we're asked to send a cong map update, do so. */
03831d35f7499c87d51205817c93e9a8d42c4baestevel if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
03831d35f7499c87d51205817c93e9a8d42c4baestevel sizeof (struct rdsv3_header) +
03831d35f7499c87d51205817c93e9a8d42c4baestevel * Grab the next message from the send queue, if there is one.
03831d35f7499c87d51205817c93e9a8d42c4baestevel * c_xmit_rm holds a ref while we're sending this message down
03831d35f7499c87d51205817c93e9a8d42c4baestevel * the connection. We can use this ref while holding the
03831d35f7499c87d51205817c93e9a8d42c4baestevel * send_sem.. rdsv3_send_reset() is serialized with it.
03831d35f7499c87d51205817c93e9a8d42c4baestevel unsigned int len;
03831d35f7499c87d51205817c93e9a8d42c4baestevel * Move the message from the send queue to
03831d35f7499c87d51205817c93e9a8d42c4baestevel * the retransmit
03831d35f7499c87d51205817c93e9a8d42c4baestevel * list right away.
03831d35f7499c87d51205817c93e9a8d42c4baestevel * Unfortunately, the way Infiniband deals with
03831d35f7499c87d51205817c93e9a8d42c4baestevel * RDMA to a bad MR key is by moving the entire
03831d35f7499c87d51205817c93e9a8d42c4baestevel * queue pair to error state. We could possibly
03831d35f7499c87d51205817c93e9a8d42c4baestevel * recover from that, but right now we drop the
03831d35f7499c87d51205817c93e9a8d42c4baestevel * connection.
03831d35f7499c87d51205817c93e9a8d42c4baestevel * Therefore, we never retransmit messages with
03831d35f7499c87d51205817c93e9a8d42c4baestevel * RDMA ops.
03831d35f7499c87d51205817c93e9a8d42c4baestevel /* Require an ACK every once in a while */
03831d35f7499c87d51205817c93e9a8d42c4baestevel * Try and send an rdma message. Let's see if we can
03831d35f7499c87d51205817c93e9a8d42c4baestevel * keep this simple and require that the transport either
03831d35f7499c87d51205817c93e9a8d42c4baestevel * send the whole rdma or none of it.
if (ret <= 0)
sizeof (struct rdsv3_header)) {
sizeof (struct rdsv3_header) -
while (ret) {
sg++;
out:
return (ret);
if (is_acked)
ret = 0;
ret = 0;
return (ret);
if (rs) {
int error;
if (error != 0) {
int status)
void *ic;
struct rdsv3_message *
goto out;
m_conn_item) {
out:
return (found);
int was_on_sock = 0;
goto unlock_and_drop;
if (rs) {
notifier);
if (was_on_sock)
if (rs) {
int wake = 0;
m_sock_item) {
if (conn)
if (conn)
if (wake)
if (*queued)
goto out;
dport, 0);
(unsigned long long)ntohll(
out:
return (*queued);
int ret = 0;
case RDS_CMSG_RDMA_ARGS:
case RDS_CMSG_RDMA_DEST:
case RDS_CMSG_RDMA_MAP:
if (ret)
return (-EINVAL);
if (ret)
return (ret);
extern unsigned long rdsv3_max_bcopy_size;
int ret = 0;
goto out;
goto out;
goto out;
goto out;
if (ret) {
goto out;
-ret);
goto out;
goto out;
if (ret) {
goto out;
&queued);
if (!queued) {
goto out;
if (nonblock) {
goto out;
if (ret == 0) {
goto out;
if (ret == 0) {
goto out;
queued);
ret = 0;
return (payload_len);
out:
if (allocated_mr)
if (rm)
return (ret);
int ret = 0;
if (!rm) {
goto out;
if (ret)
goto out;
out:
if (rm)
return (ret);