/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
* This file contains code imported from the OFED rds source file ib.c
* Oracle elects to have and use the contents of ib.c under and governed
* by the OpenIB.org BSD license (see below for full license text). However,
* the following notice accompanied the original version of this file:
*/
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <sys/sysmacros.h>
#include <sys/rds.h>
#include <sys/ib/ibtl/ibti.h>
#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/ib.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT;
struct list rdsv3_ib_devices;
/* NOTE: if also grabbing ibdev lock, grab this first */
kmutex_t ib_nodev_conns_lock;
list_t ib_nodev_conns;
extern int rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags);
extern void rdsv3_ib_frag_destructor(void *buf, void *arg);
void
rdsv3_ib_add_one(ib_device_t *device)
{
struct rdsv3_ib_device *rds_ibdev;
ibt_hca_attr_t *dev_attr;
char name[64];
RDSV3_DPRINTF2("rdsv3_ib_add_one", "device: %p", device);
/* Only handle IB (no iWARP) devices */
if (device->node_type != RDMA_NODE_IB_CA)
return;
dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr),
KM_NOSLEEP);
if (!dev_attr)
return;
if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) {
RDSV3_DPRINTF2("rdsv3_ib_add_one",
"Query device failed for %s", device->name);
goto free_attr;
}
/* We depend on Reserved Lkey */
if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) {
RDSV3_DPRINTF2("rdsv3_ib_add_one",
"Reserved Lkey support is required: %s",
device->name);
goto free_attr;
}
rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP);
if (!rds_ibdev)
goto free_attr;
rds_ibdev->ibt_hca_hdl = ib_get_ibt_hca_hdl(device);
rds_ibdev->hca_attr = *dev_attr;
rw_init(&rds_ibdev->rwlock, NULL, RW_DRIVER, NULL);
mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL);
rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz;
rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE);
rds_ibdev->max_initiator_depth = (uint_t)dev_attr->hca_max_rdma_in_qp;
rds_ibdev->max_responder_resources =
(uint_t)dev_attr->hca_max_rdma_in_qp;
rds_ibdev->dev = device;
rds_ibdev->pd = ib_alloc_pd(device);
if (IS_ERR(rds_ibdev->pd))
goto free_dev;
if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) {
goto free_dev;
}
if (rdsv3_ib_create_inc_pool(rds_ibdev) != 0) {
rdsv3_ib_destroy_mr_pool(rds_ibdev);
goto free_dev;
}
(void) snprintf(name, 64, "RDSV3_IB_FRAG_%llx",
(longlong_t)htonll(dev_attr->hca_node_guid));
rds_ibdev->ib_frag_slab = kmem_cache_create(name,
sizeof (struct rdsv3_page_frag), 0, rdsv3_ib_frag_constructor,
rdsv3_ib_frag_destructor, NULL, (void *)rds_ibdev, NULL, 0);
if (rds_ibdev->ib_frag_slab == NULL) {
RDSV3_DPRINTF2("rdsv3_ib_add_one",
"kmem_cache_create for ib_frag_slab failed for device: %s",
device->name);
rdsv3_ib_destroy_mr_pool(rds_ibdev);
rdsv3_ib_destroy_inc_pool(rds_ibdev);
goto free_dev;
}
rds_ibdev->aft_hcagp = rdsv3_af_grp_create(rds_ibdev->ibt_hca_hdl,
(uint64_t)rds_ibdev->hca_attr.hca_node_guid);
if (rds_ibdev->aft_hcagp == NULL) {
rdsv3_ib_destroy_mr_pool(rds_ibdev);
rdsv3_ib_destroy_inc_pool(rds_ibdev);
kmem_cache_destroy(rds_ibdev->ib_frag_slab);
goto free_dev;
}
rds_ibdev->fmr_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_mrlist_fn,
(void *)rds_ibdev->fmr_pool, SCQ_HCA_BIND_CPU,
rds_ibdev->aft_hcagp);
if (rds_ibdev->fmr_soft_cq == NULL) {
rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
rdsv3_ib_destroy_mr_pool(rds_ibdev);
rdsv3_ib_destroy_inc_pool(rds_ibdev);
kmem_cache_destroy(rds_ibdev->ib_frag_slab);
goto free_dev;
}
rds_ibdev->inc_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_inclist,
(void *)rds_ibdev->inc_pool, SCQ_HCA_BIND_CPU,
rds_ibdev->aft_hcagp);
if (rds_ibdev->inc_soft_cq == NULL) {
rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
rdsv3_ib_destroy_mr_pool(rds_ibdev);
rdsv3_ib_destroy_inc_pool(rds_ibdev);
kmem_cache_destroy(rds_ibdev->ib_frag_slab);
goto free_dev;
}
list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr),
offsetof(struct rdsv3_ib_ipaddr, list));
list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection),
offsetof(struct rdsv3_ib_connection, ib_node));
list_insert_tail(&rdsv3_ib_devices, rds_ibdev);
ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev);
RDSV3_DPRINTF2("rdsv3_ib_add_one", "Return: device: %p", device);
goto free_attr;
err_pd:
(void) ib_dealloc_pd(rds_ibdev->pd);
free_dev:
mutex_destroy(&rds_ibdev->spinlock);
rw_destroy(&rds_ibdev->rwlock);
kmem_free(rds_ibdev, sizeof (*rds_ibdev));
free_attr:
kmem_free(dev_attr, sizeof (*dev_attr));
}
void
rdsv3_ib_remove_one(struct ib_device *device)
{
struct rdsv3_ib_device *rds_ibdev;
struct rdsv3_ib_ipaddr *i_ipaddr, *i_next;
RDSV3_DPRINTF2("rdsv3_ib_remove_one", "device: %p", device);
rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
if (!rds_ibdev)
return;
RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list,
list) {
list_remove_node(&i_ipaddr->list);
kmem_free(i_ipaddr, sizeof (*i_ipaddr));
}
rdsv3_ib_destroy_conns(rds_ibdev);
if (rds_ibdev->fmr_soft_cq)
rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
if (rds_ibdev->inc_soft_cq)
rdsv3_af_thr_destroy(rds_ibdev->inc_soft_cq);
rdsv3_ib_destroy_mr_pool(rds_ibdev);
rdsv3_ib_destroy_inc_pool(rds_ibdev);
kmem_cache_destroy(rds_ibdev->ib_frag_slab);
rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
#if 0
while (ib_dealloc_pd(rds_ibdev->pd)) {
#ifndef __lock_lint
RDSV3_DPRINTF5("rdsv3_ib_remove_one",
"%s-%d Failed to dealloc pd %p",
__func__, __LINE__, rds_ibdev->pd);
#endif
delay(drv_usectohz(1000));
}
#else
if (ib_dealloc_pd(rds_ibdev->pd)) {
#ifndef __lock_lint
RDSV3_DPRINTF2("rdsv3_ib_remove_one",
"Failed to dealloc pd %p\n", rds_ibdev->pd);
#endif
}
#endif
list_destroy(&rds_ibdev->ipaddr_list);
list_destroy(&rds_ibdev->conn_list);
list_remove_node(&rds_ibdev->list);
mutex_destroy(&rds_ibdev->spinlock);
rw_destroy(&rds_ibdev->rwlock);
kmem_free(rds_ibdev, sizeof (*rds_ibdev));
RDSV3_DPRINTF2("rdsv3_ib_remove_one", "Return: device: %p", device);
}
#ifndef __lock_lint
struct ib_client rdsv3_ib_client = {
.name = "rdsv3_ib",
.add = rdsv3_ib_add_one,
.remove = rdsv3_ib_remove_one,
.clnt_hdl = NULL,
.state = IB_CLNT_UNINITIALIZED
};
#else
struct ib_client rdsv3_ib_client = {
"rdsv3_ib",
rdsv3_ib_add_one,
rdsv3_ib_remove_one,
NULL,
NULL,
IB_CLNT_UNINITIALIZED
};
#endif
static int
rds_ib_conn_info_visitor(struct rdsv3_connection *conn,
void *buffer)
{
struct rds_info_rdma_connection *iinfo = buffer;
struct rdsv3_ib_connection *ic;
RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
conn, buffer);
/* We will only ever look at IB transports */
if (conn->c_trans != &rdsv3_ib_transport)
return (0);
iinfo->src_addr = conn->c_laddr;
iinfo->dst_addr = conn->c_faddr;
(void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid));
(void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid));
if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
struct rdsv3_ib_device *rds_ibdev;
struct rdma_dev_addr *dev_addr;
ic = conn->c_transport_data;
dev_addr = &ic->i_cm_id->route.addr.dev_addr;
ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid);
ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid);
rds_ibdev = ib_get_client_data(ic->i_cm_id->device,
&rdsv3_ib_client);
iinfo->max_send_wr = ic->i_send_ring.w_nr;
iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
iinfo->max_send_sge = rds_ibdev->max_sge;
}
RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
conn, buffer);
return (1);
}
static void
rds_ib_ic_info(struct rsock *sock, unsigned int len,
struct rdsv3_info_iterator *iter,
struct rdsv3_info_lengths *lens)
{
RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
sock, iter, lens, len);
rdsv3_for_each_conn_info(sock, len, iter, lens,
rds_ib_conn_info_visitor,
sizeof (struct rds_info_rdma_connection));
}
/*
* Early RDS/IB was built to only bind to an address if there is an IPoIB
* device with that address set.
*
* If it were me, I'd advocate for something more flexible. Sending and
* receiving should be device-agnostic. Transports would try and maintain
* connections between peers who have messages queued. Userspace would be
* allowed to influence which paths have priority. We could call userspace
* asserting this policy "routing".
*/
static int
rds_ib_laddr_check(uint32_be_t addr)
{
int ret;
struct rdma_cm_id *cm_id;
struct sockaddr_in sin;
RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));
/*
* Create a CMA ID and try to bind it. This catches both
* IB and iWARP capable NICs.
*/
cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
if (!cm_id)
return (-EADDRNOTAVAIL);
(void) memset(&sin, 0, sizeof (sin));
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);
/* rdma_bind_addr will only succeed for IB & iWARP devices */
ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
/*
* due to this, we will claim to support iWARP devices unless we
* check node_type.
*/
if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
ret = -EADDRNOTAVAIL;
RDSV3_DPRINTF5("rds_ib_laddr_check",
"addr %u.%u.%u.%u ret %d node type %d",
NIPQUAD(addr), ret,
cm_id->device ? cm_id->device->node_type : -1);
rdma_destroy_id(cm_id);
return (ret);
}
void
rdsv3_ib_exit(void)
{
RDSV3_DPRINTF4("rds_ib_exit", "Enter");
rdsv3_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
rdsv3_ib_destroy_nodev_conns();
ib_unregister_client(&rdsv3_ib_client);
rdsv3_ib_sysctl_exit();
rdsv3_ib_recv_exit();
rdsv3_trans_unregister(&rdsv3_ib_transport);
kmem_free(rdsv3_ib_stats,
nr_cpus * sizeof (struct rdsv3_ib_statistics));
mutex_destroy(&ib_nodev_conns_lock);
list_destroy(&ib_nodev_conns);
list_destroy(&rdsv3_ib_devices);
RDSV3_DPRINTF4("rds_ib_exit", "Return");
}
#ifndef __lock_lint
struct rdsv3_transport rdsv3_ib_transport = {
.laddr_check = rds_ib_laddr_check,
.xmit_complete = rdsv3_ib_xmit_complete,
.xmit = rdsv3_ib_xmit,
.xmit_cong_map = NULL,
.xmit_rdma = rdsv3_ib_xmit_rdma,
.recv = rdsv3_ib_recv,
.conn_alloc = rdsv3_ib_conn_alloc,
.conn_free = rdsv3_ib_conn_free,
.conn_connect = rdsv3_ib_conn_connect,
.conn_shutdown = rdsv3_ib_conn_shutdown,
.inc_copy_to_user = rdsv3_ib_inc_copy_to_user,
.inc_free = rdsv3_ib_inc_free,
.cm_initiate_connect = rdsv3_ib_cm_initiate_connect,
.cm_handle_connect = rdsv3_ib_cm_handle_connect,
.cm_connect_complete = rdsv3_ib_cm_connect_complete,
.stats_info_copy = rdsv3_ib_stats_info_copy,
.exit = rdsv3_ib_exit,
.get_mr = rdsv3_ib_get_mr,
.sync_mr = rdsv3_ib_sync_mr,
.free_mr = rdsv3_ib_free_mr,
.flush_mrs = rdsv3_ib_flush_mrs,
.t_name = "infiniband",
.t_type = RDS_TRANS_IB
};
#else
struct rdsv3_transport rdsv3_ib_transport;
#endif
int
rdsv3_ib_init(void)
{
int ret;
RDSV3_DPRINTF4("rds_ib_init", "Enter");
list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
offsetof(struct rdsv3_ib_device, list));
list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
offsetof(struct rdsv3_ib_connection, ib_node));
mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);
/* allocate space for ib statistics */
ASSERT(rdsv3_ib_stats == NULL);
rdsv3_ib_stats = kmem_zalloc(nr_cpus *
sizeof (struct rdsv3_ib_statistics), KM_SLEEP);
rdsv3_ib_client.dip = rdsv3_dev_info;
ret = ib_register_client(&rdsv3_ib_client);
if (ret)
goto out;
ret = rdsv3_ib_sysctl_init();
if (ret)
goto out_ibreg;
ret = rdsv3_ib_recv_init();
if (ret)
goto out_sysctl;
ret = rdsv3_trans_register(&rdsv3_ib_transport);
if (ret)
goto out_recv;
rdsv3_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
RDSV3_DPRINTF4("rds_ib_init", "Return");
return (0);
out_recv:
rdsv3_ib_recv_exit();
out_sysctl:
rdsv3_ib_sysctl_exit();
out_ibreg:
ib_unregister_client(&rdsv3_ib_client);
out:
kmem_free(rdsv3_ib_stats,
nr_cpus * sizeof (struct rdsv3_ib_statistics));
mutex_destroy(&ib_nodev_conns_lock);
list_destroy(&ib_nodev_conns);
list_destroy(&rdsv3_ib_devices);
return (ret);
}