16e76cdd6e3cfaac7d91c3b0644ee1bc6cf52347agiri * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
16e76cdd6e3cfaac7d91c3b0644ee1bc6cf52347agiri * This file contains code imported from the OFED rds source file ib.c
16e76cdd6e3cfaac7d91c3b0644ee1bc6cf52347agiri * Oracle elects to have and use the contents of ib.c under and governed
16e76cdd6e3cfaac7d91c3b0644ee1bc6cf52347agiri * by the OpenIB.org BSD license (see below for full license text). However,
16e76cdd6e3cfaac7d91c3b0644ee1bc6cf52347agiri * the following notice accompanied the original version of this file:
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * Copyright (c) 2006 Oracle. All rights reserved.
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * This software is available to you under a choice of one of two
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * licenses. You may choose to be licensed under the terms of the GNU
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * General Public License (GPL) Version 2, available from the file
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * COPYING in the main directory of this source tree, or the
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * OpenIB.org BSD license below:
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * Redistribution and use in source and binary forms, with or
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * without modification, are permitted provided that the following
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * conditions are met:
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * - Redistributions of source code must retain the above
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * copyright notice, this list of conditions and the following
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * disclaimer.
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * - Redistributions in binary form must reproduce the above
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * copyright notice, this list of conditions and the following
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * disclaimer in the documentation and/or other materials
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * provided with the distribution.
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * SOFTWARE.
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Otaunsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT;
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota/* NOTE: if also grabbing ibdev lock, grab this first */
d2b539e744e90927cf7a57df3475145c279d68f9agiriextern int rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags);
d2b539e744e90927cf7a57df3475145c279d68f9agiriextern void rdsv3_ib_frag_destructor(void *buf, void *arg);
b27516f55237249607f754e6e42e865f12456675agiri RDSV3_DPRINTF2("rdsv3_ib_add_one", "device: %p", device);
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota /* Only handle IB (no iWARP) devices */
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr),
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) {
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota /* We depend on Reserved Lkey */
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota "Reserved Lkey support is required: %s",
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP);
5d5562f583b2b6affe19bdce0b3c8b1840d667a4Eiji Ota rds_ibdev->ibt_hca_hdl = ib_get_ibt_hca_hdl(device);
5d5562f583b2b6affe19bdce0b3c8b1840d667a4Eiji Ota rw_init(&rds_ibdev->rwlock, NULL, RW_DRIVER, NULL);
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL);
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE);
5d5562f583b2b6affe19bdce0b3c8b1840d667a4Eiji Ota rds_ibdev->max_initiator_depth = (uint_t)dev_attr->hca_max_rdma_in_qp;
d2b539e744e90927cf7a57df3475145c279d68f9agiri sizeof (struct rdsv3_page_frag), 0, rdsv3_ib_frag_constructor,
d2b539e744e90927cf7a57df3475145c279d68f9agiri rdsv3_ib_frag_destructor, NULL, (void *)rds_ibdev, NULL, 0);
d2b539e744e90927cf7a57df3475145c279d68f9agiri "kmem_cache_create for ib_frag_slab failed for device: %s",
5d5562f583b2b6affe19bdce0b3c8b1840d667a4Eiji Ota rds_ibdev->aft_hcagp = rdsv3_af_grp_create(rds_ibdev->ibt_hca_hdl,
5d5562f583b2b6affe19bdce0b3c8b1840d667a4Eiji Ota rds_ibdev->fmr_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_mrlist_fn,
5d5562f583b2b6affe19bdce0b3c8b1840d667a4Eiji Ota rds_ibdev->inc_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_inclist,
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr),
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection),
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev);
b27516f55237249607f754e6e42e865f12456675agiri RDSV3_DPRINTF2("rdsv3_ib_add_one", "Return: device: %p", device);
b27516f55237249607f754e6e42e865f12456675agiri RDSV3_DPRINTF2("rdsv3_ib_remove_one", "device: %p", device);
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list,
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota "%s-%d Failed to dealloc pd %p",
b27516f55237249607f754e6e42e865f12456675agiri RDSV3_DPRINTF2("rdsv3_ib_remove_one", "Return: device: %p", device);
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota "rdsv3_ib",
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Otards_ib_conn_info_visitor(struct rdsv3_connection *conn,
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota /* We will only ever look at IB transports */
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota return (0);
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota (void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid));
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota (void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid));
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid);
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid);
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota rds_ibdev = ib_get_client_data(ic->i_cm_id->device,
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota return (1);
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Otards_ib_ic_info(struct rsock *sock, unsigned int len,
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
fe817b6022080da0a98b5d2d8cd179f594d6ca5eEiji Ota sizeof (struct rds_info_rdma_connection));
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * Early RDS/IB was built to only bind to an address if there is an IPoIB
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * device with that address set.
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * If it were me, I'd advocate for something more flexible. Sending and
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * receiving should be device-agnostic. Transports would try and maintain
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * connections between peers who have messages queued. Userspace would be
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * allowed to influence which paths have priority. We could call userspace
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * asserting this policy "routing".
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * Create a CMA ID and try to bind it. This catches both
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * IB and iWARP capable NICs.
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota /* rdma_bind_addr will only succeed for IB & iWARP devices */
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * due to this, we will claim to support iWARP devices unless we
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota * check node_type.
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota "addr %u.%u.%u.%u ret %d node type %d",
fe817b6022080da0a98b5d2d8cd179f594d6ca5eEiji Ota rdsv3_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota .cm_initiate_connect = rdsv3_ib_cm_initiate_connect,
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota .cm_connect_complete = rdsv3_ib_cm_connect_complete,
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);
a530e0a9c7875fde3c123c906ca193a70dfacc4fagiri /* allocate space for ib statistics */
fe817b6022080da0a98b5d2d8cd179f594d6ca5eEiji Ota rdsv3_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
c0dd49bdd68c0d758a67d56f07826f3b45cfc664Eiji Ota return (0);