about summary refs log tree commit diff stats
path: root/gpxe/src/net/infiniband.c
diff options
context:
space:
mode:
Diffstat (limited to 'gpxe/src/net/infiniband.c')
-rw-r--r-- gpxe/src/net/infiniband.c 405
1 files changed, 378 insertions, 27 deletions
diff --git a/gpxe/src/net/infiniband.c b/gpxe/src/net/infiniband.c
index d79bdc2c..d7813249 100644
--- a/gpxe/src/net/infiniband.c
+++ b/gpxe/src/net/infiniband.c
@@ -16,6 +16,8 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
+FILE_LICENCE ( GPL2_OR_LATER );
+
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
@@ -25,12 +27,15 @@
#include <errno.h>
#include <assert.h>
#include <gpxe/list.h>
+#include <gpxe/errortab.h>
#include <gpxe/if_arp.h>
#include <gpxe/netdevice.h>
#include <gpxe/iobuf.h>
#include <gpxe/ipoib.h>
#include <gpxe/process.h>
#include <gpxe/infiniband.h>
+#include <gpxe/ib_mi.h>
+#include <gpxe/ib_sma.h>
/** @file
*
@@ -41,6 +46,26 @@
/** List of Infiniband devices */
struct list_head ib_devices = LIST_HEAD_INIT ( ib_devices );
+/** List of open Infiniband devices, in reverse order of opening */
+static struct list_head open_ib_devices = LIST_HEAD_INIT ( open_ib_devices );
+
+/* Disambiguate the various possible EINPROGRESSes */
+#define EINPROGRESS_INIT ( EINPROGRESS | EUNIQ_01 )
+#define EINPROGRESS_ARMED ( EINPROGRESS | EUNIQ_02 )
+
+/** Human-readable messages for the link statuses */
+struct errortab infiniband_errors[] __errortab = {
+ { EINPROGRESS_INIT, "Initialising" },
+ { EINPROGRESS_ARMED, "Armed" },
+};
+
+/***************************************************************************
+ *
+ * Completion queues
+ *
+ ***************************************************************************
+ */
+
/**
* Create completion queue
*
@@ -61,6 +86,8 @@ ib_create_cq ( struct ib_device *ibdev, unsigned int num_cqes,
cq = zalloc ( sizeof ( *cq ) );
if ( ! cq )
goto err_alloc_cq;
+ cq->ibdev = ibdev;
+ list_add ( &cq->list, &ibdev->cqs );
cq->num_cqes = num_cqes;
INIT_LIST_HEAD ( &cq->work_queues );
cq->op = op;
@@ -79,6 +106,7 @@ ib_create_cq ( struct ib_device *ibdev, unsigned int num_cqes,
ibdev->op->destroy_cq ( ibdev, cq );
err_dev_create_cq:
+ list_del ( &cq->list );
free ( cq );
err_alloc_cq:
return NULL;
@@ -96,26 +124,57 @@ void ib_destroy_cq ( struct ib_device *ibdev,
ibdev, cq->cqn );
assert ( list_empty ( &cq->work_queues ) );
ibdev->op->destroy_cq ( ibdev, cq );
+ list_del ( &cq->list );
free ( cq );
}
/**
+ * Poll completion queue and refill its receive work queues
+ *
+ * @v ibdev Infiniband device
+ * @v cq Completion queue
+ *
+ * Invokes the device's poll_cq() method for the completion queue, then
+ * calls ib_refill_recv() on the owning queue pair of every work queue
+ * attached to the completion queue that is not a send queue.
+ */
+void ib_poll_cq ( struct ib_device *ibdev,
+ struct ib_completion_queue *cq ) {
+ struct ib_work_queue *wq;
+
+ /* Poll completion queue via the device's poll_cq() method */
+ ibdev->op->poll_cq ( ibdev, cq );
+
+ /* Refill receive work queues; send work queues are skipped */
+ list_for_each_entry ( wq, &cq->work_queues, list ) {
+ if ( ! wq->is_send )
+ ib_refill_recv ( ibdev, wq->qp );
+ }
+}
+
+/***************************************************************************
+ *
+ * Work queues
+ *
+ ***************************************************************************
+ */
+
+/**
* Create queue pair
*
* @v ibdev Infiniband device
+ * @v type Queue pair type
* @v num_send_wqes Number of send work queue entries
* @v send_cq Send completion queue
* @v num_recv_wqes Number of receive work queue entries
* @v recv_cq Receive completion queue
- * @v qkey Queue key
* @ret qp Queue pair
+ *
+ * The queue pair will be left in the INIT state; you must call
+ * ib_modify_qp() before it is ready to use for sending and receiving.
*/
struct ib_queue_pair * ib_create_qp ( struct ib_device *ibdev,
+ enum ib_queue_pair_type type,
unsigned int num_send_wqes,
struct ib_completion_queue *send_cq,
unsigned int num_recv_wqes,
- struct ib_completion_queue *recv_cq,
- unsigned long qkey ) {
+ struct ib_completion_queue *recv_cq ) {
struct ib_queue_pair *qp;
size_t total_size;
int rc;
@@ -131,16 +190,18 @@ struct ib_queue_pair * ib_create_qp ( struct ib_device *ibdev,
goto err_alloc_qp;
qp->ibdev = ibdev;
list_add ( &qp->list, &ibdev->qps );
- qp->qkey = qkey;
+ qp->type = type;
qp->send.qp = qp;
qp->send.is_send = 1;
qp->send.cq = send_cq;
list_add ( &qp->send.list, &send_cq->work_queues );
+ qp->send.psn = ( random() & 0xffffffUL );
qp->send.num_wqes = num_send_wqes;
qp->send.iobufs = ( ( ( void * ) qp ) + sizeof ( *qp ) );
qp->recv.qp = qp;
qp->recv.cq = recv_cq;
list_add ( &qp->recv.list, &recv_cq->work_queues );
+ qp->recv.psn = ( random() & 0xffffffUL );
qp->recv.num_wqes = num_recv_wqes;
qp->recv.iobufs = ( ( ( void * ) qp ) + sizeof ( *qp ) +
( num_send_wqes * sizeof ( qp->send.iobufs[0] ) ));
@@ -152,7 +213,6 @@ struct ib_queue_pair * ib_create_qp ( struct ib_device *ibdev,
"%s\n", ibdev, strerror ( rc ) );
goto err_dev_create_qp;
}
-
DBGC ( ibdev, "IBDEV %p created queue pair %p (%p) with QPN %#lx\n",
ibdev, qp, ib_qp_get_drvdata ( qp ), qp->qpn );
DBGC ( ibdev, "IBDEV %p QPN %#lx has %d send entries at [%p,%p)\n",
@@ -161,6 +221,24 @@ struct ib_queue_pair * ib_create_qp ( struct ib_device *ibdev,
DBGC ( ibdev, "IBDEV %p QPN %#lx has %d receive entries at [%p,%p)\n",
ibdev, qp->qpn, num_recv_wqes, qp->recv.iobufs,
( ( ( void * ) qp ) + total_size ) );
+
+ /* Calculate externally-visible QPN */
+ switch ( type ) {
+ case IB_QPT_SMI:
+ qp->ext_qpn = IB_QPN_SMI;
+ break;
+ case IB_QPT_GSI:
+ qp->ext_qpn = IB_QPN_GSI;
+ break;
+ default:
+ qp->ext_qpn = qp->qpn;
+ break;
+ }
+ if ( qp->ext_qpn != qp->qpn ) {
+ DBGC ( ibdev, "IBDEV %p QPN %#lx has external QPN %#lx\n",
+ ibdev, qp->qpn, qp->ext_qpn );
+ }
+
return qp;
ibdev->op->destroy_qp ( ibdev, qp );
@@ -178,20 +256,15 @@ struct ib_queue_pair * ib_create_qp ( struct ib_device *ibdev,
*
* @v ibdev Infiniband device
* @v qp Queue pair
- * @v mod_list Modification list
- * @v qkey New queue key, if applicable
+ * (There is no @c av argument; only @c ibdev and @c qp are passed on to the driver.)
* @ret rc Return status code
*/
-int ib_modify_qp ( struct ib_device *ibdev, struct ib_queue_pair *qp,
- unsigned long mod_list, unsigned long qkey ) {
+int ib_modify_qp ( struct ib_device *ibdev, struct ib_queue_pair *qp ) {
int rc;
DBGC ( ibdev, "IBDEV %p modifying QPN %#lx\n", ibdev, qp->qpn );
- if ( mod_list & IB_MODIFY_QKEY )
- qp->qkey = qkey;
-
- if ( ( rc = ibdev->op->modify_qp ( ibdev, qp, mod_list ) ) != 0 ) {
+ if ( ( rc = ibdev->op->modify_qp ( ibdev, qp ) ) != 0 ) {
DBGC ( ibdev, "IBDEV %p could not modify QPN %#lx: %s\n",
ibdev, qp->qpn, strerror ( rc ) );
return rc;
@@ -251,7 +324,7 @@ struct ib_queue_pair * ib_find_qp_qpn ( struct ib_device *ibdev,
struct ib_queue_pair *qp;
list_for_each_entry ( qp, &ibdev->qps, list ) {
- if ( qp->qpn == qpn )
+ if ( ( qpn == qp->qpn ) || ( qpn == qp->ext_qpn ) )
return qp;
}
return NULL;
@@ -311,6 +384,7 @@ struct ib_work_queue * ib_find_wq ( struct ib_completion_queue *cq,
int ib_post_send ( struct ib_device *ibdev, struct ib_queue_pair *qp,
struct ib_address_vector *av,
struct io_buffer *iobuf ) {
+ struct ib_address_vector av_copy;
int rc;
/* Check queue fill level */
@@ -320,6 +394,20 @@ int ib_post_send ( struct ib_device *ibdev, struct ib_queue_pair *qp,
return -ENOBUFS;
}
+ /* Use default address vector if none specified */
+ if ( ! av )
+ av = &qp->av;
+
+ /* Make modifiable copy of address vector */
+ memcpy ( &av_copy, av, sizeof ( av_copy ) );
+ av = &av_copy;
+
+ /* Fill in optional parameters in address vector */
+ if ( ! av->qkey )
+ av->qkey = qp->qkey;
+ if ( ! av->rate )
+ av->rate = IB_RATE_2_5;
+
/* Post to hardware */
if ( ( rc = ibdev->op->post_send ( ibdev, qp, av, iobuf ) ) != 0 ) {
DBGC ( ibdev, "IBDEV %p QPN %#lx could not post send WQE: "
@@ -343,6 +431,13 @@ int ib_post_recv ( struct ib_device *ibdev, struct ib_queue_pair *qp,
struct io_buffer *iobuf ) {
int rc;
+ /* Check packet length */
+ if ( iob_tailroom ( iobuf ) < IB_MAX_PAYLOAD_SIZE ) {
+ DBGC ( ibdev, "IBDEV %p QPN %#lx wrong RX buffer size (%zd)\n",
+ ibdev, qp->qpn, iob_tailroom ( iobuf ) );
+ return -EINVAL;
+ }
+
/* Check queue fill level */
if ( qp->recv.fill >= qp->recv.num_wqes ) {
DBGC ( ibdev, "IBDEV %p QPN %#lx receive queue full\n",
@@ -371,7 +466,12 @@ int ib_post_recv ( struct ib_device *ibdev, struct ib_queue_pair *qp,
*/
void ib_complete_send ( struct ib_device *ibdev, struct ib_queue_pair *qp,
struct io_buffer *iobuf, int rc ) {
- qp->send.cq->op->complete_send ( ibdev, qp, iobuf, rc );
+
+ if ( qp->send.cq->op->complete_send ) {
+ qp->send.cq->op->complete_send ( ibdev, qp, iobuf, rc );
+ } else {
+ free_iob ( iobuf );
+ }
qp->send.fill--;
}
@@ -387,11 +487,54 @@ void ib_complete_send ( struct ib_device *ibdev, struct ib_queue_pair *qp,
void ib_complete_recv ( struct ib_device *ibdev, struct ib_queue_pair *qp,
struct ib_address_vector *av,
struct io_buffer *iobuf, int rc ) {
- qp->recv.cq->op->complete_recv ( ibdev, qp, av, iobuf, rc );
+
+ if ( qp->recv.cq->op->complete_recv ) {
+ qp->recv.cq->op->complete_recv ( ibdev, qp, av, iobuf, rc );
+ } else {
+ free_iob ( iobuf );
+ }
qp->recv.fill--;
}
/**
+ * Refill receive work queue
+ *
+ * @v ibdev Infiniband device
+ * @v qp Queue pair
+ *
+ * Allocates I/O buffers of IB_MAX_PAYLOAD_SIZE bytes and posts them
+ * with ib_post_recv() for as long as the receive work queue's fill
+ * level is below its number of entries.  Returns without logging on
+ * the first allocation failure; on the first posting failure the
+ * buffer is freed and the function returns.
+ */
+void ib_refill_recv ( struct ib_device *ibdev, struct ib_queue_pair *qp ) {
+ struct io_buffer *iobuf;
+ int rc;
+
+ /* Keep filling while unfilled entries remain.
+ * NOTE(review): termination presumably relies on ib_post_recv()
+ * advancing qp->recv.fill on success - confirm against its body.
+ */
+ while ( qp->recv.fill < qp->recv.num_wqes ) {
+
+ /* Allocate I/O buffer */
+ iobuf = alloc_iob ( IB_MAX_PAYLOAD_SIZE );
+ if ( ! iobuf ) {
+ /* Non-fatal; we will refill on next attempt */
+ return;
+ }
+
+ /* Post I/O buffer; on failure the buffer is still ours to free */
+ if ( ( rc = ib_post_recv ( ibdev, qp, iobuf ) ) != 0 ) {
+ DBGC ( ibdev, "IBDEV %p could not refill: %s\n",
+ ibdev, strerror ( rc ) );
+ free_iob ( iobuf );
+ /* Give up */
+ return;
+ }
+ }
+}
+
+/***************************************************************************
+ *
+ * Link control
+ *
+ ***************************************************************************
+ */
+
+/**
* Open port
*
* @v ibdev Infiniband device
@@ -400,16 +543,59 @@ void ib_complete_recv ( struct ib_device *ibdev, struct ib_queue_pair *qp,
int ib_open ( struct ib_device *ibdev ) {
int rc;
- /* Open device if this is the first requested opening */
- if ( ibdev->open_count == 0 ) {
- if ( ( rc = ibdev->op->open ( ibdev ) ) != 0 )
- return rc;
+ /* Increment device open request counter */
+ if ( ibdev->open_count++ > 0 ) {
+ /* Device was already open; do nothing */
+ return 0;
}
- /* Increment device open request counter */
- ibdev->open_count++;
+ /* Create subnet management interface */
+ ibdev->smi = ib_create_mi ( ibdev, IB_QPT_SMI );
+ if ( ! ibdev->smi ) {
+ DBGC ( ibdev, "IBDEV %p could not create SMI\n", ibdev );
+ rc = -ENOMEM;
+ goto err_create_smi;
+ }
+ /* Create subnet management agent */
+ if ( ( rc = ib_create_sma ( ibdev, ibdev->smi ) ) != 0 ) {
+ DBGC ( ibdev, "IBDEV %p could not create SMA: %s\n",
+ ibdev, strerror ( rc ) );
+ goto err_create_sma;
+ }
+
+ /* Create general services interface */
+ ibdev->gsi = ib_create_mi ( ibdev, IB_QPT_GSI );
+ if ( ! ibdev->gsi ) {
+ DBGC ( ibdev, "IBDEV %p could not create GSI\n", ibdev );
+ rc = -ENOMEM;
+ goto err_create_gsi;
+ }
+
+ /* Open device */
+ if ( ( rc = ibdev->op->open ( ibdev ) ) != 0 ) {
+ DBGC ( ibdev, "IBDEV %p could not open: %s\n",
+ ibdev, strerror ( rc ) );
+ goto err_open;
+ }
+
+ /* Add to head of open devices list */
+ list_add ( &ibdev->open_list, &open_ib_devices );
+
+ assert ( ibdev->open_count == 1 );
return 0;
+
+ ibdev->op->close ( ibdev );
+ err_open:
+ ib_destroy_mi ( ibdev, ibdev->gsi );
+ err_create_gsi:
+ ib_destroy_sma ( ibdev, ibdev->smi );
+ err_create_sma:
+ ib_destroy_mi ( ibdev, ibdev->smi );
+ err_create_smi:
+ assert ( ibdev->open_count == 1 );
+ ibdev->open_count = 0;
+ return rc;
}
/**
@@ -423,10 +609,38 @@ void ib_close ( struct ib_device *ibdev ) {
ibdev->open_count--;
/* Close device if this was the last remaining requested opening */
- if ( ibdev->open_count == 0 )
+ if ( ibdev->open_count == 0 ) {
+ list_del ( &ibdev->open_list );
+ ib_destroy_mi ( ibdev, ibdev->gsi );
+ ib_destroy_sma ( ibdev, ibdev->smi );
+ ib_destroy_mi ( ibdev, ibdev->smi );
ibdev->op->close ( ibdev );
+ }
+}
+
+/**
+ * Get link state
+ *
+ * @v ibdev Infiniband device
+ * @ret rc Link status code
+ *
+ * Translates the device's @c port_state into a status code:
+ * IB_PORT_STATE_DOWN gives -ENOTCONN, IB_PORT_STATE_INIT gives
+ * -EINPROGRESS_INIT, IB_PORT_STATE_ARMED gives -EINPROGRESS_ARMED,
+ * IB_PORT_STATE_ACTIVE gives 0, and any other value gives -EINVAL.
+ */
+int ib_link_rc ( struct ib_device *ibdev ) {
+ switch ( ibdev->port_state ) {
+ case IB_PORT_STATE_DOWN: return -ENOTCONN;
+ case IB_PORT_STATE_INIT: return -EINPROGRESS_INIT;
+ case IB_PORT_STATE_ARMED: return -EINPROGRESS_ARMED;
+ case IB_PORT_STATE_ACTIVE: return 0;
+ default: return -EINVAL;
+ }
}
+/***************************************************************************
+ *
+ * Multicast
+ *
+ ***************************************************************************
+ */
+
/**
* Attach to multicast group
*
@@ -434,6 +648,10 @@ void ib_close ( struct ib_device *ibdev ) {
* @v qp Queue pair
* @v gid Multicast GID
* @ret rc Return status code
+ *
+ * Note that this function handles only the local device's attachment
+ * to the multicast GID; it does not issue the relevant MADs to join
+ * the multicast group on the subnet.
*/
int ib_mcast_attach ( struct ib_device *ibdev, struct ib_queue_pair *qp,
struct ib_gid *gid ) {
@@ -486,6 +704,89 @@ void ib_mcast_detach ( struct ib_device *ibdev, struct ib_queue_pair *qp,
}
}
+/***************************************************************************
+ *
+ * Miscellaneous
+ *
+ ***************************************************************************
+ */
+
+/**
+ * Get Infiniband HCA information
+ *
+ * @v ibdev Infiniband device
+ * @ret hca_guid HCA GUID
+ * @ret num_ports Number of ports
+ *
+ * Counts the devices visited by for_each_ibdev() whose @c dev matches
+ * that of @c ibdev.  The GUID is copied from @c gid.u.half[1] of the
+ * first such device; if no device matches, zero is returned and
+ * @c hca_guid is left unmodified.
+ */
+int ib_get_hca_info ( struct ib_device *ibdev,
+ struct ib_gid_half *hca_guid ) {
+ struct ib_device *tmp;
+ int num_ports = 0;
+
+ /* Search for IB devices with the same physical device to
+ * identify port count and a suitable Node GUID.
+ */
+ for_each_ibdev ( tmp ) {
+ if ( tmp->dev != ibdev->dev )
+ continue;
+ if ( num_ports == 0 ) {
+ memcpy ( hca_guid, &tmp->gid.u.half[1],
+ sizeof ( *hca_guid ) );
+ }
+ num_ports++;
+ }
+ return num_ports;
+}
+
+/**
+ * Set port information
+ *
+ * @v ibdev Infiniband device
+ * @v mad Set port information MAD
+ * @ret rc Return status code
+ *
+ * Returns -ENOTSUP if the device provides no set_port_info() method;
+ * otherwise returns the status code of the device's method.
+ */
+int ib_set_port_info ( struct ib_device *ibdev, union ib_mad *mad ) {
+ int rc;
+
+ /* Adapters with embedded SMAs do not need to support this method */
+ if ( ! ibdev->op->set_port_info ) {
+ DBGC ( ibdev, "IBDEV %p does not support setting port "
+ "information\n", ibdev );
+ return -ENOTSUP;
+ }
+
+ /* Hand the MAD to the device and report any failure */
+ if ( ( rc = ibdev->op->set_port_info ( ibdev, mad ) ) != 0 ) {
+ DBGC ( ibdev, "IBDEV %p could not set port information: %s\n",
+ ibdev, strerror ( rc ) );
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * Set partition key table
+ *
+ * @v ibdev Infiniband device
+ * @v mad Set partition key table MAD
+ * @ret rc Return status code
+ *
+ * Returns -ENOTSUP if the device provides no set_pkey_table() method;
+ * otherwise returns the status code of the device's method.
+ */
+int ib_set_pkey_table ( struct ib_device *ibdev, union ib_mad *mad ) {
+ int rc;
+
+ /* Adapters with embedded SMAs do not need to support this method */
+ if ( ! ibdev->op->set_pkey_table ) {
+ DBGC ( ibdev, "IBDEV %p does not support setting partition "
+ "key table\n", ibdev );
+ return -ENOTSUP;
+ }
+
+ /* Hand the MAD to the device and report any failure */
+ if ( ( rc = ibdev->op->set_pkey_table ( ibdev, mad ) ) != 0 ) {
+ DBGC ( ibdev, "IBDEV %p could not set partition key table: "
+ "%s\n", ibdev, strerror ( rc ) );
+ return rc;
+ }
+
+ return 0;
+}
/***************************************************************************
*
@@ -506,6 +807,22 @@ void ib_link_state_changed ( struct ib_device *ibdev ) {
}
/**
+ * Poll event queue
+ *
+ * @v ibdev Infiniband device
+ *
+ * Invokes the device's poll_eq() method, then calls ib_poll_cq() on
+ * every completion queue in the device's @c cqs list.
+ */
+void ib_poll_eq ( struct ib_device *ibdev ) {
+ struct ib_completion_queue *cq;
+
+ /* Poll device's event queue */
+ ibdev->op->poll_eq ( ibdev );
+
+ /* Poll all completion queues on this device's cqs list */
+ list_for_each_entry ( cq, &ibdev->cqs, list )
+ ib_poll_cq ( ibdev, cq );
+}
+
+/**
* Single-step the Infiniband event queue
*
* @v process Infiniband event queue process
@@ -513,13 +830,13 @@ void ib_link_state_changed ( struct ib_device *ibdev ) {
static void ib_step ( struct process *process __unused ) {
struct ib_device *ibdev;
- list_for_each_entry ( ibdev, &ib_devices, list ) {
- ibdev->op->poll_eq ( ibdev );
- }
+ for_each_ibdev ( ibdev )
+ ib_poll_eq ( ibdev );
}
/** Infiniband event queue process */
struct process ib_process __permanent_process = {
+ .list = LIST_HEAD_INIT ( ib_process.list ),
.step = ib_step,
};
@@ -546,9 +863,11 @@ struct ib_device * alloc_ibdev ( size_t priv_size ) {
if ( ibdev ) {
drv_priv = ( ( ( void * ) ibdev ) + sizeof ( *ibdev ) );
ib_set_drvdata ( ibdev, drv_priv );
+ INIT_LIST_HEAD ( &ibdev->cqs );
INIT_LIST_HEAD ( &ibdev->qps );
+ ibdev->port_state = IB_PORT_STATE_DOWN;
ibdev->lid = IB_LID_NONE;
- ibdev->pkey = IB_PKEY_NONE;
+ ibdev->pkey = IB_PKEY_DEFAULT;
}
return ibdev;
}
@@ -598,3 +917,35 @@ void unregister_ibdev ( struct ib_device *ibdev ) {
ibdev_put ( ibdev );
DBGC ( ibdev, "IBDEV %p unregistered\n", ibdev );
}
+
+/**
+ * Find Infiniband device by GID
+ *
+ * @v gid GID
+ * @ret ibdev Infiniband device, or NULL
+ *
+ * Returns the first device visited by for_each_ibdev() whose @c gid
+ * is byte-for-byte equal to the supplied GID, or NULL if none is.
+ */
+struct ib_device * find_ibdev ( struct ib_gid *gid ) {
+ struct ib_device *ibdev;
+
+ for_each_ibdev ( ibdev ) {
+ if ( memcmp ( gid, &ibdev->gid, sizeof ( *gid ) ) == 0 )
+ return ibdev;
+ }
+ return NULL;
+}
+
+/**
+ * Get most recently opened Infiniband device
+ *
+ * @ret ibdev Most recently opened Infiniband device, or NULL
+ *
+ * Returns the first entry of the open devices list, or NULL if the
+ * list is empty.  The loop body returns on its first iteration, so
+ * at most one entry is ever examined.
+ */
+struct ib_device * last_opened_ibdev ( void ) {
+ struct ib_device *ibdev;
+
+ list_for_each_entry ( ibdev, &open_ib_devices, open_list ) {
+ assert ( ibdev->open_count != 0 );
+ return ibdev;
+ }
+
+ return NULL;
+}