From 1d8c85d1120ab52a397cdc747d4854cc13cf8293 Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Mon, 6 Jul 2009 23:09:26 +0100 Subject: [PATCH] [infiniband] Create a general management agent Generalise the subnet management agent into a general management agent capable of sending and responding to MADs, including support for retransmissions as necessary. --- src/include/gpxe/errfile.h | 1 + src/include/gpxe/ib_gma.h | 71 ++++++ src/include/gpxe/ib_mad.h | 2 + src/include/gpxe/infiniband.h | 17 ++ src/net/infiniband.c | 36 ++- src/net/infiniband/ib_gma.c | 414 ++++++++++++++++++++++++++++++++++ 6 files changed, 534 insertions(+), 7 deletions(-) create mode 100644 src/include/gpxe/ib_gma.h create mode 100644 src/net/infiniband/ib_gma.c diff --git a/src/include/gpxe/errfile.h b/src/include/gpxe/errfile.h index 4b560009..e8132b47 100644 --- a/src/include/gpxe/errfile.h +++ b/src/include/gpxe/errfile.h @@ -144,6 +144,7 @@ FILE_LICENCE ( GPL2_OR_LATER ); #define ERRFILE_ib_packet ( ERRFILE_NET | 0x00180000 ) #define ERRFILE_icmp ( ERRFILE_NET | 0x00190000 ) #define ERRFILE_ib_qset ( ERRFILE_NET | 0x001a0000 ) +#define ERRFILE_ib_gma ( ERRFILE_NET | 0x001b0000 ) #define ERRFILE_image ( ERRFILE_IMAGE | 0x00000000 ) #define ERRFILE_elf ( ERRFILE_IMAGE | 0x00010000 ) diff --git a/src/include/gpxe/ib_gma.h b/src/include/gpxe/ib_gma.h new file mode 100644 index 00000000..c305a3a1 --- /dev/null +++ b/src/include/gpxe/ib_gma.h @@ -0,0 +1,71 @@ +#ifndef _GPXE_IB_GMA_H +#define _GPXE_IB_GMA_H + +/** @file + * + * Infiniband General Management Agent + * + */ + +FILE_LICENCE ( GPL2_OR_LATER ); + +#include +#include +#include + +struct ib_device; +struct ib_completion_queue; +struct ib_queue_pair; +union ib_mad; + +/** A MAD attribute handler */ +struct ib_mad_handler { + /** Management class */ + uint8_t mgmt_class; + /** Class version */ + uint8_t class_version; + /** Method */ + uint8_t method; + /** Response method, or zero */ + uint8_t resp_method; + /** Attribute (in network byte order) */ + uint16_t attr_id; + /** Handle attribute + * + * @v ibdev Infiniband device + * @v mad MAD + * @ret rc Return status code + * + * The handler should modify the MAD as applicable. If the + * handler returns with a non-zero value in the MAD's @c + * method field, it will be sent as a response. + */ + int ( * handle ) ( struct ib_device *ibdev, union ib_mad *mad ); +}; + +/** MAD attribute handlers */ +#define IB_MAD_HANDLERS __table ( struct ib_mad_handler, "ib_mad_handlers" ) + +/** Declare a MAD attribute handler */ +#define __ib_mad_handler __table_entry ( IB_MAD_HANDLERS, 01 ) + +/** An Infiniband General Management Agent */ +struct ib_gma { + /** Infiniband device */ + struct ib_device *ibdev; + /** Completion queue */ + struct ib_completion_queue *cq; + /** Queue pair */ + struct ib_queue_pair *qp; + + /** List of outstanding MAD requests */ + struct list_head requests; +}; + +extern int ib_gma_request ( struct ib_gma *gma, union ib_mad *mad, + struct ib_address_vector *av ); +extern int ib_create_gma ( struct ib_gma *gma, struct ib_device *ibdev, + unsigned long qkey ); +extern void ib_destroy_gma ( struct ib_gma *gma ); + +#endif /* _GPXE_IB_GMA_H */ diff --git a/src/include/gpxe/ib_mad.h b/src/include/gpxe/ib_mad.h index 7d497999..eaea12b8 100644 --- a/src/include/gpxe/ib_mad.h +++ b/src/include/gpxe/ib_mad.h @@ -201,6 +201,8 @@ struct ib_smp_class_specific { ***************************************************************************** */ +#define IB_SA_CLASS_VERSION 2 + struct ib_rmpp_hdr { uint32_t raw[3]; } __attribute__ (( packed )); diff --git a/src/include/gpxe/infiniband.h b/src/include/gpxe/infiniband.h index be22b7fe..3ffd5e66 100644 --- a/src/include/gpxe/infiniband.h +++ b/src/include/gpxe/infiniband.h @@ -14,6 +14,7 @@ FILE_LICENCE ( GPL2_OR_LATER ); #include #include #include +#include /** Subnet management QPN */ #define IB_QPN_SMA 0 @@ -133,6 +134,19 @@ struct ib_address_vector { struct ib_gid gid; }; +/** Infiniband transmission rates */ +enum ib_rate { + IB_RATE_2_5 = 2, + IB_RATE_10 = 3, + IB_RATE_30 = 4, + IB_RATE_5 = 5, + IB_RATE_20 = 6, + IB_RATE_40 = 7, + IB_RATE_60 = 8, + IB_RATE_80 = 9, + IB_RATE_120 = 10, +}; + /** Infiniband completion queue operations */ struct ib_completion_queue_operations { /** @@ -354,6 +368,9 @@ struct ib_device { /** Outbound packet sequence number */ uint32_t psn; + /** General management agent */ + struct ib_gma gma; + /** Driver private data */ void *drv_priv; /** Owner private data */ diff --git a/src/net/infiniband.c b/src/net/infiniband.c index 48572e03..497c20f6 100644 --- a/src/net/infiniband.c +++ b/src/net/infiniband.c @@ -485,16 +485,36 @@ void ib_refill_recv ( struct ib_device *ibdev, struct ib_queue_pair *qp ) { int ib_open ( struct ib_device *ibdev ) { int rc; - /* Open device if this is the first requested opening */ - if ( ibdev->open_count == 0 ) { - if ( ( rc = ibdev->op->open ( ibdev ) ) != 0 ) - return rc; + /* Increment device open request counter */ + if ( ibdev->open_count++ > 0 ) { + /* Device was already open; do nothing */ + return 0; } - /* Increment device open request counter */ - ibdev->open_count++; + /* Open device */ + if ( ( rc = ibdev->op->open ( ibdev ) ) != 0 ) { + DBGC ( ibdev, "IBDEV %p could not open: %s\n", + ibdev, strerror ( rc ) ); + goto err_open; + } + /* Create general management agent */ + if ( ( rc = ib_create_gma ( &ibdev->gma, ibdev, IB_QKEY_GMA ) ) != 0 ){ + DBGC ( ibdev, "IBDEV %p could not create GMA: %s\n", + ibdev, strerror ( rc ) ); + goto err_create_gma; + } + + assert ( ibdev->open_count == 1 ); return 0; + + ib_destroy_gma ( &ibdev->gma ); + err_create_gma: + ibdev->op->close ( ibdev ); + err_open: + assert ( ibdev->open_count == 1 ); + ibdev->open_count = 0; + return rc; } /** @@ -508,8 +528,10 @@ void ib_close ( struct ib_device *ibdev ) { ibdev->open_count--; /* Close device if this was the last remaining requested opening */ - if ( ibdev->open_count == 0 ) + if ( ibdev->open_count == 0 ) { + ib_destroy_gma ( &ibdev->gma ); ibdev->op->close ( ibdev ); + } } /*************************************************************************** diff --git a/src/net/infiniband/ib_gma.c b/src/net/infiniband/ib_gma.c new file mode 100644 index 00000000..2baae25c --- /dev/null +++ b/src/net/infiniband/ib_gma.c @@ -0,0 +1,414 @@ +/* + * Copyright (C) 2009 Michael Brown . + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +FILE_LICENCE ( GPL2_OR_LATER ); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * @file + * + * Infiniband General Management Agent + * + */ + +/** A MAD request */ +struct ib_mad_request { + /** Associated GMA */ + struct ib_gma *gma; + /** List of outstanding MAD requests */ + struct list_head list; + /** Retry timer */ + struct retry_timer timer; + /** Destination address */ + struct ib_address_vector av; + /** MAD request */ + union ib_mad mad; +}; + +/** GMA number of send WQEs + * + * This is a policy decision. + */ +#define IB_GMA_NUM_SEND_WQES 4 + +/** GMA number of receive WQEs + * + * This is a policy decision. + */ +#define IB_GMA_NUM_RECV_WQES 2 + +/** GMA number of completion queue entries + * + * This is a policy decision + */ +#define IB_GMA_NUM_CQES 8 + +/** GMA TID magic signature */ +#define IB_GMA_TID_MAGIC ( ( 'g' << 24 ) | ( 'P' << 16 ) | ( 'X' << 8 ) | 'E' ) + +/** TID to use for next MAD request */ +static unsigned int next_request_tid; + +/** + * Identify attribute handler + * + * @v mgmt_class Management class + * @v class_version Class version + * @v method Method + * @v attr_id Attribute ID (in network byte order) + * @ret handler Attribute handler (or NULL) + */ +static int ib_handle_mad ( struct ib_device *ibdev, + union ib_mad *mad ) { + struct ib_mad_hdr *hdr = &mad->hdr; + struct ib_mad_handler *handler; + + for_each_table_entry ( handler, IB_MAD_HANDLERS ) { + if ( ( handler->mgmt_class == hdr->mgmt_class ) && + ( handler->class_version == hdr->class_version ) && + ( handler->method == hdr->method ) && + ( handler->attr_id == hdr->attr_id ) ) { + hdr->method = handler->resp_method; + return handler->handle ( ibdev, mad ); + } + } + + hdr->method = IB_MGMT_METHOD_TRAP; + hdr->status = htons ( IB_MGMT_STATUS_UNSUPPORTED_METHOD_ATTR ); + return -ENOTSUP; +} + +/** + * Complete GMA receive + * + * + * @v ibdev Infiniband device + * @v qp Queue pair + * @v av Address vector + * @v iobuf I/O buffer + * @v rc Completion status code + */ +static void ib_gma_complete_recv ( struct ib_device *ibdev, + struct ib_queue_pair *qp, + struct ib_address_vector *av, + struct io_buffer *iobuf, int rc ) { + struct ib_gma *gma = ib_qp_get_ownerdata ( qp ); + struct ib_mad_request *request; + union ib_mad *mad; + struct ib_mad_hdr *hdr; + unsigned int hop_pointer; + unsigned int hop_count; + + /* Ignore errors */ + if ( rc != 0 ) { + DBGC ( gma, "GMA %p RX error: %s\n", gma, strerror ( rc ) ); + goto out; + } + + /* Sanity checks */ + if ( iob_len ( iobuf ) != sizeof ( *mad ) ) { + DBGC ( gma, "GMA %p RX bad size (%zd bytes)\n", + gma, iob_len ( iobuf ) ); + DBGC_HDA ( gma, 0, iobuf->data, iob_len ( iobuf ) ); + goto out; + } + mad = iobuf->data; + hdr = &mad->hdr; + if ( hdr->base_version != IB_MGMT_BASE_VERSION ) { + DBGC ( gma, "GMA %p unsupported base version %x\n", + gma, hdr->base_version ); + DBGC_HDA ( gma, 0, mad, sizeof ( *mad ) ); + goto out; + } + DBGC ( gma, "GMA %p RX TID %08x%08x (%02x,%02x,%02x,%04x) status " + "%04x\n", gma, ntohl ( hdr->tid[0] ), ntohl ( hdr->tid[1] ), + hdr->mgmt_class, hdr->class_version, hdr->method, + ntohs ( hdr->attr_id ), ntohs ( hdr->status ) ); + DBGC2_HDA ( gma, 0, mad, sizeof ( *mad ) ); + + /* Dequeue request if applicable */ + list_for_each_entry ( request, &gma->requests, list ) { + if ( memcmp ( &request->mad.hdr.tid, &hdr->tid, + sizeof ( request->mad.hdr.tid ) ) == 0 ) { + stop_timer ( &request->timer ); + list_del ( &request->list ); + free ( request ); + break; + } + } + + /* Handle MAD, if possible */ + if ( ( rc = ib_handle_mad ( ibdev, mad ) ) != 0 ) { + DBGC ( gma, "GMA %p could not handle TID %08x%08x: %s\n", + gma, ntohl ( hdr->tid[0] ), ntohl ( hdr->tid[1] ), + strerror ( rc ) ); + /* Do not abort; we may want to send an error response */ + } + + /* Finish processing if we have no response to send */ + if ( ! hdr->method ) + goto out; + + DBGC ( gma, "GMA %p TX TID %08x%08x (%02x,%02x,%02x,%04x)\n", gma, + ntohl ( hdr->tid[0] ), ntohl ( hdr->tid[1] ), hdr->mgmt_class, + hdr->class_version, hdr->method, ntohs ( hdr->attr_id ) ); + DBGC2_HDA ( gma, 0, mad, sizeof ( *mad ) ); + + /* Set response fields for directed route SMPs */ + if ( hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE ) { + struct ib_mad_smp *smp = &mad->smp; + + hdr->status |= htons ( IB_SMP_STATUS_D_INBOUND ); + hop_pointer = smp->mad_hdr.class_specific.smp.hop_pointer; + hop_count = smp->mad_hdr.class_specific.smp.hop_count; + assert ( hop_count == hop_pointer ); + if ( hop_pointer < ( sizeof ( smp->return_path.hops ) / + sizeof ( smp->return_path.hops[0] ) ) ) { + smp->return_path.hops[hop_pointer] = ibdev->port; + } else { + DBGC ( gma, "GMA %p invalid hop pointer %d\n", + gma, hop_pointer ); + goto out; + } + } + + /* Construct return address */ + av->qkey = ( ( av->qpn == IB_QPN_SMA ) ? IB_QKEY_SMA : IB_QKEY_GMA ); + av->rate = IB_RATE_2_5; + + /* Send MAD response, if applicable */ + if ( ( rc = ib_post_send ( ibdev, qp, av, + iob_disown ( iobuf ) ) ) != 0 ) { + DBGC ( gma, "GMA %p could not send MAD response: %s\n", + gma, strerror ( rc ) ); + goto out; + } + + out: + free_iob ( iobuf ); +} + +/** + * Complete GMA send + * + * + * @v ibdev Infiniband device + * @v qp Queue pair + * @v iobuf I/O buffer + * @v rc Completion status code + */ +static void ib_gma_complete_send ( struct ib_device *ibdev __unused, + struct ib_queue_pair *qp, + struct io_buffer *iobuf, int rc ) { + struct ib_gma *gma = ib_qp_get_ownerdata ( qp ); + + if ( rc != 0 ) { + DBGC ( gma, "GMA %p send completion error: %s\n", + gma, strerror ( rc ) ); + } + free_iob ( iobuf ); +} + +/** GMA completion operations */ +static struct ib_completion_queue_operations ib_gma_completion_ops = { + .complete_send = ib_gma_complete_send, + .complete_recv = ib_gma_complete_recv, +}; + +/** + * Handle MAD request timer expiry + * + * @v timer Retry timer + * @v expired Failure indicator + */ +static void ib_gma_timer_expired ( struct retry_timer *timer, int expired ) { + struct ib_mad_request *request = + container_of ( timer, struct ib_mad_request, timer ); + struct ib_gma *gma = request->gma; + struct ib_device *ibdev = gma->ibdev; + struct io_buffer *iobuf; + int rc; + + /* Abandon TID if we have tried too many times */ + if ( expired ) { + DBGC ( gma, "GMA %p abandoning TID %08x%08x\n", + gma, ntohl ( request->mad.hdr.tid[0] ), + ntohl ( request->mad.hdr.tid[1] ) ); + list_del ( &request->list ); + free ( request ); + return; + } + + DBGC ( gma, "GMA %p TX TID %08x%08x (%02x,%02x,%02x,%04x)\n", + gma, ntohl ( request->mad.hdr.tid[0] ), + ntohl ( request->mad.hdr.tid[1] ), request->mad.hdr.mgmt_class, + request->mad.hdr.class_version, request->mad.hdr.method, + ntohs ( request->mad.hdr.attr_id ) ); + DBGC2_HDA ( gma, 0, &request->mad, sizeof ( request->mad ) ); + + /* Restart retransmission timer */ + start_timer ( timer ); + + /* Construct I/O buffer */ + iobuf = alloc_iob ( sizeof ( request->mad ) ); + if ( ! iobuf ) { + DBGC ( gma, "GMA %p could not allocate buffer for TID " + "%08x%08x\n", gma, ntohl ( request->mad.hdr.tid[0] ), + ntohl ( request->mad.hdr.tid[1] ) ); + return; + } + memcpy ( iob_put ( iobuf, sizeof ( request->mad ) ), &request->mad, + sizeof ( request->mad ) ); + + /* Post send request */ + if ( ( rc = ib_post_send ( ibdev, gma->qp, &request->av, + iobuf ) ) != 0 ) { + DBGC ( gma, "GMA %p could not send TID %08x%08x: %s\n", + gma, ntohl ( request->mad.hdr.tid[0] ), + ntohl ( request->mad.hdr.tid[1] ), strerror ( rc ) ); + free_iob ( iobuf ); + return; + } +} + +/** + * Issue MAD request + * + * @v gma General management agent + * @v mad MAD request + * @v av Destination address, or NULL for SM + * @ret rc Return status code + */ +int ib_gma_request ( struct ib_gma *gma, union ib_mad *mad, + struct ib_address_vector *av ) { + struct ib_device *ibdev = gma->ibdev; + struct ib_mad_request *request; + + /* Allocate and initialise structure */ + request = zalloc ( sizeof ( *request ) ); + if ( ! request ) { + DBGC ( gma, "GMA %p could not allocate MAD request\n", gma ); + return -ENOMEM; + } + request->gma = gma; + list_add ( &request->list, &gma->requests ); + request->timer.expired = ib_gma_timer_expired; + + /* Determine address vector */ + if ( av ) { + memcpy ( &request->av, av, sizeof ( request->av ) ); + } else { + request->av.lid = ibdev->sm_lid; + request->av.sl = ibdev->sm_sl; + request->av.qpn = IB_QPN_GMA; + request->av.qkey = IB_QKEY_GMA; + } + + /* Copy MAD body */ + memcpy ( &request->mad, mad, sizeof ( request->mad ) ); + + /* Allocate TID */ + request->mad.hdr.tid[0] = htonl ( IB_GMA_TID_MAGIC ); + request->mad.hdr.tid[1] = htonl ( ++next_request_tid ); + + /* Start timer to initiate transmission */ + start_timer_nodelay ( &request->timer ); + + return 0; +} + +/** + * Create GMA + * + * @v gma General management agent + * @v ibdev Infiniband device + * @v qkey Queue key + * @ret rc Return status code + */ +int ib_create_gma ( struct ib_gma *gma, struct ib_device *ibdev, + unsigned long qkey ) { + int rc; + + /* Initialise fields */ + memset ( gma, 0, sizeof ( *gma ) ); + gma->ibdev = ibdev; + INIT_LIST_HEAD ( &gma->requests ); + + /* Create completion queue */ + gma->cq = ib_create_cq ( ibdev, IB_GMA_NUM_CQES, + &ib_gma_completion_ops ); + if ( ! gma->cq ) { + DBGC ( gma, "GMA %p could not allocate completion queue\n", + gma ); + rc = -ENOMEM; + goto err_create_cq; + } + + /* Create queue pair */ + gma->qp = ib_create_qp ( ibdev, IB_GMA_NUM_SEND_WQES, gma->cq, + IB_GMA_NUM_RECV_WQES, gma->cq, qkey ); + if ( ! gma->qp ) { + DBGC ( gma, "GMA %p could not allocate queue pair\n", gma ); + rc = -ENOMEM; + goto err_create_qp; + } + ib_qp_set_ownerdata ( gma->qp, gma ); + + DBGC ( gma, "GMA %p running on QPN %#lx\n", gma, gma->qp->qpn ); + + /* Fill receive ring */ + ib_refill_recv ( ibdev, gma->qp ); + return 0; + + ib_destroy_qp ( ibdev, gma->qp ); + err_create_qp: + ib_destroy_cq ( ibdev, gma->cq ); + err_create_cq: + return rc; +} + +/** + * Destroy GMA + * + * @v gma General management agent + */ +void ib_destroy_gma ( struct ib_gma *gma ) { + struct ib_device *ibdev = gma->ibdev; + struct ib_mad_request *request; + struct ib_mad_request *tmp; + + /* Flush any outstanding requests */ + list_for_each_entry_safe ( request, tmp, &gma->requests, list ) { + stop_timer ( &request->timer ); + list_del ( &request->list ); + free ( request ); + } + + ib_destroy_qp ( ibdev, gma->qp ); + ib_destroy_cq ( ibdev, gma->cq ); +}