From a176a24ac0a5769d6a844149595f409a1bc2e41d Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Fri, 18 Apr 2008 02:50:48 +0100 Subject: [PATCH] [Infiniband] Add preliminary multiple port support for Hermon cards Infiniband devices no longer block waiting for link-up in register_ibdev(). Hermon driver needs to create an event queue and poll for link-up events. Infiniband core needs to reread MAD parameters when link state changes. IPoIB needs to cope with Infiniband link parameters being only partially available at probe and open time. --- src/drivers/infiniband/hermon.c | 286 +++++++++++++++++++++++++++++++- src/drivers/infiniband/hermon.h | 65 +++++++- src/drivers/net/ipoib.c | 257 +++++++++++++++++----------- src/include/gpxe/infiniband.h | 20 +++ src/include/gpxe/ipoib.h | 1 + src/net/infiniband.c | 93 ++++++----- 6 files changed, 579 insertions(+), 143 deletions(-) diff --git a/src/drivers/infiniband/hermon.c b/src/drivers/infiniband/hermon.c index c10559f9..41494a5a 100644 --- a/src/drivers/infiniband/hermon.c +++ b/src/drivers/infiniband/hermon.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "hermon.h" @@ -317,19 +318,30 @@ hermon_cmd_write_mtt ( struct hermon *hermon, } static inline int -hermon_cmd_sw2hw_eq ( struct hermon *hermon, unsigned int index, - const struct hermonprm_eqc *eqc ) { +hermon_cmd_map_eq ( struct hermon *hermon, unsigned long index_map, + const struct hermonprm_event_mask *mask ) { return hermon_cmd ( hermon, - HERMON_HCR_IN_CMD ( HERMON_HCR_SW2HW_EQ, - 1, sizeof ( *eqc ) ), - 0, eqc, index, NULL ); + HERMON_HCR_IN_CMD ( HERMON_HCR_MAP_EQ, + 0, sizeof ( *mask ) ), + 0, mask, index_map, NULL ); } static inline int -hermon_cmd_hw2sw_eq ( struct hermon *hermon, unsigned int index ) { +hermon_cmd_sw2hw_eq ( struct hermon *hermon, unsigned int index, + const struct hermonprm_eqc *eqctx ) { return hermon_cmd ( hermon, - HERMON_HCR_VOID_CMD ( HERMON_HCR_HW2SW_EQ ), - 1, NULL, index, NULL ); + HERMON_HCR_IN_CMD ( HERMON_HCR_SW2HW_EQ, + 1, sizeof ( *eqctx ) ), + 0, eqctx, index, NULL ); +} + +static inline int +hermon_cmd_hw2sw_eq ( struct hermon *hermon, unsigned int index, + struct hermonprm_eqc *eqctx ) { + return hermon_cmd ( hermon, + HERMON_HCR_OUT_CMD ( HERMON_HCR_HW2SW_EQ, + 1, sizeof ( *eqctx ) ), + 1, NULL, index, eqctx ); } static inline int @@ -377,6 +389,15 @@ hermon_cmd_rtr2rts_qp ( struct hermon *hermon, unsigned long qpn, 0, ctx, qpn, NULL ); } +static inline int +hermon_cmd_rts2rts_qp ( struct hermon *hermon, unsigned long qpn, + const struct hermonprm_qp_ee_state_transitions *ctx ) { + return hermon_cmd ( hermon, + HERMON_HCR_IN_CMD ( HERMON_HCR_RTS2RTS_QP, + 1, sizeof ( *ctx ) ), + 0, ctx, qpn, NULL ); +} + static inline int hermon_cmd_2rst_qp ( struct hermon *hermon, unsigned long qpn ) { return hermon_cmd ( hermon, @@ -859,6 +880,39 @@ static int hermon_create_qp ( struct ib_device *ibdev, return rc; } +/** + * Modify queue pair + * + * @v ibdev Infiniband device + * @v qp Queue pair + * @v mod_list Modification list + * @ret rc Return status code + */ +static int hermon_modify_qp ( struct ib_device *ibdev, + struct ib_queue_pair *qp, + unsigned long mod_list ) { + struct hermon *hermon = ib_get_drvdata ( ibdev ); + struct hermonprm_qp_ee_state_transitions qpctx; + unsigned long optparammask = 0; + int rc; + + /* Construct optparammask */ + if ( mod_list & IB_MODIFY_QKEY ) + optparammask |= HERMON_QP_OPT_PARAM_QKEY; + + /* Issue RTS2RTS_QP */ + memset ( &qpctx, 0, sizeof ( qpctx ) ); + MLX_FILL_1 ( &qpctx, 0, opt_param_mask, optparammask ); + MLX_FILL_1 ( &qpctx, 44, qpc_eec_data.q_key, qp->qkey ); + if ( ( rc = hermon_cmd_rts2rts_qp ( hermon, qp->qpn, &qpctx ) ) != 0 ){ + DBGC ( hermon, "Hermon %p RTS2RTS_QP failed: %s\n", + hermon, strerror ( rc ) ); + return rc; + } + + return 0; +} + /** * Destroy queue pair * @@ -1356,6 +1410,7 @@ static struct ib_device_operations hermon_ib_operations = { .create_cq = hermon_create_cq, .destroy_cq = hermon_destroy_cq, .create_qp = hermon_create_qp, + .modify_qp = hermon_modify_qp, .destroy_qp = hermon_destroy_qp, .post_send = hermon_post_send, .post_recv = hermon_post_recv, @@ -1367,6 +1422,211 @@ static struct ib_device_operations hermon_ib_operations = { .mad = hermon_mad, }; +/*************************************************************************** + * + * Event queues + * + *************************************************************************** + */ + +/** + * Create event queue + * + * @v hermon Hermon device + * @ret rc Return status code + */ +static int hermon_create_eq ( struct hermon *hermon ) { + struct hermon_event_queue *hermon_eq = &hermon->eq; + struct hermonprm_eqc eqctx; + struct hermonprm_event_mask mask; + unsigned int i; + int rc; + + /* Allocate event queue itself */ + hermon_eq->eqe_size = + ( HERMON_NUM_EQES * sizeof ( hermon_eq->eqe[0] ) ); + hermon_eq->eqe = malloc_dma ( hermon_eq->eqe_size, + sizeof ( hermon_eq->eqe[0] ) ); + if ( ! hermon_eq->eqe ) { + rc = -ENOMEM; + goto err_eqe; + } + memset ( hermon_eq->eqe, 0, hermon_eq->eqe_size ); + for ( i = 0 ; i < HERMON_NUM_EQES ; i++ ) { + MLX_FILL_1 ( &hermon_eq->eqe[i].generic, 7, owner, 1 ); + } + barrier(); + + /* Allocate MTT entries */ + if ( ( rc = hermon_alloc_mtt ( hermon, hermon_eq->eqe, + hermon_eq->eqe_size, + &hermon_eq->mtt ) ) != 0 ) + goto err_alloc_mtt; + + /* Hand queue over to hardware */ + memset ( &eqctx, 0, sizeof ( eqctx ) ); + MLX_FILL_1 ( &eqctx, 0, st, 0xa /* "Fired" */ ); + MLX_FILL_1 ( &eqctx, 2, + page_offset, ( hermon_eq->mtt.page_offset >> 5 ) ); + MLX_FILL_1 ( &eqctx, 3, log_eq_size, fls ( HERMON_NUM_EQES - 1 ) ); + MLX_FILL_1 ( &eqctx, 7, mtt_base_addr_l, + ( hermon_eq->mtt.mtt_base_addr >> 3 ) ); + if ( ( rc = hermon_cmd_sw2hw_eq ( hermon, 0, &eqctx ) ) != 0 ) { + DBGC ( hermon, "Hermon %p SW2HW_EQ failed: %s\n", + hermon, strerror ( rc ) ); + goto err_sw2hw_eq; + } + + /* Map events to this event queue */ + memset ( &mask, 0, sizeof ( mask ) ); + MLX_FILL_1 ( &mask, 1, port_state_change, 1 ); + if ( ( rc = hermon_cmd_map_eq ( hermon, ( HERMON_MAP_EQ_MAP | 0 ), + &mask ) ) != 0 ) { + DBGC ( hermon, "Hermon %p MAP_EQ failed: %s\n", + hermon, strerror ( rc ) ); + goto err_map_eq; + } + + return 0; + + err_map_eq: + hermon_cmd_hw2sw_eq ( hermon, 0, &eqctx ); + err_sw2hw_eq: + hermon_free_mtt ( hermon, &hermon_eq->mtt ); + err_alloc_mtt: + free_dma ( hermon_eq->eqe, hermon_eq->eqe_size ); + err_eqe: + memset ( hermon_eq, 0, sizeof ( *hermon_eq ) ); + return rc; +} + +/** + * Destroy event queue + * + * @v hermon Hermon device + */ +static void hermon_destroy_eq ( struct hermon *hermon ) { + struct hermon_event_queue *hermon_eq = &hermon->eq; + struct hermonprm_eqc eqctx; + struct hermonprm_event_mask mask; + int rc; + + /* Unmap events from event queue */ + memset ( &mask, 0, sizeof ( mask ) ); + MLX_FILL_1 ( &mask, 1, port_state_change, 1 ); + if ( ( rc = hermon_cmd_map_eq ( hermon, ( HERMON_MAP_EQ_UNMAP | 0 ), + &mask ) ) != 0 ) { + DBGC ( hermon, "Hermon %p FATAL MAP_EQ failed to unmap: %s\n", + hermon, strerror ( rc ) ); + /* Continue; HCA may die but system should survive */ + } + + /* Take ownership back from hardware */ + if ( ( rc = hermon_cmd_hw2sw_eq ( hermon, 0, &eqctx ) ) != 0 ) { + DBGC ( hermon, "Hermon %p FATAL HW2SW_EQ failed: %s\n", + hermon, strerror ( rc ) ); + /* Leak memory and return; at least we avoid corruption */ + return; + } + + /* Free MTT entries */ + hermon_free_mtt ( hermon, &hermon_eq->mtt ); + + /* Free memory */ + free_dma ( hermon_eq->eqe, hermon_eq->eqe_size ); + memset ( hermon_eq, 0, sizeof ( *hermon_eq ) ); +} + +/** + * Handle port state event + * + * @v hermon Hermon device + * @v eqe Port state change event queue entry + */ +static void hermon_event_port_state_change ( struct hermon *hermon, + union hermonprm_event_entry *eqe){ + unsigned int port; + int link_up; + + /* Get port and link status */ + port = ( MLX_GET ( &eqe->port_state_change, data.p ) - 1 ); + link_up = ( MLX_GET ( &eqe->generic, event_sub_type ) & 0x04 ); + DBGC ( hermon, "Hermon %p port %d link %s\n", hermon, ( port + 1 ), + ( link_up ? "up" : "down" ) ); + + /* Sanity check */ + if ( port >= HERMON_NUM_PORTS ) { + DBGC ( hermon, "Hermon %p port %d does not exist!\n", + hermon, ( port + 1 ) ); + return; + } + + /* Notify Infiniband core of link state change */ + ib_link_state_changed ( hermon->ibdev[port] ); +} + +/** + * Poll event queue + * + * @v hermon Hermon device + */ +static void hermon_poll_eq ( struct hermon *hermon ) { + struct hermon_event_queue *hermon_eq = &hermon->eq; + union hermonprm_event_entry *eqe; + union hermonprm_doorbell_register db_reg; + unsigned int eqe_idx_mask; + unsigned int event_type; + + while ( 1 ) { + eqe_idx_mask = ( HERMON_NUM_EQES - 1 ); + eqe = &hermon_eq->eqe[hermon_eq->next_idx & eqe_idx_mask]; + if ( MLX_GET ( &eqe->generic, owner ) ^ + ( ( hermon_eq->next_idx & HERMON_NUM_EQES ) ? 1 : 0 ) ) { + /* Entry still owned by hardware; end of poll */ + break; + } + DBGCP ( hermon, "Hermon %p event:\n", hermon ); + DBGCP_HD ( hermon, eqe, sizeof ( *eqe ) ); + + /* Handle event */ + event_type = MLX_GET ( &eqe->generic, event_type ); + switch ( event_type ) { + case HERMON_EV_PORT_STATE_CHANGE: + hermon_event_port_state_change ( hermon, eqe ); + break; + default: + DBGC ( hermon, "Hermon %p unrecognised event type " + "%#x:\n", hermon, event_type ); + DBGC_HD ( hermon, eqe, sizeof ( *eqe ) ); + break; + } + + /* Update event queue's index */ + hermon_eq->next_idx++; + + /* Ring doorbell */ + memset ( &db_reg, 0, sizeof ( db_reg ) ); + MLX_FILL_1 ( &db_reg.event, 0, ci, hermon_eq->next_idx ); + DBGCP ( hermon, "Ringing doorbell %08lx with %08lx\n", + virt_to_phys ( hermon->uar + HERMON_DB_EQ0_OFFSET ), + db_reg.dword[0] ); + writel ( db_reg.dword[0], + ( hermon->uar + HERMON_DB_EQ0_OFFSET ) ); + } +} + +/** + * Event queue poll processor + * + * @v process Hermon event queue process + */ +static void hermon_step ( struct process *process ) { + struct hermon *hermon = + container_of ( process, struct hermon, event_process ); + + hermon_poll_eq ( hermon ); +} + /*************************************************************************** * * Firmware control @@ -1879,6 +2139,7 @@ static int hermon_probe ( struct pci_device *pci, goto err_alloc_hermon; } pci_set_drvdata ( pci, hermon ); + process_init ( &hermon->event_process, hermon_step, NULL ); /* Allocate Infiniband devices */ for ( i = 0 ; i < HERMON_NUM_PORTS ; i++ ) { @@ -1945,6 +2206,10 @@ static int hermon_probe ( struct pci_device *pci, if ( ( rc = hermon_setup_mpt ( hermon ) ) != 0 ) goto err_setup_mpt; + /* Set up event queue */ + if ( ( rc = hermon_create_eq ( hermon ) ) != 0 ) + goto err_create_eq; + /* Register Infiniband devices */ for ( i = 0 ; i < HERMON_NUM_PORTS ; i++ ) { if ( ( rc = register_ibdev ( hermon->ibdev[i] ) ) != 0 ) { @@ -1960,6 +2225,8 @@ static int hermon_probe ( struct pci_device *pci, err_register_ibdev: for ( ; i >= 0 ; i-- ) unregister_ibdev ( hermon->ibdev[i] ); + hermon_destroy_eq ( hermon ); + err_create_eq: err_setup_mpt: hermon_cmd_close_hca ( hermon ); err_init_hca: @@ -1976,6 +2243,7 @@ static int hermon_probe ( struct pci_device *pci, err_alloc_ibdev: for ( ; i >= 0 ; i-- ) free_ibdev ( hermon->ibdev[i] ); + process_del ( &hermon->event_process ); free ( hermon ); err_alloc_hermon: return rc; @@ -1992,6 +2260,7 @@ static void hermon_remove ( struct pci_device *pci ) { for ( i = ( HERMON_NUM_PORTS - 1 ) ; i >= 0 ; i-- ) unregister_ibdev ( hermon->ibdev[i] ); + hermon_destroy_eq ( hermon ); hermon_cmd_close_hca ( hermon ); hermon_free_icm ( hermon ); hermon_stop_firmware ( hermon ); @@ -2000,6 +2269,7 @@ static void hermon_remove ( struct pci_device *pci ) { free_dma ( hermon->mailbox_in, HERMON_MBOX_SIZE ); for ( i = ( HERMON_NUM_PORTS - 1 ) ; i >= 0 ; i-- ) free_ibdev ( hermon->ibdev[i] ); + process_del ( &hermon->event_process ); free ( hermon ); } diff --git a/src/drivers/infiniband/hermon.h b/src/drivers/infiniband/hermon.h index 959e6a9d..d9e3dd11 100644 --- a/src/drivers/infiniband/hermon.h +++ b/src/drivers/infiniband/hermon.h @@ -9,6 +9,7 @@ #include #include +#include #include "mlx_bitops.h" #include "MT25408_PRM.h" @@ -18,7 +19,7 @@ */ /* Ports in existence */ -#define HERMON_NUM_PORTS 1 +#define HERMON_NUM_PORTS 2 #define HERMON_PORT_BASE 1 /* PCI BARs */ @@ -48,6 +49,7 @@ #define HERMON_HCR_RST2INIT_QP 0x0019 #define HERMON_HCR_INIT2RTR_QP 0x001a #define HERMON_HCR_RTR2RTS_QP 0x001b +#define HERMON_HCR_RTS2RTS_QP 0x001c #define HERMON_HCR_2RST_QP 0x0021 #define HERMON_HCR_MAD_IFC 0x0024 #define HERMON_HCR_READ_MCG 0x0025 @@ -75,6 +77,14 @@ #define HERMON_PAGE_SIZE 4096 #define HERMON_DB_POST_SND_OFFSET 0x14 +#define HERMON_DB_EQ0_OFFSET 0x800 + +#define HERMON_QP_OPT_PARAM_QKEY 0x00000020UL + +#define HERMON_MAP_EQ_MAP ( 0UL << 31 ) +#define HERMON_MAP_EQ_UNMAP ( 1UL << 31 ) + +#define HERMON_EV_PORT_STATE_CHANGE 0x09 /* * Datatypes that seem to be missing from the autogenerated documentation @@ -108,12 +118,32 @@ struct hermonprm_send_db_register_st { pseudo_bit_t qn[0x00018]; } __attribute__ (( packed )); +struct hermonprm_event_db_register_st { + pseudo_bit_t ci[0x00018]; + pseudo_bit_t reserver[0x00007]; + pseudo_bit_t a[0x00001]; +} __attribute__ (( packed )); + struct hermonprm_scalar_parameter_st { pseudo_bit_t value_hi[0x00020]; /* -------------- */ pseudo_bit_t value[0x00020]; } __attribute__ (( packed )); +struct hermonprm_event_mask_st { + pseudo_bit_t reserved0[0x00020]; +/* -------------- */ + pseudo_bit_t completion[0x00001]; + pseudo_bit_t reserved1[0x0008]; + pseudo_bit_t port_state_change[0x00001]; + pseudo_bit_t reserved2[0x00016]; +} __attribute__ (( packed )); + +struct hermonprm_port_state_change_event_st { + pseudo_bit_t reserved[0x00020]; + struct hermonprm_port_state_change_st data; +} __attribute__ (( packed )); + /* * Wrapper structures for hardware datatypes * @@ -124,6 +154,9 @@ struct MLX_DECLARE_STRUCT ( hermonprm_completion_queue_entry ); struct MLX_DECLARE_STRUCT ( hermonprm_completion_with_error ); struct MLX_DECLARE_STRUCT ( hermonprm_cq_db_record ); struct MLX_DECLARE_STRUCT ( hermonprm_eqc ); +struct MLX_DECLARE_STRUCT ( hermonprm_event_db_register ); +struct MLX_DECLARE_STRUCT ( hermonprm_event_mask ); +struct MLX_DECLARE_STRUCT ( hermonprm_event_queue_entry ); struct MLX_DECLARE_STRUCT ( hermonprm_hca_command_register ); struct MLX_DECLARE_STRUCT ( hermonprm_init_hca ); struct MLX_DECLARE_STRUCT ( hermonprm_init_port ); @@ -132,6 +165,7 @@ struct MLX_DECLARE_STRUCT ( hermonprm_mcg_entry ); struct MLX_DECLARE_STRUCT ( hermonprm_mgm_hash ); struct MLX_DECLARE_STRUCT ( hermonprm_mpt ); struct MLX_DECLARE_STRUCT ( hermonprm_mtt ); +struct MLX_DECLARE_STRUCT ( hermonprm_port_state_change_event ); struct MLX_DECLARE_STRUCT ( hermonprm_qp_db_record ); struct MLX_DECLARE_STRUCT ( hermonprm_qp_ee_state_transitions ); struct MLX_DECLARE_STRUCT ( hermonprm_query_dev_cap ); @@ -175,8 +209,14 @@ union hermonprm_completion_entry { struct hermonprm_completion_with_error error; } __attribute__ (( packed )); +union hermonprm_event_entry { + struct hermonprm_event_queue_entry generic; + struct hermonprm_port_state_change_event port_state_change; +} __attribute__ (( packed )); + union hermonprm_doorbell_register { struct hermonprm_send_db_register send; + struct hermonprm_event_db_register event; uint32_t dword[1]; } __attribute__ (( packed )); @@ -362,6 +402,24 @@ struct hermon_completion_queue { */ #define HERMON_MAX_EQS 4 +/** A Hermon event queue */ +struct hermon_event_queue { + /** Event queue entries */ + union hermonprm_event_entry *eqe; + /** Size of event queue */ + size_t eqe_size; + /** MTT descriptor */ + struct hermon_mtt mtt; + /** Next event queue entry index */ + unsigned long next_idx; +}; + +/** Number of event queue entries + * + * This is a policy decision. + */ +#define HERMON_NUM_EQES 4 + /** A Hermon resource bitmask */ typedef uint32_t hermon_bitmask_t; @@ -397,6 +455,11 @@ struct hermon { */ unsigned long reserved_lkey; + /** Event queue */ + struct hermon_event_queue eq; + /** Event queue process */ + struct process event_process; + /** Completion queue in-use bitmask */ hermon_bitmask_t cq_inuse[ HERMON_BITMASK_SIZE ( HERMON_MAX_CQS ) ]; /** Queue pair in-use bitmask */ diff --git a/src/drivers/net/ipoib.c b/src/drivers/net/ipoib.c index d457b258..3b915bf0 100644 --- a/src/drivers/net/ipoib.c +++ b/src/drivers/net/ipoib.c @@ -80,10 +80,14 @@ struct ipoib_device { struct ib_gid broadcast_gid; /** Broadcast LID */ unsigned int broadcast_lid; - /** Joined to broadcast group */ - int broadcast_joined; /** Data queue key */ unsigned long data_qkey; + /** Attached to multicast group + * + * This flag indicates whether or not we have attached our + * data queue pair to the broadcast multicast GID. + */ + int broadcast_attached; }; /** @@ -272,6 +276,10 @@ static int ipoib_create_qset ( struct ipoib_device *ipoib, struct ib_device *ibdev = ipoib->ibdev; int rc; + /* Sanity check */ + assert ( qset->cq == NULL ); + assert ( qset->qp == NULL ); + /* Store queue parameters */ qset->recv_max_fill = num_recv_wqes; @@ -617,14 +625,24 @@ static void ipoib_recv_path_record ( struct ipoib_device *ipoib __unused, */ static void ipoib_recv_mc_member_record ( struct ipoib_device *ipoib, struct ib_mad_mc_member_record *mc_member_record ) { + int joined; + int rc; + /* Record parameters */ - ipoib->broadcast_joined = - ( mc_member_record->scope__join_state & 0x0f ); + joined = ( mc_member_record->scope__join_state & 0x0f ); ipoib->data_qkey = ntohl ( mc_member_record->qkey ); ipoib->broadcast_lid = ntohs ( mc_member_record->mlid ); DBGC ( ipoib, "IPoIB %p %s broadcast group: qkey %lx mlid %x\n", - ipoib, ( ipoib->broadcast_joined ? "joined" : "left" ), - ipoib->data_qkey, ipoib->broadcast_lid ); + ipoib, ( joined ? "joined" : "left" ), ipoib->data_qkey, + ipoib->broadcast_lid ); + + /* Update data queue pair qkey */ + if ( ( rc = ib_modify_qp ( ipoib->ibdev, ipoib->data.qp, + IB_MODIFY_QKEY, ipoib->data_qkey ) ) != 0 ){ + DBGC ( ipoib, "IPoIB %p could not update data qkey: %s\n", + ipoib, strerror ( rc ) ); + return; + } } /** @@ -741,6 +759,56 @@ static void ipoib_irq ( struct net_device *netdev __unused, /* No implementation */ } +/** + * Join IPv4 broadcast multicast group + * + * @v ipoib IPoIB device + * @ret rc Return status code + */ +static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) { + int rc; + + /* Sanity check */ + if ( ! ipoib->data.qp ) + return 0; + + /* Attach data queue to broadcast multicast GID */ + assert ( ipoib->broadcast_attached == 0 ); + if ( ( rc = ib_mcast_attach ( ipoib->ibdev, ipoib->data.qp, + &ipoib->broadcast_gid ) ) != 0 ){ + DBGC ( ipoib, "IPoIB %p could not attach to broadcast GID: " + "%s\n", ipoib, strerror ( rc ) ); + return rc; + } + ipoib->broadcast_attached = 1; + + /* Initiate broadcast group join */ + if ( ( rc = ipoib_mc_member_record ( ipoib, &ipoib->broadcast_gid, + 1 ) ) != 0 ) { + DBGC ( ipoib, "IPoIB %p could not send broadcast join: %s\n", + ipoib, strerror ( rc ) ); + return rc; + } + + return 0; +} + +/** + * Leave IPv4 broadcast multicast group + * + * @v ipoib IPoIB device + */ +static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) { + + /* Detach data queue from broadcast multicast GID */ + if ( ipoib->broadcast_attached ) { + assert ( ipoib->data.qp != NULL ); + ib_mcast_detach ( ipoib->ibdev, ipoib->data.qp, + &ipoib->broadcast_gid ); + ipoib->broadcast_attached = 0; + } +} + /** * Open IPoIB network device * @@ -749,22 +817,53 @@ static void ipoib_irq ( struct net_device *netdev __unused, */ static int ipoib_open ( struct net_device *netdev ) { struct ipoib_device *ipoib = netdev->priv; - struct ib_device *ibdev = ipoib->ibdev; + struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr ); int rc; - /* Attach to broadcast multicast GID */ - if ( ( rc = ib_mcast_attach ( ibdev, ipoib->data.qp, - &ipoib->broadcast_gid ) ) != 0 ) { - DBG ( "Could not attach to broadcast GID: %s\n", - strerror ( rc ) ); - return rc; + /* Allocate metadata queue set */ + if ( ( rc = ipoib_create_qset ( ipoib, &ipoib->meta, + IPOIB_META_NUM_CQES, + IPOIB_META_NUM_SEND_WQES, + IPOIB_META_NUM_RECV_WQES, + IB_GLOBAL_QKEY ) ) != 0 ) { + DBGC ( ipoib, "IPoIB %p could not allocate metadata QP: %s\n", + ipoib, strerror ( rc ) ); + goto err_create_meta_qset; } + /* Allocate data queue set */ + if ( ( rc = ipoib_create_qset ( ipoib, &ipoib->data, + IPOIB_DATA_NUM_CQES, + IPOIB_DATA_NUM_SEND_WQES, + IPOIB_DATA_NUM_RECV_WQES, + IB_GLOBAL_QKEY ) ) != 0 ) { + DBGC ( ipoib, "IPoIB %p could not allocate data QP: %s\n", + ipoib, strerror ( rc ) ); + goto err_create_data_qset; + } + + /* Update MAC address with data QPN */ + mac->qpn = htonl ( ipoib->data.qp->qpn ); + /* Fill receive rings */ ipoib_refill_recv ( ipoib, &ipoib->meta ); ipoib_refill_recv ( ipoib, &ipoib->data ); + /* Join broadcast group */ + if ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) { + DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n", + ipoib, strerror ( rc ) ); + goto err_join_broadcast; + } + return 0; + + err_join_broadcast: + ipoib_destroy_qset ( ipoib, &ipoib->data ); + err_create_data_qset: + ipoib_destroy_qset ( ipoib, &ipoib->meta ); + err_create_meta_qset: + return rc; } /** @@ -774,12 +873,17 @@ static int ipoib_open ( struct net_device *netdev ) { */ static void ipoib_close ( struct net_device *netdev ) { struct ipoib_device *ipoib = netdev->priv; - struct ib_device *ibdev = ipoib->ibdev; + struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr ); - /* Detach from broadcast multicast GID */ - ib_mcast_detach ( ibdev, ipoib->data.qp, &ipoib->broadcast_gid ); + /* Leave broadcast group */ + ipoib_leave_broadcast_group ( ipoib ); - /* FIXME: should probably flush the receive ring */ + /* Remove data QPN from MAC address */ + mac->qpn = 0; + + /* Tear down the queues */ + ipoib_destroy_qset ( ipoib, &ipoib->data ); + ipoib_destroy_qset ( ipoib, &ipoib->meta ); } /** IPoIB network device operations */ @@ -792,44 +896,53 @@ static struct net_device_operations ipoib_operations = { }; /** - * Join IPoIB broadcast group + * Update IPoIB dynamic Infiniband parameters * * @v ipoib IPoIB device - * @ret rc Return status code + * + * The Infiniband port GID and partition key will change at runtime, + * when the link is established (or lost). The MAC address is based + * on the port GID, and the broadcast GID is based on the partition + * key. This function recalculates these IPoIB device parameters. */ -static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) { +static void ipoib_set_ib_params ( struct ipoib_device *ipoib ) { struct ib_device *ibdev = ipoib->ibdev; - unsigned int delay_ms; + struct ipoib_mac *mac; + + /* Calculate GID portion of MAC address based on port GID */ + mac = ( ( struct ipoib_mac * ) ipoib->netdev->ll_addr ); + memcpy ( &mac->gid, &ibdev->port_gid, sizeof ( mac->gid ) ); + + /* Calculate broadcast GID based on partition key */ + memcpy ( &ipoib->broadcast_gid, &ipv4_broadcast_gid, + sizeof ( ipoib->broadcast_gid ) ); + ipoib->broadcast_gid.u.words[2] = htons ( ibdev->pkey ); +} + +/** + * Handle link status change + * + * @v ibdev Infiniband device + */ +void ipoib_link_state_changed ( struct ib_device *ibdev ) { + struct net_device *netdev = ib_get_ownerdata ( ibdev ); + struct ipoib_device *ipoib = netdev->priv; int rc; - /* Make sure we have some receive descriptors */ - ipoib_refill_recv ( ipoib, &ipoib->meta ); + /* Leave existing broadcast group */ + ipoib_leave_broadcast_group ( ipoib ); - /* Send join request */ - if ( ( rc = ipoib_mc_member_record ( ipoib, &ipoib->broadcast_gid, - 1 ) ) != 0 ) { - DBGC ( ipoib, "IPoIB %p could not send broadcast join: %s\n", - ipoib, strerror ( rc ) ); - return rc; - } - - /* Wait for join to complete. Ideally we wouldn't delay for - * this long, but we need the queue key before we can set up - * the data queue pair, which we need before we can know the - * MAC address. + /* Update MAC address and broadcast GID based on new port GID + * and partition key. */ - for ( delay_ms = IPOIB_JOIN_MAX_DELAY_MS ; delay_ms ; delay_ms-- ) { - mdelay ( 1 ); - ib_poll_cq ( ibdev, ipoib->meta.cq, ipoib_meta_complete_send, - ipoib_meta_complete_recv ); - ipoib_refill_recv ( ipoib, &ipoib->meta ); - if ( ipoib->broadcast_joined ) - return 0; - } - DBGC ( ipoib, "IPoIB %p timed out waiting for broadcast join\n", - ipoib ); + ipoib_set_ib_params ( ipoib ); - return -ETIMEDOUT; + /* Join new broadcast group */ + if ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) { + DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: " + "%s\n", ipoib, strerror ( rc ) ); + return; + } } /** @@ -841,7 +954,6 @@ static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) { int ipoib_probe ( struct ib_device *ibdev ) { struct net_device *netdev; struct ipoib_device *ipoib; - struct ipoib_mac *mac; int rc; /* Allocate network device */ @@ -856,44 +968,11 @@ int ipoib_probe ( struct ib_device *ibdev ) { ipoib->netdev = netdev; ipoib->ibdev = ibdev; - /* Calculate broadcast GID */ - memcpy ( &ipoib->broadcast_gid, &ipv4_broadcast_gid, - sizeof ( ipoib->broadcast_gid ) ); - ipoib->broadcast_gid.u.words[2] = htons ( ibdev->pkey ); - - /* Allocate metadata queue set */ - if ( ( rc = ipoib_create_qset ( ipoib, &ipoib->meta, - IPOIB_META_NUM_CQES, - IPOIB_META_NUM_SEND_WQES, - IPOIB_META_NUM_RECV_WQES, - IB_GLOBAL_QKEY ) ) != 0 ) { - DBGC ( ipoib, "IPoIB %p could not allocate metadata QP: %s\n", - ipoib, strerror ( rc ) ); - goto err_create_meta_qset; - } - - /* Join broadcast group */ - if ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) { - DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n", - ipoib, strerror ( rc ) ); - goto err_join_broadcast_group; - } - - /* Allocate data queue set */ - if ( ( rc = ipoib_create_qset ( ipoib, &ipoib->data, - IPOIB_DATA_NUM_CQES, - IPOIB_DATA_NUM_SEND_WQES, - IPOIB_DATA_NUM_RECV_WQES, - ipoib->data_qkey ) ) != 0 ) { - DBGC ( ipoib, "IPoIB %p could not allocate data QP: %s\n", - ipoib, strerror ( rc ) ); - goto err_create_data_qset; - } - - /* Construct MAC address */ - mac = ( ( struct ipoib_mac * ) netdev->ll_addr ); - mac->qpn = htonl ( ipoib->data.qp->qpn ); - memcpy ( &mac->gid, &ibdev->port_gid, sizeof ( mac->gid ) ); + /* Calculate as much of the broadcast GID and the MAC address + * as we can. We won't know either of these in full until we + * have link-up. + */ + ipoib_set_ib_params ( ipoib ); /* Register network device */ if ( ( rc = register_netdev ( netdev ) ) != 0 ) @@ -902,11 +981,6 @@ int ipoib_probe ( struct ib_device *ibdev ) { return 0; err_register_netdev: - ipoib_destroy_qset ( ipoib, &ipoib->data ); - err_join_broadcast_group: - err_create_data_qset: - ipoib_destroy_qset ( ipoib, &ipoib->meta ); - err_create_meta_qset: netdev_nullify ( netdev ); netdev_put ( netdev ); return rc; @@ -919,11 +993,8 @@ int ipoib_probe ( struct ib_device *ibdev ) { */ void ipoib_remove ( struct ib_device *ibdev ) { struct net_device *netdev = ib_get_ownerdata ( ibdev ); - struct ipoib_device *ipoib = netdev->priv; unregister_netdev ( netdev ); - ipoib_destroy_qset ( ipoib, &ipoib->data ); - ipoib_destroy_qset ( ipoib, &ipoib->meta ); netdev_nullify ( netdev ); netdev_put ( netdev ); } diff --git a/src/include/gpxe/infiniband.h b/src/include/gpxe/infiniband.h index 354dc579..8fc928c7 100644 --- a/src/include/gpxe/infiniband.h +++ b/src/include/gpxe/infiniband.h @@ -95,6 +95,11 @@ struct ib_queue_pair { void *owner_priv; }; +/** Infiniband queue pair modification flags */ +enum ib_queue_pair_mods { + IB_MODIFY_QKEY = 0x0001, +}; + /** An Infiniband Completion Queue */ struct ib_completion_queue { /** Completion queue number */ @@ -187,6 +192,16 @@ struct ib_device_operations { */ int ( * create_qp ) ( struct ib_device *ibdev, struct ib_queue_pair *qp ); + /** Modify queue pair + * + * @v ibdev Infiniband device + * @v qp Queue pair + * @v mod_list Modification list + * @ret rc Return status code + */ + int ( * modify_qp ) ( struct ib_device *ibdev, + struct ib_queue_pair *qp, + unsigned long mod_list ); /** Destroy queue pair * * @v ibdev Infiniband device @@ -291,6 +306,8 @@ struct ib_device { struct ib_device_operations *op; /** Port number */ unsigned int port; + /** Link state */ + int link_up; /** Port GID */ struct ib_gid port_gid; /** Subnet manager LID */ @@ -311,6 +328,8 @@ extern struct ib_queue_pair * ib_create_qp ( struct ib_device *ibdev, unsigned int num_send_wqes, struct ib_completion_queue *send_cq, unsigned int num_recv_wqes, struct ib_completion_queue *recv_cq, unsigned long qkey ); +extern int ib_modify_qp ( struct ib_device *ibdev, struct ib_queue_pair *qp, + unsigned long mod_list, unsigned long qkey ); extern void ib_destroy_qp ( struct ib_device *ibdev, struct ib_queue_pair *qp ); extern struct ib_work_queue * ib_find_wq ( struct ib_completion_queue *cq, @@ -319,6 +338,7 @@ extern struct ib_device * alloc_ibdev ( size_t priv_size ); extern int register_ibdev ( struct ib_device *ibdev ); extern void unregister_ibdev ( struct ib_device *ibdev ); extern void free_ibdev ( struct ib_device *ibdev ); +extern void ib_link_state_changed ( struct ib_device *ibdev ); /** * Post send work queue entry diff --git a/src/include/gpxe/ipoib.h b/src/include/gpxe/ipoib.h index 0551687d..bcbdc4c6 100644 --- a/src/include/gpxe/ipoib.h +++ b/src/include/gpxe/ipoib.h @@ -72,6 +72,7 @@ static inline struct net_device * alloc_ipoibdev ( size_t priv_size ) { return netdev; } +extern void ipoib_link_state_changed ( struct ib_device *ibdev ); extern int ipoib_probe ( struct ib_device *ibdev ); extern void ipoib_remove ( struct ib_device *ibdev ); diff --git a/src/net/infiniband.c b/src/net/infiniband.c index 39d11285..e5c79e96 100644 --- a/src/net/infiniband.c +++ b/src/net/infiniband.c @@ -152,15 +152,41 @@ struct ib_queue_pair * ib_create_qp ( struct ib_device *ibdev, return qp; } +/** + * Modify queue pair + * + * @v ibdev Infiniband device + * @v qp Queue pair + * @v mod_list Modification list + * @v qkey New queue key, if applicable + * @ret rc Return status code + */ +int ib_modify_qp ( struct ib_device *ibdev, struct ib_queue_pair *qp, + unsigned long mod_list, unsigned long qkey ) { + int rc; + + DBGC ( ibdev, "IBDEV %p modifying QPN %#lx\n", ibdev, qp->qpn ); + + if ( mod_list & IB_MODIFY_QKEY ) + qp->qkey = qkey; + + if ( ( rc = ibdev->op->modify_qp ( ibdev, qp, mod_list ) ) != 0 ) { + DBGC ( ibdev, "IBDEV %p could not modify QPN %#lx: %s\n", + ibdev, qp->qpn, strerror ( rc ) ); + return rc; + } + + return 0; +} + /** * Destroy queue pair * * @v ibdev Infiniband device * @v qp Queue pair */ -void ib_destroy_qp ( struct ib_device *ibdev, - struct ib_queue_pair *qp ) { - DBGC ( ibdev, "IBDEV %p destroying queue pair %#lx\n", +void ib_destroy_qp ( struct ib_device *ibdev, struct ib_queue_pair *qp ) { + DBGC ( ibdev, "IBDEV %p destroying QPN %#lx\n", ibdev, qp->qpn ); ibdev->op->destroy_qp ( ibdev, qp ); list_del ( &qp->send.list ); @@ -279,38 +305,6 @@ static int ib_get_pkey_table ( struct ib_device *ibdev, return 0; } -/** - * Wait for link up - * - * @v ibdev Infiniband device - * @ret rc Return status code - * - * This function shouldn't really exist. Unfortunately, IB links take - * a long time to come up, and we can't get various key parameters - * e.g. our own IPoIB MAC address without information from the subnet - * manager). We should eventually make link-up an asynchronous event. - */ -static int ib_wait_for_link ( struct ib_device *ibdev ) { - struct ib_mad_port_info port_info; - unsigned int retries; - int rc; - - printf ( "Waiting for Infiniband link-up..." ); - for ( retries = 20 ; retries ; retries-- ) { - if ( ( rc = ib_get_port_info ( ibdev, &port_info ) ) != 0 ) - continue; - if ( ( ( port_info.port_state__link_speed_supported ) & 0xf ) - == 4 ) { - printf ( "ok\n" ); - return 0; - } - printf ( "." ); - sleep ( 1 ); - } - printf ( "failed\n" ); - return -ENODEV; -}; - /** * Get MAD parameters * @@ -326,9 +320,13 @@ static int ib_get_mad_params ( struct ib_device *ibdev ) { } u; int rc; - /* Port info gives us the first half of the port GID and the SM LID */ + /* Port info gives us the link state, the first half of the + * port GID and the SM LID. + */ if ( ( rc = ib_get_port_info ( ibdev, &u.port_info ) ) != 0 ) return rc; + ibdev->link_up = ( ( u.port_info.port_state__link_speed_supported + & 0xf ) == 4 ); memcpy ( &ibdev->port_gid.u.bytes[0], u.port_info.gid_prefix, 8 ); ibdev->sm_lid = ntohs ( u.port_info.mastersm_lid ); @@ -391,10 +389,6 @@ int register_ibdev ( struct ib_device *ibdev ) { if ( ( rc = ib_open ( ibdev ) ) != 0 ) goto err_open; - /* Wait for link */ - if ( ( rc = ib_wait_for_link ( ibdev ) ) != 0 ) - goto err_wait_for_link; - /* Get MAD parameters */ if ( ( rc = ib_get_mad_params ( ibdev ) ) != 0 ) goto err_get_mad_params; @@ -410,7 +404,6 @@ int register_ibdev ( struct ib_device *ibdev ) { err_ipoib_probe: err_get_mad_params: - err_wait_for_link: ib_close ( ibdev ); err_open: return rc; @@ -435,3 +428,21 @@ void free_ibdev ( struct ib_device *ibdev ) { free ( ibdev ); } +/** + * Handle Infiniband link state change + * + * @v ibdev Infiniband device + */ +void ib_link_state_changed ( struct ib_device *ibdev ) { + int rc; + + /* Update MAD parameters */ + if ( ( rc = ib_get_mad_params ( ibdev ) ) != 0 ) { + DBGC ( ibdev, "IBDEV %p could not update MAD parameters: %s\n", + ibdev, strerror ( rc ) ); + return; + } + + /* Notify IPoIB of link state change */ + ipoib_link_state_changed ( ibdev ); +}