diff --git a/src/arch/i386/interface/pxe/pxe_undi.c b/src/arch/i386/interface/pxe/pxe_undi.c index f9282db1..5d212269 100644 --- a/src/arch/i386/interface/pxe/pxe_undi.c +++ b/src/arch/i386/interface/pxe/pxe_undi.c @@ -358,7 +358,8 @@ pxenv_undi_transmit ( struct s_PXENV_UNDI_TRANSMIT *undi_transmit ) { } /* Allocate and fill I/O buffer */ - iobuf = alloc_iob ( MAX_LL_HEADER_LEN + len ); + iobuf = alloc_iob ( MAX_LL_HEADER_LEN + + ( ( len > IOB_ZLEN ) ? len : IOB_ZLEN ) ); if ( ! iobuf ) { DBGC2 ( &pxe_netdev, " could not allocate iobuf\n" ); undi_transmit->Status = PXENV_STATUS_OUT_OF_RESOURCES; diff --git a/src/drivers/net/ipoib.c b/src/drivers/net/ipoib.c index 41ba348b..c1b8cad9 100644 --- a/src/drivers/net/ipoib.c +++ b/src/drivers/net/ipoib.c @@ -20,18 +20,23 @@ FILE_LICENCE ( GPL2_OR_LATER ); #include +#include #include #include #include #include #include #include +#include #include +#include +#include #include #include #include #include #include +#include #include /** @file @@ -58,6 +63,8 @@ struct ipoib_device { struct ib_completion_queue *cq; /** Queue pair */ struct ib_queue_pair *qp; + /** Local MAC */ + struct ipoib_mac mac; /** Broadcast MAC */ struct ipoib_mac broadcast; /** Joined to IPv4 broadcast multicast group @@ -68,6 +75,8 @@ struct ipoib_device { int broadcast_joined; /** IPv4 broadcast multicast group membership */ struct ib_mc_membership broadcast_membership; + /** REMAC cache */ + struct list_head peers; }; /** Broadcast IPoIB address */ @@ -89,99 +98,134 @@ struct errortab ipoib_errors[] __errortab = { /**************************************************************************** * - * IPoIB peer cache + * IPoIB REMAC cache * **************************************************************************** */ -/** - * IPoIB peer address - * - * The IPoIB link-layer header is only four bytes long and so does not - * have sufficient room to store IPoIB MAC address(es). 
We therefore - * maintain a cache of MAC addresses identified by a single-byte key, - * and abuse the spare two bytes within the link-layer header to - * communicate these MAC addresses between the link-layer code and the - * netdevice driver. - */ +/** An IPoIB REMAC cache entry */ struct ipoib_peer { - /** Key */ - uint8_t key; + /** List of REMAC cache entries */ + struct list_head list; + /** Remote Ethernet MAC */ + struct ipoib_remac remac; /** MAC address */ struct ipoib_mac mac; }; -/** Number of IPoIB peer cache entries - * - * Must be a power of two. - */ -#define IPOIB_NUM_CACHED_PEERS 4 - -/** IPoIB peer address cache */ -static struct ipoib_peer ipoib_peer_cache[IPOIB_NUM_CACHED_PEERS]; - -/** Oldest IPoIB peer cache entry index */ -static unsigned int ipoib_peer_cache_idx = 0; - -/** IPoIB peer cache entry validity flag */ -#define IPOIB_PEER_KEY_VALID 0x80 - /** - * Look up cached peer by key + * Find IPoIB MAC from REMAC * - * @v key Peer cache key - * @ret peer Peer cache entry, or NULL + * @v ipoib IPoIB device + * @v remac Remote Ethernet MAC + * @ret mac IPoIB MAC (or NULL if not found) */ -static struct ipoib_peer * ipoib_lookup_peer_by_key ( unsigned int key ) { +static struct ipoib_mac * ipoib_find_remac ( struct ipoib_device *ipoib, + const struct ipoib_remac *remac ) { struct ipoib_peer *peer; - unsigned int i; - if ( ! 
key ) - return NULL; + /* Check for broadcast REMAC */ + if ( is_broadcast_ether_addr ( remac ) ) + return &ipoib->broadcast; - for ( i = 0 ; i < IPOIB_NUM_CACHED_PEERS ; i++ ) { - peer = &ipoib_peer_cache[i]; - if ( peer->key == key ) - return peer; + /* Try to find via REMAC cache */ + list_for_each_entry ( peer, &ipoib->peers, list ) { + if ( memcmp ( remac, &peer->remac, + sizeof ( peer->remac ) ) == 0 ) { + /* Move peer to start of list */ + list_del ( &peer->list ); + list_add ( &peer->list, &ipoib->peers ); + return &peer->mac; + } } - DBG ( "IPoIB warning: peer cache lost track of key %x while still in " - "use\n", key ); + DBGC ( ipoib, "IPoIB %p unknown REMAC %s\n", + ipoib, eth_ntoa ( remac ) ); return NULL; } /** - * Store GID and QPN in peer cache + * Add IPoIB MAC to REMAC cache * - * @v mac Peer MAC address - * @ret peer Peer cache entry + * @v ipoib IPoIB device + * @v remac Remote Ethernet MAC + * @v mac IPoIB MAC + * @ret rc Return status code */ -static struct ipoib_peer * ipoib_cache_peer ( const struct ipoib_mac *mac ) { +static int ipoib_map_remac ( struct ipoib_device *ipoib, + const struct ipoib_remac *remac, + const struct ipoib_mac *mac ) { struct ipoib_peer *peer; - uint8_t key; - unsigned int i; - /* Look for existing cache entry */ - for ( i = 0 ; i < IPOIB_NUM_CACHED_PEERS ; i++ ) { - peer = &ipoib_peer_cache[i]; - if ( memcmp ( &peer->mac, mac, sizeof ( peer->mac ) ) == 0 ) - return peer; + /* Check for existing entry in REMAC cache */ + list_for_each_entry ( peer, &ipoib->peers, list ) { + if ( memcmp ( remac, &peer->remac, + sizeof ( peer->remac ) ) == 0 ) { + /* Move peer to start of list */ + list_del ( &peer->list ); + list_add ( &peer->list, &ipoib->peers ); + /* Update MAC */ + memcpy ( &peer->mac, mac, sizeof ( peer->mac ) ); + return 0; + } } - /* No entry found: create a new one */ - key = ( ipoib_peer_cache_idx++ | IPOIB_PEER_KEY_VALID ); - peer = &ipoib_peer_cache[ key % IPOIB_NUM_CACHED_PEERS ]; - if ( peer->key ) - DBG ( 
"IPoIB peer %x evicted from cache\n", peer->key ); - - memset ( peer, 0, sizeof ( *peer ) ); - peer->key = key; + /* Create new entry */ + peer = malloc ( sizeof ( *peer ) ); + if ( ! peer ) + return -ENOMEM; + memcpy ( &peer->remac, remac, sizeof ( peer->remac ) ); memcpy ( &peer->mac, mac, sizeof ( peer->mac ) ); - DBG ( "IPoIB peer %x has MAC %s\n", - peer->key, ipoib_ntoa ( &peer->mac ) ); - return peer; + list_add ( &peer->list, &ipoib->peers ); + + return 0; } +/** + * Flush REMAC cache + * + * @v ipoib IPoIB device + */ +static void ipoib_flush_remac ( struct ipoib_device *ipoib ) { + struct ipoib_peer *peer; + struct ipoib_peer *tmp; + + list_for_each_entry_safe ( peer, tmp, &ipoib->peers, list ) { + list_del ( &peer->list ); + free ( peer ); + } +} + +/** + * Discard some entries from the REMAC cache + * + * @ret discarded Number of cached items discarded + */ +static unsigned int ipoib_discard_remac ( void ) { + struct ib_device *ibdev; + struct ipoib_device *ipoib; + struct ipoib_peer *peer; + unsigned int discarded = 0; + + /* Try to discard one cache entry for each IPoIB device */ + for_each_ibdev ( ibdev ) { + ipoib = ib_get_ownerdata ( ibdev ); + list_for_each_entry_reverse ( peer, &ipoib->peers, list ) { + list_del ( &peer->list ); + free ( peer ); + discarded++; + break; + } + } + + return discarded; +} + +/** IPoIB cache discarder */ +struct cache_discarder ipoib_discarder __cache_discarder ( CACHE_NORMAL ) = { + .discard = ipoib_discard_remac, +}; + /**************************************************************************** * * IPoIB link layer @@ -189,85 +233,6 @@ static struct ipoib_peer * ipoib_cache_peer ( const struct ipoib_mac *mac ) { **************************************************************************** */ -/** - * Add IPoIB link-layer header - * - * @v netdev Network device - * @v iobuf I/O buffer - * @v ll_dest Link-layer destination address - * @v ll_source Source link-layer address - * @v net_proto Network-layer protocol, in 
network-byte order - * @ret rc Return status code - */ -static int ipoib_push ( struct net_device *netdev __unused, - struct io_buffer *iobuf, const void *ll_dest, - const void *ll_source __unused, uint16_t net_proto ) { - struct ipoib_hdr *ipoib_hdr = - iob_push ( iobuf, sizeof ( *ipoib_hdr ) ); - const struct ipoib_mac *dest_mac = ll_dest; - const struct ipoib_mac *src_mac = ll_source; - struct ipoib_peer *dest; - struct ipoib_peer *src; - - /* Add link-layer addresses to cache */ - dest = ipoib_cache_peer ( dest_mac ); - src = ipoib_cache_peer ( src_mac ); - - /* Build IPoIB header */ - ipoib_hdr->proto = net_proto; - ipoib_hdr->u.peer.dest = dest->key; - ipoib_hdr->u.peer.src = src->key; - - return 0; -} - -/** - * Remove IPoIB link-layer header - * - * @v netdev Network device - * @v iobuf I/O buffer - * @ret ll_dest Link-layer destination address - * @ret ll_source Source link-layer address - * @ret net_proto Network-layer protocol, in network-byte order - * @ret flags Packet flags - * @ret rc Return status code - */ -static int ipoib_pull ( struct net_device *netdev, - struct io_buffer *iobuf, const void **ll_dest, - const void **ll_source, uint16_t *net_proto, - unsigned int *flags ) { - struct ipoib_device *ipoib = netdev->priv; - struct ipoib_hdr *ipoib_hdr = iobuf->data; - struct ipoib_peer *dest; - struct ipoib_peer *source; - - /* Sanity check */ - if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) { - DBG ( "IPoIB packet too short for link-layer header\n" ); - DBG_HD ( iobuf->data, iob_len ( iobuf ) ); - return -EINVAL; - } - - /* Strip off IPoIB header */ - iob_pull ( iobuf, sizeof ( *ipoib_hdr ) ); - - /* Identify source and destination addresses, and clear - * reserved word in IPoIB header - */ - dest = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.dest ); - source = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.src ); - ipoib_hdr->u.reserved = 0; - - /* Fill in required fields */ - *ll_dest = ( dest ? 
&dest->mac : &ipoib->broadcast ); - *ll_source = ( source ? &source->mac : &ipoib->broadcast ); - *net_proto = ipoib_hdr->proto; - *flags = ( ( *ll_dest == &ipoib->broadcast ) ? - ( LL_MULTICAST | LL_BROADCAST ) : 0 ); - - return 0; -} - /** * Initialise IPoIB link-layer address * @@ -275,115 +240,32 @@ static int ipoib_pull ( struct net_device *netdev, * @v ll_addr Link-layer address */ static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) { - const union ib_guid *guid = hw_addr; - struct ipoib_mac *mac = ll_addr; - - memset ( mac, 0, sizeof ( *mac ) ); - memcpy ( &mac->gid.s.guid, guid, sizeof ( mac->gid.s.guid ) ); -} - -/** - * Transcribe IPoIB link-layer address - * - * @v ll_addr Link-layer address - * @ret string Link-layer address in human-readable format - */ -const char * ipoib_ntoa ( const void *ll_addr ) { - static char buf[45]; - const struct ipoib_mac *mac = ll_addr; - - snprintf ( buf, sizeof ( buf ), "%08x:%08x:%08x:%08x:%08x", - htonl ( mac->flags__qpn ), htonl ( mac->gid.dwords[0] ), - htonl ( mac->gid.dwords[1] ), - htonl ( mac->gid.dwords[2] ), - htonl ( mac->gid.dwords[3] ) ); - return buf; -} - -/** - * Hash multicast address - * - * @v af Address family - * @v net_addr Network-layer address - * @v ll_addr Link-layer address to fill in - * @ret rc Return status code - */ -static int ipoib_mc_hash ( unsigned int af __unused, - const void *net_addr __unused, - void *ll_addr __unused ) { - - return -ENOTSUP; -} - -/** - * Generate Mellanox Ethernet-compatible compressed link-layer address - * - * @v ll_addr Link-layer address - * @v eth_addr Ethernet-compatible address to fill in - */ -static int ipoib_mlx_eth_addr ( const union ib_guid *guid, - uint8_t *eth_addr ) { - eth_addr[0] = ( ( guid->bytes[3] == 2 ) ? 
0x00 : 0x02 ); - eth_addr[1] = guid->bytes[1]; - eth_addr[2] = guid->bytes[2]; - eth_addr[3] = guid->bytes[5]; - eth_addr[4] = guid->bytes[6]; - eth_addr[5] = guid->bytes[7]; - return 0; -} - -/** An IPoIB Ethernet-compatible compressed link-layer address generator */ -struct ipoib_eth_addr_handler { - /** GUID byte 1 */ - uint8_t byte1; - /** GUID byte 2 */ - uint8_t byte2; - /** Handler */ - int ( * eth_addr ) ( const union ib_guid *guid, - uint8_t *eth_addr ); -}; - -/** IPoIB Ethernet-compatible compressed link-layer address generators */ -static struct ipoib_eth_addr_handler ipoib_eth_addr_handlers[] = { - { 0x02, 0xc9, ipoib_mlx_eth_addr }, -}; - -/** - * Generate Ethernet-compatible compressed link-layer address - * - * @v ll_addr Link-layer address - * @v eth_addr Ethernet-compatible address to fill in - */ -static int ipoib_eth_addr ( const void *ll_addr, void *eth_addr ) { - const struct ipoib_mac *ipoib_addr = ll_addr; - const union ib_guid *guid = &ipoib_addr->gid.s.guid; - struct ipoib_eth_addr_handler *handler; + const uint8_t *guid = hw_addr; + uint8_t *eth_addr = ll_addr; + uint8_t guid_mask = IPOIB_GUID_MASK; unsigned int i; - for ( i = 0 ; i < ( sizeof ( ipoib_eth_addr_handlers ) / - sizeof ( ipoib_eth_addr_handlers[0] ) ) ; i++ ) { - handler = &ipoib_eth_addr_handlers[i]; - if ( ( handler->byte1 == guid->bytes[1] ) && - ( handler->byte2 == guid->bytes[2] ) ) { - return handler->eth_addr ( guid, eth_addr ); - } + /* Extract bytes from GUID according to mask */ + for ( i = 0 ; i < 8 ; i++, guid++, guid_mask <<= 1 ) { + if ( guid_mask & 0x80 ) + *(eth_addr++) = *guid; } - return -ENOTSUP; } /** IPoIB protocol */ struct ll_protocol ipoib_protocol __ll_protocol = { .name = "IPoIB", - .ll_proto = htons ( ARPHRD_INFINIBAND ), + .ll_proto = htons ( ARPHRD_ETHER ), .hw_addr_len = sizeof ( union ib_guid ), - .ll_addr_len = IPOIB_ALEN, - .ll_header_len = IPOIB_HLEN, - .push = ipoib_push, - .pull = ipoib_pull, + .ll_addr_len = ETH_ALEN, + .ll_header_len = 
ETH_HLEN, + .push = eth_push, + .pull = eth_pull, .init_addr = ipoib_init_addr, - .ntoa = ipoib_ntoa, - .mc_hash = ipoib_mc_hash, - .eth_addr = ipoib_eth_addr, + .ntoa = eth_ntoa, + .mc_hash = eth_mc_hash, + .eth_addr = eth_eth_addr, + .flags = LL_NAME_ONLY, }; /** @@ -398,12 +280,167 @@ struct net_device * alloc_ipoibdev ( size_t priv_size ) { netdev = alloc_netdev ( priv_size ); if ( netdev ) { netdev->ll_protocol = &ipoib_protocol; - netdev->ll_broadcast = ( uint8_t * ) &ipoib_broadcast; + netdev->ll_broadcast = eth_broadcast; netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE; } return netdev; } +/**************************************************************************** + * + * IPoIB translation layer + * + **************************************************************************** + */ + +/** + * Translate transmitted ARP packet + * + * @v netdev Network device + * @v iobuf Packet to be transmitted (with no link-layer headers) + * @ret rc Return status code + */ +static int ipoib_translate_tx_arp ( struct net_device *netdev, + struct io_buffer *iobuf ) { + struct ipoib_device *ipoib = netdev->priv; + struct arphdr *arphdr = iobuf->data; + struct ipoib_mac *target_ha = NULL; + void *sender_pa; + void *target_pa; + + /* Do nothing unless ARP contains eIPoIB link-layer addresses */ + if ( arphdr->ar_hln != ETH_ALEN ) + return 0; + + /* Fail unless we have room to expand packet */ + if ( iob_tailroom ( iobuf ) < ( 2 * ( sizeof ( ipoib->mac ) - + ETH_ALEN ) ) ) { + DBGC ( ipoib, "IPoIB %p insufficient space in TX ARP\n", + ipoib ); + return -ENOBUFS; + } + + /* Look up REMAC, if applicable */ + if ( arphdr->ar_op == ARPOP_REPLY ) { + target_ha = ipoib_find_remac ( ipoib, arp_target_pa ( arphdr )); + if ( ! 
target_ha ) + return -ENXIO; + } + + /* Construct new packet */ + iob_put ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) ); + sender_pa = arp_sender_pa ( arphdr ); + target_pa = arp_target_pa ( arphdr ); + arphdr->ar_hrd = htons ( ARPHRD_INFINIBAND ); + arphdr->ar_hln = sizeof ( ipoib->mac ); + memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln ); + memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln ); + memcpy ( arp_sender_ha ( arphdr ), &ipoib->mac, sizeof ( ipoib->mac ) ); + memset ( arp_target_ha ( arphdr ), 0, sizeof ( ipoib->mac ) ); + if ( target_ha ) { + memcpy ( arp_target_ha ( arphdr ), target_ha, + sizeof ( *target_ha ) ); + } + + return 0; +} + +/** + * Translate transmitted packet + * + * @v netdev Network device + * @v iobuf Packet to be transmitted (with no link-layer headers) + * @v net_proto Network-layer protocol (in network byte order) + * @ret rc Return status code + */ +static int ipoib_translate_tx ( struct net_device *netdev, + struct io_buffer *iobuf, uint16_t net_proto ) { + + switch ( net_proto ) { + case htons ( ETH_P_ARP ) : + return ipoib_translate_tx_arp ( netdev, iobuf ); + case htons ( ETH_P_IP ) : + /* No translation needed */ + return 0; + default: + /* Cannot handle other traffic via eIPoIB */ + return -ENOTSUP; + } +} + +/** + * Translate received ARP packet + * + * @v netdev Network device + * @v iobuf Received packet (with no link-layer headers) + * @v remac Constructed Remote Ethernet MAC + * @ret rc Return status code + */ +static int ipoib_translate_rx_arp ( struct net_device *netdev, + struct io_buffer *iobuf, + struct ipoib_remac *remac ) { + struct ipoib_device *ipoib = netdev->priv; + struct arphdr *arphdr = iobuf->data; + void *sender_pa; + void *target_pa; + int rc; + + /* Do nothing unless ARP contains IPoIB link-layer addresses */ + if ( arphdr->ar_hln != sizeof ( ipoib->mac ) ) + return 0; + + /* Create REMAC cache entry */ + if ( ( rc = ipoib_map_remac ( ipoib, remac, + arp_sender_ha ( 
arphdr ) ) ) != 0 ) { + DBGC ( ipoib, "IPoIB %p could not map REMAC: %s\n", + ipoib, strerror ( rc ) ); + return rc; + } + + /* Construct new packet */ + sender_pa = arp_sender_pa ( arphdr ); + target_pa = arp_target_pa ( arphdr ); + arphdr->ar_hrd = htons ( ARPHRD_ETHER ); + arphdr->ar_hln = ETH_ALEN; + memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln ); + memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln ); + memcpy ( arp_sender_ha ( arphdr ), remac, ETH_ALEN ); + memset ( arp_target_ha ( arphdr ), 0, ETH_ALEN ); + if ( arphdr->ar_op == ARPOP_REPLY ) { + /* Assume received replies were directed to us */ + memcpy ( arp_target_ha ( arphdr ), netdev->ll_addr, ETH_ALEN ); + } + iob_unput ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) ); + + return 0; +} + +/** + * Translate received packet + * + * @v netdev Network device + * @v iobuf Received packet (with no link-layer headers) + * @v remac Constructed Remote Ethernet MAC + * @v net_proto Network-layer protocol (in network byte order) + * @ret rc Return status code + */ +static int ipoib_translate_rx ( struct net_device *netdev, + struct io_buffer *iobuf, + struct ipoib_remac *remac, + uint16_t net_proto ) { + + switch ( net_proto ) { + case htons ( ETH_P_ARP ) : + return ipoib_translate_rx_arp ( netdev, iobuf, remac ); + case htons ( ETH_P_IP ) : + /* No translation needed */ + return 0; + default: + /* Cannot handle other traffic via eIPoIB */ + return -ENOTSUP; + } +} + /**************************************************************************** * * IPoIB network device @@ -422,17 +459,18 @@ static int ipoib_transmit ( struct net_device *netdev, struct io_buffer *iobuf ) { struct ipoib_device *ipoib = netdev->priv; struct ib_device *ibdev = ipoib->ibdev; + struct ethhdr *ethhdr; struct ipoib_hdr *ipoib_hdr; - struct ipoib_peer *peer; + struct ipoib_mac *mac; struct ib_address_vector dest; + uint16_t net_proto; int rc; /* Sanity check */ - if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr 
) ) { + if ( iob_len ( iobuf ) < sizeof ( *ethhdr ) ) { DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib ); return -EINVAL; } - ipoib_hdr = iobuf->data; /* Attempting transmission while link is down will put the * queue pair into an error state, so don't try it. @@ -440,17 +478,30 @@ static int ipoib_transmit ( struct net_device *netdev, if ( ! ib_link_ok ( ibdev ) ) return -ENETUNREACH; + /* Strip eIPoIB header */ + ethhdr = iobuf->data; + net_proto = ethhdr->h_protocol; + iob_pull ( iobuf, sizeof ( *ethhdr ) ); + /* Identify destination address */ - peer = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.dest ); - if ( ! peer ) + mac = ipoib_find_remac ( ipoib, ( ( void *) ethhdr->h_dest ) ); + if ( ! mac ) return -ENXIO; - ipoib_hdr->u.reserved = 0; + + /* Translate packet if applicable */ + if ( ( rc = ipoib_translate_tx ( netdev, iobuf, net_proto ) ) != 0 ) + return rc; + + /* Prepend real IPoIB header */ + ipoib_hdr = iob_push ( iobuf, sizeof ( *ipoib_hdr ) ); + ipoib_hdr->proto = net_proto; + ipoib_hdr->reserved = 0; /* Construct address vector */ memset ( &dest, 0, sizeof ( dest ) ); - dest.qpn = ( ntohl ( peer->mac.flags__qpn ) & IB_QPN_MASK ); + dest.qpn = ( ntohl ( mac->flags__qpn ) & IB_QPN_MASK ); dest.gid_present = 1; - memcpy ( &dest.gid, &peer->mac.gid, sizeof ( dest.gid ) ); + memcpy ( &dest.gid, &mac->gid, sizeof ( dest.gid ) ); if ( ( rc = ib_resolve_path ( ibdev, &dest ) ) != 0 ) { /* Path not resolved yet */ return rc; @@ -487,14 +538,15 @@ static void ipoib_complete_send ( struct ib_device *ibdev __unused, */ static void ipoib_complete_recv ( struct ib_device *ibdev __unused, struct ib_queue_pair *qp, - struct ib_address_vector *dest __unused, + struct ib_address_vector *dest, struct ib_address_vector *source, struct io_buffer *iobuf, int rc ) { struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp ); struct net_device *netdev = ipoib->netdev; struct ipoib_hdr *ipoib_hdr; - struct ipoib_mac ll_src; - struct ipoib_peer *src; + struct ethhdr 
*ethhdr; + struct ipoib_remac remac; + uint16_t net_proto; /* Record errors */ if ( rc != 0 ) { @@ -510,7 +562,6 @@ static void ipoib_complete_recv ( struct ib_device *ibdev __unused, netdev_rx_err ( netdev, iobuf, -EIO ); return; } - ipoib_hdr = iobuf->data; if ( ! source ) { DBGC ( ipoib, "IPoIB %p received packet without address " "vector\n", ipoib ); @@ -518,12 +569,37 @@ static void ipoib_complete_recv ( struct ib_device *ibdev __unused, return; } - /* Parse source address */ - if ( source->gid_present ) { - ll_src.flags__qpn = htonl ( source->qpn ); - memcpy ( &ll_src.gid, &source->gid, sizeof ( ll_src.gid ) ); - src = ipoib_cache_peer ( &ll_src ); - ipoib_hdr->u.peer.src = src->key; + /* Strip real IPoIB header */ + ipoib_hdr = iobuf->data; + net_proto = ipoib_hdr->proto; + iob_pull ( iobuf, sizeof ( *ipoib_hdr ) ); + + /* Construct source address from remote QPN and LID */ + remac.qpn = htonl ( source->qpn | EIPOIB_QPN_LA ); + remac.lid = htons ( source->lid ); + + /* Translate packet if applicable */ + if ( ( rc = ipoib_translate_rx ( netdev, iobuf, &remac, + net_proto ) ) != 0 ) { + netdev_rx_err ( netdev, iobuf, rc ); + return; + } + + /* Prepend eIPoIB header */ + ethhdr = iob_push ( iobuf, sizeof ( *ethhdr ) ); + memcpy ( &ethhdr->h_source, &remac, sizeof ( ethhdr->h_source ) ); + ethhdr->h_protocol = net_proto; + + /* Construct destination address */ + if ( dest->gid_present && ( memcmp ( &dest->gid, &ipoib->broadcast.gid, + sizeof ( dest->gid ) ) == 0 ) ) { + /* Broadcast GID; use the Ethernet broadcast address */ + memcpy ( &ethhdr->h_dest, eth_broadcast, + sizeof ( ethhdr->h_dest ) ); + } else { + /* Assume destination address is local Ethernet MAC */ + memcpy ( &ethhdr->h_dest, netdev->ll_addr, + sizeof ( ethhdr->h_dest ) ); } /* Hand off to network layer */ @@ -536,9 +612,40 @@ static struct ib_completion_queue_operations ipoib_cq_op = { .complete_recv = ipoib_complete_recv, }; +/** + * Allocate IPoIB receive I/O buffer + * + * @v len Length of buffer + * 
@ret iobuf I/O buffer, or NULL + * + * Some Infiniband hardware requires 2kB alignment of receive buffers + * and provides no way to disable header separation. The result is + * that there are only four bytes of link-layer header (the real IPoIB + * header) before the payload. This is not sufficient space to insert + * an eIPoIB link-layer pseudo-header. + * + * We therefore allocate I/O buffers offset to start slightly before + * the natural alignment boundary, in order to allow sufficient space. + */ +static struct io_buffer * ipoib_alloc_iob ( size_t len ) { + struct io_buffer *iobuf; + size_t reserve_len; + + /* Calculate additional length required at start of buffer */ + reserve_len = ( sizeof ( struct ethhdr ) - + sizeof ( struct ipoib_hdr ) ); + + /* Allocate buffer */ + iobuf = alloc_iob_raw ( ( len + reserve_len ), len, -reserve_len ); + if ( iobuf ) { + iob_reserve ( iobuf, reserve_len ); + } + return iobuf; +} + /** IPoIB queue pair operations */ static struct ib_queue_pair_operations ipoib_qp_op = { - .alloc_iob = alloc_iob, + .alloc_iob = ipoib_alloc_iob, }; /** @@ -550,7 +657,11 @@ static void ipoib_poll ( struct net_device *netdev ) { struct ipoib_device *ipoib = netdev->priv; struct ib_device *ibdev = ipoib->ibdev; + /* Poll Infiniband device */ ib_poll_eq ( ibdev ); + + /* Poll the retry timers (required for IPoIB multicast join) */ + retry_poll(); } /** @@ -617,15 +728,14 @@ static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) { static void ipoib_link_state_changed ( struct ib_device *ibdev ) { struct net_device *netdev = ib_get_ownerdata ( ibdev ); struct ipoib_device *ipoib = netdev->priv; - struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr ); int rc; /* Leave existing broadcast group */ ipoib_leave_broadcast_group ( ipoib ); /* Update MAC address based on potentially-new GID prefix */ - memcpy ( &mac->gid.s.prefix, &ibdev->gid.s.prefix, - sizeof ( mac->gid.s.prefix ) ); + memcpy ( &ipoib->mac.gid.s.prefix, 
&ibdev->gid.s.prefix, + sizeof ( ipoib->mac.gid.s.prefix ) ); /* Update broadcast GID based on potentially-new partition key */ ipoib->broadcast.gid.words[2] = @@ -654,7 +764,6 @@ static void ipoib_link_state_changed ( struct ib_device *ibdev ) { static int ipoib_open ( struct net_device *netdev ) { struct ipoib_device *ipoib = netdev->priv; struct ib_device *ibdev = ipoib->ibdev; - struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr ); int rc; /* Open IB device */ @@ -686,7 +795,7 @@ static int ipoib_open ( struct net_device *netdev ) { ib_qp_set_ownerdata ( ipoib->qp, ipoib ); /* Update MAC address with QPN */ - mac->flags__qpn = htonl ( ipoib->qp->qpn ); + ipoib->mac.flags__qpn = htonl ( ipoib->qp->qpn ); /* Fill receive rings */ ib_refill_recv ( ibdev, ipoib->qp ); @@ -713,13 +822,15 @@ static int ipoib_open ( struct net_device *netdev ) { static void ipoib_close ( struct net_device *netdev ) { struct ipoib_device *ipoib = netdev->priv; struct ib_device *ibdev = ipoib->ibdev; - struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr ); + + /* Flush REMAC cache */ + ipoib_flush_remac ( ipoib ); /* Leave broadcast group */ ipoib_leave_broadcast_group ( ipoib ); /* Remove QPN from MAC address */ - mac->flags__qpn = 0; + ipoib->mac.flags__qpn = 0; /* Tear down the queues */ ib_destroy_qp ( ibdev, ipoib->qp ); @@ -759,15 +870,19 @@ static int ipoib_probe ( struct ib_device *ibdev ) { memset ( ipoib, 0, sizeof ( *ipoib ) ); ipoib->netdev = netdev; ipoib->ibdev = ibdev; + INIT_LIST_HEAD ( &ipoib->peers ); /* Extract hardware address */ memcpy ( netdev->hw_addr, &ibdev->gid.s.guid, sizeof ( ibdev->gid.s.guid ) ); - /* Set default broadcast address */ + /* Set local MAC address */ + memcpy ( &ipoib->mac.gid.s.guid, &ibdev->gid.s.guid, + sizeof ( ipoib->mac.gid.s.guid ) ); + + /* Set default broadcast MAC address */ memcpy ( &ipoib->broadcast, &ipoib_broadcast, sizeof ( ipoib->broadcast ) ); - netdev->ll_broadcast = ( ( uint8_t * ) &ipoib->broadcast 
); /* Register network device */ if ( ( rc = register_netdev ( netdev ) ) != 0 ) diff --git a/src/include/ipxe/dhcp.h b/src/include/ipxe/dhcp.h index 34b4d3fd..b97dfe32 100644 --- a/src/include/ipxe/dhcp.h +++ b/src/include/ipxe/dhcp.h @@ -651,8 +651,6 @@ struct dhcphdr { #define PXEBS_SETTINGS_NAME "pxebs" extern uint32_t dhcp_last_xid; -extern unsigned int dhcp_chaddr ( struct net_device *netdev, void *chaddr, - uint16_t *flags ); extern int dhcp_create_packet ( struct dhcp_packet *dhcppkt, struct net_device *netdev, uint8_t msgtype, uint32_t xid, const void *options, diff --git a/src/include/ipxe/ipoib.h b/src/include/ipxe/ipoib.h index e8f12dc5..68ff8df4 100644 --- a/src/include/ipxe/ipoib.h +++ b/src/include/ipxe/ipoib.h @@ -8,6 +8,7 @@ FILE_LICENCE ( GPL2_OR_LATER ); +#include #include /** IPoIB MAC address length */ @@ -33,25 +34,32 @@ struct ipoib_hdr { /** Network-layer protocol */ uint16_t proto; /** Reserved, must be zero */ - union { - /** Reserved, must be zero */ - uint16_t reserved; - /** Peer addresses - * - * We use these fields internally to represent the - * peer addresses using a lookup key. There simply - * isn't enough room in the IPoIB header to store - * literal source or destination MAC addresses. - */ - struct { - /** Destination address key */ - uint8_t dest; - /** Source address key */ - uint8_t src; - } __attribute__ (( packed )) peer; - } __attribute__ (( packed )) u; + uint16_t reserved; } __attribute__ (( packed )); +/** GUID mask used for constructing eIPoIB Local Ethernet MAC address (LEMAC) */ +#define IPOIB_GUID_MASK 0xe7 + +/** eIPoIB Remote Ethernet MAC address + * + * An eIPoIB REMAC address is an Ethernet-like (6 byte) link-layer + * pseudo-address used to look up a full IPoIB link-layer address. + */ +struct ipoib_remac { + /** Remote QPN + * + * Must be ORed with EIPOIB_QPN_LA so that eIPoIB REMAC + * addresses are considered as locally-assigned Ethernet MAC + * addresses. 
+ */ + uint32_t qpn; + /** Remote LID */ + uint16_t lid; +} __attribute__ (( packed )); + +/** eIPoIB REMAC locally-assigned address indicator */ +#define EIPOIB_QPN_LA 0x02000000UL + extern const char * ipoib_ntoa ( const void *ll_addr ); extern struct net_device * alloc_ipoibdev ( size_t priv_size ); diff --git a/src/include/ipxe/netdevice.h b/src/include/ipxe/netdevice.h index 3633a165..e5dbd996 100644 --- a/src/include/ipxe/netdevice.h +++ b/src/include/ipxe/netdevice.h @@ -188,8 +188,17 @@ struct ll_protocol { uint8_t ll_addr_len; /** Link-layer header length */ uint8_t ll_header_len; + /** Flags */ + unsigned int flags; }; +/** Local link-layer address functions only as a name + * + * This flag indicates that the local link-layer address cannot + * directly be used as a destination address by a remote node. + */ +#define LL_NAME_ONLY 0x0001 + /** Network device operations */ struct net_device_operations { /** Open network device diff --git a/src/interface/efi/efi_snp.c b/src/interface/efi/efi_snp.c index 88072aa6..6d7865dd 100644 --- a/src/interface/efi/efi_snp.c +++ b/src/interface/efi/efi_snp.c @@ -590,7 +590,8 @@ efi_snp_transmit ( EFI_SIMPLE_NETWORK_PROTOCOL *snp, /* Allocate buffer */ ll_headroom = ( MAX_LL_HEADER_LEN - ll_header_len ); - iobuf = alloc_iob ( ll_headroom + len ); + iobuf = alloc_iob ( ll_headroom + + ( ( len > IOB_ZLEN ) ? len : IOB_ZLEN ) ); if ( ! 
iobuf ) { DBGC ( snpdev, "SNPDEV %p TX could not allocate %ld-byte " "buffer\n", snpdev, ( ( unsigned long ) len ) ); diff --git a/src/net/udp/dhcp.c b/src/net/udp/dhcp.c index a7206164..e652503d 100644 --- a/src/net/udp/dhcp.c +++ b/src/net/udp/dhcp.c @@ -938,10 +938,18 @@ int dhcp_create_packet ( struct dhcp_packet *dhcppkt, dhcphdr->magic = htonl ( DHCP_MAGIC_COOKIE ); dhcphdr->htype = ntohs ( netdev->ll_protocol->ll_proto ); dhcphdr->op = dhcp_op[msgtype]; - dhcphdr->hlen = dhcp_chaddr ( netdev, dhcphdr->chaddr, - &dhcphdr->flags ); + dhcphdr->hlen = netdev->ll_protocol->ll_addr_len; + memcpy ( dhcphdr->chaddr, netdev->ll_addr, + netdev->ll_protocol->ll_addr_len ); memcpy ( dhcphdr->options, options, options_len ); + /* If the local link-layer address functions only as a name + * (i.e. cannot be used as a destination address), then + * request broadcast responses. + */ + if ( netdev->ll_protocol->flags & LL_NAME_ONLY ) + dhcphdr->flags |= htons ( BOOTP_FL_BROADCAST ); + /* If the network device already has an IPv4 address then * unicast responses from the DHCP server may be rejected, so * request broadcast responses. diff --git a/src/usr/dhcpmgmt.c b/src/usr/dhcpmgmt.c index 1bd9bff0..b61c01aa 100644 --- a/src/usr/dhcpmgmt.c +++ b/src/usr/dhcpmgmt.c @@ -38,10 +38,6 @@ FILE_LICENCE ( GPL2_OR_LATER ); */ int dhcp ( struct net_device *netdev ) { - struct dhcphdr *dhcphdr; - typeof ( dhcphdr->chaddr ) chaddr; - unsigned int hlen; - unsigned int i; int rc; /* Check we can open the interface first */ @@ -53,12 +49,8 @@ int dhcp ( struct net_device *netdev ) { return rc; /* Perform DHCP */ - printf ( "DHCP (%s", netdev->name ); - hlen = dhcp_chaddr ( netdev, chaddr, NULL ); - for ( i = 0 ; i < hlen ; i++ ) - printf ( "%c%02x", ( i ? 
':' : ' ' ), chaddr[i] ); - printf ( ")" ); - + printf ( "DHCP (%s %s)", netdev->name, + netdev->ll_protocol->ntoa ( netdev->ll_addr ) ); if ( ( rc = start_dhcp ( &monojob, netdev ) ) == 0 ) { rc = monojob_wait ( "" ); } else if ( rc > 0 ) {