david/ipxe
david
/
ipxe
Archived
1
0
Fork 0

[ipoib] Expose Ethernet-compatible eIPoIB link-layer addresses and headers

Almost all clients of the raw-packet interfaces (UNDI and SNP) can
handle only Ethernet link layers.  Expose an Ethernet-compatible link
layer to local clients, while remaining compatible with IPoIB on the
wire.  This requires manipulation of ARP (but not DHCP) packets within
the IPoIB driver.

This is ugly, but it's the only viable way to allow IPoIB devices to
be driven via the raw-packet interfaces.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
This commit is contained in:
Michael Brown 2012-08-29 23:34:14 +01:00
parent f54a61e434
commit 03f0c23f8b
8 changed files with 435 additions and 303 deletions

View File

@ -358,7 +358,8 @@ pxenv_undi_transmit ( struct s_PXENV_UNDI_TRANSMIT *undi_transmit ) {
} }
/* Allocate and fill I/O buffer */ /* Allocate and fill I/O buffer */
iobuf = alloc_iob ( MAX_LL_HEADER_LEN + len ); iobuf = alloc_iob ( MAX_LL_HEADER_LEN +
( ( len > IOB_ZLEN ) ? len : IOB_ZLEN ) );
if ( ! iobuf ) { if ( ! iobuf ) {
DBGC2 ( &pxe_netdev, " could not allocate iobuf\n" ); DBGC2 ( &pxe_netdev, " could not allocate iobuf\n" );
undi_transmit->Status = PXENV_STATUS_OUT_OF_RESOURCES; undi_transmit->Status = PXENV_STATUS_OUT_OF_RESOURCES;

View File

@ -20,18 +20,23 @@
FILE_LICENCE ( GPL2_OR_LATER ); FILE_LICENCE ( GPL2_OR_LATER );
#include <stdint.h> #include <stdint.h>
#include <stdlib.h>
#include <stdio.h> #include <stdio.h>
#include <unistd.h> #include <unistd.h>
#include <string.h> #include <string.h>
#include <byteswap.h> #include <byteswap.h>
#include <errno.h> #include <errno.h>
#include <ipxe/errortab.h> #include <ipxe/errortab.h>
#include <ipxe/malloc.h>
#include <ipxe/if_arp.h> #include <ipxe/if_arp.h>
#include <ipxe/if_ether.h>
#include <ipxe/ethernet.h>
#include <ipxe/iobuf.h> #include <ipxe/iobuf.h>
#include <ipxe/netdevice.h> #include <ipxe/netdevice.h>
#include <ipxe/infiniband.h> #include <ipxe/infiniband.h>
#include <ipxe/ib_pathrec.h> #include <ipxe/ib_pathrec.h>
#include <ipxe/ib_mcast.h> #include <ipxe/ib_mcast.h>
#include <ipxe/retry.h>
#include <ipxe/ipoib.h> #include <ipxe/ipoib.h>
/** @file /** @file
@ -58,6 +63,8 @@ struct ipoib_device {
struct ib_completion_queue *cq; struct ib_completion_queue *cq;
/** Queue pair */ /** Queue pair */
struct ib_queue_pair *qp; struct ib_queue_pair *qp;
/** Local MAC */
struct ipoib_mac mac;
/** Broadcast MAC */ /** Broadcast MAC */
struct ipoib_mac broadcast; struct ipoib_mac broadcast;
/** Joined to IPv4 broadcast multicast group /** Joined to IPv4 broadcast multicast group
@ -68,6 +75,8 @@ struct ipoib_device {
int broadcast_joined; int broadcast_joined;
/** IPv4 broadcast multicast group membership */ /** IPv4 broadcast multicast group membership */
struct ib_mc_membership broadcast_membership; struct ib_mc_membership broadcast_membership;
/** REMAC cache */
struct list_head peers;
}; };
/** Broadcast IPoIB address */ /** Broadcast IPoIB address */
@ -89,99 +98,134 @@ struct errortab ipoib_errors[] __errortab = {
/**************************************************************************** /****************************************************************************
* *
* IPoIB peer cache * IPoIB REMAC cache
* *
**************************************************************************** ****************************************************************************
*/ */
/** /** An IPoIB REMAC cache entry */
* IPoIB peer address
*
* The IPoIB link-layer header is only four bytes long and so does not
* have sufficient room to store IPoIB MAC address(es). We therefore
* maintain a cache of MAC addresses identified by a single-byte key,
* and abuse the spare two bytes within the link-layer header to
* communicate these MAC addresses between the link-layer code and the
* netdevice driver.
*/
struct ipoib_peer { struct ipoib_peer {
/** Key */ /** List of REMAC cache entries */
uint8_t key; struct list_head list;
/** Remote Ethernet MAC */
struct ipoib_remac remac;
/** MAC address */ /** MAC address */
struct ipoib_mac mac; struct ipoib_mac mac;
}; };
/** Number of IPoIB peer cache entries
*
* Must be a power of two.
*/
#define IPOIB_NUM_CACHED_PEERS 4
/** IPoIB peer address cache */
static struct ipoib_peer ipoib_peer_cache[IPOIB_NUM_CACHED_PEERS];
/** Oldest IPoIB peer cache entry index */
static unsigned int ipoib_peer_cache_idx = 0;
/** IPoIB peer cache entry validity flag */
#define IPOIB_PEER_KEY_VALID 0x80
/** /**
* Look up cached peer by key * Find IPoIB MAC from REMAC
* *
* @v key Peer cache key * @v ipoib IPoIB device
* @ret peer Peer cache entry, or NULL * @v remac Remote Ethernet MAC
* @ret mac IPoIB MAC (or NULL if not found)
*/ */
static struct ipoib_peer * ipoib_lookup_peer_by_key ( unsigned int key ) { static struct ipoib_mac * ipoib_find_remac ( struct ipoib_device *ipoib,
const struct ipoib_remac *remac ) {
struct ipoib_peer *peer; struct ipoib_peer *peer;
unsigned int i;
if ( ! key ) /* Check for broadcast REMAC */
return NULL; if ( is_broadcast_ether_addr ( remac ) )
return &ipoib->broadcast;
for ( i = 0 ; i < IPOIB_NUM_CACHED_PEERS ; i++ ) { /* Try to find via REMAC cache */
peer = &ipoib_peer_cache[i]; list_for_each_entry ( peer, &ipoib->peers, list ) {
if ( peer->key == key ) if ( memcmp ( remac, &peer->remac,
return peer; sizeof ( peer->remac ) ) == 0 ) {
/* Move peer to start of list */
list_del ( &peer->list );
list_add ( &peer->list, &ipoib->peers );
return &peer->mac;
}
} }
DBG ( "IPoIB warning: peer cache lost track of key %x while still in " DBGC ( ipoib, "IPoIB %p unknown REMAC %s\n",
"use\n", key ); ipoib, eth_ntoa ( remac ) );
return NULL; return NULL;
} }
/** /**
* Store GID and QPN in peer cache * Add IPoIB MAC to REMAC cache
* *
* @v mac Peer MAC address * @v ipoib IPoIB device
* @ret peer Peer cache entry * @v remac Remote Ethernet MAC
* @v mac IPoIB MAC
* @ret rc Return status code
*/ */
static struct ipoib_peer * ipoib_cache_peer ( const struct ipoib_mac *mac ) { static int ipoib_map_remac ( struct ipoib_device *ipoib,
const struct ipoib_remac *remac,
const struct ipoib_mac *mac ) {
struct ipoib_peer *peer; struct ipoib_peer *peer;
uint8_t key;
unsigned int i;
/* Look for existing cache entry */ /* Check for existing entry in REMAC cache */
for ( i = 0 ; i < IPOIB_NUM_CACHED_PEERS ; i++ ) { list_for_each_entry ( peer, &ipoib->peers, list ) {
peer = &ipoib_peer_cache[i]; if ( memcmp ( remac, &peer->remac,
if ( memcmp ( &peer->mac, mac, sizeof ( peer->mac ) ) == 0 ) sizeof ( peer->remac ) ) == 0 ) {
return peer; /* Move peer to start of list */
list_del ( &peer->list );
list_add ( &peer->list, &ipoib->peers );
/* Update MAC */
memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
return 0;
}
} }
/* No entry found: create a new one */ /* Create new entry */
key = ( ipoib_peer_cache_idx++ | IPOIB_PEER_KEY_VALID ); peer = malloc ( sizeof ( *peer ) );
peer = &ipoib_peer_cache[ key % IPOIB_NUM_CACHED_PEERS ]; if ( ! peer )
if ( peer->key ) return -ENOMEM;
DBG ( "IPoIB peer %x evicted from cache\n", peer->key ); memcpy ( &peer->remac, remac, sizeof ( peer->remac ) );
memset ( peer, 0, sizeof ( *peer ) );
peer->key = key;
memcpy ( &peer->mac, mac, sizeof ( peer->mac ) ); memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
DBG ( "IPoIB peer %x has MAC %s\n", list_add ( &peer->list, &ipoib->peers );
peer->key, ipoib_ntoa ( &peer->mac ) );
return peer; return 0;
} }
/**
 * Empty the REMAC cache
 *
 * @v ipoib		IPoIB device
 *
 * Unlinks every entry from the device's peer list and frees it.
 */
static void ipoib_flush_remac ( struct ipoib_device *ipoib ) {
	struct ipoib_peer *entry;
	struct ipoib_peer *next;

	/* Entries are freed while walking, hence the "safe" iterator */
	list_for_each_entry_safe ( entry, next, &ipoib->peers, list ) {
		list_del ( &entry->list );
		free ( entry );
	}
}
/**
 * Discard some entries from the REMAC cache
 *
 * @ret discarded	Number of cached items discarded
 *
 * Frees at most one entry per Infiniband device: the entry at the
 * tail of that device's peer list.  Lookups move entries to the head
 * of the list, so the tail is the least recently used entry.
 */
static unsigned int ipoib_discard_remac ( void ) {
	struct ib_device *ibdev;
	struct net_device *netdev;
	struct ipoib_device *ipoib;
	struct ipoib_peer *peer;
	unsigned int discarded = 0;

	/* Try to discard one cache entry for each IPoIB device */
	for_each_ibdev ( ibdev ) {

		/* The Infiniband device's owner data is the network
		 * device (see ipoib_link_state_changed()); the IPoIB
		 * device is that network device's private data.
		 * Treating the owner data directly as an IPoIB device
		 * would walk a bogus list head.
		 */
		netdev = ib_get_ownerdata ( ibdev );
		if ( ! netdev ) {
			/* NOTE(review): assumes a NULL owner means no
			 * IPoIB device is attached - confirm.
			 */
			continue;
		}
		ipoib = netdev->priv;

		/* Discard only the last entry (if any) */
		list_for_each_entry_reverse ( peer, &ipoib->peers, list ) {
			list_del ( &peer->list );
			free ( peer );
			discarded++;
			break;
		}
	}

	return discarded;
}
/** IPoIB cache discarder
 *
 * Registers ipoib_discard_remac() as the discard method, so that
 * REMAC cache entries can be given up on request.
 * NOTE(review): CACHE_NORMAL is presumably the discard priority -
 * confirm against the cache discarder definition.
 */
struct cache_discarder ipoib_discarder __cache_discarder ( CACHE_NORMAL ) = {
.discard = ipoib_discard_remac,
};
/**************************************************************************** /****************************************************************************
* *
* IPoIB link layer * IPoIB link layer
@ -189,85 +233,6 @@ static struct ipoib_peer * ipoib_cache_peer ( const struct ipoib_mac *mac ) {
**************************************************************************** ****************************************************************************
*/ */
/**
 * Add IPoIB link-layer header
 *
 * @v netdev		Network device
 * @v iobuf		I/O buffer
 * @v ll_dest		Link-layer destination address
 * @v ll_source		Source link-layer address
 * @v net_proto		Network-layer protocol, in network-byte order
 * @ret rc		Return status code
 *
 * The header has no room for full IPoIB MAC addresses, so both
 * addresses are entered into the peer cache and only their
 * single-byte cache keys are stored in the header.
 */
static int ipoib_push ( struct net_device *netdev __unused,
			struct io_buffer *iobuf, const void *ll_dest,
			const void *ll_source, uint16_t net_proto ) {
	struct ipoib_hdr *ipoib_hdr =
		iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
	const struct ipoib_mac *dest_mac = ll_dest;
	const struct ipoib_mac *src_mac = ll_source;
	struct ipoib_peer *dest;
	struct ipoib_peer *src;

	/* Add link-layer addresses to cache.  Note that ll_source is
	 * genuinely used here, so it must not be marked __unused.
	 */
	dest = ipoib_cache_peer ( dest_mac );
	src = ipoib_cache_peer ( src_mac );

	/* Build IPoIB header */
	ipoib_hdr->proto = net_proto;
	ipoib_hdr->u.peer.dest = dest->key;
	ipoib_hdr->u.peer.src = src->key;

	return 0;
}
/**
 * Strip IPoIB link-layer header
 *
 * @v netdev		Network device
 * @v iobuf		I/O buffer
 * @ret ll_dest		Link-layer destination address
 * @ret ll_source	Source link-layer address
 * @ret net_proto	Network-layer protocol, in network-byte order
 * @ret flags		Packet flags
 * @ret rc		Return status code
 *
 * The header carries peer cache keys rather than addresses.  A key
 * with no cache entry is reported as the device's broadcast address.
 */
static int ipoib_pull ( struct net_device *netdev,
			struct io_buffer *iobuf, const void **ll_dest,
			const void **ll_source, uint16_t *net_proto,
			unsigned int *flags ) {
	struct ipoib_device *ipoib = netdev->priv;
	struct ipoib_hdr *hdr = iobuf->data;
	struct ipoib_peer *dest_peer;
	struct ipoib_peer *src_peer;

	/* Refuse packets too short to hold the header */
	if ( iob_len ( iobuf ) < sizeof ( *hdr ) ) {
		DBG ( "IPoIB packet too short for link-layer header\n" );
		DBG_HD ( iobuf->data, iob_len ( iobuf ) );
		return -EINVAL;
	}

	/* Remove the header; its contents remain readable via hdr */
	iob_pull ( iobuf, sizeof ( *hdr ) );

	/* Resolve both keys, then restore the reserved word to zero */
	dest_peer = ipoib_lookup_peer_by_key ( hdr->u.peer.dest );
	src_peer = ipoib_lookup_peer_by_key ( hdr->u.peer.src );
	hdr->u.reserved = 0;

	/* Report destination address */
	if ( dest_peer ) {
		*ll_dest = &dest_peer->mac;
	} else {
		*ll_dest = &ipoib->broadcast;
	}

	/* Report source address */
	if ( src_peer ) {
		*ll_source = &src_peer->mac;
	} else {
		*ll_source = &ipoib->broadcast;
	}

	/* Report protocol and flags */
	*net_proto = hdr->proto;
	if ( *ll_dest == &ipoib->broadcast ) {
		*flags = ( LL_MULTICAST | LL_BROADCAST );
	} else {
		*flags = 0;
	}

	return 0;
}
/** /**
* Initialise IPoIB link-layer address * Initialise IPoIB link-layer address
* *
@ -275,115 +240,32 @@ static int ipoib_pull ( struct net_device *netdev,
* @v ll_addr Link-layer address * @v ll_addr Link-layer address
*/ */
static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) { static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) {
const union ib_guid *guid = hw_addr; const uint8_t *guid = hw_addr;
struct ipoib_mac *mac = ll_addr; uint8_t *eth_addr = ll_addr;
uint8_t guid_mask = IPOIB_GUID_MASK;
memset ( mac, 0, sizeof ( *mac ) );
memcpy ( &mac->gid.s.guid, guid, sizeof ( mac->gid.s.guid ) );
}
/**
 * Format an IPoIB link-layer address as text
 *
 * @v ll_addr		Link-layer address
 * @ret string		Link-layer address in human-readable format
 *
 * The result lives in a single static buffer, so each call
 * overwrites the string returned by the previous one.
 */
const char * ipoib_ntoa ( const void *ll_addr ) {
	/* Five 8-digit hex groups, four colons and a terminating NUL */
	static char text[45];
	const struct ipoib_mac *addr = ll_addr;

	snprintf ( text, sizeof ( text ), "%08x:%08x:%08x:%08x:%08x",
		   htonl ( addr->flags__qpn ),
		   htonl ( addr->gid.dwords[0] ),
		   htonl ( addr->gid.dwords[1] ),
		   htonl ( addr->gid.dwords[2] ),
		   htonl ( addr->gid.dwords[3] ) );

	return text;
}
/**
 * Hash multicast address
 *
 * @v af		Address family
 * @v net_addr		Network-layer address
 * @v ll_addr		Link-layer address to fill in
 * @ret rc		Return status code
 *
 * Multicast hashing is not implemented for IPoIB: every argument is
 * ignored and the call always fails with -ENOTSUP.
 */
static int ipoib_mc_hash ( unsigned int af __unused,
			   const void *net_addr __unused,
			   void *ll_addr __unused ) {

	/* Not supported */
	return -ENOTSUP;
}
/**
 * Generate Mellanox Ethernet-compatible compressed link-layer address
 *
 * @v guid		Port GUID
 * @v eth_addr		Ethernet-compatible address to fill in
 * @ret rc		Return status code (always zero)
 *
 * GUID bytes 1, 2, 5, 6 and 7 are copied into the last five bytes of
 * the Ethernet address.  The first byte is 0x00 when GUID byte 3 is
 * 2, and 0x02 otherwise.
 */
static int ipoib_mlx_eth_addr ( const union ib_guid *guid,
				uint8_t *eth_addr ) {

	/* First byte depends upon GUID byte 3 */
	if ( guid->bytes[3] == 2 ) {
		eth_addr[0] = 0x00;
	} else {
		eth_addr[0] = 0x02;
	}

	/* Remaining bytes are taken directly from the GUID */
	eth_addr[1] = guid->bytes[1];
	eth_addr[2] = guid->bytes[2];
	eth_addr[3] = guid->bytes[5];
	eth_addr[4] = guid->bytes[6];
	eth_addr[5] = guid->bytes[7];

	return 0;
}
/** An IPoIB Ethernet-compatible compressed link-layer address generator
 *
 * Pairs two GUID byte values with the function used to build an
 * Ethernet-compatible address for GUIDs carrying those values.
 */
struct ipoib_eth_addr_handler {
/** GUID byte 1 */
uint8_t byte1;
/** GUID byte 2 */
uint8_t byte2;
/** Handler
 *
 * Fills in eth_addr from guid and returns a status code.
 */
int ( * eth_addr ) ( const union ib_guid *guid,
uint8_t *eth_addr );
};
/** IPoIB Ethernet-compatible compressed link-layer address generators
 *
 * Each row gives the expected values of GUID bytes 1 and 2, followed
 * by the generator to use for matching GUIDs.
 */
static struct ipoib_eth_addr_handler ipoib_eth_addr_handlers[] = {
/* GUID bytes 02:c9 are handled by the Mellanox generator */
{ 0x02, 0xc9, ipoib_mlx_eth_addr },
};
/**
* Generate Ethernet-compatible compressed link-layer address
*
* @v ll_addr Link-layer address
* @v eth_addr Ethernet-compatible address to fill in
*/
static int ipoib_eth_addr ( const void *ll_addr, void *eth_addr ) {
const struct ipoib_mac *ipoib_addr = ll_addr;
const union ib_guid *guid = &ipoib_addr->gid.s.guid;
struct ipoib_eth_addr_handler *handler;
unsigned int i; unsigned int i;
for ( i = 0 ; i < ( sizeof ( ipoib_eth_addr_handlers ) / /* Extract bytes from GUID according to mask */
sizeof ( ipoib_eth_addr_handlers[0] ) ) ; i++ ) { for ( i = 0 ; i < 8 ; i++, guid++, guid_mask <<= 1 ) {
handler = &ipoib_eth_addr_handlers[i]; if ( guid_mask & 0x80 )
if ( ( handler->byte1 == guid->bytes[1] ) && *(eth_addr++) = *guid;
( handler->byte2 == guid->bytes[2] ) ) {
return handler->eth_addr ( guid, eth_addr );
}
} }
return -ENOTSUP;
} }
/** IPoIB protocol */ /** IPoIB protocol */
struct ll_protocol ipoib_protocol __ll_protocol = { struct ll_protocol ipoib_protocol __ll_protocol = {
.name = "IPoIB", .name = "IPoIB",
.ll_proto = htons ( ARPHRD_INFINIBAND ), .ll_proto = htons ( ARPHRD_ETHER ),
.hw_addr_len = sizeof ( union ib_guid ), .hw_addr_len = sizeof ( union ib_guid ),
.ll_addr_len = IPOIB_ALEN, .ll_addr_len = ETH_ALEN,
.ll_header_len = IPOIB_HLEN, .ll_header_len = ETH_HLEN,
.push = ipoib_push, .push = eth_push,
.pull = ipoib_pull, .pull = eth_pull,
.init_addr = ipoib_init_addr, .init_addr = ipoib_init_addr,
.ntoa = ipoib_ntoa, .ntoa = eth_ntoa,
.mc_hash = ipoib_mc_hash, .mc_hash = eth_mc_hash,
.eth_addr = ipoib_eth_addr, .eth_addr = eth_eth_addr,
.flags = LL_NAME_ONLY,
}; };
/** /**
@ -398,12 +280,167 @@ struct net_device * alloc_ipoibdev ( size_t priv_size ) {
netdev = alloc_netdev ( priv_size ); netdev = alloc_netdev ( priv_size );
if ( netdev ) { if ( netdev ) {
netdev->ll_protocol = &ipoib_protocol; netdev->ll_protocol = &ipoib_protocol;
netdev->ll_broadcast = ( uint8_t * ) &ipoib_broadcast; netdev->ll_broadcast = eth_broadcast;
netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE; netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE;
} }
return netdev; return netdev;
} }
/****************************************************************************
*
* IPoIB translation layer
*
****************************************************************************
*/
/**
* Translate transmitted ARP packet
*
* @v netdev Network device
* @v iobuf Packet to be transmitted (with no link-layer headers)
* @ret rc Return status code
*/
static int ipoib_translate_tx_arp ( struct net_device *netdev,
struct io_buffer *iobuf ) {
struct ipoib_device *ipoib = netdev->priv;
struct arphdr *arphdr = iobuf->data;
struct ipoib_mac *target_ha = NULL;
void *sender_pa;
void *target_pa;
/* Do nothing unless ARP contains eIPoIB link-layer addresses */
if ( arphdr->ar_hln != ETH_ALEN )
return 0;
/* Fail unless we have room to expand packet */
if ( iob_tailroom ( iobuf ) < ( 2 * ( sizeof ( ipoib->mac ) -
ETH_ALEN ) ) ) {
DBGC ( ipoib, "IPoIB %p insufficient space in TX ARP\n",
ipoib );
return -ENOBUFS;
}
/* Look up REMAC, if applicable */
if ( arphdr->ar_op == ARPOP_REPLY ) {
target_ha = ipoib_find_remac ( ipoib, arp_target_pa ( arphdr ));
if ( ! target_ha )
return -ENXIO;
}
/* Construct new packet */
iob_put ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
sender_pa = arp_sender_pa ( arphdr );
target_pa = arp_target_pa ( arphdr );
arphdr->ar_hrd = htons ( ARPHRD_INFINIBAND );
arphdr->ar_hln = sizeof ( ipoib->mac );
memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
memcpy ( arp_sender_ha ( arphdr ), &ipoib->mac, sizeof ( ipoib->mac ) );
memset ( arp_target_ha ( arphdr ), 0, sizeof ( ipoib->mac ) );
if ( target_ha ) {
memcpy ( arp_target_ha ( arphdr ), target_ha,
sizeof ( *target_ha ) );
}
return 0;
}
/**
 * Translate transmitted packet
 *
 * @v netdev		Network device
 * @v iobuf		Packet to be transmitted (with no link-layer headers)
 * @v net_proto		Network-layer protocol (in network byte order)
 * @ret rc		Return status code
 *
 * ARP packets are rewritten, IPv4 packets pass through unchanged,
 * and every other protocol is rejected with -ENOTSUP.
 */
static int ipoib_translate_tx ( struct net_device *netdev,
				struct io_buffer *iobuf, uint16_t net_proto ) {

	/* ARP carries link-layer addresses and must be rewritten */
	if ( net_proto == htons ( ETH_P_ARP ) )
		return ipoib_translate_tx_arp ( netdev, iobuf );

	/* No translation needed for IPv4 */
	if ( net_proto == htons ( ETH_P_IP ) )
		return 0;

	/* Cannot handle other traffic via eIPoIB */
	return -ENOTSUP;
}
/**
* Translate received ARP packet
*
* @v netdev Network device
* @v iobuf Received packet (with no link-layer headers)
* @v remac Constructed Remote Ethernet MAC
* @ret rc Return status code
*/
static int ipoib_translate_rx_arp ( struct net_device *netdev,
struct io_buffer *iobuf,
struct ipoib_remac *remac ) {
struct ipoib_device *ipoib = netdev->priv;
struct arphdr *arphdr = iobuf->data;
void *sender_pa;
void *target_pa;
int rc;
/* Do nothing unless ARP contains IPoIB link-layer addresses */
if ( arphdr->ar_hln != sizeof ( ipoib->mac ) )
return 0;
/* Create REMAC cache entry */
if ( ( rc = ipoib_map_remac ( ipoib, remac,
arp_sender_ha ( arphdr ) ) ) != 0 ) {
DBGC ( ipoib, "IPoIB %p could not map REMAC: %s\n",
ipoib, strerror ( rc ) );
return rc;
}
/* Construct new packet */
sender_pa = arp_sender_pa ( arphdr );
target_pa = arp_target_pa ( arphdr );
arphdr->ar_hrd = htons ( ARPHRD_ETHER );
arphdr->ar_hln = ETH_ALEN;
memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
memcpy ( arp_sender_ha ( arphdr ), remac, ETH_ALEN );
memset ( arp_target_ha ( arphdr ), 0, ETH_ALEN );
if ( arphdr->ar_op == ARPOP_REPLY ) {
/* Assume received replies were directed to us */
memcpy ( arp_target_ha ( arphdr ), netdev->ll_addr, ETH_ALEN );
}
iob_unput ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
return 0;
}
/**
 * Translate received packet
 *
 * @v netdev		Network device
 * @v iobuf		Received packet (with no link-layer headers)
 * @v remac		Constructed Remote Ethernet MAC
 * @v net_proto		Network-layer protocol (in network byte order)
 * @ret rc		Return status code
 *
 * ARP packets are rewritten, IPv4 packets pass through unchanged,
 * and every other protocol is rejected with -ENOTSUP.
 */
static int ipoib_translate_rx ( struct net_device *netdev,
				struct io_buffer *iobuf,
				struct ipoib_remac *remac,
				uint16_t net_proto ) {

	/* ARP carries link-layer addresses and must be rewritten */
	if ( net_proto == htons ( ETH_P_ARP ) )
		return ipoib_translate_rx_arp ( netdev, iobuf, remac );

	/* No translation needed for IPv4 */
	if ( net_proto == htons ( ETH_P_IP ) )
		return 0;

	/* Cannot handle other traffic via eIPoIB */
	return -ENOTSUP;
}
/**************************************************************************** /****************************************************************************
* *
* IPoIB network device * IPoIB network device
@ -422,17 +459,18 @@ static int ipoib_transmit ( struct net_device *netdev,
struct io_buffer *iobuf ) { struct io_buffer *iobuf ) {
struct ipoib_device *ipoib = netdev->priv; struct ipoib_device *ipoib = netdev->priv;
struct ib_device *ibdev = ipoib->ibdev; struct ib_device *ibdev = ipoib->ibdev;
struct ethhdr *ethhdr;
struct ipoib_hdr *ipoib_hdr; struct ipoib_hdr *ipoib_hdr;
struct ipoib_peer *peer; struct ipoib_mac *mac;
struct ib_address_vector dest; struct ib_address_vector dest;
uint16_t net_proto;
int rc; int rc;
/* Sanity check */ /* Sanity check */
if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) { if ( iob_len ( iobuf ) < sizeof ( *ethhdr ) ) {
DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib ); DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
return -EINVAL; return -EINVAL;
} }
ipoib_hdr = iobuf->data;
/* Attempting transmission while link is down will put the /* Attempting transmission while link is down will put the
* queue pair into an error state, so don't try it. * queue pair into an error state, so don't try it.
@ -440,17 +478,30 @@ static int ipoib_transmit ( struct net_device *netdev,
if ( ! ib_link_ok ( ibdev ) ) if ( ! ib_link_ok ( ibdev ) )
return -ENETUNREACH; return -ENETUNREACH;
/* Strip eIPoIB header */
ethhdr = iobuf->data;
net_proto = ethhdr->h_protocol;
iob_pull ( iobuf, sizeof ( *ethhdr ) );
/* Identify destination address */ /* Identify destination address */
peer = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.dest ); mac = ipoib_find_remac ( ipoib, ( ( void *) ethhdr->h_dest ) );
if ( ! peer ) if ( ! mac )
return -ENXIO; return -ENXIO;
ipoib_hdr->u.reserved = 0;
/* Translate packet if applicable */
if ( ( rc = ipoib_translate_tx ( netdev, iobuf, net_proto ) ) != 0 )
return rc;
/* Prepend real IPoIB header */
ipoib_hdr = iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
ipoib_hdr->proto = net_proto;
ipoib_hdr->reserved = 0;
/* Construct address vector */ /* Construct address vector */
memset ( &dest, 0, sizeof ( dest ) ); memset ( &dest, 0, sizeof ( dest ) );
dest.qpn = ( ntohl ( peer->mac.flags__qpn ) & IB_QPN_MASK ); dest.qpn = ( ntohl ( mac->flags__qpn ) & IB_QPN_MASK );
dest.gid_present = 1; dest.gid_present = 1;
memcpy ( &dest.gid, &peer->mac.gid, sizeof ( dest.gid ) ); memcpy ( &dest.gid, &mac->gid, sizeof ( dest.gid ) );
if ( ( rc = ib_resolve_path ( ibdev, &dest ) ) != 0 ) { if ( ( rc = ib_resolve_path ( ibdev, &dest ) ) != 0 ) {
/* Path not resolved yet */ /* Path not resolved yet */
return rc; return rc;
@ -487,14 +538,15 @@ static void ipoib_complete_send ( struct ib_device *ibdev __unused,
*/ */
static void ipoib_complete_recv ( struct ib_device *ibdev __unused, static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
struct ib_queue_pair *qp, struct ib_queue_pair *qp,
struct ib_address_vector *dest __unused, struct ib_address_vector *dest,
struct ib_address_vector *source, struct ib_address_vector *source,
struct io_buffer *iobuf, int rc ) { struct io_buffer *iobuf, int rc ) {
struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp ); struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
struct net_device *netdev = ipoib->netdev; struct net_device *netdev = ipoib->netdev;
struct ipoib_hdr *ipoib_hdr; struct ipoib_hdr *ipoib_hdr;
struct ipoib_mac ll_src; struct ethhdr *ethhdr;
struct ipoib_peer *src; struct ipoib_remac remac;
uint16_t net_proto;
/* Record errors */ /* Record errors */
if ( rc != 0 ) { if ( rc != 0 ) {
@ -510,7 +562,6 @@ static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
netdev_rx_err ( netdev, iobuf, -EIO ); netdev_rx_err ( netdev, iobuf, -EIO );
return; return;
} }
ipoib_hdr = iobuf->data;
if ( ! source ) { if ( ! source ) {
DBGC ( ipoib, "IPoIB %p received packet without address " DBGC ( ipoib, "IPoIB %p received packet without address "
"vector\n", ipoib ); "vector\n", ipoib );
@ -518,12 +569,37 @@ static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
return; return;
} }
/* Parse source address */ /* Strip real IPoIB header */
if ( source->gid_present ) { ipoib_hdr = iobuf->data;
ll_src.flags__qpn = htonl ( source->qpn ); net_proto = ipoib_hdr->proto;
memcpy ( &ll_src.gid, &source->gid, sizeof ( ll_src.gid ) ); iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
src = ipoib_cache_peer ( &ll_src );
ipoib_hdr->u.peer.src = src->key; /* Construct source address from remote QPN and LID */
remac.qpn = htonl ( source->qpn | EIPOIB_QPN_LA );
remac.lid = htons ( source->lid );
/* Translate packet if applicable */
if ( ( rc = ipoib_translate_rx ( netdev, iobuf, &remac,
net_proto ) ) != 0 ) {
netdev_rx_err ( netdev, iobuf, rc );
return;
}
/* Prepend eIPoIB header */
ethhdr = iob_push ( iobuf, sizeof ( *ethhdr ) );
memcpy ( &ethhdr->h_source, &remac, sizeof ( ethhdr->h_source ) );
ethhdr->h_protocol = net_proto;
/* Construct destination address */
if ( dest->gid_present && ( memcmp ( &dest->gid, &ipoib->broadcast.gid,
sizeof ( dest->gid ) ) == 0 ) ) {
/* Broadcast GID; use the Ethernet broadcast address */
memcpy ( &ethhdr->h_dest, eth_broadcast,
sizeof ( ethhdr->h_dest ) );
} else {
/* Assume destination address is local Ethernet MAC */
memcpy ( &ethhdr->h_dest, netdev->ll_addr,
sizeof ( ethhdr->h_dest ) );
} }
/* Hand off to network layer */ /* Hand off to network layer */
@ -536,9 +612,40 @@ static struct ib_completion_queue_operations ipoib_cq_op = {
.complete_recv = ipoib_complete_recv, .complete_recv = ipoib_complete_recv,
}; };
/**
 * Allocate IPoIB receive I/O buffer
 *
 * @v len		Length of buffer
 * @ret iobuf		I/O buffer, or NULL
 *
 * Some Infiniband hardware requires 2kB alignment of receive buffers
 * and provides no way to disable header separation.  Only the four
 * bytes of the real IPoIB header then precede the payload, which is
 * too little room for the larger eIPoIB link-layer pseudo-header.
 *
 * The buffer is therefore allocated so that it starts slightly
 * before the natural alignment boundary, and the extra bytes are
 * reserved as headroom.
 */
static struct io_buffer * ipoib_alloc_iob ( size_t len ) {
	/* Headroom needed: pseudo-header size minus real header size */
	size_t headroom = ( sizeof ( struct ethhdr ) -
			    sizeof ( struct ipoib_hdr ) );
	struct io_buffer *iobuf;

	/* Allocate enlarged buffer, offset back by the headroom */
	iobuf = alloc_iob_raw ( ( len + headroom ), len, -headroom );
	if ( ! iobuf )
		return NULL;

	/* Skip the headroom so that data starts at the boundary */
	iob_reserve ( iobuf, headroom );
	return iobuf;
}
/** IPoIB queue pair operations */ /** IPoIB queue pair operations */
static struct ib_queue_pair_operations ipoib_qp_op = { static struct ib_queue_pair_operations ipoib_qp_op = {
.alloc_iob = alloc_iob, .alloc_iob = ipoib_alloc_iob,
}; };
/** /**
@ -550,7 +657,11 @@ static void ipoib_poll ( struct net_device *netdev ) {
struct ipoib_device *ipoib = netdev->priv; struct ipoib_device *ipoib = netdev->priv;
struct ib_device *ibdev = ipoib->ibdev; struct ib_device *ibdev = ipoib->ibdev;
/* Poll Infiniband device */
ib_poll_eq ( ibdev ); ib_poll_eq ( ibdev );
/* Poll the retry timers (required for IPoIB multicast join) */
retry_poll();
} }
/** /**
@ -617,15 +728,14 @@ static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
static void ipoib_link_state_changed ( struct ib_device *ibdev ) { static void ipoib_link_state_changed ( struct ib_device *ibdev ) {
struct net_device *netdev = ib_get_ownerdata ( ibdev ); struct net_device *netdev = ib_get_ownerdata ( ibdev );
struct ipoib_device *ipoib = netdev->priv; struct ipoib_device *ipoib = netdev->priv;
struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
int rc; int rc;
/* Leave existing broadcast group */ /* Leave existing broadcast group */
ipoib_leave_broadcast_group ( ipoib ); ipoib_leave_broadcast_group ( ipoib );
/* Update MAC address based on potentially-new GID prefix */ /* Update MAC address based on potentially-new GID prefix */
memcpy ( &mac->gid.s.prefix, &ibdev->gid.s.prefix, memcpy ( &ipoib->mac.gid.s.prefix, &ibdev->gid.s.prefix,
sizeof ( mac->gid.s.prefix ) ); sizeof ( ipoib->mac.gid.s.prefix ) );
/* Update broadcast GID based on potentially-new partition key */ /* Update broadcast GID based on potentially-new partition key */
ipoib->broadcast.gid.words[2] = ipoib->broadcast.gid.words[2] =
@ -654,7 +764,6 @@ static void ipoib_link_state_changed ( struct ib_device *ibdev ) {
static int ipoib_open ( struct net_device *netdev ) { static int ipoib_open ( struct net_device *netdev ) {
struct ipoib_device *ipoib = netdev->priv; struct ipoib_device *ipoib = netdev->priv;
struct ib_device *ibdev = ipoib->ibdev; struct ib_device *ibdev = ipoib->ibdev;
struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
int rc; int rc;
/* Open IB device */ /* Open IB device */
@ -686,7 +795,7 @@ static int ipoib_open ( struct net_device *netdev ) {
ib_qp_set_ownerdata ( ipoib->qp, ipoib ); ib_qp_set_ownerdata ( ipoib->qp, ipoib );
/* Update MAC address with QPN */ /* Update MAC address with QPN */
mac->flags__qpn = htonl ( ipoib->qp->qpn ); ipoib->mac.flags__qpn = htonl ( ipoib->qp->qpn );
/* Fill receive rings */ /* Fill receive rings */
ib_refill_recv ( ibdev, ipoib->qp ); ib_refill_recv ( ibdev, ipoib->qp );
@ -713,13 +822,15 @@ static int ipoib_open ( struct net_device *netdev ) {
static void ipoib_close ( struct net_device *netdev ) { static void ipoib_close ( struct net_device *netdev ) {
struct ipoib_device *ipoib = netdev->priv; struct ipoib_device *ipoib = netdev->priv;
struct ib_device *ibdev = ipoib->ibdev; struct ib_device *ibdev = ipoib->ibdev;
struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
/* Flush REMAC cache */
ipoib_flush_remac ( ipoib );
/* Leave broadcast group */ /* Leave broadcast group */
ipoib_leave_broadcast_group ( ipoib ); ipoib_leave_broadcast_group ( ipoib );
/* Remove QPN from MAC address */ /* Remove QPN from MAC address */
mac->flags__qpn = 0; ipoib->mac.flags__qpn = 0;
/* Tear down the queues */ /* Tear down the queues */
ib_destroy_qp ( ibdev, ipoib->qp ); ib_destroy_qp ( ibdev, ipoib->qp );
@ -759,15 +870,19 @@ static int ipoib_probe ( struct ib_device *ibdev ) {
memset ( ipoib, 0, sizeof ( *ipoib ) ); memset ( ipoib, 0, sizeof ( *ipoib ) );
ipoib->netdev = netdev; ipoib->netdev = netdev;
ipoib->ibdev = ibdev; ipoib->ibdev = ibdev;
INIT_LIST_HEAD ( &ipoib->peers );
/* Extract hardware address */ /* Extract hardware address */
memcpy ( netdev->hw_addr, &ibdev->gid.s.guid, memcpy ( netdev->hw_addr, &ibdev->gid.s.guid,
sizeof ( ibdev->gid.s.guid ) ); sizeof ( ibdev->gid.s.guid ) );
/* Set default broadcast address */ /* Set local MAC address */
memcpy ( &ipoib->mac.gid.s.guid, &ibdev->gid.s.guid,
sizeof ( ipoib->mac.gid.s.guid ) );
/* Set default broadcast MAC address */
memcpy ( &ipoib->broadcast, &ipoib_broadcast, memcpy ( &ipoib->broadcast, &ipoib_broadcast,
sizeof ( ipoib->broadcast ) ); sizeof ( ipoib->broadcast ) );
netdev->ll_broadcast = ( ( uint8_t * ) &ipoib->broadcast );
/* Register network device */ /* Register network device */
if ( ( rc = register_netdev ( netdev ) ) != 0 ) if ( ( rc = register_netdev ( netdev ) ) != 0 )

View File

@ -651,8 +651,6 @@ struct dhcphdr {
#define PXEBS_SETTINGS_NAME "pxebs" #define PXEBS_SETTINGS_NAME "pxebs"
extern uint32_t dhcp_last_xid; extern uint32_t dhcp_last_xid;
extern unsigned int dhcp_chaddr ( struct net_device *netdev, void *chaddr,
uint16_t *flags );
extern int dhcp_create_packet ( struct dhcp_packet *dhcppkt, extern int dhcp_create_packet ( struct dhcp_packet *dhcppkt,
struct net_device *netdev, uint8_t msgtype, struct net_device *netdev, uint8_t msgtype,
uint32_t xid, const void *options, uint32_t xid, const void *options,

View File

@ -8,6 +8,7 @@
FILE_LICENCE ( GPL2_OR_LATER ); FILE_LICENCE ( GPL2_OR_LATER );
#include <ipxe/if_arp.h>
#include <ipxe/infiniband.h> #include <ipxe/infiniband.h>
/** IPoIB MAC address length */ /** IPoIB MAC address length */
@ -33,25 +34,32 @@ struct ipoib_hdr {
/** Network-layer protocol */ /** Network-layer protocol */
uint16_t proto; uint16_t proto;
/** Reserved, must be zero */ /** Reserved, must be zero */
union { uint16_t reserved;
/** Reserved, must be zero */
uint16_t reserved;
/** Peer addresses
*
* We use these fields internally to represent the
* peer addresses using a lookup key. There simply
* isn't enough room in the IPoIB header to store
* literal source or destination MAC addresses.
*/
struct {
/** Destination address key */
uint8_t dest;
/** Source address key */
uint8_t src;
} __attribute__ (( packed )) peer;
} __attribute__ (( packed )) u;
} __attribute__ (( packed )); } __attribute__ (( packed ));
/** GUID mask used for constructing eIPoIB Local Ethernet MAC address (LEMAC) */
#define IPOIB_GUID_MASK 0xe7
/** eIPoIB Remote Ethernet MAC address
*
* An eIPoIB REMAC address is an Ethernet-like (6 byte) link-layer
* pseudo-address used to look up a full IPoIB link-layer address.
*
* The structure is packed, so the 4-byte QPN field and the 2-byte LID
* field together occupy exactly the 6 bytes of an Ethernet MAC
* address.
*/
struct ipoib_remac {
/** Remote QPN
*
* Must be ORed with EIPOIB_QPN_LA so that eIPoIB REMAC
* addresses are considered as locally-assigned Ethernet MAC
* addresses.
*/
uint32_t qpn;
/** Remote LID */
uint16_t lid;
} __attribute__ (( packed ));
/** eIPoIB REMAC locally-assigned address indicator */
#define EIPOIB_QPN_LA 0x02000000UL
extern const char * ipoib_ntoa ( const void *ll_addr ); extern const char * ipoib_ntoa ( const void *ll_addr );
extern struct net_device * alloc_ipoibdev ( size_t priv_size ); extern struct net_device * alloc_ipoibdev ( size_t priv_size );

View File

@ -188,8 +188,17 @@ struct ll_protocol {
uint8_t ll_addr_len; uint8_t ll_addr_len;
/** Link-layer header length */ /** Link-layer header length */
uint8_t ll_header_len; uint8_t ll_header_len;
/** Flags */
unsigned int flags;
}; };
/** Local link-layer address functions only as a name
*
* This flag indicates that the local link-layer address cannot
* directly be used as a destination address by a remote node.
*/
#define LL_NAME_ONLY 0x0001
/** Network device operations */ /** Network device operations */
struct net_device_operations { struct net_device_operations {
/** Open network device /** Open network device

View File

@ -590,7 +590,8 @@ efi_snp_transmit ( EFI_SIMPLE_NETWORK_PROTOCOL *snp,
/* Allocate buffer */ /* Allocate buffer */
ll_headroom = ( MAX_LL_HEADER_LEN - ll_header_len ); ll_headroom = ( MAX_LL_HEADER_LEN - ll_header_len );
iobuf = alloc_iob ( ll_headroom + len ); iobuf = alloc_iob ( ll_headroom +
( ( len > IOB_ZLEN ) ? len : IOB_ZLEN ) );
if ( ! iobuf ) { if ( ! iobuf ) {
DBGC ( snpdev, "SNPDEV %p TX could not allocate %ld-byte " DBGC ( snpdev, "SNPDEV %p TX could not allocate %ld-byte "
"buffer\n", snpdev, ( ( unsigned long ) len ) ); "buffer\n", snpdev, ( ( unsigned long ) len ) );

View File

@ -938,10 +938,18 @@ int dhcp_create_packet ( struct dhcp_packet *dhcppkt,
dhcphdr->magic = htonl ( DHCP_MAGIC_COOKIE ); dhcphdr->magic = htonl ( DHCP_MAGIC_COOKIE );
dhcphdr->htype = ntohs ( netdev->ll_protocol->ll_proto ); dhcphdr->htype = ntohs ( netdev->ll_protocol->ll_proto );
dhcphdr->op = dhcp_op[msgtype]; dhcphdr->op = dhcp_op[msgtype];
dhcphdr->hlen = dhcp_chaddr ( netdev, dhcphdr->chaddr, dhcphdr->hlen = netdev->ll_protocol->ll_addr_len;
&dhcphdr->flags ); memcpy ( dhcphdr->chaddr, netdev->ll_addr,
netdev->ll_protocol->ll_addr_len );
memcpy ( dhcphdr->options, options, options_len ); memcpy ( dhcphdr->options, options, options_len );
/* If the local link-layer address functions only as a name
* (i.e. cannot be used as a destination address), then
* request broadcast responses.
*/
if ( netdev->ll_protocol->flags & LL_NAME_ONLY )
dhcphdr->flags |= htons ( BOOTP_FL_BROADCAST );
/* If the network device already has an IPv4 address then /* If the network device already has an IPv4 address then
* unicast responses from the DHCP server may be rejected, so * unicast responses from the DHCP server may be rejected, so
* request broadcast responses. * request broadcast responses.

View File

@ -38,10 +38,6 @@ FILE_LICENCE ( GPL2_OR_LATER );
*/ */
int dhcp ( struct net_device *netdev ) { int dhcp ( struct net_device *netdev ) {
struct dhcphdr *dhcphdr;
typeof ( dhcphdr->chaddr ) chaddr;
unsigned int hlen;
unsigned int i;
int rc; int rc;
/* Check we can open the interface first */ /* Check we can open the interface first */
@ -53,12 +49,8 @@ int dhcp ( struct net_device *netdev ) {
return rc; return rc;
/* Perform DHCP */ /* Perform DHCP */
printf ( "DHCP (%s", netdev->name ); printf ( "DHCP (%s %s)", netdev->name,
hlen = dhcp_chaddr ( netdev, chaddr, NULL ); netdev->ll_protocol->ntoa ( netdev->ll_addr ) );
for ( i = 0 ; i < hlen ; i++ )
printf ( "%c%02x", ( i ? ':' : ' ' ), chaddr[i] );
printf ( ")" );
if ( ( rc = start_dhcp ( &monojob, netdev ) ) == 0 ) { if ( ( rc = start_dhcp ( &monojob, netdev ) ) == 0 ) {
rc = monojob_wait ( "" ); rc = monojob_wait ( "" );
} else if ( rc > 0 ) { } else if ( rc > 0 ) {