[tcp] Implement support for TCP Selective Acknowledgements (SACK)

The TCP Selective Acknowledgement option (specified in RFC2018)
provides a mechanism for the receiver to indicate packets that have
been received out of order (e.g. due to earlier dropped packets).
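
As a concrete illustration (sequence numbers invented for this example):
if the receiver has acknowledged everything up to sequence number 1500,
and bytes 2000-2499 then arrive while 1500-1999 are still missing, the
receiver keeps ACKing 1500 but attaches a SACK option describing the
out-of-order block:

    Kind: 5 (SACK)    Length: 10 (2 header bytes + 8 per block)
    Left edge:  2000  (first sequence number of the block)
    Right edge: 2500  (sequence number immediately after the block)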

iPXE often operates in environments in which there is a high
probability of packet loss.  For example, the legacy USB keyboard
emulation in some BIOSes involves polling the USB bus from within a
system management interrupt: this introduces an invisible delay of
around 500us which is long enough for around 40 full-length packets to
be dropped.  Similarly, almost all 1Gbps USB2 devices will eventually
end up dropping packets because the USB2 bus does not provide enough
bandwidth to sustain a 1Gbps stream, and most devices will not provide
enough internal buffering to hold a full TCP window's worth of
received packets.

Add support for sending TCP Selective Acknowledgements.  This provides
the sender with more detailed information about which packets have
been lost, and so allows for a more efficient retransmission strategy.

We include a SACK-permitted option in our SYN packet, since
experimentation shows that at least Linux peers will not include a
SACK-permitted option in the SYN-ACK packet if one was not present in
the initial SYN.  (RFC2018 does not seem to mandate this behaviour,
but it is consistent with the approach taken in RFC1323.)  We ignore
any received SACK options; this is safe to do since SACK is only ever
advisory and we never have to send non-trivial amounts of data.
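
On the wire the SACK-permitted option is a bare two bytes (kind 4,
length 2); as the transmit code below shows, we prepend two NOP bytes
to keep the option field 32-bit aligned:

    01 01 04 02    (NOP, NOP, kind = 4, length = 2)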

Since our TCP receive queue is a candidate for cache discarding under
low memory conditions, we may end up discarding data that has been
reported as received via a SACK option.  This is permitted by RFC2018.
We follow the stricture that SACK blocks must not report data which is
no longer held by the receiver: previously-reported blocks are
validated against the current receive queue before being included
within the current SACK block list.
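
All of the sequence-number checks involved use modulo-2^32 ("serial
number") comparisons, for which the diff below relies on iPXE's
existing tcp_cmp() helper. A minimal sketch of such a comparison (the
real helper already lives elsewhere in iPXE's TCP headers):

    #include <stdint.h>

    /* Sketch of a serial-number comparison for 32-bit sequence numbers:
     * negative if seq1 is "before" seq2, zero if equal, positive if
     * "after"; valid while the two values are within 2^31 of each other.
     */
    static inline int tcp_cmp ( uint32_t seq1, uint32_t seq2 ) {
            return ( ( int32_t ) ( seq1 - seq2 ) );
    }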

Experiments in a qemu VM using forced packet drops (by setting
NETDEV_DISCARD_RATE to 32) show that implementing SACK improves
throughput by around 400%.
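
For anyone wanting to reproduce this, NETDEV_DISCARD_RATE is one of
iPXE's build-time fault-injection settings (in current trees it lives
in config/fault.h; treat the exact location as an assumption for older
checkouts):

    /* config/fault.h: discard, on average, one in every 32 packets */
    #define NETDEV_DISCARD_RATE 32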

Experiments with a USB2 NIC (an SMSC7500) show that implementing SACK
improves throughput by around 700%, increasing the download rate from
35Mbps up to 250Mbps (which is approximately the usable bandwidth
limit for USB2).

Signed-off-by: Michael Brown <mcb30@ipxe.org>
commit e0fc8fe781 (parent bc985ca089)
Michael Brown, 2015-03-11 17:53:29 +00:00
2 changed files with 202 additions and 4 deletions

src/include/ipxe/tcp.h

@@ -79,6 +79,48 @@ struct tcp_window_scale_padded_option {
 */
#define TCP_RX_WINDOW_SCALE 9

/** TCP selective acknowledgement permitted option */
struct tcp_sack_permitted_option {
        uint8_t kind;
        uint8_t length;
} __attribute__ (( packed ));

/** Padded TCP selective acknowledgement permitted option (used for sending) */
struct tcp_sack_permitted_padded_option {
        uint8_t nop[2];
        struct tcp_sack_permitted_option spopt;
} __attribute__ (( packed ));

/** Code for the TCP selective acknowledgement permitted option */
#define TCP_OPTION_SACK_PERMITTED 4

/** TCP selective acknowledgement option */
struct tcp_sack_option {
        uint8_t kind;
        uint8_t length;
} __attribute__ (( packed ));

/** TCP selective acknowledgement block */
struct tcp_sack_block {
        uint32_t left;
        uint32_t right;
} __attribute__ (( packed ));

/** Maximum number of selective acknowledgement blocks
 *
 * This allows for the presence of the TCP timestamp option.
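 *
 * (The TCP options area is limited to 40 bytes: a padded timestamp
 * option occupies 12 of them and the padded SACK option header a
 * further 4, leaving room for three 8-byte SACK blocks.)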
 */
#define TCP_SACK_MAX 3

/** Padded TCP selective acknowledgement option (used for sending) */
struct tcp_sack_padded_option {
        uint8_t nop[2];
        struct tcp_sack_option sackopt;
} __attribute__ (( packed ));

/** Code for the TCP selective acknowledgement option */
#define TCP_OPTION_SACK 5

/** TCP timestamp option */
struct tcp_timestamp_option {
        uint8_t kind;

@@ -102,6 +144,8 @@ struct tcp_options {
        const struct tcp_mss_option *mssopt;
        /** Window scale option, if present */
        const struct tcp_window_scale_option *wsopt;
        /** SACK permitted option, if present */
        const struct tcp_sack_permitted_option *spopt;
        /** Timestamp option, if present */
        const struct tcp_timestamp_option *tsopt;
};
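
The padded layouts above have deliberately fixed sizes; a hypothetical
compile-time check (not part of this header) makes the intended wire
sizes explicit:

    /* Hypothetical sanity checks: two NOPs plus kind and length give a
     * 4-byte padded option header; each SACK block occupies 8 bytes */
    _Static_assert ( sizeof ( struct tcp_sack_permitted_padded_option ) == 4,
                     "padded SACK-permitted option should be 4 bytes" );
    _Static_assert ( sizeof ( struct tcp_sack_padded_option ) == 4,
                     "padded SACK option header should be 4 bytes" );
    _Static_assert ( sizeof ( struct tcp_sack_block ) == 8,
                     "each SACK block should be 8 bytes" );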

src/net/tcp.c

@@ -104,6 +104,9 @@ struct tcp_connection {
        /** Maximum receive window */
        uint32_t max_rcv_win;
        /** Selective acknowledgement list (in host-endian order) */
        struct tcp_sack_block sack[TCP_SACK_MAX];

        /** Transmit queue */
        struct list_head tx_queue;
        /** Receive queue */

@@ -129,6 +132,8 @@ enum tcp_flags {
        TCP_TS_ENABLED = 0x0002,
        /** TCP acknowledgement is pending */
        TCP_ACK_PENDING = 0x0004,
        /** TCP selective acknowledgement is enabled */
        TCP_SACK_ENABLED = 0x0008,
};

/** TCP internal header

@@ -143,6 +148,8 @@ struct tcp_rx_queued_header {
         * enqueued, and so excludes the SYN, if present.
         */
        uint32_t seq;
        /** Next SEQ value, in host-endian order */
        uint32_t nxt;
        /** Flags
         *
         * Only FIN is valid within this flags byte; all other flags

@@ -449,6 +456,94 @@ static size_t tcp_xfer_window ( struct tcp_connection *tcp ) {
        return tcp_xmit_win ( tcp );
}
/**
 * Find selective acknowledgement block
 *
 * @v tcp               TCP connection
 * @v seq               SEQ value in SACK block (in host-endian order)
 * @v sack              SACK block to fill in (in host-endian order)
 * @ret len             Length of SACK block
 */
static uint32_t tcp_sack_block ( struct tcp_connection *tcp, uint32_t seq,
                                 struct tcp_sack_block *sack ) {
        struct io_buffer *iobuf;
        struct tcp_rx_queued_header *tcpqhdr;
        uint32_t left = tcp->rcv_ack;
        uint32_t right = left;

        /* Find highest block which does not start after SEQ */
        list_for_each_entry ( iobuf, &tcp->rx_queue, list ) {
                tcpqhdr = iobuf->data;
                if ( tcp_cmp ( tcpqhdr->seq, right ) > 0 ) {
                        if ( tcp_cmp ( tcpqhdr->seq, seq ) > 0 )
                                break;
                        left = tcpqhdr->seq;
                }
                if ( tcp_cmp ( tcpqhdr->nxt, right ) > 0 )
                        right = tcpqhdr->nxt;
        }

        /* Fail if this block does not contain SEQ */
        if ( tcp_cmp ( right, seq ) < 0 )
                return 0;

        /* Populate SACK block */
        sack->left = left;
        sack->right = right;
        return ( right - left );
}
/**
 * Update TCP selective acknowledgement list
 *
 * @v tcp               TCP connection
 * @v seq               SEQ value in first SACK block (in host-endian order)
 * @ret count           Number of SACK blocks
 */
static unsigned int tcp_sack ( struct tcp_connection *tcp, uint32_t seq ) {
        struct tcp_sack_block sack[TCP_SACK_MAX];
        unsigned int old = 0;
        unsigned int new = 0;
        unsigned int i;
        uint32_t len;

        /* Populate first new SACK block */
        len = tcp_sack_block ( tcp, seq, &sack[0] );
        if ( len )
                new++;

        /* Populate remaining new SACK blocks based on old SACK blocks */
        for ( old = 0 ; old < TCP_SACK_MAX ; old++ ) {

                /* Stop if we run out of space in the new list */
                if ( new == TCP_SACK_MAX )
                        break;

                /* Skip empty old SACK blocks */
                if ( tcp->sack[old].left == tcp->sack[old].right )
                        continue;

                /* Populate new SACK block */
                len = tcp_sack_block ( tcp, tcp->sack[old].left, &sack[new] );
                if ( len == 0 )
                        continue;

                /* Eliminate duplicates */
                for ( i = 0 ; i < new ; i++ ) {
                        if ( sack[i].left == sack[new].left ) {
                                new--;
                                break;
                        }
                }
                new++;
        }

        /* Update SACK list */
        memset ( tcp->sack, 0, sizeof ( tcp->sack ) );
        memcpy ( tcp->sack, sack, ( new * sizeof ( tcp->sack[0] ) ) );
        return new;
}
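
/* Worked example (illustrative): with rcv_ack = 1000, a receive queue
 * holding [2000,2500) and [4000,4500), and [2000,2500) already present
 * in the old SACK list, tcp_sack ( tcp, 4000 ) produces the new list
 * { [4000,4500), [2000,2500) }: the block containing the triggering
 * SEQ comes first, as RFC2018 requires.
 */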
/**
 * Process TCP transmit queue
 *

@@ -493,9 +588,10 @@ static size_t tcp_process_tx_queue ( struct tcp_connection *tcp, size_t max_len,
}

/**
- * Transmit any outstanding data
+ * Transmit any outstanding data (with selective acknowledgement)
 *
 * @v tcp               TCP connection
+ * @v sack_seq          SEQ for first selective acknowledgement (if any)
 *
 * Transmits any outstanding data on the connection.
 *

@@ -503,15 +599,21 @@ static size_t tcp_process_tx_queue ( struct tcp_connection *tcp, size_t max_len,
 * will have been started if necessary, and so the stack will
 * eventually attempt to retransmit the failed packet.
 */
-static void tcp_xmit ( struct tcp_connection *tcp ) {
+static void tcp_xmit_sack ( struct tcp_connection *tcp, uint32_t sack_seq ) {
        struct io_buffer *iobuf;
        struct tcp_header *tcphdr;
        struct tcp_mss_option *mssopt;
        struct tcp_window_scale_padded_option *wsopt;
        struct tcp_timestamp_padded_option *tsopt;
        struct tcp_sack_permitted_padded_option *spopt;
        struct tcp_sack_padded_option *sackopt;
        struct tcp_sack_block *sack;
        void *payload;
        unsigned int flags;
        unsigned int sack_count;
        unsigned int i;
        size_t len = 0;
        size_t sack_len;
        uint32_t seq_len;
        uint32_t app_win;
        uint32_t max_rcv_win;
@@ -590,6 +692,10 @@ static void tcp_xmit ( struct tcp_connection *tcp ) {
                wsopt->wsopt.kind = TCP_OPTION_WS;
                wsopt->wsopt.length = sizeof ( wsopt->wsopt );
                wsopt->wsopt.scale = TCP_RX_WINDOW_SCALE;
                spopt = iob_push ( iobuf, sizeof ( *spopt ) );
                memset ( spopt->nop, TCP_OPTION_NOP, sizeof ( spopt->nop ) );
                spopt->spopt.kind = TCP_OPTION_SACK_PERMITTED;
                spopt->spopt.length = sizeof ( spopt->spopt );
        }
        if ( ( flags & TCP_SYN ) || ( tcp->flags & TCP_TS_ENABLED ) ) {
                tsopt = iob_push ( iobuf, sizeof ( *tsopt ) );

@@ -599,6 +705,21 @@ static void tcp_xmit ( struct tcp_connection *tcp ) {
                tsopt->tsopt.tsval = htonl ( currticks() );
                tsopt->tsopt.tsecr = htonl ( tcp->ts_recent );
        }
        if ( ( tcp->flags & TCP_SACK_ENABLED ) &&
             ( ! list_empty ( &tcp->rx_queue ) ) &&
             ( ( sack_count = tcp_sack ( tcp, sack_seq ) ) != 0 ) ) {
                sack_len = ( sack_count * sizeof ( *sack ) );
                sackopt = iob_push ( iobuf, ( sizeof ( *sackopt ) + sack_len ));
                memset ( sackopt->nop, TCP_OPTION_NOP, sizeof ( sackopt->nop ));
                sackopt->sackopt.kind = TCP_OPTION_SACK;
                sackopt->sackopt.length =
                        ( sizeof ( sackopt->sackopt ) + sack_len );
                sack = ( ( ( void * ) sackopt ) + sizeof ( *sackopt ) );
                for ( i = 0 ; i < sack_count ; i++, sack++ ) {
                        sack->left = htonl ( tcp->sack[i].left );
                        sack->right = htonl ( tcp->sack[i].right );
                }
        }
        if ( len != 0 )
                flags |= TCP_PSH;
        tcphdr = iob_push ( iobuf, sizeof ( *tcphdr ) );
@@ -635,6 +756,17 @@ static void tcp_xmit ( struct tcp_connection *tcp ) {
        profile_stop ( &tcp_tx_profiler );
}

/**
 * Transmit any outstanding data
 *
 * @v tcp               TCP connection
 */
static void tcp_xmit ( struct tcp_connection *tcp ) {

        /* Transmit without an explicit first SACK */
        tcp_xmit_sack ( tcp, tcp->rcv_ack );
}

/** TCP process descriptor */
static struct process_descriptor tcp_process_desc =
        PROC_DESC_ONCE ( struct tcp_connection, process, tcp_xmit );
@@ -804,6 +936,12 @@ static void tcp_rx_opts ( struct tcp_connection *tcp, const void *data,
                case TCP_OPTION_WS:
                        options->wsopt = data;
                        break;
                case TCP_OPTION_SACK_PERMITTED:
                        options->spopt = data;
                        break;
                case TCP_OPTION_SACK:
                        /* Ignore received SACKs */
                        break;
                case TCP_OPTION_TS:
                        options->tsopt = data;
                        break;
@@ -823,6 +961,7 @@
 * @v seq_len           Sequence space length to consume
 */
static void tcp_rx_seq ( struct tcp_connection *tcp, uint32_t seq_len ) {
        unsigned int sack;

        /* Sanity check */
        assert ( seq_len > 0 );

@@ -840,6 +979,16 @@ static void tcp_rx_seq ( struct tcp_connection *tcp, uint32_t seq_len ) {
        /* Update timestamp */
        tcp->ts_recent = tcp->ts_val;

        /* Update SACK list */
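        /* (Any block that now falls entirely below the advancing ACK
         * point collapses to zero length here; empty blocks are then
         * skipped when the next SACK option is built.) */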
        for ( sack = 0 ; sack < TCP_SACK_MAX ; sack++ ) {
                if ( tcp->sack[sack].left == tcp->sack[sack].right )
                        continue;
                if ( tcp_cmp ( tcp->sack[sack].left, tcp->rcv_ack ) < 0 )
                        tcp->sack[sack].left = tcp->rcv_ack;
                if ( tcp_cmp ( tcp->sack[sack].right, tcp->rcv_ack ) < 0 )
                        tcp->sack[sack].right = tcp->rcv_ack;
        }

        /* Mark ACK as pending */
        tcp->flags |= TCP_ACK_PENDING;
}
@@ -860,6 +1009,8 @@ static int tcp_rx_syn ( struct tcp_connection *tcp, uint32_t seq,
        tcp->rcv_ack = seq;
        if ( options->tsopt )
                tcp->flags |= TCP_TS_ENABLED;
        if ( options->spopt )
                tcp->flags |= TCP_SACK_ENABLED;
        if ( options->wsopt ) {
                tcp->snd_win_scale = options->wsopt->scale;
                tcp->rcv_win_scale = TCP_RX_WINDOW_SCALE;
@@ -1070,6 +1221,7 @@ static void tcp_rx_enqueue ( struct tcp_connection *tcp, uint32_t seq,
        struct io_buffer *queued;
        size_t len;
        uint32_t seq_len;
        uint32_t nxt;

        /* Calculate remaining flags and sequence length.  Note that
         * SYN, if present, has already been processed by this point.

@@ -1077,6 +1229,7 @@ static void tcp_rx_enqueue ( struct tcp_connection *tcp, uint32_t seq,
        flags &= TCP_FIN;
        len = iob_len ( iobuf );
        seq_len = ( len + ( flags ? 1 : 0 ) );
        nxt = ( seq + seq_len );

        /* Discard immediately (to save memory) if:
         *

@@ -1087,7 +1240,7 @@ static void tcp_rx_enqueue ( struct tcp_connection *tcp, uint32_t seq,
         */
        if ( ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) ||
             ( tcp_cmp ( seq, tcp->rcv_ack + tcp->rcv_win ) >= 0 ) ||
-            ( tcp_cmp ( seq + seq_len, tcp->rcv_ack ) < 0 ) ||
+            ( tcp_cmp ( nxt, tcp->rcv_ack ) < 0 ) ||
             ( seq_len == 0 ) ) {
                free_iob ( iobuf );
                return;

@@ -1096,6 +1249,7 @@ static void tcp_rx_enqueue ( struct tcp_connection *tcp, uint32_t seq,
        /* Add internal header */
        tcpqhdr = iob_push ( iobuf, sizeof ( *tcpqhdr ) );
        tcpqhdr->seq = seq;
        tcpqhdr->nxt = nxt;
        tcpqhdr->flags = flags;

        /* Add to RX queue */
@@ -1289,7 +1443,7 @@ static int tcp_rx ( struct io_buffer *iobuf,
        if ( list_empty ( &tcp->rx_queue ) ) {
                process_add ( &tcp->process );
        } else {
-               tcp_xmit ( tcp );
+               tcp_xmit_sack ( tcp, seq );
        }

        /* If this packet was the last we expect to receive, set up