david/ipxe
david
/
ipxe
Archived
1
0
Fork 0

[hyperv] Cope with Windows Server 2016 enlightenments

An "enlightened" external bootloader (such as Windows Server 2016's
winload.exe) may take ownership of the Hyper-V connection before all
INT 13 operations have been completed.  When this happens, all VMBus
devices are implicitly closed and we are left with a non-functional
network connection.

Detect when our Hyper-V connection has been lost (by checking the
SynIC message page MSR).  Reclaim ownership of the Hyper-V connection
and reestablish any VMBus devices, without disrupting any existing
iPXE state (such as IPv4 settings attached to the network device).

Windows Server 2016 will not cleanly take ownership of an active
Hyper-V connection.  Experimentation shows that we can quiesce by
resetting only the SynIC message page MSR; this results in a
successful SAN boot (on a Windows 2012 R2 physical host).  Choose to
quiesce by resetting (almost) all MSRs, in the hope that this will be
more robust against corner cases such as a stray synthetic interrupt
occurring during the handover.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
This commit is contained in:
Michael Brown 2017-04-25 14:13:22 +01:00
parent 276d618ca9
commit b91cc983da
6 changed files with 345 additions and 14 deletions

View File

@ -40,6 +40,7 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
#include <ipxe/malloc.h>
#include <ipxe/device.h>
#include <ipxe/timer.h>
#include <ipxe/quiesce.h>
#include <ipxe/cpuid.h>
#include <ipxe/msr.h>
#include <ipxe/hyperv.h>
@ -299,6 +300,10 @@ static void hv_map_synic ( struct hv_hypervisor *hv ) {
uint64_t siefp;
uint64_t scontrol;
/* Zero SynIC message and event pages */
memset ( hv->synic.message, 0, PAGE_SIZE );
memset ( hv->synic.event, 0, PAGE_SIZE );
/* Map SynIC message page */
simp = rdmsr ( HV_X64_MSR_SIMP );
simp &= ( PAGE_SIZE - 1 );
@ -321,21 +326,14 @@ static void hv_map_synic ( struct hv_hypervisor *hv ) {
}
/**
* Unmap synthetic interrupt controller
* Unmap synthetic interrupt controller, leaving SCONTROL untouched
*
* @v hv Hyper-V hypervisor
*/
static void hv_unmap_synic ( struct hv_hypervisor *hv ) {
uint64_t scontrol;
static void hv_unmap_synic_no_scontrol ( struct hv_hypervisor *hv ) {
uint64_t siefp;
uint64_t simp;
/* Disable SynIC */
scontrol = rdmsr ( HV_X64_MSR_SCONTROL );
scontrol &= ~HV_SCONTROL_ENABLE;
DBGC2 ( hv, "HV %p SCONTROL MSR is %#08llx\n", hv, scontrol );
wrmsr ( HV_X64_MSR_SCONTROL, scontrol );
/* Unmap SynIC event page */
siefp = rdmsr ( HV_X64_MSR_SIEFP );
siefp &= ( ( PAGE_SIZE - 1 ) & ~HV_SIEFP_ENABLE );
@ -349,6 +347,24 @@ static void hv_unmap_synic ( struct hv_hypervisor *hv ) {
wrmsr ( HV_X64_MSR_SIMP, simp );
}
/**
 * Disable and unmap synthetic interrupt controller
 *
 * @v hv		Hyper-V hypervisor
 *
 * Clears the enable bit in the SCONTROL MSR and then unmaps the SynIC
 * event and message pages via hv_unmap_synic_no_scontrol().
 */
static void hv_unmap_synic ( struct hv_hypervisor *hv ) {
	uint64_t control = rdmsr ( HV_X64_MSR_SCONTROL );

	/* Clear only the SynIC enable bit and write the result back */
	control &= ~HV_SCONTROL_ENABLE;
	DBGC2 ( hv, "HV %p SCONTROL MSR is %#08llx\n", hv, control );
	wrmsr ( HV_X64_MSR_SCONTROL, control );

	/* Event and message pages are handled by the shared helper */
	hv_unmap_synic_no_scontrol ( hv );
}
/**
* Enable synthetic interrupt
*
@ -385,8 +401,12 @@ void hv_disable_sint ( struct hv_hypervisor *hv, unsigned int sintx ) {
unsigned long msr = HV_X64_MSR_SINT ( sintx );
uint64_t sint;
/* Disable synthetic interrupt */
/* Do nothing if interrupt is already disabled */
sint = rdmsr ( msr );
if ( sint & HV_SINT_MASKED )
return;
/* Disable synthetic interrupt */
sint &= ~HV_SINT_AUTO_EOI;
sint |= HV_SINT_MASKED;
DBGC2 ( hv, "HV %p SINT%d MSR is %#08llx\n", hv, sintx, sint );
@ -589,6 +609,7 @@ static void hv_remove ( struct root_device *rootdev ) {
hv_free_pages ( hv, hv->hypercall, hv->synic.message, hv->synic.event,
NULL );
free ( hv );
rootdev_set_drvdata ( rootdev, NULL );
}
/** Hyper-V root device driver */
@ -603,6 +624,100 @@ struct root_device hv_root_device __root_device = {
.driver = &hv_root_driver,
};
/**
 * Quiesce system
 *
 * Prepare the Hyper-V connection for a possible handover to an
 * "enlightened" external bootloader.
 *
 * The "enlightened" portions of the Windows Server 2016 boot process
 * will not cleanly take ownership of an active Hyper-V connection.
 * Experimentation shows that the minimum requirement is to disable
 * the SynIC message page (i.e. to zero the SIMP MSR).
 *
 * A full shutdown of the Hyper-V connection is not possible:
 * experimentation shows that disabling the SynIC itself (i.e. zeroing
 * the SCONTROL MSR) causes Windows Server 2016 to enter an indefinite
 * wait loop.
 *
 * The compromise is to reset every MSR except SCONTROL, in an attempt
 * to create a safe handover environment.
 *
 * VMBus devices are deliberately not shut down, since the system may
 * later be unquiesced and expected to continue operating.
 */
static void hv_quiesce ( void ) {
	struct hv_hypervisor *hv = rootdev_get_drvdata ( &hv_root_device );
	unsigned int sintx;

	/* Nothing to quiesce unless we are running in Hyper-V */
	if ( ! hv )
		return;

	/* Mask every synthetic interrupt, SINT0 to SINT<HV_SINT_MAX> */
	for ( sintx = 0 ; sintx <= HV_SINT_MAX ; sintx++ ) {
		hv_disable_sint ( hv, sintx );
	}

	/* Unmap the SynIC event and message pages, deliberately
	 * leaving SCONTROL enabled (see function description).
	 */
	hv_unmap_synic_no_scontrol ( hv );

	/* Unmap hypercall page */
	hv_unmap_hypercall ( hv );

	DBGC ( hv, "HV %p quiesced\n", hv );
}
/**
 * Unquiesce system
 *
 * Reestablish the Hyper-V connection if it appears to have been lost.
 *
 * Experimentation shows that the "enlightened" portions of Windows
 * Server 2016 will break our Hyper-V connection at some point during
 * a SAN boot.  Surprisingly, the guest OS ID MSR is left unchanged,
 * but the SynIC message page is left disabled.
 *
 * Our own explicit quiescing procedure also disables the SynIC
 * message page.  The SynIC message page enable bit therefore serves
 * as a heuristic for deciding whether the connection needs to be
 * reestablished.
 */
static void hv_unquiesce ( void ) {
	struct hv_hypervisor *hv = rootdev_get_drvdata ( &hv_root_device );
	uint64_t message_page;
	int rc;

	/* Nothing to unquiesce unless we are running in Hyper-V */
	if ( ! hv )
		return;

	/* An enabled message page means the connection is still ours */
	message_page = rdmsr ( HV_X64_MSR_SIMP );
	if ( message_page & HV_SIMP_ENABLE )
		return;

	/* Remap hypercall page, then the synthetic interrupt controller */
	hv_map_hypercall ( hv );
	hv_map_synic ( hv );

	/* Reset Hyper-V devices; on failure there is nothing further
	 * that we can do beyond reporting it.
	 */
	rc = vmbus_reset ( hv, &hv_root_device.dev );
	if ( rc != 0 ) {
		DBGC ( hv, "HV %p could not unquiesce: %s\n",
		       hv, strerror ( rc ) );
	}
}
/** Hyper-V quiescer
 *
 * Binds hv_quiesce() and hv_unquiesce() as the quiesce and unquiesce
 * handlers for the Hyper-V connection.
 *
 * NOTE(review): __quiescer is presumably the registration attribute
 * provided by <ipxe/quiesce.h> - confirm against that header.
 */
struct quiescer hv_quiescer __quiescer = {
.quiesce = hv_quiesce,
.unquiesce = hv_unquiesce,
};
/**
* Probe timer
*

View File

@ -259,6 +259,15 @@ static int netvsc_revoke_buffer ( struct netvsc_device *netvsc,
struct netvsc_revoke_buffer_message msg;
int rc;
/* If the buffer's GPADL is obsolete (i.e. was created before
* the most recent Hyper-V reset), then we will never receive
* a response to the revoke message. Since the GPADL is
* already destroyed as far as the hypervisor is concerned, no
* further action is required.
*/
if ( netvsc_is_obsolete ( netvsc ) )
return 0;
/* Construct message */
memset ( &msg, 0, sizeof ( msg ) );
msg.header.type = cpu_to_le32 ( buffer->revoke_type );
@ -474,6 +483,14 @@ static int netvsc_transmit ( struct rndis_device *rndis,
uint64_t xid;
int rc;
/* If the device is obsolete (i.e. was opened before the most
* recent Hyper-V reset), then we will never receive transmit
* completions. Fail transmissions immediately to minimise
* the delay in closing and reopening the device.
*/
if ( netvsc_is_obsolete ( netvsc ) )
return -EPIPE;
/* Sanity check */
assert ( iob_len ( iobuf ) >= sizeof ( *header ) );
assert ( iob_len ( iobuf ) == le32_to_cpu ( header->len ) );
@ -823,6 +840,35 @@ static int netvsc_probe ( struct vmbus_device *vmdev ) {
return rc;
}
/**
 * Reset device
 *
 * @v vmdev		VMBus device
 * @ret rc		Return status code
 *
 * An open network device is closed and reopened so that any stale
 * state is discarded.  A closed device is left untouched.
 */
static int netvsc_reset ( struct vmbus_device *vmdev ) {
	struct rndis_device *rndis = vmbus_get_drvdata ( vmdev );
	struct netvsc_device *netvsc = rndis->priv;
	struct net_device *netdev = rndis->netdev;
	int rc;

	/* Only an open device holds NetVSC (or RNDIS) state; a closed
	 * device has nothing to reset.
	 */
	if ( netdev_is_open ( netdev ) ) {

		/* Cycle the device through close and open */
		netdev_close ( netdev );
		rc = netdev_open ( netdev );
		if ( rc != 0 ) {
			DBGC ( netvsc, "NETVSC %s could not reopen: %s\n",
			       netvsc->name, strerror ( rc ) );
			return rc;
		}
	}

	return 0;
}
/**
* Remove device
*
@ -844,5 +890,6 @@ struct vmbus_driver netvsc_driver __vmbus_driver = {
.type = VMBUS_TYPE ( 0xf8615163, 0xdf3e, 0x46c5, 0x913f,
0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e ),
.probe = netvsc_probe,
.reset = netvsc_reset,
.remove = netvsc_remove,
};

View File

@ -362,4 +362,19 @@ struct netvsc_device {
int wait_rc;
};
/**
 * Check if NetVSC device is obsolete
 *
 * @v netvsc		NetVSC device
 * @ret is_obsolete	NetVSC device is obsolete
 *
 * A NetVSC device is obsolete if it was opened before the most recent
 * Hyper-V reset.  This is decided by asking
 * vmbus_gpadl_is_obsolete() about the GPADL ID stored in
 * netvsc->rx.gpadl.
 */
static inline __attribute__ (( always_inline )) int
netvsc_is_obsolete ( struct netvsc_device *netvsc ) {
return vmbus_gpadl_is_obsolete ( netvsc->rx.gpadl );
}
#endif /* _NETVSC_H */

View File

@ -61,6 +61,9 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
/** Synthetic interrupt vector mask */
#define HV_SINT_VECTOR_MASK HV_SINT_VECTOR ( 0xff )
/** Maximum synthetic interrupt number
 *
 * This is an inclusive upper bound: hv_quiesce() walks SINT numbers
 * from 0 up to and including HV_SINT_MAX when disabling all synthetic
 * interrupts.
 */
#define HV_SINT_MAX 15
/** Post message */
#define HV_POST_MESSAGE 0x005c

View File

@ -479,6 +479,8 @@ struct vmbus_device {
/** Hyper-V hypervisor */
struct hv_hypervisor *hv;
/** Channel instance */
union uuid instance;
/** Channel ID */
unsigned int channel;
/** Monitor ID */
@ -527,6 +529,12 @@ struct vmbus_driver {
* @ret rc Return status code
*/
int ( * probe ) ( struct vmbus_device *vmdev );
/** Reset device
*
* @v vmdev VMBus device
* @ret rc Return status code
*/
int ( * reset ) ( struct vmbus_device *vmdev );
/** Remove device
*
* @v vmdev VMBus device
@ -609,6 +617,23 @@ vmbus_unregister_pages ( struct vmbus_device *vmdev,
list_del ( &pages->list );
}
extern unsigned int vmbus_obsolete_gpadl;
/**
 * Check if GPADL is obsolete
 *
 * @v gpadl		GPADL ID
 * @ret is_obsolete	GPADL ID is obsolete
 *
 * A GPADL is obsolete if it was created before the most recent
 * Hyper-V reset, i.e. if its ID is less than or equal to the
 * threshold held in vmbus_obsolete_gpadl.
 */
static inline __attribute__ (( always_inline )) int
vmbus_gpadl_is_obsolete ( unsigned int gpadl ) {
return ( gpadl <= vmbus_obsolete_gpadl );
}
extern int vmbus_establish_gpadl ( struct vmbus_device *vmdev, userptr_t data,
size_t len );
extern int vmbus_gpadl_teardown ( struct vmbus_device *vmdev,
@ -629,6 +654,7 @@ extern int vmbus_poll ( struct vmbus_device *vmdev );
extern void vmbus_dump_channel ( struct vmbus_device *vmdev );
extern int vmbus_probe ( struct hv_hypervisor *hv, struct device *parent );
extern int vmbus_reset ( struct hv_hypervisor *hv, struct device *parent );
extern void vmbus_remove ( struct hv_hypervisor *hv, struct device *parent );
#endif /* _IPXE_VMBUS_H */

View File

@ -50,6 +50,16 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
*/
#define VMBUS_GPADL_MAGIC 0x18ae0000
/** Current (i.e. most recently issued) GPADL ID
 *
 * Starts at VMBUS_GPADL_MAGIC and is incremented by
 * vmbus_establish_gpadl() each time a new GPADL ID is allocated.
 */
static unsigned int vmbus_gpadl = VMBUS_GPADL_MAGIC;
/** Obsolete GPADL ID threshold
 *
 * When the Hyper-V connection is reset, any previous GPADLs are
 * automatically rendered obsolete.  vmbus_reset() records the current
 * GPADL ID here, and vmbus_gpadl_is_obsolete() treats any GPADL ID
 * less than or equal to this value as obsolete.
 */
unsigned int vmbus_obsolete_gpadl;
/**
* Post message
*
@ -281,12 +291,12 @@ int vmbus_establish_gpadl ( struct vmbus_device *vmdev, userptr_t data,
uint64_t pfn[pfn_count];
} __attribute__ (( packed )) gpadlhdr;
const struct vmbus_gpadl_created *created = &vmbus->message->created;
static unsigned int gpadl = VMBUS_GPADL_MAGIC;
unsigned int gpadl;
unsigned int i;
int rc;
/* Allocate GPADL ID */
gpadl++;
gpadl = ++vmbus_gpadl;
/* Construct message */
memset ( &gpadlhdr, 0, sizeof ( gpadlhdr ) );
@ -347,6 +357,15 @@ int vmbus_gpadl_teardown ( struct vmbus_device *vmdev, unsigned int gpadl ) {
const struct vmbus_gpadl_torndown *torndown = &vmbus->message->torndown;
int rc;
/* If GPADL is obsolete (i.e. was created before the most
* recent Hyper-V reset), then we will never receive a
* response to the teardown message. Since the GPADL is
* already destroyed as far as the hypervisor is concerned, no
* further action is required.
*/
if ( vmbus_gpadl_is_obsolete ( gpadl ) )
return 0;
/* Construct message */
memset ( &teardown, 0, sizeof ( teardown ) );
teardown.header.type = cpu_to_le32 ( VMBUS_GPADL_TEARDOWN );
@ -530,8 +549,7 @@ void vmbus_close ( struct vmbus_device *vmdev ) {
}
/* Tear down GPADL */
if ( ( rc = vmbus_gpadl_teardown ( vmdev,
vmdev->gpadl ) ) != 0 ) {
if ( ( rc = vmbus_gpadl_teardown ( vmdev, vmdev->gpadl ) ) != 0 ) {
DBGC ( vmdev, "VMBUS %s failed to tear down channel GPADL: "
"%s\n", vmdev->dev.name, strerror ( rc ) );
/* We can't prevent the remote VM from continuing to
@ -1187,6 +1205,8 @@ static int vmbus_probe_channels ( struct hv_hypervisor *hv,
&parent->children );
vmdev->dev.parent = parent;
vmdev->hv = hv;
memcpy ( &vmdev->instance, &offer->instance,
sizeof ( vmdev->instance ) );
vmdev->channel = channel;
vmdev->monitor = offer->monitor;
vmdev->signal = ( offer->monitored ?
@ -1201,6 +1221,7 @@ static int vmbus_probe_channels ( struct hv_hypervisor *hv,
} else if ( header->type ==
cpu_to_le32 ( VMBUS_ALL_OFFERS_DELIVERED ) ) {
/* End of offer list */
break;
} else {
@ -1244,6 +1265,77 @@ static int vmbus_probe_channels ( struct hv_hypervisor *hv,
return rc;
}
/**
 * Reset channels
 *
 * @v hv		Hyper-V hypervisor
 * @v parent		Parent device
 * @ret rc		Return status code
 *
 * Requests the channel offer list again, consumes (and ignores) each
 * offer until the end-of-list message arrives, and then invokes the
 * driver's reset method for every child device of the parent.
 *
 * A failure to reset an individual device is reported via the debug
 * log but does not prevent the remaining devices from being reset,
 * and does not affect the return status.
 */
static int vmbus_reset_channels ( struct hv_hypervisor *hv,
				  struct device *parent ) {
	struct vmbus *vmbus = hv->vmbus;
	const struct vmbus_message_header *header = &vmbus->message->header;
	const struct vmbus_offer_channel *offer = &vmbus->message->offer;
	const union uuid *offer_type;
	struct vmbus_device *vmdev;
	unsigned int channel_id;
	int rc;

	/* Ask for the list of channel offers */
	rc = vmbus_post_empty_message ( hv, VMBUS_REQUEST_OFFERS );
	if ( rc != 0 )
		return rc;

	/* Drain responses until the end of the offer list */
	for ( ; ; ) {

		/* Wait for the next response */
		rc = vmbus_wait_for_any_message ( hv );
		if ( rc != 0 )
			return rc;

		/* Channel offer: log it and move on */
		if ( header->type == cpu_to_le32 ( VMBUS_OFFER_CHANNEL ) ) {
			offer_type = &offer->type;
			channel_id = le32_to_cpu ( offer->channel );
			DBGC2 ( vmbus, "VMBUS %p offer %d type %s",
				vmbus, channel_id, uuid_ntoa ( offer_type ) );
			if ( offer->monitored )
				DBGC2 ( vmbus, " monitor %d", offer->monitor );
			DBGC2 ( vmbus, "\n" );

			/* The offer itself is ignored; all of the
			 * relevant state was captured by the initial
			 * probe.
			 */
			continue;
		}

		/* End of offer list */
		if ( header->type ==
		     cpu_to_le32 ( VMBUS_ALL_OFFERS_DELIVERED ) )
			break;

		/* Anything else is a protocol violation */
		DBGC ( vmbus, "VMBUS %p unexpected offer response type "
		       "%d\n", vmbus, le32_to_cpu ( header->type ) );
		return -EPROTO;
	}

	/* Reset every device, carrying on past individual failures */
	list_for_each_entry ( vmdev, &parent->children, dev.siblings ) {
		rc = vmdev->driver->reset ( vmdev );
		if ( rc != 0 ) {
			DBGC ( vmdev, "VMBUS %s could not reset: %s\n",
			       vmdev->dev.name, strerror ( rc ) );
		}
	}

	return 0;
}
/**
* Remove channels
*
@ -1330,6 +1422,39 @@ int vmbus_probe ( struct hv_hypervisor *hv, struct device *parent ) {
return rc;
}
/**
 * Reset Hyper-V virtual machine bus
 *
 * @v hv		Hyper-V hypervisor
 * @v parent		Parent device
 * @ret rc		Return status code
 *
 * Marks every GPADL issued so far as obsolete, clears the interrupt
 * and monitor pages, reenables the message interrupt, renegotiates
 * the protocol version and finally resets all channels.
 */
int vmbus_reset ( struct hv_hypervisor *hv, struct device *parent ) {
	struct vmbus *vmbus = hv->vmbus;
	int rc;

	/* Every GPADL issued up to this point is now obsolete */
	vmbus_obsolete_gpadl = vmbus_gpadl;

	/* Wipe the interrupt page and both monitor pages */
	memset ( vmbus->intr, 0, PAGE_SIZE );
	memset ( vmbus->monitor_in, 0, PAGE_SIZE );
	memset ( vmbus->monitor_out, 0, PAGE_SIZE );

	/* Enable message interrupt */
	hv_enable_sint ( hv, VMBUS_MESSAGE_SINT );

	/* Renegotiate protocol version */
	rc = vmbus_negotiate_version ( hv );
	if ( rc != 0 )
		return rc;

	/* Reenumerate channels; its status is our overall result */
	return vmbus_reset_channels ( hv, parent );
}
/**
* Remove Hyper-V virtual machine bus
*