提交 630517d8 编写于 作者: J Jiri Denemark

qemu: Handle post-copy migration failures

When migration fails in the post-copy mode, it's impossible to just kill
the destination domain and resume the source since the source no longer
contains current guest state. Let's mark domains on both sides as
VIR_DOMAIN_PAUSED_POSTCOPY_FAILED to let the upper layer decide what to
do with them.
Signed-off-by: Jiri Denemark <jdenemar@redhat.com>
上级 81b2a2c7
...@@ -1475,14 +1475,21 @@ qemuMigrationRestoreDomainState(virConnectPtr conn, virDomainObjPtr vm) ...@@ -1475,14 +1475,21 @@ qemuMigrationRestoreDomainState(virConnectPtr conn, virDomainObjPtr vm)
{ {
virQEMUDriverPtr driver = conn->privateData; virQEMUDriverPtr driver = conn->privateData;
qemuDomainObjPrivatePtr priv = vm->privateData; qemuDomainObjPrivatePtr priv = vm->privateData;
int state = virDomainObjGetState(vm, NULL); int reason;
virDomainState state = virDomainObjGetState(vm, &reason);
bool ret = false; bool ret = false;
VIR_DEBUG("driver=%p, vm=%p, pre-mig-state=%d, state=%d", VIR_DEBUG("driver=%p, vm=%p, pre-mig-state=%s, state=%s, reason=%s",
driver, vm, priv->preMigrationState, state); driver, vm,
virDomainStateTypeToString(priv->preMigrationState),
virDomainStateTypeToString(state),
virDomainStateReasonToString(state, reason));
if (state == VIR_DOMAIN_PAUSED && if (state != VIR_DOMAIN_PAUSED ||
priv->preMigrationState == VIR_DOMAIN_RUNNING) { reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED)
goto cleanup;
if (priv->preMigrationState == VIR_DOMAIN_RUNNING) {
/* This is basically the only restore possibility that's safe /* This is basically the only restore possibility that's safe
* and we should attempt to do */ * and we should attempt to do */
...@@ -2375,6 +2382,48 @@ qemuMigrationSetOffline(virQEMUDriverPtr driver, ...@@ -2375,6 +2382,48 @@ qemuMigrationSetOffline(virQEMUDriverPtr driver,
return ret; return ret;
} }
/**
 * qemuMigrationPostcopyFailed:
 * @driver: QEMU driver the domain belongs to
 * @vm: domain whose post-copy migration failed
 *
 * Leave @vm paused with the VIR_DOMAIN_PAUSED_POSTCOPY_FAILED reason.
 *
 * Nothing is done when the domain is neither running nor paused, or when
 * it is already paused with VIR_DOMAIN_PAUSED_POSTCOPY_FAILED.  A paused
 * domain only gets its state reason updated.  For a running domain the
 * guest CPUs are stopped via qemuProcessStopCPUs() and a SUSPENDED
 * lifecycle event with the POSTCOPY_FAILED detail is queued; if stopping
 * the CPUs fails, a warning is logged and no event is emitted.
 */
void
qemuMigrationPostcopyFailed(virQEMUDriverPtr driver,
                            virDomainObjPtr vm)
{
    virObjectEventPtr suspendEvent;
    int reason;
    virDomainState state = virDomainObjGetState(vm, &reason);

    if (state == VIR_DOMAIN_PAUSED) {
        /* Already flagged as a failed post-copy migration. */
        if (reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED)
            return;
    } else if (state != VIR_DOMAIN_RUNNING) {
        /* Only running or paused domains are handled here. */
        return;
    }

    VIR_WARN("Migration of domain %s failed during post-copy; "
             "leaving the domain paused", vm->def->name);

    if (state == VIR_DOMAIN_PAUSED) {
        /* CPUs are not running; just record why the domain stays paused. */
        virDomainObjSetState(vm, VIR_DOMAIN_PAUSED,
                             VIR_DOMAIN_PAUSED_POSTCOPY_FAILED);
        return;
    }

    /* The domain is running: stop guest CPUs before reporting the failure. */
    if (qemuProcessStopCPUs(driver, vm,
                            VIR_DOMAIN_PAUSED_POSTCOPY_FAILED,
                            QEMU_ASYNC_JOB_MIGRATION_IN) < 0) {
        VIR_WARN("Unable to pause guest CPUs for %s", vm->def->name);
        return;
    }

    suspendEvent = virDomainEventLifecycleNewFromObj(vm,
                                VIR_DOMAIN_EVENT_SUSPENDED,
                                VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY_FAILED);
    qemuDomainEventQueue(driver, suspendEvent);
}
static int static int
qemuMigrationSetOption(virQEMUDriverPtr driver, qemuMigrationSetOption(virQEMUDriverPtr driver,
virDomainObjPtr vm, virDomainObjPtr vm,
...@@ -4007,8 +4056,8 @@ qemuMigrationConfirmPhase(virQEMUDriverPtr driver, ...@@ -4007,8 +4056,8 @@ qemuMigrationConfirmPhase(virQEMUDriverPtr driver,
if (flags & VIR_MIGRATE_OFFLINE) if (flags & VIR_MIGRATE_OFFLINE)
goto done; goto done;
/* Did the migration go as planned? If yes, kill off the /* Did the migration go as planned? If yes, kill off the domain object.
* domain object, but if no, resume CPUs * If something failed, resume CPUs, but only if we didn't use post-copy.
*/ */
if (retcode == 0) { if (retcode == 0) {
/* If guest uses SPICE and supports seamless migration we have to hold /* If guest uses SPICE and supports seamless migration we have to hold
...@@ -4027,6 +4076,7 @@ qemuMigrationConfirmPhase(virQEMUDriverPtr driver, ...@@ -4027,6 +4076,7 @@ qemuMigrationConfirmPhase(virQEMUDriverPtr driver,
qemuDomainEventEmitJobCompleted(driver, vm); qemuDomainEventEmitJobCompleted(driver, vm);
} else { } else {
virErrorPtr orig_err = virSaveLastError(); virErrorPtr orig_err = virSaveLastError();
int reason;
/* cancel any outstanding NBD jobs */ /* cancel any outstanding NBD jobs */
qemuMigrationCancelDriveMirror(driver, vm, false, qemuMigrationCancelDriveMirror(driver, vm, false,
...@@ -4035,7 +4085,10 @@ qemuMigrationConfirmPhase(virQEMUDriverPtr driver, ...@@ -4035,7 +4085,10 @@ qemuMigrationConfirmPhase(virQEMUDriverPtr driver,
virSetError(orig_err); virSetError(orig_err);
virFreeError(orig_err); virFreeError(orig_err);
if (qemuMigrationRestoreDomainState(conn, vm)) { if (virDomainObjGetState(vm, &reason) == VIR_DOMAIN_PAUSED &&
reason == VIR_DOMAIN_PAUSED_POSTCOPY) {
qemuMigrationPostcopyFailed(driver, vm);
} else if (qemuMigrationRestoreDomainState(conn, vm)) {
event = virDomainEventLifecycleNewFromObj(vm, event = virDomainEventLifecycleNewFromObj(vm,
VIR_DOMAIN_EVENT_RESUMED, VIR_DOMAIN_EVENT_RESUMED,
VIR_DOMAIN_EVENT_RESUMED_MIGRATED); VIR_DOMAIN_EVENT_RESUMED_MIGRATED);
...@@ -5871,6 +5924,7 @@ qemuMigrationFinish(virQEMUDriverPtr driver, ...@@ -5871,6 +5924,7 @@ qemuMigrationFinish(virQEMUDriverPtr driver,
int rc; int rc;
qemuDomainJobInfoPtr jobInfo = NULL; qemuDomainJobInfoPtr jobInfo = NULL;
bool inPostCopy = false; bool inPostCopy = false;
bool kill = true;
VIR_DEBUG("driver=%p, dconn=%p, vm=%p, cookiein=%s, cookieinlen=%d, " VIR_DEBUG("driver=%p, dconn=%p, vm=%p, cookiein=%s, cookieinlen=%d, "
"cookieout=%p, cookieoutlen=%p, flags=%lx, retcode=%d", "cookieout=%p, cookieoutlen=%p, flags=%lx, retcode=%d",
...@@ -6018,6 +6072,7 @@ qemuMigrationFinish(virQEMUDriverPtr driver, ...@@ -6018,6 +6072,7 @@ qemuMigrationFinish(virQEMUDriverPtr driver,
} }
if (inPostCopy) { if (inPostCopy) {
kill = false;
event = virDomainEventLifecycleNewFromObj(vm, event = virDomainEventLifecycleNewFromObj(vm,
VIR_DOMAIN_EVENT_RESUMED, VIR_DOMAIN_EVENT_RESUMED,
VIR_DOMAIN_EVENT_RESUMED_POSTCOPY); VIR_DOMAIN_EVENT_RESUMED_POSTCOPY);
...@@ -6077,14 +6132,18 @@ qemuMigrationFinish(virQEMUDriverPtr driver, ...@@ -6077,14 +6132,18 @@ qemuMigrationFinish(virQEMUDriverPtr driver,
if (!dom && if (!dom &&
!(flags & VIR_MIGRATE_OFFLINE) && !(flags & VIR_MIGRATE_OFFLINE) &&
virDomainObjIsActive(vm)) { virDomainObjIsActive(vm)) {
qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, if (kill) {
QEMU_ASYNC_JOB_MIGRATION_IN, qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED,
VIR_QEMU_PROCESS_STOP_MIGRATED); QEMU_ASYNC_JOB_MIGRATION_IN,
virDomainAuditStop(vm, "failed"); VIR_QEMU_PROCESS_STOP_MIGRATED);
event = virDomainEventLifecycleNewFromObj(vm, virDomainAuditStop(vm, "failed");
VIR_DOMAIN_EVENT_STOPPED, event = virDomainEventLifecycleNewFromObj(vm,
VIR_DOMAIN_EVENT_STOPPED_FAILED); VIR_DOMAIN_EVENT_STOPPED,
qemuDomainEventQueue(driver, event); VIR_DOMAIN_EVENT_STOPPED_FAILED);
qemuDomainEventQueue(driver, event);
} else {
qemuMigrationPostcopyFailed(driver, vm);
}
} }
if (dom) { if (dom) {
......
...@@ -213,4 +213,7 @@ int qemuMigrationRunIncoming(virQEMUDriverPtr driver, ...@@ -213,4 +213,7 @@ int qemuMigrationRunIncoming(virQEMUDriverPtr driver,
const char *uri, const char *uri,
qemuDomainAsyncJob asyncJob); qemuDomainAsyncJob asyncJob);
/* Called when a migration fails in post-copy mode: a running or paused
 * domain is left paused with the VIR_DOMAIN_PAUSED_POSTCOPY_FAILED reason;
 * domains in any other state are left untouched. */
void qemuMigrationPostcopyFailed(virQEMUDriverPtr driver,
virDomainObjPtr vm);
#endif /* __QEMU_MIGRATION_H__ */ #endif /* __QEMU_MIGRATION_H__ */
...@@ -3139,8 +3139,13 @@ qemuProcessRecoverMigrationIn(virQEMUDriverPtr driver, ...@@ -3139,8 +3139,13 @@ qemuProcessRecoverMigrationIn(virQEMUDriverPtr driver,
virConnectPtr conn, virConnectPtr conn,
qemuMigrationJobPhase phase, qemuMigrationJobPhase phase,
virDomainState state, virDomainState state,
int reason ATTRIBUTE_UNUSED) int reason)
{ {
bool postcopy = (state == VIR_DOMAIN_PAUSED &&
reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED) ||
(state == VIR_DOMAIN_RUNNING &&
reason == VIR_DOMAIN_RUNNING_POSTCOPY);
switch (phase) { switch (phase) {
case QEMU_MIGRATION_PHASE_NONE: case QEMU_MIGRATION_PHASE_NONE:
case QEMU_MIGRATION_PHASE_PERFORM2: case QEMU_MIGRATION_PHASE_PERFORM2:
...@@ -3173,8 +3178,10 @@ qemuProcessRecoverMigrationIn(virQEMUDriverPtr driver, ...@@ -3173,8 +3178,10 @@ qemuProcessRecoverMigrationIn(virQEMUDriverPtr driver,
case QEMU_MIGRATION_PHASE_FINISH3: case QEMU_MIGRATION_PHASE_FINISH3:
/* migration finished, we started resuming the domain but didn't /* migration finished, we started resuming the domain but didn't
* confirm success or failure yet; killing it seems safest unless * confirm success or failure yet; killing it seems safest unless
* we already started guest CPUs */ * we already started guest CPUs or we were in post-copy mode */
if (state != VIR_DOMAIN_RUNNING) { if (postcopy) {
qemuMigrationPostcopyFailed(driver, vm);
} else if (state != VIR_DOMAIN_RUNNING) {
VIR_DEBUG("Killing migrated domain %s", vm->def->name); VIR_DEBUG("Killing migrated domain %s", vm->def->name);
return -1; return -1;
} }
...@@ -3192,6 +3199,10 @@ qemuProcessRecoverMigrationOut(virQEMUDriverPtr driver, ...@@ -3192,6 +3199,10 @@ qemuProcessRecoverMigrationOut(virQEMUDriverPtr driver,
virDomainState state, virDomainState state,
int reason) int reason)
{ {
bool postcopy = state == VIR_DOMAIN_PAUSED &&
(reason == VIR_DOMAIN_PAUSED_POSTCOPY ||
reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED);
switch (phase) { switch (phase) {
case QEMU_MIGRATION_PHASE_NONE: case QEMU_MIGRATION_PHASE_NONE:
case QEMU_MIGRATION_PHASE_PREPARE: case QEMU_MIGRATION_PHASE_PREPARE:
...@@ -3209,26 +3220,44 @@ qemuProcessRecoverMigrationOut(virQEMUDriverPtr driver, ...@@ -3209,26 +3220,44 @@ qemuProcessRecoverMigrationOut(virQEMUDriverPtr driver,
case QEMU_MIGRATION_PHASE_PERFORM2: case QEMU_MIGRATION_PHASE_PERFORM2:
case QEMU_MIGRATION_PHASE_PERFORM3: case QEMU_MIGRATION_PHASE_PERFORM3:
/* migration is still in progress, let's cancel it and resume the /* migration is still in progress, let's cancel it and resume the
* domain */ * domain; however we can only do that before migration enters
VIR_DEBUG("Cancelling unfinished migration of domain %s", * post-copy mode
vm->def->name); */
if (qemuMigrationCancel(driver, vm) < 0) { if (postcopy) {
VIR_WARN("Could not cancel ongoing migration of domain %s", qemuMigrationPostcopyFailed(driver, vm);
vm->def->name); } else {
VIR_DEBUG("Cancelling unfinished migration of domain %s",
vm->def->name);
if (qemuMigrationCancel(driver, vm) < 0) {
VIR_WARN("Could not cancel ongoing migration of domain %s",
vm->def->name);
}
goto resume;
} }
goto resume; break;
case QEMU_MIGRATION_PHASE_PERFORM3_DONE: case QEMU_MIGRATION_PHASE_PERFORM3_DONE:
/* migration finished but we didn't have a chance to get the result /* migration finished but we didn't have a chance to get the result
* of Finish3 step; third party needs to check what to do next * of Finish3 step; third party needs to check what to do next; in
* post-copy mode we can use PAUSED_POSTCOPY_FAILED state for this
*/ */
if (postcopy)
qemuMigrationPostcopyFailed(driver, vm);
break; break;
case QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED: case QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED:
/* Finish3 failed, we need to resume the domain */ /* Finish3 failed, we need to resume the domain, but once we enter
VIR_DEBUG("Resuming domain %s after failed migration", * post-copy mode there's no way back, so let's just mark the domain
vm->def->name); * as broken in that case
goto resume; */
if (postcopy) {
qemuMigrationPostcopyFailed(driver, vm);
} else {
VIR_DEBUG("Resuming domain %s after failed migration",
vm->def->name);
goto resume;
}
break;
case QEMU_MIGRATION_PHASE_CONFIRM3: case QEMU_MIGRATION_PHASE_CONFIRM3:
/* migration completed, we need to kill the domain here */ /* migration completed, we need to kill the domain here */
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册