From 630517d86077403381eefcf901e4e4e99590b083 Mon Sep 17 00:00:00 2001 From: Jiri Denemark Date: Wed, 13 Jan 2016 16:29:58 +0100 Subject: [PATCH] qemu: Handle post-copy migration failures When migration fails in the post-copy mode, it's impossible to just kill the destination domain and resume the source since the source no longer contains current guest state. Let's mark domains on both sides as VIR_DOMAIN_PAUSED_POSTCOPY_FAILED to let the upper layer decide what to do with them. Signed-off-by: Jiri Denemark --- src/qemu/qemu_migration.c | 91 ++++++++++++++++++++++++++++++++------- src/qemu/qemu_migration.h | 3 ++ src/qemu/qemu_process.c | 59 ++++++++++++++++++------- 3 files changed, 122 insertions(+), 31 deletions(-) diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 3246eecb1f..1e00602fd1 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -1475,14 +1475,21 @@ qemuMigrationRestoreDomainState(virConnectPtr conn, virDomainObjPtr vm) { virQEMUDriverPtr driver = conn->privateData; qemuDomainObjPrivatePtr priv = vm->privateData; - int state = virDomainObjGetState(vm, NULL); + int reason; + virDomainState state = virDomainObjGetState(vm, &reason); bool ret = false; - VIR_DEBUG("driver=%p, vm=%p, pre-mig-state=%d, state=%d", - driver, vm, priv->preMigrationState, state); + VIR_DEBUG("driver=%p, vm=%p, pre-mig-state=%s, state=%s, reason=%s", + driver, vm, + virDomainStateTypeToString(priv->preMigrationState), + virDomainStateTypeToString(state), + virDomainStateReasonToString(state, reason)); - if (state == VIR_DOMAIN_PAUSED && - priv->preMigrationState == VIR_DOMAIN_RUNNING) { + if (state != VIR_DOMAIN_PAUSED || + reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED) + goto cleanup; + + if (priv->preMigrationState == VIR_DOMAIN_RUNNING) { /* This is basically the only restore possibility that's safe * and we should attempt to do */ @@ -2375,6 +2382,48 @@ qemuMigrationSetOffline(virQEMUDriverPtr driver, return ret; } + +void +qemuMigrationPostcopyFailed(virQEMUDriverPtr driver, + virDomainObjPtr vm) +{ + virDomainState state; + int reason; + + state = virDomainObjGetState(vm, &reason); + + if (state != VIR_DOMAIN_PAUSED && + state != VIR_DOMAIN_RUNNING) + return; + + if (state == VIR_DOMAIN_PAUSED && + reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED) + return; + + VIR_WARN("Migration of domain %s failed during post-copy; " + "leaving the domain paused", vm->def->name); + + if (state == VIR_DOMAIN_RUNNING) { + virObjectEventPtr event; + + if (qemuProcessStopCPUs(driver, vm, + VIR_DOMAIN_PAUSED_POSTCOPY_FAILED, + QEMU_ASYNC_JOB_MIGRATION_IN) < 0) { + VIR_WARN("Unable to pause guest CPUs for %s", vm->def->name); + return; + } + + event = virDomainEventLifecycleNewFromObj(vm, + VIR_DOMAIN_EVENT_SUSPENDED, + VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY_FAILED); + qemuDomainEventQueue(driver, event); + } else { + virDomainObjSetState(vm, VIR_DOMAIN_PAUSED, + VIR_DOMAIN_PAUSED_POSTCOPY_FAILED); + } +} + + static int qemuMigrationSetOption(virQEMUDriverPtr driver, virDomainObjPtr vm, @@ -4007,8 +4056,8 @@ qemuMigrationConfirmPhase(virQEMUDriverPtr driver, if (flags & VIR_MIGRATE_OFFLINE) goto done; - /* Did the migration go as planned? If yes, kill off the - * domain object, but if no, resume CPUs + /* Did the migration go as planned? If yes, kill off the domain object. + * If something failed, resume CPUs, but only if we didn't use post-copy. */ if (retcode == 0) { /* If guest uses SPICE and supports seamless migration we have to hold @@ -4027,6 +4076,7 @@ qemuMigrationConfirmPhase(virQEMUDriverPtr driver, qemuDomainEventEmitJobCompleted(driver, vm); } else { virErrorPtr orig_err = virSaveLastError(); + int reason; /* cancel any outstanding NBD jobs */ qemuMigrationCancelDriveMirror(driver, vm, false, @@ -4035,7 +4085,10 @@ qemuMigrationConfirmPhase(virQEMUDriverPtr driver, virSetError(orig_err); virFreeError(orig_err); - if (qemuMigrationRestoreDomainState(conn, vm)) { + if (virDomainObjGetState(vm, &reason) == VIR_DOMAIN_PAUSED && + reason == VIR_DOMAIN_PAUSED_POSTCOPY) { + qemuMigrationPostcopyFailed(driver, vm); + } else if (qemuMigrationRestoreDomainState(conn, vm)) { event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_RESUMED, VIR_DOMAIN_EVENT_RESUMED_MIGRATED); @@ -5871,6 +5924,7 @@ qemuMigrationFinish(virQEMUDriverPtr driver, int rc; qemuDomainJobInfoPtr jobInfo = NULL; bool inPostCopy = false; + bool kill = true; VIR_DEBUG("driver=%p, dconn=%p, vm=%p, cookiein=%s, cookieinlen=%d, " "cookieout=%p, cookieoutlen=%p, flags=%lx, retcode=%d", @@ -6018,6 +6072,7 @@ qemuMigrationFinish(virQEMUDriverPtr driver, } if (inPostCopy) { + kill = false; event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_RESUMED, VIR_DOMAIN_EVENT_RESUMED_POSTCOPY); @@ -6077,14 +6132,18 @@ qemuMigrationFinish(virQEMUDriverPtr driver, if (!dom && !(flags & VIR_MIGRATE_OFFLINE) && virDomainObjIsActive(vm)) { - qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, - QEMU_ASYNC_JOB_MIGRATION_IN, - VIR_QEMU_PROCESS_STOP_MIGRATED); - virDomainAuditStop(vm, "failed"); - event = virDomainEventLifecycleNewFromObj(vm, - VIR_DOMAIN_EVENT_STOPPED, - VIR_DOMAIN_EVENT_STOPPED_FAILED); - qemuDomainEventQueue(driver, event); + if (kill) { + qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, + QEMU_ASYNC_JOB_MIGRATION_IN, + VIR_QEMU_PROCESS_STOP_MIGRATED); + virDomainAuditStop(vm, "failed"); + event = virDomainEventLifecycleNewFromObj(vm, + VIR_DOMAIN_EVENT_STOPPED, + VIR_DOMAIN_EVENT_STOPPED_FAILED); + qemuDomainEventQueue(driver, event); + } else { + qemuMigrationPostcopyFailed(driver, vm); + } } if (dom) { diff --git a/src/qemu/qemu_migration.h b/src/qemu/qemu_migration.h index 5951a44981..a927beafad 100644 --- a/src/qemu/qemu_migration.h +++ b/src/qemu/qemu_migration.h @@ -213,4 +213,7 @@ int qemuMigrationRunIncoming(virQEMUDriverPtr driver, const char *uri, qemuDomainAsyncJob asyncJob); +void qemuMigrationPostcopyFailed(virQEMUDriverPtr driver, + virDomainObjPtr vm); + #endif /* __QEMU_MIGRATION_H__ */ diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 0fb342a95e..c332747862 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3139,8 +3139,13 @@ qemuProcessRecoverMigrationIn(virQEMUDriverPtr driver, virConnectPtr conn, qemuMigrationJobPhase phase, virDomainState state, - int reason ATTRIBUTE_UNUSED) + int reason) { + bool postcopy = (state == VIR_DOMAIN_PAUSED && + reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED) || + (state == VIR_DOMAIN_RUNNING && + reason == VIR_DOMAIN_RUNNING_POSTCOPY); + switch (phase) { case QEMU_MIGRATION_PHASE_NONE: case QEMU_MIGRATION_PHASE_PERFORM2: @@ -3173,8 +3178,10 @@ qemuProcessRecoverMigrationIn(virQEMUDriverPtr driver, case QEMU_MIGRATION_PHASE_FINISH3: /* migration finished, we started resuming the domain but didn't * confirm success or failure yet; killing it seems safest unless - * we already started guest CPUs */ - if (state != VIR_DOMAIN_RUNNING) { + * we already started guest CPUs or we were in post-copy mode */ + if (postcopy) { + qemuMigrationPostcopyFailed(driver, vm); + } else if (state != VIR_DOMAIN_RUNNING) { VIR_DEBUG("Killing migrated domain %s", vm->def->name); return -1; } @@ -3192,6 +3199,10 @@ qemuProcessRecoverMigrationOut(virQEMUDriverPtr driver, virDomainState state, int reason) { + bool postcopy = state == VIR_DOMAIN_PAUSED && + (reason == VIR_DOMAIN_PAUSED_POSTCOPY || + reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED); + switch (phase) { case QEMU_MIGRATION_PHASE_NONE: case QEMU_MIGRATION_PHASE_PREPARE: @@ -3209,26 +3220,44 @@ qemuProcessRecoverMigrationOut(virQEMUDriverPtr driver, case QEMU_MIGRATION_PHASE_PERFORM2: case QEMU_MIGRATION_PHASE_PERFORM3: /* migration is still in progress, let's cancel it and resume the - * domain */ - VIR_DEBUG("Cancelling unfinished migration of domain %s", - vm->def->name); - if (qemuMigrationCancel(driver, vm) < 0) { - VIR_WARN("Could not cancel ongoing migration of domain %s", - vm->def->name); + * domain; however we can only do that before migration enters + * post-copy mode + */ + if (postcopy) { + qemuMigrationPostcopyFailed(driver, vm); + } else { + VIR_DEBUG("Cancelling unfinished migration of domain %s", + vm->def->name); + if (qemuMigrationCancel(driver, vm) < 0) { + VIR_WARN("Could not cancel ongoing migration of domain %s", + vm->def->name); + } + goto resume; } - goto resume; + break; case QEMU_MIGRATION_PHASE_PERFORM3_DONE: /* migration finished but we didn't have a chance to get the result - * of Finish3 step; third party needs to check what to do next + * of Finish3 step; third party needs to check what to do next; in + * post-copy mode we can use PAUSED_POSTCOPY_FAILED state for this */ + if (postcopy) + qemuMigrationPostcopyFailed(driver, vm); break; case QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED: - /* Finish3 failed, we need to resume the domain */ - VIR_DEBUG("Resuming domain %s after failed migration", - vm->def->name); - goto resume; + /* Finish3 failed, we need to resume the domain, but once we enter + * post-copy mode there's no way back, so let's just mark the domain + * as broken in that case + */ + if (postcopy) { + qemuMigrationPostcopyFailed(driver, vm); + } else { + VIR_DEBUG("Resuming domain %s after failed migration", + vm->def->name); + goto resume; + } + break; case QEMU_MIGRATION_PHASE_CONFIRM3: /* migration completed, we need to kill the domain here */ -- GitLab