diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 3246eecb1ffff3dc2d95e9ef8ed9c4f6c0bf7752..1e00602fd137230f8fff52d333e088b3e7aab35d 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -1475,14 +1475,21 @@ qemuMigrationRestoreDomainState(virConnectPtr conn, virDomainObjPtr vm) { virQEMUDriverPtr driver = conn->privateData; qemuDomainObjPrivatePtr priv = vm->privateData; - int state = virDomainObjGetState(vm, NULL); + int reason; + virDomainState state = virDomainObjGetState(vm, &reason); bool ret = false; - VIR_DEBUG("driver=%p, vm=%p, pre-mig-state=%d, state=%d", - driver, vm, priv->preMigrationState, state); + VIR_DEBUG("driver=%p, vm=%p, pre-mig-state=%s, state=%s, reason=%s", + driver, vm, + virDomainStateTypeToString(priv->preMigrationState), + virDomainStateTypeToString(state), + virDomainStateReasonToString(state, reason)); - if (state == VIR_DOMAIN_PAUSED && - priv->preMigrationState == VIR_DOMAIN_RUNNING) { + if (state != VIR_DOMAIN_PAUSED || + reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED) + goto cleanup; + + if (priv->preMigrationState == VIR_DOMAIN_RUNNING) { /* This is basically the only restore possibility that's safe * and we should attempt to do */ @@ -2375,6 +2382,48 @@ qemuMigrationSetOffline(virQEMUDriverPtr driver, return ret; } +/* qemuMigrationPostcopyFailed: record on @vm that its migration failed during post-copy. Reads the current state and reason of @vm and returns without changes when the domain is neither paused nor running, or when it is already paused with reason VIR_DOMAIN_PAUSED_POSTCOPY_FAILED. Otherwise a warning is logged; a running domain has qemuProcessStopCPUs called on it with the POSTCOPY_FAILED reason followed by a SUSPENDED/POSTCOPY_FAILED lifecycle event, while a paused domain only has its state set to PAUSED with the POSTCOPY_FAILED reason. @driver is handed to qemuProcessStopCPUs and qemuDomainEventQueue; @vm is the domain object being updated. Nothing is returned: a failure to stop the CPUs is only logged. */ +void +qemuMigrationPostcopyFailed(virQEMUDriverPtr driver, + virDomainObjPtr vm) +{ + virDomainState state; + int reason; + + state = virDomainObjGetState(vm, &reason); +/* Domains that are neither paused nor running are left untouched. */ + if (state != VIR_DOMAIN_PAUSED && + state != VIR_DOMAIN_RUNNING) + return; +/* Already paused with the POSTCOPY_FAILED reason: nothing more to do. */ + if (state == VIR_DOMAIN_PAUSED && + reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED) + return; + + VIR_WARN("Migration of domain %s failed during post-copy; " + "leaving the domain paused", vm->def->name); +/* Running domain: stop its CPUs and emit an event. Paused domain: only update the state reason. */ + if (state == VIR_DOMAIN_RUNNING) { + virObjectEventPtr event; + + if (qemuProcessStopCPUs(driver, vm, + VIR_DOMAIN_PAUSED_POSTCOPY_FAILED, + QEMU_ASYNC_JOB_MIGRATION_IN) < 0) { + VIR_WARN("Unable to pause guest CPUs for %s", 
vm->def->name); + return; + } +/* qemuProcessStopCPUs did not report an error; queue the SUSPENDED lifecycle event. */ + event = virDomainEventLifecycleNewFromObj(vm, + VIR_DOMAIN_EVENT_SUSPENDED, + VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY_FAILED); + qemuDomainEventQueue(driver, event); + } else { + virDomainObjSetState(vm, VIR_DOMAIN_PAUSED, + VIR_DOMAIN_PAUSED_POSTCOPY_FAILED); + } +} + + static int qemuMigrationSetOption(virQEMUDriverPtr driver, virDomainObjPtr vm, @@ -4007,8 +4056,8 @@ qemuMigrationConfirmPhase(virQEMUDriverPtr driver, if (flags & VIR_MIGRATE_OFFLINE) goto done; - /* Did the migration go as planned? If yes, kill off the - * domain object, but if no, resume CPUs + /* Did the migration go as planned? If yes, kill off the domain object. + * If something failed, resume CPUs, but only if we didn't use post-copy. */ if (retcode == 0) { /* If guest uses SPICE and supports seamless migration we have to hold @@ -4027,6 +4076,7 @@ qemuMigrationConfirmPhase(virQEMUDriverPtr driver, qemuDomainEventEmitJobCompleted(driver, vm); } else { virErrorPtr orig_err = virSaveLastError(); + int reason; /* cancel any outstanding NBD jobs */ qemuMigrationCancelDriveMirror(driver, vm, false, @@ -4035,7 +4085,10 @@ qemuMigrationConfirmPhase(virQEMUDriverPtr driver, virSetError(orig_err); virFreeError(orig_err); - if (qemuMigrationRestoreDomainState(conn, vm)) { + if (virDomainObjGetState(vm, &reason) == VIR_DOMAIN_PAUSED && + reason == VIR_DOMAIN_PAUSED_POSTCOPY) { + qemuMigrationPostcopyFailed(driver, vm); + } else if (qemuMigrationRestoreDomainState(conn, vm)) { event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_RESUMED, VIR_DOMAIN_EVENT_RESUMED_MIGRATED); @@ -5871,6 +5924,7 @@ qemuMigrationFinish(virQEMUDriverPtr driver, int rc; qemuDomainJobInfoPtr jobInfo = NULL; bool inPostCopy = false; + bool kill = true; VIR_DEBUG("driver=%p, dconn=%p, vm=%p, cookiein=%s, cookieinlen=%d, " "cookieout=%p, cookieoutlen=%p, flags=%lx, retcode=%d", @@ -6018,6 +6072,7 @@ qemuMigrationFinish(virQEMUDriverPtr driver, } if (inPostCopy) { + kill = false; event = 
virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_RESUMED, VIR_DOMAIN_EVENT_RESUMED_POSTCOPY); @@ -6077,14 +6132,18 @@ qemuMigrationFinish(virQEMUDriverPtr driver, if (!dom && !(flags & VIR_MIGRATE_OFFLINE) && virDomainObjIsActive(vm)) { - qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, - QEMU_ASYNC_JOB_MIGRATION_IN, - VIR_QEMU_PROCESS_STOP_MIGRATED); - virDomainAuditStop(vm, "failed"); - event = virDomainEventLifecycleNewFromObj(vm, - VIR_DOMAIN_EVENT_STOPPED, - VIR_DOMAIN_EVENT_STOPPED_FAILED); - qemuDomainEventQueue(driver, event); + if (kill) { + qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, + QEMU_ASYNC_JOB_MIGRATION_IN, + VIR_QEMU_PROCESS_STOP_MIGRATED); + virDomainAuditStop(vm, "failed"); + event = virDomainEventLifecycleNewFromObj(vm, + VIR_DOMAIN_EVENT_STOPPED, + VIR_DOMAIN_EVENT_STOPPED_FAILED); + qemuDomainEventQueue(driver, event); + } else { + qemuMigrationPostcopyFailed(driver, vm); + } } if (dom) { diff --git a/src/qemu/qemu_migration.h b/src/qemu/qemu_migration.h index 5951a44981d85ca76ac249d725b2b7c5c51394b8..a927beafad47c0ebf4d9d4c57773e8931f9b03be 100644 --- a/src/qemu/qemu_migration.h +++ b/src/qemu/qemu_migration.h @@ -213,4 +213,7 @@ int qemuMigrationRunIncoming(virQEMUDriverPtr driver, const char *uri, qemuDomainAsyncJob asyncJob); +void qemuMigrationPostcopyFailed(virQEMUDriverPtr driver, + virDomainObjPtr vm); + #endif /* __QEMU_MIGRATION_H__ */ diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 0fb342a95e829bed60e4c39f63645d0eee331fc9..c3327478624a7b9834ae5bbd36c2fbe1111c5e1a 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3139,8 +3139,13 @@ qemuProcessRecoverMigrationIn(virQEMUDriverPtr driver, virConnectPtr conn, qemuMigrationJobPhase phase, virDomainState state, - int reason ATTRIBUTE_UNUSED) + int reason) { + bool postcopy = (state == VIR_DOMAIN_PAUSED && + reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED) || + (state == VIR_DOMAIN_RUNNING && + reason == 
VIR_DOMAIN_RUNNING_POSTCOPY); + switch (phase) { case QEMU_MIGRATION_PHASE_NONE: case QEMU_MIGRATION_PHASE_PERFORM2: @@ -3173,8 +3178,10 @@ qemuProcessRecoverMigrationIn(virQEMUDriverPtr driver, case QEMU_MIGRATION_PHASE_FINISH3: /* migration finished, we started resuming the domain but didn't * confirm success or failure yet; killing it seems safest unless - * we already started guest CPUs */ - if (state != VIR_DOMAIN_RUNNING) { + * we already started guest CPUs or we were in post-copy mode */ + if (postcopy) { + qemuMigrationPostcopyFailed(driver, vm); + } else if (state != VIR_DOMAIN_RUNNING) { VIR_DEBUG("Killing migrated domain %s", vm->def->name); return -1; } @@ -3192,6 +3199,10 @@ qemuProcessRecoverMigrationOut(virQEMUDriverPtr driver, virDomainState state, int reason) { + bool postcopy = state == VIR_DOMAIN_PAUSED && + (reason == VIR_DOMAIN_PAUSED_POSTCOPY || + reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED); + switch (phase) { case QEMU_MIGRATION_PHASE_NONE: case QEMU_MIGRATION_PHASE_PREPARE: @@ -3209,26 +3220,44 @@ qemuProcessRecoverMigrationOut(virQEMUDriverPtr driver, case QEMU_MIGRATION_PHASE_PERFORM2: case QEMU_MIGRATION_PHASE_PERFORM3: /* migration is still in progress, let's cancel it and resume the - * domain */ - VIR_DEBUG("Cancelling unfinished migration of domain %s", - vm->def->name); - if (qemuMigrationCancel(driver, vm) < 0) { - VIR_WARN("Could not cancel ongoing migration of domain %s", - vm->def->name); + * domain; however we can only do that before migration enters + * post-copy mode + */ + if (postcopy) { + qemuMigrationPostcopyFailed(driver, vm); + } else { + VIR_DEBUG("Cancelling unfinished migration of domain %s", + vm->def->name); + if (qemuMigrationCancel(driver, vm) < 0) { + VIR_WARN("Could not cancel ongoing migration of domain %s", + vm->def->name); + } + goto resume; } - goto resume; + break; case QEMU_MIGRATION_PHASE_PERFORM3_DONE: /* migration finished but we didn't have a chance to get the result - * of Finish3 step; third party 
needs to check what to do next + * of Finish3 step; third party needs to check what to do next; in + * post-copy mode we can use PAUSED_POSTCOPY_FAILED state for this */ + if (postcopy) + qemuMigrationPostcopyFailed(driver, vm); break; case QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED: - /* Finish3 failed, we need to resume the domain */ - VIR_DEBUG("Resuming domain %s after failed migration", - vm->def->name); - goto resume; + /* Finish3 failed, we need to resume the domain, but once we enter + * post-copy mode there's no way back, so let's just mark the domain + * as broken in that case + */ + if (postcopy) { + qemuMigrationPostcopyFailed(driver, vm); + } else { + VIR_DEBUG("Resuming domain %s after failed migration", + vm->def->name); + goto resume; + } + break; case QEMU_MIGRATION_PHASE_CONFIRM3: /* migration completed, we need to kill the domain here */