cpus.c 43.9 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Needed early for CONFIG_BSD etc. */
P
Peter Maydell 已提交
26
#include "qemu/osdep.h"
27

28
#include "monitor/monitor.h"
W
Wenchao Xia 已提交
29
#include "qapi/qmp/qerror.h"
30
#include "qemu/error-report.h"
31
#include "sysemu/sysemu.h"
32
#include "sysemu/block-backend.h"
33
#include "exec/gdbstub.h"
34 35
#include "sysemu/dma.h"
#include "sysemu/kvm.h"
L
Luiz Capitulino 已提交
36
#include "qmp-commands.h"
37

38
#include "qemu/thread.h"
39 40
#include "sysemu/cpus.h"
#include "sysemu/qtest.h"
41 42
#include "qemu/main-loop.h"
#include "qemu/bitmap.h"
43
#include "qemu/seqlock.h"
W
Wenchao Xia 已提交
44
#include "qapi-event.h"
45
#include "hw/nmi.h"
46
#include "sysemu/replay.h"
J
Jan Kiszka 已提交
47 48

#ifndef _WIN32
49
#include "qemu/compatfd.h"
J
Jan Kiszka 已提交
50
#endif
51

52 53 54 55
#ifdef CONFIG_LINUX

#include <sys/prctl.h>

M
Marcelo Tosatti 已提交
56 57 58 59
#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

60 61 62 63 64 65 66 67 68 69
#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */

70
/* Round-robin scheduling state: next CPU the single TCG thread will run. */
static CPUState *next_cpu;
/* icount auto-tuning statistics, exported for "info jit"-style reporting. */
int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
static QEMUTimer *throttle_timer;
/* Current throttle percentage; accessed with atomic_read/atomic_set. */
static unsigned int throttle_percentage;

#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
#define CPU_THROTTLE_TIMESLICE_NS 10000000

82 83 84 85 86
/* Return true if @cpu must not execute guest code: either the whole VM
 * is not in the running state, or this vCPU was individually stopped. */
bool cpu_is_stopped(CPUState *cpu)
{
    if (!runstate_is_running()) {
        return true;
    }
    return cpu->stopped;
}

87
/* Return true if @cpu's thread may block waiting for an event: it has no
 * stop request pending, no queued work, and nothing to execute. */
static bool cpu_thread_is_idle(CPUState *cpu)
{
    /* A pending stop request or queued work item must be serviced first. */
    if (cpu->stop || cpu->queued_work_first) {
        return false;
    }
    /* An already-stopped CPU (or stopped VM) has nothing to do. */
    if (cpu_is_stopped(cpu)) {
        return true;
    }
    /* Not halted, has work pending, or the halt is emulated inside the
     * kernel (kvm_halt_in_kernel) -> the thread must keep running. */
    if (!cpu->halted || cpu_has_work(cpu) ||
        kvm_halt_in_kernel()) {
        return false;
    }
    return true;
}

/* Return true only if every vCPU is idle per cpu_thread_is_idle().
 * Used to decide whether the TCG thread may sleep / icount may warp. */
static bool all_cpu_threads_idle(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu_thread_is_idle(cpu)) {
            return false;
        }
    }
    return true;
}

P
Paolo Bonzini 已提交
114 115 116
/***********************************************************/
/* guest cycle counter */

/* Protected by TimersState seqlock */

/* Whether vCPUs really sleep while warping QEMU_CLOCK_VIRTUAL (icount
 * sleep=on, the default); when false the clock jumps straight to the
 * next deadline. */
static bool icount_sleep = true;
/* QEMU_CLOCK_VIRTUAL_RT value at which the current warp started, or -1
 * when no warp is in progress. */
static int64_t vm_clock_warp_start = -1;
/* Conversion factor from emulated instructions to virtual clock ticks.  */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
#define MAX_ICOUNT_SHIFT 10

static QEMUTimer *icount_rt_timer;
static QEMUTimer *icount_vm_timer;
static QEMUTimer *icount_warp_timer;

/* Global timekeeping state shared by the tick, clock and icount code. */
typedef struct TimersState {
    /* Protected by BQL.  */
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;

    /* cpu_clock_offset can be read out of BQL, so protect it with
     * this lock.
     */
    QemuSeqLock vm_clock_seqlock;
    int64_t cpu_clock_offset;
    int32_t cpu_ticks_enabled;
    /* Kept only for migration-format compatibility (see vmstate_timers). */
    int64_t dummy;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;
    /* Only written by TCG thread */
    int64_t qemu_icount;
} TimersState;

static TimersState timers_state;
P
Paolo Bonzini 已提交
150

151
/* Return the raw instruction counter: instructions executed by all CPUs
 * so far, minus the part of the current TCG translation block that the
 * running CPU has not yet executed.  Aborts if called at a point where
 * the count cannot be sampled (cpu->can_do_io clear). */
int64_t cpu_get_icount_raw(void)
{
    int64_t icount;
    CPUState *cpu = current_cpu;

    icount = timers_state.qemu_icount;
    if (cpu) {
        if (!cpu->can_do_io) {
            fprintf(stderr, "Bad icount read\n");
            exit(1);
        }
        /* Subtract instructions budgeted but not yet executed. */
        icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
    }
    return icount;
}

/* Return the virtual CPU time, based on the instruction counter.
 * Caller must hold the vm_clock seqlock write side (or otherwise
 * serialize against writers); see cpu_get_icount() for the lock-free
 * reader. */
static int64_t cpu_get_icount_locked(void)
{
    int64_t icount = cpu_get_icount_raw();
    return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
}

P
Paolo Bonzini 已提交
174 175 176 177 178 179 180 181 182 183 184 185 186
/* Lock-free reader of the icount-based virtual clock: retries the read
 * under the seqlock until it observes a consistent snapshot. */
int64_t cpu_get_icount(void)
{
    int64_t icount;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        icount = cpu_get_icount_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return icount;
}

187 188 189 190 191
/* Convert an instruction count to nanoseconds of virtual time by the
 * current icount_time_shift scale.
 * NOTE(review): left-shifting a negative icount is technically UB in C;
 * callers appear to pass non-negative deltas — confirm before changing. */
int64_t cpu_icount_to_ns(int64_t icount)
{
    return icount << icount_time_shift;
}

P
Paolo Bonzini 已提交
192
/* return the host CPU cycle counter and handle stop/restart */
/* Caller must hold the BQL */
int64_t cpu_get_ticks(void)
{
    int64_t ticks;

    /* In icount mode the "tick" counter is virtual time. */
    if (use_icount) {
        return cpu_get_icount();
    }

    ticks = timers_state.cpu_ticks_offset;
    if (timers_state.cpu_ticks_enabled) {
        ticks += cpu_get_host_ticks();
    }

    if (timers_state.cpu_ticks_prev > ticks) {
        /* Note: non increasing ticks may happen if the host uses
           software suspend */
        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
        ticks = timers_state.cpu_ticks_prev;
    }

    timers_state.cpu_ticks_prev = ticks;
    return ticks;
}

218
/* Monotonic host clock adjusted by the stop/start offset.  Caller must
 * serialize against writers of cpu_clock_offset (seqlock write side or
 * BQL); see cpu_get_clock() for the lock-free reader. */
static int64_t cpu_get_clock_locked(void)
{
    int64_t ticks;

    ticks = timers_state.cpu_clock_offset;
    if (timers_state.cpu_ticks_enabled) {
        ticks += get_clock();
    }

    return ticks;
}

/* return the host CPU monotonic timer and handle stop/restart */
int64_t cpu_get_clock(void)
{
    int64_t ti;
    unsigned start;

    /* Seqlock read loop: retry until a consistent snapshot is seen. */
    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        ti = cpu_get_clock_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return ti;
}

244 245 246
/* enable cpu_get_ticks()
 * Caller must hold BQL, which serves as the mutex for vm_clock_seqlock.
 */
void cpu_enable_ticks(void)
{
    /* Here, the real thing protected by the seqlock is cpu_clock_offset. */
    seqlock_write_lock(&timers_state.vm_clock_seqlock);
    if (!timers_state.cpu_ticks_enabled) {
        /* Subtract the current counters so later reads resume from the
         * value observed when the clocks were disabled. */
        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
}

/* disable cpu_get_ticks() : the clock is stopped. You must not call
 * cpu_get_ticks() after that.
 * Caller must hold BQL, which serves as the mutex for vm_clock_seqlock.
 */
void cpu_disable_ticks(void)
{
    /* Here, the real thing protected by the seqlock is cpu_clock_offset. */
    seqlock_write_lock(&timers_state.vm_clock_seqlock);
    if (timers_state.cpu_ticks_enabled) {
        /* Freeze both counters at their current values. */
        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
        timers_state.cpu_clock_offset = cpu_get_clock_locked();
        timers_state.cpu_ticks_enabled = 0;
    }
    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
}

/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.  */
#define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)

/* Re-tune icount_time_shift so that virtual time tracks real time
 * (auto-shift mode).  Called from both the realtime and virtual-time
 * adjustment timers. */
static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;

    /* Protected by TimersState mutex.  */
    static int64_t last_delta;

    /* If the VM is not running, then do nothing.  */
    if (!runstate_is_running()) {
        return;
    }

    seqlock_write_lock(&timers_state.vm_clock_seqlock);
    cur_time = cpu_get_clock_locked();
    cur_icount = cpu_get_icount_locked();

    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && icount_time_shift > 0) {
        /* The guest is getting too far ahead.  Slow time down.  */
        icount_time_shift--;
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind.  Speed time up.  */
        icount_time_shift++;
    }
    last_delta = delta;
    /* Recompute the bias so the virtual clock is continuous across the
     * shift change. */
    timers_state.qemu_icount_bias = cur_icount
                              - (timers_state.qemu_icount << icount_time_shift);
    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
}

/* Realtime icount adjustment trigger: re-arms itself one second ahead on
 * QEMU_CLOCK_VIRTUAL_RT, then runs the common adjustment. */
static void icount_adjust_rt(void *opaque)
{
    timer_mod(icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_adjust();
}

/* Virtual-time icount adjustment trigger: re-arms itself 100ms ahead on
 * QEMU_CLOCK_VIRTUAL, then runs the common adjustment. */
static void icount_adjust_vm(void *opaque)
{
    timer_mod(icount_vm_timer,
                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
                   get_ticks_per_sec() / 10);
    icount_adjust();
}

/* Convert @count nanoseconds into instruction units, rounding up so a
 * partial icount "tick" still budgets one instruction slice. */
static int64_t qemu_icount_round(int64_t count)
{
    int64_t unit = 1 << icount_time_shift;

    return (count + unit - 1) >> icount_time_shift;
}

339
/* Complete a pending clock warp: advance the icount bias by the real
 * time elapsed since vm_clock_warp_start, then clear the warp state and
 * fire any QEMU_CLOCK_VIRTUAL timers that became due. */
static void icount_warp_rt(void)
{
    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
     */
    if (atomic_read(&vm_clock_warp_start) == -1) {
        return;
    }

    seqlock_write_lock(&timers_state.vm_clock_seqlock);
    if (runstate_is_running()) {
        /* In record/replay mode the warp amount comes from the log. */
        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
                                     cpu_get_clock_locked());
        int64_t warp_delta;

        warp_delta = clock - vm_clock_warp_start;
        if (use_icount == 2) {
            /*
             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
             * far ahead of real time.
             */
            int64_t cur_icount = cpu_get_icount_locked();
            int64_t delta = clock - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
        timers_state.qemu_icount_bias += warp_delta;
    }
    vm_clock_warp_start = -1;
    seqlock_write_unlock(&timers_state.vm_clock_seqlock);

    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

P
Pavel Dovgalyuk 已提交
374
/* Callback of icount_warp_timer (QEMU_CLOCK_VIRTUAL_RT): apply the
 * pending warp when the scheduled deadline arrives. */
static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}

P
Paolo Bonzini 已提交
382 383
/* qtest-only: advance QEMU_CLOCK_VIRTUAL to @dest, running every timer
 * (main loop and AioContext) that fires along the way. */
void qtest_clock_warp(int64_t dest)
{
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    AioContext *aio_context;
    assert(qtest_enabled());
    aio_context = qemu_get_aio_context();
    while (clock < dest) {
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
        /* Jump only as far as the next timer deadline (or dest). */
        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);

        seqlock_write_lock(&timers_state.vm_clock_seqlock);
        timers_state.qemu_icount_bias += warp;
        seqlock_write_unlock(&timers_state.vm_clock_seqlock);

        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    }
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
}

P
Pavel Dovgalyuk 已提交
403
/* When all vCPUs are idle in icount mode, arrange for QEMU_CLOCK_VIRTUAL
 * to move forward to the next timer deadline instead of standing still:
 * either jump immediately (sleep=off) or schedule icount_warp_timer to
 * apply the warp after the equivalent real time has passed. */
void qemu_start_warp_timer(void)
{
    int64_t clock;
    int64_t deadline;

    if (!use_icount) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
        return;
    }

    /* Warping is only valid while no vCPU can execute instructions. */
    if (!all_cpu_threads_idle()) {
        return;
    }

    if (qtest_enabled()) {
        /* When testing, qtest commands advance icount.  */
        return;
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
    if (deadline < 0) {
        /* No active timers at all: with sleep disabled the clock can
         * never advance again, so warn once. */
        static bool notified;
        if (!icount_sleep && !notified) {
            error_report("WARNING: icount sleep disabled and no active timers");
            notified = true;
        }
        return;
    }

    if (deadline > 0) {
        /*
         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
         * sleep.  Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * QEMU_CLOCK_VIRTUAL.
         */
        if (!icount_sleep) {
            /*
             * We never let VCPUs sleep in no sleep icount mode.
             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
             * It is useful when we want a deterministic execution time,
             * isolated from host latencies.
             */
            seqlock_write_lock(&timers_state.vm_clock_seqlock);
            timers_state.qemu_icount_bias += deadline;
            seqlock_write_unlock(&timers_state.vm_clock_seqlock);
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        } else {
            /*
             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
             * "real" time, (related to the time left until the next event) has
             * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
             * This avoids that the warps are visible externally; for example,
             * you will not be sending network packets continuously instead of
             * every 100ms.
             */
            seqlock_write_lock(&timers_state.vm_clock_seqlock);
            if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
                vm_clock_warp_start = clock;
            }
            seqlock_write_unlock(&timers_state.vm_clock_seqlock);
            timer_mod_anticipate(icount_warp_timer, clock + deadline);
        }
    } else if (deadline == 0) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

P
Pavel Dovgalyuk 已提交
486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507
/* Cancel any scheduled warp timer and account the warp that has already
 * elapsed, so the vCPU resumes with a consistent virtual clock. */
static void qemu_account_warp_timer(void)
{
    if (!use_icount || !icount_sleep) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
        return;
    }

    timer_del(icount_warp_timer);
    icount_warp_rt();
}

508 509 510 511 512 513 514 515 516 517 518 519
/* Migration predicate: the icount subsection is only sent when icount
 * mode is active. */
static bool icount_state_needed(void *opaque)
{
    return use_icount != 0;
}

/*
 * This is a subsection for icount migration.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    }
};

P
Paolo Bonzini 已提交
528 529 530 531
/* Migration description of TimersState; "dummy" preserves the on-wire
 * layout of an older field, icount state travels in a subsection. */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};

544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611
/* Work item run in a vCPU thread: sleep this vCPU (BQL dropped) for the
 * slice implied by the current throttle percentage. */
static void cpu_throttle_thread(void *opaque)
{
    CPUState *cpu = opaque;
    double pct;
    double throttle_ratio;
    long sleeptime_ns;

    /* Throttling may have been disabled between scheduling and running. */
    if (!cpu_throttle_get_percentage()) {
        return;
    }

    pct = (double)cpu_throttle_get_percentage()/100;
    /* sleep/run ratio such that `pct` of each timeslice is spent asleep. */
    throttle_ratio = pct / (1 - pct);
    sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);

    qemu_mutex_unlock_iothread();
    atomic_set(&cpu->throttle_thread_scheduled, 0);
    g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
    qemu_mutex_lock_iothread();
}

/* Periodic throttle tick: queue a sleep work item on every vCPU that
 * doesn't already have one pending, then re-arm for the next timeslice. */
static void cpu_throttle_timer_tick(void *opaque)
{
    CPUState *cpu;
    double pct;

    /* Stop the timer if needed */
    if (!cpu_throttle_get_percentage()) {
        return;
    }
    CPU_FOREACH(cpu) {
        /* atomic_xchg guards against scheduling the work item twice. */
        if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
            async_run_on_cpu(cpu, cpu_throttle_thread, cpu);
        }
    }

    pct = (double)cpu_throttle_get_percentage()/100;
    /* Stretch the period so the *running* portion stays a fixed slice. */
    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                                   CPU_THROTTLE_TIMESLICE_NS / (1-pct));
}

/* Set vCPU throttling to @new_throttle_pct (clamped to [1, 99]) and
 * (re)start the periodic throttle timer. */
void cpu_throttle_set(int new_throttle_pct)
{
    /* Ensure throttle percentage is within valid range */
    new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
    new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);

    atomic_set(&throttle_percentage, new_throttle_pct);

    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                                       CPU_THROTTLE_TIMESLICE_NS);
}

/* Disable throttling; the timer callback sees 0% and stops re-arming. */
void cpu_throttle_stop(void)
{
    atomic_set(&throttle_percentage, 0);
}

/* True while a non-zero throttle percentage is configured. */
bool cpu_throttle_active(void)
{
    return cpu_throttle_get_percentage() != 0;
}

/* Atomically read the current throttle percentage (0 = disabled). */
int cpu_throttle_get_percentage(void)
{
    return atomic_read(&throttle_percentage);
}

612 613 614 615
/* One-time timekeeping setup: init the vm_clock seqlock, register the
 * timers migration state, and create the throttle timer. */
void cpu_ticks_init(void)
{
    seqlock_init(&timers_state.vm_clock_seqlock, NULL);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                           cpu_throttle_timer_tick, NULL);
}

620
/* Parse -icount options (shift, sleep, align) and initialize icount
 * mode: use_icount=1 for a fixed shift, use_icount=2 for auto-tuned
 * ("shift=auto") mode with periodic adjustment timers.  Option
 * conflicts are reported through @errp. */
void configure_icount(QemuOpts *opts, Error **errp)
{
    const char *option;
    char *rem_str = NULL;

    option = qemu_opt_get(opts, "shift");
    if (!option) {
        /* "align" without "shift" is meaningless. */
        if (qemu_opt_get(opts, "align") != NULL) {
            error_setg(errp, "Please specify shift option when using align");
        }
        return;
    }

    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
    if (icount_sleep) {
        icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                         icount_timer_cb, NULL);
    }

    icount_align_option = qemu_opt_get_bool(opts, "align", false);

    if (icount_align_option && !icount_sleep) {
        error_setg(errp, "align=on and sleep=off are incompatible");
    }
    if (strcmp(option, "auto") != 0) {
        /* Fixed shift: parse it with full strtol error checking. */
        errno = 0;
        icount_time_shift = strtol(option, &rem_str, 0);
        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
            error_setg(errp, "icount: Invalid shift value");
        }
        use_icount = 1;
        return;
    } else if (icount_align_option) {
        error_setg(errp, "shift=auto and align=on are incompatible");
    } else if (!icount_sleep) {
        error_setg(errp, "shift=auto and sleep=off are incompatible");
    }

    use_icount = 2;

    /* 125MIPS seems a reasonable initial guess at the guest speed.
       It will be corrected fairly quickly anyway.  */
    icount_time_shift = 3;

    /* Have both realtime and virtual time triggers for speed adjustment.
       The realtime trigger catches emulated time passing too slowly,
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers.  */
    icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
                                   icount_adjust_rt, NULL);
    timer_mod(icount_rt_timer,
                   qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                        icount_adjust_vm, NULL);
    timer_mod(icount_vm_timer,
                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
                   get_ticks_per_sec() / 10);
}

680 681 682 683
/***********************************************************/
/* Fatal emulated-hardware error: print the message and every CPU's
 * register state to stderr, then abort.  Never returns. */
void hw_error(const char *fmt, ...)
{
    va_list ap;
    CPUState *cpu;

    va_start(ap, fmt);
    fprintf(stderr, "qemu: hardware error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    CPU_FOREACH(cpu) {
        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
    }
    va_end(ap);
    abort();
}

/* Pull the register state of every vCPU from the accelerator (e.g. KVM)
 * into QEMU's CPUState, so it can be inspected or migrated. */
void cpu_synchronize_all_states(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_state(cpu);
    }
}

/* Push every vCPU's QEMU-side register state back to the accelerator
 * after a system reset. */
void cpu_synchronize_all_post_reset(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_reset(cpu);
    }
}

/* Push every vCPU's QEMU-side register state back to the accelerator
 * after machine init / incoming migration. */
void cpu_synchronize_all_post_init(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_init(cpu);
    }
}

725
/* Stop the VM and enter run state @state: freeze the clocks, pause all
 * vCPUs, notify listeners and emit the QMP STOP event, then drain and
 * flush block devices.  Returns the blk_flush_all() result. */
static int do_vm_stop(RunState state)
{
    int ret = 0;

    if (runstate_is_running()) {
        cpu_disable_ticks();
        pause_all_vcpus();
        runstate_set(state);
        vm_state_notify(0, state);
        qapi_event_send_stop(&error_abort);
    }

    /* Flush storage even if we were already stopped. */
    bdrv_drain_all();
    ret = blk_flush_all();

    return ret;
}

743
/* A vCPU may execute guest code only when it has no pending stop
 * request and neither it nor the VM as a whole is stopped. */
static bool cpu_can_run(CPUState *cpu)
{
    return !cpu->stop && !cpu_is_stopped(cpu);
}

754
/* A vCPU hit a debug event (breakpoint/watchpoint): route it to the
 * gdbstub, request a debug stop of the whole VM, and mark the vCPU
 * stopped. */
static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}

761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799
#ifdef CONFIG_LINUX
/* Re-deliver SIGBUS with the default disposition so the process dies
 * with the proper signal (instead of silently continuing) when a
 * hardware memory error cannot be handled.  Never returns. */
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        /* Unblock SIGBUS so the re-raised signal is delivered now. */
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        sigprocmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}

/* SIGBUS handler (MCE path): hand the faulting address to KVM; if KVM
 * cannot recover, kill the process via sigbus_reraise(). */
static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
                           void *ctx)
{
    if (kvm_on_sigbus(siginfo->ssi_code,
                      (void *)(intptr_t)siginfo->ssi_addr)) {
        sigbus_reraise();
    }
}

/* Install the SIGBUS handler and opt in to early machine-check kill so
 * the kernel signals memory errors as soon as they are detected. */
static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    /* NOTE(review): cast adapts the qemu_signalfd_siginfo-typed handler
     * to sa_sigaction's siginfo_t signature — relies on layout match. */
    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}

800
/* Drain pending SIG_IPI and SIGBUS from the vCPU thread without
 * blocking (zero timeout), dispatching SIGBUS to KVM.  Loops until
 * neither signal remains pending. */
static void qemu_kvm_eat_signals(CPUState *cpu)
{
    struct timespec ts = { 0, 0 };
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;
    int r;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
    sigaddset(&waitset, SIGBUS);

    do {
        r = sigtimedwait(&waitset, &siginfo, &ts);
        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
            perror("sigtimedwait");
            exit(1);
        }

        switch (r) {
        case SIGBUS:
            if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
                sigbus_reraise();
            }
            break;
        default:
            /* SIG_IPI only needs to be consumed, not acted upon. */
            break;
        }

        r = sigpending(&chkset);
        if (r == -1) {
            perror("sigpending");
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
}

837 838 839 840 841
#else /* !CONFIG_LINUX */

/* No SIGBUS/MCE handling outside Linux: nothing to set up. */
static void qemu_init_sigbus(void)
{
}

/* No SIG_IPI/SIGBUS draining needed outside Linux. */
static void qemu_kvm_eat_signals(CPUState *cpu)
{
}
#endif /* !CONFIG_LINUX */

848
#ifndef _WIN32
849 850 851 852
/* No-op handler: SIG_IPI only needs to interrupt the KVM_RUN ioctl. */
static void dummy_signal(int sig)
{
}

/* Per-vCPU signal setup for KVM: install the dummy SIG_IPI handler and
 * tell KVM to atomically unblock SIG_IPI/SIGBUS only while the vCPU is
 * inside KVM_RUN. */
static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = dummy_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    sigdelset(&set, SIGBUS);
    r = kvm_set_signal_mask(cpu, &set);
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}

873
#else /* _WIN32 */
/* KVM does not exist on Windows; reaching this is a programming error. */
static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    abort();
}
#endif /* _WIN32 */
879

880
/* The Big QEMU Lock (BQL), serializing device and vCPU emulation. */
static QemuMutex qemu_global_mutex;
/* Signalled when the iothread releases its claim on the BQL. */
static QemuCond qemu_io_proceeded_cond;
/* Non-zero while the iothread is waiting to take the BQL. */
static unsigned iothread_requesting_mutex;

static QemuThread io_thread;

/* cpu creation */
static QemuCond qemu_cpu_cond;
/* system init */
static QemuCond qemu_pause_cond;
/* Signalled when a run_on_cpu() work item completes. */
static QemuCond qemu_work_cond;
891

P
Paolo Bonzini 已提交
892
/* One-time init of the vCPU threading machinery: SIGBUS handling,
 * condition variables, the BQL, and the iothread identity. */
void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_cond_init(&qemu_work_cond);
    qemu_cond_init(&qemu_io_proceeded_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}

904
/* Run @func(@data) on @cpu's thread and wait for completion.  Runs
 * inline when already on that thread; otherwise queues a stack-allocated
 * work item, kicks the vCPU and blocks on qemu_work_cond (BQL dropped
 * while waiting) until the item is marked done. */
void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
{
    struct qemu_work_item wi;

    if (qemu_cpu_is_self(cpu)) {
        func(data);
        return;
    }

    wi.func = func;
    wi.data = data;
    /* Stack-allocated: flush_queued_work must not free it. */
    wi.free = false;

    qemu_mutex_lock(&cpu->work_mutex);
    if (cpu->queued_work_first == NULL) {
        cpu->queued_work_first = &wi;
    } else {
        cpu->queued_work_last->next = &wi;
    }
    cpu->queued_work_last = &wi;
    wi.next = NULL;
    wi.done = false;
    qemu_mutex_unlock(&cpu->work_mutex);

    qemu_cpu_kick(cpu);
    while (!atomic_mb_read(&wi.done)) {
        CPUState *self_cpu = current_cpu;

        /* cond_wait drops the BQL; restore current_cpu afterwards since
         * other threads may have changed it while we slept. */
        qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
        current_cpu = self_cpu;
    }
}

C
Chegu Vinod 已提交
937 938 939 940 941 942 943 944 945 946 947 948 949
/* Fire-and-forget variant of run_on_cpu(): queue @func(@data) on @cpu's
 * thread and return immediately.  The heap-allocated work item is freed
 * by flush_queued_work() after it runs. */
void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
{
    struct qemu_work_item *wi;

    if (qemu_cpu_is_self(cpu)) {
        func(data);
        return;
    }

    wi = g_malloc0(sizeof(struct qemu_work_item));
    wi->func = func;
    wi->data = data;
    /* Heap-allocated: flush_queued_work frees it after execution. */
    wi->free = true;

    qemu_mutex_lock(&cpu->work_mutex);
    if (cpu->queued_work_first == NULL) {
        cpu->queued_work_first = wi;
    } else {
        cpu->queued_work_last->next = wi;
    }
    cpu->queued_work_last = wi;
    wi->next = NULL;
    wi->done = false;
    qemu_mutex_unlock(&cpu->work_mutex);

    qemu_cpu_kick(cpu);
}

965
/* Drain and execute all work items queued on @cpu by run_on_cpu() /
 * async_run_on_cpu().  Runs on the vCPU thread itself.
 *
 * cpu->work_mutex is dropped around each wi->func() call so that work
 * functions may themselves queue further work without deadlocking; the
 * list pointers are only ever touched with the lock held.
 */
static void flush_queued_work(CPUState *cpu)
{
    struct qemu_work_item *wi;

    /* Fast path: unlocked emptiness check; a racing producer will kick
     * us again after appending. */
    if (cpu->queued_work_first == NULL) {
        return;
    }

    qemu_mutex_lock(&cpu->work_mutex);
    while (cpu->queued_work_first != NULL) {
        wi = cpu->queued_work_first;
        cpu->queued_work_first = wi->next;
        if (!cpu->queued_work_first) {
            cpu->queued_work_last = NULL;
        }
        /* Run the item without holding our queue lock. */
        qemu_mutex_unlock(&cpu->work_mutex);
        wi->func(wi->data);
        qemu_mutex_lock(&cpu->work_mutex);
        if (wi->free) {
            /* async_run_on_cpu() item: ownership is ours, free it. */
            g_free(wi);
        } else {
            /* run_on_cpu() item lives on the caller's stack; the mb-set
             * pairs with atomic_mb_read in the waiting caller. */
            atomic_mb_set(&wi->done, true);
        }
    }
    qemu_mutex_unlock(&cpu->work_mutex);
    /* Wake every run_on_cpu() caller blocked on qemu_work_cond. */
    qemu_cond_broadcast(&qemu_work_cond);
}

993
/* Per-CPU housekeeping shared by all accelerator wait loops:
 * acknowledge a pending stop request (notifying pause_all_vcpus()
 * waiters), run queued work, and re-arm the kick latch.
 */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    if (cpu->stop) {
        /* Convert the request (stop) into the acknowledged state
         * (stopped) and wake anyone waiting in pause_all_vcpus(). */
        cpu->stop = false;
        cpu->stopped = true;
        qemu_cond_broadcast(&qemu_pause_cond);
    }
    flush_queued_work(cpu);
    /* Allow the next qemu_cpu_kick_thread() to signal us again. */
    cpu->thread_kicked = false;
}

1004
/* TCG thread idle wait: sleep while every CPU is idle, yield to an
 * iothread that wants the global mutex, then run pending per-CPU work.
 *
 * Note: @cpu is deliberately reused as the CPU_FOREACH iteration
 * variable below — after the loops it no longer refers to the argument.
 */
static void qemu_tcg_wait_io_event(CPUState *cpu)
{
    while (all_cpu_threads_idle()) {
        /* All CPUs halted: sleep until something kicks us. */
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    /* Let qemu_mutex_lock_iothread() contenders make progress; they
     * broadcast qemu_io_proceeded_cond once they got the mutex. */
    while (iothread_requesting_mutex) {
        qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
    }

    /* Single TCG thread services stop requests/work for every CPU. */
    CPU_FOREACH(cpu) {
        qemu_wait_io_event_common(cpu);
    }
}

1019
/* KVM vCPU idle wait: sleep while this CPU is idle, then consume any
 * pending SIG_IPI kicks and run the common stop/work handling. */
static void qemu_kvm_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    /* Drain pending IPI signals so the latch in cpu->thread_kicked
     * stays consistent with the signal state. */
    qemu_kvm_eat_signals(cpu);
    qemu_wait_io_event_common(cpu);
}

1029
/* Thread body for a KVM vCPU (one thread per CPU).
 *
 * Initializes the vCPU in the kernel, signals creation to
 * qemu_kvm_start_vcpu(), then loops forever running the guest and
 * servicing I/O events.  Runs with the iothread lock held except while
 * inside kvm_cpu_exec().
 */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        /* kvm_init_vcpu returns a negative errno value. */
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    qemu_kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    while (1) {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_kvm_wait_io_event(cpu);
    }

    /* Not reached: the loop above never exits. */
    return NULL;
}

A
Anthony Liguori 已提交
1067 1068 1069 1070 1071 1072
/* Thread body for a "dummy" vCPU used by qtest (no guest code runs).
 *
 * The thread parks in sigwait() for SIG_IPI kicks with the iothread
 * lock released, and services queued work when woken.  Not available
 * on Windows, which lacks the required signal machinery.
 */
static void *qemu_dummy_cpu_thread_fn(void *arg)
{
#ifdef _WIN32
    fprintf(stderr, "qtest is not supported under Windows\n");
    exit(1);
#else
    CPUState *cpu = arg;
    sigset_t waitset;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;

    /* We only ever wait for the kick signal. */
    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    current_cpu = cpu;
    while (1) {
        /* Clear current_cpu while we sleep without the lock, so other
         * code never sees a stale value for this thread. */
        current_cpu = NULL;
        qemu_mutex_unlock_iothread();
        do {
            int sig;
            r = sigwait(&waitset, &sig);
        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
        if (r == -1) {
            perror("sigwait");
            exit(1);
        }
        qemu_mutex_lock_iothread();
        current_cpu = cpu;
        qemu_wait_io_event_common(cpu);
    }

    /* Not reached. */
    return NULL;
#endif
}

J
Jan Kiszka 已提交
1112 1113
static void tcg_exec_all(void);

1114
/* Thread body for the single shared TCG thread that executes ALL
 * CPUs round-robin (see tcg_exec_all()).
 *
 * Note: the @cpu argument is only used for the initial thread handle;
 * the CPU_FOREACH below reuses the variable to mark every CPU created.
 */
static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    /* One thread serves all CPUs: announce each of them as created. */
    CPU_FOREACH(cpu) {
        cpu->thread_id = qemu_get_thread_id();
        cpu->created = true;
        cpu->can_do_io = 1;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            qemu_wait_io_event_common(cpu);
        }
    }

    /* Force one pass through tcg_exec_all() before the first wait. */
    atomic_mb_set(&exit_request, 1);

    while (1) {
        tcg_exec_all();

        if (use_icount) {
            int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

            /* Deadline already expired: fire the virtual-clock timers. */
            if (deadline == 0) {
                qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
            }
        }
        qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
    }

    /* Not reached. */
    return NULL;
}

1159
/* Deliver SIG_IPI to @cpu's host thread to knock it out of a blocking
 * state (e.g. sigwait or the KVM ioctl).
 *
 * cpu->thread_kicked debounces repeated kicks; it is cleared again in
 * qemu_wait_io_event_common().  The Windows branch aborts because in
 * this build the signal-based kick is the only mechanism compiled in.
 */
static void qemu_cpu_kick_thread(CPUState *cpu)
{
#ifndef _WIN32
    int err;

    if (cpu->thread_kicked) {
        return;
    }
    cpu->thread_kicked = true;
    err = pthread_kill(cpu->thread->thread, SIG_IPI);
    if (err) {
        /* pthread_kill returns the error number directly (not errno). */
        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
        exit(1);
    }
#else /* _WIN32 */
    abort();
#endif
}
1177

P
Paolo Bonzini 已提交
1178 1179 1180 1181 1182 1183 1184 1185 1186 1187
/* Kick the TCG execution loop without touching any halt condition:
 * raise the global exit_request and force the currently-running CPU
 * (if any) out of the translated-code loop via cpu_exit(). */
static void qemu_cpu_kick_no_halt(void)
{
    CPUState *cpu;
    /* Ensure whatever caused the exit has reached the CPU threads before
     * writing exit_request.
     */
    atomic_mb_set(&exit_request, 1);
    /* mb-read pairs with the TCG thread's publication of tcg_current_cpu. */
    cpu = atomic_mb_read(&tcg_current_cpu);
    if (cpu) {
        cpu_exit(cpu);
    }
}

1191
/* Wake @cpu: release it from a halt wait and interrupt whatever it is
 * currently executing so it re-evaluates its run state. */
void qemu_cpu_kick(CPUState *cpu)
{
    /* First wake the thread if it is sleeping on its halt condition. */
    qemu_cond_broadcast(cpu->halt_cond);
    if (!tcg_enabled()) {
        /* KVM/dummy threads each own a CPU: signal that thread. */
        qemu_cpu_kick_thread(cpu);
    } else {
        /* TCG shares one thread for all CPUs: bump the execution loop. */
        qemu_cpu_kick_no_halt();
    }
}

1201
/* Kick the vCPU whose thread is making this call; must only be invoked
 * from a vCPU thread (current_cpu set). */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}

1207
/* Return true if the calling host thread is the one backing @cpu. */
bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}

1212
bool qemu_in_vcpu_thread(void)
J
Juan Quintela 已提交
1213
{
1214
    return current_cpu && qemu_cpu_is_self(current_cpu);
J
Juan Quintela 已提交
1215 1216
}

1217 1218 1219 1220 1221 1222 1223
/* Per-thread flag recording whether this thread currently holds the
 * iothread mutex (qemu_global_mutex); maintained exclusively by
 * qemu_mutex_lock_iothread()/qemu_mutex_unlock_iothread(). */
static __thread bool iothread_locked = false;

/* Return true if the calling thread holds the iothread mutex. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}

1224 1225
/* Acquire the iothread mutex (big QEMU lock).
 *
 * With TCG running, the vCPU thread may hold the mutex for long
 * stretches; iothread_requesting_mutex plus qemu_cpu_kick_no_halt()
 * bumps it out of translated code so we do not wait indefinitely.
 * The counter is visible to qemu_tcg_wait_io_event(), which waits for
 * qemu_io_proceeded_cond before going back to sleep.
 */
void qemu_mutex_lock_iothread(void)
{
    atomic_inc(&iothread_requesting_mutex);
    /* In the simple case there is no need to bump the VCPU thread out of
     * TCG code execution.
     */
    if (!tcg_enabled() || qemu_in_vcpu_thread() ||
        !first_cpu || !first_cpu->created) {
        qemu_mutex_lock(&qemu_global_mutex);
        atomic_dec(&iothread_requesting_mutex);
    } else {
        if (qemu_mutex_trylock(&qemu_global_mutex)) {
            /* Contended: force the TCG thread out of its loop, then
             * block on the mutex. */
            qemu_cpu_kick_no_halt();
            qemu_mutex_lock(&qemu_global_mutex);
        }
        atomic_dec(&iothread_requesting_mutex);
        /* Tell the TCG thread it may resume execution. */
        qemu_cond_broadcast(&qemu_io_proceeded_cond);
    }
    iothread_locked = true;
}

/* Release the iothread mutex taken by qemu_mutex_lock_iothread(). */
void qemu_mutex_unlock_iothread(void)
{
    /* Clear the ownership flag before dropping the lock so the flag is
     * never true while this thread does not hold the mutex. */
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}

static int all_vcpus_paused(void)
{
A
Andreas Färber 已提交
1253
    CPUState *cpu;
1254

A
Andreas Färber 已提交
1255
    CPU_FOREACH(cpu) {
1256
        if (!cpu->stopped) {
1257
            return 0;
1258
        }
1259 1260 1261 1262 1263 1264 1265
    }

    return 1;
}

/* Request every vCPU to stop and wait until all have acknowledged.
 *
 * When called from a vCPU thread itself, that CPU is stopped directly;
 * for TCG (single shared thread) the remaining CPUs can simply be
 * marked stopped because nothing else will run them, and we return
 * without waiting to avoid deadlocking against ourselves.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        cpu->stop = true;
        qemu_cpu_kick(cpu);
    }

    if (qemu_in_vcpu_thread()) {
        cpu_stop_current();
        if (!kvm_enabled()) {
            /* TCG: no other vCPU thread exists, so the other CPUs can
             * be transitioned to stopped without a handshake. */
            CPU_FOREACH(cpu) {
                cpu->stop = false;
                cpu->stopped = true;
            }
            return;
        }
    }

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        /* Re-kick in case a CPU went back to sleep before noticing. */
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }
}

1293 1294 1295 1296 1297 1298 1299
/* Resume a single vCPU: clear both the stopped state and any pending
 * stop request, then wake its thread so it re-enters the run loop. */
void cpu_resume(CPUState *cpu)
{
    cpu->stopped = false;
    cpu->stop = false;
    qemu_cpu_kick(cpu);
}

1300 1301
/* Undo pause_all_vcpus(): re-enable the virtual clock and resume every
 * vCPU.  NOTE(review): appears to require the iothread lock held, like
 * its pause counterpart — confirm against callers. */
void resume_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
    CPU_FOREACH(cpu) {
        cpu_resume(cpu);
    }
}

1310 1311 1312
/* For temporary buffers for forming a name */
#define VCPU_THREAD_NAME_SIZE 16

1313
/* Attach @cpu to the TCG execution thread.
 *
 * All TCG CPUs share one thread and one halt condition; the function
 * lazily creates them on the first call (static locals keep the
 * singletons) and merely links later CPUs to the existing thread.
 */
static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    static QemuCond *tcg_halt_cond;
    static QemuThread *tcg_cpu_thread;

    /* share a single thread for all cpus with TCG */
    if (!tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);
        tcg_halt_cond = cpu->halt_cond;
        snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);
        qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                           cpu, QEMU_THREAD_JOINABLE);
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
        /* Wait for the thread to signal creation (qemu_cpu_cond). */
        while (!cpu->created) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
        tcg_cpu_thread = cpu->thread;
    } else {
        /* Subsequent CPUs piggy-back on the existing TCG thread. */
        cpu->thread = tcg_cpu_thread;
        cpu->halt_cond = tcg_halt_cond;
    }
}

1342
/* Create and start the dedicated host thread for a KVM vCPU, then
 * block until the thread has announced itself as created. */
static void qemu_kvm_start_vcpu(CPUState *cpu)
{
    char name[VCPU_THREAD_NAME_SIZE];

    /* Per-vCPU thread name, e.g. "CPU 0/KVM". */
    snprintf(name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM", cpu->cpu_index);

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    qemu_thread_create(cpu->thread, name, qemu_kvm_cpu_thread_fn, cpu,
                       QEMU_THREAD_JOINABLE);

    /* The thread signals qemu_cpu_cond once cpu->created is set. */
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

1358
/* Create and start the host thread for a dummy (qtest) vCPU, then
 * block until the thread has announced itself as created. */
static void qemu_dummy_start_vcpu(CPUState *cpu)
{
    char name[VCPU_THREAD_NAME_SIZE];

    /* Per-vCPU thread name, e.g. "CPU 0/DUMMY". */
    snprintf(name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY", cpu->cpu_index);

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    qemu_thread_create(cpu->thread, name, qemu_dummy_cpu_thread_fn, cpu,
                       QEMU_THREAD_JOINABLE);

    /* The thread signals qemu_cpu_cond once cpu->created is set. */
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

1374
/* Finish constructing @cpu and start its execution thread, dispatching
 * to the KVM, TCG or dummy backend depending on the accelerator.
 * The CPU starts in the stopped state until resume_all_vcpus(). */
void qemu_init_vcpu(CPUState *cpu)
{
    cpu->nr_cores = smp_cores;
    cpu->nr_threads = smp_threads;
    cpu->stopped = true;

    if (!cpu->as) {
        /* If the target cpu hasn't set up any address spaces itself,
         * give it the default one.
         */
        AddressSpace *as = address_space_init_shareable(cpu->memory,
                                                        "cpu-memory");
        cpu->num_ases = 1;
        cpu_address_space_init(cpu, as, 0);
    }

    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(cpu);
    } else if (tcg_enabled()) {
        qemu_tcg_init_vcpu(cpu);
    } else {
        qemu_dummy_start_vcpu(cpu);
    }
}

1399
/* Stop the vCPU running on the calling thread: mark it stopped, force
 * it out of the execution loop, and wake pause_all_vcpus() waiters.
 * No-op when called from a non-vCPU thread (current_cpu unset). */
void cpu_stop_current(void)
{
    if (current_cpu) {
        /* Clear the request and set the acknowledged state directly,
         * since we ARE the thread being stopped. */
        current_cpu->stop = false;
        current_cpu->stopped = true;
        cpu_exit(current_cpu);
        qemu_cond_broadcast(&qemu_pause_cond);
    }
}

1409
/* Stop the VM, transitioning the run state to @state.
 *
 * From a vCPU thread the stop must be deferred to the main loop (we
 * cannot tear down while running guest code), so a vmstop request is
 * queued and only this CPU is halted; returns 0 in that case.
 * Otherwise performs the stop synchronously via do_vm_stop(). */
int vm_stop(RunState state)
{
    if (qemu_in_vcpu_thread()) {
        qemu_system_vmstop_request_prepare();
        qemu_system_vmstop_request(state);
        /*
         * FIXME: should not return to device code in case
         * vm_stop() has been requested.
         */
        cpu_stop_current();
        return 0;
    }

    return do_vm_stop(state);
}

1425 1426
/* does a state transition even if the VM is already stopped,
   current state is forgotten forever */
1427
int vm_stop_force_state(RunState state)
1428 1429
{
    if (runstate_is_running()) {
1430
        return vm_stop(state);
1431 1432
    } else {
        runstate_set(state);
1433 1434

        bdrv_drain_all();
1435 1436
        /* Make sure to return an error if the flush in a previous vm_stop()
         * failed. */
1437
        return blk_flush_all();
1438 1439 1440
    }
}

1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462
/* Compute how many instructions the next TCG slice may execute.
 *
 * In record/normal mode this is derived from the next virtual-clock
 * deadline; in replay mode it comes from the recorded instruction
 * stream so execution stays deterministic. */
static int64_t tcg_get_icount_limit(void)
{
    int64_t deadline;

    if (replay_mode == REPLAY_MODE_PLAY) {
        return replay_get_instructions();
    }

    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

    /* Maintain prior (possibly buggy) behaviour where if no deadline
     * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
     * INT32_MAX nanoseconds ahead, we still use INT32_MAX
     * nanoseconds.
     */
    if (deadline < 0 || deadline > INT32_MAX) {
        deadline = INT32_MAX;
    }

    return qemu_icount_round(deadline);
}

1463
/* Execute one TCG slice on @cpu and return the cpu_exec() exit code.
 *
 * With icount enabled, the slice budget is split between the 16-bit
 * low counter (icount_decr.u16.low counts down to 0) and the spillover
 * in icount_extra; the global instruction count in timers_state is
 * kept consistent across the call.
 */
static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    if (use_icount) {
        int64_t count;
        int decr;
        /* Remove any stale budget from the global count before
         * computing a fresh slice. */
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                                    + cpu->icount_extra);
        cpu->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
        count = tcg_get_icount_limit();
        timers_state.qemu_icount += count;
        /* The low counter is 16 bits; excess goes into icount_extra. */
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
        cpu->icount_decr.u16.low = decr;
        cpu->icount_extra = count;
    }
    ret = cpu_exec(cpu);
#ifdef CONFIG_PROFILER
    tcg_time += profile_getclock() - ti;
#endif
    if (use_icount) {
        /* Fold pending instructions back into the
           instruction counter, and clear the interrupt flag.  */
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                        + cpu->icount_extra);
        /* u32 write clears both the countdown and the interrupt flag. */
        cpu->icount_decr.u32 = 0;
        cpu->icount_extra = 0;
        replay_account_executed_instructions();
    }
    return ret;
}

J
Jan Kiszka 已提交
1503
/* Round-robin scheduler for the single TCG thread: run each CPU in
 * turn (resuming from the global next_cpu cursor) until an exit is
 * requested, a CPU hits a debug exception, or a CPU wants to stop. */
static void tcg_exec_all(void)
{
    int r;

    /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
    qemu_account_warp_timer();

    if (next_cpu == NULL) {
        next_cpu = first_cpu;
    }
    for (; next_cpu != NULL && !exit_request; next_cpu = CPU_NEXT(next_cpu)) {
        CPUState *cpu = next_cpu;

        /* Keep the virtual clock stopped while single-stepping with
         * SSTEP_NOTIMER, so timers do not fire between steps. */
        qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                          (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

        if (cpu_can_run(cpu)) {
            r = tcg_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
                break;
            }
        } else if (cpu->stop || cpu->stopped) {
            break;
        }
    }

    /* Pairs with smp_wmb in qemu_cpu_kick.  */
    atomic_mb_set(&exit_request, 0);
}

1534
/* Print the list of CPU models supported by this target to @f, using
 * the target's cpu_list() hook when it provides one; otherwise prints
 * nothing.  @optarg is currently unused. */
void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#if defined(cpu_list)
    cpu_list(f, cpu_fprintf);
#endif
}
L
Luiz Capitulino 已提交
1541 1542 1543 1544

/* QMP 'query-cpus': build a linked list of CpuInfo, one per CPU.
 *
 * Per-target #ifdefs select which architecture-specific program
 * counter fields are reported; targets without a case fall back to
 * CPU_INFO_ARCH_OTHER.  The returned list is newly allocated and owned
 * by the caller (the QMP layer frees it).  @errp is never set here.
 */
CpuInfoList *qmp_query_cpus(Error **errp)
{
    CpuInfoList *head = NULL, *cur_item = NULL;
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        CpuInfoList *info;
#if defined(TARGET_I386)
        X86CPU *x86_cpu = X86_CPU(cpu);
        CPUX86State *env = &x86_cpu->env;
#elif defined(TARGET_PPC)
        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
        CPUPPCState *env = &ppc_cpu->env;
#elif defined(TARGET_SPARC)
        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
        CPUSPARCState *env = &sparc_cpu->env;
#elif defined(TARGET_MIPS)
        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
        CPUMIPSState *env = &mips_cpu->env;
#elif defined(TARGET_TRICORE)
        TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
        CPUTriCoreState *env = &tricore_cpu->env;
#endif

        /* Pull the latest register state in from the accelerator. */
        cpu_synchronize_state(cpu);

        info = g_malloc0(sizeof(*info));
        info->value = g_malloc0(sizeof(*info->value));
        info->value->CPU = cpu->cpu_index;
        info->value->current = (cpu == first_cpu);
        info->value->halted = cpu->halted;
        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
        info->value->thread_id = cpu->thread_id;
#if defined(TARGET_I386)
        info->value->arch = CPU_INFO_ARCH_X86;
        info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
#elif defined(TARGET_PPC)
        info->value->arch = CPU_INFO_ARCH_PPC;
        info->value->u.ppc.nip = env->nip;
#elif defined(TARGET_SPARC)
        info->value->arch = CPU_INFO_ARCH_SPARC;
        info->value->u.q_sparc.pc = env->pc;
        info->value->u.q_sparc.npc = env->npc;
#elif defined(TARGET_MIPS)
        info->value->arch = CPU_INFO_ARCH_MIPS;
        info->value->u.q_mips.PC = env->active_tc.PC;
#elif defined(TARGET_TRICORE)
        info->value->arch = CPU_INFO_ARCH_TRICORE;
        info->value->u.tricore.PC = env->PC;
#else
        info->value->arch = CPU_INFO_ARCH_OTHER;
#endif

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}
L
Luiz Capitulino 已提交
1606 1607 1608 1609 1610 1611

/* QMP 'memsave': dump @size bytes of guest *virtual* memory starting at
 * @addr (as seen by CPU @cpu_index, default 0) into @filename.
 *
 * Reads go through cpu_memory_rw_debug() in 1 KiB chunks.  On any
 * failure (bad CPU index, unopenable file, unreadable address range,
 * short write) an error is stored in @errp; a partially written file
 * may be left behind.
 */
void qmp_memsave(int64_t addr, int64_t size, const char *filename,
                 bool has_cpu, int64_t cpu_index, Error **errp)
{
    FILE *f;
    uint32_t l;
    CPUState *cpu;
    uint8_t buf[1024];
    /* Keep the originals for the error message; addr/size are mutated. */
    int64_t orig_addr = addr, orig_size = size;

    if (!has_cpu) {
        cpu_index = 0;
    }

    cpu = qemu_get_cpu(cpu_index);
    if (cpu == NULL) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
                   "a CPU number");
        return;
    }

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        /* Fix: brace single-statement conditionals per QEMU coding style. */
        if (l > size) {
            l = size;
        }
        if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
            error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
                             " specified", orig_addr, orig_size);
            goto exit;
        }
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}
L
Luiz Capitulino 已提交
1653 1654 1655 1656 1657 1658 1659 1660 1661 1662

/* QMP 'pmemsave': dump @size bytes of guest *physical* memory starting
 * at @addr into @filename, reading in 1 KiB chunks via
 * cpu_physical_memory_read().  On an unopenable file or short write an
 * error is stored in @errp; a partial file may be left behind.
 */
void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
                  Error **errp)
{
    FILE *f;
    uint32_t l;
    uint8_t buf[1024];

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        /* Fix: brace single-statement conditionals per QEMU coding style. */
        if (l > size) {
            l = size;
        }
        cpu_physical_memory_read(addr, buf, l);
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}
L
Luiz Capitulino 已提交
1683 1684 1685 1686

/* QMP 'inject-nmi': deliver an NMI to the guest.
 *
 * On x86 the NMI goes to every CPU, via the APIC when one is present
 * or as a raw CPU interrupt otherwise.  On other targets delivery is
 * delegated to the per-machine NMI handler, which may set @errp when
 * the machine has no NMI support. */
void qmp_inject_nmi(Error **errp)
{
#if defined(TARGET_I386)
    CPUState *cs;

    CPU_FOREACH(cs) {
        X86CPU *cpu = X86_CPU(cs);

        if (!cpu->apic_state) {
            cpu_interrupt(cs, CPU_INTERRUPT_NMI);
        } else {
            apic_deliver_nmi(cpu->apic_state);
        }
    }
#else
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
#endif
}
1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718

/* Print icount clock-drift statistics to @f.  Silently does nothing
 * unless icount mode is active. */
void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
{
    if (!use_icount) {
        return;
    }

    cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
                (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
    if (!icount_align_option) {
        /* Delay/advance bounds are only tracked with -icount align. */
        cpu_fprintf(f, "Max guest delay     NA\n");
        cpu_fprintf(f, "Max guest advance   NA\n");
    } else {
        cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
        cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
    }
}