/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Needed early for CONFIG_BSD etc. */
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "cpu.h"
#include "monitor/monitor.h"
#include "qapi/qmp/qerror.h"
#include "qemu/error-report.h"
#include "sysemu/sysemu.h"
#include "sysemu/block-backend.h"
#include "exec/gdbstub.h"
#include "sysemu/dma.h"
#include "sysemu/kvm.h"
#include "qmp-commands.h"
#include "exec/exec-all.h"

#include "qemu/thread.h"
#include "sysemu/cpus.h"
#include "sysemu/qtest.h"
#include "qemu/main-loop.h"
#include "qemu/bitmap.h"
#include "qemu/seqlock.h"
#include "qapi-event.h"
#include "hw/nmi.h"
#include "sysemu/replay.h"

#ifndef _WIN32
#include "qemu/compatfd.h"
#endif

#ifdef CONFIG_LINUX

#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */

int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
static QEMUTimer *throttle_timer;
static unsigned int throttle_percentage;

#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
#define CPU_THROTTLE_TIMESLICE_NS 10000000

bool cpu_is_stopped(CPUState *cpu)
{
    return cpu->stopped || !runstate_is_running();
}

static bool cpu_thread_is_idle(CPUState *cpu)
{
    if (cpu->stop || cpu->queued_work_first) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return true;
    }
    if (!cpu->halted || cpu_has_work(cpu) ||
        kvm_halt_in_kernel()) {
        return false;
    }
    return true;
}

static bool all_cpu_threads_idle(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu_thread_is_idle(cpu)) {
            return false;
        }
    }
    return true;
}

/***********************************************************/
/* guest cycle counter */

/* Protected by TimersState seqlock */

static bool icount_sleep = true;
static int64_t vm_clock_warp_start = -1;
/* Conversion factor from emulated instructions to virtual clock ticks.  */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
#define MAX_ICOUNT_SHIFT 10

static QEMUTimer *icount_rt_timer;
static QEMUTimer *icount_vm_timer;
static QEMUTimer *icount_warp_timer;

typedef struct TimersState {
    /* Protected by BQL.  */
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;

    /* cpu_clock_offset can be read out of BQL, so protect it with
     * this lock.
     */
    QemuSeqLock vm_clock_seqlock;
    int64_t cpu_clock_offset;
    int32_t cpu_ticks_enabled;
    int64_t dummy;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;
    /* Only written by TCG thread */
    int64_t qemu_icount;
} TimersState;

static TimersState timers_state;
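
/* Summary of the access discipline used for the seqlock-protected fields
 * above (derived from the code in this file): writers hold the BQL and
 * bracket updates with seqlock_write_begin()/seqlock_write_end(); readers
 * that may run outside the BQL, such as cpu_get_icount() and
 * cpu_get_clock(), loop on seqlock_read_begin()/seqlock_read_retry().
 */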

int64_t cpu_get_icount_raw(void)
{
    int64_t icount;
    CPUState *cpu = current_cpu;

    icount = timers_state.qemu_icount;
    if (cpu) {
        if (!cpu->can_do_io) {
            fprintf(stderr, "Bad icount read\n");
            exit(1);
        }
        icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
    }
    return icount;
}

/* Return the virtual CPU time, based on the instruction counter.  */
static int64_t cpu_get_icount_locked(void)
{
    int64_t icount = cpu_get_icount_raw();
    return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
}

int64_t cpu_get_icount(void)
{
    int64_t icount;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        icount = cpu_get_icount_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return icount;
}

int64_t cpu_icount_to_ns(int64_t icount)
{
    return icount << icount_time_shift;
}
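
/* Note on scale: one emulated instruction accounts for 2^icount_time_shift
 * nanoseconds of QEMU_CLOCK_VIRTUAL; with the auto default of 3 set in
 * configure_icount() below, that is 8 ns per instruction, roughly 125 MIPS.
 */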

/* return the time elapsed in VM between vm_start and vm_stop.  Unless
 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 * counter.
 *
 * Caller must hold the BQL
 */
int64_t cpu_get_ticks(void)
{
    int64_t ticks;

    if (use_icount) {
        return cpu_get_icount();
    }

    ticks = timers_state.cpu_ticks_offset;
    if (timers_state.cpu_ticks_enabled) {
        ticks += cpu_get_host_ticks();
    }

    if (timers_state.cpu_ticks_prev > ticks) {
        /* Note: non increasing ticks may happen if the host uses
           software suspend */
        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
        ticks = timers_state.cpu_ticks_prev;
    }

    timers_state.cpu_ticks_prev = ticks;
    return ticks;
}

static int64_t cpu_get_clock_locked(void)
{
    int64_t time;

    time = timers_state.cpu_clock_offset;
    if (timers_state.cpu_ticks_enabled) {
        time += get_clock();
    }

    return time;
}

/* Return the monotonic time elapsed in VM, i.e.,
 * the time between vm_start and vm_stop
 */
int64_t cpu_get_clock(void)
{
    int64_t ti;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        ti = cpu_get_clock_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return ti;
}

/* enable cpu_get_ticks()
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_enable_ticks(void)
{
    /* Here, the real thing protected by seqlock is cpu_clock_offset. */
    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (!timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

/* disable cpu_get_ticks() : the clock is stopped. You must not call
 * cpu_get_ticks() after that.
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_disable_ticks(void)
{
    /* Here, the real thing protected by seqlock is cpu_clock_offset. */
    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
        timers_state.cpu_clock_offset = cpu_get_clock_locked();
        timers_state.cpu_ticks_enabled = 0;
    }
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.  */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)

static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;

    /* Protected by TimersState mutex.  */
    static int64_t last_delta;

    /* If the VM is not running, then do nothing.  */
    if (!runstate_is_running()) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    cur_time = cpu_get_clock_locked();
    cur_icount = cpu_get_icount_locked();

    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && icount_time_shift > 0) {
        /* The guest is getting too far ahead.  Slow time down.  */
        icount_time_shift--;
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind.  Speed time up.  */
        icount_time_shift++;
    }
    last_delta = delta;
    timers_state.qemu_icount_bias = cur_icount
                              - (timers_state.qemu_icount << icount_time_shift);
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

static void icount_adjust_rt(void *opaque)
{
    timer_mod(icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_adjust();
}

static void icount_adjust_vm(void *opaque)
{
    timer_mod(icount_vm_timer,
                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
                   NANOSECONDS_PER_SECOND / 10);
    icount_adjust();
}

static int64_t qemu_icount_round(int64_t count)
{
    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
}

static void icount_warp_rt(void)
{
    unsigned seq;
    int64_t warp_start;

    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
     */
    do {
        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        warp_start = vm_clock_warp_start;
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));

    if (warp_start == -1) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (runstate_is_running()) {
        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
                                     cpu_get_clock_locked());
        int64_t warp_delta;

        warp_delta = clock - vm_clock_warp_start;
        if (use_icount == 2) {
            /*
             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
             * far ahead of real time.
             */
            int64_t cur_icount = cpu_get_icount_locked();
            int64_t delta = clock - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
        timers_state.qemu_icount_bias += warp_delta;
    }
    vm_clock_warp_start = -1;
    seqlock_write_end(&timers_state.vm_clock_seqlock);

    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}

void qtest_clock_warp(int64_t dest)
{
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    AioContext *aio_context;
    assert(qtest_enabled());
    aio_context = qemu_get_aio_context();
    while (clock < dest) {
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);

        seqlock_write_begin(&timers_state.vm_clock_seqlock);
        timers_state.qemu_icount_bias += warp;
        seqlock_write_end(&timers_state.vm_clock_seqlock);

        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    }
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
}

void qemu_start_warp_timer(void)
{
    int64_t clock;
    int64_t deadline;

    if (!use_icount) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
        return;
    }

    if (!all_cpu_threads_idle()) {
        return;
    }

    if (qtest_enabled()) {
        /* When testing, qtest commands advance icount.  */
        return;
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
    if (deadline < 0) {
        static bool notified;
        if (!icount_sleep && !notified) {
            error_report("WARNING: icount sleep disabled and no active timers");
            notified = true;
        }
        return;
    }

    if (deadline > 0) {
        /*
         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
         * sleep.  Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * QEMU_CLOCK_VIRTUAL.
         */
        if (!icount_sleep) {
            /*
             * We never let VCPUs sleep in no sleep icount mode.
             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
             * It is useful when we want a deterministic execution time,
             * isolated from host latencies.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            timers_state.qemu_icount_bias += deadline;
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        } else {
            /*
             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
             * "real" time, (related to the time left until the next event) has
             * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
             * This avoids that the warps are visible externally; for example,
             * you will not be sending network packets continuously instead of
             * every 100ms.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
                vm_clock_warp_start = clock;
            }
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            timer_mod_anticipate(icount_warp_timer, clock + deadline);
        }
    } else if (deadline == 0) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

static void qemu_account_warp_timer(void)
{
    if (!use_icount || !icount_sleep) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
        return;
    }

    timer_del(icount_warp_timer);
    icount_warp_rt();
}

static bool icount_state_needed(void *opaque)
{
    return use_icount;
}

/*
 * This is a subsection for icount migration.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    }
};

static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};

static void cpu_throttle_thread(CPUState *cpu, void *opaque)
{
    double pct;
    double throttle_ratio;
    long sleeptime_ns;

    if (!cpu_throttle_get_percentage()) {
        return;
    }

    pct = (double)cpu_throttle_get_percentage()/100;
    throttle_ratio = pct / (1 - pct);
    sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
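    /* Worked example (from the formulas above): at a 75% throttle,
     * throttle_ratio = 0.75 / 0.25 = 3, so the vCPU sleeps about 30 ms for
     * each 10 ms timeslice it is allowed to run.
     */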

    qemu_mutex_unlock_iothread();
    atomic_set(&cpu->throttle_thread_scheduled, 0);
    g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
    qemu_mutex_lock_iothread();
}

static void cpu_throttle_timer_tick(void *opaque)
{
    CPUState *cpu;
    double pct;

    /* Stop the timer if needed */
    if (!cpu_throttle_get_percentage()) {
        return;
    }
    CPU_FOREACH(cpu) {
        if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
            async_run_on_cpu(cpu, cpu_throttle_thread, NULL);
        }
    }

    pct = (double)cpu_throttle_get_percentage()/100;
    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                                   CPU_THROTTLE_TIMESLICE_NS / (1-pct));
}

void cpu_throttle_set(int new_throttle_pct)
{
    /* Ensure throttle percentage is within valid range */
    new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
    new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);

    atomic_set(&throttle_percentage, new_throttle_pct);

    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                                       CPU_THROTTLE_TIMESLICE_NS);
}

void cpu_throttle_stop(void)
{
    atomic_set(&throttle_percentage, 0);
}

bool cpu_throttle_active(void)
{
    return (cpu_throttle_get_percentage() != 0);
}

int cpu_throttle_get_percentage(void)
{
    return atomic_read(&throttle_percentage);
}

void cpu_ticks_init(void)
{
    seqlock_init(&timers_state.vm_clock_seqlock);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                           cpu_throttle_timer_tick, NULL);
}

void configure_icount(QemuOpts *opts, Error **errp)
{
    const char *option;
    char *rem_str = NULL;

    option = qemu_opt_get(opts, "shift");
    if (!option) {
        if (qemu_opt_get(opts, "align") != NULL) {
            error_setg(errp, "Please specify shift option when using align");
        }
        return;
    }

    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
    if (icount_sleep) {
        icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                         icount_timer_cb, NULL);
    }

    icount_align_option = qemu_opt_get_bool(opts, "align", false);

    if (icount_align_option && !icount_sleep) {
        error_setg(errp, "align=on and sleep=off are incompatible");
    }
    if (strcmp(option, "auto") != 0) {
        errno = 0;
        icount_time_shift = strtol(option, &rem_str, 0);
        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
            error_setg(errp, "icount: Invalid shift value");
        }
        use_icount = 1;
        return;
    } else if (icount_align_option) {
        error_setg(errp, "shift=auto and align=on are incompatible");
    } else if (!icount_sleep) {
        error_setg(errp, "shift=auto and sleep=off are incompatible");
    }

    use_icount = 2;

    /* 125MIPS seems a reasonable initial guess at the guest speed.
       It will be corrected fairly quickly anyway.  */
    icount_time_shift = 3;

    /* Have both realtime and virtual time triggers for speed adjustment.
       The realtime trigger catches emulated time passing too slowly,
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers.  */
    icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
                                   icount_adjust_rt, NULL);
    timer_mod(icount_rt_timer,
                   qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                        icount_adjust_vm, NULL);
    timer_mod(icount_vm_timer,
                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
                   NANOSECONDS_PER_SECOND / 10);
}

/***********************************************************/
void hw_error(const char *fmt, ...)
{
    va_list ap;
    CPUState *cpu;

    va_start(ap, fmt);
    fprintf(stderr, "qemu: hardware error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    CPU_FOREACH(cpu) {
        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
    }
    va_end(ap);
    abort();
}

void cpu_synchronize_all_states(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_state(cpu);
    }
}

void cpu_synchronize_all_post_reset(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_reset(cpu);
    }
}

void cpu_synchronize_all_post_init(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_init(cpu);
    }
}

static int do_vm_stop(RunState state)
{
    int ret = 0;

    if (runstate_is_running()) {
        cpu_disable_ticks();
        pause_all_vcpus();
        runstate_set(state);
        vm_state_notify(0, state);
        qapi_event_send_stop(&error_abort);
    }

    bdrv_drain_all();
    replay_disable_events();
    ret = bdrv_flush_all();

    return ret;
}

static bool cpu_can_run(CPUState *cpu)
{
    if (cpu->stop) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return false;
    }
    return true;
}

static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}

#ifdef CONFIG_LINUX
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}

static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
                           void *ctx)
{
    if (kvm_on_sigbus(siginfo->ssi_code,
                      (void *)(intptr_t)siginfo->ssi_addr)) {
        sigbus_reraise();
    }
}

static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}

static void qemu_kvm_eat_signals(CPUState *cpu)
{
    struct timespec ts = { 0, 0 };
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;
    int r;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
    sigaddset(&waitset, SIGBUS);

    do {
        r = sigtimedwait(&waitset, &siginfo, &ts);
        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
            perror("sigtimedwait");
            exit(1);
        }

        switch (r) {
        case SIGBUS:
            if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
                sigbus_reraise();
            }
            break;
        default:
            break;
        }

        r = sigpending(&chkset);
        if (r == -1) {
            perror("sigpending");
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
}

#else /* !CONFIG_LINUX */

static void qemu_init_sigbus(void)
{
}

static void qemu_kvm_eat_signals(CPUState *cpu)
{
}
#endif /* !CONFIG_LINUX */

#ifndef _WIN32
static void dummy_signal(int sig)
{
}

static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = dummy_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    sigdelset(&set, SIGBUS);
    r = kvm_set_signal_mask(cpu, &set);
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}

#else /* _WIN32 */
static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    abort();
}
#endif /* _WIN32 */

static QemuMutex qemu_global_mutex;
static QemuCond qemu_io_proceeded_cond;
static unsigned iothread_requesting_mutex;

static QemuThread io_thread;

/* cpu creation */
static QemuCond qemu_cpu_cond;
/* system init */
static QemuCond qemu_pause_cond;

void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_cond_init(&qemu_io_proceeded_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}

void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}

static void qemu_kvm_destroy_vcpu(CPUState *cpu)
{
    if (kvm_destroy_vcpu(cpu) < 0) {
        error_report("kvm_destroy_vcpu failed");
        exit(EXIT_FAILURE);
    }
}

static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}

static void qemu_wait_io_event_common(CPUState *cpu)
{
    if (cpu->stop) {
        cpu->stop = false;
        cpu->stopped = true;
        qemu_cond_broadcast(&qemu_pause_cond);
    }
    process_queued_cpu_work(cpu);
    cpu->thread_kicked = false;
}

static void qemu_tcg_wait_io_event(CPUState *cpu)
{
    while (all_cpu_threads_idle()) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    while (iothread_requesting_mutex) {
        qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
    }

    CPU_FOREACH(cpu) {
        qemu_wait_io_event_common(cpu);
    }
}

static void qemu_kvm_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    qemu_kvm_eat_signals(cpu);
    qemu_wait_io_event_common(cpu);
}

static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    qemu_kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_kvm_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    return NULL;
}

static void *qemu_dummy_cpu_thread_fn(void *arg)
{
#ifdef _WIN32
    fprintf(stderr, "qtest is not supported under Windows\n");
    exit(1);
#else
    CPUState *cpu = arg;
    sigset_t waitset;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    current_cpu = cpu;
    while (1) {
        current_cpu = NULL;
        qemu_mutex_unlock_iothread();
        do {
            int sig;
            r = sigwait(&waitset, &sig);
        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
        if (r == -1) {
            perror("sigwait");
            exit(1);
        }
        qemu_mutex_lock_iothread();
        current_cpu = cpu;
        qemu_wait_io_event_common(cpu);
    }

    return NULL;
#endif
}

static int64_t tcg_get_icount_limit(void)
{
    int64_t deadline;

    if (replay_mode != REPLAY_MODE_PLAY) {
        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        /* Maintain prior (possibly buggy) behaviour where if no deadline
         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
         * nanoseconds.
         */
        if ((deadline < 0) || (deadline > INT32_MAX)) {
            deadline = INT32_MAX;
        }

        return qemu_icount_round(deadline);
    } else {
        return replay_get_instructions();
    }
}

static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    if (use_icount) {
        int64_t count;
        int decr;
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                                    + cpu->icount_extra);
        cpu->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
        count = tcg_get_icount_limit();
        timers_state.qemu_icount += count;
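        /* Only the low 16 bits of the budget are placed in
         * icount_decr.u16.low; anything above 0xffff is parked in
         * icount_extra, and both are folded back into
         * timers_state.qemu_icount once cpu_exec() returns below.
         */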
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
        cpu->icount_decr.u16.low = decr;
        cpu->icount_extra = count;
    }
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
#ifdef CONFIG_PROFILER
    tcg_time += profile_getclock() - ti;
#endif
    if (use_icount) {
        /* Fold pending instructions back into the
           instruction counter, and clear the interrupt flag.  */
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                        + cpu->icount_extra);
        cpu->icount_decr.u32 = 0;
        cpu->icount_extra = 0;
        replay_account_executed_instructions();
    }
    return ret;
}

/* Destroy any remaining vCPUs which have been unplugged and have
 * finished running
 */
static void deal_with_unplugged_cpus(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (cpu->unplug && !cpu_can_run(cpu)) {
            qemu_tcg_destroy_vcpu(cpu);
            cpu->created = false;
            qemu_cond_signal(&qemu_cpu_cond);
            break;
        }
    }
}

static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    CPU_FOREACH(cpu) {
        cpu->thread_id = qemu_get_thread_id();
        cpu->created = true;
        cpu->can_do_io = 1;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            qemu_wait_io_event_common(cpu);
        }
    }

    /* process any pending work */
    atomic_mb_set(&exit_request, 1);

    cpu = first_cpu;

    while (1) {
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        if (!cpu) {
            cpu = first_cpu;
        }

        for (; cpu != NULL && !exit_request; cpu = CPU_NEXT(cpu)) {

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;
                r = tcg_cpu_exec(cpu);
                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                }
            } else if (cpu->stop || cpu->stopped) {
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

        } /* for cpu.. */

        /* Pairs with smp_wmb in qemu_cpu_kick.  */
        atomic_mb_set(&exit_request, 0);

        if (use_icount) {
            int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

            if (deadline == 0) {
                qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
            }
        }
        qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
        deal_with_unplugged_cpus();
    }

    return NULL;
}

static void qemu_cpu_kick_thread(CPUState *cpu)
{
#ifndef _WIN32
    int err;

    if (cpu->thread_kicked) {
        return;
    }
    cpu->thread_kicked = true;
    err = pthread_kill(cpu->thread->thread, SIG_IPI);
    if (err) {
        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
        exit(1);
    }
#else /* _WIN32 */
    abort();
#endif
}

static void qemu_cpu_kick_no_halt(void)
{
    CPUState *cpu;
    /* Ensure whatever caused the exit has reached the CPU threads before
     * writing exit_request.
     */
    atomic_mb_set(&exit_request, 1);
    cpu = atomic_mb_read(&tcg_current_cpu);
    if (cpu) {
        cpu_exit(cpu);
    }
}

void qemu_cpu_kick(CPUState *cpu)
{
    qemu_cond_broadcast(cpu->halt_cond);
    if (tcg_enabled()) {
        qemu_cpu_kick_no_halt();
    } else {
        qemu_cpu_kick_thread(cpu);
    }
}

void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}

bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}

bool qemu_in_vcpu_thread(void)
{
    return current_cpu && qemu_cpu_is_self(current_cpu);
}

static __thread bool iothread_locked = false;

bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}

void qemu_mutex_lock_iothread(void)
{
    atomic_inc(&iothread_requesting_mutex);
    /* In the simple case there is no need to bump the VCPU thread out of
     * TCG code execution.
     */
    if (!tcg_enabled() || qemu_in_vcpu_thread() ||
        !first_cpu || !first_cpu->created) {
        qemu_mutex_lock(&qemu_global_mutex);
        atomic_dec(&iothread_requesting_mutex);
    } else {
        if (qemu_mutex_trylock(&qemu_global_mutex)) {
            qemu_cpu_kick_no_halt();
            qemu_mutex_lock(&qemu_global_mutex);
        }
        atomic_dec(&iothread_requesting_mutex);
        qemu_cond_broadcast(&qemu_io_proceeded_cond);
    }
    iothread_locked = true;
}

void qemu_mutex_unlock_iothread(void)
{
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}

static bool all_vcpus_paused(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu->stopped) {
            return false;
        }
    }

    return true;
}

void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        cpu->stop = true;
        qemu_cpu_kick(cpu);
    }

    if (qemu_in_vcpu_thread()) {
        cpu_stop_current();
        if (!kvm_enabled()) {
            CPU_FOREACH(cpu) {
                cpu->stop = false;
                cpu->stopped = true;
            }
            return;
        }
    }

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }
}

void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}

void resume_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
    CPU_FOREACH(cpu) {
        cpu_resume(cpu);
    }
}

void cpu_remove(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
}

void cpu_remove_sync(CPUState *cpu)
{
    cpu_remove(cpu);
    while (cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

/* For temporary buffers for forming a name */
#define VCPU_THREAD_NAME_SIZE 16

static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    static QemuCond *tcg_halt_cond;
    static QemuThread *tcg_cpu_thread;

    /* share a single thread for all cpus with TCG */
    if (!tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);
        tcg_halt_cond = cpu->halt_cond;
        snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);
        qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                           cpu, QEMU_THREAD_JOINABLE);
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
        while (!cpu->created) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
        tcg_cpu_thread = cpu->thread;
    } else {
        cpu->thread = tcg_cpu_thread;
        cpu->halt_cond = tcg_halt_cond;
    }
}

static void qemu_kvm_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

static void qemu_dummy_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
                       QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

void qemu_init_vcpu(CPUState *cpu)
{
    cpu->nr_cores = smp_cores;
    cpu->nr_threads = smp_threads;
    cpu->stopped = true;

    if (!cpu->as) {
        /* If the target cpu hasn't set up any address spaces itself,
         * give it the default one.
         */
        AddressSpace *as = address_space_init_shareable(cpu->memory,
                                                        "cpu-memory");
        cpu->num_ases = 1;
        cpu_address_space_init(cpu, as, 0);
    }

    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(cpu);
    } else if (tcg_enabled()) {
        qemu_tcg_init_vcpu(cpu);
    } else {
        qemu_dummy_start_vcpu(cpu);
    }
}

void cpu_stop_current(void)
{
    if (current_cpu) {
        current_cpu->stop = false;
        current_cpu->stopped = true;
        cpu_exit(current_cpu);
        qemu_cond_broadcast(&qemu_pause_cond);
    }
}

int vm_stop(RunState state)
{
    if (qemu_in_vcpu_thread()) {
        qemu_system_vmstop_request_prepare();
        qemu_system_vmstop_request(state);
        /*
         * FIXME: should not return to device code in case
         * vm_stop() has been requested.
         */
        cpu_stop_current();
        return 0;
    }

    return do_vm_stop(state);
}

/* does a state transition even if the VM is already stopped,
   current state is forgotten forever */
int vm_stop_force_state(RunState state)
{
    if (runstate_is_running()) {
        return vm_stop(state);
    } else {
        runstate_set(state);

        bdrv_drain_all();
        /* Make sure to return an error if the flush in a previous vm_stop()
         * failed. */
        return bdrv_flush_all();
    }
}

void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#if defined(cpu_list)
    cpu_list(f, cpu_fprintf);
#endif
}

CpuInfoList *qmp_query_cpus(Error **errp)
{
    CpuInfoList *head = NULL, *cur_item = NULL;
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        CpuInfoList *info;
#if defined(TARGET_I386)
        X86CPU *x86_cpu = X86_CPU(cpu);
        CPUX86State *env = &x86_cpu->env;
#elif defined(TARGET_PPC)
        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
        CPUPPCState *env = &ppc_cpu->env;
#elif defined(TARGET_SPARC)
        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
        CPUSPARCState *env = &sparc_cpu->env;
#elif defined(TARGET_MIPS)
        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
        CPUMIPSState *env = &mips_cpu->env;
1539 1540 1541
#elif defined(TARGET_TRICORE)
        TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
        CPUTriCoreState *env = &tricore_cpu->env;
1542
#endif

        cpu_synchronize_state(cpu);

        info = g_malloc0(sizeof(*info));
        info->value = g_malloc0(sizeof(*info->value));
        info->value->CPU = cpu->cpu_index;
        info->value->current = (cpu == first_cpu);
        info->value->halted = cpu->halted;
        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
        info->value->thread_id = cpu->thread_id;
#if defined(TARGET_I386)
        info->value->arch = CPU_INFO_ARCH_X86;
        info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
#elif defined(TARGET_PPC)
        info->value->arch = CPU_INFO_ARCH_PPC;
        info->value->u.ppc.nip = env->nip;
#elif defined(TARGET_SPARC)
        info->value->arch = CPU_INFO_ARCH_SPARC;
        info->value->u.q_sparc.pc = env->pc;
        info->value->u.q_sparc.npc = env->npc;
#elif defined(TARGET_MIPS)
        info->value->arch = CPU_INFO_ARCH_MIPS;
        info->value->u.q_mips.PC = env->active_tc.PC;
#elif defined(TARGET_TRICORE)
        info->value->arch = CPU_INFO_ARCH_TRICORE;
        info->value->u.tricore.PC = env->PC;
#else
        info->value->arch = CPU_INFO_ARCH_OTHER;
#endif

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

void qmp_memsave(int64_t addr, int64_t size, const char *filename,
                 bool has_cpu, int64_t cpu_index, Error **errp)
{
    FILE *f;
    uint32_t l;
    CPUState *cpu;
    uint8_t buf[1024];
    int64_t orig_addr = addr, orig_size = size;

    if (!has_cpu) {
        cpu_index = 0;
    }

    cpu = qemu_get_cpu(cpu_index);
    if (cpu == NULL) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
                   "a CPU number");
        return;
    }

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size)
            l = size;
        if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
            error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
                             " specified", orig_addr, orig_size);
            goto exit;
        }
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
                  Error **errp)
{
    FILE *f;
    uint32_t l;
    uint8_t buf[1024];

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size)
            l = size;
        cpu_physical_memory_read(addr, buf, l);
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_inject_nmi(Error **errp)
{
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
}

void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
{
    if (!use_icount) {
        return;
    }

    cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
                (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
    if (icount_align_option) {
        cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
        cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
    } else {
        cpu_fprintf(f, "Max guest delay     NA\n");
        cpu_fprintf(f, "Max guest advance   NA\n");
    }
}