Commit 98964491 authored by Anthony Liguori

Merge remote-tracking branch 'bonzini/iommu-for-anthony' into staging

# By Paolo Bonzini (10) and others
# Via Paolo Bonzini
* bonzini/iommu-for-anthony:
  exec: remove qemu_safe_ram_ptr
  icount: make it thread-safe
  icount: document (future) locking rules for icount
  icount: prepare the code for future races in calling qemu_clock_warp
  icount: reorganize icount_warp_rt
  icount: use cpu_get_icount() directly
  timer: add timer_mod_anticipate and timer_mod_anticipate_ns
  timer: extract timer_mod_ns_locked and timerlist_rearm
  timer: make qemu_clock_enable sync between disable and timer's cb
  qemu-thread: add QemuEvent
  timer: protect timers_state's clock with seqlock
  seqlock: introduce read-write seqlock
  vga: Mark relevant portio lists regions as coalesced MMIO flushing
  cirrus: Mark vga io region as coalesced MMIO flushing
  portio: Allow to mark portio lists as coalesced MMIO flushing
  compatfd: switch to QemuThread
  memory: fix 128 arithmetic in info mtree

Message-id: 1382024935-28297-1-git-send-email-pbonzini@redhat.com
Signed-off-by: Anthony Liguori <aliguori@amazon.com>
......@@ -37,6 +37,7 @@
#include "sysemu/qtest.h"
#include "qemu/main-loop.h"
#include "qemu/bitmap.h"
#include "qemu/seqlock.h"
#ifndef _WIN32
#include "qemu/compatfd.h"
......@@ -97,21 +98,32 @@ static bool all_cpu_threads_idle(void)
/***********************************************************/
/* guest cycle counter */
/* Protected by TimersState seqlock */
/* Compensate for varying guest execution speed. */
static int64_t qemu_icount_bias;
static int64_t vm_clock_warp_start;
/* Conversion factor from emulated instructions to virtual clock ticks. */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed. */
#define MAX_ICOUNT_SHIFT 10
/* Compensate for varying guest execution speed. */
static int64_t qemu_icount_bias;
/* Only written by TCG thread */
static int64_t qemu_icount;
static QEMUTimer *icount_rt_timer;
static QEMUTimer *icount_vm_timer;
static QEMUTimer *icount_warp_timer;
static int64_t vm_clock_warp_start;
static int64_t qemu_icount;
typedef struct TimersState {
/* Protected by BQL. */
int64_t cpu_ticks_prev;
int64_t cpu_ticks_offset;
/* cpu_clock_offset can be read without holding the BQL, so protect
 * it with this lock.
 */
QemuSeqLock vm_clock_seqlock;
int64_t cpu_clock_offset;
int32_t cpu_ticks_enabled;
int64_t dummy;
......@@ -120,7 +132,7 @@ typedef struct TimersState {
static TimersState timers_state;
/* Return the virtual CPU time, based on the instruction counter. */
int64_t cpu_get_icount(void)
static int64_t cpu_get_icount_locked(void)
{
int64_t icount;
CPUState *cpu = current_cpu;
......@@ -136,7 +148,21 @@ int64_t cpu_get_icount(void)
return qemu_icount_bias + (icount << icount_time_shift);
}
int64_t cpu_get_icount(void)
{
int64_t icount;
unsigned start;
do {
start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
icount = cpu_get_icount_locked();
} while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
return icount;
}
/* return the host CPU cycle counter and handle stop/restart */
/* Caller must hold the BQL */
int64_t cpu_get_ticks(void)
{
if (use_icount) {
......@@ -157,37 +183,63 @@ int64_t cpu_get_ticks(void)
}
}
/* return the host CPU monotonic timer and handle stop/restart */
int64_t cpu_get_clock(void)
static int64_t cpu_get_clock_locked(void)
{
int64_t ti;
if (!timers_state.cpu_ticks_enabled) {
return timers_state.cpu_clock_offset;
ti = timers_state.cpu_clock_offset;
} else {
ti = get_clock();
return ti + timers_state.cpu_clock_offset;
ti += timers_state.cpu_clock_offset;
}
return ti;
}
/* return the host CPU monotonic timer and handle stop/restart */
int64_t cpu_get_clock(void)
{
int64_t ti;
unsigned start;
do {
start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
ti = cpu_get_clock_locked();
} while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
return ti;
}
/* enable cpu_get_ticks() */
/* enable cpu_get_ticks()
* Caller must hold BQL which serves as mutex for vm_clock_seqlock.
*/
void cpu_enable_ticks(void)
{
/* Here, the thing really protected by the seqlock is cpu_clock_offset. */
seqlock_write_lock(&timers_state.vm_clock_seqlock);
if (!timers_state.cpu_ticks_enabled) {
timers_state.cpu_ticks_offset -= cpu_get_real_ticks();
timers_state.cpu_clock_offset -= get_clock();
timers_state.cpu_ticks_enabled = 1;
}
seqlock_write_unlock(&timers_state.vm_clock_seqlock);
}
/* disable cpu_get_ticks() : the clock is stopped. You must not call
cpu_get_ticks() after that. */
* cpu_get_ticks() after that.
* Caller must hold BQL which serves as mutex for vm_clock_seqlock.
*/
void cpu_disable_ticks(void)
{
/* Here, the thing really protected by the seqlock is cpu_clock_offset. */
seqlock_write_lock(&timers_state.vm_clock_seqlock);
if (timers_state.cpu_ticks_enabled) {
timers_state.cpu_ticks_offset = cpu_get_ticks();
timers_state.cpu_clock_offset = cpu_get_clock();
timers_state.cpu_clock_offset = cpu_get_clock_locked();
timers_state.cpu_ticks_enabled = 0;
}
seqlock_write_unlock(&timers_state.vm_clock_seqlock);
}
/* Correlation between real and virtual time is always going to be
......@@ -201,13 +253,19 @@ static void icount_adjust(void)
int64_t cur_time;
int64_t cur_icount;
int64_t delta;
/* Protected by TimersState mutex. */
static int64_t last_delta;
/* If the VM is not running, then do nothing. */
if (!runstate_is_running()) {
return;
}
cur_time = cpu_get_clock();
cur_icount = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
seqlock_write_lock(&timers_state.vm_clock_seqlock);
cur_time = cpu_get_clock_locked();
cur_icount = cpu_get_icount_locked();
delta = cur_icount - cur_time;
/* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
if (delta > 0
......@@ -224,6 +282,7 @@ static void icount_adjust(void)
}
last_delta = delta;
qemu_icount_bias = cur_icount - (qemu_icount << icount_time_shift);
seqlock_write_unlock(&timers_state.vm_clock_seqlock);
}
static void icount_adjust_rt(void *opaque)
......@@ -248,30 +307,37 @@ static int64_t qemu_icount_round(int64_t count)
static void icount_warp_rt(void *opaque)
{
if (vm_clock_warp_start == -1) {
/* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
* changes from -1 to another value, so the race here is okay.
*/
if (atomic_read(&vm_clock_warp_start) == -1) {
return;
}
seqlock_write_lock(&timers_state.vm_clock_seqlock);
if (runstate_is_running()) {
int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
int64_t warp_delta = clock - vm_clock_warp_start;
if (use_icount == 1) {
qemu_icount_bias += warp_delta;
} else {
int64_t warp_delta;
warp_delta = clock - vm_clock_warp_start;
if (use_icount == 2) {
/*
* In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
* far ahead of real time.
*/
int64_t cur_time = cpu_get_clock();
int64_t cur_icount = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
int64_t cur_time = cpu_get_clock_locked();
int64_t cur_icount = cpu_get_icount_locked();
int64_t delta = cur_time - cur_icount;
qemu_icount_bias += MIN(warp_delta, delta);
warp_delta = MIN(warp_delta, delta);
}
qemu_icount_bias += warp_delta;
}
vm_clock_warp_start = -1;
seqlock_write_unlock(&timers_state.vm_clock_seqlock);
if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
}
}
vm_clock_warp_start = -1;
}
void qtest_clock_warp(int64_t dest)
......@@ -281,7 +347,10 @@ void qtest_clock_warp(int64_t dest)
while (clock < dest) {
int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
int64_t warp = MIN(dest - clock, deadline);
seqlock_write_lock(&timers_state.vm_clock_seqlock);
qemu_icount_bias += warp;
seqlock_write_unlock(&timers_state.vm_clock_seqlock);
qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
}
......@@ -290,6 +359,7 @@ void qtest_clock_warp(int64_t dest)
void qemu_clock_warp(QEMUClockType type)
{
int64_t clock;
int64_t deadline;
/*
......@@ -309,8 +379,8 @@ void qemu_clock_warp(QEMUClockType type)
* the earliest QEMU_CLOCK_VIRTUAL timer.
*/
icount_warp_rt(NULL);
if (!all_cpu_threads_idle() || !qemu_clock_has_timers(QEMU_CLOCK_VIRTUAL)) {
timer_del(icount_warp_timer);
if (!all_cpu_threads_idle()) {
return;
}
......@@ -319,17 +389,11 @@ void qemu_clock_warp(QEMUClockType type)
return;
}
vm_clock_warp_start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
/* We want to use the earliest deadline from ALL vm_clocks */
clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
/* Maintain prior (possibly buggy) behaviour where if no deadline
* was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
* INT32_MAX nanoseconds ahead, we still use INT32_MAX
* nanoseconds.
*/
if ((deadline < 0) || (deadline > INT32_MAX)) {
deadline = INT32_MAX;
if (deadline < 0) {
return;
}
if (deadline > 0) {
......@@ -350,7 +414,12 @@ void qemu_clock_warp(QEMUClockType type)
* you will not be sending network packets continuously instead of
* every 100ms.
*/
timer_mod(icount_warp_timer, vm_clock_warp_start + deadline);
seqlock_write_lock(&timers_state.vm_clock_seqlock);
if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
vm_clock_warp_start = clock;
}
seqlock_write_unlock(&timers_state.vm_clock_seqlock);
timer_mod_anticipate(icount_warp_timer, clock + deadline);
} else if (deadline == 0) {
qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
}
......@@ -371,6 +440,7 @@ static const VMStateDescription vmstate_timers = {
void configure_icount(const char *option)
{
seqlock_init(&timers_state.vm_clock_seqlock, NULL);
vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
if (!option) {
return;
......
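A side note on the arithmetic cpu_get_icount_locked() performs above: virtual time is qemu_icount_bias plus the executed instruction count scaled by icount_time_shift. A self-contained sketch with illustrative numbers (the values below are assumptions, not taken from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Illustrative values only.  With the maximum shift of 10
     * (MAX_ICOUNT_SHIFT), each emulated instruction accounts for
     * 1024 ns of virtual time -- the "1MIPS minimum" mentioned in
     * the comment above: ~10^6 insns/s * 1024 ns/insn ~= one
     * virtual second per real second. */
    int64_t qemu_icount_bias = 0;   /* assumed: no warp applied yet */
    int icount_time_shift = 10;     /* assumed: slowest setting */
    int64_t executed = 1000000;     /* one million guest instructions */

    int64_t virtual_ns = qemu_icount_bias + (executed << icount_time_shift);
    printf("virtual clock advanced by %lld ns\n", (long long)virtual_ns);
    return 0;
}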
......@@ -129,7 +129,6 @@ static PhysPageMap next_map;
static void io_mem_init(void);
static void memory_map_init(void);
static void *qemu_safe_ram_ptr(ram_addr_t addr);
static MemoryRegion io_mem_watch;
#endif
......@@ -626,22 +625,39 @@ void cpu_abort(CPUArchState *env, const char *fmt, ...)
}
#if !defined(CONFIG_USER_ONLY)
static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
{
RAMBlock *block;
/* The list is protected by the iothread lock here. */
block = ram_list.mru_block;
if (block && addr - block->offset < block->length) {
goto found;
}
QTAILQ_FOREACH(block, &ram_list.blocks, next) {
if (addr - block->offset < block->length) {
goto found;
}
}
fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
abort();
found:
ram_list.mru_block = block;
return block;
}
static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t end,
uintptr_t length)
{
uintptr_t start1;
RAMBlock *block;
ram_addr_t start1;
/* we modify the TLB cache so that the dirty bit will be set again
when accessing the range */
start1 = (uintptr_t)qemu_safe_ram_ptr(start);
/* Check that we don't span multiple blocks - this breaks the
address comparisons below. */
if ((uintptr_t)qemu_safe_ram_ptr(end - 1) - start1
!= (end - 1) - start) {
abort();
}
block = qemu_get_ram_block(start);
assert(block == qemu_get_ram_block(end - 1));
start1 = (uintptr_t)block->host + (start - block->offset);
cpu_tlb_reset_dirty_all(start1, length);
}
/* Note: start and end must be within the same ram block. */
......@@ -1269,29 +1285,6 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
}
#endif /* !_WIN32 */
static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
{
RAMBlock *block;
/* The list is protected by the iothread lock here. */
block = ram_list.mru_block;
if (block && addr - block->offset < block->length) {
goto found;
}
QTAILQ_FOREACH(block, &ram_list.blocks, next) {
if (addr - block->offset < block->length) {
goto found;
}
}
fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
abort();
found:
ram_list.mru_block = block;
return block;
}
/* Return a host pointer to ram allocated with qemu_ram_alloc.
With the exception of the softmmu code in this file, this should
only be used for local memory (e.g. video ram) that the device owns,
......@@ -1319,40 +1312,6 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
return block->host + (addr - block->offset);
}
/* Return a host pointer to ram allocated with qemu_ram_alloc. Same as
* qemu_get_ram_ptr but do not touch ram_list.mru_block.
*
* ??? Is this still necessary?
*/
static void *qemu_safe_ram_ptr(ram_addr_t addr)
{
RAMBlock *block;
/* The list is protected by the iothread lock here. */
QTAILQ_FOREACH(block, &ram_list.blocks, next) {
if (addr - block->offset < block->length) {
if (xen_enabled()) {
/* We need to check if the requested address is in the RAM
* because we don't want to map the entire memory in QEMU.
* In that case just map until the end of the page.
*/
if (block->offset == 0) {
return xen_map_cache(addr, 0, 0);
} else if (block->host == NULL) {
block->host =
xen_map_cache(block->offset, block->length, 1);
}
}
return block->host + (addr - block->offset);
}
}
fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
abort();
return NULL;
}
/* Return a host pointer to guest's ram. Similar to qemu_get_ram_ptr
* but takes a size argument */
static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
......
......@@ -2447,7 +2447,6 @@ static uint64_t cirrus_vga_ioport_read(void *opaque, hwaddr addr,
VGACommonState *s = &c->vga;
int val, index;
qemu_flush_coalesced_mmio_buffer();
addr += 0x3b0;
if (vga_ioport_invalid(s, addr)) {
......@@ -2544,7 +2543,6 @@ static void cirrus_vga_ioport_write(void *opaque, hwaddr addr, uint64_t val,
VGACommonState *s = &c->vga;
int index;
qemu_flush_coalesced_mmio_buffer();
addr += 0x3b0;
/* check port range access depending on color/monochrome mode */
......@@ -2843,6 +2841,7 @@ static void cirrus_init_common(CirrusVGAState *s, Object *owner,
/* Register ioport 0x3b0 - 0x3df */
memory_region_init_io(&s->cirrus_vga_io, owner, &cirrus_vga_io_ops, s,
"cirrus-io", 0x30);
memory_region_set_flush_coalesced(&s->cirrus_vga_io);
memory_region_add_subregion(system_io, 0x3b0, &s->cirrus_vga_io);
memory_region_init(&s->low_mem_container, owner,
......
......@@ -2073,6 +2073,7 @@ static int qxl_init_primary(PCIDevice *dev)
pci_address_space(dev), pci_address_space_io(dev), false);
portio_list_init(qxl_vga_port_list, OBJECT(dev), qxl_vga_portio_list,
vga, "vga");
portio_list_set_flush_coalesced(qxl_vga_port_list);
portio_list_add(qxl_vga_port_list, pci_address_space_io(dev), 0x3b0);
vga->con = graphic_console_init(DEVICE(dev), &qxl_ops, qxl);
......
......@@ -359,8 +359,6 @@ uint32_t vga_ioport_read(void *opaque, uint32_t addr)
VGACommonState *s = opaque;
int val, index;
qemu_flush_coalesced_mmio_buffer();
if (vga_ioport_invalid(s, addr)) {
val = 0xff;
} else {
......@@ -453,8 +451,6 @@ void vga_ioport_write(void *opaque, uint32_t addr, uint32_t val)
VGACommonState *s = opaque;
int index;
qemu_flush_coalesced_mmio_buffer();
/* check port range access depending on color/monochrome mode */
if (vga_ioport_invalid(s, addr)) {
return;
......@@ -2373,6 +2369,7 @@ void vga_init(VGACommonState *s, Object *obj, MemoryRegion *address_space,
memory_region_set_coalescing(vga_io_memory);
if (init_vga_ports) {
portio_list_init(vga_port_list, obj, vga_ports, s, "vga");
portio_list_set_flush_coalesced(vga_port_list);
portio_list_add(vga_port_list, address_space_io, 0x3b0);
}
if (vbe_ports) {
......
......@@ -64,11 +64,13 @@ typedef struct PortioList {
struct MemoryRegion **regions;
void *opaque;
const char *name;
bool flush_coalesced_mmio;
} PortioList;
void portio_list_init(PortioList *piolist, Object *owner,
const struct MemoryRegionPortio *callbacks,
void *opaque, const char *name);
void portio_list_set_flush_coalesced(PortioList *piolist);
void portio_list_destroy(PortioList *piolist);
void portio_list_add(PortioList *piolist,
struct MemoryRegion *address_space,
......
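To illustrate the new hook: a device that registers its legacy ioports through a PortioList opts in with a single call between portio_list_init() and portio_list_add(), just as the qxl and vga hunks above do. A minimal sketch; MyDevState and mydev_ports are hypothetical stand-ins:

static PortioList mydev_port_list;

static void mydev_realize(MyDevState *s, MemoryRegion *address_space_io)
{
    portio_list_init(&mydev_port_list, OBJECT(s), mydev_ports, s, "mydev");
    /* Every access to these ports now drains the coalesced MMIO
     * buffer first, so explicit qemu_flush_coalesced_mmio_buffer()
     * calls in the read/write handlers become unnecessary. */
    portio_list_set_flush_coalesced(&mydev_port_list);
    portio_list_add(&mydev_port_list, address_space_io, 0x3b0);
}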
/*
* Seqlock implementation for QEMU
*
* Copyright Red Hat, Inc. 2013
*
* Author:
* Paolo Bonzini <pbonzini@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#ifndef QEMU_SEQLOCK_H
#define QEMU_SEQLOCK_H 1
#include <qemu/atomic.h>
#include <qemu/thread.h>
typedef struct QemuSeqLock QemuSeqLock;
struct QemuSeqLock {
QemuMutex *mutex;
unsigned sequence;
};
static inline void seqlock_init(QemuSeqLock *sl, QemuMutex *mutex)
{
sl->mutex = mutex;
sl->sequence = 0;
}
/* Lock out other writers and update the count. */
static inline void seqlock_write_lock(QemuSeqLock *sl)
{
if (sl->mutex) {
qemu_mutex_lock(sl->mutex);
}
++sl->sequence;
/* Write sequence before updating other fields. */
smp_wmb();
}
static inline void seqlock_write_unlock(QemuSeqLock *sl)
{
/* Write other fields before finalizing sequence. */
smp_wmb();
++sl->sequence;
if (sl->mutex) {
qemu_mutex_unlock(sl->mutex);
}
}
static inline unsigned seqlock_read_begin(QemuSeqLock *sl)
{
/* Always fail if a write is in progress. */
unsigned ret = sl->sequence & ~1;
/* Read sequence before reading other fields. */
smp_rmb();
return ret;
}
static inline int seqlock_read_retry(const QemuSeqLock *sl, unsigned start)
{
/* Read other fields before reading final sequence. */
smp_rmb();
return unlikely(sl->sequence != start);
}
#endif
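A quick sketch of the intended protocol, assuming a single writer already serialized externally (e.g. by the BQL, in which case seqlock_init() is passed NULL for the mutex, as cpus.c does above): the writer brackets its updates so the sequence is odd while a write is in flight, and readers retry until they observe a stable, even sequence:

static QemuSeqLock lock;        /* seqlock_init(&lock, NULL) at startup */
static int64_t protected_value; /* the field the seqlock guards */

static void writer_update(int64_t v)
{
    seqlock_write_lock(&lock);   /* sequence becomes odd */
    protected_value = v;
    seqlock_write_unlock(&lock); /* sequence becomes even again */
}

static int64_t reader_get(void)
{
    int64_t v;
    unsigned start;

    do {
        start = seqlock_read_begin(&lock);
        v = protected_value;
        /* Retry if a write started or completed meanwhile. */
    } while (seqlock_read_retry(&lock, start));
    return v;
}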
......@@ -21,6 +21,14 @@ struct QemuSemaphore {
#endif
};
struct QemuEvent {
#ifndef __linux__
pthread_mutex_t lock;
pthread_cond_t cond;
#endif
unsigned value;
};
struct QemuThread {
pthread_t thread;
};
......
......@@ -17,6 +17,10 @@ struct QemuSemaphore {
HANDLE sema;
};
struct QemuEvent {
HANDLE event;
};
typedef struct QemuThreadData QemuThreadData;
struct QemuThread {
QemuThreadData *data;
......
......@@ -7,6 +7,7 @@
typedef struct QemuMutex QemuMutex;
typedef struct QemuCond QemuCond;
typedef struct QemuSemaphore QemuSemaphore;
typedef struct QemuEvent QemuEvent;
typedef struct QemuThread QemuThread;
#ifdef _WIN32
......@@ -45,6 +46,12 @@ void qemu_sem_wait(QemuSemaphore *sem);
int qemu_sem_timedwait(QemuSemaphore *sem, int ms);
void qemu_sem_destroy(QemuSemaphore *sem);
void qemu_event_init(QemuEvent *ev, bool init);
void qemu_event_set(QemuEvent *ev);
void qemu_event_reset(QemuEvent *ev);
void qemu_event_wait(QemuEvent *ev);
void qemu_event_destroy(QemuEvent *ev);
void qemu_thread_create(QemuThread *thread,
void *(*start_routine)(void *),
void *arg, int mode);
......
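The intended use of QemuEvent is the pattern qemu-timer.c adopts below for timers_done_ev: one thread signals completion, another waits for it. A hedged sketch; do_some_work() is a hypothetical placeholder:

static QemuEvent done_ev;

static void *worker(void *opaque)
{
    do_some_work(opaque);     /* hypothetical workload */
    qemu_event_set(&done_ev); /* wakes any thread in qemu_event_wait() */
    return NULL;
}

static void run_and_wait(void *arg)
{
    QemuThread thread;

    qemu_event_init(&done_ev, false); /* start in the reset state */
    qemu_thread_create(&thread, worker, arg, QEMU_THREAD_DETACHED);
    qemu_event_wait(&done_ev);        /* returns once worker signals */
    qemu_event_destroy(&done_ev);
}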
......@@ -189,6 +189,12 @@ void qemu_clock_notify(QEMUClockType type);
* @enabled: true to enable, false to disable
*
* Enable or disable a clock
* Disabling the clock will wait for related timerlists to stop
executing qemu_run_timers. Thus, this function should not
* be used from the callback of a timer that is based on @clock.
* Doing so would cause a deadlock.
*
* Caller should hold BQL.
*/
void qemu_clock_enable(QEMUClockType type, bool enabled);
......@@ -538,6 +544,19 @@ void timer_del(QEMUTimer *ts);
*/
void timer_mod_ns(QEMUTimer *ts, int64_t expire_time);
/**
* timer_mod_anticipate_ns:
* @ts: the timer
* @expire_time: the expiry time in nanoseconds
*
* Modify a timer to expire at @expire_time or the current time,
* whichever comes earlier.
*
* This function is thread-safe but the timer and its timer list must not be
* freed while this function is running.
*/
void timer_mod_anticipate_ns(QEMUTimer *ts, int64_t expire_time);
/**
* timer_mod:
* @ts: the timer
......@@ -551,6 +570,19 @@ void timer_mod_ns(QEMUTimer *ts, int64_t expire_time);
*/
void timer_mod(QEMUTimer *ts, int64_t expire_timer);
/**
* timer_mod_anticipate:
* @ts: the timer
* @expire_time: the expiry time in nanoseconds
*
* Modify a timer to expire at @expire_time or the current time, whichever
* comes earlier, taking into account the scale associated with the timer.
*
* This function is thread-safe but the timer and its timer list must not be
* freed while this function is running.
*/
void timer_mod_anticipate(QEMUTimer *ts, int64_t expire_time);
/**
* timer_pending:
* @ts: the timer
......@@ -653,7 +685,9 @@ static inline int64_t qemu_soonest_timeout(int64_t timeout1, int64_t timeout2)
void init_clocks(void);
int64_t cpu_get_ticks(void);
/* Caller must hold BQL */
void cpu_enable_ticks(void);
/* Caller must hold BQL */
void cpu_disable_ticks(void);
static inline int64_t get_ticks_per_sec(void)
......
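To make the anticipate semantics concrete: timer_mod() reprograms the deadline unconditionally, while timer_mod_anticipate() only ever moves it earlier (and arms the timer if it is not currently pending). A sketch with illustrative times, in the timer's scale:

/* Assume ts currently expires at t = 500. */
timer_mod_anticipate(ts, 800); /* no effect: 800 is later than 500    */
timer_mod(ts, 800);            /* unconditional: now expires at 800   */
timer_mod_anticipate(ts, 300); /* earlier than 800: now expires at 300 */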
......@@ -139,6 +139,12 @@ void portio_list_init(PortioList *piolist,
piolist->opaque = opaque;
piolist->owner = owner;
piolist->name = name;
piolist->flush_coalesced_mmio = false;
}
void portio_list_set_flush_coalesced(PortioList *piolist)
{
piolist->flush_coalesced_mmio = true;
}
void portio_list_destroy(PortioList *piolist)
......@@ -231,6 +237,9 @@ static void portio_list_add_1(PortioList *piolist,
*/
memory_region_init_io(&mrpio->mr, piolist->owner, &portio_ops, mrpio,
piolist->name, off_high - off_low);
if (piolist->flush_coalesced_mmio) {
memory_region_set_flush_coalesced(&mrpio->mr);
}
memory_region_add_subregion(piolist->address_space,
start + off_low, &mrpio->mr);
piolist->regions[piolist->nr] = &mrpio->mr;
......
......@@ -1809,7 +1809,9 @@ static void mtree_print_mr(fprintf_function mon_printf, void *f,
mr->alias->name,
mr->alias_offset,
mr->alias_offset
+ (hwaddr)int128_get64(mr->size) - 1);
+ (int128_nz(mr->size) ?
(hwaddr)int128_get64(int128_sub(mr->size,
int128_one())) : 0));
} else {
mon_printf(f,
TARGET_FMT_plx "-" TARGET_FMT_plx " (prio %d, %c%c): %s\n",
......
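The mtree hunk above deserves a note: a memory region may span the full 2^64 address space, in which case mr->size does not fit in 64 bits and the old int128_get64(mr->size) - 1 would trip int128_get64()'s assertion. Subtracting one while still in 128-bit arithmetic keeps the value representable, and the int128_nz() guard covers the degenerate zero-sized region. A sketch of the same computation, assuming QEMU's include/qemu/int128.h helpers:

#include "qemu/int128.h"

/* Last byte covered by a region of the given size starting at base. */
static uint64_t region_last_addr(uint64_t base, Int128 size)
{
    if (!int128_nz(size)) {
        return base; /* empty region: mirror the patch's 0 fallback */
    }
    /* size - 1 fits in 64 bits even when size == 2^64, so subtract
     * before truncating with int128_get64(). */
    return base + int128_get64(int128_sub(size, int128_one()));
}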
......@@ -45,6 +45,7 @@
/* timers */
typedef struct QEMUClock {
/* We rely on BQL to protect the timerlists */
QLIST_HEAD(, QEMUTimerList) timerlists;
NotifierList reset_notifiers;
......@@ -71,6 +72,9 @@ struct QEMUTimerList {
QLIST_ENTRY(QEMUTimerList) list;
QEMUTimerListNotifyCB *notify_cb;
void *notify_opaque;
/* lightweight method to mark the end of timerlist's running */
QemuEvent timers_done_ev;
};
/**
......@@ -99,6 +103,7 @@ QEMUTimerList *timerlist_new(QEMUClockType type,
QEMUClock *clock = qemu_clock_ptr(type);
timer_list = g_malloc0(sizeof(QEMUTimerList));
qemu_event_init(&timer_list->timers_done_ev, false);
timer_list->clock = clock;
timer_list->notify_cb = cb;
timer_list->notify_opaque = opaque;
......@@ -143,13 +148,25 @@ void qemu_clock_notify(QEMUClockType type)
}
}
/* Disabling the clock will wait for related timerlists to stop
* executing qemu_run_timers. Thus, this function should not
* be used from the callback of a timer that is based on @clock.
* Doing so would cause a deadlock.
*
* Caller should hold BQL.
*/
void qemu_clock_enable(QEMUClockType type, bool enabled)
{
QEMUClock *clock = qemu_clock_ptr(type);
QEMUTimerList *tl;
bool old = clock->enabled;
clock->enabled = enabled;
if (enabled && !old) {
qemu_clock_notify(type);
} else if (!enabled && old) {
QLIST_FOREACH(tl, &clock->timerlists, list) {
qemu_event_wait(&tl->timers_done_ev);
}
}
}
......@@ -338,6 +355,34 @@ static void timer_del_locked(QEMUTimerList *timer_list, QEMUTimer *ts)
}
}
static bool timer_mod_ns_locked(QEMUTimerList *timer_list,
QEMUTimer *ts, int64_t expire_time)
{
QEMUTimer **pt, *t;
/* add the timer in the sorted list */
pt = &timer_list->active_timers;
for (;;) {
t = *pt;
if (!timer_expired_ns(t, expire_time)) {
break;
}
pt = &t->next;
}
ts->expire_time = MAX(expire_time, 0);
ts->next = *pt;
*pt = ts;
return pt == &timer_list->active_timers;
}
static void timerlist_rearm(QEMUTimerList *timer_list)
{
/* Interrupt execution to force deadline recalculation. */
qemu_clock_warp(timer_list->clock->type);
timerlist_notify(timer_list);
}
/* stop a timer, but do not dealloc it */
void timer_del(QEMUTimer *ts)
{
......@@ -353,30 +398,39 @@ void timer_del(QEMUTimer *ts)
void timer_mod_ns(QEMUTimer *ts, int64_t expire_time)
{
QEMUTimerList *timer_list = ts->timer_list;
QEMUTimer **pt, *t;
bool rearm;
qemu_mutex_lock(&timer_list->active_timers_lock);
timer_del_locked(timer_list, ts);
rearm = timer_mod_ns_locked(timer_list, ts, expire_time);
qemu_mutex_unlock(&timer_list->active_timers_lock);
/* add the timer in the sorted list */
pt = &timer_list->active_timers;
for(;;) {
t = *pt;
if (!timer_expired_ns(t, expire_time)) {
break;
if (rearm) {
timerlist_rearm(timer_list);
}
pt = &t->next;
}
/* modify the current timer so that it will be fired when current_time
>= expire_time or the current deadline, whichever comes earlier.
The corresponding callback will be called. */
void timer_mod_anticipate_ns(QEMUTimer *ts, int64_t expire_time)
{
QEMUTimerList *timer_list = ts->timer_list;
bool rearm;
qemu_mutex_lock(&timer_list->active_timers_lock);
if (ts->expire_time == -1 || ts->expire_time > expire_time) {
if (ts->expire_time != -1) {
timer_del_locked(timer_list, ts);
}
rearm = timer_mod_ns_locked(timer_list, ts, expire_time);
} else {
rearm = false;
}
ts->expire_time = MAX(expire_time, 0);
ts->next = *pt;
*pt = ts;
qemu_mutex_unlock(&timer_list->active_timers_lock);
/* Rearm if necessary */
if (pt == &timer_list->active_timers) {
/* Interrupt execution to force deadline recalculation. */
qemu_clock_warp(timer_list->clock->type);
timerlist_notify(timer_list);
if (rearm) {
timerlist_rearm(timer_list);
}
}
......@@ -385,6 +439,11 @@ void timer_mod(QEMUTimer *ts, int64_t expire_time)
timer_mod_ns(ts, expire_time * ts->scale);
}
void timer_mod_anticipate(QEMUTimer *ts, int64_t expire_time)
{
timer_mod_anticipate_ns(ts, expire_time * ts->scale);
}
bool timer_pending(QEMUTimer *ts)
{
return ts->expire_time >= 0;
......@@ -403,8 +462,9 @@ bool timerlist_run_timers(QEMUTimerList *timer_list)
QEMUTimerCB *cb;
void *opaque;
qemu_event_reset(&timer_list->timers_done_ev);
if (!timer_list->clock->enabled) {
return progress;
goto out;
}
current_time = qemu_clock_get_ns(timer_list->clock->type);
......@@ -428,6 +488,9 @@ bool timerlist_run_timers(QEMUTimerList *timer_list)
cb(opaque);
progress = true;
}
out:
qemu_event_set(&timer_list->timers_done_ev);
return progress;
}
......
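One consequence of the timers_done_ev handshake above is worth spelling out: qemu_clock_enable(type, false) waits for every timerlist of that clock to set timers_done_ev, and timerlist_run_timers() only sets it after the callback loop completes. A timer callback that disables its own clock therefore waits on an event that can only be set once the callback returns. A hedged illustration of the self-deadlock the new documentation warns about:

/* DO NOT do this from a QEMU_CLOCK_VIRTUAL timer callback: */
static void bad_timer_cb(void *opaque)  /* hypothetical callback */
{
    /* qemu_clock_enable() will qemu_event_wait() on timers_done_ev of
     * the very timerlist this callback is running on; that event is
     * only set after the callback loop ends, so this never returns. */
    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
}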
......@@ -15,9 +15,9 @@
#include "qemu-common.h"
#include "qemu/compatfd.h"
#include "qemu/thread.h"
#include <sys/syscall.h>
#include <pthread.h>
struct sigfd_compat_info
{
......@@ -28,10 +28,6 @@ struct sigfd_compat_info
static void *sigwait_compat(void *opaque)
{
struct sigfd_compat_info *info = opaque;
sigset_t all;
sigfillset(&all);
pthread_sigmask(SIG_BLOCK, &all, NULL);
while (1) {
int sig;
......@@ -71,9 +67,8 @@ static void *sigwait_compat(void *opaque)
static int qemu_signalfd_compat(const sigset_t *mask)
{
pthread_attr_t attr;
pthread_t tid;
struct sigfd_compat_info *info;
QemuThread thread;
int fds[2];
info = malloc(sizeof(*info));
......@@ -93,12 +88,7 @@ static int qemu_signalfd_compat(const sigset_t *mask)
memcpy(&info->mask, mask, sizeof(*mask));
info->fd = fds[1];
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
pthread_create(&tid, &attr, sigwait_compat, info);
pthread_attr_destroy(&attr);
qemu_thread_create(&thread, sigwait_compat, info, QEMU_THREAD_DETACHED);
return fds[0];
}
......
......@@ -20,7 +20,12 @@
#include <limits.h>
#include <unistd.h>
#include <sys/time.h>
#ifdef __linux__
#include <sys/syscall.h>
#include <linux/futex.h>
#endif
#include "qemu/thread.h"
#include "qemu/atomic.h"
static void error_exit(int err, const char *msg)
{
......@@ -272,6 +277,117 @@ void qemu_sem_wait(QemuSemaphore *sem)
#endif
}
#ifdef __linux__
#define futex(...) syscall(__NR_futex, __VA_ARGS__)
static inline void futex_wake(QemuEvent *ev, int n)
{
futex(ev, FUTEX_WAKE, n, NULL, NULL, 0);
}
static inline void futex_wait(QemuEvent *ev, unsigned val)
{
futex(ev, FUTEX_WAIT, (int) val, NULL, NULL, 0);
}
#else
static inline void futex_wake(QemuEvent *ev, int n)
{
if (n == 1) {
pthread_cond_signal(&ev->cond);
} else {
pthread_cond_broadcast(&ev->cond);
}
}
static inline void futex_wait(QemuEvent *ev, unsigned val)
{
pthread_mutex_lock(&ev->lock);
if (ev->value == val) {
pthread_cond_wait(&ev->cond, &ev->lock);
}
pthread_mutex_unlock(&ev->lock);
}
#endif
/* Valid transitions:
* - free->set, when setting the event
* - busy->set, when setting the event, followed by futex_wake
* - set->free, when resetting the event
* - free->busy, when waiting
*
* set->busy does not happen (it can be observed from the outside but
* it really is set->free->busy).
*
* busy->free provably cannot happen; to enforce it, the set->free transition
* is done with an OR, which becomes a no-op if the event has concurrently
* transitioned to free or busy.
*/
#define EV_SET 0
#define EV_FREE 1
#define EV_BUSY -1
void qemu_event_init(QemuEvent *ev, bool init)
{
#ifndef __linux__
pthread_mutex_init(&ev->lock, NULL);
pthread_cond_init(&ev->cond, NULL);
#endif
ev->value = (init ? EV_SET : EV_FREE);
}
void qemu_event_destroy(QemuEvent *ev)
{
#ifndef __linux__
pthread_mutex_destroy(&ev->lock);
pthread_cond_destroy(&ev->cond);
#endif
}
void qemu_event_set(QemuEvent *ev)
{
if (atomic_mb_read(&ev->value) != EV_SET) {
if (atomic_xchg(&ev->value, EV_SET) == EV_BUSY) {
/* There were waiters, wake them up. */
futex_wake(ev, INT_MAX);
}
}
}
void qemu_event_reset(QemuEvent *ev)
{
if (atomic_mb_read(&ev->value) == EV_SET) {
/*
* If there was a concurrent reset (or even reset+wait),
* do nothing. Otherwise change EV_SET->EV_FREE.
*/
atomic_or(&ev->value, EV_FREE);
}
}
void qemu_event_wait(QemuEvent *ev)
{
unsigned value;
value = atomic_mb_read(&ev->value);
if (value != EV_SET) {
if (value == EV_FREE) {
/*
* Leave the event reset and tell qemu_event_set that there
* are waiters. No need to retry, because there cannot be
* a concurrent busy->free transition. After the CAS, the
* event will be either set or busy.
*/
if (atomic_cmpxchg(&ev->value, EV_FREE, EV_BUSY) == EV_SET) {
return;
}
}
futex_wait(ev, EV_BUSY);
}
}
void qemu_thread_create(QemuThread *thread,
void *(*start_routine)(void*),
void *arg, int mode)
......
......@@ -227,6 +227,32 @@ void qemu_sem_wait(QemuSemaphore *sem)
}
}
void qemu_event_init(QemuEvent *ev, bool init)
{
/* Manual reset. */
ev->event = CreateEvent(NULL, TRUE, init, NULL);
}
void qemu_event_destroy(QemuEvent *ev)
{
CloseHandle(ev->event);
}
void qemu_event_set(QemuEvent *ev)
{
SetEvent(ev->event);
}
void qemu_event_reset(QemuEvent *ev)
{
ResetEvent(ev->event);
}
void qemu_event_wait(QemuEvent *ev)
{
WaitForSingleObject(ev->event, INFINITE);
}
struct QemuThreadData {
/* Passed to win32_start_routine. */
void *(*start_routine)(void *);
......