/*
 *  Virtual page mapping
 *
 *  Copyright (c) 2003 Fabrice Bellard
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "qapi/error.h"

#include "qemu/cutils.h"
#include "cpu.h"
#include "exec/exec-all.h"
#include "exec/target_page.h"
#include "tcg.h"
#include "hw/qdev-core.h"
#include "hw/qdev-properties.h"
#if !defined(CONFIG_USER_ONLY)
#include "hw/boards.h"
#include "hw/xen/xen.h"
#endif
#include "sysemu/kvm.h"
#include "sysemu/sysemu.h"
#include "sysemu/tcg.h"
#include "qemu/timer.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qemu/qemu-print.h"
#if defined(CONFIG_USER_ONLY)
#include "qemu.h"
#else /* !CONFIG_USER_ONLY */
#include "exec/memory.h"
#include "exec/ioport.h"
#include "sysemu/dma.h"
#include "sysemu/hostmem.h"
#include "sysemu/hw_accel.h"
#include "exec/address-spaces.h"
#include "sysemu/xen-mapcache.h"
#include "trace-root.h"

#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
#include <linux/falloc.h>
#endif

#endif
#include "qemu/rcu_queue.h"
#include "qemu/main-loop.h"
#include "translate-all.h"
#include "sysemu/replay.h"

#include "exec/memory-internal.h"
#include "exec/ram_addr.h"
#include "exec/log.h"

#include "migration/vmstate.h"

#include "qemu/range.h"
#ifndef _WIN32
#include "qemu/mmap-alloc.h"
#endif

#include "monitor/monitor.h"

//#define DEBUG_SUBPAGE

#if !defined(CONFIG_USER_ONLY)
/* ram_list is read under rcu_read_lock()/rcu_read_unlock().  Writes
 * are protected by the ramlist lock.
 */
RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };

static MemoryRegion *system_memory;
static MemoryRegion *system_io;

AddressSpace address_space_io;
AddressSpace address_space_memory;

MemoryRegion io_mem_rom, io_mem_notdirty;
static MemoryRegion io_mem_unassigned;
#endif

#ifdef TARGET_PAGE_BITS_VARY
int target_page_bits;
bool target_page_bits_decided;
#endif

CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);

/* current CPU in the current thread. It is only valid inside
   cpu_exec() */
__thread CPUState *current_cpu;
/* 0 = Do not count executed instructions.
   1 = Precise instruction counting.
   2 = Adaptive rate instruction counting.  */
int use_icount;

uintptr_t qemu_host_page_size;
intptr_t qemu_host_page_mask;

bool set_preferred_target_page_bits(int bits)
{
    /* The target page size is the lowest common denominator for all
     * the CPUs in the system, so we can only make it smaller, never
     * larger. And we can't make it smaller once we've committed to
     * a particular size.
     */
#ifdef TARGET_PAGE_BITS_VARY
    assert(bits >= TARGET_PAGE_BITS_MIN);
    if (target_page_bits == 0 || target_page_bits > bits) {
        if (target_page_bits_decided) {
            return false;
        }
        target_page_bits = bits;
    }
#endif
    return true;
}

#if !defined(CONFIG_USER_ONLY)

static void finalize_target_page_bits(void)
{
#ifdef TARGET_PAGE_BITS_VARY
    if (target_page_bits == 0) {
        target_page_bits = TARGET_PAGE_BITS_MIN;
    }
    target_page_bits_decided = true;
#endif
}

typedef struct PhysPageEntry PhysPageEntry;

struct PhysPageEntry {
    /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
    uint32_t skip : 6;
     /* index into phys_sections (!skip) or phys_map_nodes (skip) */
    uint32_t ptr : 26;
};

#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)

/* Size of the L2 (and L3, etc) page tables.  */
#define ADDR_SPACE_BITS 64

#define P_L2_BITS 9
#define P_L2_SIZE (1 << P_L2_BITS)

#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)

typedef PhysPageEntry Node[P_L2_SIZE];

typedef struct PhysPageMap {
    struct rcu_head rcu;

    unsigned sections_nb;
    unsigned sections_nb_alloc;
    unsigned nodes_nb;
    unsigned nodes_nb_alloc;
    Node *nodes;
    MemoryRegionSection *sections;
} PhysPageMap;

struct AddressSpaceDispatch {
    MemoryRegionSection *mru_section;
    /* This is a multi-level map on the physical address space.
     * The bottom level has pointers to MemoryRegionSections.
     */
    PhysPageEntry phys_map;
    PhysPageMap map;
};

#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
typedef struct subpage_t {
    MemoryRegion iomem;
    FlatView *fv;
    hwaddr base;
    uint16_t sub_section[];
} subpage_t;

#define PHYS_SECTION_UNASSIGNED 0
#define PHYS_SECTION_NOTDIRTY 1
#define PHYS_SECTION_ROM 2

static void io_mem_init(void);
static void memory_map_init(void);
static void tcg_log_global_after_sync(MemoryListener *listener);
static void tcg_commit(MemoryListener *listener);

/**
 * CPUAddressSpace: all the information a CPU needs about an AddressSpace
 * @cpu: the CPU whose AddressSpace this is
 * @as: the AddressSpace itself
 * @memory_dispatch: its dispatch pointer (cached, RCU protected)
 * @tcg_as_listener: listener for tracking changes to the AddressSpace
 */
struct CPUAddressSpace {
    CPUState *cpu;
    AddressSpace *as;
    struct AddressSpaceDispatch *memory_dispatch;
    MemoryListener tcg_as_listener;
};

struct DirtyBitmapSnapshot {
    ram_addr_t start;
    ram_addr_t end;
    unsigned long dirty[];
};

#endif

#if !defined(CONFIG_USER_ONLY)

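/* Grow the pre-allocated node pool so that at least @nodes more
 * PhysPageEntry nodes can be handed out without reallocating mid-update. */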
static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
{
    static unsigned alloc_hint = 16;
    if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
        map->nodes_nb_alloc = MAX(alloc_hint, map->nodes_nb + nodes);
        map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
        alloc_hint = map->nodes_nb_alloc;
    }
}

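/* Take one node from the pool and initialise all of its P_L2_SIZE entries
 * as empty leaves (or as empty intermediate entries when !leaf). */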
static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
{
    unsigned i;
    uint32_t ret;
    PhysPageEntry e;
    PhysPageEntry *p;

    ret = map->nodes_nb++;
    p = map->nodes[ret];
    assert(ret != PHYS_MAP_NODE_NIL);
    assert(ret != map->nodes_nb_alloc);

    e.skip = leaf ? 0 : 1;
    e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
    for (i = 0; i < P_L2_SIZE; ++i) {
        memcpy(&p[i], &e, sizeof(e));
    }
    return ret;
}

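/* Recursively fill in one level of the radix tree: map pages starting at
 * *index, for *nb pages, to section number @leaf, descending into child
 * nodes for entries that are only partially covered. */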
static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
                                hwaddr *index, uint64_t *nb, uint16_t leaf,
                                int level)
{
    PhysPageEntry *p;
    hwaddr step = (hwaddr)1 << (level * P_L2_BITS);

    if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
        lp->ptr = phys_map_node_alloc(map, level == 0);
    }
    p = map->nodes[lp->ptr];
    lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];

    while (*nb && lp < &p[P_L2_SIZE]) {
        if ((*index & (step - 1)) == 0 && *nb >= step) {
            lp->skip = 0;
            lp->ptr = leaf;
            *index += step;
            *nb -= step;
        } else {
            phys_page_set_level(map, lp, index, nb, leaf, level - 1);
        }
        ++lp;
    }
}

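/* Map @nb target pages starting at page index @index to the section
 * numbered @leaf in the dispatch map of @d. */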
static void phys_page_set(AddressSpaceDispatch *d,
                          hwaddr index, uint64_t nb,
                          uint16_t leaf)
{
    /* Wildly overreserve - it doesn't matter much. */
    phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);

    phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
}

/* Compact a non leaf page entry. Simply detect that the entry has a single child,
 * and update our entry so we can skip it and go directly to the destination.
 */
static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
{
    unsigned valid_ptr = P_L2_SIZE;
    int valid = 0;
    PhysPageEntry *p;
    int i;

    if (lp->ptr == PHYS_MAP_NODE_NIL) {
        return;
    }

    p = nodes[lp->ptr];
    for (i = 0; i < P_L2_SIZE; i++) {
        if (p[i].ptr == PHYS_MAP_NODE_NIL) {
            continue;
        }

        valid_ptr = i;
        valid++;
        if (p[i].skip) {
            phys_page_compact(&p[i], nodes);
        }
    }

    /* We can only compress if there's only one child. */
    if (valid != 1) {
        return;
    }

    assert(valid_ptr < P_L2_SIZE);

    /* Don't compress if it won't fit in the # of bits we have. */
    if (P_L2_LEVELS >= (1 << 6) &&
        lp->skip + p[valid_ptr].skip >= (1 << 6)) {
        return;
    }

    lp->ptr = p[valid_ptr].ptr;
    if (!p[valid_ptr].skip) {
        /* If our only child is a leaf, make this a leaf. */
        /* By design, we should have made this node a leaf to begin with so we
         * should never reach here.
         * But since it's so simple to handle this, let's do it just in case we
         * change this rule.
         */
        lp->skip = 0;
    } else {
        lp->skip += p[valid_ptr].skip;
    }
}

void address_space_dispatch_compact(AddressSpaceDispatch *d)
{
    if (d->phys_map.skip) {
        phys_page_compact(&d->phys_map, d->map.nodes);
    }
}

static inline bool section_covers_addr(const MemoryRegionSection *section,
                                       hwaddr addr)
{
    /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
     * the section must cover the entire address space.
     */
    return int128_gethi(section->size) ||
           range_covers_byte(section->offset_within_address_space,
                             int128_getlo(section->size), addr);
}

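/* Walk the multi-level map of @d and return the MemoryRegionSection that
 * covers @addr, or the unassigned section if nothing is mapped there. */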
static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr)
{
    PhysPageEntry lp = d->phys_map, *p;
    Node *nodes = d->map.nodes;
    MemoryRegionSection *sections = d->map.sections;
    hwaddr index = addr >> TARGET_PAGE_BITS;
    int i;

    for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
        if (lp.ptr == PHYS_MAP_NODE_NIL) {
            return &sections[PHYS_SECTION_UNASSIGNED];
        }
        p = nodes[lp.ptr];
        lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
    }

    if (section_covers_addr(&sections[lp.ptr], addr)) {
        return &sections[lp.ptr];
    } else {
        return &sections[PHYS_SECTION_UNASSIGNED];
    }
}

/* Called from RCU critical section */
static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
                                                        hwaddr addr,
                                                        bool resolve_subpage)
{
    MemoryRegionSection *section = atomic_read(&d->mru_section);
    subpage_t *subpage;

    if (!section || section == &d->map.sections[PHYS_SECTION_UNASSIGNED] ||
        !section_covers_addr(section, addr)) {
        section = phys_page_find(d, addr);
        atomic_set(&d->mru_section, section);
    }
    if (resolve_subpage && section->mr->subpage) {
        subpage = container_of(section->mr, subpage_t, iomem);
        section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
    }
    return section;
}

/* Called from RCU critical section */
static MemoryRegionSection *
address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
                                 hwaddr *plen, bool resolve_subpage)
{
    MemoryRegionSection *section;
    MemoryRegion *mr;
    Int128 diff;

    section = address_space_lookup_region(d, addr, resolve_subpage);
    /* Compute offset within MemoryRegionSection */
    addr -= section->offset_within_address_space;

    /* Compute offset within MemoryRegion */
    *xlat = addr + section->offset_within_region;

    mr = section->mr;

    /* MMIO registers can be expected to perform full-width accesses based only
     * on their address, without considering adjacent registers that could
     * decode to completely different MemoryRegions.  When such registers
     * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
     * regions overlap wildly.  For this reason we cannot clamp the accesses
     * here.
     *
     * If the length is small (as is the case for address_space_ldl/stl),
     * everything works fine.  If the incoming length is large, however,
     * the caller really has to do the clamping through memory_access_size.
     */
    if (memory_region_is_ram(mr)) {
        diff = int128_sub(section->size, int128_make64(addr));
        *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
    }
    return section;
}

/**
 * address_space_translate_iommu - translate an address through an IOMMU
 * memory region and then through the target address space.
 *
 * @iommu_mr: the IOMMU memory region that we start the translation from
 * @addr: the address to be translated through the MMU
 * @xlat: the translated address offset within the destination memory region.
 *        It cannot be %NULL.
 * @plen_out: valid read/write length of the translated address. It
 *            cannot be %NULL.
 * @page_mask_out: page mask for the translated address. This
 *            should only be meaningful for IOMMU translated
 *            addresses, since there may be huge pages that this bit
 *            would tell. It can be %NULL if we don't care about it.
 * @is_write: whether the translation operation is for write
 * @is_mmio: whether this can be MMIO, set true if it can
 * @target_as: the address space targeted by the IOMMU
 * @attrs: transaction attributes
 *
 * This function is called from RCU critical section.  It is the common
 * part of flatview_do_translate and address_space_translate_cached.
 */
static MemoryRegionSection address_space_translate_iommu(IOMMUMemoryRegion *iommu_mr,
                                                         hwaddr *xlat,
                                                         hwaddr *plen_out,
                                                         hwaddr *page_mask_out,
                                                         bool is_write,
                                                         bool is_mmio,
                                                         AddressSpace **target_as,
                                                         MemTxAttrs attrs)
{
    MemoryRegionSection *section;
    hwaddr page_mask = (hwaddr)-1;

    do {
        hwaddr addr = *xlat;
        IOMMUMemoryRegionClass *imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
        int iommu_idx = 0;
        IOMMUTLBEntry iotlb;

        if (imrc->attrs_to_index) {
            iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
        }

        iotlb = imrc->translate(iommu_mr, addr, is_write ?
                                IOMMU_WO : IOMMU_RO, iommu_idx);

        if (!(iotlb.perm & (1 << is_write))) {
            goto unassigned;
        }

        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
                | (addr & iotlb.addr_mask));
        page_mask &= iotlb.addr_mask;
        *plen_out = MIN(*plen_out, (addr | iotlb.addr_mask) - addr + 1);
        *target_as = iotlb.target_as;

        section = address_space_translate_internal(
                address_space_to_dispatch(iotlb.target_as), addr, xlat,
                plen_out, is_mmio);

        iommu_mr = memory_region_get_iommu(section->mr);
    } while (unlikely(iommu_mr));

    if (page_mask_out) {
        *page_mask_out = page_mask;
    }
    return *section;

unassigned:
    return (MemoryRegionSection) { .mr = &io_mem_unassigned };
}

/**
 * flatview_do_translate - translate an address in FlatView
 *
 * @fv: the flat view that we want to translate on
 * @addr: the address to be translated in above address space
 * @xlat: the translated address offset within memory region. It
 *        cannot be @NULL.
 * @plen_out: valid read/write length of the translated address. It
 *            can be @NULL when we don't care about it.
 * @page_mask_out: page mask for the translated address. This
 *            should only be meaningful for IOMMU translated
 *            addresses, since there may be huge pages that this bit
 *            would tell. It can be @NULL if we don't care about it.
 * @is_write: whether the translation operation is for write
 * @is_mmio: whether this can be MMIO, set true if it can
 * @target_as: the address space targeted by the IOMMU
 * @attrs: memory transaction attributes
 *
 * This function is called from RCU critical section
 */
static MemoryRegionSection flatview_do_translate(FlatView *fv,
                                                 hwaddr addr,
                                                 hwaddr *xlat,
                                                 hwaddr *plen_out,
                                                 hwaddr *page_mask_out,
                                                 bool is_write,
                                                 bool is_mmio,
                                                 AddressSpace **target_as,
                                                 MemTxAttrs attrs)
{
    MemoryRegionSection *section;
    IOMMUMemoryRegion *iommu_mr;
    hwaddr plen = (hwaddr)(-1);

    if (!plen_out) {
        plen_out = &plen;
    }

    section = address_space_translate_internal(
            flatview_to_dispatch(fv), addr, xlat,
            plen_out, is_mmio);

    iommu_mr = memory_region_get_iommu(section->mr);
    if (unlikely(iommu_mr)) {
        return address_space_translate_iommu(iommu_mr, xlat,
                                             plen_out, page_mask_out,
                                             is_write, is_mmio,
                                             target_as, attrs);
    }
    if (page_mask_out) {
        /* Not behind an IOMMU, use default page size. */
        *page_mask_out = ~TARGET_PAGE_MASK;
    }

    return *section;
}

/* Called from RCU critical section */
IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
                                            bool is_write, MemTxAttrs attrs)
{
    MemoryRegionSection section;
    hwaddr xlat, page_mask;

    /*
     * This can never be MMIO, and we don't really care about plen,
     * but page mask.
     */
    section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat,
                                    NULL, &page_mask, is_write, false, &as,
                                    attrs);

    /* Illegal translation */
    if (section.mr == &io_mem_unassigned) {
        goto iotlb_fail;
    }

    /* Convert memory region offset into address space offset */
    xlat += section.offset_within_address_space -
        section.offset_within_region;

    return (IOMMUTLBEntry) {
        .target_as = as,
        .iova = addr & ~page_mask,
        .translated_addr = xlat & ~page_mask,
        .addr_mask = page_mask,
        /* IOTLBs are for DMAs, and DMA only allows on RAMs. */
        .perm = IOMMU_RW,
    };

iotlb_fail:
    return (IOMMUTLBEntry) {0};
}

/* Called from RCU critical section */
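/* Resolve @addr in @fv down to a terminal (non-IOMMU) MemoryRegion and the
 * offset within it (*xlat); *plen may be reduced, e.g. when Xen requires
 * the mapping to stay within one page. */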
MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat,
                                 hwaddr *plen, bool is_write,
                                 MemTxAttrs attrs)
{
    MemoryRegion *mr;
    MemoryRegionSection section;
    AddressSpace *as = NULL;

    /* This can be MMIO, so setup MMIO bit. */
    section = flatview_do_translate(fv, addr, xlat, plen, NULL,
                                    is_write, true, &as, attrs);
    mr = section.mr;

    if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
        hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
        *plen = MIN(page, *plen);
    }

    return mr;
}

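/* Per-CPU record of an IOMMU notifier registered by TCG so the CPU's TLB
 * can be flushed when mappings it may have cached are unmapped. */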
typedef struct TCGIOMMUNotifier {
    IOMMUNotifier n;
    MemoryRegion *mr;
    CPUState *cpu;
    int iommu_idx;
    bool active;
} TCGIOMMUNotifier;

static void tcg_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    TCGIOMMUNotifier *notifier = container_of(n, TCGIOMMUNotifier, n);

    if (!notifier->active) {
        return;
    }
    tlb_flush(notifier->cpu);
    notifier->active = false;
    /* We leave the notifier struct on the list to avoid reallocating it later.
     * Generally the number of IOMMUs a CPU deals with will be small.
     * In any case we can't unregister the iommu notifier from a notify
     * callback.
     */
}

static void tcg_register_iommu_notifier(CPUState *cpu,
                                        IOMMUMemoryRegion *iommu_mr,
                                        int iommu_idx)
{
    /* Make sure this CPU has an IOMMU notifier registered for this
     * IOMMU/IOMMU index combination, so that we can flush its TLB
     * when the IOMMU tells us the mappings we've cached have changed.
     */
    MemoryRegion *mr = MEMORY_REGION(iommu_mr);
    TCGIOMMUNotifier *notifier;
    int i;

    for (i = 0; i < cpu->iommu_notifiers->len; i++) {
        notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
        if (notifier->mr == mr && notifier->iommu_idx == iommu_idx) {
            break;
        }
    }
    if (i == cpu->iommu_notifiers->len) {
        /* Not found, add a new entry at the end of the array */
        cpu->iommu_notifiers = g_array_set_size(cpu->iommu_notifiers, i + 1);
        notifier = g_new0(TCGIOMMUNotifier, 1);
        g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i) = notifier;

        notifier->mr = mr;
        notifier->iommu_idx = iommu_idx;
        notifier->cpu = cpu;
        /* Rather than trying to register interest in the specific part
         * of the iommu's address space that we've accessed and then
         * expand it later as subsequent accesses touch more of it, we
         * just register interest in the whole thing, on the assumption
         * that iommu reconfiguration will be rare.
         */
        iommu_notifier_init(&notifier->n,
                            tcg_iommu_unmap_notify,
                            IOMMU_NOTIFIER_UNMAP,
                            0,
                            HWADDR_MAX,
                            iommu_idx);
        memory_region_register_iommu_notifier(notifier->mr, &notifier->n);
    }

    if (!notifier->active) {
        notifier->active = true;
    }
}

static void tcg_iommu_free_notifier_list(CPUState *cpu)
{
    /* Destroy the CPU's notifier list */
    int i;
    TCGIOMMUNotifier *notifier;

    for (i = 0; i < cpu->iommu_notifiers->len; i++) {
        notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
        memory_region_unregister_iommu_notifier(notifier->mr, &notifier->n);
        g_free(notifier);
    }
    g_array_free(cpu->iommu_notifiers, true);
}

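/*
 * Translate an address for a TCG TLB fill, walking through any IOMMUs and
 * narrowing *prot down to the permissions the IOMMU actually grants.
 */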
/* Called from RCU critical section */
MemoryRegionSection *
address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
                                  hwaddr *xlat, hwaddr *plen,
                                  MemTxAttrs attrs, int *prot)
{
    MemoryRegionSection *section;
    IOMMUMemoryRegion *iommu_mr;
    IOMMUMemoryRegionClass *imrc;
    IOMMUTLBEntry iotlb;
    int iommu_idx;
    AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);

    for (;;) {
        section = address_space_translate_internal(d, addr, &addr, plen, false);

        iommu_mr = memory_region_get_iommu(section->mr);
        if (!iommu_mr) {
            break;
        }

        imrc = memory_region_get_iommu_class_nocheck(iommu_mr);

        iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
        tcg_register_iommu_notifier(cpu, iommu_mr, iommu_idx);
        /* We need all the permissions, so pass IOMMU_NONE so the IOMMU
         * doesn't short-cut its translation table walk.
         */
        iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, iommu_idx);
        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
                | (addr & iotlb.addr_mask));
        /* Update the caller's prot bits to remove permissions the IOMMU
         * is giving us a failure response for. If we get down to no
         * permissions left at all we can give up now.
         */
        if (!(iotlb.perm & IOMMU_RO)) {
            *prot &= ~(PAGE_READ | PAGE_EXEC);
        }
        if (!(iotlb.perm & IOMMU_WO)) {
            *prot &= ~PAGE_WRITE;
        }

        if (!*prot) {
            goto translate_fail;
        }

        d = flatview_to_dispatch(address_space_to_flatview(iotlb.target_as));
    }

    assert(!memory_region_is_iommu(section->mr));
    *xlat = addr;
    return section;

translate_fail:
    return &d->map.sections[PHYS_SECTION_UNASSIGNED];
}
#endif

#if !defined(CONFIG_USER_ONLY)

static int cpu_common_post_load(void *opaque, int version_id)
{
    CPUState *cpu = opaque;

    /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
       version_id is increased. */
    cpu->interrupt_request &= ~0x01;
    tlb_flush(cpu);

    /* loadvm has just updated the content of RAM, bypassing the
     * usual mechanisms that ensure we flush TBs for writes to
     * memory we've translated code from. So we must flush all TBs,
     * which will now be stale.
     */
    tb_flush(cpu);

    return 0;
}

static int cpu_common_pre_load(void *opaque)
{
    CPUState *cpu = opaque;

    cpu->exception_index = -1;

    return 0;
}

static bool cpu_common_exception_index_needed(void *opaque)
{
    CPUState *cpu = opaque;

    return tcg_enabled() && cpu->exception_index != -1;
}

static const VMStateDescription vmstate_cpu_common_exception_index = {
    .name = "cpu_common/exception_index",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = cpu_common_exception_index_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT32(exception_index, CPUState),
        VMSTATE_END_OF_LIST()
    }
};

static bool cpu_common_crash_occurred_needed(void *opaque)
{
    CPUState *cpu = opaque;

    return cpu->crash_occurred;
}

static const VMStateDescription vmstate_cpu_common_crash_occurred = {
    .name = "cpu_common/crash_occurred",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = cpu_common_crash_occurred_needed,
    .fields = (VMStateField[]) {
        VMSTATE_BOOL(crash_occurred, CPUState),
        VMSTATE_END_OF_LIST()
    }
};

const VMStateDescription vmstate_cpu_common = {
    .name = "cpu_common",
    .version_id = 1,
    .minimum_version_id = 1,
    .pre_load = cpu_common_pre_load,
    .post_load = cpu_common_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(halted, CPUState),
        VMSTATE_UINT32(interrupt_request, CPUState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &vmstate_cpu_common_exception_index,
        &vmstate_cpu_common_crash_occurred,
        NULL
    }
};

#endif

CPUState *qemu_get_cpu(int index)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (cpu->cpu_index == index) {
            return cpu;
        }
    }

    return NULL;
}

#if !defined(CONFIG_USER_ONLY)
void cpu_address_space_init(CPUState *cpu, int asidx,
                            const char *prefix, MemoryRegion *mr)
{
    CPUAddressSpace *newas;
    AddressSpace *as = g_new0(AddressSpace, 1);
    char *as_name;

    assert(mr);
    as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index);
    address_space_init(as, mr, as_name);
    g_free(as_name);

    /* Target code should have set num_ases before calling us */
    assert(asidx < cpu->num_ases);

    if (asidx == 0) {
        /* address space 0 gets the convenience alias */
        cpu->as = as;
    }

    /* KVM cannot currently support multiple address spaces. */
    assert(asidx == 0 || !kvm_enabled());

    if (!cpu->cpu_ases) {
        cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
    }

    newas = &cpu->cpu_ases[asidx];
    newas->cpu = cpu;
    newas->as = as;
    if (tcg_enabled()) {
        newas->tcg_as_listener.log_global_after_sync = tcg_log_global_after_sync;
        newas->tcg_as_listener.commit = tcg_commit;
        memory_listener_register(&newas->tcg_as_listener, as);
    }
}

AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
{
    /* Return the AddressSpace corresponding to the specified index */
    return cpu->cpu_ases[asidx].as;
}
#endif

void cpu_exec_unrealizefn(CPUState *cpu)
{
    CPUClass *cc = CPU_GET_CLASS(cpu);

    cpu_list_remove(cpu);

    if (cc->vmsd != NULL) {
        vmstate_unregister(NULL, cc->vmsd, cpu);
    }
    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
        vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
    }
#ifndef CONFIG_USER_ONLY
    tcg_iommu_free_notifier_list(cpu);
#endif
}

Property cpu_common_props[] = {
#ifndef CONFIG_USER_ONLY
    /* Create a memory property for softmmu CPU object,
     * so users can wire up its memory. (This can't go in hw/core/cpu.c
     * because that file is compiled only once for both user-mode
     * and system builds.) The default if no link is set up is to use
     * the system address space.
     */
    DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION,
                     MemoryRegion *),
#endif
    DEFINE_PROP_END_OF_LIST(),
};

void cpu_exec_initfn(CPUState *cpu)
{
    cpu->as = NULL;
    cpu->num_ases = 0;

#ifndef CONFIG_USER_ONLY
    cpu->thread_id = qemu_get_thread_id();
    cpu->memory = system_memory;
    object_ref(OBJECT(cpu->memory));
#endif
}

void cpu_exec_realizefn(CPUState *cpu, Error **errp)
{
    CPUClass *cc = CPU_GET_CLASS(cpu);
    static bool tcg_target_initialized;

    cpu_list_add(cpu);

    if (tcg_enabled() && !tcg_target_initialized) {
        tcg_target_initialized = true;
        cc->tcg_initialize();
    }
    tlb_init(cpu);

#ifndef CONFIG_USER_ONLY
    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
        vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
    }
    if (cc->vmsd != NULL) {
        vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
    }

    cpu->iommu_notifiers = g_array_new(false, true, sizeof(TCGIOMMUNotifier *));
#endif
}

const char *parse_cpu_option(const char *cpu_option)
{
    ObjectClass *oc;
    CPUClass *cc;
    gchar **model_pieces;
    const char *cpu_type;

    model_pieces = g_strsplit(cpu_option, ",", 2);
    if (!model_pieces[0]) {
        error_report("-cpu option cannot be empty");
        exit(1);
    }

    oc = cpu_class_by_name(CPU_RESOLVING_TYPE, model_pieces[0]);
    if (oc == NULL) {
        error_report("unable to find CPU model '%s'", model_pieces[0]);
        g_strfreev(model_pieces);
        exit(EXIT_FAILURE);
    }

    cpu_type = object_class_get_name(oc);
    cc = CPU_CLASS(oc);
    cc->parse_features(cpu_type, model_pieces[1], &error_fatal);
    g_strfreev(model_pieces);
    return cpu_type;
}

#if defined(CONFIG_USER_ONLY)
void tb_invalidate_phys_addr(target_ulong addr)
{
    mmap_lock();
    tb_invalidate_phys_page_range(addr, addr + 1, 0);
    mmap_unlock();
}

static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
{
    tb_invalidate_phys_addr(pc);
}
#else
void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
{
    ram_addr_t ram_addr;
    MemoryRegion *mr;
    hwaddr l = 1;

    if (!tcg_enabled()) {
        return;
    }

    rcu_read_lock();
    mr = address_space_translate(as, addr, &addr, &l, false, attrs);
    if (!(memory_region_is_ram(mr)
          || memory_region_is_romd(mr))) {
        rcu_read_unlock();
        return;
    }
    ram_addr = memory_region_get_ram_addr(mr) + addr;
    tb_invalidate_phys_page_range(ram_addr, ram_addr + 1, 0);
    rcu_read_unlock();
}

static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
{
    MemTxAttrs attrs;
    hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
    int asidx = cpu_asidx_from_attrs(cpu, attrs);
    if (phys != -1) {
        /* Locks grabbed by tb_invalidate_phys_addr */
        tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
1057
                                phys | (pc & ~TARGET_PAGE_MASK), attrs);
1058
    }
1059
}
1060
#endif
B
bellard 已提交
1061

1062
#ifndef CONFIG_USER_ONLY
1063
/* Add a watchpoint.  */
1064
int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
1065
                          int flags, CPUWatchpoint **watchpoint)
1066
{
1067
    CPUWatchpoint *wp;
1068

1069
    /* forbid ranges which are empty or run off the end of the address space */
1070
    if (len == 0 || (addr + len - 1) < addr) {
1071 1072
        error_report("tried to set invalid watchpoint at %"
                     VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
1073 1074
        return -EINVAL;
    }
1075
    wp = g_malloc(sizeof(*wp));
1076 1077

    wp->vaddr = addr;
1078
    wp->len = len;
1079 1080
    wp->flags = flags;

1081
    /* keep all GDB-injected watchpoints in front */
1082 1083 1084 1085 1086
    if (flags & BP_GDB) {
        QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
    } else {
        QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
    }
1087

1088
    tlb_flush_page(cpu, addr);
1089 1090 1091 1092

    if (watchpoint)
        *watchpoint = wp;
    return 0;
1093 1094
}

1095
/* Remove a specific watchpoint.  */
1096
int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
1097
                          int flags)
1098
{
1099
    CPUWatchpoint *wp;
1100

1101
    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
1102
        if (addr == wp->vaddr && len == wp->len
1103
                && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
1104
            cpu_watchpoint_remove_by_ref(cpu, wp);
1105 1106 1107
            return 0;
        }
    }
1108
    return -ENOENT;
1109 1110
}

1111
/* Remove a specific watchpoint by reference.  */
1112
void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
1113
{
1114
    QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
1115

1116
    tlb_flush_page(cpu, watchpoint->vaddr);
1117

1118
    g_free(watchpoint);
1119 1120 1121
}

/* Remove all matching watchpoints.  */
1122
void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
1123
{
1124
    CPUWatchpoint *wp, *next;
1125

1126
    QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
1127 1128 1129
        if (wp->flags & mask) {
            cpu_watchpoint_remove_by_ref(cpu, wp);
        }
1130
    }
1131
}
1132 1133 1134 1135 1136 1137

/* Return true if this watchpoint address matches the specified
 * access (ie the address range covered by the watchpoint overlaps
 * partially or completely with the address range covered by the
 * access).
 */
static inline bool watchpoint_address_matches(CPUWatchpoint *wp,
                                              vaddr addr, vaddr len)
{
    /* We know the lengths are non-zero, but a little caution is
     * required to avoid errors in the case where the range ends
     * exactly at the top of the address space and so addr + len
     * wraps round to zero.
     */
    vaddr wpend = wp->vaddr + wp->len - 1;
    vaddr addrend = addr + len - 1;

    return !(addr > wpend || wp->vaddr > addrend);
}

/* Return flags for watchpoints that match addr + len.  */
int cpu_watchpoint_address_matches(CPUState *cpu, vaddr addr, vaddr len)
{
    CPUWatchpoint *wp;
    int ret = 0;

    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
        if (watchpoint_address_matches(wp, addr, TARGET_PAGE_SIZE)) {
            ret |= wp->flags;
        }
    }
    return ret;
}
#endif /* !CONFIG_USER_ONLY */

/* Add a breakpoint.  */
int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
                          CPUBreakpoint **breakpoint)
{
    CPUBreakpoint *bp;

    bp = g_malloc(sizeof(*bp));

    bp->pc = pc;
    bp->flags = flags;

    /* keep all GDB-injected breakpoints in front */
    if (flags & BP_GDB) {
        QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
    } else {
        QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
    }

    breakpoint_invalidate(cpu, pc);

    if (breakpoint) {
        *breakpoint = bp;
    }
    return 0;
}

/* Remove a specific breakpoint.  */
int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
{
    CPUBreakpoint *bp;

    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
        if (bp->pc == pc && bp->flags == flags) {
            cpu_breakpoint_remove_by_ref(cpu, bp);
            return 0;
        }
    }
    return -ENOENT;
}

/* Remove a specific breakpoint by reference.  */
void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint)
{
    QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);

    breakpoint_invalidate(cpu, breakpoint->pc);

    g_free(breakpoint);
}

/* Remove all matching breakpoints. */
void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
{
    CPUBreakpoint *bp, *next;

    QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
        if (bp->flags & mask) {
            cpu_breakpoint_remove_by_ref(cpu, bp);
        }
    }
}

/* enable or disable single step mode. EXCP_DEBUG is returned by the
   CPU loop after each instruction */
void cpu_single_step(CPUState *cpu, int enabled)
{
    if (cpu->singlestep_enabled != enabled) {
        cpu->singlestep_enabled = enabled;
        if (kvm_enabled()) {
            kvm_update_guest_debug(cpu, 0);
        } else {
            /* must flush all the translated code to avoid inconsistencies */
            /* XXX: only flush what is necessary */
            tb_flush(cpu);
        }
    }
}

void cpu_abort(CPUState *cpu, const char *fmt, ...)
{
    va_list ap;
    va_list ap2;

    va_start(ap, fmt);
    va_copy(ap2, ap);
    fprintf(stderr, "qemu: fatal: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    cpu_dump_state(cpu, stderr, CPU_DUMP_FPU | CPU_DUMP_CCOP);
    if (qemu_log_separate()) {
        qemu_log_lock();
        qemu_log("qemu: fatal: ");
        qemu_log_vprintf(fmt, ap2);
        qemu_log("\n");
        log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
        qemu_log_flush();
        qemu_log_unlock();
        qemu_log_close();
    }
    va_end(ap2);
    va_end(ap);
    replay_finish();
#if defined(CONFIG_USER_ONLY)
    {
        struct sigaction act;
        sigfillset(&act.sa_mask);
        act.sa_handler = SIG_DFL;
        act.sa_flags = 0;
        sigaction(SIGABRT, &act, NULL);
    }
#endif
    abort();
}

#if !defined(CONFIG_USER_ONLY)
/* Called from RCU critical section */
static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
{
    RAMBlock *block;

    block = atomic_rcu_read(&ram_list.mru_block);
    if (block && addr - block->offset < block->max_length) {
        return block;
    }
    RAMBLOCK_FOREACH(block) {
        if (addr - block->offset < block->max_length) {
            goto found;
        }
    }

    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
    abort();

found:
    /* It is safe to write mru_block outside the iothread lock.  This
     * is what happens:
     *
     *     mru_block = xxx
     *     rcu_read_unlock()
     *                                        xxx removed from list
     *                  rcu_read_lock()
     *                  read mru_block
     *                                        mru_block = NULL;
     *                                        call_rcu(reclaim_ramblock, xxx);
     *                  rcu_read_unlock()
     *
     * atomic_rcu_set is not needed here.  The block was already published
     * when it was placed into the list.  Here we're just making an extra
     * copy of the pointer.
     */
    ram_list.mru_block = block;
    return block;
}

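/* Reset the dirty-tracking state in every vCPU's TLB for this guest RAM
 * range so that subsequent writes are trapped again. */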
static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
{
    CPUState *cpu;
    ram_addr_t start1;
    RAMBlock *block;
    ram_addr_t end;

    assert(tcg_enabled());
    end = TARGET_PAGE_ALIGN(start + length);
    start &= TARGET_PAGE_MASK;

    rcu_read_lock();
    block = qemu_get_ram_block(start);
    assert(block == qemu_get_ram_block(end - 1));
    start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
    CPU_FOREACH(cpu) {
        tlb_reset_dirty(cpu, start1, length);
    }
    rcu_read_unlock();
}

/* Note: start and end must be within the same ram block.  */
bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
                                              ram_addr_t length,
                                              unsigned client)
{
    DirtyMemoryBlocks *blocks;
    unsigned long end, page;
    bool dirty = false;
    RAMBlock *ramblock;
    uint64_t mr_offset, mr_size;

    if (length == 0) {
        return false;
    }

    end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
    page = start >> TARGET_PAGE_BITS;

    rcu_read_lock();

    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
    ramblock = qemu_get_ram_block(start);
    /* Range sanity check on the ramblock */
    assert(start >= ramblock->offset &&
           start + length <= ramblock->offset + ramblock->used_length);

    while (page < end) {
        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);

        dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
                                              offset, num);
        page += num;
    }

    mr_offset = (ram_addr_t)(page << TARGET_PAGE_BITS) - ramblock->offset;
    mr_size = (end - page) << TARGET_PAGE_BITS;
    memory_region_clear_dirty_bitmap(ramblock->mr, mr_offset, mr_size);

    rcu_read_unlock();

    if (dirty && tcg_enabled()) {
        tlb_reset_dirty_range_all(start, length);
    }

    return dirty;
}

DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
    (MemoryRegion *mr, hwaddr offset, hwaddr length, unsigned client)
{
    DirtyMemoryBlocks *blocks;
    ram_addr_t start = memory_region_get_ram_addr(mr) + offset;
    unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
    ram_addr_t first = QEMU_ALIGN_DOWN(start, align);
    ram_addr_t last  = QEMU_ALIGN_UP(start + length, align);
    DirtyBitmapSnapshot *snap;
    unsigned long page, end, dest;

    snap = g_malloc0(sizeof(*snap) +
                     ((last - first) >> (TARGET_PAGE_BITS + 3)));
    snap->start = first;
    snap->end   = last;

    page = first >> TARGET_PAGE_BITS;
    end  = last  >> TARGET_PAGE_BITS;
    dest = 0;

    rcu_read_lock();

    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);

    while (page < end) {
        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);

        assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL)));
        assert(QEMU_IS_ALIGNED(num,    (1 << BITS_PER_LEVEL)));
        offset >>= BITS_PER_LEVEL;

        bitmap_copy_and_clear_atomic(snap->dirty + dest,
                                     blocks->blocks[idx] + offset,
                                     num);
        page += num;
        dest += num >> BITS_PER_LEVEL;
    }

    rcu_read_unlock();

    if (tcg_enabled()) {
        tlb_reset_dirty_range_all(start, length);
    }

    memory_region_clear_dirty_bitmap(mr, offset, length);

    return snap;
}

bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
                                            ram_addr_t start,
                                            ram_addr_t length)
{
    unsigned long page, end;

    assert(start >= snap->start);
    assert(start + length <= snap->end);

    end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
    page = (start - snap->start) >> TARGET_PAGE_BITS;

    while (page < end) {
        if (test_bit(page, snap->dirty)) {
            return true;
        }
        page++;
    }
    return false;
}

/* Called from RCU critical section */
hwaddr memory_region_section_get_iotlb(CPUState *cpu,
                                       MemoryRegionSection *section,
                                       target_ulong vaddr,
                                       hwaddr paddr, hwaddr xlat,
                                       int prot,
                                       target_ulong *address)
{
    hwaddr iotlb;

    if (memory_region_is_ram(section->mr)) {
        /* Normal RAM.  */
        iotlb = memory_region_get_ram_addr(section->mr) + xlat;
        if (!section->readonly) {
            iotlb |= PHYS_SECTION_NOTDIRTY;
        } else {
            iotlb |= PHYS_SECTION_ROM;
        }
    } else {
        AddressSpaceDispatch *d;

        d = flatview_to_dispatch(section->fv);
        iotlb = section - d->map.sections;
        iotlb += xlat;
    }

    return iotlb;
}
#endif /* defined(CONFIG_USER_ONLY) */

#if !defined(CONFIG_USER_ONLY)

static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
                            uint16_t section);
static subpage_t *subpage_init(FlatView *fv, hwaddr base);

static void *(*phys_mem_alloc)(size_t size, uint64_t *align, bool shared) =
                               qemu_anon_ram_alloc;

/*
 * Set a custom physical guest memory allocator.
 * Accelerators with unusual needs may need this.  Hopefully, we can
 * get rid of it eventually.
 */
void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align, bool shared))
{
    phys_mem_alloc = alloc;
}

static uint16_t phys_section_add(PhysPageMap *map,
                                 MemoryRegionSection *section)
{
    /* The physical section number is ORed with a page-aligned
     * pointer to produce the iotlb entries.  Thus it should
     * never overflow into the page-aligned value.
     */
    assert(map->sections_nb < TARGET_PAGE_SIZE);

    if (map->sections_nb == map->sections_nb_alloc) {
        map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
        map->sections = g_renew(MemoryRegionSection, map->sections,
                                map->sections_nb_alloc);
    }
    map->sections[map->sections_nb] = *section;
    memory_region_ref(section->mr);
    return map->sections_nb++;
}

static void phys_section_destroy(MemoryRegion *mr)
{
    bool have_sub_page = mr->subpage;

    memory_region_unref(mr);

    if (have_sub_page) {
        subpage_t *subpage = container_of(mr, subpage_t, iomem);
        object_unref(OBJECT(&subpage->iomem));
        g_free(subpage);
    }
}

static void phys_sections_free(PhysPageMap *map)
{
    while (map->sections_nb > 0) {
        MemoryRegionSection *section = &map->sections[--map->sections_nb];
        phys_section_destroy(section->mr);
    }
    g_free(map->sections);
    g_free(map->nodes);
}

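/* Install @section as a subpage mapping: the target page gets a subpage_t
 * container (created on first use) and the sub-range is registered in it. */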
static void register_subpage(FlatView *fv, MemoryRegionSection *section)
{
    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
    subpage_t *subpage;
    hwaddr base = section->offset_within_address_space
        & TARGET_PAGE_MASK;
    MemoryRegionSection *existing = phys_page_find(d, base);
    MemoryRegionSection subsection = {
        .offset_within_address_space = base,
        .size = int128_make64(TARGET_PAGE_SIZE),
    };
    hwaddr start, end;

    assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);

    if (!(existing->mr->subpage)) {
        subpage = subpage_init(fv, base);
        subsection.fv = fv;
        subsection.mr = &subpage->iomem;
        phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
                      phys_section_add(&d->map, &subsection));
    } else {
        subpage = container_of(existing->mr, subpage_t, iomem);
    }
    start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
    end = start + int128_get64(section->size) - 1;
    subpage_register(subpage, start, end,
                     phys_section_add(&d->map, section));
}


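/* Map a page-aligned section whose size is a multiple of the page size
 * straight into the dispatch tree, one section index for all its pages. */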
static void register_multipage(FlatView *fv,
1586
                               MemoryRegionSection *section)
1587
{
1588
    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
A
Avi Kivity 已提交
1589
    hwaddr start_addr = section->offset_within_address_space;
1590
    uint16_t section_index = phys_section_add(&d->map, section);
1591 1592
    uint64_t num_pages = int128_get64(int128_rshift(section->size,
                                                    TARGET_PAGE_BITS));
1593

1594 1595
    assert(num_pages);
    phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
1596 1597
}

/*
 * The range in *section* may look like this:
 *
 *      |s|PPPPPPP|s|
 *
 * where s stands for subpage and P for page.
 */
void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section)
{
    MemoryRegionSection remain = *section;
    Int128 page_size = int128_make64(TARGET_PAGE_SIZE);

    /* register first subpage */
    if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
        uint64_t left = TARGET_PAGE_ALIGN(remain.offset_within_address_space)
                        - remain.offset_within_address_space;

        MemoryRegionSection now = remain;
        now.size = int128_min(int128_make64(left), now.size);
        register_subpage(fv, &now);
        if (int128_eq(remain.size, now.size)) {
            return;
        }
        remain.size = int128_sub(remain.size, now.size);
        remain.offset_within_address_space += int128_get64(now.size);
        remain.offset_within_region += int128_get64(now.size);
    }

    /* register whole pages */
    if (int128_ge(remain.size, page_size)) {
        MemoryRegionSection now = remain;
        now.size = int128_and(now.size, int128_neg(page_size));
        register_multipage(fv, &now);
        if (int128_eq(remain.size, now.size)) {
            return;
        }
        remain.size = int128_sub(remain.size, now.size);
        remain.offset_within_address_space += int128_get64(now.size);
        remain.offset_within_region += int128_get64(now.size);
    }

    /* register last subpage */
    register_subpage(fv, &remain);
}
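/*
 * Worked example (illustrative): with 4 KiB target pages, a section covering
 * [0x1800, 0x5400) is split into a head subpage [0x1800, 0x2000), whole pages
 * [0x2000, 0x5000) registered via register_multipage(), and a tail subpage
 * [0x5000, 0x5400).
 */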

void qemu_flush_coalesced_mmio_buffer(void)
{
    if (kvm_enabled()) {
        kvm_flush_coalesced_mmio_buffer();
    }
}

void qemu_mutex_lock_ramlist(void)
{
    qemu_mutex_lock(&ram_list.mutex);
}

void qemu_mutex_unlock_ramlist(void)
{
    qemu_mutex_unlock(&ram_list.mutex);
}

void ram_block_dump(Monitor *mon)
{
    RAMBlock *block;
    char *psize;

    rcu_read_lock();
    monitor_printf(mon, "%24s %8s  %18s %18s %18s\n",
                   "Block Name", "PSize", "Offset", "Used", "Total");
    RAMBLOCK_FOREACH(block) {
        psize = size_to_str(block->page_size);
        monitor_printf(mon, "%24s %8s  0x%016" PRIx64 " 0x%016" PRIx64
                       " 0x%016" PRIx64 "\n", block->idstr, psize,
                       (uint64_t)block->offset,
                       (uint64_t)block->used_length,
                       (uint64_t)block->max_length);
        g_free(psize);
    }
    rcu_read_unlock();
}
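/*
 * Note: this helper backs the HMP "info ramblock" command; each row reports a
 * block's backing page size plus its offset, used length and max length
 * within the ram_addr_t space, printed as zero-padded hexadecimal.
 */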

#ifdef __linux__
/*
 * FIXME TOCTTOU: this iterates over memory backends' mem-path, which
 * may or may not name the same files / on the same filesystem now as
 * when we actually open and map them.  Iterate over the file
 * descriptors instead, and use qemu_fd_getpagesize().
 */
static int find_min_backend_pagesize(Object *obj, void *opaque)
{
    long *hpsize_min = opaque;

    if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
        HostMemoryBackend *backend = MEMORY_BACKEND(obj);
        long hpsize = host_memory_backend_pagesize(backend);

        if (host_memory_backend_is_mapped(backend) && (hpsize < *hpsize_min)) {
            *hpsize_min = hpsize;
        }
    }

    return 0;
}

static int find_max_backend_pagesize(Object *obj, void *opaque)
{
    long *hpsize_max = opaque;

    if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
        HostMemoryBackend *backend = MEMORY_BACKEND(obj);
        long hpsize = host_memory_backend_pagesize(backend);

        if (host_memory_backend_is_mapped(backend) && (hpsize > *hpsize_max)) {
            *hpsize_max = hpsize;
        }
    }

    return 0;
}

/*
 * TODO: We assume right now that all mapped host memory backends are
 * used as RAM, however some might be used for different purposes.
 */
long qemu_minrampagesize(void)
{
    long hpsize = LONG_MAX;
    long mainrampagesize;
    Object *memdev_root;
    MachineState *ms = MACHINE(qdev_get_machine());

    mainrampagesize = qemu_mempath_getpagesize(mem_path);

    /* it's possible we have memory-backend objects with
     * hugepage-backed RAM. these may get mapped into system
     * address space via -numa parameters or memory hotplug
     * hooks. we want to take these into account, but we
     * also want to make sure these supported hugepage
     * sizes are applicable across the entire range of memory
     * we may boot from, so we take the min across all
     * backends, and assume normal pages in cases where a
     * backend isn't backed by hugepages.
     */
    memdev_root = object_resolve_path("/objects", NULL);
    if (memdev_root) {
        object_child_foreach(memdev_root, find_min_backend_pagesize, &hpsize);
    }
    if (hpsize == LONG_MAX) {
        /* No additional memory regions found ==> Report main RAM page size */
        return mainrampagesize;
    }

    /* If NUMA is disabled or the NUMA nodes are not backed with a
     * memory-backend, then there is at least one node using "normal" RAM,
     * so if its page size is smaller we have got to report that size instead.
     */
    if (hpsize > mainrampagesize &&
        (ms->numa_state == NULL ||
         ms->numa_state->num_nodes == 0 ||
         ms->numa_state->nodes[0].node_memdev == NULL)) {
        static bool warned;
        if (!warned) {
            error_report("Huge page support disabled (n/a for main memory).");
            warned = true;
        }
        return mainrampagesize;
    }

    return hpsize;
}

long qemu_maxrampagesize(void)
{
    long pagesize = qemu_mempath_getpagesize(mem_path);
    Object *memdev_root = object_resolve_path("/objects", NULL);

    if (memdev_root) {
        object_child_foreach(memdev_root, find_max_backend_pagesize,
                             &pagesize);
    }
    return pagesize;
}
#else
long qemu_minrampagesize(void)
{
    return getpagesize();
}
long qemu_maxrampagesize(void)
{
    return getpagesize();
}
#endif
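/*
 * Usage sketch (illustrative, not taken from a caller in this file): a board
 * or migration helper might compare the two bounds to detect mixed page
 * sizes among the mapped backends, e.g.
 *
 *     if (qemu_minrampagesize() != qemu_maxrampagesize()) {
 *         ... at least two backends use different host page sizes ...
 *     }
 *
 * Both walk the "/objects" QOM tree, so they assume backends already exist.
 */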

#ifdef CONFIG_POSIX
static int64_t get_file_size(int fd)
{
    int64_t size = lseek(fd, 0, SEEK_END);
    if (size < 0) {
        return -errno;
    }
    return size;
}

static int file_ram_open(const char *path,
                         const char *region_name,
                         bool *created,
                         Error **errp)
{
    char *filename;
    char *sanitized_name;
    char *c;
    int fd = -1;

    *created = false;
    for (;;) {
        fd = open(path, O_RDWR);
        if (fd >= 0) {
            /* @path names an existing file, use it */
            break;
        }
        if (errno == ENOENT) {
            /* @path names a file that doesn't exist, create it */
            fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
            if (fd >= 0) {
                *created = true;
                break;
            }
        } else if (errno == EISDIR) {
            /* @path names a directory, create a file there */
            /* Make name safe to use with mkstemp by replacing '/' with '_'. */
            sanitized_name = g_strdup(region_name);
            for (c = sanitized_name; *c != '\0'; c++) {
                if (*c == '/') {
                    *c = '_';
                }
            }

            filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
                                       sanitized_name);
            g_free(sanitized_name);

            fd = mkstemp(filename);
            if (fd >= 0) {
                unlink(filename);
                g_free(filename);
                break;
            }
            g_free(filename);
        }
        if (errno != EEXIST && errno != EINTR) {
            error_setg_errno(errp, errno,
                             "can't open backing store %s for guest RAM",
                             path);
            return -1;
        }
        /*
         * Try again on EINTR and EEXIST.  The latter happens when
         * something else creates the file between our two open().
         */
    }

    return fd;
}

static void *file_ram_alloc(RAMBlock *block,
                            ram_addr_t memory,
                            int fd,
                            bool truncate,
                            Error **errp)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    void *area;

    block->page_size = qemu_fd_getpagesize(fd);
    if (block->mr->align % block->page_size) {
        error_setg(errp, "alignment 0x%" PRIx64
                   " must be multiples of page size 0x%zx",
                   block->mr->align, block->page_size);
        return NULL;
    } else if (block->mr->align && !is_power_of_2(block->mr->align)) {
        error_setg(errp, "alignment 0x%" PRIx64
                   " must be a power of two", block->mr->align);
        return NULL;
    }
    block->mr->align = MAX(block->page_size, block->mr->align);
#if defined(__s390x__)
    if (kvm_enabled()) {
        block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
    }
#endif

    if (memory < block->page_size) {
        error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
                   "or larger than page size 0x%zx",
                   memory, block->page_size);
        return NULL;
    }

    memory = ROUND_UP(memory, block->page_size);

    /*
     * ftruncate is not supported by hugetlbfs in older
     * hosts, so don't bother bailing out on errors.
     * If anything goes wrong with it under other filesystems,
     * mmap will fail.
     *
     * Do not truncate the non-empty backend file to avoid corrupting
     * the existing data in the file. Disabling shrinking is not
     * enough. For example, the current vNVDIMM implementation stores
     * the guest NVDIMM labels at the end of the backend file. If the
     * backend file is later extended, QEMU will not be able to find
     * those labels. Therefore, extending the non-empty backend file
     * is disabled as well.
     */
    if (truncate && ftruncate(fd, memory)) {
        perror("ftruncate");
    }

    area = qemu_ram_mmap(fd, memory, block->mr->align,
                         block->flags & RAM_SHARED, block->flags & RAM_PMEM);
    if (area == MAP_FAILED) {
        error_setg_errno(errp, errno,
                         "unable to map backing store for guest RAM");
        return NULL;
    }

    if (mem_prealloc) {
        os_mem_prealloc(fd, area, memory, ms->smp.cpus, errp);
        if (errp && *errp) {
            qemu_ram_munmap(fd, area, memory);
            return NULL;
        }
    }

    block->fd = fd;
    return area;
}
#endif
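/*
 * Illustrative pairing (assumption, paths and names are placeholders): the
 * -mem-path flow opens the backing store first and then maps it, roughly
 *
 *     bool created;
 *     int fd = file_ram_open("/dev/hugepages", "pc.ram", &created, &err);
 *     void *host = file_ram_alloc(block, block->max_length, fd, created, &err);
 *
 * where the truncate argument is only safe for files without existing data;
 * qemu_ram_alloc_from_fd() below passes !file_size for exactly that reason.
 */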

/* Allocate space within the ram_addr_t space that governs the
 * dirty bitmaps.
 * Called with the ramlist lock held.
 */
static ram_addr_t find_ram_offset(ram_addr_t size)
{
    RAMBlock *block, *next_block;
    ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;

    assert(size != 0); /* it would hand out same offset multiple times */

    if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
        return 0;
    }

    RAMBLOCK_FOREACH(block) {
        ram_addr_t candidate, next = RAM_ADDR_MAX;

        /* Align blocks to start on a 'long' in the bitmap
         * which makes the bitmap sync'ing take the fast path.
         */
        candidate = block->offset + block->max_length;
        candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS);

        /* Search for the closest following block
         * and find the gap.
         */
        RAMBLOCK_FOREACH(next_block) {
            if (next_block->offset >= candidate) {
                next = MIN(next, next_block->offset);
            }
        }

        /* If it fits remember our place and remember the size
         * of gap, but keep going so that we might find a smaller
         * gap to fill so avoiding fragmentation.
         */
        if (next - candidate >= size && next - candidate < mingap) {
            offset = candidate;
            mingap = next - candidate;
        }

        trace_find_ram_offset_loop(size, candidate, offset, next, mingap);
    }

    if (offset == RAM_ADDR_MAX) {
        fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
                (uint64_t)size);
        abort();
    }

    trace_find_ram_offset(size, offset);

    return offset;
}
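/*
 * Example (illustrative): with blocks already at [0x0, 0x40000000) and
 * [0x80000000, 0xc0000000), a request for 0x20000000 bytes is placed at
 * offset 0x40000000 (the smallest gap that still fits), which keeps the
 * ram_addr_t space from fragmenting.
 */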

static unsigned long last_ram_page(void)
{
    RAMBlock *block;
    ram_addr_t last = 0;

    rcu_read_lock();
    RAMBLOCK_FOREACH(block) {
        last = MAX(last, block->offset + block->max_length);
    }
    rcu_read_unlock();
    return last >> TARGET_PAGE_BITS;
}

static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
{
    int ret;

    /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
    if (!machine_dump_guest_core(current_machine)) {
        ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
        if (ret) {
            perror("qemu_madvise");
            fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
                            "but dump_guest_core=off specified\n");
        }
    }
}

const char *qemu_ram_get_idstr(RAMBlock *rb)
{
    return rb->idstr;
}

void *qemu_ram_get_host_addr(RAMBlock *rb)
{
    return rb->host;
}

ram_addr_t qemu_ram_get_offset(RAMBlock *rb)
{
    return rb->offset;
}

ram_addr_t qemu_ram_get_used_length(RAMBlock *rb)
{
    return rb->used_length;
}

bool qemu_ram_is_shared(RAMBlock *rb)
{
    return rb->flags & RAM_SHARED;
}

/* Note: Only set at the start of postcopy */
bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
{
    return rb->flags & RAM_UF_ZEROPAGE;
}

void qemu_ram_set_uf_zeroable(RAMBlock *rb)
{
    rb->flags |= RAM_UF_ZEROPAGE;
}

bool qemu_ram_is_migratable(RAMBlock *rb)
{
    return rb->flags & RAM_MIGRATABLE;
}

void qemu_ram_set_migratable(RAMBlock *rb)
{
    rb->flags |= RAM_MIGRATABLE;
}

void qemu_ram_unset_migratable(RAMBlock *rb)
{
    rb->flags &= ~RAM_MIGRATABLE;
}

/* Called with iothread lock held.  */
void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
{
    RAMBlock *block;

    assert(new_block);
    assert(!new_block->idstr[0]);

    if (dev) {
        char *id = qdev_get_dev_path(dev);
        if (id) {
            snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
            g_free(id);
        }
    }
    pstrcat(new_block->idstr, sizeof(new_block->idstr), name);

    rcu_read_lock();
    RAMBLOCK_FOREACH(block) {
        if (block != new_block &&
            !strcmp(block->idstr, new_block->idstr)) {
            fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
                    new_block->idstr);
            abort();
        }
    }
    rcu_read_unlock();
}

/* Called with iothread lock held.  */
void qemu_ram_unset_idstr(RAMBlock *block)
{
    /* FIXME: arch_init.c assumes that this is not called throughout
     * migration.  Ignore the problem since hot-unplug during migration
     * does not work anyway.
     */
    if (block) {
        memset(block->idstr, 0, sizeof(block->idstr));
    }
}

size_t qemu_ram_pagesize(RAMBlock *rb)
{
    return rb->page_size;
}

/* Returns the largest size of page in use */
size_t qemu_ram_pagesize_largest(void)
{
    RAMBlock *block;
    size_t largest = 0;

    RAMBLOCK_FOREACH(block) {
        largest = MAX(largest, qemu_ram_pagesize(block));
    }

    return largest;
}

static int memory_try_enable_merging(void *addr, size_t len)
{
    if (!machine_mem_merge(current_machine)) {
        /* disabled by the user */
        return 0;
    }

    return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
}

/* Only legal before guest might have detected the memory size: e.g. on
 * incoming migration, or right after reset.
 *
 * As memory core doesn't know how is memory accessed, it is up to
 * resize callback to update device state and/or add assertions to detect
 * misuse, if necessary.
 */
int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
{
    assert(block);

    newsize = HOST_PAGE_ALIGN(newsize);

    if (block->used_length == newsize) {
        return 0;
    }

    if (!(block->flags & RAM_RESIZEABLE)) {
        error_setg_errno(errp, EINVAL,
                         "Length mismatch: %s: 0x" RAM_ADDR_FMT
                         " in != 0x" RAM_ADDR_FMT, block->idstr,
                         newsize, block->used_length);
        return -EINVAL;
    }

    if (block->max_length < newsize) {
        error_setg_errno(errp, EINVAL,
                         "Length too large: %s: 0x" RAM_ADDR_FMT
                         " > 0x" RAM_ADDR_FMT, block->idstr,
                         newsize, block->max_length);
        return -EINVAL;
    }

    cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
    block->used_length = newsize;
2176 2177
    cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
                                        DIRTY_CLIENTS_ALL);
2178 2179 2180 2181 2182 2183 2184
    memory_region_set_size(block->mr, newsize);
    if (block->resized) {
        block->resized(block->idstr, newsize, block->host);
    }
    return 0;
}
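/*
 * Usage sketch (illustrative): only blocks created resizeable can grow, e.g.
 *
 *     Error *err = NULL;
 *     if (qemu_ram_resize(block, new_size, &err) < 0) {
 *         error_report_err(err);
 *     }
 *
 * new_size is host-page aligned above, so used_length may round up.
 */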

/* Called with ram_list.mutex held */
static void dirty_memory_extend(ram_addr_t old_ram_size,
                                ram_addr_t new_ram_size)
{
    ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
                                             DIRTY_MEMORY_BLOCK_SIZE);
    ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
                                             DIRTY_MEMORY_BLOCK_SIZE);
    int i;

    /* Only need to extend if block count increased */
    if (new_num_blocks <= old_num_blocks) {
        return;
    }

    for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
        DirtyMemoryBlocks *old_blocks;
        DirtyMemoryBlocks *new_blocks;
        int j;

        old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
        new_blocks = g_malloc(sizeof(*new_blocks) +
                              sizeof(new_blocks->blocks[0]) * new_num_blocks);

        if (old_num_blocks) {
            memcpy(new_blocks->blocks, old_blocks->blocks,
                   old_num_blocks * sizeof(old_blocks->blocks[0]));
        }

        for (j = old_num_blocks; j < new_num_blocks; j++) {
            new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
        }

        atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);

        if (old_blocks) {
            g_free_rcu(old_blocks, rcu);
        }
    }
}
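/*
 * Note (illustrative): the per-client block arrays are replaced wholesale and
 * the old copies reclaimed with g_free_rcu(), so readers that fetched the
 * pointer with atomic_rcu_read() inside an RCU critical section keep a
 * consistent view while the extension happens.
 */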

static void ram_block_add(RAMBlock *new_block, Error **errp, bool shared)
{
    RAMBlock *block;
    RAMBlock *last_block = NULL;
    ram_addr_t old_ram_size, new_ram_size;
    Error *err = NULL;

    old_ram_size = last_ram_page();

    qemu_mutex_lock_ramlist();
    new_block->offset = find_ram_offset(new_block->max_length);

    if (!new_block->host) {
        if (xen_enabled()) {
            xen_ram_alloc(new_block->offset, new_block->max_length,
                          new_block->mr, &err);
            if (err) {
                error_propagate(errp, err);
                qemu_mutex_unlock_ramlist();
                return;
            }
        } else {
            new_block->host = phys_mem_alloc(new_block->max_length,
                                             &new_block->mr->align, shared);
            if (!new_block->host) {
                error_setg_errno(errp, errno,
                                 "cannot set up guest memory '%s'",
                                 memory_region_name(new_block->mr));
                qemu_mutex_unlock_ramlist();
                return;
            }
            memory_try_enable_merging(new_block->host, new_block->max_length);
        }
    }

    new_ram_size = MAX(old_ram_size,
              (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
    if (new_ram_size > old_ram_size) {
        dirty_memory_extend(old_ram_size, new_ram_size);
    }
    /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
     * QLIST (which has an RCU-friendly variant) does not have insertion at
     * tail, so save the last element in last_block.
     */
    RAMBLOCK_FOREACH(block) {
        last_block = block;
        if (block->max_length < new_block->max_length) {
            break;
        }
    }
    if (block) {
        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
    } else if (last_block) {
        QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
    } else { /* list is empty */
        QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
    }
    ram_list.mru_block = NULL;

    /* Write list before version */
    smp_wmb();
    ram_list.version++;
    qemu_mutex_unlock_ramlist();

    cpu_physical_memory_set_dirty_range(new_block->offset,
                                        new_block->used_length,
                                        DIRTY_CLIENTS_ALL);

    if (new_block->host) {
        qemu_ram_setup_dump(new_block->host, new_block->max_length);
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
        /* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
        ram_block_notify_add(new_block->host, new_block->max_length);
    }
}
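/*
 * Note (assumption about intent, not stated here): keeping the list ordered
 * biggest-first means the linear searches over ram_list.blocks tend to hit
 * the large, hot blocks early; the smp_wmb() publishes the updated list
 * before ram_list.version is bumped for readers that poll the version
 * counter.
 */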

#ifdef CONFIG_POSIX
RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
                                 uint32_t ram_flags, int fd,
                                 Error **errp)
{
    RAMBlock *new_block;
    Error *local_err = NULL;
    int64_t file_size;

    /* Just support these ram flags by now. */
    assert((ram_flags & ~(RAM_SHARED | RAM_PMEM)) == 0);

    if (xen_enabled()) {
        error_setg(errp, "-mem-path not supported with Xen");
        return NULL;
    }

    if (kvm_enabled() && !kvm_has_sync_mmu()) {
        error_setg(errp,
                   "host lacks kvm mmu notifiers, -mem-path unsupported");
        return NULL;
    }

    if (phys_mem_alloc != qemu_anon_ram_alloc) {
        /*
         * file_ram_alloc() needs to allocate just like
         * phys_mem_alloc, but we haven't bothered to provide
         * a hook there.
         */
        error_setg(errp,
                   "-mem-path not supported with this accelerator");
        return NULL;
    }

    size = HOST_PAGE_ALIGN(size);
    file_size = get_file_size(fd);
    if (file_size > 0 && file_size < size) {
        error_setg(errp, "backing store %s size 0x%" PRIx64
                   " does not match 'size' option 0x" RAM_ADDR_FMT,
                   mem_path, file_size, size);
        return NULL;
    }

    new_block = g_malloc0(sizeof(*new_block));
    new_block->mr = mr;
    new_block->used_length = size;
    new_block->max_length = size;
    new_block->flags = ram_flags;
    new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
    if (!new_block->host) {
        g_free(new_block);
        return NULL;
    }

    ram_block_add(new_block, &local_err, ram_flags & RAM_SHARED);
    if (local_err) {
        g_free(new_block);
        error_propagate(errp, local_err);
        return NULL;
    }
    return new_block;

}


RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
                                   uint32_t ram_flags, const char *mem_path,
                                   Error **errp)
{
    int fd;
    bool created;
    RAMBlock *block;

    fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp);
    if (fd < 0) {
        return NULL;
    }

    block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, errp);
    if (!block) {
        if (created) {
            unlink(mem_path);
        }
        close(fd);
        return NULL;
    }

    return block;
2391
}
2392
#endif
2393

2394
static
2395 2396 2397 2398
RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
                                  void (*resized)(const char*,
                                                  uint64_t length,
                                                  void *host),
2399
                                  void *host, bool resizeable, bool share,
2400
                                  MemoryRegion *mr, Error **errp)
2401 2402
{
    RAMBlock *new_block;
2403
    Error *local_err = NULL;
2404

2405 2406
    size = HOST_PAGE_ALIGN(size);
    max_size = HOST_PAGE_ALIGN(max_size);
2407 2408
    new_block = g_malloc0(sizeof(*new_block));
    new_block->mr = mr;
2409
    new_block->resized = resized;
2410 2411
    new_block->used_length = size;
    new_block->max_length = max_size;
2412
    assert(max_size >= size);
2413
    new_block->fd = -1;
2414
    new_block->page_size = getpagesize();
2415 2416
    new_block->host = host;
    if (host) {
2417
        new_block->flags |= RAM_PREALLOC;
2418
    }
2419 2420 2421
    if (resizeable) {
        new_block->flags |= RAM_RESIZEABLE;
    }
2422
    ram_block_add(new_block, &local_err, share);
2423 2424 2425
    if (local_err) {
        g_free(new_block);
        error_propagate(errp, local_err);
2426
        return NULL;
2427
    }
2428
    return new_block;
2429 2430
}

2431
RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
2432 2433
                                   MemoryRegion *mr, Error **errp)
{
2434 2435
    return qemu_ram_alloc_internal(size, size, NULL, host, false,
                                   false, mr, errp);
2436 2437
}

2438 2439
RAMBlock *qemu_ram_alloc(ram_addr_t size, bool share,
                         MemoryRegion *mr, Error **errp)
2440
{
2441 2442
    return qemu_ram_alloc_internal(size, size, NULL, NULL, false,
                                   share, mr, errp);
2443 2444
}

2445
RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
2446 2447 2448 2449 2450
                                     void (*resized)(const char*,
                                                     uint64_t length,
                                                     void *host),
                                     MemoryRegion *mr, Error **errp)
{
2451 2452
    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true,
                                   false, mr, errp);
2453 2454
}

P
{
    if (block->flags & RAM_PREALLOC) {
        ;
    } else if (xen_enabled()) {
        xen_invalidate_map_cache_entry(block->host);
#ifndef _WIN32
    } else if (block->fd >= 0) {
2463
        qemu_ram_munmap(block->fd, block->host, block->max_length);
P
#endif
    } else {
        qemu_anon_ram_free(block->host, block->max_length);
    }
    g_free(block);
}

void qemu_ram_free(RAMBlock *block)
{
    if (!block) {
        return;
    }

    if (block->host) {
        ram_block_notify_remove(block->host, block->max_length);
    }

    qemu_mutex_lock_ramlist();
    QLIST_REMOVE_RCU(block, next);
    ram_list.mru_block = NULL;
    /* Write list before version */
    smp_wmb();
    ram_list.version++;
    call_rcu(block, reclaim_ramblock, rcu);
    qemu_mutex_unlock_ramlist();
}

#ifndef _WIN32
void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
{
    RAMBlock *block;
    ram_addr_t offset;
    int flags;
    void *area, *vaddr;

P
Peter Xu 已提交
2500
    RAMBLOCK_FOREACH(block) {
H
Huang Ying 已提交
2501
        offset = addr - block->offset;
2502
        if (offset < block->max_length) {
2503
            vaddr = ramblock_ptr(block, offset);
2504
            if (block->flags & RAM_PREALLOC) {
H
Huang Ying 已提交
2505
                ;
2506 2507
            } else if (xen_enabled()) {
                abort();
H
Huang Ying 已提交
2508 2509
            } else {
                flags = MAP_FIXED;
2510
                if (block->fd >= 0) {
2511 2512
                    flags |= (block->flags & RAM_SHARED ?
                              MAP_SHARED : MAP_PRIVATE);
2513 2514
                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
                                flags, block->fd, offset);
H
Huang Ying 已提交
2515
                } else {
2516 2517 2518 2519 2520 2521 2522
                    /*
                     * Remap needs to match alloc.  Accelerators that
                     * set phys_mem_alloc never remap.  If they did,
                     * we'd need a remap hook here.
                     */
                    assert(phys_mem_alloc == qemu_anon_ram_alloc);

H
Huang Ying 已提交
2523 2524 2525 2526 2527
                    flags |= MAP_PRIVATE | MAP_ANONYMOUS;
                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
                                flags, -1, 0);
                }
                if (area != vaddr) {
2528 2529 2530
                    error_report("Could not remap addr: "
                                 RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
                                 length, addr);
H
Huang Ying 已提交
2531 2532
                    exit(1);
                }
2533
                memory_try_enable_merging(vaddr, length);
2534
                qemu_ram_setup_dump(vaddr, length);
H
Huang Ying 已提交
2535 2536 2537 2538 2539 2540
            }
        }
    }
}
#endif /* !_WIN32 */

2541
/* Return a host pointer to ram allocated with qemu_ram_alloc.
2542 2543 2544
 * This should not be used for general purpose DMA.  Use address_space_map
 * or address_space_rw instead. For local memory (e.g. video ram) that the
 * device owns, use memory_region_get_ram_ptr.
M
Mike Day 已提交
2545
 *
2546
 * Called within RCU critical section.
2547
 */
2548
void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
2549
{
2550 2551 2552 2553
    RAMBlock *block = ram_block;

    if (block == NULL) {
        block = qemu_get_ram_block(addr);
2554
        addr -= block->offset;
2555
    }
2556 2557

    if (xen_enabled() && block->host == NULL) {
2558 2559 2560 2561 2562
        /* We need to check if the requested address is in the RAM
         * because we don't want to map the entire memory in QEMU.
         * In that case just map until the end of the page.
         */
        if (block->offset == 0) {
2563
            return xen_map_cache(addr, 0, 0, false);
2564
        }
2565

2566
        block->host = xen_map_cache(block->offset, block->max_length, 1, false);
2567
    }
2568
    return ramblock_ptr(block, addr);
2569 2570
}

2571
/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
2572
 * but takes a size argument.
M
Mike Day 已提交
2573
 *
2574
 * Called within RCU critical section.
2575
 */
2576
static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
2577
                                 hwaddr *size, bool lock)
2578
{
2579
    RAMBlock *block = ram_block;
2580 2581 2582
    if (*size == 0) {
        return NULL;
    }
2583

2584 2585
    if (block == NULL) {
        block = qemu_get_ram_block(addr);
2586
        addr -= block->offset;
2587
    }
2588
    *size = MIN(*size, block->max_length - addr);
2589 2590 2591 2592 2593 2594 2595

    if (xen_enabled() && block->host == NULL) {
        /* We need to check if the requested address is in the RAM
         * because we don't want to map the entire memory in QEMU.
         * In that case just map the requested area.
         */
        if (block->offset == 0) {
2596
            return xen_map_cache(addr, *size, lock, lock);
2597 2598
        }

2599
        block->host = xen_map_cache(block->offset, block->max_length, 1, lock);
2600
    }
2601

2602
    return ramblock_ptr(block, addr);
2603 2604
}

2605 2606 2607 2608 2609 2610 2611 2612 2613 2614
/* Return the offset of a hostpointer within a ramblock */
ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host)
{
    ram_addr_t res = (uint8_t *)host - (uint8_t *)rb->host;
    assert((uintptr_t)host >= (uintptr_t)rb->host);
    assert(res < rb->max_length);

    return res;
}

D
Dr. David Alan Gilbert 已提交
2615 2616 2617 2618 2619 2620 2621 2622 2623 2624
/*
 * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
 * in that RAMBlock.
 *
 * ptr: Host pointer to look up
 * round_offset: If true round the result offset down to a page boundary
 * *ram_addr: set to result ram_addr
 * *offset: set to result offset within the RAMBlock
 *
 * Returns: RAMBlock (or NULL if not found)
2625 2626 2627 2628 2629 2630 2631
 *
 * By the time this function returns, the returned pointer is not protected
 * by RCU anymore.  If the caller is not within an RCU critical section and
 * does not hold the iothread lock, it must have other means of protecting the
 * pointer, such as a reference to the region that includes the incoming
 * ram_addr_t.
 */
D
Dr. David Alan Gilbert 已提交
2632 2633
RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
                                   ram_addr_t *offset)
P
pbrook 已提交
2634
{
P
pbrook 已提交
2635 2636 2637
    RAMBlock *block;
    uint8_t *host = ptr;

2638
    if (xen_enabled()) {
2639
        ram_addr_t ram_addr;
M
Mike Day 已提交
2640
        rcu_read_lock();
2641 2642
        ram_addr = xen_ram_addr_from_mapcache(ptr);
        block = qemu_get_ram_block(ram_addr);
D
Dr. David Alan Gilbert 已提交
2643
        if (block) {
2644
            *offset = ram_addr - block->offset;
D
Dr. David Alan Gilbert 已提交
2645
        }
M
Mike Day 已提交
2646
        rcu_read_unlock();
D
Dr. David Alan Gilbert 已提交
2647
        return block;
2648 2649
    }

M
Mike Day 已提交
2650 2651
    rcu_read_lock();
    block = atomic_rcu_read(&ram_list.mru_block);
2652
    if (block && block->host && host - block->host < block->max_length) {
2653 2654 2655
        goto found;
    }

P
Peter Xu 已提交
2656
    RAMBLOCK_FOREACH(block) {
J
Jun Nakajima 已提交
2657 2658 2659 2660
        /* This case append when the block is not mapped. */
        if (block->host == NULL) {
            continue;
        }
2661
        if (host - block->host < block->max_length) {
2662
            goto found;
A
Alex Williamson 已提交
2663
        }
P
pbrook 已提交
2664
    }
J
Jun Nakajima 已提交
2665

M
Mike Day 已提交
2666
    rcu_read_unlock();
2667
    return NULL;
2668 2669

found:
D
Dr. David Alan Gilbert 已提交
2670 2671 2672 2673
    *offset = (host - block->host);
    if (round_offset) {
        *offset &= TARGET_PAGE_MASK;
    }
M
Mike Day 已提交
2674
    rcu_read_unlock();
D
Dr. David Alan Gilbert 已提交
2675 2676 2677
    return block;
}

D
Dr. David Alan Gilbert 已提交
2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688
/*
 * Finds the named RAMBlock
 *
 * name: The name of RAMBlock to find
 *
 * Returns: RAMBlock (or NULL if not found)
 */
RAMBlock *qemu_ram_block_by_name(const char *name)
{
    RAMBlock *block;

P
Peter Xu 已提交
2689
    RAMBLOCK_FOREACH(block) {
D
Dr. David Alan Gilbert 已提交
2690 2691 2692 2693 2694 2695 2696 2697
        if (!strcmp(name, block->idstr)) {
            return block;
        }
    }

    return NULL;
}

D
Dr. David Alan Gilbert 已提交
2698 2699
/* Some of the softmmu routines need to translate from a host pointer
   (typically a TLB entry) back to a ram offset.  */
2700
ram_addr_t qemu_ram_addr_from_host(void *ptr)
D
Dr. David Alan Gilbert 已提交
2701 2702
{
    RAMBlock *block;
2703
    ram_addr_t offset;
D
Dr. David Alan Gilbert 已提交
2704

2705
    block = qemu_ram_block_from_host(ptr, false, &offset);
D
Dr. David Alan Gilbert 已提交
2706
    if (!block) {
2707
        return RAM_ADDR_INVALID;
D
Dr. David Alan Gilbert 已提交
2708 2709
    }

2710
    return block->offset + offset;
M
Marcelo Tosatti 已提交
2711
}
A
Alex Williamson 已提交
2712

2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723
/* Called within RCU critical section. */
void memory_notdirty_write_prepare(NotDirtyInfo *ndi,
                          CPUState *cpu,
                          vaddr mem_vaddr,
                          ram_addr_t ram_addr,
                          unsigned size)
{
    ndi->cpu = cpu;
    ndi->ram_addr = ram_addr;
    ndi->mem_vaddr = mem_vaddr;
    ndi->size = size;
E
Emilio G. Cota 已提交
2724
    ndi->pages = NULL;
2725

2726
    assert(tcg_enabled());
2727
    if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
E
Emilio G. Cota 已提交
2728 2729
        ndi->pages = page_collection_lock(ram_addr, ram_addr + size);
        tb_invalidate_phys_page_fast(ndi->pages, ram_addr, size);
2730
    }
2731 2732 2733 2734 2735
}

/* Called within RCU critical section. */
void memory_notdirty_write_complete(NotDirtyInfo *ndi)
{
E
Emilio G. Cota 已提交
2736
    if (ndi->pages) {
2737
        assert(tcg_enabled());
E
Emilio G. Cota 已提交
2738 2739
        page_collection_unlock(ndi->pages);
        ndi->pages = NULL;
2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762
    }

    /* Set both VGA and migration bits for simplicity and to remove
     * the notdirty callback faster.
     */
    cpu_physical_memory_set_dirty_range(ndi->ram_addr, ndi->size,
                                        DIRTY_CLIENTS_NOCODE);
    /* we remove the notdirty callback only if the code has been
       flushed */
    if (!cpu_physical_memory_is_clean(ndi->ram_addr)) {
        tlb_set_dirty(ndi->cpu, ndi->mem_vaddr);
    }
}

/* Called within RCU critical section.  */
static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
                               uint64_t val, unsigned size)
{
    NotDirtyInfo ndi;

    memory_notdirty_write_prepare(&ndi, current_cpu, current_cpu->mem_io_vaddr,
                         ram_addr, size);

2763
    stn_p(qemu_map_ram_ptr(NULL, ram_addr), size, val);
2764
    memory_notdirty_write_complete(&ndi);
2765 2766
}

2767
static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
2768 2769
                                 unsigned size, bool is_write,
                                 MemTxAttrs attrs)
2770 2771 2772 2773
{
    return is_write;
}

2774 2775
static const MemoryRegionOps notdirty_mem_ops = {
    .write = notdirty_mem_write,
2776
    .valid.accepts = notdirty_mem_accepts,
2777
    .endianness = DEVICE_NATIVE_ENDIAN,
2778 2779 2780 2781 2782 2783 2784 2785 2786 2787
    .valid = {
        .min_access_size = 1,
        .max_access_size = 8,
        .unaligned = false,
    },
    .impl = {
        .min_access_size = 1,
        .max_access_size = 8,
        .unaligned = false,
    },
2788 2789
};

P
pbrook 已提交
2790
/* Generate a debug exception if a watchpoint has been hit.  */
2791 2792
void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
                          MemTxAttrs attrs, int flags, uintptr_t ra)
P
pbrook 已提交
2793
{
2794
    CPUClass *cc = CPU_GET_CLASS(cpu);
2795
    CPUWatchpoint *wp;
P
pbrook 已提交
2796

2797
    assert(tcg_enabled());
2798
    if (cpu->watchpoint_hit) {
2799 2800 2801 2802 2803 2804
        /*
         * We re-entered the check after replacing the TB.
         * Now raise the debug interrupt so that it will
         * trigger after the current instruction.
         */
        qemu_mutex_lock_iothread();
2805
        cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
2806
        qemu_mutex_unlock_iothread();
2807 2808
        return;
    }
2809 2810

    addr = cc->adjust_watchpoint_address(cpu, addr, len);
2811
    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
2812
        if (watchpoint_address_matches(wp, addr, len)
2813
            && (wp->flags & flags)) {
2814 2815 2816 2817 2818
            if (flags == BP_MEM_READ) {
                wp->flags |= BP_WATCHPOINT_HIT_READ;
            } else {
                wp->flags |= BP_WATCHPOINT_HIT_WRITE;
            }
2819
            wp->hitaddr = MAX(addr, wp->vaddr);
2820
            wp->hitattrs = attrs;
2821
            if (!cpu->watchpoint_hit) {
2822 2823 2824 2825 2826
                if (wp->flags & BP_CPU &&
                    !cc->debug_check_watchpoint(cpu, wp)) {
                    wp->flags &= ~BP_WATCHPOINT_HIT;
                    continue;
                }
2827
                cpu->watchpoint_hit = wp;
2828

E
Emilio G. Cota 已提交
2829
                mmap_lock();
2830
                tb_check_watchpoint(cpu);
2831
                if (wp->flags & BP_STOP_BEFORE_ACCESS) {
2832
                    cpu->exception_index = EXCP_DEBUG;
E
Emilio G. Cota 已提交
2833
                    mmap_unlock();
2834
                    cpu_loop_exit_restore(cpu, ra);
2835
                } else {
2836 2837
                    /* Force execution of one insn next time.  */
                    cpu->cflags_next_tb = 1 | curr_cflags();
E
Emilio G. Cota 已提交
2838
                    mmap_unlock();
2839 2840 2841
                    if (ra) {
                        cpu_restore_state(cpu, ra, true);
                    }
2842
                    cpu_loop_exit_noexc(cpu);
2843
                }
2844
            }
2845 2846
        } else {
            wp->flags &= ~BP_WATCHPOINT_HIT;
P
pbrook 已提交
2847 2848 2849 2850
        }
    }
}

2851
static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
2852
                                 MemTxAttrs attrs, uint8_t *buf, hwaddr len);
2853
static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
2854 2855
                                  const uint8_t *buf, hwaddr len);
static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
2856
                                  bool is_write, MemTxAttrs attrs);
2857

2858 2859
static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
                                unsigned len, MemTxAttrs attrs)
2860
{
2861
    subpage_t *subpage = opaque;
2862
    uint8_t buf[8];
2863
    MemTxResult res;
2864

2865
#if defined(DEBUG_SUBPAGE)
A
Amos Kong 已提交
2866
    printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
2867
           subpage, len, addr);
2868
#endif
2869
    res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len);
2870 2871
    if (res) {
        return res;
2872
    }
2873 2874
    *data = ldn_p(buf, len);
    return MEMTX_OK;
2875 2876
}

2877 2878
static MemTxResult subpage_write(void *opaque, hwaddr addr,
                                 uint64_t value, unsigned len, MemTxAttrs attrs)
2879
{
2880
    subpage_t *subpage = opaque;
2881
    uint8_t buf[8];
2882

2883
#if defined(DEBUG_SUBPAGE)
A
Amos Kong 已提交
2884
    printf("%s: subpage %p len %u addr " TARGET_FMT_plx
2885 2886
           " value %"PRIx64"\n",
           __func__, subpage, len, addr, value);
2887
#endif
2888
    stn_p(buf, len, value);
2889
    return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len);
2890 2891
}

2892
static bool subpage_accepts(void *opaque, hwaddr addr,
2893 2894
                            unsigned len, bool is_write,
                            MemTxAttrs attrs)
2895
{
2896
    subpage_t *subpage = opaque;
2897
#if defined(DEBUG_SUBPAGE)
A
Amos Kong 已提交
2898
    printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
2899
           __func__, subpage, is_write ? 'w' : 'r', len, addr);
2900 2901
#endif

2902
    return flatview_access_valid(subpage->fv, addr + subpage->base,
2903
                                 len, is_write, attrs);
2904 2905
}

2906
static const MemoryRegionOps subpage_ops = {
2907 2908
    .read_with_attrs = subpage_read,
    .write_with_attrs = subpage_write,
2909 2910 2911 2912
    .impl.min_access_size = 1,
    .impl.max_access_size = 8,
    .valid.min_access_size = 1,
    .valid.max_access_size = 8,
2913
    .valid.accepts = subpage_accepts,
2914
    .endianness = DEVICE_NATIVE_ENDIAN,
2915 2916
};

2917 2918
static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
                            uint16_t section)
2919 2920 2921 2922 2923 2924 2925 2926
{
    int idx, eidx;

    if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
        return -1;
    idx = SUBPAGE_IDX(start);
    eidx = SUBPAGE_IDX(end);
#if defined(DEBUG_SUBPAGE)
A
Amos Kong 已提交
2927 2928
    printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
           __func__, mmio, start, end, idx, eidx, section);
2929 2930
#endif
    for (; idx <= eidx; idx++) {
2931
        mmio->sub_section[idx] = section;
2932 2933 2934 2935 2936
    }

    return 0;
}

2937
static subpage_t *subpage_init(FlatView *fv, hwaddr base)
2938
{
A
Anthony Liguori 已提交
2939
    subpage_t *mmio;
2940

2941
    /* mmio->sub_section is set to PHYS_SECTION_UNASSIGNED with g_malloc0 */
2942
    mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
2943
    mmio->fv = fv;
2944
    mmio->base = base;
2945
    memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
P
Peter Crosthwaite 已提交
2946
                          NULL, TARGET_PAGE_SIZE);
A
Avi Kivity 已提交
2947
    mmio->iomem.subpage = true;
2948
#if defined(DEBUG_SUBPAGE)
A
Amos Kong 已提交
2949 2950
    printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
           mmio, base, TARGET_PAGE_SIZE);
2951 2952 2953 2954 2955
#endif

    return mmio;
}

2956
static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr)
2957
{
2958
    assert(fv);
2959
    MemoryRegionSection section = {
2960
        .fv = fv,
2961 2962 2963
        .mr = mr,
        .offset_within_address_space = 0,
        .offset_within_region = 0,
2964
        .size = int128_2_64(),
2965 2966
    };

2967
    return phys_section_add(map, &section);
2968 2969
}

2970 2971 2972 2973 2974 2975 2976
static void readonly_mem_write(void *opaque, hwaddr addr,
                               uint64_t val, unsigned size)
{
    /* Ignore any write to ROM. */
}

static bool readonly_mem_accepts(void *opaque, hwaddr addr,
2977 2978
                                 unsigned size, bool is_write,
                                 MemTxAttrs attrs)
2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001
{
    return is_write;
}

/* This will only be used for writes, because reads are special cased
 * to directly access the underlying host ram.
 */
static const MemoryRegionOps readonly_mem_ops = {
    .write = readonly_mem_write,
    .valid.accepts = readonly_mem_accepts,
    .endianness = DEVICE_NATIVE_ENDIAN,
    .valid = {
        .min_access_size = 1,
        .max_access_size = 8,
        .unaligned = false,
    },
    .impl = {
        .min_access_size = 1,
        .max_access_size = 8,
        .unaligned = false,
    },
};

3002 3003
MemoryRegionSection *iotlb_to_section(CPUState *cpu,
                                      hwaddr index, MemTxAttrs attrs)
3004
{
3005 3006
    int asidx = cpu_asidx_from_attrs(cpu, attrs);
    CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
3007
    AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
3008
    MemoryRegionSection *sections = d->map.sections;
P
Paolo Bonzini 已提交
3009

3010
    return &sections[index & ~TARGET_PAGE_MASK];
3011 3012
}

A
Avi Kivity 已提交
3013 3014
static void io_mem_init(void)
{
3015 3016
    memory_region_init_io(&io_mem_rom, NULL, &readonly_mem_ops,
                          NULL, NULL, UINT64_MAX);
3017
    memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
3018
                          NULL, UINT64_MAX);
3019 3020 3021 3022

    /* io_mem_notdirty calls tb_invalidate_phys_page_fast,
     * which can be called without the iothread mutex.
     */
3023
    memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
3024
                          NULL, UINT64_MAX);
3025
    memory_region_clear_global_locking(&io_mem_notdirty);
A
Avi Kivity 已提交
3026 3027
}

3028
AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
3029
{
3030 3031 3032
    AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
    uint16_t n;

3033
    n = dummy_section(&d->map, fv, &io_mem_unassigned);
3034
    assert(n == PHYS_SECTION_UNASSIGNED);
3035
    n = dummy_section(&d->map, fv, &io_mem_notdirty);
3036
    assert(n == PHYS_SECTION_NOTDIRTY);
3037
    n = dummy_section(&d->map, fv, &io_mem_rom);
3038
    assert(n == PHYS_SECTION_ROM);
3039

M
Michael S. Tsirkin 已提交
3040
    d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
3041 3042

    return d;
3043 3044
}

3045
void address_space_dispatch_free(AddressSpaceDispatch *d)
3046 3047 3048 3049 3050
{
    phys_sections_free(&d->map);
    g_free(d);
}

3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079
static void do_nothing(CPUState *cpu, run_on_cpu_data d)
{
}

static void tcg_log_global_after_sync(MemoryListener *listener)
{
    CPUAddressSpace *cpuas;

    /* Wait for the CPU to end the current TB.  This avoids the following
     * incorrect race:
     *
     *      vCPU                         migration
     *      ----------------------       -------------------------
     *      TLB check -> slow path
     *        notdirty_mem_write
     *          write to RAM
     *          mark dirty
     *                                   clear dirty flag
     *      TLB check -> fast path
     *                                   read memory
     *        write to RAM
     *
     * by pushing the migration thread's memory read after the vCPU thread has
     * written the memory.
     */
    cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
    run_on_cpu(cpuas->cpu, do_nothing, RUN_ON_CPU_NULL);
}

3080
static void tcg_commit(MemoryListener *listener)
3081
{
3082 3083
    CPUAddressSpace *cpuas;
    AddressSpaceDispatch *d;
3084

3085
    assert(tcg_enabled());
3086 3087
    /* since each CPU stores ram addresses in its TLB cache, we must
       reset the modified entries */
3088 3089 3090 3091 3092 3093
    cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
    cpu_reloading_memory_map();
    /* The CPU and TLB are protected by the iothread lock.
     * We reload the dispatch pointer now because cpu_reloading_memory_map()
     * may have split the RCU critical section.
     */
3094
    d = address_space_to_dispatch(cpuas->as);
3095
    atomic_rcu_set(&cpuas->memory_dispatch, d);
3096
    tlb_flush(cpuas->cpu);
3097 3098
}

A
Avi Kivity 已提交
3099 3100
static void memory_map_init(void)
{
3101
    system_memory = g_malloc(sizeof(*system_memory));
3102

3103
    memory_region_init(system_memory, NULL, "system", UINT64_MAX);
3104
    address_space_init(&address_space_memory, system_memory, "memory");
3105

3106
    system_io = g_malloc(sizeof(*system_io));
3107 3108
    memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
                          65536);
3109
    address_space_init(&address_space_io, system_io, "I/O");
A
Avi Kivity 已提交
3110 3111 3112 3113 3114 3115 3116
}

MemoryRegion *get_system_memory(void)
{
    return system_memory;
}

3117 3118 3119 3120 3121
MemoryRegion *get_system_io(void)
{
    return system_io;
}

3122 3123
#endif /* !defined(CONFIG_USER_ONLY) */

B
bellard 已提交
3124 3125
/* physical memory access (slow version, mainly for debug) */
#if defined(CONFIG_USER_ONLY)
int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
                        uint8_t *buf, target_ulong len, int is_write)
{
    int flags;
    target_ulong l, page;
    void *p;

    while (len > 0) {
        page = addr & TARGET_PAGE_MASK;
        l = (page + TARGET_PAGE_SIZE) - addr;
        if (l > len)
            l = len;
        flags = page_get_flags(page);
        if (!(flags & PAGE_VALID))
            return -1;
        if (is_write) {
            if (!(flags & PAGE_WRITE))
                return -1;
            /* XXX: this code should not depend on lock_user */
            if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
                return -1;
            memcpy(p, buf, l);
            unlock_user(p, addr, l);
        } else {
            if (!(flags & PAGE_READ))
                return -1;
            /* XXX: this code should not depend on lock_user */
            if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
                return -1;
            memcpy(buf, p, l);
            unlock_user(p, addr, 0);
        }
        len -= l;
        buf += l;
        addr += l;
    }
    return 0;
}

#else

static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
                                     hwaddr length)
{
    uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
    addr += memory_region_get_ram_addr(mr);

    /* No early return if dirty_log_mask is or becomes 0, because
     * cpu_physical_memory_set_dirty_range will still call
     * xen_modified_memory.
     */
    if (dirty_log_mask) {
        dirty_log_mask =
            cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
    }
    if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
        assert(tcg_enabled());
        tb_invalidate_phys_range(addr, addr + length);
        dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
    }
    cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
}

void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size)
{
    /*
     * In principle this function would work on other memory region types too,
     * but the ROM device use case is the only one where this operation is
     * necessary.  Other memory regions should use the
     * address_space_read/write() APIs.
     */
    assert(memory_region_is_romd(mr));

    invalidate_and_set_dirty(mr, addr, size);
}

static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
{
    unsigned access_size_max = mr->ops->valid.max_access_size;

    /* Regions are assumed to support 1-4 byte accesses unless
       otherwise specified.  */
    if (access_size_max == 0) {
        access_size_max = 4;
    }

    /* Bound the maximum access by the alignment of the address.  */
    if (!mr->ops->impl.unaligned) {
        unsigned align_size_max = addr & -addr;
        if (align_size_max != 0 && align_size_max < access_size_max) {
            access_size_max = align_size_max;
        }
    }

    /* Don't attempt accesses larger than the maximum.  */
    if (l > access_size_max) {
        l = access_size_max;
    }
    l = pow2floor(l);

    return l;
}
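/*
 * Illustrative sketch (not called anywhere): how the clamping above plays
 * out for a hypothetical region whose ops advertise max_access_size = 8
 * and which does not allow unaligned accesses.  An 8-byte request at an
 * address with only 4-byte alignment is first bounded by the alignment,
 * then rounded down to a power of two:
 *
 *     l = 8, addr = 0x...4
 *     access_size_max = 8          (from mr->ops->valid.max_access_size)
 *     align_size_max  = 4          (addr & -addr)
 *     => access_size_max = 4, l = pow2floor(4) = 4
 *
 * so the caller ends up issuing two 4-byte accesses instead of a single
 * unaligned 8-byte access.
 */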

static bool prepare_mmio_access(MemoryRegion *mr)
{
    bool unlocked = !qemu_mutex_iothread_locked();
    bool release_lock = false;

    if (unlocked && mr->global_locking) {
        qemu_mutex_lock_iothread();
        unlocked = false;
        release_lock = true;
    }
    if (mr->flush_coalesced_mmio) {
        if (unlocked) {
            qemu_mutex_lock_iothread();
        }
        qemu_flush_coalesced_mmio_buffer();
        if (unlocked) {
            qemu_mutex_unlock_iothread();
        }
    }

    return release_lock;
}

/* Called within RCU critical section.  */
static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
                                           MemTxAttrs attrs,
                                           const uint8_t *buf,
                                           hwaddr len, hwaddr addr1,
                                           hwaddr l, MemoryRegion *mr)
{
    uint8_t *ptr;
    uint64_t val;
    MemTxResult result = MEMTX_OK;
    bool release_lock = false;

    for (;;) {
        if (!memory_access_is_direct(mr, true)) {
            release_lock |= prepare_mmio_access(mr);
            l = memory_access_size(mr, l, addr1);
            /* XXX: could force current_cpu to NULL to avoid
               potential bugs */
            val = ldn_he_p(buf, l);
            result |= memory_region_dispatch_write(mr, addr1, val,
                                                   size_memop(l), attrs);
        } else {
            /* RAM case */
            ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
            memcpy(ptr, buf, l);
            invalidate_and_set_dirty(mr, addr1, l);
        }

        if (release_lock) {
            qemu_mutex_unlock_iothread();
            release_lock = false;
        }

        len -= l;
        buf += l;
        addr += l;

        if (!len) {
            break;
        }

        l = len;
        mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
    }

    return result;
}

/* Called from RCU critical section.  */
static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
                                  const uint8_t *buf, hwaddr len)
{
    hwaddr l;
    hwaddr addr1;
    MemoryRegion *mr;
    MemTxResult result = MEMTX_OK;

    l = len;
    mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
    result = flatview_write_continue(fv, addr, attrs, buf, len,
                                     addr1, l, mr);

    return result;
}

/* Called within RCU critical section.  */
MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
                                   MemTxAttrs attrs, uint8_t *buf,
                                   hwaddr len, hwaddr addr1, hwaddr l,
                                   MemoryRegion *mr)
{
    uint8_t *ptr;
    uint64_t val;
    MemTxResult result = MEMTX_OK;
    bool release_lock = false;

    for (;;) {
        if (!memory_access_is_direct(mr, false)) {
            /* I/O case */
            release_lock |= prepare_mmio_access(mr);
            l = memory_access_size(mr, l, addr1);
            result |= memory_region_dispatch_read(mr, addr1, &val,
                                                  size_memop(l), attrs);
            stn_he_p(buf, l, val);
        } else {
            /* RAM case */
            ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
            memcpy(buf, ptr, l);
        }

        if (release_lock) {
            qemu_mutex_unlock_iothread();
            release_lock = false;
        }

        len -= l;
        buf += l;
        addr += l;

        if (!len) {
            break;
        }

        l = len;
        mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
    }

    return result;
}

/* Called from RCU critical section.  */
static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
                                 MemTxAttrs attrs, uint8_t *buf, hwaddr len)
{
    hwaddr l;
    hwaddr addr1;
    MemoryRegion *mr;

    l = len;
    mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
    return flatview_read_continue(fv, addr, attrs, buf, len,
                                  addr1, l, mr);
}

MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
                                    MemTxAttrs attrs, uint8_t *buf, hwaddr len)
{
    MemTxResult result = MEMTX_OK;
    FlatView *fv;

    if (len > 0) {
        rcu_read_lock();
        fv = address_space_to_flatview(as);
        result = flatview_read(fv, addr, attrs, buf, len);
        rcu_read_unlock();
    }

    return result;
}

MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
                                MemTxAttrs attrs,
                                const uint8_t *buf, hwaddr len)
{
    MemTxResult result = MEMTX_OK;
    FlatView *fv;

    if (len > 0) {
        rcu_read_lock();
        fv = address_space_to_flatview(as);
        result = flatview_write(fv, addr, attrs, buf, len);
        rcu_read_unlock();
    }

    return result;
}

MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
                             uint8_t *buf, hwaddr len, bool is_write)
{
    if (is_write) {
        return address_space_write(as, addr, attrs, buf, len);
    } else {
        return address_space_read_full(as, addr, attrs, buf, len);
    }
}
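/*
 * Usage sketch (illustrative only, not part of this file's API surface):
 * a device model that wants to transfer a buffer to guest RAM would
 * typically go through one of the helpers above rather than touching RAM
 * pointers directly, e.g.:
 *
 *     uint8_t data[4] = { 0x12, 0x34, 0x56, 0x78 };
 *     MemTxResult r = address_space_write(&address_space_memory, 0x1000,
 *                                         MEMTXATTRS_UNSPECIFIED,
 *                                         data, sizeof(data));
 *     if (r != MEMTX_OK) {
 *         ... the write hit unassigned or faulting memory ...
 *     }
 *
 * The guest physical address 0x1000 is an arbitrary example value.
 */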

void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
                            hwaddr len, int is_write)
{
    address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
                     buf, len, is_write);
}

enum write_rom_type {
    WRITE_DATA,
    FLUSH_CACHE,
};

static inline MemTxResult address_space_write_rom_internal(AddressSpace *as,
                                                           hwaddr addr,
                                                           MemTxAttrs attrs,
                                                           const uint8_t *buf,
                                                           hwaddr len,
                                                           enum write_rom_type type)
{
    hwaddr l;
    uint8_t *ptr;
    hwaddr addr1;
    MemoryRegion *mr;

    rcu_read_lock();
    while (len > 0) {
        l = len;
        mr = address_space_translate(as, addr, &addr1, &l, true, attrs);

        if (!(memory_region_is_ram(mr) ||
              memory_region_is_romd(mr))) {
            l = memory_access_size(mr, l, addr1);
        } else {
            /* ROM/RAM case */
            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
            switch (type) {
            case WRITE_DATA:
                memcpy(ptr, buf, l);
                invalidate_and_set_dirty(mr, addr1, l);
                break;
            case FLUSH_CACHE:
                flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
                break;
            }
        }
        len -= l;
        buf += l;
        addr += l;
    }
    rcu_read_unlock();
    return MEMTX_OK;
}

/* used for ROM loading : can write in RAM and ROM */
MemTxResult address_space_write_rom(AddressSpace *as, hwaddr addr,
                                    MemTxAttrs attrs,
                                    const uint8_t *buf, hwaddr len)
{
    return address_space_write_rom_internal(as, addr, attrs,
                                            buf, len, WRITE_DATA);
}
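/*
 * Usage sketch (illustrative): ROM contents are populated with
 * address_space_write_rom() so that the write bypasses the read-only
 * handling that address_space_write() would apply, e.g. when a board
 * model copies a firmware blob into a ROM region:
 *
 *     address_space_write_rom(&address_space_memory, rom_base,
 *                             MEMTXATTRS_UNSPECIFIED, blob, blob_size);
 *
 * rom_base, blob and blob_size are placeholder names for this example.
 */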

void cpu_flush_icache_range(hwaddr start, hwaddr len)
{
    /*
     * This function should do the same thing as an icache flush that was
     * triggered from within the guest. For TCG we are always cache coherent,
     * so there is no need to flush anything. For KVM / Xen we need to flush
     * the host's instruction cache at least.
     */
    if (tcg_enabled()) {
        return;
    }

    address_space_write_rom_internal(&address_space_memory,
                                     start, MEMTXATTRS_UNSPECIFIED,
                                     NULL, len, FLUSH_CACHE);
}

typedef struct {
    MemoryRegion *mr;
    void *buffer;
    hwaddr addr;
    hwaddr len;
    bool in_use;
} BounceBuffer;

static BounceBuffer bounce;

typedef struct MapClient {
    QEMUBH *bh;
    QLIST_ENTRY(MapClient) link;
} MapClient;

QemuMutex map_client_list_lock;
static QLIST_HEAD(, MapClient) map_client_list
    = QLIST_HEAD_INITIALIZER(map_client_list);

static void cpu_unregister_map_client_do(MapClient *client)
{
    QLIST_REMOVE(client, link);
    g_free(client);
}

static void cpu_notify_map_clients_locked(void)
{
    MapClient *client;

    while (!QLIST_EMPTY(&map_client_list)) {
        client = QLIST_FIRST(&map_client_list);
        qemu_bh_schedule(client->bh);
        cpu_unregister_map_client_do(client);
    }
}

void cpu_register_map_client(QEMUBH *bh)
{
    MapClient *client = g_malloc(sizeof(*client));

    qemu_mutex_lock(&map_client_list_lock);
    client->bh = bh;
    QLIST_INSERT_HEAD(&map_client_list, client, link);
    if (!atomic_read(&bounce.in_use)) {
        cpu_notify_map_clients_locked();
    }
    qemu_mutex_unlock(&map_client_list_lock);
}

void cpu_exec_init_all(void)
{
    qemu_mutex_init(&ram_list.mutex);
    /* The data structures we set up here depend on knowing the page size,
     * so no more changes can be made after this point.
     * In an ideal world, nothing we did before we had finished the
     * machine setup would care about the target page size, and we could
     * do this much later, rather than requiring board models to state
     * up front what their requirements are.
     */
    finalize_target_page_bits();
    io_mem_init();
    memory_map_init();
    qemu_mutex_init(&map_client_list_lock);
}

void cpu_unregister_map_client(QEMUBH *bh)
{
    MapClient *client;

    qemu_mutex_lock(&map_client_list_lock);
    QLIST_FOREACH(client, &map_client_list, link) {
        if (client->bh == bh) {
            cpu_unregister_map_client_do(client);
            break;
        }
    }
    qemu_mutex_unlock(&map_client_list_lock);
}

static void cpu_notify_map_clients(void)
{
    qemu_mutex_lock(&map_client_list_lock);
    cpu_notify_map_clients_locked();
    qemu_mutex_unlock(&map_client_list_lock);
}

static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
                                  bool is_write, MemTxAttrs attrs)
{
    MemoryRegion *mr;
    hwaddr l, xlat;

    while (len > 0) {
        l = len;
        mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
        if (!memory_access_is_direct(mr, is_write)) {
            l = memory_access_size(mr, l, addr);
            if (!memory_region_access_valid(mr, xlat, l, is_write, attrs)) {
                return false;
            }
        }

        len -= l;
        addr += l;
    }
    return true;
}

bool address_space_access_valid(AddressSpace *as, hwaddr addr,
                                hwaddr len, bool is_write,
                                MemTxAttrs attrs)
{
    FlatView *fv;
    bool result;

    rcu_read_lock();
    fv = address_space_to_flatview(as);
    result = flatview_access_valid(fv, addr, len, is_write, attrs);
    rcu_read_unlock();
    return result;
}
3620
static hwaddr
3621
flatview_extend_translation(FlatView *fv, hwaddr addr,
3622 3623 3624
                            hwaddr target_len,
                            MemoryRegion *mr, hwaddr base, hwaddr len,
                            bool is_write, MemTxAttrs attrs)
3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638
{
    hwaddr done = 0;
    hwaddr xlat;
    MemoryRegion *this_mr;

    for (;;) {
        target_len -= len;
        addr += len;
        done += len;
        if (target_len == 0) {
            return done;
        }

        len = target_len;
3639
        this_mr = flatview_translate(fv, addr, &xlat,
3640
                                     &len, is_write, attrs);
3641 3642 3643 3644 3645 3646
        if (this_mr != mr || xlat != base + done) {
            return done;
        }
    }
}

/* Map a physical memory region into a host virtual address.
 * May map a subset of the requested range, given by and returned in *plen.
 * May return NULL if resources needed to perform the mapping are exhausted.
 * Use only for reads OR writes - not for read-modify-write operations.
 * Use cpu_register_map_client() to know when retrying the map operation is
 * likely to succeed.
 */
void *address_space_map(AddressSpace *as,
                        hwaddr addr,
                        hwaddr *plen,
                        bool is_write,
                        MemTxAttrs attrs)
{
    hwaddr len = *plen;
    hwaddr l, xlat;
    MemoryRegion *mr;
    void *ptr;
    FlatView *fv;

    if (len == 0) {
        return NULL;
    }

    l = len;
    rcu_read_lock();
    fv = address_space_to_flatview(as);
    mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);

    if (!memory_access_is_direct(mr, is_write)) {
        if (atomic_xchg(&bounce.in_use, true)) {
            rcu_read_unlock();
            return NULL;
        }
        /* Avoid unbounded allocations */
        l = MIN(l, TARGET_PAGE_SIZE);
        bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
        bounce.addr = addr;
        bounce.len = l;

        memory_region_ref(mr);
        bounce.mr = mr;
        if (!is_write) {
            flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
                          bounce.buffer, l);
        }

        rcu_read_unlock();
        *plen = l;
        return bounce.buffer;
    }

    memory_region_ref(mr);
    *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
                                        l, is_write, attrs);
    ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
    rcu_read_unlock();

    return ptr;
}

/* Unmaps a memory region previously mapped by address_space_map().
 * Will also mark the memory as dirty if is_write == 1.  access_len gives
 * the amount of memory that was actually read or written by the caller.
 */
void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
                         int is_write, hwaddr access_len)
{
    if (buffer != bounce.buffer) {
        MemoryRegion *mr;
        ram_addr_t addr1;

        mr = memory_region_from_host(buffer, &addr1);
        assert(mr != NULL);
        if (is_write) {
            invalidate_and_set_dirty(mr, addr1, access_len);
        }
        if (xen_enabled()) {
            xen_invalidate_map_cache_entry(buffer);
        }
        memory_region_unref(mr);
        return;
    }
    if (is_write) {
        address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
                            bounce.buffer, access_len);
    }
    qemu_vfree(bounce.buffer);
    bounce.buffer = NULL;
    memory_region_unref(bounce.mr);
    atomic_mb_set(&bounce.in_use, false);
    cpu_notify_map_clients();
}
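/*
 * Usage sketch (illustrative) of the map/unmap pair above.  A caller that
 * wants zero-copy access to guest memory maps the range, uses the returned
 * host pointer, and unmaps it again; if the mapping fails because the
 * single bounce buffer is busy, it can register a callback to retry:
 *
 *     hwaddr len = size;
 *     void *p = address_space_map(as, gpa, &len, true,
 *                                 MEMTXATTRS_UNSPECIFIED);
 *     if (!p) {
 *         cpu_register_map_client(retry_bh);   (retry_bh: caller's QEMUBH)
 *         return;
 *     }
 *     ... fill p[0..len-1] ...
 *     address_space_unmap(as, p, len, true, len);
 *
 * as, gpa, size and retry_bh are placeholder names; note that *plen may
 * come back smaller than requested, so callers must honour the returned
 * length.
 */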

void *cpu_physical_memory_map(hwaddr addr,
                              hwaddr *plen,
                              int is_write)
{
    return address_space_map(&address_space_memory, addr, plen, is_write,
                             MEMTXATTRS_UNSPECIFIED);
}

void cpu_physical_memory_unmap(void *buffer, hwaddr len,
                               int is_write, hwaddr access_len)
{
    return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
}

#define ARG1_DECL                AddressSpace *as
#define ARG1                     as
#define SUFFIX
#define TRANSLATE(...)           address_space_translate(as, __VA_ARGS__)
#define RCU_READ_LOCK(...)       rcu_read_lock()
#define RCU_READ_UNLOCK(...)     rcu_read_unlock()
#include "memory_ldst.inc.c"

int64_t address_space_cache_init(MemoryRegionCache *cache,
                                 AddressSpace *as,
                                 hwaddr addr,
                                 hwaddr len,
                                 bool is_write)
{
    AddressSpaceDispatch *d;
    hwaddr l;
    MemoryRegion *mr;

    assert(len > 0);

    l = len;
    cache->fv = address_space_get_flatview(as);
    d = flatview_to_dispatch(cache->fv);
    cache->mrs = *address_space_translate_internal(d, addr, &cache->xlat, &l, true);

    mr = cache->mrs.mr;
    memory_region_ref(mr);
    if (memory_access_is_direct(mr, is_write)) {
        /* We don't care about the memory attributes here as we're only
         * doing this if we found actual RAM, which behaves the same
         * regardless of attributes; so UNSPECIFIED is fine.
         */
        l = flatview_extend_translation(cache->fv, addr, len, mr,
                                        cache->xlat, l, is_write,
                                        MEMTXATTRS_UNSPECIFIED);
        cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true);
    } else {
        cache->ptr = NULL;
    }

    cache->len = l;
    cache->is_write = is_write;
    return l;
}

void address_space_cache_invalidate(MemoryRegionCache *cache,
                                    hwaddr addr,
                                    hwaddr access_len)
{
    assert(cache->is_write);
    if (likely(cache->ptr)) {
        invalidate_and_set_dirty(cache->mrs.mr, addr + cache->xlat, access_len);
    }
}

void address_space_cache_destroy(MemoryRegionCache *cache)
{
    if (!cache->mrs.mr) {
        return;
    }

    if (xen_enabled()) {
        xen_invalidate_map_cache_entry(cache->ptr);
    }
    memory_region_unref(cache->mrs.mr);
    flatview_unref(cache->fv);
    cache->mrs.mr = NULL;
    cache->fv = NULL;
}
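/*
 * Usage sketch (illustrative): the MemoryRegionCache API above is meant
 * for repeated accesses to one bounded range, e.g. a virtqueue ring, so
 * that the flatview lookup is paid once rather than per access:
 *
 *     MemoryRegionCache cache;
 *     address_space_cache_init(&cache, as, ring_gpa, ring_len, true);
 *     ... use the cached read/write helpers declared alongside this API,
 *         which fall back to the _cached_slow paths below for MMIO ...
 *     address_space_cache_invalidate(&cache, offset, access_len);
 *     address_space_cache_destroy(&cache);
 *
 * as, ring_gpa, ring_len, offset and access_len are placeholder names;
 * the returned length may be shorter than the length requested.
 */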

/* Called from RCU critical section.  This function has the same
 * semantics as address_space_translate, but it only works on a
 * predefined range of a MemoryRegion that was mapped with
 * address_space_cache_init.
 */
static inline MemoryRegion *address_space_translate_cached(
    MemoryRegionCache *cache, hwaddr addr, hwaddr *xlat,
    hwaddr *plen, bool is_write, MemTxAttrs attrs)
{
    MemoryRegionSection section;
    MemoryRegion *mr;
    IOMMUMemoryRegion *iommu_mr;
    AddressSpace *target_as;

    assert(!cache->ptr);
    *xlat = addr + cache->xlat;

    mr = cache->mrs.mr;
    iommu_mr = memory_region_get_iommu(mr);
    if (!iommu_mr) {
        /* MMIO region.  */
        return mr;
    }

    section = address_space_translate_iommu(iommu_mr, xlat, plen,
                                            NULL, is_write, true,
                                            &target_as, attrs);
    return section.mr;
}

/* Called from RCU critical section. address_space_read_cached uses this
 * out of line function when the target is an MMIO or IOMMU region.
 */
void
address_space_read_cached_slow(MemoryRegionCache *cache, hwaddr addr,
                                   void *buf, hwaddr len)
{
    hwaddr addr1, l;
    MemoryRegion *mr;

    l = len;
    mr = address_space_translate_cached(cache, addr, &addr1, &l, false,
                                        MEMTXATTRS_UNSPECIFIED);
    flatview_read_continue(cache->fv,
                           addr, MEMTXATTRS_UNSPECIFIED, buf, len,
                           addr1, l, mr);
}

/* Called from RCU critical section. address_space_write_cached uses this
 * out of line function when the target is an MMIO or IOMMU region.
 */
void
address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr,
                                    const void *buf, hwaddr len)
{
    hwaddr addr1, l;
    MemoryRegion *mr;

    l = len;
    mr = address_space_translate_cached(cache, addr, &addr1, &l, true,
                                        MEMTXATTRS_UNSPECIFIED);
    flatview_write_continue(cache->fv,
                            addr, MEMTXATTRS_UNSPECIFIED, buf, len,
                            addr1, l, mr);
}

#define ARG1_DECL                MemoryRegionCache *cache
#define ARG1                     cache
#define SUFFIX                   _cached_slow
#define TRANSLATE(...)           address_space_translate_cached(cache, __VA_ARGS__)
#define RCU_READ_LOCK()          ((void)0)
#define RCU_READ_UNLOCK()        ((void)0)
#include "memory_ldst.inc.c"

/* virtual memory access for debug (includes writing to ROM) */
int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
                        uint8_t *buf, target_ulong len, int is_write)
{
    hwaddr phys_addr;
    target_ulong l, page;

    cpu_synchronize_state(cpu);
    while (len > 0) {
        int asidx;
        MemTxAttrs attrs;

        page = addr & TARGET_PAGE_MASK;
        phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
        asidx = cpu_asidx_from_attrs(cpu, attrs);
        /* if no physical page mapped, return an error */
        if (phys_addr == -1)
            return -1;
        l = (page + TARGET_PAGE_SIZE) - addr;
        if (l > len)
            l = len;
        phys_addr += (addr & ~TARGET_PAGE_MASK);
        if (is_write) {
            address_space_write_rom(cpu->cpu_ases[asidx].as, phys_addr,
                                    attrs, buf, l);
        } else {
            address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
                             attrs, buf, l, 0);
        }
        len -= l;
        buf += l;
        addr += l;
    }
    return 0;
}

/*
 * Allows code that needs to deal with migration bitmaps etc to still be
 * built target-independent.
 */
size_t qemu_target_page_size(void)
{
    return TARGET_PAGE_SIZE;
}

int qemu_target_page_bits(void)
{
    return TARGET_PAGE_BITS;
}

int qemu_target_page_bits_min(void)
{
    return TARGET_PAGE_BITS_MIN;
}
#endif

bool target_words_bigendian(void)
{
#if defined(TARGET_WORDS_BIGENDIAN)
    return true;
#else
    return false;
#endif
}

#ifndef CONFIG_USER_ONLY
bool cpu_physical_memory_is_io(hwaddr phys_addr)
{
    MemoryRegion *mr;
    hwaddr l = 1;
    bool res;

    rcu_read_lock();
    mr = address_space_translate(&address_space_memory,
                                 phys_addr, &phys_addr, &l, false,
                                 MEMTXATTRS_UNSPECIFIED);

    res = !(memory_region_is_ram(mr) || memory_region_is_romd(mr));
    rcu_read_unlock();
    return res;
}

int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    rcu_read_lock();
    RAMBLOCK_FOREACH(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    rcu_read_unlock();
    return ret;
}

/*
 * Unmap pages of memory from start to start+length such that
 * they a) read as 0, b) trigger whatever fault mechanism
 * the OS provides for postcopy.
 * The pages must be unmapped by the end of the function.
 * Returns: 0 on success, non-0 on failure.
 */
int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
{
    int ret = -1;

    uint8_t *host_startaddr = rb->host + start;

    if ((uintptr_t)host_startaddr & (rb->page_size - 1)) {
        error_report("ram_block_discard_range: Unaligned start address: %p",
                     host_startaddr);
        goto err;
    }

    if ((start + length) <= rb->used_length) {
        bool need_madvise, need_fallocate;
        uint8_t *host_endaddr = host_startaddr + length;
        if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
            error_report("ram_block_discard_range: Unaligned end address: %p",
                         host_endaddr);
            goto err;
        }

        errno = ENOTSUP; /* If we are missing MADVISE etc */

        /* The logic here is messy;
         *    madvise DONTNEED fails for hugepages
         *    fallocate works on hugepages and shmem
         */
        need_madvise = (rb->page_size == qemu_host_page_size);
        need_fallocate = rb->fd != -1;
        if (need_fallocate) {
            /* For a file, this causes the area of the file to be zero'd
             * if read, and for hugetlbfs also causes it to be unmapped
             * so a userfault will trigger.
4038 4039 4040 4041
             */
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
            ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                            start, length);
            if (ret) {
                ret = -errno;
                error_report("ram_block_discard_range: Failed to fallocate "
                             "%s:%" PRIx64 " +%zx (%d)",
                             rb->idstr, start, length, ret);
                goto err;
            }
#else
            ret = -ENOSYS;
            error_report("ram_block_discard_range: fallocate not available/file"
                         "%s:%" PRIx64 " +%zx (%d)",
                         rb->idstr, start, length, ret);
            goto err;
4055 4056
#endif
        }
        if (need_madvise) {
            /* For normal RAM this causes it to be unmapped,
             * for shared memory it causes the local mapping to disappear
             * and to fall back on the file contents (which we just
             * fallocate'd away).
             */
#if defined(CONFIG_MADVISE)
            ret =  madvise(host_startaddr, length, MADV_DONTNEED);
            if (ret) {
                ret = -errno;
                error_report("ram_block_discard_range: Failed to discard range "
                             "%s:%" PRIx64 " +%zx (%d)",
                             rb->idstr, start, length, ret);
                goto err;
            }
#else
            ret = -ENOSYS;
            error_report("ram_block_discard_range: MADVISE not available"
4075 4076
                         "%s:%" PRIx64 " +%zx (%d)",
                         rb->idstr, start, length, ret);
4077 4078
            goto err;
#endif
4079
        }
4080 4081
        trace_ram_block_discard_range(rb->idstr, host_startaddr, length,
                                      need_madvise, need_fallocate, ret);
4082 4083 4084 4085 4086 4087 4088 4089 4090 4091
    } else {
        error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
                     "/%zx/" RAM_ADDR_FMT")",
                     rb->idstr, start, length, rb->used_length);
    }

err:
    return ret;
}

J
{
    return rb->flags & RAM_PMEM;
}

4097
#endif
Y
void page_size_init(void)
{
    /* NOTE: we can always suppose that qemu_host_page_size >=
       TARGET_PAGE_SIZE */
    if (qemu_host_page_size == 0) {
        qemu_host_page_size = qemu_real_host_page_size;
    }
    if (qemu_host_page_size < TARGET_PAGE_SIZE) {
        qemu_host_page_size = TARGET_PAGE_SIZE;
    }
    qemu_host_page_mask = -(intptr_t)qemu_host_page_size;
}
4111 4112 4113

#if !defined(CONFIG_USER_ONLY)

4114
static void mtree_print_phys_entries(int start, int end, int skip, int ptr)
4115 4116
{
    if (start == end - 1) {
4117
        qemu_printf("\t%3d      ", start);
4118
    } else {
4119
        qemu_printf("\t%3d..%-3d ", start, end - 1);
4120
    }
4121
    qemu_printf(" skip=%d ", skip);
4122
    if (ptr == PHYS_MAP_NODE_NIL) {
4123
        qemu_printf(" ptr=NIL");
4124
    } else if (!skip) {
4125
        qemu_printf(" ptr=#%d", ptr);
4126
    } else {
4127
        qemu_printf(" ptr=[%d]", ptr);
4128
    }
4129
    qemu_printf("\n");
4130 4131 4132 4133 4134
}

#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \
                           int128_sub((size), int128_one())) : 0)

4135
void mtree_print_dispatch(AddressSpaceDispatch *d, MemoryRegion *root)
4136 4137 4138
{
    int i;

4139 4140
    qemu_printf("  Dispatch\n");
    qemu_printf("    Physical sections\n");
4141 4142 4143 4144 4145 4146

    for (i = 0; i < d->map.sections_nb; ++i) {
        MemoryRegionSection *s = d->map.sections + i;
        const char *names[] = { " [unassigned]", " [not dirty]",
                                " [ROM]", " [watch]" };

4147 4148
        qemu_printf("      #%d @" TARGET_FMT_plx ".." TARGET_FMT_plx
                    " %s%s%s%s%s",
4149 4150 4151 4152 4153 4154 4155 4156 4157 4158
            i,
            s->offset_within_address_space,
            s->offset_within_address_space + MR_SIZE(s->mr->size),
            s->mr->name ? s->mr->name : "(noname)",
            i < ARRAY_SIZE(names) ? names[i] : "",
            s->mr == root ? " [ROOT]" : "",
            s == d->mru_section ? " [MRU]" : "",
            s->mr->is_iommu ? " [iommu]" : "");

        if (s->mr->alias) {
4159
            qemu_printf(" alias=%s", s->mr->alias->name ?
4160 4161
                    s->mr->alias->name : "noname");
        }
4162
        qemu_printf("\n");
4163 4164
    }

4165
    qemu_printf("    Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n",
4166 4167 4168 4169 4170 4171
               P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip);
    for (i = 0; i < d->map.nodes_nb; ++i) {
        int j, jprev;
        PhysPageEntry prev;
        Node *n = d->map.nodes + i;

4172
        qemu_printf("      [%d]\n", i);
4173 4174 4175 4176 4177 4178 4179 4180

        for (j = 0, jprev = 0, prev = *n[0]; j < ARRAY_SIZE(*n); ++j) {
            PhysPageEntry *pe = *n + j;

            if (pe->ptr == prev.ptr && pe->skip == prev.skip) {
                continue;
            }

4181
            mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);
4182 4183 4184 4185 4186 4187

            jprev = j;
            prev = *pe;
        }

        if (jprev != ARRAY_SIZE(*n)) {
4188
            mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);
4189 4190 4191 4192 4193
        }
    }
}

#endif