/*
 *  Virtual page mapping
 *
 *  Copyright (c) 2003 Fabrice Bellard
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
#include "qemu/osdep.h"
#include "qapi/error.h"
#ifndef _WIN32
#endif

#include "qemu/cutils.h"
#include "cpu.h"
#include "exec/exec-all.h"
#include "exec/target_page.h"
#include "tcg.h"
#include "hw/qdev-core.h"
#include "hw/qdev-properties.h"
#if !defined(CONFIG_USER_ONLY)
#include "hw/boards.h"
#include "hw/xen/xen.h"
#endif
#include "sysemu/kvm.h"
#include "sysemu/sysemu.h"
#include "qemu/timer.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#if defined(CONFIG_USER_ONLY)
#include "qemu.h"
#else /* !CONFIG_USER_ONLY */
#include "hw/hw.h"
#include "exec/memory.h"
#include "exec/ioport.h"
#include "sysemu/dma.h"
#include "sysemu/numa.h"
#include "sysemu/hw_accel.h"
#include "exec/address-spaces.h"
#include "sysemu/xen-mapcache.h"
#include "trace-root.h"

#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
#include <fcntl.h>
#include <linux/falloc.h>
#endif

#endif
#include "qemu/rcu_queue.h"
#include "qemu/main-loop.h"
#include "translate-all.h"
#include "sysemu/replay.h"

#include "exec/memory-internal.h"
#include "exec/ram_addr.h"
#include "exec/log.h"

#include "migration/vmstate.h"

#include "qemu/range.h"
#ifndef _WIN32
#include "qemu/mmap-alloc.h"
#endif

#include "monitor/monitor.h"

//#define DEBUG_SUBPAGE

#if !defined(CONFIG_USER_ONLY)
/* ram_list is read under rcu_read_lock()/rcu_read_unlock().  Writes
 * are protected by the ramlist lock.
 */
RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };

static MemoryRegion *system_memory;
static MemoryRegion *system_io;

AddressSpace address_space_io;
AddressSpace address_space_memory;

MemoryRegion io_mem_rom, io_mem_notdirty;
static MemoryRegion io_mem_unassigned;

/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
#define RAM_PREALLOC   (1 << 0)

/* RAM is mmap-ed with MAP_SHARED */
#define RAM_SHARED     (1 << 1)

/* Only a portion of RAM (used_length) is actually used, and migrated.
 * This used_length size can change across reboots.
 */
#define RAM_RESIZEABLE (1 << 2)

#endif

#ifdef TARGET_PAGE_BITS_VARY
int target_page_bits;
bool target_page_bits_decided;
#endif

struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
/* current CPU in the current thread. It is only valid inside
   cpu_exec() */
__thread CPUState *current_cpu;
/* 0 = Do not count executed instructions.
   1 = Precise instruction counting.
   2 = Adaptive rate instruction counting.  */
int use_icount;

uintptr_t qemu_host_page_size;
intptr_t qemu_host_page_mask;
uintptr_t qemu_real_host_page_size;
intptr_t qemu_real_host_page_mask;

bool set_preferred_target_page_bits(int bits)
{
    /* The target page size is the lowest common denominator for all
     * the CPUs in the system, so we can only make it smaller, never
     * larger. And we can't make it smaller once we've committed to
     * a particular size.
     */
#ifdef TARGET_PAGE_BITS_VARY
    assert(bits >= TARGET_PAGE_BITS_MIN);
    if (target_page_bits == 0 || target_page_bits > bits) {
        if (target_page_bits_decided) {
            return false;
        }
        target_page_bits = bits;
    }
#endif
    return true;
}

#if !defined(CONFIG_USER_ONLY)

static void finalize_target_page_bits(void)
{
#ifdef TARGET_PAGE_BITS_VARY
    if (target_page_bits == 0) {
        target_page_bits = TARGET_PAGE_BITS_MIN;
    }
    target_page_bits_decided = true;
#endif
}

typedef struct PhysPageEntry PhysPageEntry;

struct PhysPageEntry {
    /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
    uint32_t skip : 6;
     /* index into phys_sections (!skip) or phys_map_nodes (skip) */
    uint32_t ptr : 26;
};

#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)

/* Size of the L2 (and L3, etc) page tables.  */
#define ADDR_SPACE_BITS 64

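/* Each non-leaf level of the map below resolves P_L2_BITS bits of the page
 * index, so P_L2_LEVELS works out to the number of levels needed to cover
 * ADDR_SPACE_BITS - TARGET_PAGE_BITS bits of address.
 */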
#define P_L2_BITS 9
#define P_L2_SIZE (1 << P_L2_BITS)

#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)

typedef PhysPageEntry Node[P_L2_SIZE];

typedef struct PhysPageMap {
    struct rcu_head rcu;

    unsigned sections_nb;
    unsigned sections_nb_alloc;
    unsigned nodes_nb;
    unsigned nodes_nb_alloc;
    Node *nodes;
    MemoryRegionSection *sections;
} PhysPageMap;

struct AddressSpaceDispatch {
    struct rcu_head rcu;

    MemoryRegionSection *mru_section;
    /* This is a multi-level map on the physical address space.
     * The bottom level has pointers to MemoryRegionSections.
     */
    PhysPageEntry phys_map;
    PhysPageMap map;
    AddressSpace *as;
};

#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
typedef struct subpage_t {
    MemoryRegion iomem;
    AddressSpace *as;
    hwaddr base;
    uint16_t sub_section[];
} subpage_t;

#define PHYS_SECTION_UNASSIGNED 0
#define PHYS_SECTION_NOTDIRTY 1
#define PHYS_SECTION_ROM 2
#define PHYS_SECTION_WATCH 3

static void io_mem_init(void);
static void memory_map_init(void);
static void tcg_commit(MemoryListener *listener);

static MemoryRegion io_mem_watch;

/**
 * CPUAddressSpace: all the information a CPU needs about an AddressSpace
 * @cpu: the CPU whose AddressSpace this is
 * @as: the AddressSpace itself
 * @memory_dispatch: its dispatch pointer (cached, RCU protected)
 * @tcg_as_listener: listener for tracking changes to the AddressSpace
 */
struct CPUAddressSpace {
    CPUState *cpu;
    AddressSpace *as;
    struct AddressSpaceDispatch *memory_dispatch;
    MemoryListener tcg_as_listener;
};

struct DirtyBitmapSnapshot {
    ram_addr_t start;
    ram_addr_t end;
    unsigned long dirty[];
};

#endif

#if !defined(CONFIG_USER_ONLY)

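/* Ensure map->nodes has room for at least @nodes more Node entries,
 * growing the array geometrically when it does not.
 */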
static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
{
    static unsigned alloc_hint = 16;
    if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
        map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, alloc_hint);
        map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
        map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
        alloc_hint = map->nodes_nb_alloc;
    }
}

static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
{
    unsigned i;
    uint32_t ret;
    PhysPageEntry e;
    PhysPageEntry *p;

    ret = map->nodes_nb++;
    p = map->nodes[ret];
    assert(ret != PHYS_MAP_NODE_NIL);
    assert(ret != map->nodes_nb_alloc);

    e.skip = leaf ? 0 : 1;
    e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
    for (i = 0; i < P_L2_SIZE; ++i) {
        memcpy(&p[i], &e, sizeof(e));
    }
    return ret;
}

static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
                                hwaddr *index, hwaddr *nb, uint16_t leaf,
                                int level)
{
    PhysPageEntry *p;
    hwaddr step = (hwaddr)1 << (level * P_L2_BITS);

    if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
        lp->ptr = phys_map_node_alloc(map, level == 0);
    }
    p = map->nodes[lp->ptr];
    lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];

    while (*nb && lp < &p[P_L2_SIZE]) {
        if ((*index & (step - 1)) == 0 && *nb >= step) {
            lp->skip = 0;
            lp->ptr = leaf;
            *index += step;
            *nb -= step;
        } else {
            phys_page_set_level(map, lp, index, nb, leaf, level - 1);
        }
        ++lp;
    }
}

static void phys_page_set(AddressSpaceDispatch *d,
                          hwaddr index, hwaddr nb,
                          uint16_t leaf)
{
    /* Wildly overreserve - it doesn't matter much. */
    phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);

    phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
}

/* Compact a non leaf page entry. Simply detect that the entry has a single child,
 * and update our entry so we can skip it and go directly to the destination.
 */
static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
{
    unsigned valid_ptr = P_L2_SIZE;
    int valid = 0;
    PhysPageEntry *p;
    int i;

    if (lp->ptr == PHYS_MAP_NODE_NIL) {
        return;
    }

    p = nodes[lp->ptr];
    for (i = 0; i < P_L2_SIZE; i++) {
        if (p[i].ptr == PHYS_MAP_NODE_NIL) {
            continue;
        }

        valid_ptr = i;
        valid++;
        if (p[i].skip) {
            phys_page_compact(&p[i], nodes);
        }
    }

    /* We can only compress if there's only one child. */
    if (valid != 1) {
        return;
    }

    assert(valid_ptr < P_L2_SIZE);

    /* Don't compress if it won't fit in the # of bits we have. */
    if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
        return;
    }

    lp->ptr = p[valid_ptr].ptr;
    if (!p[valid_ptr].skip) {
        /* If our only child is a leaf, make this a leaf. */
        /* By design, we should have made this node a leaf to begin with so we
         * should never reach here.
         * But since it's so simple to handle this, let's do it just in case we
         * change this rule.
         */
        lp->skip = 0;
    } else {
        lp->skip += p[valid_ptr].skip;
    }
}

static void phys_page_compact_all(AddressSpaceDispatch *d, int nodes_nb)
{
    if (d->phys_map.skip) {
        phys_page_compact(&d->phys_map, d->map.nodes);
    }
}

static inline bool section_covers_addr(const MemoryRegionSection *section,
                                       hwaddr addr)
{
    /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
     * the section must cover the entire address space.
     */
    return int128_gethi(section->size) ||
           range_covers_byte(section->offset_within_address_space,
                             int128_getlo(section->size), addr);
}

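/* Walk the multi-level phys_map and return the MemoryRegionSection covering
 * @addr, or the unassigned section if nothing is mapped there.
 */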
static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr)
{
    PhysPageEntry lp = d->phys_map, *p;
    Node *nodes = d->map.nodes;
    MemoryRegionSection *sections = d->map.sections;
    hwaddr index = addr >> TARGET_PAGE_BITS;
    int i;

    for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
        if (lp.ptr == PHYS_MAP_NODE_NIL) {
            return &sections[PHYS_SECTION_UNASSIGNED];
        }
        p = nodes[lp.ptr];
        lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
    }

    if (section_covers_addr(&sections[lp.ptr], addr)) {
        return &sections[lp.ptr];
    } else {
        return &sections[PHYS_SECTION_UNASSIGNED];
    }
}

bool memory_region_is_unassigned(MemoryRegion *mr)
{
    return mr != &io_mem_rom && mr != &io_mem_notdirty && !mr->rom_device
        && mr != &io_mem_watch;
}

/* Called from RCU critical section */
static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
                                                        hwaddr addr,
                                                        bool resolve_subpage)
{
    MemoryRegionSection *section = atomic_read(&d->mru_section);
    subpage_t *subpage;
    bool update;

    if (section && section != &d->map.sections[PHYS_SECTION_UNASSIGNED] &&
        section_covers_addr(section, addr)) {
        update = false;
    } else {
        section = phys_page_find(d, addr);
        update = true;
    }
    if (resolve_subpage && section->mr->subpage) {
        subpage = container_of(section->mr, subpage_t, iomem);
        section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
    }
    if (update) {
        atomic_set(&d->mru_section, section);
    }
    return section;
}

/* Called from RCU critical section */
static MemoryRegionSection *
address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
                                 hwaddr *plen, bool resolve_subpage)
{
    MemoryRegionSection *section;
    MemoryRegion *mr;
    Int128 diff;

    section = address_space_lookup_region(d, addr, resolve_subpage);
    /* Compute offset within MemoryRegionSection */
    addr -= section->offset_within_address_space;

    /* Compute offset within MemoryRegion */
    *xlat = addr + section->offset_within_region;

    mr = section->mr;

    /* MMIO registers can be expected to perform full-width accesses based only
     * on their address, without considering adjacent registers that could
     * decode to completely different MemoryRegions.  When such registers
     * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
     * regions overlap wildly.  For this reason we cannot clamp the accesses
     * here.
     *
     * If the length is small (as is the case for address_space_ldl/stl),
     * everything works fine.  If the incoming length is large, however,
     * the caller really has to do the clamping through memory_access_size.
     */
    if (memory_region_is_ram(mr)) {
        diff = int128_sub(section->size, int128_make64(addr));
        *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
    }
    return section;
}

/* Called from RCU critical section */
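/* Resolve @addr in @as down to a terminal MemoryRegionSection, walking any
 * IOMMU regions along the way and updating *xlat and *plen; *target_as is
 * set to the address space the result belongs to.
 */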
static MemoryRegionSection address_space_do_translate(AddressSpace *as,
                                                      hwaddr addr,
                                                      hwaddr *xlat,
                                                      hwaddr *plen,
                                                      bool is_write,
                                                      bool is_mmio,
                                                      AddressSpace **target_as)
{
    IOMMUTLBEntry iotlb;
    MemoryRegionSection *section;
    IOMMUMemoryRegion *iommu_mr;
    IOMMUMemoryRegionClass *imrc;

    for (;;) {
        AddressSpaceDispatch *d = atomic_rcu_read(&as->dispatch);
        section = address_space_translate_internal(d, addr, &addr, plen, is_mmio);

        iommu_mr = memory_region_get_iommu(section->mr);
        if (!iommu_mr) {
            break;
        }
        imrc = memory_region_get_iommu_class_nocheck(iommu_mr);

        iotlb = imrc->translate(iommu_mr, addr, is_write ?
                                IOMMU_WO : IOMMU_RO);
        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
                | (addr & iotlb.addr_mask));
        *plen = MIN(*plen, (addr | iotlb.addr_mask) - addr + 1);
        if (!(iotlb.perm & (1 << is_write))) {
            goto translate_fail;
        }

        as = iotlb.target_as;
        *target_as = iotlb.target_as;
    }

    *xlat = addr;

    return *section;

translate_fail:
    return (MemoryRegionSection) { .mr = &io_mem_unassigned };
}

/* Called from RCU critical section */
IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
                                            bool is_write)
{
    MemoryRegionSection section;
    hwaddr xlat, plen;

    /* Try to get maximum page mask during translation. */
    plen = (hwaddr)-1;

    /* This can never be MMIO. */
    section = address_space_do_translate(as, addr, &xlat, &plen,
                                         is_write, false, &as);

    /* Illegal translation */
    if (section.mr == &io_mem_unassigned) {
        goto iotlb_fail;
    }

    /* Convert memory region offset into address space offset */
    xlat += section.offset_within_address_space -
        section.offset_within_region;

    if (plen == (hwaddr)-1) {
        /*
         * We use default page size here. Logically it only happens
         * for identity mappings.
         */
        plen = TARGET_PAGE_SIZE;
    }

    /* Convert to address mask */
    plen -= 1;

    return (IOMMUTLBEntry) {
        .target_as = as,
        .iova = addr & ~plen,
        .translated_addr = xlat & ~plen,
        .addr_mask = plen,
        /* IOTLBs are for DMAs, and DMA only allows on RAMs. */
        .perm = IOMMU_RW,
    };

iotlb_fail:
    return (IOMMUTLBEntry) {0};
}

/* Called from RCU critical section */
MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr,
                                      hwaddr *xlat, hwaddr *plen,
                                      bool is_write)
{
    MemoryRegion *mr;
    MemoryRegionSection section;

    /* This can be MMIO, so setup MMIO bit. */
    section = address_space_do_translate(as, addr, xlat, plen, is_write, true,
                                         &as);
    mr = section.mr;

    if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
        hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
        *plen = MIN(page, *plen);
    }

    return mr;
}

/* Called from RCU critical section */
MemoryRegionSection *
address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
                                  hwaddr *xlat, hwaddr *plen)
{
    MemoryRegionSection *section;
    AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);

    section = address_space_translate_internal(d, addr, xlat, plen, false);

    assert(!memory_region_is_iommu(section->mr));
    return section;
}
#endif

#if !defined(CONFIG_USER_ONLY)

static int cpu_common_post_load(void *opaque, int version_id)
{
    CPUState *cpu = opaque;

    /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
       version_id is increased. */
    cpu->interrupt_request &= ~0x01;
    tlb_flush(cpu);

    return 0;
}

static int cpu_common_pre_load(void *opaque)
{
    CPUState *cpu = opaque;

    cpu->exception_index = -1;

    return 0;
}

static bool cpu_common_exception_index_needed(void *opaque)
{
    CPUState *cpu = opaque;

    return tcg_enabled() && cpu->exception_index != -1;
}

static const VMStateDescription vmstate_cpu_common_exception_index = {
    .name = "cpu_common/exception_index",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = cpu_common_exception_index_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT32(exception_index, CPUState),
        VMSTATE_END_OF_LIST()
    }
};

static bool cpu_common_crash_occurred_needed(void *opaque)
{
    CPUState *cpu = opaque;

    return cpu->crash_occurred;
}

static const VMStateDescription vmstate_cpu_common_crash_occurred = {
    .name = "cpu_common/crash_occurred",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = cpu_common_crash_occurred_needed,
    .fields = (VMStateField[]) {
        VMSTATE_BOOL(crash_occurred, CPUState),
        VMSTATE_END_OF_LIST()
    }
};

const VMStateDescription vmstate_cpu_common = {
    .name = "cpu_common",
    .version_id = 1,
    .minimum_version_id = 1,
    .pre_load = cpu_common_pre_load,
    .post_load = cpu_common_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(halted, CPUState),
        VMSTATE_UINT32(interrupt_request, CPUState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &vmstate_cpu_common_exception_index,
        &vmstate_cpu_common_crash_occurred,
        NULL
    }
};

#endif

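/* Return the CPU whose cpu_index equals @index, or NULL if there is none. */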
CPUState *qemu_get_cpu(int index)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (cpu->cpu_index == index) {
            return cpu;
        }
    }

    return NULL;
}

#if !defined(CONFIG_USER_ONLY)
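/* Attach @as to @cpu as address space number @asidx; under TCG also register
 * a listener so the CPU's cached dispatch pointer follows memory map changes.
 */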
void cpu_address_space_init(CPUState *cpu, AddressSpace *as, int asidx)
{
    CPUAddressSpace *newas;

    /* Target code should have set num_ases before calling us */
    assert(asidx < cpu->num_ases);

    if (asidx == 0) {
        /* address space 0 gets the convenience alias */
        cpu->as = as;
    }

    /* KVM cannot currently support multiple address spaces. */
    assert(asidx == 0 || !kvm_enabled());

    if (!cpu->cpu_ases) {
        cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
    }

    newas = &cpu->cpu_ases[asidx];
    newas->cpu = cpu;
    newas->as = as;
    if (tcg_enabled()) {
        newas->tcg_as_listener.commit = tcg_commit;
        memory_listener_register(&newas->tcg_as_listener, as);
    }
}

AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
{
    /* Return the AddressSpace corresponding to the specified index */
    return cpu->cpu_ases[asidx].as;
}
#endif

void cpu_exec_unrealizefn(CPUState *cpu)
{
    CPUClass *cc = CPU_GET_CLASS(cpu);

    cpu_list_remove(cpu);

    if (cc->vmsd != NULL) {
        vmstate_unregister(NULL, cc->vmsd, cpu);
    }
    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
        vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
    }
}

Property cpu_common_props[] = {
#ifndef CONFIG_USER_ONLY
    /* Create a memory property for softmmu CPU object,
     * so users can wire up its memory. (This can't go in qom/cpu.c
     * because that file is compiled only once for both user-mode
     * and system builds.) The default if no link is set up is to use
     * the system address space.
     */
    DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION,
                     MemoryRegion *),
#endif
    DEFINE_PROP_END_OF_LIST(),
};

void cpu_exec_initfn(CPUState *cpu)
{
    cpu->as = NULL;
    cpu->num_ases = 0;

#ifndef CONFIG_USER_ONLY
    cpu->thread_id = qemu_get_thread_id();
    cpu->memory = system_memory;
    object_ref(OBJECT(cpu->memory));
#endif
}

void cpu_exec_realizefn(CPUState *cpu, Error **errp)
{
    CPUClass *cc ATTRIBUTE_UNUSED = CPU_GET_CLASS(cpu);

    cpu_list_add(cpu);

#ifndef CONFIG_USER_ONLY
    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
        vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
    }
    if (cc->vmsd != NULL) {
        vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
    }
#endif
}

#if defined(CONFIG_USER_ONLY)
static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
{
    mmap_lock();
    tb_lock();
    tb_invalidate_phys_page_range(pc, pc + 1, 0);
    tb_unlock();
    mmap_unlock();
}
#else
static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
{
    MemTxAttrs attrs;
    hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
    int asidx = cpu_asidx_from_attrs(cpu, attrs);
    if (phys != -1) {
        /* Locks grabbed by tb_invalidate_phys_addr */
        tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
                                phys | (pc & ~TARGET_PAGE_MASK));
    }
}
#endif

#if defined(CONFIG_USER_ONLY)
void cpu_watchpoint_remove_all(CPUState *cpu, int mask)

{
}

int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
                          int flags)
{
    return -ENOSYS;
}

void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
{
}

int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
                          int flags, CPUWatchpoint **watchpoint)
{
    return -ENOSYS;
}
#else
/* Add a watchpoint.  */
int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
                          int flags, CPUWatchpoint **watchpoint)
{
    CPUWatchpoint *wp;

    /* forbid ranges which are empty or run off the end of the address space */
    if (len == 0 || (addr + len - 1) < addr) {
        error_report("tried to set invalid watchpoint at %"
                     VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
        return -EINVAL;
    }
    wp = g_malloc(sizeof(*wp));

    wp->vaddr = addr;
    wp->len = len;
    wp->flags = flags;

    /* keep all GDB-injected watchpoints in front */
    if (flags & BP_GDB) {
        QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
    } else {
        QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
    }

    tlb_flush_page(cpu, addr);

    if (watchpoint)
        *watchpoint = wp;
    return 0;
}

/* Remove a specific watchpoint.  */
int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
                          int flags)
{
    CPUWatchpoint *wp;

    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
        if (addr == wp->vaddr && len == wp->len
                && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
            cpu_watchpoint_remove_by_ref(cpu, wp);
            return 0;
        }
    }
    return -ENOENT;
}

/* Remove a specific watchpoint by reference.  */
void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
{
    QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);

    tlb_flush_page(cpu, watchpoint->vaddr);

    g_free(watchpoint);
}

/* Remove all matching watchpoints.  */
void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
{
    CPUWatchpoint *wp, *next;

    QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
        if (wp->flags & mask) {
            cpu_watchpoint_remove_by_ref(cpu, wp);
        }
    }
}

/* Return true if this watchpoint address matches the specified
 * access (ie the address range covered by the watchpoint overlaps
 * partially or completely with the address range covered by the
 * access).
 */
static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
                                                  vaddr addr,
                                                  vaddr len)
{
    /* We know the lengths are non-zero, but a little caution is
     * required to avoid errors in the case where the range ends
     * exactly at the top of the address space and so addr + len
     * wraps round to zero.
     */
    vaddr wpend = wp->vaddr + wp->len - 1;
    vaddr addrend = addr + len - 1;

    return !(addr > wpend || wp->vaddr > addrend);
}

#endif

/* Add a breakpoint.  */
int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
                          CPUBreakpoint **breakpoint)
{
    CPUBreakpoint *bp;

    bp = g_malloc(sizeof(*bp));

    bp->pc = pc;
    bp->flags = flags;

    /* keep all GDB-injected breakpoints in front */
    if (flags & BP_GDB) {
        QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
    } else {
        QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
    }

    breakpoint_invalidate(cpu, pc);

    if (breakpoint) {
        *breakpoint = bp;
    }
    return 0;
}

/* Remove a specific breakpoint.  */
int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
{
    CPUBreakpoint *bp;

    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
        if (bp->pc == pc && bp->flags == flags) {
            cpu_breakpoint_remove_by_ref(cpu, bp);
            return 0;
        }
    }
    return -ENOENT;
}

/* Remove a specific breakpoint by reference.  */
void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint)
{
    QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);

    breakpoint_invalidate(cpu, breakpoint->pc);

    g_free(breakpoint);
}

/* Remove all matching breakpoints. */
void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
{
    CPUBreakpoint *bp, *next;

    QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
        if (bp->flags & mask) {
            cpu_breakpoint_remove_by_ref(cpu, bp);
        }
    }
}

/* enable or disable single step mode. EXCP_DEBUG is returned by the
   CPU loop after each instruction */
void cpu_single_step(CPUState *cpu, int enabled)
{
    if (cpu->singlestep_enabled != enabled) {
        cpu->singlestep_enabled = enabled;
        if (kvm_enabled()) {
            kvm_update_guest_debug(cpu, 0);
        } else {
            /* must flush all the translated code to avoid inconsistencies */
            /* XXX: only flush what is necessary */
            tb_flush(cpu);
        }
    }
}

void cpu_abort(CPUState *cpu, const char *fmt, ...)
{
    va_list ap;
    va_list ap2;

    va_start(ap, fmt);
    va_copy(ap2, ap);
    fprintf(stderr, "qemu: fatal: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU | CPU_DUMP_CCOP);
    if (qemu_log_separate()) {
        qemu_log_lock();
        qemu_log("qemu: fatal: ");
        qemu_log_vprintf(fmt, ap2);
        qemu_log("\n");
        log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
        qemu_log_flush();
        qemu_log_unlock();
        qemu_log_close();
    }
    va_end(ap2);
    va_end(ap);
    replay_finish();
#if defined(CONFIG_USER_ONLY)
    {
        struct sigaction act;
        sigfillset(&act.sa_mask);
        act.sa_handler = SIG_DFL;
        sigaction(SIGABRT, &act, NULL);
    }
#endif
    abort();
}

#if !defined(CONFIG_USER_ONLY)
/* Called from RCU critical section */
static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
{
    RAMBlock *block;

    block = atomic_rcu_read(&ram_list.mru_block);
    if (block && addr - block->offset < block->max_length) {
        return block;
    }
    RAMBLOCK_FOREACH(block) {
        if (addr - block->offset < block->max_length) {
            goto found;
        }
    }

    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
    abort();

found:
    /* It is safe to write mru_block outside the iothread lock.  This
     * is what happens:
     *
     *     mru_block = xxx
     *     rcu_read_unlock()
     *                                        xxx removed from list
     *                  rcu_read_lock()
     *                  read mru_block
     *                                        mru_block = NULL;
     *                                        call_rcu(reclaim_ramblock, xxx);
     *                  rcu_read_unlock()
     *
     * atomic_rcu_set is not needed here.  The block was already published
     * when it was placed into the list.  Here we're just making an extra
     * copy of the pointer.
     */
    ram_list.mru_block = block;
    return block;
}

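/* Reset the dirty-tracking state in every vCPU's TLB for the given RAM range
 * so that subsequent guest writes are trapped and mark the pages dirty again.
 */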
static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
{
    CPUState *cpu;
    ram_addr_t start1;
    RAMBlock *block;
    ram_addr_t end;

    end = TARGET_PAGE_ALIGN(start + length);
    start &= TARGET_PAGE_MASK;

    rcu_read_lock();
    block = qemu_get_ram_block(start);
    assert(block == qemu_get_ram_block(end - 1));
    start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
    CPU_FOREACH(cpu) {
        tlb_reset_dirty(cpu, start1, length);
    }
    rcu_read_unlock();
}

/* Note: start and end must be within the same ram block.  */
bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
                                              ram_addr_t length,
                                              unsigned client)
{
    DirtyMemoryBlocks *blocks;
    unsigned long end, page;
    bool dirty = false;

    if (length == 0) {
        return false;
    }

    end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
    page = start >> TARGET_PAGE_BITS;

    rcu_read_lock();

    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);

    while (page < end) {
        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);

        dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
                                              offset, num);
        page += num;
    }

    rcu_read_unlock();

    if (dirty && tcg_enabled()) {
        tlb_reset_dirty_range_all(start, length);
    }

    return dirty;
}

DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
     (ram_addr_t start, ram_addr_t length, unsigned client)
{
    DirtyMemoryBlocks *blocks;
    unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
    ram_addr_t first = QEMU_ALIGN_DOWN(start, align);
    ram_addr_t last  = QEMU_ALIGN_UP(start + length, align);
    DirtyBitmapSnapshot *snap;
    unsigned long page, end, dest;

    snap = g_malloc0(sizeof(*snap) +
                     ((last - first) >> (TARGET_PAGE_BITS + 3)));
    snap->start = first;
    snap->end   = last;

    page = first >> TARGET_PAGE_BITS;
    end  = last  >> TARGET_PAGE_BITS;
    dest = 0;

    rcu_read_lock();

    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);

    while (page < end) {
        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);

        assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL)));
        assert(QEMU_IS_ALIGNED(num,    (1 << BITS_PER_LEVEL)));
        offset >>= BITS_PER_LEVEL;

        bitmap_copy_and_clear_atomic(snap->dirty + dest,
                                     blocks->blocks[idx] + offset,
                                     num);
        page += num;
        dest += num >> BITS_PER_LEVEL;
    }

    rcu_read_unlock();

    if (tcg_enabled()) {
        tlb_reset_dirty_range_all(start, length);
    }

    return snap;
}

bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
                                            ram_addr_t start,
                                            ram_addr_t length)
{
    unsigned long page, end;

    assert(start >= snap->start);
    assert(start + length <= snap->end);

    end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
    page = (start - snap->start) >> TARGET_PAGE_BITS;

    while (page < end) {
        if (test_bit(page, snap->dirty)) {
            return true;
        }
        page++;
    }
    return false;
}

/* Called from RCU critical section */
hwaddr memory_region_section_get_iotlb(CPUState *cpu,
                                       MemoryRegionSection *section,
                                       target_ulong vaddr,
                                       hwaddr paddr, hwaddr xlat,
                                       int prot,
                                       target_ulong *address)
{
    hwaddr iotlb;
    CPUWatchpoint *wp;

    if (memory_region_is_ram(section->mr)) {
        /* Normal RAM.  */
        iotlb = memory_region_get_ram_addr(section->mr) + xlat;
        if (!section->readonly) {
            iotlb |= PHYS_SECTION_NOTDIRTY;
        } else {
            iotlb |= PHYS_SECTION_ROM;
        }
    } else {
        AddressSpaceDispatch *d;

        d = atomic_rcu_read(&section->address_space->dispatch);
        iotlb = section - d->map.sections;
        iotlb += xlat;
    }

    /* Make accesses to pages with watchpoints go via the
       watchpoint trap routines.  */
    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
        if (cpu_watchpoint_address_matches(wp, vaddr, TARGET_PAGE_SIZE)) {
            /* Avoid trapping reads of pages with a write breakpoint. */
            if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
                iotlb = PHYS_SECTION_WATCH + paddr;
                *address |= TLB_MMIO;
                break;
            }
        }
    }

    return iotlb;
}
#endif /* defined(CONFIG_USER_ONLY) */

#if !defined(CONFIG_USER_ONLY)

static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
                             uint16_t section);
static subpage_t *subpage_init(AddressSpace *as, hwaddr base);

static void *(*phys_mem_alloc)(size_t size, uint64_t *align) =
                               qemu_anon_ram_alloc;

/*
 * Set a custom physical guest memory allocator.
 * Accelerators with unusual needs may need this.  Hopefully, we can
 * get rid of it eventually.
 */
void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align))
{
    phys_mem_alloc = alloc;
}

static uint16_t phys_section_add(PhysPageMap *map,
                                 MemoryRegionSection *section)
{
    /* The physical section number is ORed with a page-aligned
     * pointer to produce the iotlb entries.  Thus it should
     * never overflow into the page-aligned value.
     */
    assert(map->sections_nb < TARGET_PAGE_SIZE);

    if (map->sections_nb == map->sections_nb_alloc) {
        map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
        map->sections = g_renew(MemoryRegionSection, map->sections,
                                map->sections_nb_alloc);
    }
    map->sections[map->sections_nb] = *section;
    memory_region_ref(section->mr);
    return map->sections_nb++;
}

static void phys_section_destroy(MemoryRegion *mr)
{
    bool have_sub_page = mr->subpage;

    memory_region_unref(mr);

    if (have_sub_page) {
        subpage_t *subpage = container_of(mr, subpage_t, iomem);
        object_unref(OBJECT(&subpage->iomem));
        g_free(subpage);
    }
}

static void phys_sections_free(PhysPageMap *map)
{
    while (map->sections_nb > 0) {
        MemoryRegionSection *section = &map->sections[--map->sections_nb];
        phys_section_destroy(section->mr);
    }
    g_free(map->sections);
    g_free(map->nodes);
}

static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *section)
{
    subpage_t *subpage;
    hwaddr base = section->offset_within_address_space
        & TARGET_PAGE_MASK;
    MemoryRegionSection *existing = phys_page_find(d, base);
    MemoryRegionSection subsection = {
        .offset_within_address_space = base,
        .size = int128_make64(TARGET_PAGE_SIZE),
    };
    hwaddr start, end;

    assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);

    if (!(existing->mr->subpage)) {
        subpage = subpage_init(d->as, base);
        subsection.address_space = d->as;
        subsection.mr = &subpage->iomem;
        phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
                      phys_section_add(&d->map, &subsection));
    } else {
        subpage = container_of(existing->mr, subpage_t, iomem);
    }
    start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
    end = start + int128_get64(section->size) - 1;
    subpage_register(subpage, start, end,
                     phys_section_add(&d->map, section));
}


static void register_multipage(AddressSpaceDispatch *d,
                               MemoryRegionSection *section)
{
    hwaddr start_addr = section->offset_within_address_space;
    uint16_t section_index = phys_section_add(&d->map, section);
    uint64_t num_pages = int128_get64(int128_rshift(section->size,
                                                    TARGET_PAGE_BITS));

    assert(num_pages);
    phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
}

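/* Add @section to the dispatch map being built, splitting it into subpage
 * pieces for the unaligned head and tail and whole-page runs in between.
 */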
static void mem_add(MemoryListener *listener, MemoryRegionSection *section)
{
    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
    AddressSpaceDispatch *d = as->next_dispatch;
    MemoryRegionSection now = *section, remain = *section;
    Int128 page_size = int128_make64(TARGET_PAGE_SIZE);

    if (now.offset_within_address_space & ~TARGET_PAGE_MASK) {
        uint64_t left = TARGET_PAGE_ALIGN(now.offset_within_address_space)
                       - now.offset_within_address_space;

        now.size = int128_min(int128_make64(left), now.size);
        register_subpage(d, &now);
    } else {
        now.size = int128_zero();
    }
    while (int128_ne(remain.size, now.size)) {
        remain.size = int128_sub(remain.size, now.size);
        remain.offset_within_address_space += int128_get64(now.size);
        remain.offset_within_region += int128_get64(now.size);
        now = remain;
        if (int128_lt(remain.size, page_size)) {
            register_subpage(d, &now);
        } else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
            now.size = page_size;
            register_subpage(d, &now);
        } else {
            now.size = int128_and(now.size, int128_neg(page_size));
            register_multipage(d, &now);
        }
    }
}

void qemu_flush_coalesced_mmio_buffer(void)
{
    if (kvm_enabled())
        kvm_flush_coalesced_mmio_buffer();
}

void qemu_mutex_lock_ramlist(void)
{
    qemu_mutex_lock(&ram_list.mutex);
}

void qemu_mutex_unlock_ramlist(void)
{
    qemu_mutex_unlock(&ram_list.mutex);
}

void ram_block_dump(Monitor *mon)
{
    RAMBlock *block;
    char *psize;

    rcu_read_lock();
    monitor_printf(mon, "%24s %8s  %18s %18s %18s\n",
                   "Block Name", "PSize", "Offset", "Used", "Total");
    RAMBLOCK_FOREACH(block) {
        psize = size_to_str(block->page_size);
        monitor_printf(mon, "%24s %8s  0x%016" PRIx64 " 0x%016" PRIx64
                       " 0x%016" PRIx64 "\n", block->idstr, psize,
                       (uint64_t)block->offset,
                       (uint64_t)block->used_length,
                       (uint64_t)block->max_length);
        g_free(psize);
    }
    rcu_read_unlock();
}

#ifdef __linux__
/*
 * FIXME TOCTTOU: this iterates over memory backends' mem-path, which
 * may or may not name the same files / on the same filesystem now as
 * when we actually open and map them.  Iterate over the file
 * descriptors instead, and use qemu_fd_getpagesize().
 */
static int find_max_supported_pagesize(Object *obj, void *opaque)
{
    char *mem_path;
    long *hpsize_min = opaque;

    if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
        mem_path = object_property_get_str(obj, "mem-path", NULL);
        if (mem_path) {
            long hpsize = qemu_mempath_getpagesize(mem_path);
            if (hpsize < *hpsize_min) {
                *hpsize_min = hpsize;
            }
        } else {
            *hpsize_min = getpagesize();
        }
    }

    return 0;
}

long qemu_getrampagesize(void)
{
    long hpsize = LONG_MAX;
    long mainrampagesize;
    Object *memdev_root;

    if (mem_path) {
        mainrampagesize = qemu_mempath_getpagesize(mem_path);
    } else {
        mainrampagesize = getpagesize();
    }

    /* it's possible we have memory-backend objects with
     * hugepage-backed RAM. these may get mapped into system
     * address space via -numa parameters or memory hotplug
     * hooks. we want to take these into account, but we
     * also want to make sure these supported hugepage
     * sizes are applicable across the entire range of memory
     * we may boot from, so we take the min across all
     * backends, and assume normal pages in cases where a
     * backend isn't backed by hugepages.
     */
    memdev_root = object_resolve_path("/objects", NULL);
    if (memdev_root) {
        object_child_foreach(memdev_root, find_max_supported_pagesize, &hpsize);
    }
    if (hpsize == LONG_MAX) {
        /* No additional memory regions found ==> Report main RAM page size */
        return mainrampagesize;
    }

    /* If NUMA is disabled or the NUMA nodes are not backed with a
     * memory-backend, then there is at least one node using "normal" RAM,
     * so if its page size is smaller we have got to report that size instead.
     */
    if (hpsize > mainrampagesize &&
        (nb_numa_nodes == 0 || numa_info[0].node_memdev == NULL)) {
        static bool warned;
        if (!warned) {
            error_report("Huge page support disabled (n/a for main memory).");
            warned = true;
        }
        return mainrampagesize;
    }

    return hpsize;
}
#else
long qemu_getrampagesize(void)
{
    return getpagesize();
}
#endif

#ifdef __linux__
static int64_t get_file_size(int fd)
{
    int64_t size = lseek(fd, 0, SEEK_END);
    if (size < 0) {
        return -errno;
    }
    return size;
}

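/* Open (or create) the backing store at @path for a file-backed RAM block.
 * Returns an open file descriptor, or -1 with @errp set on failure;
 * *created reports whether a new file had to be created.
 */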
static int file_ram_open(const char *path,
                         const char *region_name,
                         bool *created,
                         Error **errp)
{
    char *filename;
    char *sanitized_name;
    char *c;
    int fd = -1;

    *created = false;
    for (;;) {
        fd = open(path, O_RDWR);
        if (fd >= 0) {
            /* @path names an existing file, use it */
            break;
        }
        if (errno == ENOENT) {
            /* @path names a file that doesn't exist, create it */
            fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
            if (fd >= 0) {
                *created = true;
                break;
            }
        } else if (errno == EISDIR) {
            /* @path names a directory, create a file there */
            /* Make name safe to use with mkstemp by replacing '/' with '_'. */
            sanitized_name = g_strdup(region_name);
            for (c = sanitized_name; *c != '\0'; c++) {
                if (*c == '/') {
                    *c = '_';
                }
            }

            filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
                                       sanitized_name);
            g_free(sanitized_name);

            fd = mkstemp(filename);
            if (fd >= 0) {
                unlink(filename);
                g_free(filename);
                break;
            }
            g_free(filename);
        }
        if (errno != EEXIST && errno != EINTR) {
            error_setg_errno(errp, errno,
                             "can't open backing store %s for guest RAM",
                             path);
            return -1;
        }
        /*
         * Try again on EINTR and EEXIST.  The latter happens when
         * something else creates the file between our two open().
         */
    }

    return fd;
}

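/* mmap the backing file @fd for @block, rounding the size up to the backing
 * page size; returns the mapped area, or NULL with @errp set on failure.
 */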
static void *file_ram_alloc(RAMBlock *block,
                            ram_addr_t memory,
                            int fd,
                            bool truncate,
                            Error **errp)
{
    void *area;

    block->page_size = qemu_fd_getpagesize(fd);
    block->mr->align = block->page_size;
#if defined(__s390x__)
    if (kvm_enabled()) {
        block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
    }
#endif

    if (memory < block->page_size) {
        error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
                   "or larger than page size 0x%zx",
                   memory, block->page_size);
        return NULL;
    }

    memory = ROUND_UP(memory, block->page_size);

    /*
     * ftruncate is not supported by hugetlbfs in older
     * hosts, so don't bother bailing out on errors.
     * If anything goes wrong with it under other filesystems,
     * mmap will fail.
     *
     * Do not truncate the non-empty backend file to avoid corrupting
     * the existing data in the file. Disabling shrinking is not
     * enough. For example, the current vNVDIMM implementation stores
     * the guest NVDIMM labels at the end of the backend file. If the
     * backend file is later extended, QEMU will not be able to find
     * those labels. Therefore, extending the non-empty backend file
     * is disabled as well.
     */
    if (truncate && ftruncate(fd, memory)) {
        perror("ftruncate");
    }

    area = qemu_ram_mmap(fd, memory, block->mr->align,
                         block->flags & RAM_SHARED);
    if (area == MAP_FAILED) {
        error_setg_errno(errp, errno,
                         "unable to map backing store for guest RAM");
        return NULL;
    }

    if (mem_prealloc) {
        os_mem_prealloc(fd, area, memory, smp_cpus, errp);
        if (errp && *errp) {
            qemu_ram_munmap(area, memory);
            return NULL;
        }
    }

    block->fd = fd;
    return area;
}
#endif

/* Called with the ramlist lock held.  */
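/* Find a gap in the ram_addr_t space large enough for @size bytes,
 * preferring the smallest gap that fits.
 */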
static ram_addr_t find_ram_offset(ram_addr_t size)
A
Alex Williamson 已提交
1637 1638
{
    RAMBlock *block, *next_block;
A
Alex Williamson 已提交
1639
    ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
A
Alex Williamson 已提交
1640

1641 1642
    assert(size != 0); /* it would hand out same offset multiple times */

M
Mike Day 已提交
1643
    if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
A
Alex Williamson 已提交
1644
        return 0;
M
Mike Day 已提交
1645
    }
A
Alex Williamson 已提交
1646

P
Peter Xu 已提交
1647
    RAMBLOCK_FOREACH(block) {
1648
        ram_addr_t end, next = RAM_ADDR_MAX;
A
Alex Williamson 已提交
1649

1650
        end = block->offset + block->max_length;
A
Alex Williamson 已提交
1651

P
Peter Xu 已提交
1652
        RAMBLOCK_FOREACH(next_block) {
A
Alex Williamson 已提交
1653 1654 1655 1656 1657
            if (next_block->offset >= end) {
                next = MIN(next, next_block->offset);
            }
        }
        if (next - end >= size && next - end < mingap) {
A
Alex Williamson 已提交
1658
            offset = end;
A
Alex Williamson 已提交
1659 1660 1661
            mingap = next - end;
        }
    }
A
Alex Williamson 已提交
1662 1663 1664 1665 1666 1667 1668

    if (offset == RAM_ADDR_MAX) {
        fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
                (uint64_t)size);
        abort();
    }

A
Alex Williamson 已提交
1669 1670 1671
    return offset;
}

unsigned long last_ram_page(void)
{
    RAMBlock *block;
    ram_addr_t last = 0;

    rcu_read_lock();
    RAMBLOCK_FOREACH(block) {
        last = MAX(last, block->offset + block->max_length);
    }
    rcu_read_unlock();
    return last >> TARGET_PAGE_BITS;
}

static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
{
    int ret;

    /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
    if (!machine_dump_guest_core(current_machine)) {
        ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
        if (ret) {
            perror("qemu_madvise");
            fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
                            "but dump_guest_core=off specified\n");
        }
    }
}

const char *qemu_ram_get_idstr(RAMBlock *rb)
{
    return rb->idstr;
}

bool qemu_ram_is_shared(RAMBlock *rb)
{
    return rb->flags & RAM_SHARED;
}

/* Called with iothread lock held.  */
void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
{
    RAMBlock *block;

    assert(new_block);
    assert(!new_block->idstr[0]);

    if (dev) {
        char *id = qdev_get_dev_path(dev);
        if (id) {
            snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
            g_free(id);
        }
    }
    pstrcat(new_block->idstr, sizeof(new_block->idstr), name);

    rcu_read_lock();
    RAMBLOCK_FOREACH(block) {
        if (block != new_block &&
            !strcmp(block->idstr, new_block->idstr)) {
            fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
                    new_block->idstr);
            abort();
        }
    }
    rcu_read_unlock();
}

/* Called with iothread lock held.  */
void qemu_ram_unset_idstr(RAMBlock *block)
{
    /* FIXME: arch_init.c assumes that this is not called throughout
     * migration.  Ignore the problem since hot-unplug during migration
     * does not work anyway.
     */
    if (block) {
        memset(block->idstr, 0, sizeof(block->idstr));
    }
}

size_t qemu_ram_pagesize(RAMBlock *rb)
{
    return rb->page_size;
}

/* Returns the largest size of page in use */
size_t qemu_ram_pagesize_largest(void)
{
    RAMBlock *block;
    size_t largest = 0;

    RAMBLOCK_FOREACH(block) {
        largest = MAX(largest, qemu_ram_pagesize(block));
    }

    return largest;
}

static int memory_try_enable_merging(void *addr, size_t len)
{
    if (!machine_mem_merge(current_machine)) {
        /* disabled by the user */
        return 0;
    }

    return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
}

/* Only legal before the guest might have detected the memory size: e.g. on
 * incoming migration, or right after reset.
 *
 * As the memory core doesn't know how the memory is accessed, it is up to
 * the resize callback to update device state and/or add assertions to detect
 * misuse, if necessary.
 */
int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
{
    assert(block);

    newsize = HOST_PAGE_ALIGN(newsize);

    if (block->used_length == newsize) {
        return 0;
    }

    if (!(block->flags & RAM_RESIZEABLE)) {
        error_setg_errno(errp, EINVAL,
                         "Length mismatch: %s: 0x" RAM_ADDR_FMT
                         " in != 0x" RAM_ADDR_FMT, block->idstr,
                         newsize, block->used_length);
        return -EINVAL;
    }

    if (block->max_length < newsize) {
        error_setg_errno(errp, EINVAL,
                         "Length too large: %s: 0x" RAM_ADDR_FMT
                         " > 0x" RAM_ADDR_FMT, block->idstr,
                         newsize, block->max_length);
        return -EINVAL;
    }

    cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
    block->used_length = newsize;
    cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
                                        DIRTY_CLIENTS_ALL);
    memory_region_set_size(block->mr, newsize);
    if (block->resized) {
        block->resized(block->idstr, newsize, block->host);
    }
    return 0;
}

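/*
 * Grow the dirty-memory bitmaps so they cover @new_ram_size pages.
 * Readers access ram_list.dirty_memory[] under RCU, so each bitmap array
 * is replaced wholesale by a larger copy and the old one is reclaimed
 * with g_free_rcu() once all readers are done with it.
 */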
/* Called with ram_list.mutex held */
static void dirty_memory_extend(ram_addr_t old_ram_size,
                                ram_addr_t new_ram_size)
{
    ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
                                             DIRTY_MEMORY_BLOCK_SIZE);
    ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
                                             DIRTY_MEMORY_BLOCK_SIZE);
    int i;

    /* Only need to extend if block count increased */
    if (new_num_blocks <= old_num_blocks) {
        return;
    }

    for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
        DirtyMemoryBlocks *old_blocks;
        DirtyMemoryBlocks *new_blocks;
        int j;

        old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
        new_blocks = g_malloc(sizeof(*new_blocks) +
                              sizeof(new_blocks->blocks[0]) * new_num_blocks);

        if (old_num_blocks) {
            memcpy(new_blocks->blocks, old_blocks->blocks,
                   old_num_blocks * sizeof(old_blocks->blocks[0]));
        }

        for (j = old_num_blocks; j < new_num_blocks; j++) {
            new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
        }

        atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);

        if (old_blocks) {
            g_free_rcu(old_blocks, rcu);
        }
    }
}

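/*
 * Add a freshly created RAMBlock to the global, size-sorted block list:
 * allocate the host memory here unless the caller already provided it,
 * extend the dirty bitmaps to cover the new block, and publish it under
 * the ramlist lock (taken and released internally).
 */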
static void ram_block_add(RAMBlock *new_block, Error **errp)
{
    RAMBlock *block;
    RAMBlock *last_block = NULL;
    ram_addr_t old_ram_size, new_ram_size;
    Error *err = NULL;

    old_ram_size = last_ram_page();

    qemu_mutex_lock_ramlist();
    new_block->offset = find_ram_offset(new_block->max_length);

    if (!new_block->host) {
        if (xen_enabled()) {
            xen_ram_alloc(new_block->offset, new_block->max_length,
                          new_block->mr, &err);
            if (err) {
                error_propagate(errp, err);
                qemu_mutex_unlock_ramlist();
                return;
            }
        } else {
            new_block->host = phys_mem_alloc(new_block->max_length,
                                             &new_block->mr->align);
            if (!new_block->host) {
                error_setg_errno(errp, errno,
                                 "cannot set up guest memory '%s'",
                                 memory_region_name(new_block->mr));
                qemu_mutex_unlock_ramlist();
                return;
            }
            memory_try_enable_merging(new_block->host, new_block->max_length);
        }
    }

    new_ram_size = MAX(old_ram_size,
              (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
    if (new_ram_size > old_ram_size) {
        dirty_memory_extend(old_ram_size, new_ram_size);
    }
    /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
     * QLIST (which has an RCU-friendly variant) does not have insertion at
     * tail, so save the last element in last_block.
     */
    RAMBLOCK_FOREACH(block) {
        last_block = block;
        if (block->max_length < new_block->max_length) {
            break;
        }
    }
    if (block) {
        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
    } else if (last_block) {
        QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
    } else { /* list is empty */
        QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
    }
    ram_list.mru_block = NULL;

    /* Write list before version */
    smp_wmb();
    ram_list.version++;
    qemu_mutex_unlock_ramlist();

    cpu_physical_memory_set_dirty_range(new_block->offset,
                                        new_block->used_length,
                                        DIRTY_CLIENTS_ALL);

    if (new_block->host) {
        qemu_ram_setup_dump(new_block->host, new_block->max_length);
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
        /* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
        ram_block_notify_add(new_block->host, new_block->max_length);
    }
}

#ifdef __linux__
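/*
 * Create a RAMBlock for @mr backed by the already-open file descriptor
 * @fd (for example a hugetlbfs or tmpfs file).  On failure @fd is not
 * closed here, so the caller remains responsible for it.
 */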
RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
                                 bool share, int fd,
                                 Error **errp)
{
    RAMBlock *new_block;
    Error *local_err = NULL;
    int64_t file_size;

    if (xen_enabled()) {
        error_setg(errp, "-mem-path not supported with Xen");
        return NULL;
    }

    if (kvm_enabled() && !kvm_has_sync_mmu()) {
        error_setg(errp,
                   "host lacks kvm mmu notifiers, -mem-path unsupported");
        return NULL;
    }

    if (phys_mem_alloc != qemu_anon_ram_alloc) {
        /*
         * file_ram_alloc() needs to allocate just like
         * phys_mem_alloc, but we haven't bothered to provide
         * a hook there.
         */
        error_setg(errp,
                   "-mem-path not supported with this accelerator");
        return NULL;
    }

    size = HOST_PAGE_ALIGN(size);
    file_size = get_file_size(fd);
    if (file_size > 0 && file_size < size) {
        error_setg(errp, "backing store %s size 0x%" PRIx64
                   " does not match 'size' option 0x" RAM_ADDR_FMT,
                   mem_path, file_size, size);
        return NULL;
    }

    new_block = g_malloc0(sizeof(*new_block));
    new_block->mr = mr;
    new_block->used_length = size;
    new_block->max_length = size;
    new_block->flags = share ? RAM_SHARED : 0;
    new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
    if (!new_block->host) {
        g_free(new_block);
        return NULL;
    }

    ram_block_add(new_block, &local_err);
    if (local_err) {
        g_free(new_block);
        error_propagate(errp, local_err);
        return NULL;
    }
    return new_block;

}


RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
                                   bool share, const char *mem_path,
                                   Error **errp)
{
    int fd;
    bool created;
    RAMBlock *block;

    fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp);
    if (fd < 0) {
        return NULL;
    }

    block = qemu_ram_alloc_from_fd(size, mr, share, fd, errp);
    if (!block) {
        if (created) {
            unlink(mem_path);
        }
        close(fd);
        return NULL;
    }

    return block;
}
#endif

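/*
 * Common allocation path behind the qemu_ram_alloc_*() variants: fill in
 * the RAMBlock metadata (sizes are host-page aligned, @max_size >= @size)
 * and hand it to ram_block_add(), which allocates the host memory unless
 * a preexisting @host pointer was supplied.
 */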
static
RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
                                  void (*resized)(const char*,
                                                  uint64_t length,
                                                  void *host),
                                  void *host, bool resizeable,
                                  MemoryRegion *mr, Error **errp)
{
    RAMBlock *new_block;
    Error *local_err = NULL;

    size = HOST_PAGE_ALIGN(size);
    max_size = HOST_PAGE_ALIGN(max_size);
    new_block = g_malloc0(sizeof(*new_block));
    new_block->mr = mr;
    new_block->resized = resized;
    new_block->used_length = size;
    new_block->max_length = max_size;
    assert(max_size >= size);
    new_block->fd = -1;
    new_block->page_size = getpagesize();
    new_block->host = host;
    if (host) {
        new_block->flags |= RAM_PREALLOC;
    }
    if (resizeable) {
        new_block->flags |= RAM_RESIZEABLE;
    }
    ram_block_add(new_block, &local_err);
    if (local_err) {
        g_free(new_block);
        error_propagate(errp, local_err);
        return NULL;
    }
    return new_block;
}

RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
                                   MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp);
}

RAMBlock *qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp);
}

RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
                                     void (*resized)(const char*,
                                                     uint64_t length,
                                                     void *host),
                                     MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp);
}

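/*
 * Release the host resources backing a RAMBlock.  Invoked via call_rcu()
 * from qemu_ram_free() once no RCU reader can still be using the block.
 */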
static void reclaim_ramblock(RAMBlock *block)
{
    if (block->flags & RAM_PREALLOC) {
        ;
    } else if (xen_enabled()) {
        xen_invalidate_map_cache_entry(block->host);
#ifndef _WIN32
    } else if (block->fd >= 0) {
        qemu_ram_munmap(block->host, block->max_length);
        close(block->fd);
#endif
    } else {
        qemu_anon_ram_free(block->host, block->max_length);
    }
    g_free(block);
}

void qemu_ram_free(RAMBlock *block)
{
    if (!block) {
        return;
    }

    if (block->host) {
        ram_block_notify_remove(block->host, block->max_length);
    }

    qemu_mutex_lock_ramlist();
    QLIST_REMOVE_RCU(block, next);
    ram_list.mru_block = NULL;
    /* Write list before version */
    smp_wmb();
    ram_list.version++;
    call_rcu(block, reclaim_ramblock, rcu);
    qemu_mutex_unlock_ramlist();
}

#ifndef _WIN32
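/*
 * Re-create the host mapping for [addr, addr + length) in place
 * (MAP_FIXED), either from the block's backing fd or as fresh anonymous
 * memory.  Aborts for Xen guests, whose RAM cannot be remapped this way.
 */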
void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
{
    RAMBlock *block;
    ram_addr_t offset;
    int flags;
    void *area, *vaddr;

P
Peter Xu 已提交
2131
    RAMBLOCK_FOREACH(block) {
H
Huang Ying 已提交
2132
        offset = addr - block->offset;
2133
        if (offset < block->max_length) {
2134
            vaddr = ramblock_ptr(block, offset);
2135
            if (block->flags & RAM_PREALLOC) {
H
Huang Ying 已提交
2136
                ;
2137 2138
            } else if (xen_enabled()) {
                abort();
H
Huang Ying 已提交
2139 2140
            } else {
                flags = MAP_FIXED;
2141
                if (block->fd >= 0) {
2142 2143
                    flags |= (block->flags & RAM_SHARED ?
                              MAP_SHARED : MAP_PRIVATE);
2144 2145
                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
                                flags, block->fd, offset);
H
Huang Ying 已提交
2146
                } else {
2147 2148 2149 2150 2151 2152 2153
                    /*
                     * Remap needs to match alloc.  Accelerators that
                     * set phys_mem_alloc never remap.  If they did,
                     * we'd need a remap hook here.
                     */
                    assert(phys_mem_alloc == qemu_anon_ram_alloc);

H
Huang Ying 已提交
2154 2155 2156 2157 2158
                    flags |= MAP_PRIVATE | MAP_ANONYMOUS;
                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
                                flags, -1, 0);
                }
                if (area != vaddr) {
2159 2160
                    fprintf(stderr, "Could not remap addr: "
                            RAM_ADDR_FMT "@" RAM_ADDR_FMT "\n",
H
Huang Ying 已提交
2161 2162 2163
                            length, addr);
                    exit(1);
                }
2164
                memory_try_enable_merging(vaddr, length);
2165
                qemu_ram_setup_dump(vaddr, length);
H
Huang Ying 已提交
2166 2167 2168 2169 2170 2171
            }
        }
    }
}
#endif /* !_WIN32 */

2172
/* Return a host pointer to ram allocated with qemu_ram_alloc.
2173 2174 2175
 * This should not be used for general purpose DMA.  Use address_space_map
 * or address_space_rw instead. For local memory (e.g. video ram) that the
 * device owns, use memory_region_get_ram_ptr.
M
Mike Day 已提交
2176
 *
2177
 * Called within RCU critical section.
2178
 */
2179
void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
2180
{
2181 2182 2183 2184
    RAMBlock *block = ram_block;

    if (block == NULL) {
        block = qemu_get_ram_block(addr);
2185
        addr -= block->offset;
2186
    }
2187 2188

    if (xen_enabled() && block->host == NULL) {
2189 2190 2191 2192 2193
        /* We need to check if the requested address is in the RAM
         * because we don't want to map the entire memory in QEMU.
         * In that case just map until the end of the page.
         */
        if (block->offset == 0) {
2194
            return xen_map_cache(addr, 0, 0, false);
2195
        }
2196

2197
        block->host = xen_map_cache(block->offset, block->max_length, 1, false);
2198
    }
2199
    return ramblock_ptr(block, addr);
2200 2201
}

2202
/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
2203
 * but takes a size argument.
M
Mike Day 已提交
2204
 *
2205
 * Called within RCU critical section.
2206
 */
2207
static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
2208
                                 hwaddr *size, bool lock)
2209
{
2210
    RAMBlock *block = ram_block;
2211 2212 2213
    if (*size == 0) {
        return NULL;
    }
2214

2215 2216
    if (block == NULL) {
        block = qemu_get_ram_block(addr);
2217
        addr -= block->offset;
2218
    }
2219
    *size = MIN(*size, block->max_length - addr);
2220 2221 2222 2223 2224 2225 2226

    if (xen_enabled() && block->host == NULL) {
        /* We need to check if the requested address is in the RAM
         * because we don't want to map the entire memory in QEMU.
         * In that case just map the requested area.
         */
        if (block->offset == 0) {
2227
            return xen_map_cache(addr, *size, lock, lock);
2228 2229
        }

2230
        block->host = xen_map_cache(block->offset, block->max_length, 1, lock);
2231
    }
2232

2233
    return ramblock_ptr(block, addr);
2234 2235
}

D
Dr. David Alan Gilbert 已提交
2236 2237 2238 2239 2240 2241 2242 2243 2244 2245
/*
 * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
 * in that RAMBlock.
 *
 * ptr: Host pointer to look up
 * round_offset: If true round the result offset down to a page boundary
 * *ram_addr: set to result ram_addr
 * *offset: set to result offset within the RAMBlock
 *
 * Returns: RAMBlock (or NULL if not found)
2246 2247 2248 2249 2250 2251 2252
 *
 * By the time this function returns, the returned pointer is not protected
 * by RCU anymore.  If the caller is not within an RCU critical section and
 * does not hold the iothread lock, it must have other means of protecting the
 * pointer, such as a reference to the region that includes the incoming
 * ram_addr_t.
 */
D
Dr. David Alan Gilbert 已提交
2253 2254
RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
                                   ram_addr_t *offset)
P
pbrook 已提交
2255
{
P
pbrook 已提交
2256 2257 2258
    RAMBlock *block;
    uint8_t *host = ptr;

2259
    if (xen_enabled()) {
2260
        ram_addr_t ram_addr;
M
Mike Day 已提交
2261
        rcu_read_lock();
2262 2263
        ram_addr = xen_ram_addr_from_mapcache(ptr);
        block = qemu_get_ram_block(ram_addr);
D
Dr. David Alan Gilbert 已提交
2264
        if (block) {
2265
            *offset = ram_addr - block->offset;
D
Dr. David Alan Gilbert 已提交
2266
        }
M
Mike Day 已提交
2267
        rcu_read_unlock();
D
Dr. David Alan Gilbert 已提交
2268
        return block;
2269 2270
    }

M
Mike Day 已提交
2271 2272
    rcu_read_lock();
    block = atomic_rcu_read(&ram_list.mru_block);
2273
    if (block && block->host && host - block->host < block->max_length) {
2274 2275 2276
        goto found;
    }

P
Peter Xu 已提交
2277
    RAMBLOCK_FOREACH(block) {
J
Jun Nakajima 已提交
2278 2279 2280 2281
        /* This case happens when the block is not mapped. */
        if (block->host == NULL) {
            continue;
        }
2282
        if (host - block->host < block->max_length) {
2283
            goto found;
A
Alex Williamson 已提交
2284
        }
P
pbrook 已提交
2285
    }
J
Jun Nakajima 已提交
2286

M
Mike Day 已提交
2287
    rcu_read_unlock();
2288
    return NULL;
2289 2290

found:
D
Dr. David Alan Gilbert 已提交
2291 2292 2293 2294
    *offset = (host - block->host);
    if (round_offset) {
        *offset &= TARGET_PAGE_MASK;
    }
M
Mike Day 已提交
2295
    rcu_read_unlock();
D
Dr. David Alan Gilbert 已提交
2296 2297 2298
    return block;
}

D
Dr. David Alan Gilbert 已提交
2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309
/*
 * Finds the named RAMBlock
 *
 * name: The name of RAMBlock to find
 *
 * Returns: RAMBlock (or NULL if not found)
 */
RAMBlock *qemu_ram_block_by_name(const char *name)
{
    RAMBlock *block;

P
Peter Xu 已提交
2310
    RAMBLOCK_FOREACH(block) {
D
Dr. David Alan Gilbert 已提交
2311 2312 2313 2314 2315 2316 2317 2318
        if (!strcmp(name, block->idstr)) {
            return block;
        }
    }

    return NULL;
}

D
Dr. David Alan Gilbert 已提交
2319 2320
/* Some of the softmmu routines need to translate from a host pointer
   (typically a TLB entry) back to a ram offset.  */
2321
ram_addr_t qemu_ram_addr_from_host(void *ptr)
D
Dr. David Alan Gilbert 已提交
2322 2323
{
    RAMBlock *block;
2324
    ram_addr_t offset;
D
Dr. David Alan Gilbert 已提交
2325

2326
    block = qemu_ram_block_from_host(ptr, false, &offset);
D
Dr. David Alan Gilbert 已提交
2327
    if (!block) {
2328
        return RAM_ADDR_INVALID;
D
Dr. David Alan Gilbert 已提交
2329 2330
    }

2331
    return block->offset + offset;
M
Marcelo Tosatti 已提交
2332
}
A
Alex Williamson 已提交
2333

2334
/* Called within RCU critical section.  */
A
Avi Kivity 已提交
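/*
 * Slow-path write handler used while a RAM page may still contain
 * translated code: invalidate the overlapping TBs, perform the store,
 * and mark the page dirty again so the fast path is restored once no
 * translated code remains in it.
 */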
static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
2336
                               uint64_t val, unsigned size)
2337
{
2338 2339
    bool locked = false;

2340
    assert(tcg_enabled());
2341
    if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
2342 2343
        locked = true;
        tb_lock();
2344
        tb_invalidate_phys_page_fast(ram_addr, size);
2345
    }
2346 2347
    switch (size) {
    case 1:
2348
        stb_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2349 2350
        break;
    case 2:
2351
        stw_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2352 2353
        break;
    case 4:
2354
        stl_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2355 2356 2357
        break;
    default:
        abort();
2358
    }
2359 2360 2361 2362 2363

    if (locked) {
        tb_unlock();
    }

2364 2365 2366 2367 2368
    /* Set both VGA and migration bits for simplicity and to remove
     * the notdirty callback faster.
     */
    cpu_physical_memory_set_dirty_range(ram_addr, size,
                                        DIRTY_CLIENTS_NOCODE);
B
bellard 已提交
2369 2370
    /* we remove the notdirty callback only if the code has been
       flushed */
2371
    if (!cpu_physical_memory_is_clean(ram_addr)) {
2372
        tlb_set_dirty(current_cpu, current_cpu->mem_io_vaddr);
2373
    }
2374 2375
}

2376 2377 2378 2379 2380 2381
static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
                                 unsigned size, bool is_write)
{
    return is_write;
}

2382 2383
static const MemoryRegionOps notdirty_mem_ops = {
    .write = notdirty_mem_write,
2384
    .valid.accepts = notdirty_mem_accepts,
2385
    .endianness = DEVICE_NATIVE_ENDIAN,
2386 2387
};

P
pbrook 已提交
2388
/* Generate a debug exception if a watchpoint has been hit.  */
2389
static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
P
pbrook 已提交
2390
{
2391
    CPUState *cpu = current_cpu;
2392
    CPUClass *cc = CPU_GET_CLASS(cpu);
2393
    CPUArchState *env = cpu->env_ptr;
2394
    target_ulong pc, cs_base;
P
pbrook 已提交
2395
    target_ulong vaddr;
2396
    CPUWatchpoint *wp;
2397
    uint32_t cpu_flags;
P
pbrook 已提交
2398

2399
    assert(tcg_enabled());
2400
    if (cpu->watchpoint_hit) {
2401 2402 2403
        /* We re-entered the check after replacing the TB. Now raise
         * the debug interrupt so that it will trigger after the
         * current instruction. */
2404
        cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
2405 2406
        return;
    }
2407
    vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset;
2408
    vaddr = cc->adjust_watchpoint_address(cpu, vaddr, len);
2409
    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
2410 2411
        if (cpu_watchpoint_address_matches(wp, vaddr, len)
            && (wp->flags & flags)) {
2412 2413 2414 2415 2416 2417
            if (flags == BP_MEM_READ) {
                wp->flags |= BP_WATCHPOINT_HIT_READ;
            } else {
                wp->flags |= BP_WATCHPOINT_HIT_WRITE;
            }
            wp->hitaddr = vaddr;
2418
            wp->hitattrs = attrs;
2419
            if (!cpu->watchpoint_hit) {
2420 2421 2422 2423 2424
                if (wp->flags & BP_CPU &&
                    !cc->debug_check_watchpoint(cpu, wp)) {
                    wp->flags &= ~BP_WATCHPOINT_HIT;
                    continue;
                }
2425
                cpu->watchpoint_hit = wp;
2426

2427 2428 2429
                /* Both tb_lock and iothread_mutex will be reset when
                 * cpu_loop_exit or cpu_loop_exit_noexc longjmp
                 * back into the cpu_exec main loop.
2430 2431
                 */
                tb_lock();
2432
                tb_check_watchpoint(cpu);
2433
                if (wp->flags & BP_STOP_BEFORE_ACCESS) {
2434
                    cpu->exception_index = EXCP_DEBUG;
2435
                    cpu_loop_exit(cpu);
2436 2437
                } else {
                    cpu_get_tb_cpu_state(env, &pc, &cs_base, &cpu_flags);
2438
                    tb_gen_code(cpu, pc, cs_base, cpu_flags, 1);
2439
                    cpu_loop_exit_noexc(cpu);
2440
                }
2441
            }
2442 2443
        } else {
            wp->flags &= ~BP_WATCHPOINT_HIT;
P
pbrook 已提交
2444 2445 2446 2447
        }
    }
}

2448 2449 2450
/* Watchpoint access routines.  Watchpoints are inserted using TLB tricks,
   so these check for a hit then pass through to the normal out-of-line
   phys routines.  */
2451 2452
static MemTxResult watch_mem_read(void *opaque, hwaddr addr, uint64_t *pdata,
                                  unsigned size, MemTxAttrs attrs)
2453
{
2454 2455
    MemTxResult res;
    uint64_t data;
2456 2457
    int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
    AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2458 2459

    check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_READ);
2460
    switch (size) {
2461
    case 1:
2462
        data = address_space_ldub(as, addr, attrs, &res);
2463 2464
        break;
    case 2:
2465
        data = address_space_lduw(as, addr, attrs, &res);
2466 2467
        break;
    case 4:
2468
        data = address_space_ldl(as, addr, attrs, &res);
2469
        break;
2470 2471
    default: abort();
    }
2472 2473
    *pdata = data;
    return res;
2474 2475
}

2476 2477 2478
static MemTxResult watch_mem_write(void *opaque, hwaddr addr,
                                   uint64_t val, unsigned size,
                                   MemTxAttrs attrs)
2479
{
2480
    MemTxResult res;
2481 2482
    int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
    AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2483 2484

    check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_WRITE);
2485
    switch (size) {
2486
    case 1:
2487
        address_space_stb(as, addr, val, attrs, &res);
2488 2489
        break;
    case 2:
2490
        address_space_stw(as, addr, val, attrs, &res);
2491 2492
        break;
    case 4:
2493
        address_space_stl(as, addr, val, attrs, &res);
2494
        break;
2495 2496
    default: abort();
    }
2497
    return res;
2498 2499
}

2500
static const MemoryRegionOps watch_mem_ops = {
2501 2502
    .read_with_attrs = watch_mem_read,
    .write_with_attrs = watch_mem_write,
2503
    .endianness = DEVICE_NATIVE_ENDIAN,
2504 2505
};

2506 2507
static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
                                unsigned len, MemTxAttrs attrs)
2508
{
2509
    subpage_t *subpage = opaque;
2510
    uint8_t buf[8];
2511
    MemTxResult res;
2512

2513
#if defined(DEBUG_SUBPAGE)
A
Amos Kong 已提交
2514
    printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
2515
           subpage, len, addr);
2516
#endif
2517 2518 2519 2520
    res = address_space_read(subpage->as, addr + subpage->base,
                             attrs, buf, len);
    if (res) {
        return res;
2521
    }
2522 2523
    switch (len) {
    case 1:
2524 2525
        *data = ldub_p(buf);
        return MEMTX_OK;
2526
    case 2:
2527 2528
        *data = lduw_p(buf);
        return MEMTX_OK;
2529
    case 4:
2530 2531
        *data = ldl_p(buf);
        return MEMTX_OK;
2532
    case 8:
2533 2534
        *data = ldq_p(buf);
        return MEMTX_OK;
2535 2536 2537
    default:
        abort();
    }
2538 2539
}

2540 2541
static MemTxResult subpage_write(void *opaque, hwaddr addr,
                                 uint64_t value, unsigned len, MemTxAttrs attrs)
2542
{
2543
    subpage_t *subpage = opaque;
2544
    uint8_t buf[8];
2545

2546
#if defined(DEBUG_SUBPAGE)
A
Amos Kong 已提交
2547
    printf("%s: subpage %p len %u addr " TARGET_FMT_plx
2548 2549
           " value %"PRIx64"\n",
           __func__, subpage, len, addr, value);
2550
#endif
2551 2552 2553 2554 2555 2556 2557 2558 2559 2560
    switch (len) {
    case 1:
        stb_p(buf, value);
        break;
    case 2:
        stw_p(buf, value);
        break;
    case 4:
        stl_p(buf, value);
        break;
2561 2562 2563
    case 8:
        stq_p(buf, value);
        break;
2564 2565 2566
    default:
        abort();
    }
2567 2568
    return address_space_write(subpage->as, addr + subpage->base,
                               attrs, buf, len);
2569 2570
}

2571
static bool subpage_accepts(void *opaque, hwaddr addr,
A
Amos Kong 已提交
2572
                            unsigned len, bool is_write)
2573
{
2574
    subpage_t *subpage = opaque;
2575
#if defined(DEBUG_SUBPAGE)
A
Amos Kong 已提交
2576
    printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
2577
           __func__, subpage, is_write ? 'w' : 'r', len, addr);
2578 2579
#endif

2580
    return address_space_access_valid(subpage->as, addr + subpage->base,
A
Amos Kong 已提交
2581
                                      len, is_write);
2582 2583
}

2584
static const MemoryRegionOps subpage_ops = {
2585 2586
    .read_with_attrs = subpage_read,
    .write_with_attrs = subpage_write,
2587 2588 2589 2590
    .impl.min_access_size = 1,
    .impl.max_access_size = 8,
    .valid.min_access_size = 1,
    .valid.max_access_size = 8,
2591
    .valid.accepts = subpage_accepts,
2592
    .endianness = DEVICE_NATIVE_ENDIAN,
2593 2594
};

A
Anthony Liguori 已提交
2595
static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
2596
                             uint16_t section)
2597 2598 2599 2600 2601 2602 2603 2604
{
    int idx, eidx;

    if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
        return -1;
    idx = SUBPAGE_IDX(start);
    eidx = SUBPAGE_IDX(end);
#if defined(DEBUG_SUBPAGE)
A
Amos Kong 已提交
2605 2606
    printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
           __func__, mmio, start, end, idx, eidx, section);
2607 2608
#endif
    for (; idx <= eidx; idx++) {
2609
        mmio->sub_section[idx] = section;
2610 2611 2612 2613 2614
    }

    return 0;
}

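/*
 * A subpage splits a single target page so that several regions smaller
 * than TARGET_PAGE_SIZE can share it: each sub-range records the section
 * that owns it, and accesses are forwarded to the owning address space
 * at subpage->base + addr.
 */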
static subpage_t *subpage_init(AddressSpace *as, hwaddr base)
2616
{
A
Anthony Liguori 已提交
2617
    subpage_t *mmio;
2618

2619
    mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
2620
    mmio->as = as;
2621
    mmio->base = base;
2622
    memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
P
Peter Crosthwaite 已提交
2623
                          NULL, TARGET_PAGE_SIZE);
A
Avi Kivity 已提交
2624
    mmio->iomem.subpage = true;
2625
#if defined(DEBUG_SUBPAGE)
A
Amos Kong 已提交
2626 2627
    printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
           mmio, base, TARGET_PAGE_SIZE);
2628
#endif
2629
    subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, PHYS_SECTION_UNASSIGNED);
2630 2631 2632 2633

    return mmio;
}

2634 2635
static uint16_t dummy_section(PhysPageMap *map, AddressSpace *as,
                              MemoryRegion *mr)
2636
{
2637
    assert(as);
2638
    MemoryRegionSection section = {
2639
        .address_space = as,
2640 2641 2642
        .mr = mr,
        .offset_within_address_space = 0,
        .offset_within_region = 0,
2643
        .size = int128_2_64(),
2644 2645
    };

2646
    return phys_section_add(map, &section);
2647 2648
}

2649
MemoryRegion *iotlb_to_region(CPUState *cpu, hwaddr index, MemTxAttrs attrs)
2650
{
2651 2652
    int asidx = cpu_asidx_from_attrs(cpu, attrs);
    CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
2653
    AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
2654
    MemoryRegionSection *sections = d->map.sections;
P
Paolo Bonzini 已提交
2655 2656

    return sections[index & ~TARGET_PAGE_MASK].mr;
2657 2658
}

A
Avi Kivity 已提交
2659 2660
static void io_mem_init(void)
{
2661
    memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
2662
    memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
2663
                          NULL, UINT64_MAX);
2664 2665 2666 2667

    /* io_mem_notdirty calls tb_invalidate_phys_page_fast,
     * which can be called without the iothread mutex.
     */
2668
    memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
2669
                          NULL, UINT64_MAX);
2670 2671
    memory_region_clear_global_locking(&io_mem_notdirty);

2672
    memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
2673
                          NULL, UINT64_MAX);
A
Avi Kivity 已提交
2674 2675
}

A
Avi Kivity 已提交
2676
static void mem_begin(MemoryListener *listener)
2677 2678
{
    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
2679 2680 2681
    AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
    uint16_t n;

2682
    n = dummy_section(&d->map, as, &io_mem_unassigned);
2683
    assert(n == PHYS_SECTION_UNASSIGNED);
2684
    n = dummy_section(&d->map, as, &io_mem_notdirty);
2685
    assert(n == PHYS_SECTION_NOTDIRTY);
2686
    n = dummy_section(&d->map, as, &io_mem_rom);
2687
    assert(n == PHYS_SECTION_ROM);
2688
    n = dummy_section(&d->map, as, &io_mem_watch);
2689
    assert(n == PHYS_SECTION_WATCH);
2690

M
Michael S. Tsirkin 已提交
2691
    d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
2692 2693 2694 2695
    d->as = as;
    as->next_dispatch = d;
}

2696 2697 2698 2699 2700 2701
static void address_space_dispatch_free(AddressSpaceDispatch *d)
{
    phys_sections_free(&d->map);
    g_free(d);
}

2702
static void mem_commit(MemoryListener *listener)
A
Avi Kivity 已提交
2703
{
2704
    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
2705 2706 2707
    AddressSpaceDispatch *cur = as->dispatch;
    AddressSpaceDispatch *next = as->next_dispatch;

2708
    phys_page_compact_all(next, next->map.nodes_nb);
2709

2710
    atomic_rcu_set(&as->dispatch, next);
2711
    if (cur) {
2712
        call_rcu(cur, address_space_dispatch_free, rcu);
2713
    }
2714 2715
}

2716
static void tcg_commit(MemoryListener *listener)
2717
{
2718 2719
    CPUAddressSpace *cpuas;
    AddressSpaceDispatch *d;
2720 2721 2722

    /* since each CPU stores ram addresses in its TLB cache, we must
       reset the modified entries */
2723 2724 2725 2726 2727 2728 2729
    cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
    cpu_reloading_memory_map();
    /* The CPU and TLB are protected by the iothread lock.
     * We reload the dispatch pointer now because cpu_reloading_memory_map()
     * may have split the RCU critical section.
     */
    d = atomic_rcu_read(&cpuas->as->dispatch);
2730
    atomic_rcu_set(&cpuas->memory_dispatch, d);
2731
    tlb_flush(cpuas->cpu);
2732 2733
}

A
Avi Kivity 已提交
2734 2735
void address_space_init_dispatch(AddressSpace *as)
{
2736
    as->dispatch = NULL;
2737
    as->dispatch_listener = (MemoryListener) {
A
Avi Kivity 已提交
2738
        .begin = mem_begin,
2739
        .commit = mem_commit,
A
Avi Kivity 已提交
2740 2741 2742 2743
        .region_add = mem_add,
        .region_nop = mem_add,
        .priority = 0,
    };
2744
    memory_listener_register(&as->dispatch_listener, as);
A
Avi Kivity 已提交
2745 2746
}

2747 2748 2749 2750 2751
void address_space_unregister(AddressSpace *as)
{
    memory_listener_unregister(&as->dispatch_listener);
}

A
Avi Kivity 已提交
2752 2753 2754 2755
void address_space_destroy_dispatch(AddressSpace *as)
{
    AddressSpaceDispatch *d = as->dispatch;

2756 2757 2758 2759
    atomic_rcu_set(&as->dispatch, NULL);
    if (d) {
        call_rcu(d, address_space_dispatch_free, rcu);
    }
A
Avi Kivity 已提交
2760 2761
}

A
Avi Kivity 已提交
2762 2763
static void memory_map_init(void)
{
2764
    system_memory = g_malloc(sizeof(*system_memory));
2765

2766
    memory_region_init(system_memory, NULL, "system", UINT64_MAX);
2767
    address_space_init(&address_space_memory, system_memory, "memory");
2768

2769
    system_io = g_malloc(sizeof(*system_io));
2770 2771
    memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
                          65536);
2772
    address_space_init(&address_space_io, system_io, "I/O");
A
Avi Kivity 已提交
2773 2774 2775 2776 2777 2778 2779
}

MemoryRegion *get_system_memory(void)
{
    return system_memory;
}

2780 2781 2782 2783 2784
MemoryRegion *get_system_io(void)
{
    return system_io;
}

2785 2786
#endif /* !defined(CONFIG_USER_ONLY) */

B
bellard 已提交
2787 2788
/* physical memory access (slow version, mainly for debug) */
#if defined(CONFIG_USER_ONLY)
2789
int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
P
Paul Brook 已提交
2790
                        uint8_t *buf, int len, int is_write)
B
bellard 已提交
2791 2792 2793
{
    int l, flags;
    target_ulong page;
2794
    void * p;
B
bellard 已提交
2795 2796 2797 2798 2799 2800 2801 2802

    while (len > 0) {
        page = addr & TARGET_PAGE_MASK;
        l = (page + TARGET_PAGE_SIZE) - addr;
        if (l > len)
            l = len;
        flags = page_get_flags(page);
        if (!(flags & PAGE_VALID))
P
Paul Brook 已提交
2803
            return -1;
B
bellard 已提交
2804 2805
        if (is_write) {
            if (!(flags & PAGE_WRITE))
P
Paul Brook 已提交
2806
                return -1;
2807
            /* XXX: this code should not depend on lock_user */
A
aurel32 已提交
2808
            if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
P
Paul Brook 已提交
2809
                return -1;
A
aurel32 已提交
2810 2811
            memcpy(p, buf, l);
            unlock_user(p, addr, l);
B
bellard 已提交
2812 2813
        } else {
            if (!(flags & PAGE_READ))
P
Paul Brook 已提交
2814
                return -1;
2815
            /* XXX: this code should not depend on lock_user */
A
aurel32 已提交
2816
            if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
P
Paul Brook 已提交
2817
                return -1;
A
aurel32 已提交
2818
            memcpy(buf, p, l);
A
aurel32 已提交
2819
            unlock_user(p, addr, 0);
B
bellard 已提交
2820 2821 2822 2823 2824
        }
        len -= l;
        buf += l;
        addr += l;
    }
P
Paul Brook 已提交
2825
    return 0;
B
bellard 已提交
2826
}
B
bellard 已提交
2827

B
bellard 已提交
2828
#else
2829

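/*
 * After guest RAM has been written by the memory API slow path,
 * invalidate any TBs translated from the range and update the dirty
 * bitmaps for the clients (VGA, migration, code) that still track it.
 */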
static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
A
Avi Kivity 已提交
2831
                                     hwaddr length)
2832
{
2833
    uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
2834 2835
    addr += memory_region_get_ram_addr(mr);

2836 2837 2838 2839 2840 2841 2842 2843 2844
    /* No early return if dirty_log_mask is or becomes 0, because
     * cpu_physical_memory_set_dirty_range will still call
     * xen_modified_memory.
     */
    if (dirty_log_mask) {
        dirty_log_mask =
            cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
    }
    if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
2845
        assert(tcg_enabled());
2846
        tb_lock();
2847
        tb_invalidate_phys_range(addr, addr + length);
2848
        tb_unlock();
2849
        dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
2850
    }
2851
    cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
2852 2853
}

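/*
 * Clamp an access of @l bytes at @addr to what the region can accept:
 * at most .valid.max_access_size (1-4 bytes if unspecified), reduced
 * further by the address alignment when unaligned accesses are not
 * supported, and rounded down to a power of two.
 */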
static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
2855
{
2856
    unsigned access_size_max = mr->ops->valid.max_access_size;
2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869

    /* Regions are assumed to support 1-4 byte accesses unless
       otherwise specified.  */
    if (access_size_max == 0) {
        access_size_max = 4;
    }

    /* Bound the maximum access by the alignment of the address.  */
    if (!mr->ops->impl.unaligned) {
        unsigned align_size_max = addr & -addr;
        if (align_size_max != 0 && align_size_max < access_size_max) {
            access_size_max = align_size_max;
        }
2870
    }
2871 2872 2873 2874

    /* Don't attempt accesses larger than the maximum.  */
    if (l > access_size_max) {
        l = access_size_max;
2875
    }
2876
    l = pow2floor(l);
2877 2878

    return l;
2879 2880
}

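/*
 * Take the iothread lock before dispatching to an MMIO region that needs
 * it, flushing coalesced MMIO along the way.  Returns true if the caller
 * must drop the lock again after the access.
 */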
static bool prepare_mmio_access(MemoryRegion *mr)
2882
{
2883 2884 2885 2886 2887 2888 2889 2890
    bool unlocked = !qemu_mutex_iothread_locked();
    bool release_lock = false;

    if (unlocked && mr->global_locking) {
        qemu_mutex_lock_iothread();
        unlocked = false;
        release_lock = true;
    }
2891
    if (mr->flush_coalesced_mmio) {
2892 2893 2894
        if (unlocked) {
            qemu_mutex_lock_iothread();
        }
2895
        qemu_flush_coalesced_mmio_buffer();
2896 2897 2898
        if (unlocked) {
            qemu_mutex_unlock_iothread();
        }
2899
    }
2900 2901

    return release_lock;
2902 2903
}

2904 2905 2906 2907 2908 2909
/* Called within RCU critical section.  */
static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr,
                                                MemTxAttrs attrs,
                                                const uint8_t *buf,
                                                int len, hwaddr addr1,
                                                hwaddr l, MemoryRegion *mr)
B
bellard 已提交
2910 2911
{
    uint8_t *ptr;
2912
    uint64_t val;
2913
    MemTxResult result = MEMTX_OK;
2914
    bool release_lock = false;
2915

2916
    for (;;) {
2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930
        if (!memory_access_is_direct(mr, true)) {
            release_lock |= prepare_mmio_access(mr);
            l = memory_access_size(mr, l, addr1);
            /* XXX: could force current_cpu to NULL to avoid
               potential bugs */
            switch (l) {
            case 8:
                /* 64 bit write access */
                val = ldq_p(buf);
                result |= memory_region_dispatch_write(mr, addr1, val, 8,
                                                       attrs);
                break;
            case 4:
                /* 32 bit write access */
2931
                val = (uint32_t)ldl_p(buf);
2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948
                result |= memory_region_dispatch_write(mr, addr1, val, 4,
                                                       attrs);
                break;
            case 2:
                /* 16 bit write access */
                val = lduw_p(buf);
                result |= memory_region_dispatch_write(mr, addr1, val, 2,
                                                       attrs);
                break;
            case 1:
                /* 8 bit write access */
                val = ldub_p(buf);
                result |= memory_region_dispatch_write(mr, addr1, val, 1,
                                                       attrs);
                break;
            default:
                abort();
B
bellard 已提交
2949 2950
            }
        } else {
2951
            /* RAM case */
2952
            ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
2953 2954
            memcpy(ptr, buf, l);
            invalidate_and_set_dirty(mr, addr1, l);
B
bellard 已提交
2955
        }
2956 2957 2958 2959 2960 2961

        if (release_lock) {
            qemu_mutex_unlock_iothread();
            release_lock = false;
        }

B
bellard 已提交
2962 2963 2964
        len -= l;
        buf += l;
        addr += l;
2965 2966 2967 2968 2969 2970 2971

        if (!len) {
            break;
        }

        l = len;
        mr = address_space_translate(as, addr, &addr1, &l, true);
B
bellard 已提交
2972
    }
2973

2974
    return result;
B
bellard 已提交
2975
}
B
bellard 已提交
2976

2977 2978
MemTxResult address_space_write(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
                                const uint8_t *buf, int len)
A
Avi Kivity 已提交
2979
{
2980 2981 2982 2983 2984
    hwaddr l;
    hwaddr addr1;
    MemoryRegion *mr;
    MemTxResult result = MEMTX_OK;

2985 2986
    if (len > 0) {
        rcu_read_lock();
2987
        l = len;
2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006
        mr = address_space_translate(as, addr, &addr1, &l, true);
        result = address_space_write_continue(as, addr, attrs, buf, len,
                                              addr1, l, mr);
        rcu_read_unlock();
    }

    return result;
}

/* Called within RCU critical section.  */
MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr,
                                        MemTxAttrs attrs, uint8_t *buf,
                                        int len, hwaddr addr1, hwaddr l,
                                        MemoryRegion *mr)
{
    uint8_t *ptr;
    uint64_t val;
    MemTxResult result = MEMTX_OK;
    bool release_lock = false;
3007

3008
    for (;;) {
3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042
        if (!memory_access_is_direct(mr, false)) {
            /* I/O case */
            release_lock |= prepare_mmio_access(mr);
            l = memory_access_size(mr, l, addr1);
            switch (l) {
            case 8:
                /* 64 bit read access */
                result |= memory_region_dispatch_read(mr, addr1, &val, 8,
                                                      attrs);
                stq_p(buf, val);
                break;
            case 4:
                /* 32 bit read access */
                result |= memory_region_dispatch_read(mr, addr1, &val, 4,
                                                      attrs);
                stl_p(buf, val);
                break;
            case 2:
                /* 16 bit read access */
                result |= memory_region_dispatch_read(mr, addr1, &val, 2,
                                                      attrs);
                stw_p(buf, val);
                break;
            case 1:
                /* 8 bit read access */
                result |= memory_region_dispatch_read(mr, addr1, &val, 1,
                                                      attrs);
                stb_p(buf, val);
                break;
            default:
                abort();
            }
        } else {
            /* RAM case */
3043
            ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054
            memcpy(buf, ptr, l);
        }

        if (release_lock) {
            qemu_mutex_unlock_iothread();
            release_lock = false;
        }

        len -= l;
        buf += l;
        addr += l;
3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066

        if (!len) {
            break;
        }

        l = len;
        mr = address_space_translate(as, addr, &addr1, &l, false);
    }

    return result;
}

3067 3068
MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
                                    MemTxAttrs attrs, uint8_t *buf, int len)
3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081
{
    hwaddr l;
    hwaddr addr1;
    MemoryRegion *mr;
    MemTxResult result = MEMTX_OK;

    if (len > 0) {
        rcu_read_lock();
        l = len;
        mr = address_space_translate(as, addr, &addr1, &l, false);
        result = address_space_read_continue(as, addr, attrs, buf, len,
                                             addr1, l, mr);
        rcu_read_unlock();
3082 3083 3084
    }

    return result;
A
Avi Kivity 已提交
3085 3086
}

3087 3088 3089 3090 3091 3092 3093 3094 3095
MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
                             uint8_t *buf, int len, bool is_write)
{
    if (is_write) {
        return address_space_write(as, addr, attrs, (uint8_t *)buf, len);
    } else {
        return address_space_read(as, addr, attrs, (uint8_t *)buf, len);
    }
}
A
Avi Kivity 已提交
3096

A
Avi Kivity 已提交
3097
void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
A
Avi Kivity 已提交
3098 3099
                            int len, int is_write)
{
3100 3101
    address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
                     buf, len, is_write);
A
Avi Kivity 已提交
3102 3103
}

3104 3105 3106 3107 3108
enum write_rom_type {
    WRITE_DATA,
    FLUSH_CACHE,
};

3109
static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
3110
    hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
B
bellard 已提交
3111
{
3112
    hwaddr l;
B
bellard 已提交
3113
    uint8_t *ptr;
3114
    hwaddr addr1;
3115
    MemoryRegion *mr;
3116

3117
    rcu_read_lock();
B
bellard 已提交
3118
    while (len > 0) {
3119
        l = len;
3120
        mr = address_space_translate(as, addr, &addr1, &l, true);
3121

3122 3123
        if (!(memory_region_is_ram(mr) ||
              memory_region_is_romd(mr))) {
3124
            l = memory_access_size(mr, l, addr1);
B
bellard 已提交
3125 3126
        } else {
            /* ROM/RAM case */
3127
            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3128 3129 3130
            switch (type) {
            case WRITE_DATA:
                memcpy(ptr, buf, l);
3131
                invalidate_and_set_dirty(mr, addr1, l);
3132 3133 3134 3135 3136
                break;
            case FLUSH_CACHE:
                flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
                break;
            }
B
bellard 已提交
3137 3138 3139 3140 3141
        }
        len -= l;
        buf += l;
        addr += l;
    }
3142
    rcu_read_unlock();
B
bellard 已提交
3143 3144
}

3145
/* used for ROM loading : can write in RAM and ROM */
3146
void cpu_physical_memory_write_rom(AddressSpace *as, hwaddr addr,
3147 3148
                                   const uint8_t *buf, int len)
{
3149
    cpu_physical_memory_write_rom_internal(as, addr, buf, len, WRITE_DATA);
3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163
}

void cpu_flush_icache_range(hwaddr start, int len)
{
    /*
     * This function should do the same thing as an icache flush that was
     * triggered from within the guest. For TCG we are always cache coherent,
     * so there is no need to flush anything. For KVM / Xen we need to flush
     * the host's instruction cache at least.
     */
    if (tcg_enabled()) {
        return;
    }

3164 3165
    cpu_physical_memory_write_rom_internal(&address_space_memory,
                                           start, NULL, len, FLUSH_CACHE);
3166 3167
}

3168
typedef struct {
3169
    MemoryRegion *mr;
3170
    void *buffer;
A
Avi Kivity 已提交
3171 3172
    hwaddr addr;
    hwaddr len;
F
Fam Zheng 已提交
3173
    bool in_use;
3174 3175 3176 3177
} BounceBuffer;

static BounceBuffer bounce;

3178
typedef struct MapClient {
3179
    QEMUBH *bh;
B
Blue Swirl 已提交
3180
    QLIST_ENTRY(MapClient) link;
3181 3182
} MapClient;

3183
QemuMutex map_client_list_lock;
B
Blue Swirl 已提交
3184 3185
static QLIST_HEAD(map_client_list, MapClient) map_client_list
    = QLIST_HEAD_INITIALIZER(map_client_list);
3186

3187 3188 3189 3190 3191 3192
static void cpu_unregister_map_client_do(MapClient *client)
{
    QLIST_REMOVE(client, link);
    g_free(client);
}

3193 3194 3195 3196 3197 3198
static void cpu_notify_map_clients_locked(void)
{
    MapClient *client;

    while (!QLIST_EMPTY(&map_client_list)) {
        client = QLIST_FIRST(&map_client_list);
3199 3200
        qemu_bh_schedule(client->bh);
        cpu_unregister_map_client_do(client);
3201 3202 3203
    }
}

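/*
 * Register a bottom half to be scheduled once the bounce buffer used by
 * address_space_map() is free again, so that a mapping attempt which
 * failed because the buffer was busy can be retried.
 */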
void cpu_register_map_client(QEMUBH *bh)
3205
{
3206
    MapClient *client = g_malloc(sizeof(*client));
3207

3208
    qemu_mutex_lock(&map_client_list_lock);
3209
    client->bh = bh;
B
Blue Swirl 已提交
3210
    QLIST_INSERT_HEAD(&map_client_list, client, link);
3211 3212 3213
    if (!atomic_read(&bounce.in_use)) {
        cpu_notify_map_clients_locked();
    }
3214
    qemu_mutex_unlock(&map_client_list_lock);
3215 3216
}

3217
void cpu_exec_init_all(void)
3218
{
3219
    qemu_mutex_init(&ram_list.mutex);
3220 3221 3222 3223 3224 3225 3226 3227
    /* The data structures we set up here depend on knowing the page size,
     * so no more changes can be made after this point.
     * In an ideal world, nothing we did before we had finished the
     * machine setup would care about the target page size, and we could
     * do this much later, rather than requiring board models to state
     * up front what their requirements are.
     */
    finalize_target_page_bits();
3228
    io_mem_init();
3229
    memory_map_init();
3230
    qemu_mutex_init(&map_client_list_lock);
3231 3232
}

3233
void cpu_unregister_map_client(QEMUBH *bh)
3234 3235 3236
{
    MapClient *client;

3237 3238 3239 3240 3241 3242
    qemu_mutex_lock(&map_client_list_lock);
    QLIST_FOREACH(client, &map_client_list, link) {
        if (client->bh == bh) {
            cpu_unregister_map_client_do(client);
            break;
        }
3243
    }
3244
    qemu_mutex_unlock(&map_client_list_lock);
3245 3246 3247 3248
}

static void cpu_notify_map_clients(void)
{
3249
    qemu_mutex_lock(&map_client_list_lock);
3250
    cpu_notify_map_clients_locked();
3251
    qemu_mutex_unlock(&map_client_list_lock);
3252 3253
}

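/*
 * Sketch of the retry pattern the map-client list supports (the callback
 * name and its state struct are hypothetical): when address_space_map()
 * fails because the single bounce buffer is busy, a caller can register
 * a bottom half that gets scheduled as soon as the buffer is released.
 *
 *     static void my_dma_retry_bh(void *opaque)        // hypothetical
 *     {
 *         MyDMAState *s = opaque;                      // hypothetical
 *         ... call address_space_map() again ...
 *     }
 *
 *     s->bh = qemu_bh_new(my_dma_retry_bh, s);
 *     cpu_register_map_client(s->bh);
 *     // later, once the mapping succeeded or the device is reset:
 *     cpu_unregister_map_client(s->bh);
 */
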
bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_write)
{
    MemoryRegion *mr;
    hwaddr l, xlat;

    rcu_read_lock();
    while (len > 0) {
        l = len;
        mr = address_space_translate(as, addr, &xlat, &l, is_write);
        if (!memory_access_is_direct(mr, is_write)) {
            l = memory_access_size(mr, l, addr);
            if (!memory_region_access_valid(mr, xlat, l, is_write)) {
                rcu_read_unlock();
                return false;
            }
        }

        len -= l;
        addr += l;
    }
    rcu_read_unlock();
    return true;
}

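/*
 * Illustrative guard (the address and length are assumptions): a device
 * model can probe a DMA window before committing to the transfer.
 *
 *     if (!address_space_access_valid(&address_space_memory,
 *                                     0x40000000, 4096, true)) {
 *         ... raise a bus error instead of touching the range ...
 *     }
 */
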
static hwaddr
address_space_extend_translation(AddressSpace *as, hwaddr addr, hwaddr target_len,
                                 MemoryRegion *mr, hwaddr base, hwaddr len,
                                 bool is_write)
{
    hwaddr done = 0;
    hwaddr xlat;
    MemoryRegion *this_mr;

    for (;;) {
        target_len -= len;
        addr += len;
        done += len;
        if (target_len == 0) {
            return done;
        }

        len = target_len;
        this_mr = address_space_translate(as, addr, &xlat, &len, is_write);
        if (this_mr != mr || xlat != base + done) {
            return done;
        }
    }
}

/* Map a physical memory region into a host virtual address.
 * May map a subset of the requested range, given by and returned in *plen.
 * May return NULL if resources needed to perform the mapping are exhausted.
 * Use only for reads OR writes - not for read-modify-write operations.
 * Use cpu_register_map_client() to know when retrying the map operation is
 * likely to succeed.
 */
void *address_space_map(AddressSpace *as,
                        hwaddr addr,
                        hwaddr *plen,
                        bool is_write)
{
    hwaddr len = *plen;
    hwaddr l, xlat;
    MemoryRegion *mr;
    void *ptr;

    if (len == 0) {
        return NULL;
    }

    l = len;
    rcu_read_lock();
    mr = address_space_translate(as, addr, &xlat, &l, is_write);

    if (!memory_access_is_direct(mr, is_write)) {
        if (atomic_xchg(&bounce.in_use, true)) {
            rcu_read_unlock();
            return NULL;
        }
        /* Avoid unbounded allocations */
        l = MIN(l, TARGET_PAGE_SIZE);
        bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
        bounce.addr = addr;
        bounce.len = l;

        memory_region_ref(mr);
        bounce.mr = mr;
        if (!is_write) {
            address_space_read(as, addr, MEMTXATTRS_UNSPECIFIED,
                               bounce.buffer, l);
        }

        rcu_read_unlock();
        *plen = l;
        return bounce.buffer;
    }

    memory_region_ref(mr);
    *plen = address_space_extend_translation(as, addr, len, mr, xlat, l, is_write);
    ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
    rcu_read_unlock();

    return ptr;
}

/* Unmaps a memory region previously mapped by address_space_map().
 * Will also mark the memory as dirty if is_write == 1.  access_len gives
 * the amount of memory that was actually read or written by the caller.
 */
void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
                         int is_write, hwaddr access_len)
{
    if (buffer != bounce.buffer) {
        MemoryRegion *mr;
        ram_addr_t addr1;

        mr = memory_region_from_host(buffer, &addr1);
        assert(mr != NULL);
        if (is_write) {
            invalidate_and_set_dirty(mr, addr1, access_len);
        }
        if (xen_enabled()) {
            xen_invalidate_map_cache_entry(buffer);
        }
        memory_region_unref(mr);
        return;
    }
    if (is_write) {
        address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
                            bounce.buffer, access_len);
    }
    qemu_vfree(bounce.buffer);
    bounce.buffer = NULL;
    memory_region_unref(bounce.mr);
    atomic_mb_set(&bounce.in_use, false);
    cpu_notify_map_clients();
}

void *cpu_physical_memory_map(hwaddr addr,
                              hwaddr *plen,
                              int is_write)
{
    return address_space_map(&address_space_memory, addr, plen, is_write);
}

void cpu_physical_memory_unmap(void *buffer, hwaddr len,
                               int is_write, hwaddr access_len)
{
    return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
}

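/*
 * Minimal map/unmap sketch (the address and length are made-up values):
 * a mapping is good for reads or for writes but not both, and the number
 * of bytes actually touched is reported back through the unmap call so
 * that only that much memory is marked dirty.
 *
 *     hwaddr plen = 4096;
 *     void *p = cpu_physical_memory_map(0x10000, &plen, 1);
 *     if (p) {
 *         memset(p, 0, plen);                   // write up to plen bytes
 *         cpu_physical_memory_unmap(p, plen, 1, plen);
 *     } else {
 *         ... fall back, or register a map client and retry later ...
 *     }
 */
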
#define ARG1_DECL                AddressSpace *as
#define ARG1                     as
#define SUFFIX
#define TRANSLATE(...)           address_space_translate(as, __VA_ARGS__)
#define IS_DIRECT(mr, is_write)  memory_access_is_direct(mr, is_write)
#define MAP_RAM(mr, ofs)         qemu_map_ram_ptr((mr)->ram_block, ofs)
#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
#define RCU_READ_LOCK(...)       rcu_read_lock()
#define RCU_READ_UNLOCK(...)     rcu_read_unlock()
#include "memory_ldst.inc.c"
int64_t address_space_cache_init(MemoryRegionCache *cache,
                                 AddressSpace *as,
                                 hwaddr addr,
                                 hwaddr len,
                                 bool is_write)
{
    cache->len = len;
    cache->as = as;
    cache->xlat = addr;
    return len;
}

void address_space_cache_invalidate(MemoryRegionCache *cache,
                                    hwaddr addr,
                                    hwaddr access_len)
{
}

void address_space_cache_destroy(MemoryRegionCache *cache)
{
    cache->as = NULL;
}

#define ARG1_DECL                MemoryRegionCache *cache
#define ARG1                     cache
#define SUFFIX                   _cached
#define TRANSLATE(addr, ...)     \
    address_space_translate(cache->as, cache->xlat + (addr), __VA_ARGS__)
#define IS_DIRECT(mr, is_write)  true
#define MAP_RAM(mr, ofs)         qemu_map_ram_ptr((mr)->ram_block, ofs)
#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
#define RCU_READ_LOCK()          rcu_read_lock()
#define RCU_READ_UNLOCK()        rcu_read_unlock()
#include "memory_ldst.inc.c"

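/*
 * Lifecycle sketch for the cached accessors generated just above (the
 * descriptor-ring base address, sizes, and variable names are
 * assumptions): the cache is initialized once over a fixed
 * guest-physical range and then used for repeated accesses through the
 * *_cached helpers from memory_ldst.inc.c, e.g. address_space_ldl_cached().
 *
 *     MemoryRegionCache cache;
 *     uint32_t val;
 *
 *     address_space_cache_init(&cache, &address_space_memory,
 *                              desc_base, 16 * num_descs, false);
 *     val = address_space_ldl_cached(&cache, 0,
 *                                    MEMTXATTRS_UNSPECIFIED, NULL);
 *     ...
 *     address_space_cache_destroy(&cache);
 */
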
/* virtual memory access for debug (includes writing to ROM) */
int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
                        uint8_t *buf, int len, int is_write)
{
    int l;
    hwaddr phys_addr;
    target_ulong page;

    cpu_synchronize_state(cpu);
    while (len > 0) {
        int asidx;
        MemTxAttrs attrs;

        page = addr & TARGET_PAGE_MASK;
        phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
        asidx = cpu_asidx_from_attrs(cpu, attrs);
        /* if no physical page mapped, return an error */
        if (phys_addr == -1)
            return -1;
        l = (page + TARGET_PAGE_SIZE) - addr;
        if (l > len)
            l = len;
        phys_addr += (addr & ~TARGET_PAGE_MASK);
        if (is_write) {
            cpu_physical_memory_write_rom(cpu->cpu_ases[asidx].as,
                                          phys_addr, buf, l);
        } else {
            address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
                             MEMTXATTRS_UNSPECIFIED,
                             buf, l, 0);
        }
        len -= l;
        buf += l;
        addr += l;
    }
    return 0;
}

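/*
 * Typical gdbstub-style use (the buffer size and guest address are
 * illustrative): read a few bytes of guest *virtual* memory through the
 * CPU's page tables; a non-zero return means the page was not mapped.
 *
 *     uint8_t insn[4];
 *     target_ulong guest_va = ...;                  // hypothetical address
 *     if (cpu_memory_rw_debug(cpu, guest_va, insn, sizeof(insn), 0) != 0) {
 *         ... page not mapped for this CPU ...
 *     }
 */
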
/*
 * Allows code that needs to deal with migration bitmaps etc to still be built
 * target independent.
 */
size_t qemu_target_page_size(void)
{
    return TARGET_PAGE_SIZE;
}

int qemu_target_page_bits(void)
{
    return TARGET_PAGE_BITS;
}

int qemu_target_page_bits_min(void)
{
    return TARGET_PAGE_BITS_MIN;
}
#endif
/*
 * A helper function for the _utterly broken_ virtio device model to find out if
 * it's running on a big endian machine. Don't do this at home kids!
 */
bool target_words_bigendian(void);
bool target_words_bigendian(void)
{
#if defined(TARGET_WORDS_BIGENDIAN)
    return true;
#else
    return false;
#endif
}

#ifndef CONFIG_USER_ONLY
bool cpu_physical_memory_is_io(hwaddr phys_addr)
{
    MemoryRegion *mr;
    hwaddr l = 1;
    bool res;

    rcu_read_lock();
    mr = address_space_translate(&address_space_memory,
                                 phys_addr, &phys_addr, &l, false);

    res = !(memory_region_is_ram(mr) || memory_region_is_romd(mr));
    rcu_read_unlock();
    return res;
}

int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    rcu_read_lock();
    RAMBLOCK_FOREACH(block) {
        ret = func(block->idstr, block->host, block->offset,
                   block->used_length, opaque);
        if (ret) {
            break;
        }
    }
    rcu_read_unlock();
    return ret;
}

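/*
 * Example iterator (the callback name and the statistic it gathers are
 * assumptions; the parameter list mirrors the func() call above):
 * qemu_ram_foreach_block() invokes the callback once per RAMBlock under
 * the RCU read lock and stops at the first non-zero return value.
 *
 *     static int count_ram_cb(const char *idstr, void *host,
 *                             ram_addr_t offset, ram_addr_t length,
 *                             void *opaque)
 *     {
 *         uint64_t *total = opaque;
 *         *total += length;
 *         return 0;                      // keep iterating
 *     }
 *
 *     uint64_t total = 0;
 *     qemu_ram_foreach_block(count_ram_cb, &total);
 */
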
/*
 * Unmap pages of memory from start to start+length such that
 * they a) read as 0, b) trigger whatever fault mechanism
 * the OS provides for postcopy.
 * The pages must be unmapped by the end of the function.
 * Returns: 0 on success, non-0 on failure.
 */
int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
{
    int ret = -1;

    uint8_t *host_startaddr = rb->host + start;

    if ((uintptr_t)host_startaddr & (rb->page_size - 1)) {
        error_report("ram_block_discard_range: Unaligned start address: %p",
                     host_startaddr);
        goto err;
    }

    if ((start + length) <= rb->used_length) {
        uint8_t *host_endaddr = host_startaddr + length;
        if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
            error_report("ram_block_discard_range: Unaligned end address: %p",
                         host_endaddr);
            goto err;
        }

        errno = ENOTSUP; /* If we are missing MADVISE etc */

        if (rb->page_size == qemu_host_page_size) {
#if defined(CONFIG_MADVISE)
            /* Note: We need the madvise MADV_DONTNEED behaviour of definitely
             * freeing the page.
             */
            ret = madvise(host_startaddr, length, MADV_DONTNEED);
#endif
        } else {
            /* Huge page case  - unfortunately it can't do DONTNEED, but
             * it can do the equivalent by FALLOC_FL_PUNCH_HOLE in the
             * huge page file.
             */
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
            ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                            start, length);
#endif
        }
        if (ret) {
            ret = -errno;
            error_report("ram_block_discard_range: Failed to discard range "
                         "%s:%" PRIx64 " +%zx (%d)",
                         rb->idstr, start, length, ret);
        }
    } else {
        error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
                     "/%zx/" RAM_ADDR_FMT")",
                     rb->idstr, start, length, rb->used_length);
    }

err:
    return ret;
}

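/*
 * Postcopy-style usage sketch (the block lookup and offsets are
 * assumptions): both start and length must be aligned to the block's
 * page size, and the range must lie within used_length.
 *
 *     RAMBlock *rb = ...;                           // hypothetical lookup
 *     uint64_t off = 0;
 *     size_t chunk = rb->page_size;
 *
 *     if (ram_block_discard_range(rb, off, chunk)) {
 *         ... discard failed, fall back to zeroing the range by hand ...
 *     }
 */
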
#endif

void page_size_init(void)
{
    /* NOTE: we can always suppose that qemu_host_page_size >=
       TARGET_PAGE_SIZE */
    qemu_real_host_page_size = getpagesize();
    qemu_real_host_page_mask = -(intptr_t)qemu_real_host_page_size;
    if (qemu_host_page_size == 0) {
        qemu_host_page_size = qemu_real_host_page_size;
    }
    if (qemu_host_page_size < TARGET_PAGE_SIZE) {
        qemu_host_page_size = TARGET_PAGE_SIZE;
    }
    qemu_host_page_mask = -(intptr_t)qemu_host_page_size;
}