/*
 *  Virtual page mapping
 *
 *  Copyright (c) 2003 Fabrice Bellard
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
#include "qemu/osdep.h"
#include "qapi/error.h"
#ifndef _WIN32
#endif

#include "qemu/cutils.h"
#include "cpu.h"
#include "exec/exec-all.h"
#include "tcg.h"
#include "hw/qdev-core.h"
#if !defined(CONFIG_USER_ONLY)
#include "hw/boards.h"
#include "hw/xen/xen.h"
#endif
#include "sysemu/kvm.h"
#include "sysemu/sysemu.h"
#include "qemu/timer.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#if defined(CONFIG_USER_ONLY)
#include "qemu.h"
#else /* !CONFIG_USER_ONLY */
#include "hw/hw.h"
#include "exec/memory.h"
#include "exec/ioport.h"
#include "sysemu/dma.h"
#include "exec/address-spaces.h"
#include "sysemu/xen-mapcache.h"
#include "trace-root.h"

#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
#include <fcntl.h>
#include <linux/falloc.h>
#endif

#endif
#include "exec/cpu-all.h"
#include "qemu/rcu_queue.h"
#include "qemu/main-loop.h"
#include "translate-all.h"
#include "sysemu/replay.h"

#include "exec/memory-internal.h"
#include "exec/ram_addr.h"
#include "exec/log.h"

#include "migration/vmstate.h"

#include "qemu/range.h"
#ifndef _WIN32
#include "qemu/mmap-alloc.h"
#endif

//#define DEBUG_SUBPAGE

#if !defined(CONFIG_USER_ONLY)
/* ram_list is read under rcu_read_lock()/rcu_read_unlock().  Writes
 * are protected by the ramlist lock.
 */
RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };

static MemoryRegion *system_memory;
static MemoryRegion *system_io;

AddressSpace address_space_io;
AddressSpace address_space_memory;

MemoryRegion io_mem_rom, io_mem_notdirty;
static MemoryRegion io_mem_unassigned;

/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
#define RAM_PREALLOC   (1 << 0)

/* RAM is mmap-ed with MAP_SHARED */
#define RAM_SHARED     (1 << 1)

/* Only a portion of RAM (used_length) is actually used, and migrated.
 * This used_length size can change across reboots.
 */
#define RAM_RESIZEABLE (1 << 2)

#endif

#ifdef TARGET_PAGE_BITS_VARY
int target_page_bits;
bool target_page_bits_decided;
#endif

struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
/* current CPU in the current thread. It is only valid inside
   cpu_exec() */
__thread CPUState *current_cpu;
/* 0 = Do not count executed instructions.
   1 = Precise instruction counting.
   2 = Adaptive rate instruction counting.  */
int use_icount;

bool set_preferred_target_page_bits(int bits)
{
    /* The target page size is the lowest common denominator for all
     * the CPUs in the system, so we can only make it smaller, never
     * larger. And we can't make it smaller once we've committed to
     * a particular size.
     */
#ifdef TARGET_PAGE_BITS_VARY
    assert(bits >= TARGET_PAGE_BITS_MIN);
    if (target_page_bits == 0 || target_page_bits > bits) {
        if (target_page_bits_decided) {
            return false;
        }
        target_page_bits = bits;
    }
#endif
    return true;
}

#if !defined(CONFIG_USER_ONLY)

static void finalize_target_page_bits(void)
{
#ifdef TARGET_PAGE_BITS_VARY
    if (target_page_bits == 0) {
        target_page_bits = TARGET_PAGE_BITS_MIN;
    }
    target_page_bits_decided = true;
#endif
}

typedef struct PhysPageEntry PhysPageEntry;

struct PhysPageEntry {
    /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
    uint32_t skip : 6;
    /* index into phys_sections (!skip) or phys_map_nodes (skip) */
    uint32_t ptr : 26;
};

#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)

/* Size of the L2 (and L3, etc) page tables.  */
#define ADDR_SPACE_BITS 64

#define P_L2_BITS 9
#define P_L2_SIZE (1 << P_L2_BITS)

#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)

typedef PhysPageEntry Node[P_L2_SIZE];

typedef struct PhysPageMap {
    struct rcu_head rcu;

    unsigned sections_nb;
    unsigned sections_nb_alloc;
    unsigned nodes_nb;
    unsigned nodes_nb_alloc;
    Node *nodes;
    MemoryRegionSection *sections;
} PhysPageMap;

struct AddressSpaceDispatch {
    struct rcu_head rcu;

    MemoryRegionSection *mru_section;
    /* This is a multi-level map on the physical address space.
     * The bottom level has pointers to MemoryRegionSections.
     */
    PhysPageEntry phys_map;
    PhysPageMap map;
    AddressSpace *as;
};

#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
typedef struct subpage_t {
    MemoryRegion iomem;
    AddressSpace *as;
    hwaddr base;
    uint16_t sub_section[];
} subpage_t;

#define PHYS_SECTION_UNASSIGNED 0
#define PHYS_SECTION_NOTDIRTY 1
#define PHYS_SECTION_ROM 2
#define PHYS_SECTION_WATCH 3

static void io_mem_init(void);
static void memory_map_init(void);
static void tcg_commit(MemoryListener *listener);

static MemoryRegion io_mem_watch;

/**
 * CPUAddressSpace: all the information a CPU needs about an AddressSpace
 * @cpu: the CPU whose AddressSpace this is
 * @as: the AddressSpace itself
 * @memory_dispatch: its dispatch pointer (cached, RCU protected)
 * @tcg_as_listener: listener for tracking changes to the AddressSpace
 */
struct CPUAddressSpace {
    CPUState *cpu;
    AddressSpace *as;
    struct AddressSpaceDispatch *memory_dispatch;
    MemoryListener tcg_as_listener;
};

#endif

#if !defined(CONFIG_USER_ONLY)

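/* Ensure @map has room for at least @nodes more page-table nodes,
 * growing the backing array when needed; the allocation size hint is
 * remembered across calls.
 */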
static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
{
    static unsigned alloc_hint = 16;
    if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
        map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, alloc_hint);
        map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
        map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
        alloc_hint = map->nodes_nb_alloc;
    }
}

static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
{
    unsigned i;
    uint32_t ret;
    PhysPageEntry e;
    PhysPageEntry *p;

    ret = map->nodes_nb++;
    p = map->nodes[ret];
    assert(ret != PHYS_MAP_NODE_NIL);
    assert(ret != map->nodes_nb_alloc);

    e.skip = leaf ? 0 : 1;
    e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
    for (i = 0; i < P_L2_SIZE; ++i) {
        memcpy(&p[i], &e, sizeof(e));
    }
    return ret;
}

static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
                                hwaddr *index, hwaddr *nb, uint16_t leaf,
                                int level)
{
    PhysPageEntry *p;
    hwaddr step = (hwaddr)1 << (level * P_L2_BITS);

    if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
        lp->ptr = phys_map_node_alloc(map, level == 0);
    }
    p = map->nodes[lp->ptr];
    lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];

    while (*nb && lp < &p[P_L2_SIZE]) {
        if ((*index & (step - 1)) == 0 && *nb >= step) {
            lp->skip = 0;
            lp->ptr = leaf;
            *index += step;
            *nb -= step;
        } else {
            phys_page_set_level(map, lp, index, nb, leaf, level - 1);
        }
        ++lp;
    }
}

static void phys_page_set(AddressSpaceDispatch *d,
                          hwaddr index, hwaddr nb,
                          uint16_t leaf)
{
    /* Wildly overreserve - it doesn't matter much. */
    phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);

    phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
}

/* Compact a non-leaf page entry. Simply detect that the entry has a single child,
 * and update our entry so we can skip it and go directly to the destination.
 */
static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
{
    unsigned valid_ptr = P_L2_SIZE;
    int valid = 0;
    PhysPageEntry *p;
    int i;

    if (lp->ptr == PHYS_MAP_NODE_NIL) {
        return;
    }

    p = nodes[lp->ptr];
    for (i = 0; i < P_L2_SIZE; i++) {
        if (p[i].ptr == PHYS_MAP_NODE_NIL) {
            continue;
        }

        valid_ptr = i;
        valid++;
        if (p[i].skip) {
            phys_page_compact(&p[i], nodes);
        }
    }

    /* We can only compress if there's only one child. */
    if (valid != 1) {
        return;
    }

    assert(valid_ptr < P_L2_SIZE);

    /* Don't compress if it won't fit in the # of bits we have. */
    if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
        return;
    }

    lp->ptr = p[valid_ptr].ptr;
    if (!p[valid_ptr].skip) {
        /* If our only child is a leaf, make this a leaf. */
        /* By design, we should have made this node a leaf to begin with so we
         * should never reach here.
         * But since it's so simple to handle this, let's do it just in case we
         * change this rule.
         */
        lp->skip = 0;
    } else {
        lp->skip += p[valid_ptr].skip;
    }
}

static void phys_page_compact_all(AddressSpaceDispatch *d, int nodes_nb)
{
    if (d->phys_map.skip) {
        phys_page_compact(&d->phys_map, d->map.nodes);
    }
}

static inline bool section_covers_addr(const MemoryRegionSection *section,
                                       hwaddr addr)
{
    /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
     * the section must cover the entire address space.
     */
    return int128_gethi(section->size) ||
           range_covers_byte(section->offset_within_address_space,
                             int128_getlo(section->size), addr);
}

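/* Walk the multi-level map from @lp and return the MemoryRegionSection
 * covering @addr, or the unassigned section if no mapping covers it.
 */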
static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr addr,
                                           Node *nodes, MemoryRegionSection *sections)
{
    PhysPageEntry *p;
    hwaddr index = addr >> TARGET_PAGE_BITS;
    int i;

    for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
        if (lp.ptr == PHYS_MAP_NODE_NIL) {
            return &sections[PHYS_SECTION_UNASSIGNED];
        }
        p = nodes[lp.ptr];
        lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
    }

    if (section_covers_addr(&sections[lp.ptr], addr)) {
        return &sections[lp.ptr];
    } else {
        return &sections[PHYS_SECTION_UNASSIGNED];
    }
}

B
Blue Swirl 已提交
388 389
bool memory_region_is_unassigned(MemoryRegion *mr)
{
P
Paolo Bonzini 已提交
390
    return mr != &io_mem_rom && mr != &io_mem_notdirty && !mr->rom_device
391
        && mr != &io_mem_watch;
B
bellard 已提交
392
}
393

394
/* Called from RCU critical section */
395
static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
396 397
                                                        hwaddr addr,
                                                        bool resolve_subpage)
398
{
399
    MemoryRegionSection *section = atomic_read(&d->mru_section);
400
    subpage_t *subpage;
401
    bool update;
402

403 404 405 406 407 408 409 410
    if (section && section != &d->map.sections[PHYS_SECTION_UNASSIGNED] &&
        section_covers_addr(section, addr)) {
        update = false;
    } else {
        section = phys_page_find(d->phys_map, addr, d->map.nodes,
                                 d->map.sections);
        update = true;
    }
411 412
    if (resolve_subpage && section->mr->subpage) {
        subpage = container_of(section->mr, subpage_t, iomem);
413
        section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
414
    }
415 416 417
    if (update) {
        atomic_set(&d->mru_section, section);
    }
418
    return section;
419 420
}

421
/* Called from RCU critical section */
422
static MemoryRegionSection *
423
address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
424
                                 hwaddr *plen, bool resolve_subpage)
425 426
{
    MemoryRegionSection *section;
427
    MemoryRegion *mr;
428
    Int128 diff;
429

430
    section = address_space_lookup_region(d, addr, resolve_subpage);
431 432 433 434 435 436
    /* Compute offset within MemoryRegionSection */
    addr -= section->offset_within_address_space;

    /* Compute offset within MemoryRegion */
    *xlat = addr + section->offset_within_region;

437
    mr = section->mr;
438 439 440 441 442 443 444 445 446 447 448 449

    /* MMIO registers can be expected to perform full-width accesses based only
     * on their address, without considering adjacent registers that could
     * decode to completely different MemoryRegions.  When such registers
     * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
     * regions overlap wildly.  For this reason we cannot clamp the accesses
     * here.
     *
     * If the length is small (as is the case for address_space_ldl/stl),
     * everything works fine.  If the incoming length is large, however,
     * the caller really has to do the clamping through memory_access_size.
     */
450
    if (memory_region_is_ram(mr)) {
451
        diff = int128_sub(section->size, int128_make64(addr));
452 453
        *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
    }
454 455
    return section;
}
456

457
/* Called from RCU critical section */
458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490
IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
                                            bool is_write)
{
    IOMMUTLBEntry iotlb = {0};
    MemoryRegionSection *section;
    MemoryRegion *mr;

    for (;;) {
        AddressSpaceDispatch *d = atomic_rcu_read(&as->dispatch);
        section = address_space_lookup_region(d, addr, false);
        addr = addr - section->offset_within_address_space
               + section->offset_within_region;
        mr = section->mr;

        if (!mr->iommu_ops) {
            break;
        }

        iotlb = mr->iommu_ops->translate(mr, addr, is_write);
        if (!(iotlb.perm & (1 << is_write))) {
            iotlb.target_as = NULL;
            break;
        }

        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
                | (addr & iotlb.addr_mask));
        as = iotlb.target_as;
    }

    return iotlb;
}

/* Called from RCU critical section */
491 492 493
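/* Translate @addr within @as through any IOMMUs down to a terminal
 * MemoryRegion, returning the region plus the offset and length within it.
 */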
MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr,
                                      hwaddr *xlat, hwaddr *plen,
                                      bool is_write)
494
{
A
Avi Kivity 已提交
495 496 497 498 499
    IOMMUTLBEntry iotlb;
    MemoryRegionSection *section;
    MemoryRegion *mr;

    for (;;) {
500 501
        AddressSpaceDispatch *d = atomic_rcu_read(&as->dispatch);
        section = address_space_translate_internal(d, addr, &addr, plen, true);
A
Avi Kivity 已提交
502 503 504 505 506 507
        mr = section->mr;

        if (!mr->iommu_ops) {
            break;
        }

508
        iotlb = mr->iommu_ops->translate(mr, addr, is_write);
A
Avi Kivity 已提交
509 510
        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
                | (addr & iotlb.addr_mask));
511
        *plen = MIN(*plen, (addr | iotlb.addr_mask) - addr + 1);
A
Avi Kivity 已提交
512 513 514 515 516 517 518 519
        if (!(iotlb.perm & (1 << is_write))) {
            mr = &io_mem_unassigned;
            break;
        }

        as = iotlb.target_as;
    }

520
    if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
521
        hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
522
        *plen = MIN(page, *plen);
523 524
    }

A
Avi Kivity 已提交
525 526
    *xlat = addr;
    return mr;
527 528
}

529
/* Called from RCU critical section */
530
MemoryRegionSection *
531
address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
P
Paolo Bonzini 已提交
532
                                  hwaddr *xlat, hwaddr *plen)
533
{
A
Avi Kivity 已提交
534
    MemoryRegionSection *section;
535
    AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);
536 537

    section = address_space_translate_internal(d, addr, xlat, plen, false);
A
Avi Kivity 已提交
538 539 540

    assert(!section->mr->iommu_ops);
    return section;
541
}
542
#endif
B
bellard 已提交
543

544
#if !defined(CONFIG_USER_ONLY)
545 546

static int cpu_common_post_load(void *opaque, int version_id)
B
bellard 已提交
547
{
548
    CPUState *cpu = opaque;
B
bellard 已提交
549

550 551
    /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
       version_id is increased. */
552
    cpu->interrupt_request &= ~0x01;
553
    tlb_flush(cpu);
554 555

    return 0;
B
bellard 已提交
556
}
B
bellard 已提交
557

558 559 560 561
static int cpu_common_pre_load(void *opaque)
{
    CPUState *cpu = opaque;

562
    cpu->exception_index = -1;
563 564 565 566 567 568 569 570

    return 0;
}

static bool cpu_common_exception_index_needed(void *opaque)
{
    CPUState *cpu = opaque;

571
    return tcg_enabled() && cpu->exception_index != -1;
572 573 574 575 576 577
}

static const VMStateDescription vmstate_cpu_common_exception_index = {
    .name = "cpu_common/exception_index",
    .version_id = 1,
    .minimum_version_id = 1,
578
    .needed = cpu_common_exception_index_needed,
579 580 581 582 583 584
    .fields = (VMStateField[]) {
        VMSTATE_INT32(exception_index, CPUState),
        VMSTATE_END_OF_LIST()
    }
};

585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602
static bool cpu_common_crash_occurred_needed(void *opaque)
{
    CPUState *cpu = opaque;

    return cpu->crash_occurred;
}

static const VMStateDescription vmstate_cpu_common_crash_occurred = {
    .name = "cpu_common/crash_occurred",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = cpu_common_crash_occurred_needed,
    .fields = (VMStateField[]) {
        VMSTATE_BOOL(crash_occurred, CPUState),
        VMSTATE_END_OF_LIST()
    }
};

603
const VMStateDescription vmstate_cpu_common = {
604 605 606
    .name = "cpu_common",
    .version_id = 1,
    .minimum_version_id = 1,
607
    .pre_load = cpu_common_pre_load,
608
    .post_load = cpu_common_post_load,
609
    .fields = (VMStateField[]) {
610 611
        VMSTATE_UINT32(halted, CPUState),
        VMSTATE_UINT32(interrupt_request, CPUState),
612
        VMSTATE_END_OF_LIST()
613
    },
614 615
    .subsections = (const VMStateDescription*[]) {
        &vmstate_cpu_common_exception_index,
616
        &vmstate_cpu_common_crash_occurred,
617
        NULL
618 619
    }
};
620

621
#endif
B
bellard 已提交
622

623
CPUState *qemu_get_cpu(int index)
B
bellard 已提交
624
{
A
Andreas Färber 已提交
625
    CPUState *cpu;
B
bellard 已提交
626

A
Andreas Färber 已提交
627
    CPU_FOREACH(cpu) {
628
        if (cpu->cpu_index == index) {
A
Andreas Färber 已提交
629
            return cpu;
630
        }
B
bellard 已提交
631
    }
632

A
Andreas Färber 已提交
633
    return NULL;
B
bellard 已提交
634 635
}

636
#if !defined(CONFIG_USER_ONLY)
637
void cpu_address_space_init(CPUState *cpu, AddressSpace *as, int asidx)
638
{
639 640 641 642 643
    CPUAddressSpace *newas;

    /* Target code should have set num_ases before calling us */
    assert(asidx < cpu->num_ases);

644 645 646 647 648
    if (asidx == 0) {
        /* address space 0 gets the convenience alias */
        cpu->as = as;
    }

649 650
    /* KVM cannot currently support multiple address spaces. */
    assert(asidx == 0 || !kvm_enabled());
651

652 653
    if (!cpu->cpu_ases) {
        cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
654
    }
655

656 657 658
    newas = &cpu->cpu_ases[asidx];
    newas->cpu = cpu;
    newas->as = as;
659
    if (tcg_enabled()) {
660 661
        newas->tcg_as_listener.commit = tcg_commit;
        memory_listener_register(&newas->tcg_as_listener, as);
662
    }
663
}
664 665 666 667 668 669

AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
{
    /* Return the AddressSpace corresponding to the specified index */
    return cpu->cpu_ases[asidx].as;
}
670 671
#endif

672
void cpu_exec_unrealizefn(CPUState *cpu)
673
{
674 675
    CPUClass *cc = CPU_GET_CLASS(cpu);

676
    cpu_list_remove(cpu);
677 678 679 680 681 682 683

    if (cc->vmsd != NULL) {
        vmstate_unregister(NULL, cc->vmsd, cpu);
    }
    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
        vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
    }
684 685
}

L
Laurent Vivier 已提交
686
void cpu_exec_initfn(CPUState *cpu)
B
bellard 已提交
687
{
688
    cpu->as = NULL;
689
    cpu->num_ases = 0;
690

691 692
#ifndef CONFIG_USER_ONLY
    cpu->thread_id = qemu_get_thread_id();
693 694 695 696 697 698 699 700 701 702 703 704 705 706

    /* This is a softmmu CPU object, so create a property for it
     * so users can wire up its memory. (This can't go in qom/cpu.c
     * because that file is compiled only once for both user-mode
     * and system builds.) The default if no link is set up is to use
     * the system address space.
     */
    object_property_add_link(OBJECT(cpu), "memory", TYPE_MEMORY_REGION,
                             (Object **)&cpu->memory,
                             qdev_prop_allow_set_link_before_realize,
                             OBJ_PROP_LINK_UNREF_ON_RELEASE,
                             &error_abort);
    cpu->memory = system_memory;
    object_ref(OBJECT(cpu->memory));
707
#endif
L
Laurent Vivier 已提交
708 709
}

710
void cpu_exec_realizefn(CPUState *cpu, Error **errp)
L
Laurent Vivier 已提交
711 712
{
    CPUClass *cc ATTRIBUTE_UNUSED = CPU_GET_CLASS(cpu);
713

714
    cpu_list_add(cpu);
715 716

#ifndef CONFIG_USER_ONLY
717
    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
718
        vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
719
    }
720
    if (cc->vmsd != NULL) {
721
        vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
722
    }
723
#endif
B
bellard 已提交
724 725
}

726
static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
727
{
728 729 730 731 732 733
    /* Flush the whole TB as this will not have race conditions
     * even if we don't have proper locking yet.
     * Ideally we would just invalidate the TBs for the
     * specified PC.
     */
    tb_flush(cpu);
734
}
B
bellard 已提交
735

736
#if defined(CONFIG_USER_ONLY)
737
void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
738 739 740 741

{
}

742 743 744 745 746 747 748 749 750 751
int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
                          int flags)
{
    return -ENOSYS;
}

void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
{
}

752
int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
753 754 755 756 757
                          int flags, CPUWatchpoint **watchpoint)
{
    return -ENOSYS;
}
#else
758
/* Add a watchpoint.  */
759
int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
760
                          int flags, CPUWatchpoint **watchpoint)
761
{
762
    CPUWatchpoint *wp;
763

764
    /* forbid ranges which are empty or run off the end of the address space */
765
    if (len == 0 || (addr + len - 1) < addr) {
766 767
        error_report("tried to set invalid watchpoint at %"
                     VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
768 769
        return -EINVAL;
    }
770
    wp = g_malloc(sizeof(*wp));
771 772

    wp->vaddr = addr;
773
    wp->len = len;
774 775
    wp->flags = flags;

776
    /* keep all GDB-injected watchpoints in front */
777 778 779 780 781
    if (flags & BP_GDB) {
        QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
    } else {
        QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
    }
782

783
    tlb_flush_page(cpu, addr);
784 785 786 787

    if (watchpoint)
        *watchpoint = wp;
    return 0;
788 789
}

790
/* Remove a specific watchpoint.  */
791
int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
792
                          int flags)
793
{
794
    CPUWatchpoint *wp;
795

796
    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
797
        if (addr == wp->vaddr && len == wp->len
798
                && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
799
            cpu_watchpoint_remove_by_ref(cpu, wp);
800 801 802
            return 0;
        }
    }
803
    return -ENOENT;
804 805
}

806
/* Remove a specific watchpoint by reference.  */
807
void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
808
{
809
    QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
810

811
    tlb_flush_page(cpu, watchpoint->vaddr);
812

813
    g_free(watchpoint);
814 815 816
}

/* Remove all matching watchpoints.  */
817
void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
818
{
819
    CPUWatchpoint *wp, *next;
820

821
    QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
822 823 824
        if (wp->flags & mask) {
            cpu_watchpoint_remove_by_ref(cpu, wp);
        }
825
    }
826
}
827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847

/* Return true if this watchpoint address matches the specified
 * access (ie the address range covered by the watchpoint overlaps
 * partially or completely with the address range covered by the
 * access).
 */
static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
                                                  vaddr addr,
                                                  vaddr len)
{
    /* We know the lengths are non-zero, but a little caution is
     * required to avoid errors in the case where the range ends
     * exactly at the top of the address space and so addr + len
     * wraps round to zero.
     */
    vaddr wpend = wp->vaddr + wp->len - 1;
    vaddr addrend = addr + len - 1;

    return !(addr > wpend || wp->vaddr > addrend);
}

848
#endif
849

850
/* Add a breakpoint.  */
851
int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
852
                          CPUBreakpoint **breakpoint)
B
bellard 已提交
853
{
854
    CPUBreakpoint *bp;
855

856
    bp = g_malloc(sizeof(*bp));
B
bellard 已提交
857

858 859 860
    bp->pc = pc;
    bp->flags = flags;

861
    /* keep all GDB-injected breakpoints in front */
862
    if (flags & BP_GDB) {
863
        QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
864
    } else {
865
        QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
866
    }
867

868
    breakpoint_invalidate(cpu, pc);
869

870
    if (breakpoint) {
871
        *breakpoint = bp;
872
    }
B
bellard 已提交
873 874 875
    return 0;
}

876
/* Remove a specific breakpoint.  */
877
int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
878 879 880
{
    CPUBreakpoint *bp;

881
    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
882
        if (bp->pc == pc && bp->flags == flags) {
883
            cpu_breakpoint_remove_by_ref(cpu, bp);
884 885
            return 0;
        }
886
    }
887
    return -ENOENT;
888 889
}

890
/* Remove a specific breakpoint by reference.  */
891
void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint)
B
bellard 已提交
892
{
893 894 895
    QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);

    breakpoint_invalidate(cpu, breakpoint->pc);
896

897
    g_free(breakpoint);
898 899 900
}

/* Remove all matching breakpoints. */
901
void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
902
{
903
    CPUBreakpoint *bp, *next;
904

905
    QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
906 907 908
        if (bp->flags & mask) {
            cpu_breakpoint_remove_by_ref(cpu, bp);
        }
909
    }
B
bellard 已提交
910 911
}

B
bellard 已提交
912 913
/* enable or disable single step mode. EXCP_DEBUG is returned by the
   CPU loop after each instruction */
914
void cpu_single_step(CPUState *cpu, int enabled)
B
bellard 已提交
915
{
916 917 918
    if (cpu->singlestep_enabled != enabled) {
        cpu->singlestep_enabled = enabled;
        if (kvm_enabled()) {
919
            kvm_update_guest_debug(cpu, 0);
920
        } else {
S
Stuart Brady 已提交
921
            /* must flush all the translated code to avoid inconsistencies */
922
            /* XXX: only flush what is necessary */
923
            tb_flush(cpu);
924
        }
B
bellard 已提交
925 926 927
    }
}

928
void cpu_abort(CPUState *cpu, const char *fmt, ...)
B
bellard 已提交
929 930
{
    va_list ap;
P
pbrook 已提交
931
    va_list ap2;
B
bellard 已提交
932 933

    va_start(ap, fmt);
P
pbrook 已提交
934
    va_copy(ap2, ap);
B
bellard 已提交
935 936 937
    fprintf(stderr, "qemu: fatal: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
938
    cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU | CPU_DUMP_CCOP);
939
    if (qemu_log_separate()) {
940
        qemu_log_lock();
941 942 943
        qemu_log("qemu: fatal: ");
        qemu_log_vprintf(fmt, ap2);
        qemu_log("\n");
944
        log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
945
        qemu_log_flush();
946
        qemu_log_unlock();
947
        qemu_log_close();
948
    }
P
pbrook 已提交
949
    va_end(ap2);
950
    va_end(ap);
951
    replay_finish();
952 953 954 955 956 957 958 959
#if defined(CONFIG_USER_ONLY)
    {
        struct sigaction act;
        sigfillset(&act.sa_mask);
        act.sa_handler = SIG_DFL;
        sigaction(SIGABRT, &act, NULL);
    }
#endif
B
bellard 已提交
960 961 962
    abort();
}

963
#if !defined(CONFIG_USER_ONLY)
M
Mike Day 已提交
964
/* Called from RCU critical section */
P
Paolo Bonzini 已提交
965 966 967 968
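/* Return the RAMBlock containing @addr, checking the MRU cache before
 * scanning the block list; aborts if the offset belongs to no block.
 */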
static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
{
    RAMBlock *block;

P
Paolo Bonzini 已提交
969
    block = atomic_rcu_read(&ram_list.mru_block);
970
    if (block && addr - block->offset < block->max_length) {
971
        return block;
P
Paolo Bonzini 已提交
972
    }
M
Mike Day 已提交
973
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
974
        if (addr - block->offset < block->max_length) {
P
Paolo Bonzini 已提交
975 976 977 978 979 980 981 982
            goto found;
        }
    }

    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
    abort();

found:
P
Paolo Bonzini 已提交
983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998
    /* It is safe to write mru_block outside the iothread lock.  This
     * is what happens:
     *
     *     mru_block = xxx
     *     rcu_read_unlock()
     *                                        xxx removed from list
     *                  rcu_read_lock()
     *                  read mru_block
     *                                        mru_block = NULL;
     *                                        call_rcu(reclaim_ramblock, xxx);
     *                  rcu_read_unlock()
     *
     * atomic_rcu_set is not needed here.  The block was already published
     * when it was placed into the list.  Here we're just making an extra
     * copy of the pointer.
     */
P
Paolo Bonzini 已提交
999 1000 1001 1002
    ram_list.mru_block = block;
    return block;
}

1003
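/* Re-arm dirty tracking in every CPU's TLB for the RAM range
 * [start, start + length) so subsequent writes are trapped again.
 */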
static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
J
Juan Quintela 已提交
1004
{
1005
    CPUState *cpu;
P
Paolo Bonzini 已提交
1006
    ram_addr_t start1;
1007 1008 1009 1010 1011
    RAMBlock *block;
    ram_addr_t end;

    end = TARGET_PAGE_ALIGN(start + length);
    start &= TARGET_PAGE_MASK;
J
Juan Quintela 已提交
1012

M
Mike Day 已提交
1013
    rcu_read_lock();
P
Paolo Bonzini 已提交
1014 1015
    block = qemu_get_ram_block(start);
    assert(block == qemu_get_ram_block(end - 1));
1016
    start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
1017 1018 1019
    CPU_FOREACH(cpu) {
        tlb_reset_dirty(cpu, start1, length);
    }
M
Mike Day 已提交
1020
    rcu_read_unlock();
J
Juan Quintela 已提交
1021 1022
}

P
pbrook 已提交
1023
/* Note: start and end must be within the same ram block.  */
1024 1025 1026
bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
                                              ram_addr_t length,
                                              unsigned client)
1027
{
1028
    DirtyMemoryBlocks *blocks;
1029
    unsigned long end, page;
1030
    bool dirty = false;
1031 1032 1033 1034

    if (length == 0) {
        return false;
    }
B
bellard 已提交
1035

1036 1037
    end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
    page = start >> TARGET_PAGE_BITS;
1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053

    rcu_read_lock();

    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);

    while (page < end) {
        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);

        dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
                                              offset, num);
        page += num;
    }

    rcu_read_unlock();
1054 1055

    if (dirty && tcg_enabled()) {
1056
        tlb_reset_dirty_range_all(start, length);
P
pbrook 已提交
1057
    }
1058 1059

    return dirty;
1060 1061
}

1062
/* Called from RCU critical section */
1063
hwaddr memory_region_section_get_iotlb(CPUState *cpu,
1064 1065 1066 1067 1068
                                       MemoryRegionSection *section,
                                       target_ulong vaddr,
                                       hwaddr paddr, hwaddr xlat,
                                       int prot,
                                       target_ulong *address)
B
Blue Swirl 已提交
1069
{
A
Avi Kivity 已提交
1070
    hwaddr iotlb;
B
Blue Swirl 已提交
1071 1072
    CPUWatchpoint *wp;

1073
    if (memory_region_is_ram(section->mr)) {
B
Blue Swirl 已提交
1074
        /* Normal RAM.  */
1075
        iotlb = memory_region_get_ram_addr(section->mr) + xlat;
B
Blue Swirl 已提交
1076
        if (!section->readonly) {
1077
            iotlb |= PHYS_SECTION_NOTDIRTY;
B
Blue Swirl 已提交
1078
        } else {
1079
            iotlb |= PHYS_SECTION_ROM;
B
Blue Swirl 已提交
1080 1081
        }
    } else {
1082 1083 1084 1085
        AddressSpaceDispatch *d;

        d = atomic_rcu_read(&section->address_space->dispatch);
        iotlb = section - d->map.sections;
1086
        iotlb += xlat;
B
Blue Swirl 已提交
1087 1088 1089 1090
    }

    /* Make accesses to pages with watchpoints go via the
       watchpoint trap routines.  */
1091
    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
1092
        if (cpu_watchpoint_address_matches(wp, vaddr, TARGET_PAGE_SIZE)) {
B
Blue Swirl 已提交
1093 1094
            /* Avoid trapping reads of pages with a write breakpoint. */
            if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
1095
                iotlb = PHYS_SECTION_WATCH + paddr;
B
Blue Swirl 已提交
1096 1097 1098 1099 1100 1101 1102 1103
                *address |= TLB_MMIO;
                break;
            }
        }
    }

    return iotlb;
}
1104 1105
#endif /* defined(CONFIG_USER_ONLY) */

1106
#if !defined(CONFIG_USER_ONLY)
1107

A
Anthony Liguori 已提交
1108
static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
1109
                             uint16_t section);
1110
static subpage_t *subpage_init(AddressSpace *as, hwaddr base);
1111

1112 1113
static void *(*phys_mem_alloc)(size_t size, uint64_t *align) =
                               qemu_anon_ram_alloc;
1114 1115 1116 1117 1118 1119

/*
 * Set a custom physical guest memory allocator.
 * Accelerators with unusual needs may need this.  Hopefully, we can
 * get rid of it eventually.
 */
1120
void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align))
1121 1122 1123 1124
{
    phys_mem_alloc = alloc;
}

1125 1126
static uint16_t phys_section_add(PhysPageMap *map,
                                 MemoryRegionSection *section)
1127
{
1128 1129 1130 1131
    /* The physical section number is ORed with a page-aligned
     * pointer to produce the iotlb entries.  Thus it should
     * never overflow into the page-aligned value.
     */
1132
    assert(map->sections_nb < TARGET_PAGE_SIZE);
1133

1134 1135 1136 1137
    if (map->sections_nb == map->sections_nb_alloc) {
        map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
        map->sections = g_renew(MemoryRegionSection, map->sections,
                                map->sections_nb_alloc);
1138
    }
1139
    map->sections[map->sections_nb] = *section;
P
Paolo Bonzini 已提交
1140
    memory_region_ref(section->mr);
1141
    return map->sections_nb++;
1142 1143
}

1144 1145
static void phys_section_destroy(MemoryRegion *mr)
{
D
Don Slutz 已提交
1146 1147
    bool have_sub_page = mr->subpage;

P
Paolo Bonzini 已提交
1148 1149
    memory_region_unref(mr);

D
Don Slutz 已提交
1150
    if (have_sub_page) {
1151
        subpage_t *subpage = container_of(mr, subpage_t, iomem);
P
Peter Crosthwaite 已提交
1152
        object_unref(OBJECT(&subpage->iomem));
1153 1154 1155 1156
        g_free(subpage);
    }
}

P
Paolo Bonzini 已提交
1157
static void phys_sections_free(PhysPageMap *map)
1158
{
1159 1160
    while (map->sections_nb > 0) {
        MemoryRegionSection *section = &map->sections[--map->sections_nb];
1161 1162
        phys_section_destroy(section->mr);
    }
1163 1164
    g_free(map->sections);
    g_free(map->nodes);
1165 1166
}

A
Avi Kivity 已提交
1167
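/* Register a section that does not cover a whole target page: the page
 * is backed by a subpage_t container and the section is added inside it.
 */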
static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *section)
1168 1169
{
    subpage_t *subpage;
A
Avi Kivity 已提交
1170
    hwaddr base = section->offset_within_address_space
1171
        & TARGET_PAGE_MASK;
1172
    MemoryRegionSection *existing = phys_page_find(d->phys_map, base,
1173
                                                   d->map.nodes, d->map.sections);
1174 1175
    MemoryRegionSection subsection = {
        .offset_within_address_space = base,
1176
        .size = int128_make64(TARGET_PAGE_SIZE),
1177
    };
A
Avi Kivity 已提交
1178
    hwaddr start, end;
1179

1180
    assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);
1181

1182
    if (!(existing->mr->subpage)) {
1183
        subpage = subpage_init(d->as, base);
1184
        subsection.address_space = d->as;
1185
        subsection.mr = &subpage->iomem;
A
Avi Kivity 已提交
1186
        phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
1187
                      phys_section_add(&d->map, &subsection));
1188
    } else {
1189
        subpage = container_of(existing->mr, subpage_t, iomem);
1190 1191
    }
    start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
1192
    end = start + int128_get64(section->size) - 1;
1193 1194
    subpage_register(subpage, start, end,
                     phys_section_add(&d->map, section));
1195 1196 1197
}


1198 1199
static void register_multipage(AddressSpaceDispatch *d,
                               MemoryRegionSection *section)
1200
{
A
Avi Kivity 已提交
1201
    hwaddr start_addr = section->offset_within_address_space;
1202
    uint16_t section_index = phys_section_add(&d->map, section);
1203 1204
    uint64_t num_pages = int128_get64(int128_rshift(section->size,
                                                    TARGET_PAGE_BITS));
1205

1206 1207
    assert(num_pages);
    phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
1208 1209
}

A
Avi Kivity 已提交
1210
static void mem_add(MemoryListener *listener, MemoryRegionSection *section)
1211
{
1212
    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
1213
    AddressSpaceDispatch *d = as->next_dispatch;
1214
    MemoryRegionSection now = *section, remain = *section;
1215
    Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
1216

1217 1218 1219 1220
    if (now.offset_within_address_space & ~TARGET_PAGE_MASK) {
        uint64_t left = TARGET_PAGE_ALIGN(now.offset_within_address_space)
                       - now.offset_within_address_space;

1221
        now.size = int128_min(int128_make64(left), now.size);
A
Avi Kivity 已提交
1222
        register_subpage(d, &now);
1223
    } else {
1224
        now.size = int128_zero();
1225
    }
1226 1227 1228 1229
    while (int128_ne(remain.size, now.size)) {
        remain.size = int128_sub(remain.size, now.size);
        remain.offset_within_address_space += int128_get64(now.size);
        remain.offset_within_region += int128_get64(now.size);
1230
        now = remain;
1231
        if (int128_lt(remain.size, page_size)) {
1232
            register_subpage(d, &now);
1233
        } else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
1234
            now.size = page_size;
A
Avi Kivity 已提交
1235
            register_subpage(d, &now);
1236
        } else {
1237
            now.size = int128_and(now.size, int128_neg(page_size));
A
Avi Kivity 已提交
1238
            register_multipage(d, &now);
1239
        }
1240 1241 1242
    }
}

1243 1244 1245 1246 1247 1248
void qemu_flush_coalesced_mmio_buffer(void)
{
    if (kvm_enabled())
        kvm_flush_coalesced_mmio_buffer();
}

1249 1250 1251 1252 1253 1254 1255 1256 1257 1258
void qemu_mutex_lock_ramlist(void)
{
    qemu_mutex_lock(&ram_list.mutex);
}

void qemu_mutex_unlock_ramlist(void)
{
    qemu_mutex_unlock(&ram_list.mutex);
}

1259
#ifdef __linux__
1260 1261 1262 1263 1264 1265 1266 1267 1268
static int64_t get_file_size(int fd)
{
    int64_t size = lseek(fd, 0, SEEK_END);
    if (size < 0) {
        return -errno;
    }
    return size;
}

A
Alex Williamson 已提交
1269 1270
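/* Back @block with a file (or a new file under a directory) at @path,
 * e.g. on hugetlbfs, and mmap it; returns the mapping or NULL on error.
 */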
static void *file_ram_alloc(RAMBlock *block,
                            ram_addr_t memory,
1271 1272
                            const char *path,
                            Error **errp)
1273
{
1274
    bool unlink_on_error = false;
1275
    char *filename;
1276 1277
    char *sanitized_name;
    char *c;
1278
    void *area = MAP_FAILED;
1279
    int fd = -1;
1280
    int64_t file_size;
1281 1282

    if (kvm_enabled() && !kvm_has_sync_mmu()) {
1283 1284
        error_setg(errp,
                   "host lacks kvm mmu notifiers, -mem-path unsupported");
1285
        return NULL;
1286 1287
    }

1288 1289 1290 1291 1292
    for (;;) {
        fd = open(path, O_RDWR);
        if (fd >= 0) {
            /* @path names an existing file, use it */
            break;
1293
        }
1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309
        if (errno == ENOENT) {
            /* @path names a file that doesn't exist, create it */
            fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
            if (fd >= 0) {
                unlink_on_error = true;
                break;
            }
        } else if (errno == EISDIR) {
            /* @path names a directory, create a file there */
            /* Make name safe to use with mkstemp by replacing '/' with '_'. */
            sanitized_name = g_strdup(memory_region_name(block->mr));
            for (c = sanitized_name; *c != '\0'; c++) {
                if (*c == '/') {
                    *c = '_';
                }
            }
1310

1311 1312 1313
            filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
                                       sanitized_name);
            g_free(sanitized_name);
1314

1315 1316 1317 1318 1319 1320 1321
            fd = mkstemp(filename);
            if (fd >= 0) {
                unlink(filename);
                g_free(filename);
                break;
            }
            g_free(filename);
1322
        }
1323 1324 1325 1326 1327 1328 1329 1330 1331 1332
        if (errno != EEXIST && errno != EINTR) {
            error_setg_errno(errp, errno,
                             "can't open backing store %s for guest RAM",
                             path);
            goto error;
        }
        /*
         * Try again on EINTR and EEXIST.  The latter happens when
         * something else creates the file between our two open().
         */
1333
    }
1334

1335
    block->page_size = qemu_fd_getpagesize(fd);
1336 1337 1338 1339 1340 1341
    block->mr->align = block->page_size;
#if defined(__s390x__)
    if (kvm_enabled()) {
        block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
    }
#endif
1342

1343 1344
    file_size = get_file_size(fd);

1345
    if (memory < block->page_size) {
1346
        error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
1347 1348
                   "or larger than page size 0x%zx",
                   memory, block->page_size);
1349
        goto error;
1350 1351
    }

1352 1353 1354 1355 1356 1357 1358
    if (file_size > 0 && file_size < memory) {
        error_setg(errp, "backing store %s size 0x%" PRIx64
                   " does not match 'size' option 0x" RAM_ADDR_FMT,
                   path, file_size, memory);
        goto error;
    }

1359
    memory = ROUND_UP(memory, block->page_size);
1360 1361 1362 1363 1364 1365

    /*
     * ftruncate is not supported by hugetlbfs in older
     * hosts, so don't bother bailing out on errors.
     * If anything goes wrong with it under other filesystems,
     * mmap will fail.
1366 1367 1368 1369 1370 1371 1372 1373
     *
     * Do not truncate the non-empty backend file to avoid corrupting
     * the existing data in the file. Disabling shrinking is not
     * enough. For example, the current vNVDIMM implementation stores
     * the guest NVDIMM labels at the end of the backend file. If the
     * backend file is later extended, QEMU will not be able to find
     * those labels. Therefore, extending the non-empty backend file
     * is disabled as well.
1374
     */
1375
    if (!file_size && ftruncate(fd, memory)) {
Y
Yoshiaki Tamura 已提交
1376
        perror("ftruncate");
1377
    }
1378

1379 1380
    area = qemu_ram_mmap(fd, memory, block->mr->align,
                         block->flags & RAM_SHARED);
1381
    if (area == MAP_FAILED) {
1382
        error_setg_errno(errp, errno,
1383
                         "unable to map backing store for guest RAM");
1384
        goto error;
1385
    }
1386 1387

    if (mem_prealloc) {
1388 1389 1390 1391
        os_mem_prealloc(fd, area, memory, errp);
        if (errp && *errp) {
            goto error;
        }
1392 1393
    }

A
Alex Williamson 已提交
1394
    block->fd = fd;
1395
    return area;
1396 1397

error:
1398 1399 1400
    if (area != MAP_FAILED) {
        qemu_ram_munmap(area, memory);
    }
1401 1402 1403
    if (unlink_on_error) {
        unlink(path);
    }
1404 1405 1406
    if (fd != -1) {
        close(fd);
    }
1407
    return NULL;
1408 1409 1410
}
#endif

M
Mike Day 已提交
1411
/* Called with the ramlist lock held.  */
1412
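/* Return the start of the smallest gap in the ram_addr_t space that can
 * hold @size bytes; aborts if no gap is large enough.
 */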
static ram_addr_t find_ram_offset(ram_addr_t size)
A
Alex Williamson 已提交
1413 1414
{
    RAMBlock *block, *next_block;
A
Alex Williamson 已提交
1415
    ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
A
Alex Williamson 已提交
1416

1417 1418
    assert(size != 0); /* it would hand out same offset multiple times */

M
Mike Day 已提交
1419
    if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
A
Alex Williamson 已提交
1420
        return 0;
M
Mike Day 已提交
1421
    }
A
Alex Williamson 已提交
1422

M
Mike Day 已提交
1423
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1424
        ram_addr_t end, next = RAM_ADDR_MAX;
A
Alex Williamson 已提交
1425

1426
        end = block->offset + block->max_length;
A
Alex Williamson 已提交
1427

M
Mike Day 已提交
1428
        QLIST_FOREACH_RCU(next_block, &ram_list.blocks, next) {
A
Alex Williamson 已提交
1429 1430 1431 1432 1433
            if (next_block->offset >= end) {
                next = MIN(next, next_block->offset);
            }
        }
        if (next - end >= size && next - end < mingap) {
A
Alex Williamson 已提交
1434
            offset = end;
A
Alex Williamson 已提交
1435 1436 1437
            mingap = next - end;
        }
    }
A
Alex Williamson 已提交
1438 1439 1440 1441 1442 1443 1444

    if (offset == RAM_ADDR_MAX) {
        fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
                (uint64_t)size);
        abort();
    }

A
Alex Williamson 已提交
1445 1446 1447
    return offset;
}

J
Juan Quintela 已提交
1448
ram_addr_t last_ram_offset(void)
1449 1450 1451 1452
{
    RAMBlock *block;
    ram_addr_t last = 0;

M
Mike Day 已提交
1453 1454
    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1455
        last = MAX(last, block->offset + block->max_length);
M
Mike Day 已提交
1456
    }
M
Mike Day 已提交
1457
    rcu_read_unlock();
1458 1459 1460
    return last;
}

1461 1462 1463 1464 1465
static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
{
    int ret;

    /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
1466
    if (!machine_dump_guest_core(current_machine)) {
1467 1468 1469 1470 1471 1472 1473 1474 1475
        ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
        if (ret) {
            perror("qemu_madvise");
            fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
                            "but dump_guest_core=off specified\n");
        }
    }
}

D
Dr. David Alan Gilbert 已提交
1476 1477 1478 1479 1480
const char *qemu_ram_get_idstr(RAMBlock *rb)
{
    return rb->idstr;
}

1481
/* Called with iothread lock held.  */
G
Gonglei 已提交
1482
void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
1483
{
G
Gonglei 已提交
1484
    RAMBlock *block;
1485

1486 1487
    assert(new_block);
    assert(!new_block->idstr[0]);
1488

1489 1490
    if (dev) {
        char *id = qdev_get_dev_path(dev);
1491 1492
        if (id) {
            snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
1493
            g_free(id);
1494 1495 1496 1497
        }
    }
    pstrcat(new_block->idstr, sizeof(new_block->idstr), name);

G
Gonglei 已提交
1498
    rcu_read_lock();
M
Mike Day 已提交
1499
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
G
Gonglei 已提交
1500 1501
        if (block != new_block &&
            !strcmp(block->idstr, new_block->idstr)) {
1502 1503 1504 1505 1506
            fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
                    new_block->idstr);
            abort();
        }
    }
M
Mike Day 已提交
1507
    rcu_read_unlock();
1508 1509
}

1510
/* Called with iothread lock held.  */
G
Gonglei 已提交
1511
void qemu_ram_unset_idstr(RAMBlock *block)
1512
{
1513 1514 1515 1516
    /* FIXME: arch_init.c assumes that this is not called throughout
     * migration.  Ignore the problem since hot-unplug during migration
     * does not work anyway.
     */
1517 1518 1519 1520 1521
    if (block) {
        memset(block->idstr, 0, sizeof(block->idstr));
    }
}

1522 1523 1524 1525 1526
size_t qemu_ram_pagesize(RAMBlock *rb)
{
    return rb->page_size;
}

1527 1528
static int memory_try_enable_merging(void *addr, size_t len)
{
1529
    if (!machine_mem_merge(current_machine)) {
1530 1531 1532 1533 1534 1535 1536
        /* disabled by the user */
        return 0;
    }

    return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
}

1537 1538 1539 1540 1541 1542 1543
/* Only legal before guest might have detected the memory size: e.g. on
 * incoming migration, or right after reset.
 *
 * As memory core doesn't know how is memory accessed, it is up to
 * resize callback to update device state and/or add assertions to detect
 * misuse, if necessary.
 */
G
Gonglei 已提交
1544
int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
1545 1546 1547
{
    assert(block);

1548
    newsize = HOST_PAGE_ALIGN(newsize);
1549

1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571
    if (block->used_length == newsize) {
        return 0;
    }

    if (!(block->flags & RAM_RESIZEABLE)) {
        error_setg_errno(errp, EINVAL,
                         "Length mismatch: %s: 0x" RAM_ADDR_FMT
                         " in != 0x" RAM_ADDR_FMT, block->idstr,
                         newsize, block->used_length);
        return -EINVAL;
    }

    if (block->max_length < newsize) {
        error_setg_errno(errp, EINVAL,
                         "Length too large: %s: 0x" RAM_ADDR_FMT
                         " > 0x" RAM_ADDR_FMT, block->idstr,
                         newsize, block->max_length);
        return -EINVAL;
    }

    cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
    block->used_length = newsize;
1572 1573
    cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
                                        DIRTY_CLIENTS_ALL);
1574 1575 1576 1577 1578 1579 1580
    memory_region_set_size(block->mr, newsize);
    if (block->resized) {
        block->resized(block->idstr, newsize, block->host);
    }
    return 0;
}

1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621
/* Called with ram_list.mutex held */
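/* Extend the per-client dirty bitmaps to cover new_ram_size pages,
 * copying the old blocks and publishing the new arrays via RCU.
 */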
static void dirty_memory_extend(ram_addr_t old_ram_size,
                                ram_addr_t new_ram_size)
{
    ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
                                             DIRTY_MEMORY_BLOCK_SIZE);
    ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
                                             DIRTY_MEMORY_BLOCK_SIZE);
    int i;

    /* Only need to extend if block count increased */
    if (new_num_blocks <= old_num_blocks) {
        return;
    }

    for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
        DirtyMemoryBlocks *old_blocks;
        DirtyMemoryBlocks *new_blocks;
        int j;

        old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
        new_blocks = g_malloc(sizeof(*new_blocks) +
                              sizeof(new_blocks->blocks[0]) * new_num_blocks);

        if (old_num_blocks) {
            memcpy(new_blocks->blocks, old_blocks->blocks,
                   old_num_blocks * sizeof(old_blocks->blocks[0]));
        }

        for (j = old_num_blocks; j < new_num_blocks; j++) {
            new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
        }

        atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);

        if (old_blocks) {
            g_free_rcu(old_blocks, rcu);
        }
    }
}

1622
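/* Complete setup of @new_block: assign a RAM offset, allocate or map the
 * host memory unless it is preallocated, extend the dirty bitmaps, and
 * insert the block into ram_list keeping it sorted from biggest to
 * smallest.
 */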
static void ram_block_add(RAMBlock *new_block, Error **errp)
1623
{
1624
    RAMBlock *block;
M
Mike Day 已提交
1625
    RAMBlock *last_block = NULL;
1626
    ram_addr_t old_ram_size, new_ram_size;
1627
    Error *err = NULL;
1628 1629

    old_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;
1630

1631
    qemu_mutex_lock_ramlist();
1632
    new_block->offset = find_ram_offset(new_block->max_length);
1633 1634 1635

    if (!new_block->host) {
        if (xen_enabled()) {
            xen_ram_alloc(new_block->offset, new_block->max_length,
                          new_block->mr, &err);
            if (err) {
                error_propagate(errp, err);
                qemu_mutex_unlock_ramlist();
                return;
            }
        } else {
            new_block->host = phys_mem_alloc(new_block->max_length,
                                             &new_block->mr->align);
            if (!new_block->host) {
                error_setg_errno(errp, errno,
                                 "cannot set up guest memory '%s'",
                                 memory_region_name(new_block->mr));
                qemu_mutex_unlock_ramlist();
                return;
            }
            memory_try_enable_merging(new_block->host, new_block->max_length);
        }
    }

    new_ram_size = MAX(old_ram_size,
              (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
    if (new_ram_size > old_ram_size) {
        migration_bitmap_extend(old_ram_size, new_ram_size);
        dirty_memory_extend(old_ram_size, new_ram_size);
    }
    /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
     * QLIST (which has an RCU-friendly variant) does not have insertion at
     * tail, so save the last element in last_block.
     */
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        last_block = block;
        if (block->max_length < new_block->max_length) {
            break;
        }
    }
    if (block) {
        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
    } else if (last_block) {
        QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
    } else { /* list is empty */
        QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
    }
    ram_list.mru_block = NULL;

    /* Write list before version */
    smp_wmb();
    ram_list.version++;
    qemu_mutex_unlock_ramlist();

    cpu_physical_memory_set_dirty_range(new_block->offset,
                                        new_block->used_length,
                                        DIRTY_CLIENTS_ALL);

    if (new_block->host) {
        qemu_ram_setup_dump(new_block->host, new_block->max_length);
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
        /* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
        ram_block_notify_add(new_block->host, new_block->max_length);
    }
}

#ifdef __linux__
RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
                                   bool share, const char *mem_path,
                                   Error **errp)
{
    RAMBlock *new_block;
    Error *local_err = NULL;

    if (xen_enabled()) {
        error_setg(errp, "-mem-path not supported with Xen");
        return NULL;
    }

    if (phys_mem_alloc != qemu_anon_ram_alloc) {
        /*
         * file_ram_alloc() needs to allocate just like
         * phys_mem_alloc, but we haven't bothered to provide
         * a hook there.
         */
        error_setg(errp,
                   "-mem-path not supported with this accelerator");
        return NULL;
    }

    size = HOST_PAGE_ALIGN(size);
    new_block = g_malloc0(sizeof(*new_block));
    new_block->mr = mr;
    new_block->used_length = size;
    new_block->max_length = size;
    new_block->flags = share ? RAM_SHARED : 0;
    new_block->host = file_ram_alloc(new_block, size,
                                     mem_path, errp);
    if (!new_block->host) {
        g_free(new_block);
        return NULL;
    }

    ram_block_add(new_block, &local_err);
    if (local_err) {
        g_free(new_block);
        error_propagate(errp, local_err);
        return NULL;
    }
    return new_block;
}
#endif

static
RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
                                  void (*resized)(const char*,
                                                  uint64_t length,
                                                  void *host),
                                  void *host, bool resizeable,
                                  MemoryRegion *mr, Error **errp)
{
    RAMBlock *new_block;
    Error *local_err = NULL;

    size = HOST_PAGE_ALIGN(size);
    max_size = HOST_PAGE_ALIGN(max_size);
    new_block = g_malloc0(sizeof(*new_block));
    new_block->mr = mr;
    new_block->resized = resized;
    new_block->used_length = size;
    new_block->max_length = max_size;
    assert(max_size >= size);
    new_block->fd = -1;
    new_block->page_size = getpagesize();
    new_block->host = host;
    if (host) {
        new_block->flags |= RAM_PREALLOC;
    }
    if (resizeable) {
        new_block->flags |= RAM_RESIZEABLE;
    }
    ram_block_add(new_block, &local_err);
    if (local_err) {
        g_free(new_block);
        error_propagate(errp, local_err);
        return NULL;
    }
    return new_block;
}

RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
                                   MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp);
}

RAMBlock *qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp);
}

RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
                                     void (*resized)(const char*,
                                                     uint64_t length,
                                                     void *host),
                                     MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp);
}
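/*
 * Example (hypothetical, for illustration only): a device model that wants
 * RAM able to grow up to four times its initial size might do
 *
 *     RAMBlock *rb = qemu_ram_alloc_resizeable(size, 4 * size,
 *                                              my_resized_cb, mr, &err);
 *
 * where my_resized_cb(idstr, new_length, host) is the device's own hook;
 * it is invoked from qemu_ram_resize() after used_length has been updated.
 */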

static void reclaim_ramblock(RAMBlock *block)
{
    if (block->flags & RAM_PREALLOC) {
        ;
    } else if (xen_enabled()) {
        xen_invalidate_map_cache_entry(block->host);
#ifndef _WIN32
    } else if (block->fd >= 0) {
        qemu_ram_munmap(block->host, block->max_length);
        close(block->fd);
#endif
    } else {
        qemu_anon_ram_free(block->host, block->max_length);
    }
    g_free(block);
}

void qemu_ram_free(RAMBlock *block)
{
    if (!block) {
        return;
    }

    if (block->host) {
        ram_block_notify_remove(block->host, block->max_length);
    }

    qemu_mutex_lock_ramlist();
    QLIST_REMOVE_RCU(block, next);
    ram_list.mru_block = NULL;
    /* Write list before version */
    smp_wmb();
    ram_list.version++;
    call_rcu(block, reclaim_ramblock, rcu);
    qemu_mutex_unlock_ramlist();
}
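/*
 * Note: qemu_ram_free() only unlinks the block and bumps the list version;
 * the host memory itself is released later by reclaim_ramblock(), once all
 * RCU readers that may still be walking ram_list.blocks have finished.
 */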

#ifndef _WIN32
void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
{
    RAMBlock *block;
    ram_addr_t offset;
    int flags;
    void *area, *vaddr;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        offset = addr - block->offset;
        if (offset < block->max_length) {
            vaddr = ramblock_ptr(block, offset);
            if (block->flags & RAM_PREALLOC) {
                ;
            } else if (xen_enabled()) {
                abort();
            } else if (xen_enabled()) {
                abort();
            } else {
                flags = MAP_FIXED;
                if (block->fd >= 0) {
                    flags |= (block->flags & RAM_SHARED ?
                              MAP_SHARED : MAP_PRIVATE);
                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
                                flags, block->fd, offset);
                } else {
                    /*
                     * Remap needs to match alloc.  Accelerators that
                     * set phys_mem_alloc never remap.  If they did,
                     * we'd need a remap hook here.
                     */
                    assert(phys_mem_alloc == qemu_anon_ram_alloc);

                    flags |= MAP_PRIVATE | MAP_ANONYMOUS;
                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
                                flags, -1, 0);
                }
                if (area != vaddr) {
                    fprintf(stderr, "Could not remap addr: "
                            RAM_ADDR_FMT "@" RAM_ADDR_FMT "\n",
                            length, addr);
                    exit(1);
                }
                memory_try_enable_merging(vaddr, length);
                qemu_ram_setup_dump(vaddr, length);
            }
        }
    }
}
#endif /* !_WIN32 */

/* Return a host pointer to ram allocated with qemu_ram_alloc.
 * This should not be used for general purpose DMA.  Use address_space_map
 * or address_space_rw instead. For local memory (e.g. video ram) that the
 * device owns, use memory_region_get_ram_ptr.
 *
 * Called within RCU critical section.
 */
void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
{
    RAMBlock *block = ram_block;

    if (block == NULL) {
        block = qemu_get_ram_block(addr);
        addr -= block->offset;
    }

    if (xen_enabled() && block->host == NULL) {
        /* We need to check if the requested address is in the RAM
         * because we don't want to map the entire memory in QEMU.
         * In that case just map until the end of the page.
         */
        if (block->offset == 0) {
            return xen_map_cache(addr, 0, 0);
        }

        block->host = xen_map_cache(block->offset, block->max_length, 1);
    }
    return ramblock_ptr(block, addr);
}

/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
 * but takes a size argument.
 *
 * Called within RCU critical section.
 */
static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
                                 hwaddr *size)
{
    RAMBlock *block = ram_block;
    if (*size == 0) {
        return NULL;
    }
    if (block == NULL) {
        block = qemu_get_ram_block(addr);
        addr -= block->offset;
    }
    *size = MIN(*size, block->max_length - addr);

    if (xen_enabled() && block->host == NULL) {
        /* We need to check if the requested address is in the RAM
         * because we don't want to map the entire memory in QEMU.
         * In that case just map the requested area.
         */
        if (block->offset == 0) {
            return xen_map_cache(addr, *size, 1);
        }

        block->host = xen_map_cache(block->offset, block->max_length, 1);
    }

    return ramblock_ptr(block, addr);
}

/*
 * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
 * in that RAMBlock.
 *
 * ptr: Host pointer to look up
 * round_offset: If true round the result offset down to a page boundary
 * *ram_addr: set to result ram_addr
 * *offset: set to result offset within the RAMBlock
 *
 * Returns: RAMBlock (or NULL if not found)
 *
 * By the time this function returns, the returned pointer is not protected
 * by RCU anymore.  If the caller is not within an RCU critical section and
 * does not hold the iothread lock, it must have other means of protecting the
 * pointer, such as a reference to the region that includes the incoming
 * ram_addr_t.
 */
RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
                                   ram_addr_t *offset)
{
    RAMBlock *block;
    uint8_t *host = ptr;

    if (xen_enabled()) {
        ram_addr_t ram_addr;
        rcu_read_lock();
        ram_addr = xen_ram_addr_from_mapcache(ptr);
        block = qemu_get_ram_block(ram_addr);
        if (block) {
            *offset = ram_addr - block->offset;
        }
        rcu_read_unlock();
        return block;
    }

    rcu_read_lock();
    block = atomic_rcu_read(&ram_list.mru_block);
    if (block && block->host && host - block->host < block->max_length) {
        goto found;
    }

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        /* This case happens when the block is not mapped. */
        if (block->host == NULL) {
            continue;
        }
        if (host - block->host < block->max_length) {
            goto found;
        }
    }

    rcu_read_unlock();
    return NULL;

found:
    *offset = (host - block->host);
    if (round_offset) {
        *offset &= TARGET_PAGE_MASK;
    }
    rcu_read_unlock();
    return block;
}

/*
 * Finds the named RAMBlock
 *
 * name: The name of RAMBlock to find
 *
 * Returns: RAMBlock (or NULL if not found)
 */
RAMBlock *qemu_ram_block_by_name(const char *name)
{
    RAMBlock *block;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        if (!strcmp(name, block->idstr)) {
            return block;
        }
    }

    return NULL;
}

/* Some of the softmmu routines need to translate from a host pointer
   (typically a TLB entry) back to a ram offset.  */
ram_addr_t qemu_ram_addr_from_host(void *ptr)
{
    RAMBlock *block;
    ram_addr_t offset;

    block = qemu_ram_block_from_host(ptr, false, &offset);
    if (!block) {
        return RAM_ADDR_INVALID;
    }

    return block->offset + offset;
}

/* Called within RCU critical section.  */
static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
                               uint64_t val, unsigned size)
{
    bool locked = false;

    if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
        locked = true;
        tb_lock();
        tb_invalidate_phys_page_fast(ram_addr, size);
    }
    switch (size) {
    case 1:
        stb_p(qemu_map_ram_ptr(NULL, ram_addr), val);
        break;
    case 2:
        stw_p(qemu_map_ram_ptr(NULL, ram_addr), val);
        break;
    case 4:
        stl_p(qemu_map_ram_ptr(NULL, ram_addr), val);
        break;
    default:
        abort();
    }

    if (locked) {
        tb_unlock();
    }

    /* Set both VGA and migration bits for simplicity and to remove
     * the notdirty callback faster.
     */
    cpu_physical_memory_set_dirty_range(ram_addr, size,
                                        DIRTY_CLIENTS_NOCODE);
    /* we remove the notdirty callback only if the code has been
       flushed */
    if (!cpu_physical_memory_is_clean(ram_addr)) {
        tlb_set_dirty(current_cpu, current_cpu->mem_io_vaddr);
    }
}

static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
                                 unsigned size, bool is_write)
{
    return is_write;
}

static const MemoryRegionOps notdirty_mem_ops = {
    .write = notdirty_mem_write,
    .valid.accepts = notdirty_mem_accepts,
    .endianness = DEVICE_NATIVE_ENDIAN,
};

/* Generate a debug exception if a watchpoint has been hit.  */
static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
{
    CPUState *cpu = current_cpu;
    CPUClass *cc = CPU_GET_CLASS(cpu);
    CPUArchState *env = cpu->env_ptr;
    target_ulong pc, cs_base;
    target_ulong vaddr;
    CPUWatchpoint *wp;
    uint32_t cpu_flags;

    if (cpu->watchpoint_hit) {
        /* We re-entered the check after replacing the TB. Now raise
         * the debug interrupt so that it will trigger after the
         * current instruction. */
        cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
        return;
    }
    vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset;
    vaddr = cc->adjust_watchpoint_address(cpu, vaddr, len);
    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
        if (cpu_watchpoint_address_matches(wp, vaddr, len)
            && (wp->flags & flags)) {
            if (flags == BP_MEM_READ) {
                wp->flags |= BP_WATCHPOINT_HIT_READ;
            } else {
                wp->flags |= BP_WATCHPOINT_HIT_WRITE;
            }
            wp->hitaddr = vaddr;
2134
            wp->hitattrs = attrs;
2135
            if (!cpu->watchpoint_hit) {
2136 2137 2138 2139 2140
                if (wp->flags & BP_CPU &&
                    !cc->debug_check_watchpoint(cpu, wp)) {
                    wp->flags &= ~BP_WATCHPOINT_HIT;
                    continue;
                }
2141
                cpu->watchpoint_hit = wp;
2142

2143 2144 2145
                /* Both tb_lock and iothread_mutex will be reset when
                 * cpu_loop_exit or cpu_loop_exit_noexc longjmp
                 * back into the cpu_exec main loop.
2146 2147
                 */
                tb_lock();
2148
                tb_check_watchpoint(cpu);
2149
                if (wp->flags & BP_STOP_BEFORE_ACCESS) {
2150
                    cpu->exception_index = EXCP_DEBUG;
2151
                    cpu_loop_exit(cpu);
2152 2153
                } else {
                    cpu_get_tb_cpu_state(env, &pc, &cs_base, &cpu_flags);
2154
                    tb_gen_code(cpu, pc, cs_base, cpu_flags, 1);
2155
                    cpu_loop_exit_noexc(cpu);
2156
                }
2157
            }
2158 2159
        } else {
            wp->flags &= ~BP_WATCHPOINT_HIT;
P
pbrook 已提交
2160 2161 2162 2163
        }
    }
}

2164 2165 2166
/* Watchpoint access routines.  Watchpoints are inserted using TLB tricks,
   so these check for a hit then pass through to the normal out-of-line
   phys routines.  */
2167 2168
static MemTxResult watch_mem_read(void *opaque, hwaddr addr, uint64_t *pdata,
                                  unsigned size, MemTxAttrs attrs)
2169
{
2170 2171
    MemTxResult res;
    uint64_t data;
2172 2173
    int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
    AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2174 2175

    check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_READ);
2176
    switch (size) {
2177
    case 1:
2178
        data = address_space_ldub(as, addr, attrs, &res);
2179 2180
        break;
    case 2:
2181
        data = address_space_lduw(as, addr, attrs, &res);
2182 2183
        break;
    case 4:
2184
        data = address_space_ldl(as, addr, attrs, &res);
2185
        break;
2186 2187
    default: abort();
    }
2188 2189
    *pdata = data;
    return res;
2190 2191
}

2192 2193 2194
static MemTxResult watch_mem_write(void *opaque, hwaddr addr,
                                   uint64_t val, unsigned size,
                                   MemTxAttrs attrs)
2195
{
2196
    MemTxResult res;
2197 2198
    int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
    AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2199 2200

    check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_WRITE);
2201
    switch (size) {
2202
    case 1:
2203
        address_space_stb(as, addr, val, attrs, &res);
2204 2205
        break;
    case 2:
2206
        address_space_stw(as, addr, val, attrs, &res);
2207 2208
        break;
    case 4:
2209
        address_space_stl(as, addr, val, attrs, &res);
2210
        break;
2211 2212
    default: abort();
    }
2213
    return res;
2214 2215
}

2216
static const MemoryRegionOps watch_mem_ops = {
2217 2218
    .read_with_attrs = watch_mem_read,
    .write_with_attrs = watch_mem_write,
2219
    .endianness = DEVICE_NATIVE_ENDIAN,
2220 2221
};

2222 2223
static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
                                unsigned len, MemTxAttrs attrs)
2224
{
2225
    subpage_t *subpage = opaque;
2226
    uint8_t buf[8];
2227
    MemTxResult res;
2228

2229
#if defined(DEBUG_SUBPAGE)
A
Amos Kong 已提交
2230
    printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
2231
           subpage, len, addr);
2232
#endif
2233 2234 2235 2236
    res = address_space_read(subpage->as, addr + subpage->base,
                             attrs, buf, len);
    if (res) {
        return res;
2237
    }
2238 2239
    switch (len) {
    case 1:
2240 2241
        *data = ldub_p(buf);
        return MEMTX_OK;
2242
    case 2:
2243 2244
        *data = lduw_p(buf);
        return MEMTX_OK;
2245
    case 4:
2246 2247
        *data = ldl_p(buf);
        return MEMTX_OK;
2248
    case 8:
2249 2250
        *data = ldq_p(buf);
        return MEMTX_OK;
2251 2252 2253
    default:
        abort();
    }
2254 2255
}

2256 2257
static MemTxResult subpage_write(void *opaque, hwaddr addr,
                                 uint64_t value, unsigned len, MemTxAttrs attrs)
2258
{
2259
    subpage_t *subpage = opaque;
2260
    uint8_t buf[8];
2261

2262
#if defined(DEBUG_SUBPAGE)
A
Amos Kong 已提交
2263
    printf("%s: subpage %p len %u addr " TARGET_FMT_plx
2264 2265
           " value %"PRIx64"\n",
           __func__, subpage, len, addr, value);
2266
#endif
2267 2268 2269 2270 2271 2272 2273 2274 2275 2276
    switch (len) {
    case 1:
        stb_p(buf, value);
        break;
    case 2:
        stw_p(buf, value);
        break;
    case 4:
        stl_p(buf, value);
        break;
2277 2278 2279
    case 8:
        stq_p(buf, value);
        break;
2280 2281 2282
    default:
        abort();
    }
2283 2284
    return address_space_write(subpage->as, addr + subpage->base,
                               attrs, buf, len);
2285 2286
}

2287
static bool subpage_accepts(void *opaque, hwaddr addr,
A
Amos Kong 已提交
2288
                            unsigned len, bool is_write)
2289
{
2290
    subpage_t *subpage = opaque;
2291
#if defined(DEBUG_SUBPAGE)
A
Amos Kong 已提交
2292
    printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
2293
           __func__, subpage, is_write ? 'w' : 'r', len, addr);
2294 2295
#endif

2296
    return address_space_access_valid(subpage->as, addr + subpage->base,
A
Amos Kong 已提交
2297
                                      len, is_write);
2298 2299
}

2300
static const MemoryRegionOps subpage_ops = {
2301 2302
    .read_with_attrs = subpage_read,
    .write_with_attrs = subpage_write,
2303 2304 2305 2306
    .impl.min_access_size = 1,
    .impl.max_access_size = 8,
    .valid.min_access_size = 1,
    .valid.max_access_size = 8,
2307
    .valid.accepts = subpage_accepts,
2308
    .endianness = DEVICE_NATIVE_ENDIAN,
2309 2310
};

A
static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
                             uint16_t section)
    int idx, eidx;

    if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
        return -1;
    idx = SUBPAGE_IDX(start);
    eidx = SUBPAGE_IDX(end);
#if defined(DEBUG_SUBPAGE)
    printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
           __func__, mmio, start, end, idx, eidx, section);
#endif
    for (; idx <= eidx; idx++) {
        mmio->sub_section[idx] = section;
    }

    return 0;
}

static subpage_t *subpage_init(AddressSpace *as, hwaddr base)
{
    subpage_t *mmio;

    mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
    mmio->as = as;
    mmio->base = base;
    memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
                          NULL, TARGET_PAGE_SIZE);
    mmio->iomem.subpage = true;
#if defined(DEBUG_SUBPAGE)
    printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
           mmio, base, TARGET_PAGE_SIZE);
#endif
    subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, PHYS_SECTION_UNASSIGNED);

    return mmio;
}
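/*
 * A subpage splits one target page into per-byte sections so that several
 * memory regions smaller than TARGET_PAGE_SIZE can share the page: accesses
 * are routed through subpage_ops back into address_space_read/write() with
 * the subpage base added, and subpage_register() records which section
 * handles each byte offset range within the page.
 */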

static uint16_t dummy_section(PhysPageMap *map, AddressSpace *as,
                              MemoryRegion *mr)
{
    assert(as);
    MemoryRegionSection section = {
        .address_space = as,
        .mr = mr,
        .offset_within_address_space = 0,
        .offset_within_region = 0,
        .size = int128_2_64(),
    };

    return phys_section_add(map, &section);
}

MemoryRegion *iotlb_to_region(CPUState *cpu, hwaddr index, MemTxAttrs attrs)
{
    int asidx = cpu_asidx_from_attrs(cpu, attrs);
    CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
    AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
    MemoryRegionSection *sections = d->map.sections;

    return sections[index & ~TARGET_PAGE_MASK].mr;
}

static void io_mem_init(void)
{
    memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
    memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
                          NULL, UINT64_MAX);

    /* io_mem_notdirty calls tb_invalidate_phys_page_fast,
     * which can be called without the iothread mutex.
     */
    memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
                          NULL, UINT64_MAX);
    memory_region_clear_global_locking(&io_mem_notdirty);

    memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
                          NULL, UINT64_MAX);
}

static void mem_begin(MemoryListener *listener)
{
    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
    AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
    uint16_t n;

    n = dummy_section(&d->map, as, &io_mem_unassigned);
    assert(n == PHYS_SECTION_UNASSIGNED);
    n = dummy_section(&d->map, as, &io_mem_notdirty);
    assert(n == PHYS_SECTION_NOTDIRTY);
    n = dummy_section(&d->map, as, &io_mem_rom);
    assert(n == PHYS_SECTION_ROM);
    n = dummy_section(&d->map, as, &io_mem_watch);
    assert(n == PHYS_SECTION_WATCH);

    d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
    d->as = as;
    as->next_dispatch = d;
}

static void address_space_dispatch_free(AddressSpaceDispatch *d)
{
    phys_sections_free(&d->map);
    g_free(d);
}

static void mem_commit(MemoryListener *listener)
{
    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
    AddressSpaceDispatch *cur = as->dispatch;
    AddressSpaceDispatch *next = as->next_dispatch;

    phys_page_compact_all(next, next->map.nodes_nb);

    atomic_rcu_set(&as->dispatch, next);
    if (cur) {
        call_rcu(cur, address_space_dispatch_free, rcu);
    }
}

static void tcg_commit(MemoryListener *listener)
{
    CPUAddressSpace *cpuas;
    AddressSpaceDispatch *d;

    /* since each CPU stores ram addresses in its TLB cache, we must
       reset the modified entries */
    cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
    cpu_reloading_memory_map();
    /* The CPU and TLB are protected by the iothread lock.
     * We reload the dispatch pointer now because cpu_reloading_memory_map()
     * may have split the RCU critical section.
     */
    d = atomic_rcu_read(&cpuas->as->dispatch);
    atomic_rcu_set(&cpuas->memory_dispatch, d);
    tlb_flush(cpuas->cpu);
}

void address_space_init_dispatch(AddressSpace *as)
{
    as->dispatch = NULL;
    as->dispatch_listener = (MemoryListener) {
        .begin = mem_begin,
        .commit = mem_commit,
        .region_add = mem_add,
        .region_nop = mem_add,
        .priority = 0,
    };
    memory_listener_register(&as->dispatch_listener, as);
}

void address_space_unregister(AddressSpace *as)
{
    memory_listener_unregister(&as->dispatch_listener);
}

void address_space_destroy_dispatch(AddressSpace *as)
{
    AddressSpaceDispatch *d = as->dispatch;

    atomic_rcu_set(&as->dispatch, NULL);
    if (d) {
        call_rcu(d, address_space_dispatch_free, rcu);
    }
}

static void memory_map_init(void)
{
    system_memory = g_malloc(sizeof(*system_memory));

    memory_region_init(system_memory, NULL, "system", UINT64_MAX);
    address_space_init(&address_space_memory, system_memory, "memory");

    system_io = g_malloc(sizeof(*system_io));
    memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
                          65536);
    address_space_init(&address_space_io, system_io, "I/O");
}

MemoryRegion *get_system_memory(void)
{
    return system_memory;
}

MemoryRegion *get_system_io(void)
{
    return system_io;
}

#endif /* !defined(CONFIG_USER_ONLY) */

/* physical memory access (slow version, mainly for debug) */
#if defined(CONFIG_USER_ONLY)
int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
                        uint8_t *buf, int len, int is_write)
{
    int l, flags;
    target_ulong page;
    void * p;

    while (len > 0) {
        page = addr & TARGET_PAGE_MASK;
        l = (page + TARGET_PAGE_SIZE) - addr;
        if (l > len)
            l = len;
        flags = page_get_flags(page);
        if (!(flags & PAGE_VALID))
            return -1;
        if (is_write) {
            if (!(flags & PAGE_WRITE))
                return -1;
            /* XXX: this code should not depend on lock_user */
            if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
                return -1;
            memcpy(p, buf, l);
            unlock_user(p, addr, l);
bellard 已提交
2528 2529
        } else {
            if (!(flags & PAGE_READ))
P
Paul Brook 已提交
2530
                return -1;
2531
            /* XXX: this code should not depend on lock_user */
A
aurel32 已提交
2532
            if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
P
Paul Brook 已提交
2533
                return -1;
A
aurel32 已提交
2534
            memcpy(buf, p, l);
A
aurel32 已提交
2535
            unlock_user(p, addr, 0);
B
bellard 已提交
2536 2537 2538 2539 2540
        }
        len -= l;
        buf += l;
        addr += l;
    }
P
Paul Brook 已提交
2541
    return 0;
B
bellard 已提交
2542
}
B
bellard 已提交
2543

B
bellard 已提交
2544
#else
2545

2546
static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
A
Avi Kivity 已提交
2547
                                     hwaddr length)
2548
{
2549
    uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
2550 2551
    addr += memory_region_get_ram_addr(mr);

2552 2553 2554 2555 2556 2557 2558 2559 2560
    /* No early return if dirty_log_mask is or becomes 0, because
     * cpu_physical_memory_set_dirty_range will still call
     * xen_modified_memory.
     */
    if (dirty_log_mask) {
        dirty_log_mask =
            cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
    }
    if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
2561
        tb_lock();
2562
        tb_invalidate_phys_range(addr, addr + length);
2563
        tb_unlock();
2564
        dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
2565
    }
2566
    cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
2567 2568
}

2569
static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
2570
{
2571
    unsigned access_size_max = mr->ops->valid.max_access_size;
2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584

    /* Regions are assumed to support 1-4 byte accesses unless
       otherwise specified.  */
    if (access_size_max == 0) {
        access_size_max = 4;
    }

    /* Bound the maximum access by the alignment of the address.  */
    if (!mr->ops->impl.unaligned) {
        unsigned align_size_max = addr & -addr;
        if (align_size_max != 0 && align_size_max < access_size_max) {
            access_size_max = align_size_max;
        }
2585
    }
2586 2587 2588 2589

    /* Don't attempt accesses larger than the maximum.  */
    if (l > access_size_max) {
        l = access_size_max;
2590
    }
2591
    l = pow2floor(l);
2592 2593

    return l;
2594 2595
}

2596
static bool prepare_mmio_access(MemoryRegion *mr)
2597
{
2598 2599 2600 2601 2602 2603 2604 2605
    bool unlocked = !qemu_mutex_iothread_locked();
    bool release_lock = false;

    if (unlocked && mr->global_locking) {
        qemu_mutex_lock_iothread();
        unlocked = false;
        release_lock = true;
    }
2606
    if (mr->flush_coalesced_mmio) {
2607 2608 2609
        if (unlocked) {
            qemu_mutex_lock_iothread();
        }
2610
        qemu_flush_coalesced_mmio_buffer();
2611 2612 2613
        if (unlocked) {
            qemu_mutex_unlock_iothread();
        }
2614
    }
2615 2616

    return release_lock;
2617 2618
}

2619 2620 2621 2622 2623 2624
/* Called within RCU critical section.  */
static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr,
                                                MemTxAttrs attrs,
                                                const uint8_t *buf,
                                                int len, hwaddr addr1,
                                                hwaddr l, MemoryRegion *mr)
B
bellard 已提交
2625 2626
{
    uint8_t *ptr;
2627
    uint64_t val;
2628
    MemTxResult result = MEMTX_OK;
2629
    bool release_lock = false;
2630

2631
    for (;;) {
2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645
        if (!memory_access_is_direct(mr, true)) {
            release_lock |= prepare_mmio_access(mr);
            l = memory_access_size(mr, l, addr1);
            /* XXX: could force current_cpu to NULL to avoid
               potential bugs */
            switch (l) {
            case 8:
                /* 64 bit write access */
                val = ldq_p(buf);
                result |= memory_region_dispatch_write(mr, addr1, val, 8,
                                                       attrs);
                break;
            case 4:
                /* 32 bit write access */
2646
                val = (uint32_t)ldl_p(buf);
2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663
                result |= memory_region_dispatch_write(mr, addr1, val, 4,
                                                       attrs);
                break;
            case 2:
                /* 16 bit write access */
                val = lduw_p(buf);
                result |= memory_region_dispatch_write(mr, addr1, val, 2,
                                                       attrs);
                break;
            case 1:
                /* 8 bit write access */
                val = ldub_p(buf);
                result |= memory_region_dispatch_write(mr, addr1, val, 1,
                                                       attrs);
                break;
            default:
                abort();
B
bellard 已提交
2664 2665
            }
        } else {
2666
            /* RAM case */
2667
            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
2668 2669
            memcpy(ptr, buf, l);
            invalidate_and_set_dirty(mr, addr1, l);
B
bellard 已提交
2670
        }
2671 2672 2673 2674 2675 2676

        if (release_lock) {
            qemu_mutex_unlock_iothread();
            release_lock = false;
        }

B
bellard 已提交
2677 2678 2679
        len -= l;
        buf += l;
        addr += l;
2680 2681 2682 2683 2684 2685 2686

        if (!len) {
            break;
        }

        l = len;
        mr = address_space_translate(as, addr, &addr1, &l, true);
B
bellard 已提交
2687
    }
2688

2689
    return result;
B
bellard 已提交
2690
}
B
bellard 已提交
2691

2692 2693
MemTxResult address_space_write(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
                                const uint8_t *buf, int len)
A
Avi Kivity 已提交
2694
{
2695 2696 2697 2698 2699
    hwaddr l;
    hwaddr addr1;
    MemoryRegion *mr;
    MemTxResult result = MEMTX_OK;

2700 2701
    if (len > 0) {
        rcu_read_lock();
2702
        l = len;
2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721
        mr = address_space_translate(as, addr, &addr1, &l, true);
        result = address_space_write_continue(as, addr, attrs, buf, len,
                                              addr1, l, mr);
        rcu_read_unlock();
    }

    return result;
}

/* Called within RCU critical section.  */
MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr,
                                        MemTxAttrs attrs, uint8_t *buf,
                                        int len, hwaddr addr1, hwaddr l,
                                        MemoryRegion *mr)
{
    uint8_t *ptr;
    uint64_t val;
    MemTxResult result = MEMTX_OK;
    bool release_lock = false;

    for (;;) {
        if (!memory_access_is_direct(mr, false)) {
            /* I/O case */
            release_lock |= prepare_mmio_access(mr);
            l = memory_access_size(mr, l, addr1);
            switch (l) {
            case 8:
                /* 64 bit read access */
                result |= memory_region_dispatch_read(mr, addr1, &val, 8,
                                                      attrs);
                stq_p(buf, val);
                break;
            case 4:
                /* 32 bit read access */
                result |= memory_region_dispatch_read(mr, addr1, &val, 4,
                                                      attrs);
                stl_p(buf, val);
                break;
            case 2:
                /* 16 bit read access */
                result |= memory_region_dispatch_read(mr, addr1, &val, 2,
                                                      attrs);
                stw_p(buf, val);
                break;
            case 1:
                /* 8 bit read access */
                result |= memory_region_dispatch_read(mr, addr1, &val, 1,
                                                      attrs);
                stb_p(buf, val);
                break;
            default:
                abort();
            }
        } else {
            /* RAM case */
            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
            memcpy(buf, ptr, l);
        }

        if (release_lock) {
            qemu_mutex_unlock_iothread();
            release_lock = false;
        }

        len -= l;
        buf += l;
        addr += l;

        if (!len) {
            break;
        }

        l = len;
        mr = address_space_translate(as, addr, &addr1, &l, false);
    }

    return result;
}

MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
                                    MemTxAttrs attrs, uint8_t *buf, int len)
{
    hwaddr l;
    hwaddr addr1;
    MemoryRegion *mr;
    MemTxResult result = MEMTX_OK;

    if (len > 0) {
        rcu_read_lock();
        l = len;
        mr = address_space_translate(as, addr, &addr1, &l, false);
        result = address_space_read_continue(as, addr, attrs, buf, len,
                                             addr1, l, mr);
        rcu_read_unlock();
    }

    return result;
}

MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
                             uint8_t *buf, int len, bool is_write)
{
    if (is_write) {
        return address_space_write(as, addr, attrs, (uint8_t *)buf, len);
    } else {
        return address_space_read(as, addr, attrs, (uint8_t *)buf, len);
    }
}
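/*
 * Usage sketch (hypothetical caller): device or monitor code that just
 * needs a simple read of guest physical memory can do
 *
 *     uint8_t buf[4];
 *     MemTxResult r = address_space_rw(&address_space_memory, gpa,
 *                                      MEMTXATTRS_UNSPECIFIED, buf,
 *                                      sizeof(buf), false);
 *
 * and check r against MEMTX_OK; callers that model secure or per-CPU
 * attributes can pass something more specific than UNSPECIFIED.
 */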

void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
                            int len, int is_write)
{
    address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
                     buf, len, is_write);
}

enum write_rom_type {
    WRITE_DATA,
    FLUSH_CACHE,
};

2824
static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
2825
    hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
{
2827
    hwaddr l;
    uint8_t *ptr;
2829
    hwaddr addr1;
2830
    MemoryRegion *mr;
2831

2832
    rcu_read_lock();
    while (len > 0) {
2834
        l = len;
2835
        mr = address_space_translate(as, addr, &addr1, &l, true);
2836

2837 2838
        if (!(memory_region_is_ram(mr) ||
              memory_region_is_romd(mr))) {
2839
            l = memory_access_size(mr, l, addr1);
        } else {
            /* ROM/RAM case */
2842
            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
            switch (type) {
            case WRITE_DATA:
                memcpy(ptr, buf, l);
2846
                invalidate_and_set_dirty(mr, addr1, l);
                break;
            case FLUSH_CACHE:
                flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
                break;
            }
        }
        len -= l;
        buf += l;
        addr += l;
    }
2857
    rcu_read_unlock();
}

2860
/* used for ROM loading : can write in RAM and ROM */
2861
void cpu_physical_memory_write_rom(AddressSpace *as, hwaddr addr,
2862 2863
                                   const uint8_t *buf, int len)
{
2864
    cpu_physical_memory_write_rom_internal(as, addr, buf, len, WRITE_DATA);
}

void cpu_flush_icache_range(hwaddr start, int len)
{
    /*
     * This function should do the same thing as an icache flush that was
     * triggered from within the guest. For TCG we are always cache coherent,
     * so there is no need to flush anything. For KVM / Xen we need to flush
     * the host's instruction cache at least.
     */
    if (tcg_enabled()) {
        return;
    }

2879 2880
    cpu_physical_memory_write_rom_internal(&address_space_memory,
                                           start, NULL, len, FLUSH_CACHE);
2881 2882
}

2883
typedef struct {
2884
    MemoryRegion *mr;
2885
    void *buffer;
    hwaddr addr;
    hwaddr len;
    bool in_use;
} BounceBuffer;

static BounceBuffer bounce;

2893
typedef struct MapClient {
2894
    QEMUBH *bh;
    QLIST_ENTRY(MapClient) link;
2896 2897
} MapClient;

2898
QemuMutex map_client_list_lock;
static QLIST_HEAD(map_client_list, MapClient) map_client_list
    = QLIST_HEAD_INITIALIZER(map_client_list);
2901

static void cpu_unregister_map_client_do(MapClient *client)
{
    QLIST_REMOVE(client, link);
    g_free(client);
}

static void cpu_notify_map_clients_locked(void)
{
    MapClient *client;

    while (!QLIST_EMPTY(&map_client_list)) {
        client = QLIST_FIRST(&map_client_list);
2914 2915
        qemu_bh_schedule(client->bh);
        cpu_unregister_map_client_do(client);
2916 2917 2918
    }
}

2919
void cpu_register_map_client(QEMUBH *bh)
2920
{
2921
    MapClient *client = g_malloc(sizeof(*client));
2922

2923
    qemu_mutex_lock(&map_client_list_lock);
2924
    client->bh = bh;
    QLIST_INSERT_HEAD(&map_client_list, client, link);
    if (!atomic_read(&bounce.in_use)) {
        cpu_notify_map_clients_locked();
    }
2929
    qemu_mutex_unlock(&map_client_list_lock);
2930 2931
}

2932
void cpu_exec_init_all(void)
2933
{
2934
    qemu_mutex_init(&ram_list.mutex);
    /* The data structures we set up here depend on knowing the page size,
     * so no more changes can be made after this point.
     * In an ideal world, nothing we did before we had finished the
     * machine setup would care about the target page size, and we could
     * do this much later, rather than requiring board models to state
     * up front what their requirements are.
     */
    finalize_target_page_bits();
2943
    io_mem_init();
2944
    memory_map_init();
2945
    qemu_mutex_init(&map_client_list_lock);
2946 2947
}

2948
void cpu_unregister_map_client(QEMUBH *bh)
{
    MapClient *client;

    qemu_mutex_lock(&map_client_list_lock);
    QLIST_FOREACH(client, &map_client_list, link) {
        if (client->bh == bh) {
            cpu_unregister_map_client_do(client);
            break;
        }
2958
    }
2959
    qemu_mutex_unlock(&map_client_list_lock);
}

static void cpu_notify_map_clients(void)
{
2964
    qemu_mutex_lock(&map_client_list_lock);
2965
    cpu_notify_map_clients_locked();
2966
    qemu_mutex_unlock(&map_client_list_lock);
2967 2968
}

2969 2970
bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_write)
{
2971
    MemoryRegion *mr;
2972 2973
    hwaddr l, xlat;

2974
    rcu_read_lock();
2975 2976
    while (len > 0) {
        l = len;
        mr = address_space_translate(as, addr, &xlat, &l, is_write);
        if (!memory_access_is_direct(mr, is_write)) {
            l = memory_access_size(mr, l, addr);
            if (!memory_region_access_valid(mr, xlat, l, is_write)) {
                rcu_read_unlock();
                return false;
            }
        }

        len -= l;
        addr += l;
    }
2989
    rcu_read_unlock();
    return true;
}

static hwaddr
address_space_extend_translation(AddressSpace *as, hwaddr addr, hwaddr target_len,
                                 MemoryRegion *mr, hwaddr base, hwaddr len,
                                 bool is_write)
{
    hwaddr done = 0;
    hwaddr xlat;
    MemoryRegion *this_mr;

    for (;;) {
        target_len -= len;
        addr += len;
        done += len;
        if (target_len == 0) {
            return done;
        }

        len = target_len;
        this_mr = address_space_translate(as, addr, &xlat, &len, is_write);
        if (this_mr != mr || xlat != base + done) {
            return done;
        }
    }
}

/* Map a physical memory region into a host virtual address.
 * May map a subset of the requested range, given by and returned in *plen.
 * May return NULL if resources needed to perform the mapping are exhausted.
 * Use only for reads OR writes - not for read-modify-write operations.
3022 3023
 * Use cpu_register_map_client() to know when retrying the map operation is
 * likely to succeed.
3024
 */
void *address_space_map(AddressSpace *as,
                        hwaddr addr,
                        hwaddr *plen,
                        bool is_write)
3029
{
    hwaddr len = *plen;
3031 3032
    hwaddr l, xlat;
    MemoryRegion *mr;
3033
    void *ptr;
3034

    if (len == 0) {
        return NULL;
    }
3038

3039
    l = len;
3040
    rcu_read_lock();
3041
    mr = address_space_translate(as, addr, &xlat, &l, is_write);
3042

3043
    if (!memory_access_is_direct(mr, is_write)) {
        if (atomic_xchg(&bounce.in_use, true)) {
3045
            rcu_read_unlock();
3046
            return NULL;
3047
        }
        /* Avoid unbounded allocations */
        l = MIN(l, TARGET_PAGE_SIZE);
        bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
3051 3052
        bounce.addr = addr;
        bounce.len = l;

        memory_region_ref(mr);
        bounce.mr = mr;
3056
        if (!is_write) {
3057 3058
            address_space_read(as, addr, MEMTXATTRS_UNSPECIFIED,
                               bounce.buffer, l);
3059
        }
3060

3061
        rcu_read_unlock();
        *plen = l;
        return bounce.buffer;
    }


3067
    memory_region_ref(mr);
3068 3069
    *plen = address_space_extend_translation(as, addr, len, mr, xlat, l, is_write);
    ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen);
    rcu_read_unlock();

    return ptr;
3073 3074
}

/* Unmaps a memory region previously mapped by address_space_map().
 * Will also mark the memory as dirty if is_write == 1.  access_len gives
 * the amount of memory that was actually read or written by the caller.
 */
void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
                         int is_write, hwaddr access_len)
3081 3082
{
    if (buffer != bounce.buffer) {
        MemoryRegion *mr;
        ram_addr_t addr1;

3086
        mr = memory_region_from_host(buffer, &addr1);
3087
        assert(mr != NULL);
3088
        if (is_write) {
3089
            invalidate_and_set_dirty(mr, addr1, access_len);
3090
        }
3091
        if (xen_enabled()) {
            xen_invalidate_map_cache_entry(buffer);
        }
3094
        memory_region_unref(mr);
        return;
    }
    if (is_write) {
3098 3099
        address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
                            bounce.buffer, access_len);
3100
    }
3101
    qemu_vfree(bounce.buffer);
3102
    bounce.buffer = NULL;
3103
    memory_region_unref(bounce.mr);
    atomic_mb_set(&bounce.in_use, false);
3105
    cpu_notify_map_clients();
3106
}
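/*
 * Usage sketch (hypothetical DMA path): map, operate on the host pointer,
 * then unmap with the number of bytes actually touched:
 *
 *     hwaddr maplen = len;
 *     void *p = address_space_map(as, addr, &maplen, true);
 *     if (p) {
 *         // fill up to maplen bytes here
 *         address_space_unmap(as, p, maplen, true, written);
 *     }
 *
 * Only one bounce buffer exists, so when the target is not direct RAM a
 * second concurrent mapping may fail; cpu_register_map_client() is the
 * retry mechanism for that case.
 */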

void *cpu_physical_memory_map(hwaddr addr,
                              hwaddr *plen,
                              int is_write)
{
    return address_space_map(&address_space_memory, addr, plen, is_write);
}

void cpu_physical_memory_unmap(void *buffer, hwaddr len,
                               int is_write, hwaddr access_len)
{
    return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
}

#define ARG1_DECL                AddressSpace *as
#define ARG1                     as
#define SUFFIX
#define TRANSLATE(...)           address_space_translate(as, __VA_ARGS__)
#define IS_DIRECT(mr, is_write)  memory_access_is_direct(mr, is_write)
#define MAP_RAM(mr, ofs)         qemu_map_ram_ptr((mr)->ram_block, ofs)
#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
#define RCU_READ_LOCK(...)       rcu_read_lock()
#define RCU_READ_UNLOCK(...)     rcu_read_unlock()
#include "memory_ldst.inc.c"

int64_t address_space_cache_init(MemoryRegionCache *cache,
                                 AddressSpace *as,
                                 hwaddr addr,
                                 hwaddr len,
                                 bool is_write)
{
    hwaddr l, xlat;
    MemoryRegion *mr;
    void *ptr;

    assert(len > 0);

    l = len;
    mr = address_space_translate(as, addr, &xlat, &l, is_write);
    if (!memory_access_is_direct(mr, is_write)) {
        return -EINVAL;
    }

    l = address_space_extend_translation(as, addr, len, mr, xlat, l, is_write);
    ptr = qemu_ram_ptr_length(mr->ram_block, xlat, &l);

    cache->xlat = xlat;
    cache->is_write = is_write;
    cache->mr = mr;
    cache->ptr = ptr;
    cache->len = l;
    memory_region_ref(cache->mr);

    return l;
}
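/*
 * Usage sketch (hypothetical caller, e.g. a virtio-style ring walker): set
 * the cache up once over a fixed guest range and then use the *_cached
 * accessors generated below, which skip the per-access translation:
 *
 *     MemoryRegionCache cache;
 *     int64_t n = address_space_cache_init(&cache, as, gpa, len, false);
 *     if (n >= 0) {
 *         // read through the cache with the *_cached helpers, up to n bytes
 *         address_space_cache_destroy(&cache);
 *     }
 *
 * address_space_cache_invalidate() is only needed after writes through a
 * cache that was initialized with is_write == true.
 */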

void address_space_cache_invalidate(MemoryRegionCache *cache,
                                    hwaddr addr,
                                    hwaddr access_len)
{
    assert(cache->is_write);
    invalidate_and_set_dirty(cache->mr, addr + cache->xlat, access_len);
}

void address_space_cache_destroy(MemoryRegionCache *cache)
{
    if (!cache->mr) {
        return;
    }

    if (xen_enabled()) {
        xen_invalidate_map_cache_entry(cache->ptr);
    }
    memory_region_unref(cache->mr);
3181
    cache->mr = NULL;
P

/* Called from RCU critical section.  This function has the same
 * semantics as address_space_translate, but it only works on a
 * predefined range of a MemoryRegion that was mapped with
 * address_space_cache_init.
 */
static inline MemoryRegion *address_space_translate_cached(
    MemoryRegionCache *cache, hwaddr addr, hwaddr *xlat,
    hwaddr *plen, bool is_write)
{
    assert(addr < cache->len && *plen <= cache->len - addr);
    *xlat = addr + cache->xlat;
    return cache->mr;
}

#define ARG1_DECL                MemoryRegionCache *cache
#define ARG1                     cache
#define SUFFIX                   _cached
#define TRANSLATE(...)           address_space_translate_cached(cache, __VA_ARGS__)
#define IS_DIRECT(mr, is_write)  true
#define MAP_RAM(mr, ofs)         (cache->ptr + (ofs - cache->xlat))
#define INVALIDATE(mr, ofs, len) ((void)0)
#define RCU_READ_LOCK()          ((void)0)
#define RCU_READ_UNLOCK()        ((void)0)
#include "memory_ldst.inc.c"
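
/*
 * Illustrative sketch of the whole MemoryRegionCache life cycle using the
 * _cached accessors generated above: translate a guest window once, then
 * issue cheap loads/stores into it.  The window size and field offsets are
 * hypothetical (virtio's vring accessors follow roughly this shape).
 * Kept under #if 0 so it is not built.
 */
#if 0
static void example_use_cache(AddressSpace *as, hwaddr base)
{
    MemoryRegionCache cache;
    MemTxResult res;
    int64_t len;
    uint32_t v;

    len = address_space_cache_init(&cache, as, base, 64, true);
    if (len < 64) {
        if (len >= 0) {
            address_space_cache_destroy(&cache);    /* partial window */
        }
        return;
    }
    /* read a 32-bit field at offset 0, write one back at offset 4 */
    v = address_space_ldl_cached(&cache, 0, MEMTXATTRS_UNSPECIFIED, &res);
    address_space_stl_cached(&cache, 4, v + 1, MEMTXATTRS_UNSPECIFIED, &res);
    /* the _cached stores skip dirty tracking, so flag what was written */
    address_space_cache_invalidate(&cache, 4, sizeof(uint32_t));
    address_space_cache_destroy(&cache);
}
#endif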

/* virtual memory access for debug (includes writing to ROM) */
int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
                        uint8_t *buf, int len, int is_write)
{
    int l;
    hwaddr phys_addr;
    target_ulong page;

    while (len > 0) {
        int asidx;
        MemTxAttrs attrs;

        page = addr & TARGET_PAGE_MASK;
        phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
        asidx = cpu_asidx_from_attrs(cpu, attrs);
        /* if no physical page mapped, return an error */
        if (phys_addr == -1)
            return -1;
        l = (page + TARGET_PAGE_SIZE) - addr;
        if (l > len)
            l = len;
        phys_addr += (addr & ~TARGET_PAGE_MASK);
        if (is_write) {
            cpu_physical_memory_write_rom(cpu->cpu_ases[asidx].as,
                                          phys_addr, buf, l);
        } else {
            address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
                             MEMTXATTRS_UNSPECIFIED,
                             buf, l, 0);
        }
        len -= l;
        buf += l;
        addr += l;
    }
    return 0;
}
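
/*
 * Illustrative sketch: this is the accessor the gdbstub and the monitor use
 * to peek at guest memory through the CPU's own address space.  The wrapper
 * below and its fallback value are hypothetical.  Kept under #if 0 so it is
 * not built.
 */
#if 0
static uint32_t example_debug_read_u32(CPUState *cpu, target_ulong vaddr)
{
    uint32_t val = 0;

    /* is_write == 0: copy guest memory into the local buffer */
    if (cpu_memory_rw_debug(cpu, vaddr, (uint8_t *)&val, sizeof(val), 0) < 0) {
        return 0;               /* unmapped page; 0 is an arbitrary fallback */
    }
    return val;
}
#endif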

/*
 * Allows code that needs to deal with migration bitmaps etc. to still be
 * built target-independent.
 */
size_t qemu_target_page_bits(void)
{
    return TARGET_PAGE_BITS;
}
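
/*
 * Illustrative sketch: target-independent code (migration, for instance)
 * can derive the page size and an alignment mask from this accessor instead
 * of using TARGET_PAGE_SIZE directly.  Helper names are hypothetical.
 * Kept under #if 0 so it is not built.
 */
#if 0
static size_t example_target_page_size(void)
{
    return (size_t)1 << qemu_target_page_bits();
}

static uint64_t example_target_page_align_down(uint64_t addr)
{
    return addr & ~((uint64_t)example_target_page_size() - 1);
}
#endif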

#endif

/*
 * A helper function for the _utterly broken_ virtio device model to find out if
 * it's running on a big endian machine. Don't do this at home kids!
 */
bool target_words_bigendian(void);
bool target_words_bigendian(void)
{
#if defined(TARGET_WORDS_BIGENDIAN)
    return true;
#else
    return false;
#endif
}
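
/*
 * Illustrative sketch of the kind of byte-order fix-up this helper enables:
 * pick the guest's endianness when interpreting a 32-bit value.  The
 * wrapper is hypothetical; the real users live in the virtio code.
 * Kept under #if 0 so it is not built.
 */
#if 0
static uint32_t example_guest_to_host_u32(uint32_t guest_val)
{
    return target_words_bigendian() ? be32_to_cpu(guest_val)
                                    : le32_to_cpu(guest_val);
}
#endif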

#ifndef CONFIG_USER_ONLY
bool cpu_physical_memory_is_io(hwaddr phys_addr)
{
    MemoryRegion *mr;
    hwaddr l = 1;
    bool res;

    rcu_read_lock();
    mr = address_space_translate(&address_space_memory,
                                 phys_addr, &phys_addr, &l, false);

    res = !(memory_region_is_ram(mr) || memory_region_is_romd(mr));
    rcu_read_unlock();
    return res;
}
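
/*
 * Illustrative sketch of a hypothetical caller that refuses to touch
 * MMIO-backed physical addresses.  Kept under #if 0 so it is not built.
 */
#if 0
static bool example_addr_is_plain_ram(hwaddr pa)
{
    return !cpu_physical_memory_is_io(pa);
}
#endif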

int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        ret = func(block->idstr, block->host, block->offset,
                   block->used_length, opaque);
        if (ret) {
            break;
        }
    }
    rcu_read_unlock();
    return ret;
}
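
/*
 * Illustrative sketch of a RAMBlockIterFunc callback: add up the used size
 * of every RAM block.  The callback name and the use of a uint64_t
 * accumulator are hypothetical.  Kept under #if 0 so it is not built.
 */
#if 0
static int example_count_ram(const char *block_name, void *host_addr,
                             ram_addr_t offset, ram_addr_t length,
                             void *opaque)
{
    uint64_t *total = opaque;

    *total += length;
    return 0;           /* keep iterating; a non-zero return stops the walk */
}

/* usage: uint64_t total = 0; qemu_ram_foreach_block(example_count_ram, &total); */
#endif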

/*
 * Unmap pages of memory from start to start+length such that
 * they a) read as 0, b) trigger whatever fault mechanism
 * the OS provides for postcopy.
 * The pages must be unmapped by the end of the function.
 * Returns: 0 on success, non-zero on failure.
 */
int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
{
    int ret = -1;

    uint8_t *host_startaddr = rb->host + start;

    if ((uintptr_t)host_startaddr & (rb->page_size - 1)) {
        error_report("ram_block_discard_range: Unaligned start address: %p",
                     host_startaddr);
        goto err;
    }

    if ((start + length) <= rb->used_length) {
        uint8_t *host_endaddr = host_startaddr + length;
        if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
            error_report("ram_block_discard_range: Unaligned end address: %p",
                         host_endaddr);
            goto err;
        }

        errno = ENOTSUP; /* If we are missing MADVISE etc */

        if (rb->page_size == qemu_host_page_size) {
#if defined(CONFIG_MADVISE)
            /* Note: We need the madvise MADV_DONTNEED behaviour of definitely
             * freeing the page.
             */
            ret = madvise(host_startaddr, length, MADV_DONTNEED);
#endif
        } else {
            /* Huge page case  - unfortunately it can't do DONTNEED, but
             * it can do the equivalent by FALLOC_FL_PUNCH_HOLE in the
             * huge page file.
             */
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
            ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                            start, length);
#endif
        }
        if (ret) {
            ret = -errno;
            error_report("ram_block_discard_range: Failed to discard range "
                         "%s:%" PRIx64 " +%zx (%d)",
                         rb->idstr, start, length, ret);
        }
    } else {
        error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
                     "/%zx/" RAM_ADDR_FMT")",
                     rb->idstr, start, length, rb->used_length);
    }

err:
    return ret;
}
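
/*
 * Illustrative sketch of a postcopy-style caller dropping one backing page
 * of guest RAM.  The helper is hypothetical; start and length must be
 * aligned to the block's page size, as the checks above enforce.
 * Kept under #if 0 so it is not built.
 */
#if 0
static void example_drop_one_page(RAMBlock *rb, uint64_t offset)
{
    if (ram_block_discard_range(rb, offset, rb->page_size)) {
        error_report("discard failed at offset 0x%" PRIx64, offset);
    }
}
#endif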

#endif