/*
 *  Virtual page mapping
 *
 *  Copyright (c) 2003 Fabrice Bellard
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
#include "qemu/osdep.h"
#include "qapi/error.h"
#ifndef _WIN32
#endif

#include "qemu/cutils.h"
#include "cpu.h"
#include "exec/exec-all.h"
#include "tcg.h"
#include "hw/qdev-core.h"
#if !defined(CONFIG_USER_ONLY)
#include "hw/boards.h"
#include "hw/xen/xen.h"
#endif
#include "sysemu/kvm.h"
#include "sysemu/sysemu.h"
#include "qemu/timer.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#if defined(CONFIG_USER_ONLY)
#include "qemu.h"
#else /* !CONFIG_USER_ONLY */
#include "hw/hw.h"
#include "exec/memory.h"
#include "exec/ioport.h"
#include "sysemu/dma.h"
#include "exec/address-spaces.h"
#include "sysemu/xen-mapcache.h"
#include "trace.h"
#endif
#include "exec/cpu-all.h"
#include "qemu/rcu_queue.h"
#include "qemu/main-loop.h"
#include "translate-all.h"
#include "sysemu/replay.h"

#include "exec/memory-internal.h"
#include "exec/ram_addr.h"
#include "exec/log.h"

#include "migration/vmstate.h"

#include "qemu/range.h"
#ifndef _WIN32
#include "qemu/mmap-alloc.h"
#endif

//#define DEBUG_SUBPAGE

#if !defined(CONFIG_USER_ONLY)
/* ram_list is read under rcu_read_lock()/rcu_read_unlock().  Writes
 * are protected by the ramlist lock.
 */
RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };

static MemoryRegion *system_memory;
static MemoryRegion *system_io;

AddressSpace address_space_io;
AddressSpace address_space_memory;

MemoryRegion io_mem_rom, io_mem_notdirty;
static MemoryRegion io_mem_unassigned;

/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
#define RAM_PREALLOC   (1 << 0)

/* RAM is mmap-ed with MAP_SHARED */
#define RAM_SHARED     (1 << 1)

/* Only a portion of RAM (used_length) is actually used, and migrated.
 * This used_length size can change across reboots.
 */
#define RAM_RESIZEABLE (1 << 2)

#endif

#ifdef TARGET_PAGE_BITS_VARY
int target_page_bits;
bool target_page_bits_decided;
#endif

struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
/* current CPU in the current thread. It is only valid inside
   cpu_exec() */
__thread CPUState *current_cpu;
/* 0 = Do not count executed instructions.
   1 = Precise instruction counting.
   2 = Adaptive rate instruction counting.  */
int use_icount;

bool set_preferred_target_page_bits(int bits)
{
    /* The target page size is the lowest common denominator for all
     * the CPUs in the system, so we can only make it smaller, never
     * larger. And we can't make it smaller once we've committed to
     * a particular size.
     */
#ifdef TARGET_PAGE_BITS_VARY
    assert(bits >= TARGET_PAGE_BITS_MIN);
    if (target_page_bits == 0 || target_page_bits > bits) {
        if (target_page_bits_decided) {
            return false;
        }
        target_page_bits = bits;
    }
#endif
    return true;
}

#if !defined(CONFIG_USER_ONLY)

static void finalize_target_page_bits(void)
{
#ifdef TARGET_PAGE_BITS_VARY
    if (target_page_bits == 0) {
        target_page_bits = TARGET_PAGE_BITS_MIN;
    }
    target_page_bits_decided = true;
#endif
}

typedef struct PhysPageEntry PhysPageEntry;

struct PhysPageEntry {
    /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
    uint32_t skip : 6;
     /* index into phys_sections (!skip) or phys_map_nodes (skip) */
    uint32_t ptr : 26;
};

#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)

/* Size of the L2 (and L3, etc) page tables.  */
#define ADDR_SPACE_BITS 64

#define P_L2_BITS 9
#define P_L2_SIZE (1 << P_L2_BITS)

#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
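/* Worked example (derived from the macros above): with a 4 KiB target page
 * (TARGET_PAGE_BITS == 12), P_L2_LEVELS is ((64 - 12 - 1) / 9) + 1 = 6, i.e.
 * the physical memory map is a radix tree of up to six levels, each level
 * indexed by 9 address bits and holding P_L2_SIZE == 512 PhysPageEntry slots.
 */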

typedef PhysPageEntry Node[P_L2_SIZE];

typedef struct PhysPageMap {
    struct rcu_head rcu;

    unsigned sections_nb;
    unsigned sections_nb_alloc;
    unsigned nodes_nb;
    unsigned nodes_nb_alloc;
    Node *nodes;
    MemoryRegionSection *sections;
} PhysPageMap;

struct AddressSpaceDispatch {
    struct rcu_head rcu;

    MemoryRegionSection *mru_section;
    /* This is a multi-level map on the physical address space.
     * The bottom level has pointers to MemoryRegionSections.
     */
    PhysPageEntry phys_map;
    PhysPageMap map;
    AddressSpace *as;
};

#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
typedef struct subpage_t {
    MemoryRegion iomem;
    AddressSpace *as;
    hwaddr base;
    uint16_t sub_section[];
} subpage_t;
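/* A subpage covers a single target page whose contents are split between
 * several MemoryRegionSections (for example when a section starts or ends in
 * the middle of a page): sub_section[] maps each byte offset within the page
 * (SUBPAGE_IDX) to an index into PhysPageMap.sections.  The PHYS_SECTION_*
 * constants below are the reserved section indexes for the special
 * unassigned, notdirty, ROM and watchpoint regions.
 */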

#define PHYS_SECTION_UNASSIGNED 0
#define PHYS_SECTION_NOTDIRTY 1
#define PHYS_SECTION_ROM 2
#define PHYS_SECTION_WATCH 3

static void io_mem_init(void);
static void memory_map_init(void);
static void tcg_commit(MemoryListener *listener);

static MemoryRegion io_mem_watch;

/**
 * CPUAddressSpace: all the information a CPU needs about an AddressSpace
 * @cpu: the CPU whose AddressSpace this is
 * @as: the AddressSpace itself
 * @memory_dispatch: its dispatch pointer (cached, RCU protected)
 * @tcg_as_listener: listener for tracking changes to the AddressSpace
 */
struct CPUAddressSpace {
    CPUState *cpu;
    AddressSpace *as;
    struct AddressSpaceDispatch *memory_dispatch;
    MemoryListener tcg_as_listener;
};

#endif

#if !defined(CONFIG_USER_ONLY)

static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
{
    static unsigned alloc_hint = 16;
    if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
        map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, alloc_hint);
        map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
        map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
        alloc_hint = map->nodes_nb_alloc;
    }
}

static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
{
    unsigned i;
    uint32_t ret;
    PhysPageEntry e;
    PhysPageEntry *p;

    ret = map->nodes_nb++;
    p = map->nodes[ret];
    assert(ret != PHYS_MAP_NODE_NIL);
    assert(ret != map->nodes_nb_alloc);

    e.skip = leaf ? 0 : 1;
    e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
    for (i = 0; i < P_L2_SIZE; ++i) {
        memcpy(&p[i], &e, sizeof(e));
    }
    return ret;
}

static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
                                hwaddr *index, hwaddr *nb, uint16_t leaf,
                                int level)
{
    PhysPageEntry *p;
    hwaddr step = (hwaddr)1 << (level * P_L2_BITS);

    if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
        lp->ptr = phys_map_node_alloc(map, level == 0);
    }
    p = map->nodes[lp->ptr];
    lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];

    while (*nb && lp < &p[P_L2_SIZE]) {
        if ((*index & (step - 1)) == 0 && *nb >= step) {
            lp->skip = 0;
            lp->ptr = leaf;
            *index += step;
            *nb -= step;
        } else {
            phys_page_set_level(map, lp, index, nb, leaf, level - 1);
        }
        ++lp;
    }
}

static void phys_page_set(AddressSpaceDispatch *d,
                          hwaddr index, hwaddr nb,
                          uint16_t leaf)
{
    /* Wildly overreserve - it doesn't matter much. */
    phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);

    phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
}

/* Compact a non leaf page entry. Simply detect that the entry has a single child,
 * and update our entry so we can skip it and go directly to the destination.
 */
static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
{
    unsigned valid_ptr = P_L2_SIZE;
    int valid = 0;
    PhysPageEntry *p;
    int i;

    if (lp->ptr == PHYS_MAP_NODE_NIL) {
        return;
    }

    p = nodes[lp->ptr];
    for (i = 0; i < P_L2_SIZE; i++) {
        if (p[i].ptr == PHYS_MAP_NODE_NIL) {
            continue;
        }

        valid_ptr = i;
        valid++;
        if (p[i].skip) {
            phys_page_compact(&p[i], nodes);
        }
    }

    /* We can only compress if there's only one child. */
    if (valid != 1) {
        return;
    }

    assert(valid_ptr < P_L2_SIZE);

    /* Don't compress if it won't fit in the # of bits we have. */
    if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
        return;
    }

    lp->ptr = p[valid_ptr].ptr;
    if (!p[valid_ptr].skip) {
        /* If our only child is a leaf, make this a leaf. */
        /* By design, we should have made this node a leaf to begin with so we
         * should never reach here.
         * But since it's so simple to handle this, let's do it just in case we
         * change this rule.
         */
        lp->skip = 0;
    } else {
        lp->skip += p[valid_ptr].skip;
    }
}

static void phys_page_compact_all(AddressSpaceDispatch *d, int nodes_nb)
{
    if (d->phys_map.skip) {
        phys_page_compact(&d->phys_map, d->map.nodes);
    }
}

static inline bool section_covers_addr(const MemoryRegionSection *section,
                                       hwaddr addr)
{
    /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
     * the section must cover the entire address space.
     */
    return int128_gethi(section->size) ||
           range_covers_byte(section->offset_within_address_space,
                             int128_getlo(section->size), addr);
}

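/* Walk the radix tree for addr.  A 'skip' value greater than 1 means that a
 * chain of single-child nodes below this entry was collapsed by
 * phys_page_compact(), so the lookup below can drop several levels at once.
 */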
static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr addr,
                                           Node *nodes, MemoryRegionSection *sections)
{
    PhysPageEntry *p;
    hwaddr index = addr >> TARGET_PAGE_BITS;
    int i;

    for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
        if (lp.ptr == PHYS_MAP_NODE_NIL) {
            return &sections[PHYS_SECTION_UNASSIGNED];
        }
        p = nodes[lp.ptr];
        lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
    }

    if (section_covers_addr(&sections[lp.ptr], addr)) {
        return &sections[lp.ptr];
    } else {
        return &sections[PHYS_SECTION_UNASSIGNED];
    }
}

bool memory_region_is_unassigned(MemoryRegion *mr)
{
    return mr != &io_mem_rom && mr != &io_mem_notdirty && !mr->rom_device
        && mr != &io_mem_watch;
}

/* Called from RCU critical section */
static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
                                                        hwaddr addr,
                                                        bool resolve_subpage)
{
    MemoryRegionSection *section = atomic_read(&d->mru_section);
    subpage_t *subpage;
    bool update;

    if (section && section != &d->map.sections[PHYS_SECTION_UNASSIGNED] &&
        section_covers_addr(section, addr)) {
        update = false;
    } else {
        section = phys_page_find(d->phys_map, addr, d->map.nodes,
                                 d->map.sections);
        update = true;
    }
    if (resolve_subpage && section->mr->subpage) {
        subpage = container_of(section->mr, subpage_t, iomem);
        section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
    }
    if (update) {
        atomic_set(&d->mru_section, section);
    }
    return section;
}

/* Called from RCU critical section */
static MemoryRegionSection *
address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
                                 hwaddr *plen, bool resolve_subpage)
{
    MemoryRegionSection *section;
    MemoryRegion *mr;
    Int128 diff;

    section = address_space_lookup_region(d, addr, resolve_subpage);
    /* Compute offset within MemoryRegionSection */
    addr -= section->offset_within_address_space;

    /* Compute offset within MemoryRegion */
    *xlat = addr + section->offset_within_region;

    mr = section->mr;

    /* MMIO registers can be expected to perform full-width accesses based only
     * on their address, without considering adjacent registers that could
     * decode to completely different MemoryRegions.  When such registers
     * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
     * regions overlap wildly.  For this reason we cannot clamp the accesses
     * here.
     *
     * If the length is small (as is the case for address_space_ldl/stl),
     * everything works fine.  If the incoming length is large, however,
     * the caller really has to do the clamping through memory_access_size.
     */
    if (memory_region_is_ram(mr)) {
        diff = int128_sub(section->size, int128_make64(addr));
        *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
    }
    return section;
}

/* Called from RCU critical section */
IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
                                            bool is_write)
{
    IOMMUTLBEntry iotlb = {0};
    MemoryRegionSection *section;
    MemoryRegion *mr;

    for (;;) {
        AddressSpaceDispatch *d = atomic_rcu_read(&as->dispatch);
        section = address_space_lookup_region(d, addr, false);
        addr = addr - section->offset_within_address_space
               + section->offset_within_region;
        mr = section->mr;

        if (!mr->iommu_ops) {
            break;
        }

        iotlb = mr->iommu_ops->translate(mr, addr, is_write);
        if (!(iotlb.perm & (1 << is_write))) {
            iotlb.target_as = NULL;
            break;
        }

        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
                | (addr & iotlb.addr_mask));
        as = iotlb.target_as;
    }

    return iotlb;
}

/* Called from RCU critical section */
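/* Translate addr within as, following IOMMU regions when present: each
 * iteration translates through one IOMMU, clamps *plen to the translated
 * page, and continues in iotlb.target_as until a non-IOMMU region is
 * reached.  On a permission mismatch the unassigned region is returned.
 */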
MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr,
                                      hwaddr *xlat, hwaddr *plen,
                                      bool is_write)
{
    IOMMUTLBEntry iotlb;
    MemoryRegionSection *section;
    MemoryRegion *mr;

    for (;;) {
        AddressSpaceDispatch *d = atomic_rcu_read(&as->dispatch);
        section = address_space_translate_internal(d, addr, &addr, plen, true);
        mr = section->mr;

        if (!mr->iommu_ops) {
            break;
        }

        iotlb = mr->iommu_ops->translate(mr, addr, is_write);
        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
                | (addr & iotlb.addr_mask));
        *plen = MIN(*plen, (addr | iotlb.addr_mask) - addr + 1);
        if (!(iotlb.perm & (1 << is_write))) {
            mr = &io_mem_unassigned;
            break;
        }

        as = iotlb.target_as;
    }

    if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
        hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
        *plen = MIN(page, *plen);
    }

    *xlat = addr;
    return mr;
}

/* Called from RCU critical section */
MemoryRegionSection *
address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
                                  hwaddr *xlat, hwaddr *plen)
{
    MemoryRegionSection *section;
    AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);

    section = address_space_translate_internal(d, addr, xlat, plen, false);

    assert(!section->mr->iommu_ops);
    return section;
}
#endif

#if !defined(CONFIG_USER_ONLY)

static int cpu_common_post_load(void *opaque, int version_id)
{
    CPUState *cpu = opaque;

    /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
       version_id is increased. */
    cpu->interrupt_request &= ~0x01;
    tlb_flush(cpu);

    return 0;
}

static int cpu_common_pre_load(void *opaque)
{
    CPUState *cpu = opaque;

    cpu->exception_index = -1;

    return 0;
}

static bool cpu_common_exception_index_needed(void *opaque)
{
    CPUState *cpu = opaque;

    return tcg_enabled() && cpu->exception_index != -1;
}

static const VMStateDescription vmstate_cpu_common_exception_index = {
    .name = "cpu_common/exception_index",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = cpu_common_exception_index_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT32(exception_index, CPUState),
        VMSTATE_END_OF_LIST()
    }
};

static bool cpu_common_crash_occurred_needed(void *opaque)
{
    CPUState *cpu = opaque;

    return cpu->crash_occurred;
}

static const VMStateDescription vmstate_cpu_common_crash_occurred = {
    .name = "cpu_common/crash_occurred",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = cpu_common_crash_occurred_needed,
    .fields = (VMStateField[]) {
        VMSTATE_BOOL(crash_occurred, CPUState),
        VMSTATE_END_OF_LIST()
    }
};

const VMStateDescription vmstate_cpu_common = {
    .name = "cpu_common",
    .version_id = 1,
    .minimum_version_id = 1,
    .pre_load = cpu_common_pre_load,
    .post_load = cpu_common_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(halted, CPUState),
        VMSTATE_UINT32(interrupt_request, CPUState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &vmstate_cpu_common_exception_index,
        &vmstate_cpu_common_crash_occurred,
        NULL
    }
};

#endif

CPUState *qemu_get_cpu(int index)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (cpu->cpu_index == index) {
            return cpu;
        }
    }

    return NULL;
}

#if !defined(CONFIG_USER_ONLY)
void cpu_address_space_init(CPUState *cpu, AddressSpace *as, int asidx)
{
    CPUAddressSpace *newas;

    /* Target code should have set num_ases before calling us */
    assert(asidx < cpu->num_ases);

    if (asidx == 0) {
        /* address space 0 gets the convenience alias */
        cpu->as = as;
    }

    /* KVM cannot currently support multiple address spaces. */
    assert(asidx == 0 || !kvm_enabled());

    if (!cpu->cpu_ases) {
        cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
    }

    newas = &cpu->cpu_ases[asidx];
    newas->cpu = cpu;
    newas->as = as;
    if (tcg_enabled()) {
        newas->tcg_as_listener.commit = tcg_commit;
        memory_listener_register(&newas->tcg_as_listener, as);
    }
}

AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
{
    /* Return the AddressSpace corresponding to the specified index */
    return cpu->cpu_ases[asidx].as;
}
#endif

void cpu_exec_unrealizefn(CPUState *cpu)
{
    CPUClass *cc = CPU_GET_CLASS(cpu);

    cpu_list_remove(cpu);

    if (cc->vmsd != NULL) {
        vmstate_unregister(NULL, cc->vmsd, cpu);
    }
    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
        vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
    }
}

void cpu_exec_initfn(CPUState *cpu)
{
    cpu->as = NULL;
    cpu->num_ases = 0;

#ifndef CONFIG_USER_ONLY
    cpu->thread_id = qemu_get_thread_id();

    /* This is a softmmu CPU object, so create a property for it
     * so users can wire up its memory. (This can't go in qom/cpu.c
     * because that file is compiled only once for both user-mode
     * and system builds.) The default if no link is set up is to use
     * the system address space.
     */
    object_property_add_link(OBJECT(cpu), "memory", TYPE_MEMORY_REGION,
                             (Object **)&cpu->memory,
                             qdev_prop_allow_set_link_before_realize,
                             OBJ_PROP_LINK_UNREF_ON_RELEASE,
                             &error_abort);
    cpu->memory = system_memory;
    object_ref(OBJECT(cpu->memory));
#endif
}

void cpu_exec_realizefn(CPUState *cpu, Error **errp)
{
    CPUClass *cc ATTRIBUTE_UNUSED = CPU_GET_CLASS(cpu);

    cpu_list_add(cpu);

#ifndef CONFIG_USER_ONLY
    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
        vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
    }
    if (cc->vmsd != NULL) {
        vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
    }
#endif
}

static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
{
    /* Flush the whole TB as this will not have race conditions
     * even if we don't have proper locking yet.
     * Ideally we would just invalidate the TBs for the
     * specified PC.
     */
    tb_flush(cpu);
}

#if defined(CONFIG_USER_ONLY)
void cpu_watchpoint_remove_all(CPUState *cpu, int mask)

{
}

int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
                          int flags)
{
    return -ENOSYS;
}

void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
{
}

int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
                          int flags, CPUWatchpoint **watchpoint)
{
    return -ENOSYS;
}
#else
/* Add a watchpoint.  */
int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
                          int flags, CPUWatchpoint **watchpoint)
{
    CPUWatchpoint *wp;

    /* forbid ranges which are empty or run off the end of the address space */
    if (len == 0 || (addr + len - 1) < addr) {
        error_report("tried to set invalid watchpoint at %"
                     VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
        return -EINVAL;
    }
    wp = g_malloc(sizeof(*wp));

    wp->vaddr = addr;
    wp->len = len;
    wp->flags = flags;

    /* keep all GDB-injected watchpoints in front */
    if (flags & BP_GDB) {
        QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
    } else {
        QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
    }

    tlb_flush_page(cpu, addr);

    if (watchpoint)
        *watchpoint = wp;
    return 0;
}

/* Remove a specific watchpoint.  */
int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
                          int flags)
{
    CPUWatchpoint *wp;

    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
        if (addr == wp->vaddr && len == wp->len
                && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
            cpu_watchpoint_remove_by_ref(cpu, wp);
            return 0;
        }
    }
    return -ENOENT;
}

/* Remove a specific watchpoint by reference.  */
void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
{
    QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);

    tlb_flush_page(cpu, watchpoint->vaddr);

    g_free(watchpoint);
}

/* Remove all matching watchpoints.  */
void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
{
    CPUWatchpoint *wp, *next;

    QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
        if (wp->flags & mask) {
            cpu_watchpoint_remove_by_ref(cpu, wp);
        }
    }
}

/* Return true if this watchpoint address matches the specified
 * access (ie the address range covered by the watchpoint overlaps
 * partially or completely with the address range covered by the
 * access).
 */
static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
                                                  vaddr addr,
                                                  vaddr len)
{
    /* We know the lengths are non-zero, but a little caution is
     * required to avoid errors in the case where the range ends
     * exactly at the top of the address space and so addr + len
     * wraps round to zero.
     */
    vaddr wpend = wp->vaddr + wp->len - 1;
    vaddr addrend = addr + len - 1;

    return !(addr > wpend || wp->vaddr > addrend);
}

#endif

/* Add a breakpoint.  */
int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
                          CPUBreakpoint **breakpoint)
{
    CPUBreakpoint *bp;

    bp = g_malloc(sizeof(*bp));

    bp->pc = pc;
    bp->flags = flags;

    /* keep all GDB-injected breakpoints in front */
    if (flags & BP_GDB) {
        QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
    } else {
        QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
    }

    breakpoint_invalidate(cpu, pc);

    if (breakpoint) {
        *breakpoint = bp;
    }
    return 0;
}

/* Remove a specific breakpoint.  */
int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
{
    CPUBreakpoint *bp;

    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
        if (bp->pc == pc && bp->flags == flags) {
            cpu_breakpoint_remove_by_ref(cpu, bp);
            return 0;
        }
    }
    return -ENOENT;
}

/* Remove a specific breakpoint by reference.  */
void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint)
{
    QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);

    breakpoint_invalidate(cpu, breakpoint->pc);

    g_free(breakpoint);
}

/* Remove all matching breakpoints. */
void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
{
    CPUBreakpoint *bp, *next;

    QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
        if (bp->flags & mask) {
            cpu_breakpoint_remove_by_ref(cpu, bp);
        }
    }
}

/* enable or disable single step mode. EXCP_DEBUG is returned by the
   CPU loop after each instruction */
void cpu_single_step(CPUState *cpu, int enabled)
{
    if (cpu->singlestep_enabled != enabled) {
        cpu->singlestep_enabled = enabled;
        if (kvm_enabled()) {
            kvm_update_guest_debug(cpu, 0);
        } else {
            /* must flush all the translated code to avoid inconsistencies */
            /* XXX: only flush what is necessary */
            tb_flush(cpu);
        }
    }
}

void cpu_abort(CPUState *cpu, const char *fmt, ...)
{
    va_list ap;
    va_list ap2;

    va_start(ap, fmt);
    va_copy(ap2, ap);
    fprintf(stderr, "qemu: fatal: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU | CPU_DUMP_CCOP);
    if (qemu_log_separate()) {
        qemu_log_lock();
        qemu_log("qemu: fatal: ");
        qemu_log_vprintf(fmt, ap2);
        qemu_log("\n");
        log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
        qemu_log_flush();
        qemu_log_unlock();
        qemu_log_close();
    }
    va_end(ap2);
    va_end(ap);
    replay_finish();
#if defined(CONFIG_USER_ONLY)
    {
        struct sigaction act;
        sigfillset(&act.sa_mask);
        act.sa_handler = SIG_DFL;
        sigaction(SIGABRT, &act, NULL);
    }
#endif
    abort();
}

#if !defined(CONFIG_USER_ONLY)
/* Called from RCU critical section */
static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
{
    RAMBlock *block;

    block = atomic_rcu_read(&ram_list.mru_block);
    if (block && addr - block->offset < block->max_length) {
        return block;
    }
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        if (addr - block->offset < block->max_length) {
            goto found;
        }
    }

    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
    abort();

found:
    /* It is safe to write mru_block outside the iothread lock.  This
     * is what happens:
     *
     *     mru_block = xxx
     *     rcu_read_unlock()
     *                                        xxx removed from list
     *                  rcu_read_lock()
     *                  read mru_block
     *                                        mru_block = NULL;
     *                                        call_rcu(reclaim_ramblock, xxx);
     *                  rcu_read_unlock()
     *
     * atomic_rcu_set is not needed here.  The block was already published
     * when it was placed into the list.  Here we're just making an extra
     * copy of the pointer.
     */
    ram_list.mru_block = block;
    return block;
}

static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
{
    CPUState *cpu;
    ram_addr_t start1;
    RAMBlock *block;
    ram_addr_t end;

    end = TARGET_PAGE_ALIGN(start + length);
    start &= TARGET_PAGE_MASK;

    rcu_read_lock();
    block = qemu_get_ram_block(start);
    assert(block == qemu_get_ram_block(end - 1));
    start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
    CPU_FOREACH(cpu) {
        tlb_reset_dirty(cpu, start1, length);
    }
    rcu_read_unlock();
}

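/* The dirty bitmap of each client is stored in DIRTY_MEMORY_BLOCK_SIZE-page
 * chunks that are looked up under RCU; the loop below therefore walks the
 * requested range chunk by chunk, clearing and accumulating dirty bits
 * atomically.
 */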
/* Note: start and end must be within the same ram block.  */
bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
                                              ram_addr_t length,
                                              unsigned client)
{
    DirtyMemoryBlocks *blocks;
    unsigned long end, page;
    bool dirty = false;

    if (length == 0) {
        return false;
    }

    end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
    page = start >> TARGET_PAGE_BITS;

    rcu_read_lock();

    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);

    while (page < end) {
        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);

        dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
                                              offset, num);
        page += num;
    }

    rcu_read_unlock();

    if (dirty && tcg_enabled()) {
        tlb_reset_dirty_range_all(start, length);
    }

    return dirty;
}

/* Called from RCU critical section */
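/* Compute the value stored in a TLB entry's IOTLB field: for RAM it is the
 * ram_addr of the page (plus xlat), OR'ed with a special section index when
 * writes must be trapped (NOTDIRTY for dirty tracking, ROM for read-only
 * memory); for MMIO it is the section's index in the dispatch map plus the
 * offset.  Pages covered by a watchpoint are redirected to PHYS_SECTION_WATCH.
 */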
hwaddr memory_region_section_get_iotlb(CPUState *cpu,
                                       MemoryRegionSection *section,
                                       target_ulong vaddr,
                                       hwaddr paddr, hwaddr xlat,
                                       int prot,
                                       target_ulong *address)
{
    hwaddr iotlb;
    CPUWatchpoint *wp;

    if (memory_region_is_ram(section->mr)) {
        /* Normal RAM.  */
        iotlb = memory_region_get_ram_addr(section->mr) + xlat;
        if (!section->readonly) {
            iotlb |= PHYS_SECTION_NOTDIRTY;
        } else {
            iotlb |= PHYS_SECTION_ROM;
        }
    } else {
        AddressSpaceDispatch *d;

        d = atomic_rcu_read(&section->address_space->dispatch);
        iotlb = section - d->map.sections;
        iotlb += xlat;
    }

    /* Make accesses to pages with watchpoints go via the
       watchpoint trap routines.  */
    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
        if (cpu_watchpoint_address_matches(wp, vaddr, TARGET_PAGE_SIZE)) {
            /* Avoid trapping reads of pages with a write breakpoint. */
            if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
                iotlb = PHYS_SECTION_WATCH + paddr;
                *address |= TLB_MMIO;
                break;
            }
        }
    }

    return iotlb;
}
#endif /* defined(CONFIG_USER_ONLY) */

#if !defined(CONFIG_USER_ONLY)

static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
                             uint16_t section);
static subpage_t *subpage_init(AddressSpace *as, hwaddr base);

static void *(*phys_mem_alloc)(size_t size, uint64_t *align) =
                               qemu_anon_ram_alloc;

/*
 * Set a custom physical guest memory allocator.
 * Accelerators with unusual needs may need this.  Hopefully, we can
 * get rid of it eventually.
 */
void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align))
{
    phys_mem_alloc = alloc;
}

static uint16_t phys_section_add(PhysPageMap *map,
                                 MemoryRegionSection *section)
{
    /* The physical section number is ORed with a page-aligned
     * pointer to produce the iotlb entries.  Thus it should
     * never overflow into the page-aligned value.
     */
    assert(map->sections_nb < TARGET_PAGE_SIZE);

    if (map->sections_nb == map->sections_nb_alloc) {
        map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
        map->sections = g_renew(MemoryRegionSection, map->sections,
                                map->sections_nb_alloc);
    }
    map->sections[map->sections_nb] = *section;
    memory_region_ref(section->mr);
    return map->sections_nb++;
}

static void phys_section_destroy(MemoryRegion *mr)
{
    bool have_sub_page = mr->subpage;

    memory_region_unref(mr);

    if (have_sub_page) {
        subpage_t *subpage = container_of(mr, subpage_t, iomem);
        object_unref(OBJECT(&subpage->iomem));
        g_free(subpage);
    }
}

static void phys_sections_free(PhysPageMap *map)
{
    while (map->sections_nb > 0) {
        MemoryRegionSection *section = &map->sections[--map->sections_nb];
        phys_section_destroy(section->mr);
    }
    g_free(map->sections);
    g_free(map->nodes);
}

static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *section)
{
    subpage_t *subpage;
    hwaddr base = section->offset_within_address_space
        & TARGET_PAGE_MASK;
    MemoryRegionSection *existing = phys_page_find(d->phys_map, base,
                                                   d->map.nodes, d->map.sections);
    MemoryRegionSection subsection = {
        .offset_within_address_space = base,
        .size = int128_make64(TARGET_PAGE_SIZE),
    };
    hwaddr start, end;

    assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);

    if (!(existing->mr->subpage)) {
        subpage = subpage_init(d->as, base);
        subsection.address_space = d->as;
        subsection.mr = &subpage->iomem;
        phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
                      phys_section_add(&d->map, &subsection));
    } else {
        subpage = container_of(existing->mr, subpage_t, iomem);
    }
    start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
    end = start + int128_get64(section->size) - 1;
    subpage_register(subpage, start, end,
                     phys_section_add(&d->map, section));
}


static void register_multipage(AddressSpaceDispatch *d,
                               MemoryRegionSection *section)
{
    hwaddr start_addr = section->offset_within_address_space;
    uint16_t section_index = phys_section_add(&d->map, section);
    uint64_t num_pages = int128_get64(int128_rshift(section->size,
                                                    TARGET_PAGE_BITS));

    assert(num_pages);
    phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
}

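/* Add a MemoryRegionSection to the dispatch map, splitting it on target page
 * boundaries: an unaligned head or tail, or any piece smaller than one page,
 * is registered as a subpage, while the page-aligned middle is registered as
 * whole pages via register_multipage().
 */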
static void mem_add(MemoryListener *listener, MemoryRegionSection *section)
{
    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
    AddressSpaceDispatch *d = as->next_dispatch;
    MemoryRegionSection now = *section, remain = *section;
    Int128 page_size = int128_make64(TARGET_PAGE_SIZE);

    if (now.offset_within_address_space & ~TARGET_PAGE_MASK) {
        uint64_t left = TARGET_PAGE_ALIGN(now.offset_within_address_space)
                       - now.offset_within_address_space;

        now.size = int128_min(int128_make64(left), now.size);
        register_subpage(d, &now);
    } else {
        now.size = int128_zero();
    }
    while (int128_ne(remain.size, now.size)) {
        remain.size = int128_sub(remain.size, now.size);
        remain.offset_within_address_space += int128_get64(now.size);
        remain.offset_within_region += int128_get64(now.size);
        now = remain;
        if (int128_lt(remain.size, page_size)) {
            register_subpage(d, &now);
        } else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
            now.size = page_size;
            register_subpage(d, &now);
        } else {
            now.size = int128_and(now.size, int128_neg(page_size));
            register_multipage(d, &now);
        }
    }
}

void qemu_flush_coalesced_mmio_buffer(void)
{
    if (kvm_enabled())
        kvm_flush_coalesced_mmio_buffer();
}

void qemu_mutex_lock_ramlist(void)
{
    qemu_mutex_lock(&ram_list.mutex);
}

void qemu_mutex_unlock_ramlist(void)
{
    qemu_mutex_unlock(&ram_list.mutex);
}

#ifdef __linux__
static int64_t get_file_size(int fd)
{
    int64_t size = lseek(fd, 0, SEEK_END);
    if (size < 0) {
        return -errno;
    }
    return size;
}

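/* Map the backing store for a RAM block from a file.  @path may name an
 * existing file (used as is), a file that does not exist yet (created), or a
 * directory (a unique temporary file is created inside it).  The block's page
 * size is taken from the file descriptor, so the requested size is rounded up
 * to that page size (e.g. the huge page size for a hugetlbfs backend).
 */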
static void *file_ram_alloc(RAMBlock *block,
                            ram_addr_t memory,
                            const char *path,
                            Error **errp)
{
    bool unlink_on_error = false;
    char *filename;
    char *sanitized_name;
    char *c;
    void *area = MAP_FAILED;
    int fd = -1;
    int64_t file_size;

    if (kvm_enabled() && !kvm_has_sync_mmu()) {
        error_setg(errp,
                   "host lacks kvm mmu notifiers, -mem-path unsupported");
        return NULL;
    }

    for (;;) {
        fd = open(path, O_RDWR);
        if (fd >= 0) {
            /* @path names an existing file, use it */
            break;
        }
        if (errno == ENOENT) {
            /* @path names a file that doesn't exist, create it */
            fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
            if (fd >= 0) {
                unlink_on_error = true;
                break;
            }
        } else if (errno == EISDIR) {
            /* @path names a directory, create a file there */
            /* Make name safe to use with mkstemp by replacing '/' with '_'. */
            sanitized_name = g_strdup(memory_region_name(block->mr));
            for (c = sanitized_name; *c != '\0'; c++) {
                if (*c == '/') {
                    *c = '_';
                }
            }

            filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
                                       sanitized_name);
            g_free(sanitized_name);

            fd = mkstemp(filename);
            if (fd >= 0) {
                unlink(filename);
                g_free(filename);
                break;
            }
            g_free(filename);
        }
        if (errno != EEXIST && errno != EINTR) {
            error_setg_errno(errp, errno,
                             "can't open backing store %s for guest RAM",
                             path);
            goto error;
        }
        /*
         * Try again on EINTR and EEXIST.  The latter happens when
         * something else creates the file between our two open().
         */
    }

    block->page_size = qemu_fd_getpagesize(fd);
    block->mr->align = block->page_size;
#if defined(__s390x__)
    if (kvm_enabled()) {
        block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
    }
#endif

    file_size = get_file_size(fd);

    if (memory < block->page_size) {
        error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
                   "or larger than page size 0x%zx",
                   memory, block->page_size);
        goto error;
    }

    if (file_size > 0 && file_size < memory) {
        error_setg(errp, "backing store %s size 0x%" PRIx64
                   " does not match 'size' option 0x" RAM_ADDR_FMT,
                   path, file_size, memory);
        goto error;
    }

    memory = ROUND_UP(memory, block->page_size);

    /*
     * ftruncate is not supported by hugetlbfs in older
     * hosts, so don't bother bailing out on errors.
     * If anything goes wrong with it under other filesystems,
     * mmap will fail.
     *
     * Do not truncate the non-empty backend file to avoid corrupting
     * the existing data in the file. Disabling shrinking is not
     * enough. For example, the current vNVDIMM implementation stores
     * the guest NVDIMM labels at the end of the backend file. If the
     * backend file is later extended, QEMU will not be able to find
     * those labels. Therefore, extending the non-empty backend file
     * is disabled as well.
     */
    if (!file_size && ftruncate(fd, memory)) {
        perror("ftruncate");
    }

    area = qemu_ram_mmap(fd, memory, block->mr->align,
                         block->flags & RAM_SHARED);
    if (area == MAP_FAILED) {
        error_setg_errno(errp, errno,
                         "unable to map backing store for guest RAM");
        goto error;
    }

    if (mem_prealloc) {
        os_mem_prealloc(fd, area, memory, errp);
        if (errp && *errp) {
            goto error;
        }
    }

    block->fd = fd;
    return area;

error:
    if (area != MAP_FAILED) {
        qemu_ram_munmap(area, memory);
    }
    if (unlink_on_error) {
        unlink(path);
    }
    if (fd != -1) {
        close(fd);
    }
    return NULL;
}
#endif

/* Called with the ramlist lock held.  */
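/* Pick a guest-physical offset for a new block using a best-fit search: for
 * every existing block, measure the gap between its end and the start of the
 * next block, and keep the smallest gap that still fits 'size'.
 */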
static ram_addr_t find_ram_offset(ram_addr_t size)
{
    RAMBlock *block, *next_block;
    ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;

    assert(size != 0); /* it would hand out same offset multiple times */

    if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
        return 0;
    }

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        ram_addr_t end, next = RAM_ADDR_MAX;

        end = block->offset + block->max_length;

        QLIST_FOREACH_RCU(next_block, &ram_list.blocks, next) {
            if (next_block->offset >= end) {
                next = MIN(next, next_block->offset);
            }
        }
        if (next - end >= size && next - end < mingap) {
            offset = end;
            mingap = next - end;
        }
    }

    if (offset == RAM_ADDR_MAX) {
        fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
                (uint64_t)size);
        abort();
    }

    return offset;
}

ram_addr_t last_ram_offset(void)
{
    RAMBlock *block;
    ram_addr_t last = 0;

    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        last = MAX(last, block->offset + block->max_length);
    }
    rcu_read_unlock();
    return last;
}

static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
{
    int ret;

    /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
    if (!machine_dump_guest_core(current_machine)) {
        ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
        if (ret) {
            perror("qemu_madvise");
            fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
                            "but dump_guest_core=off specified\n");
        }
    }
}

const char *qemu_ram_get_idstr(RAMBlock *rb)
{
    return rb->idstr;
}

/* Called with iothread lock held.  */
void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
{
    RAMBlock *block;

    assert(new_block);
    assert(!new_block->idstr[0]);

    if (dev) {
        char *id = qdev_get_dev_path(dev);
        if (id) {
            snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
            g_free(id);
        }
    }
    pstrcat(new_block->idstr, sizeof(new_block->idstr), name);

    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        if (block != new_block &&
            !strcmp(block->idstr, new_block->idstr)) {
            fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
                    new_block->idstr);
            abort();
        }
    }
    rcu_read_unlock();
}

/* Called with iothread lock held.  */
void qemu_ram_unset_idstr(RAMBlock *block)
{
    /* FIXME: arch_init.c assumes that this is not called throughout
     * migration.  Ignore the problem since hot-unplug during migration
     * does not work anyway.
     */
    if (block) {
        memset(block->idstr, 0, sizeof(block->idstr));
    }
}

size_t qemu_ram_pagesize(RAMBlock *rb)
{
    return rb->page_size;
}

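/* Mark the range as mergeable so the host can deduplicate identical pages
 * (KSM on Linux), unless memory merging was disabled on the machine.
 */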
static int memory_try_enable_merging(void *addr, size_t len)
{
    if (!machine_mem_merge(current_machine)) {
        /* disabled by the user */
        return 0;
    }

    return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
}

/* Only legal before guest might have detected the memory size: e.g. on
 * incoming migration, or right after reset.
 *
 * As the memory core doesn't know how the memory is accessed, it is up to
 * the resize callback to update device state and/or add assertions to detect
 * misuse, if necessary.
 */
int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
1539 1540 1541
{
    assert(block);

1542
    newsize = HOST_PAGE_ALIGN(newsize);
1543

1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565
    if (block->used_length == newsize) {
        return 0;
    }

    if (!(block->flags & RAM_RESIZEABLE)) {
        error_setg_errno(errp, EINVAL,
                         "Length mismatch: %s: 0x" RAM_ADDR_FMT
                         " in != 0x" RAM_ADDR_FMT, block->idstr,
                         newsize, block->used_length);
        return -EINVAL;
    }

    if (block->max_length < newsize) {
        error_setg_errno(errp, EINVAL,
                         "Length too large: %s: 0x" RAM_ADDR_FMT
                         " > 0x" RAM_ADDR_FMT, block->idstr,
                         newsize, block->max_length);
        return -EINVAL;
    }

    cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
    block->used_length = newsize;
1566 1567
    cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
                                        DIRTY_CLIENTS_ALL);
1568 1569 1570 1571 1572 1573 1574
    memory_region_set_size(block->mr, newsize);
    if (block->resized) {
        block->resized(block->idstr, newsize, block->host);
    }
    return 0;
}

1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615
/* Called with ram_list.mutex held */
static void dirty_memory_extend(ram_addr_t old_ram_size,
                                ram_addr_t new_ram_size)
{
    ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
                                             DIRTY_MEMORY_BLOCK_SIZE);
    ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
                                             DIRTY_MEMORY_BLOCK_SIZE);
    int i;

    /* Only need to extend if block count increased */
    if (new_num_blocks <= old_num_blocks) {
        return;
    }

    for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
        DirtyMemoryBlocks *old_blocks;
        DirtyMemoryBlocks *new_blocks;
        int j;

        old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
        new_blocks = g_malloc(sizeof(*new_blocks) +
                              sizeof(new_blocks->blocks[0]) * new_num_blocks);

        if (old_num_blocks) {
            memcpy(new_blocks->blocks, old_blocks->blocks,
                   old_num_blocks * sizeof(old_blocks->blocks[0]));
        }

        for (j = old_num_blocks; j < new_num_blocks; j++) {
            new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
        }

        atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);

        if (old_blocks) {
            g_free_rcu(old_blocks, rcu);
        }
    }
}

static void ram_block_add(RAMBlock *new_block, Error **errp)
{
    RAMBlock *block;
    RAMBlock *last_block = NULL;
    ram_addr_t old_ram_size, new_ram_size;
    Error *err = NULL;

    old_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;

    qemu_mutex_lock_ramlist();
    new_block->offset = find_ram_offset(new_block->max_length);

    if (!new_block->host) {
        if (xen_enabled()) {
            xen_ram_alloc(new_block->offset, new_block->max_length,
                          new_block->mr, &err);
            if (err) {
                error_propagate(errp, err);
                qemu_mutex_unlock_ramlist();
                return;
            }
        } else {
            new_block->host = phys_mem_alloc(new_block->max_length,
                                             &new_block->mr->align);
            if (!new_block->host) {
                error_setg_errno(errp, errno,
                                 "cannot set up guest memory '%s'",
                                 memory_region_name(new_block->mr));
                qemu_mutex_unlock_ramlist();
                return;
            }
            memory_try_enable_merging(new_block->host, new_block->max_length);
        }
    }

    new_ram_size = MAX(old_ram_size,
              (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
    if (new_ram_size > old_ram_size) {
        migration_bitmap_extend(old_ram_size, new_ram_size);
        dirty_memory_extend(old_ram_size, new_ram_size);
    }
    /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
     * QLIST (which has an RCU-friendly variant) does not have insertion at
     * tail, so save the last element in last_block.
     */
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        last_block = block;
        if (block->max_length < new_block->max_length) {
            break;
        }
    }
    if (block) {
        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
    } else if (last_block) {
        QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
    } else { /* list is empty */
        QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
    }
    ram_list.mru_block = NULL;

    /* Write list before version */
    smp_wmb();
    ram_list.version++;
    qemu_mutex_unlock_ramlist();

    cpu_physical_memory_set_dirty_range(new_block->offset,
                                        new_block->used_length,
                                        DIRTY_CLIENTS_ALL);

    if (new_block->host) {
        qemu_ram_setup_dump(new_block->host, new_block->max_length);
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
        /* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
        ram_block_notify_add(new_block->host, new_block->max_length);
    }
}

#ifdef __linux__
RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
                                   bool share, const char *mem_path,
                                   Error **errp)
{
    RAMBlock *new_block;
    Error *local_err = NULL;

    if (xen_enabled()) {
        error_setg(errp, "-mem-path not supported with Xen");
        return NULL;
    }

    if (phys_mem_alloc != qemu_anon_ram_alloc) {
        /*
         * file_ram_alloc() needs to allocate just like
         * phys_mem_alloc, but we haven't bothered to provide
         * a hook there.
         */
        error_setg(errp,
                   "-mem-path not supported with this accelerator");
        return NULL;
    }

    size = HOST_PAGE_ALIGN(size);
    new_block = g_malloc0(sizeof(*new_block));
    new_block->mr = mr;
    new_block->used_length = size;
    new_block->max_length = size;
    new_block->flags = share ? RAM_SHARED : 0;
    new_block->host = file_ram_alloc(new_block, size,
                                     mem_path, errp);
    if (!new_block->host) {
        g_free(new_block);
        return NULL;
    }

    ram_block_add(new_block, &local_err);
    if (local_err) {
        g_free(new_block);
        error_propagate(errp, local_err);
        return NULL;
    }
    return new_block;
}
#endif

static
RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
                                  void (*resized)(const char*,
                                                  uint64_t length,
                                                  void *host),
                                  void *host, bool resizeable,
                                  MemoryRegion *mr, Error **errp)
{
    RAMBlock *new_block;
    Error *local_err = NULL;

    size = HOST_PAGE_ALIGN(size);
    max_size = HOST_PAGE_ALIGN(max_size);
    new_block = g_malloc0(sizeof(*new_block));
    new_block->mr = mr;
    new_block->resized = resized;
    new_block->used_length = size;
    new_block->max_length = max_size;
    assert(max_size >= size);
    new_block->fd = -1;
    new_block->page_size = getpagesize();
    new_block->host = host;
    if (host) {
        new_block->flags |= RAM_PREALLOC;
    }
    if (resizeable) {
        new_block->flags |= RAM_RESIZEABLE;
    }
    ram_block_add(new_block, &local_err);
    if (local_err) {
        g_free(new_block);
        error_propagate(errp, local_err);
        return NULL;
    }
    return new_block;
}

RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
                                   MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp);
}

RAMBlock *qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp);
}

RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
                                     void (*resized)(const char*,
                                                     uint64_t length,
                                                     void *host),
                                     MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp);
}
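
/*
 * Allocation sketch (illustrative only): most callers reach these helpers
 * through the memory_region_init_ram*() family rather than calling them
 * directly.  A direct call for a resizeable block would look roughly like
 * this; "mr", "resized_cb" and the sizes are hypothetical caller state.
 *
 *     Error *err = NULL;
 *     RAMBlock *rb = qemu_ram_alloc_resizeable(16 * 1024 * 1024,
 *                                              64 * 1024 * 1024,
 *                                              resized_cb, mr, &err);
 *     if (!rb) {
 *         error_report_err(err);
 *     }
 */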

static void reclaim_ramblock(RAMBlock *block)
{
    if (block->flags & RAM_PREALLOC) {
        ;
    } else if (xen_enabled()) {
        xen_invalidate_map_cache_entry(block->host);
#ifndef _WIN32
    } else if (block->fd >= 0) {
        qemu_ram_munmap(block->host, block->max_length);
        close(block->fd);
#endif
    } else {
        qemu_anon_ram_free(block->host, block->max_length);
    }
    g_free(block);
}

void qemu_ram_free(RAMBlock *block)
{
    if (!block) {
        return;
    }

    if (block->host) {
        ram_block_notify_remove(block->host, block->max_length);
    }

    qemu_mutex_lock_ramlist();
    QLIST_REMOVE_RCU(block, next);
    ram_list.mru_block = NULL;
    /* Write list before version */
    smp_wmb();
    ram_list.version++;
    call_rcu(block, reclaim_ramblock, rcu);
    qemu_mutex_unlock_ramlist();
}

#ifndef _WIN32
void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
{
    RAMBlock *block;
    ram_addr_t offset;
    int flags;
    void *area, *vaddr;

M
Mike Day 已提交
1843
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
H
Huang Ying 已提交
1844
        offset = addr - block->offset;
1845
        if (offset < block->max_length) {
1846
            vaddr = ramblock_ptr(block, offset);
1847
            if (block->flags & RAM_PREALLOC) {
H
Huang Ying 已提交
1848
                ;
1849 1850
            } else if (xen_enabled()) {
                abort();
H
Huang Ying 已提交
1851 1852
            } else {
                flags = MAP_FIXED;
1853
                if (block->fd >= 0) {
1854 1855
                    flags |= (block->flags & RAM_SHARED ?
                              MAP_SHARED : MAP_PRIVATE);
1856 1857
                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
                                flags, block->fd, offset);
H
Huang Ying 已提交
1858
                } else {
1859 1860 1861 1862 1863 1864 1865
                    /*
                     * Remap needs to match alloc.  Accelerators that
                     * set phys_mem_alloc never remap.  If they did,
                     * we'd need a remap hook here.
                     */
                    assert(phys_mem_alloc == qemu_anon_ram_alloc);

H
Huang Ying 已提交
1866 1867 1868 1869 1870
                    flags |= MAP_PRIVATE | MAP_ANONYMOUS;
                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
                                flags, -1, 0);
                }
                if (area != vaddr) {
1871 1872
                    fprintf(stderr, "Could not remap addr: "
                            RAM_ADDR_FMT "@" RAM_ADDR_FMT "\n",
H
Huang Ying 已提交
1873 1874 1875
                            length, addr);
                    exit(1);
                }
1876
                memory_try_enable_merging(vaddr, length);
1877
                qemu_ram_setup_dump(vaddr, length);
H
Huang Ying 已提交
1878 1879 1880 1881 1882 1883
            }
        }
    }
}
#endif /* !_WIN32 */

/* Return a host pointer to ram allocated with qemu_ram_alloc.
 * This should not be used for general purpose DMA.  Use address_space_map
 * or address_space_rw instead. For local memory (e.g. video ram) that the
 * device owns, use memory_region_get_ram_ptr.
 *
 * Called within RCU critical section.
 */
void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
{
    RAMBlock *block = ram_block;

    if (block == NULL) {
        block = qemu_get_ram_block(addr);
        addr -= block->offset;
    }

    if (xen_enabled() && block->host == NULL) {
        /* We need to check if the requested address is in the RAM
         * because we don't want to map the entire memory in QEMU.
         * In that case just map until the end of the page.
         */
        if (block->offset == 0) {
            return xen_map_cache(addr, 0, 0);
        }

        block->host = xen_map_cache(block->offset, block->max_length, 1);
    }
    return ramblock_ptr(block, addr);
}

/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
 * but takes a size argument.
 *
 * Called within RCU critical section.
 */
static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
                                 hwaddr *size)
{
    RAMBlock *block = ram_block;
    if (*size == 0) {
        return NULL;
    }

    if (block == NULL) {
        block = qemu_get_ram_block(addr);
        addr -= block->offset;
    }
    *size = MIN(*size, block->max_length - addr);

    if (xen_enabled() && block->host == NULL) {
        /* We need to check if the requested address is in the RAM
         * because we don't want to map the entire memory in QEMU.
         * In that case just map the requested area.
         */
        if (block->offset == 0) {
            return xen_map_cache(addr, *size, 1);
        }

        block->host = xen_map_cache(block->offset, block->max_length, 1);
    }

    return ramblock_ptr(block, addr);
}

/*
 * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
 * in that RAMBlock.
 *
 * ptr: Host pointer to look up
 * round_offset: If true round the result offset down to a page boundary
 * *ram_addr: set to result ram_addr
 * *offset: set to result offset within the RAMBlock
 *
 * Returns: RAMBlock (or NULL if not found)
 *
 * By the time this function returns, the returned pointer is not protected
 * by RCU anymore.  If the caller is not within an RCU critical section and
 * does not hold the iothread lock, it must have other means of protecting the
 * pointer, such as a reference to the region that includes the incoming
 * ram_addr_t.
 */
RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
                                   ram_addr_t *offset)
{
    RAMBlock *block;
    uint8_t *host = ptr;

    if (xen_enabled()) {
        ram_addr_t ram_addr;
        rcu_read_lock();
        ram_addr = xen_ram_addr_from_mapcache(ptr);
        block = qemu_get_ram_block(ram_addr);
        if (block) {
            *offset = ram_addr - block->offset;
        }
        rcu_read_unlock();
        return block;
    }

    rcu_read_lock();
    block = atomic_rcu_read(&ram_list.mru_block);
    if (block && block->host && host - block->host < block->max_length) {
        goto found;
    }

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        /* This case happens when the block is not mapped. */
        if (block->host == NULL) {
            continue;
        }
        if (host - block->host < block->max_length) {
            goto found;
        }
    }

    rcu_read_unlock();
    return NULL;

found:
    *offset = (host - block->host);
    if (round_offset) {
        *offset &= TARGET_PAGE_MASK;
    }
    rcu_read_unlock();
    return block;
}

/*
 * Finds the named RAMBlock
 *
 * name: The name of RAMBlock to find
 *
 * Returns: RAMBlock (or NULL if not found)
 */
RAMBlock *qemu_ram_block_by_name(const char *name)
{
    RAMBlock *block;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        if (!strcmp(name, block->idstr)) {
            return block;
        }
    }

    return NULL;
}
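
/*
 * Lookup sketch (illustrative only): e.g. migration code can resolve a block
 * name received from the stream back to its RAMBlock; "idstr" is a
 * hypothetical name such as "pc.ram".
 *
 *     RAMBlock *rb = qemu_ram_block_by_name(idstr);
 *     if (!rb) {
 *         error_report("unknown RAM block '%s'", idstr);
 *     }
 */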

/* Some of the softmmu routines need to translate from a host pointer
   (typically a TLB entry) back to a ram offset.  */
ram_addr_t qemu_ram_addr_from_host(void *ptr)
{
    RAMBlock *block;
    ram_addr_t offset;

    block = qemu_ram_block_from_host(ptr, false, &offset);
    if (!block) {
        return RAM_ADDR_INVALID;
    }

    return block->offset + offset;
}
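
/*
 * Lookup sketch (illustrative only): given a host pointer into guest RAM,
 * the owning block and the offset inside it can be recovered like this;
 * "host_ptr" is a hypothetical pointer obtained from a mapped RAMBlock.
 *
 *     ram_addr_t offset;
 *     RAMBlock *rb = qemu_ram_block_from_host(host_ptr, false, &offset);
 *     if (rb) {
 *         ram_addr_t ram_addr = rb->offset + offset;
 *         ...
 *     }
 *
 * qemu_ram_addr_from_host() above wraps exactly this and returns
 * RAM_ADDR_INVALID when the pointer does not belong to any RAMBlock.
 */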

/* Called within RCU critical section.  */
A
Avi Kivity 已提交
2047
static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
2048
                               uint64_t val, unsigned size)
2049
{
2050 2051
    bool locked = false;

2052
    if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
2053 2054
        locked = true;
        tb_lock();
2055
        tb_invalidate_phys_page_fast(ram_addr, size);
2056
    }
2057 2058
    switch (size) {
    case 1:
2059
        stb_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2060 2061
        break;
    case 2:
2062
        stw_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2063 2064
        break;
    case 4:
2065
        stl_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2066 2067 2068
        break;
    default:
        abort();
2069
    }
2070 2071 2072 2073 2074

    if (locked) {
        tb_unlock();
    }

2075 2076 2077 2078 2079
    /* Set both VGA and migration bits for simplicity and to remove
     * the notdirty callback faster.
     */
    cpu_physical_memory_set_dirty_range(ram_addr, size,
                                        DIRTY_CLIENTS_NOCODE);
B
bellard 已提交
2080 2081
    /* we remove the notdirty callback only if the code has been
       flushed */
2082
    if (!cpu_physical_memory_is_clean(ram_addr)) {
2083
        tlb_set_dirty(current_cpu, current_cpu->mem_io_vaddr);
2084
    }
2085 2086
}

2087 2088 2089 2090 2091 2092
static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
                                 unsigned size, bool is_write)
{
    return is_write;
}

2093 2094
static const MemoryRegionOps notdirty_mem_ops = {
    .write = notdirty_mem_write,
2095
    .valid.accepts = notdirty_mem_accepts,
2096
    .endianness = DEVICE_NATIVE_ENDIAN,
2097 2098
};

P
pbrook 已提交
2099
/* Generate a debug exception if a watchpoint has been hit.  */
2100
static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
P
pbrook 已提交
2101
{
2102
    CPUState *cpu = current_cpu;
2103
    CPUClass *cc = CPU_GET_CLASS(cpu);
2104
    CPUArchState *env = cpu->env_ptr;
2105
    target_ulong pc, cs_base;
P
pbrook 已提交
2106
    target_ulong vaddr;
2107
    CPUWatchpoint *wp;
2108
    uint32_t cpu_flags;
P
pbrook 已提交
2109

2110
    if (cpu->watchpoint_hit) {
2111 2112 2113
        /* We re-entered the check after replacing the TB. Now raise
         * the debug interrupt so that it will trigger after the
         * current instruction. */
2114
        cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
2115 2116
        return;
    }
2117
    vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset;
2118
    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
2119 2120
        if (cpu_watchpoint_address_matches(wp, vaddr, len)
            && (wp->flags & flags)) {
2121 2122 2123 2124 2125 2126
            if (flags == BP_MEM_READ) {
                wp->flags |= BP_WATCHPOINT_HIT_READ;
            } else {
                wp->flags |= BP_WATCHPOINT_HIT_WRITE;
            }
            wp->hitaddr = vaddr;
2127
            wp->hitattrs = attrs;
2128
            if (!cpu->watchpoint_hit) {
2129 2130 2131 2132 2133
                if (wp->flags & BP_CPU &&
                    !cc->debug_check_watchpoint(cpu, wp)) {
                    wp->flags &= ~BP_WATCHPOINT_HIT;
                    continue;
                }
2134
                cpu->watchpoint_hit = wp;
2135 2136 2137 2138 2139 2140

                /* The tb_lock will be reset when cpu_loop_exit or
                 * cpu_loop_exit_noexc longjmp back into the cpu_exec
                 * main loop.
                 */
                tb_lock();
2141
                tb_check_watchpoint(cpu);
2142
                if (wp->flags & BP_STOP_BEFORE_ACCESS) {
2143
                    cpu->exception_index = EXCP_DEBUG;
2144
                    cpu_loop_exit(cpu);
2145 2146
                } else {
                    cpu_get_tb_cpu_state(env, &pc, &cs_base, &cpu_flags);
2147
                    tb_gen_code(cpu, pc, cs_base, cpu_flags, 1);
2148
                    cpu_loop_exit_noexc(cpu);
2149
                }
2150
            }
2151 2152
        } else {
            wp->flags &= ~BP_WATCHPOINT_HIT;
P
pbrook 已提交
2153 2154 2155 2156
        }
    }
}

2157 2158 2159
/* Watchpoint access routines.  Watchpoints are inserted using TLB tricks,
   so these check for a hit then pass through to the normal out-of-line
   phys routines.  */
2160 2161
static MemTxResult watch_mem_read(void *opaque, hwaddr addr, uint64_t *pdata,
                                  unsigned size, MemTxAttrs attrs)
2162
{
2163 2164
    MemTxResult res;
    uint64_t data;
2165 2166
    int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
    AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2167 2168

    check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_READ);
2169
    switch (size) {
2170
    case 1:
2171
        data = address_space_ldub(as, addr, attrs, &res);
2172 2173
        break;
    case 2:
2174
        data = address_space_lduw(as, addr, attrs, &res);
2175 2176
        break;
    case 4:
2177
        data = address_space_ldl(as, addr, attrs, &res);
2178
        break;
2179 2180
    default: abort();
    }
2181 2182
    *pdata = data;
    return res;
2183 2184
}

2185 2186 2187
static MemTxResult watch_mem_write(void *opaque, hwaddr addr,
                                   uint64_t val, unsigned size,
                                   MemTxAttrs attrs)
2188
{
2189
    MemTxResult res;
2190 2191
    int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
    AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2192 2193

    check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_WRITE);
2194
    switch (size) {
2195
    case 1:
2196
        address_space_stb(as, addr, val, attrs, &res);
2197 2198
        break;
    case 2:
2199
        address_space_stw(as, addr, val, attrs, &res);
2200 2201
        break;
    case 4:
2202
        address_space_stl(as, addr, val, attrs, &res);
2203
        break;
2204 2205
    default: abort();
    }
2206
    return res;
2207 2208
}

2209
static const MemoryRegionOps watch_mem_ops = {
2210 2211
    .read_with_attrs = watch_mem_read,
    .write_with_attrs = watch_mem_write,
2212
    .endianness = DEVICE_NATIVE_ENDIAN,
2213 2214
};

2215 2216
static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
                                unsigned len, MemTxAttrs attrs)
2217
{
2218
    subpage_t *subpage = opaque;
2219
    uint8_t buf[8];
2220
    MemTxResult res;
2221

2222
#if defined(DEBUG_SUBPAGE)
A
Amos Kong 已提交
2223
    printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
2224
           subpage, len, addr);
2225
#endif
2226 2227 2228 2229
    res = address_space_read(subpage->as, addr + subpage->base,
                             attrs, buf, len);
    if (res) {
        return res;
2230
    }
2231 2232
    switch (len) {
    case 1:
2233 2234
        *data = ldub_p(buf);
        return MEMTX_OK;
2235
    case 2:
2236 2237
        *data = lduw_p(buf);
        return MEMTX_OK;
2238
    case 4:
2239 2240
        *data = ldl_p(buf);
        return MEMTX_OK;
2241
    case 8:
2242 2243
        *data = ldq_p(buf);
        return MEMTX_OK;
2244 2245 2246
    default:
        abort();
    }
2247 2248
}

2249 2250
static MemTxResult subpage_write(void *opaque, hwaddr addr,
                                 uint64_t value, unsigned len, MemTxAttrs attrs)
2251
{
2252
    subpage_t *subpage = opaque;
2253
    uint8_t buf[8];
2254

2255
#if defined(DEBUG_SUBPAGE)
A
Amos Kong 已提交
2256
    printf("%s: subpage %p len %u addr " TARGET_FMT_plx
2257 2258
           " value %"PRIx64"\n",
           __func__, subpage, len, addr, value);
2259
#endif
2260 2261 2262 2263 2264 2265 2266 2267 2268 2269
    switch (len) {
    case 1:
        stb_p(buf, value);
        break;
    case 2:
        stw_p(buf, value);
        break;
    case 4:
        stl_p(buf, value);
        break;
2270 2271 2272
    case 8:
        stq_p(buf, value);
        break;
2273 2274 2275
    default:
        abort();
    }
2276 2277
    return address_space_write(subpage->as, addr + subpage->base,
                               attrs, buf, len);
2278 2279
}

2280
static bool subpage_accepts(void *opaque, hwaddr addr,
A
Amos Kong 已提交
2281
                            unsigned len, bool is_write)
2282
{
2283
    subpage_t *subpage = opaque;
2284
#if defined(DEBUG_SUBPAGE)
A
Amos Kong 已提交
2285
    printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
2286
           __func__, subpage, is_write ? 'w' : 'r', len, addr);
2287 2288
#endif

2289
    return address_space_access_valid(subpage->as, addr + subpage->base,
A
Amos Kong 已提交
2290
                                      len, is_write);
2291 2292
}

2293
static const MemoryRegionOps subpage_ops = {
2294 2295
    .read_with_attrs = subpage_read,
    .write_with_attrs = subpage_write,
2296 2297 2298 2299
    .impl.min_access_size = 1,
    .impl.max_access_size = 8,
    .valid.min_access_size = 1,
    .valid.max_access_size = 8,
2300
    .valid.accepts = subpage_accepts,
2301
    .endianness = DEVICE_NATIVE_ENDIAN,
2302 2303
};

A
Anthony Liguori 已提交
2304
static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
2305
                             uint16_t section)
2306 2307 2308 2309 2310 2311 2312 2313
{
    int idx, eidx;

    if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
        return -1;
    idx = SUBPAGE_IDX(start);
    eidx = SUBPAGE_IDX(end);
#if defined(DEBUG_SUBPAGE)
A
Amos Kong 已提交
2314 2315
    printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
           __func__, mmio, start, end, idx, eidx, section);
2316 2317
#endif
    for (; idx <= eidx; idx++) {
2318
        mmio->sub_section[idx] = section;
2319 2320 2321 2322 2323
    }

    return 0;
}

2324
static subpage_t *subpage_init(AddressSpace *as, hwaddr base)
2325
{
A
Anthony Liguori 已提交
2326
    subpage_t *mmio;
2327

2328
    mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
2329
    mmio->as = as;
2330
    mmio->base = base;
2331
    memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
P
Peter Crosthwaite 已提交
2332
                          NULL, TARGET_PAGE_SIZE);
A
Avi Kivity 已提交
2333
    mmio->iomem.subpage = true;
2334
#if defined(DEBUG_SUBPAGE)
A
Amos Kong 已提交
2335 2336
    printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
           mmio, base, TARGET_PAGE_SIZE);
2337
#endif
2338
    subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, PHYS_SECTION_UNASSIGNED);
2339 2340 2341 2342

    return mmio;
}

2343 2344
static uint16_t dummy_section(PhysPageMap *map, AddressSpace *as,
                              MemoryRegion *mr)
2345
{
2346
    assert(as);
2347
    MemoryRegionSection section = {
2348
        .address_space = as,
2349 2350 2351
        .mr = mr,
        .offset_within_address_space = 0,
        .offset_within_region = 0,
2352
        .size = int128_2_64(),
2353 2354
    };

2355
    return phys_section_add(map, &section);
2356 2357
}

2358
MemoryRegion *iotlb_to_region(CPUState *cpu, hwaddr index, MemTxAttrs attrs)
2359
{
2360 2361
    int asidx = cpu_asidx_from_attrs(cpu, attrs);
    CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
2362
    AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
2363
    MemoryRegionSection *sections = d->map.sections;
P
Paolo Bonzini 已提交
2364 2365

    return sections[index & ~TARGET_PAGE_MASK].mr;
2366 2367
}

A
Avi Kivity 已提交
2368 2369
static void io_mem_init(void)
{
2370
    memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
2371
    memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
2372
                          NULL, UINT64_MAX);
2373
    memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
2374
                          NULL, UINT64_MAX);
2375
    memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
2376
                          NULL, UINT64_MAX);
A
Avi Kivity 已提交
2377 2378
}

A
Avi Kivity 已提交
2379
static void mem_begin(MemoryListener *listener)
2380 2381
{
    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
2382 2383 2384
    AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
    uint16_t n;

2385
    n = dummy_section(&d->map, as, &io_mem_unassigned);
2386
    assert(n == PHYS_SECTION_UNASSIGNED);
2387
    n = dummy_section(&d->map, as, &io_mem_notdirty);
2388
    assert(n == PHYS_SECTION_NOTDIRTY);
2389
    n = dummy_section(&d->map, as, &io_mem_rom);
2390
    assert(n == PHYS_SECTION_ROM);
2391
    n = dummy_section(&d->map, as, &io_mem_watch);
2392
    assert(n == PHYS_SECTION_WATCH);
2393

M
Michael S. Tsirkin 已提交
2394
    d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
2395 2396 2397 2398
    d->as = as;
    as->next_dispatch = d;
}

2399 2400 2401 2402 2403 2404
static void address_space_dispatch_free(AddressSpaceDispatch *d)
{
    phys_sections_free(&d->map);
    g_free(d);
}

2405
static void mem_commit(MemoryListener *listener)
A
Avi Kivity 已提交
2406
{
2407
    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
2408 2409 2410
    AddressSpaceDispatch *cur = as->dispatch;
    AddressSpaceDispatch *next = as->next_dispatch;

2411
    phys_page_compact_all(next, next->map.nodes_nb);
2412

2413
    atomic_rcu_set(&as->dispatch, next);
2414
    if (cur) {
2415
        call_rcu(cur, address_space_dispatch_free, rcu);
2416
    }
2417 2418
}

2419
static void tcg_commit(MemoryListener *listener)
2420
{
2421 2422
    CPUAddressSpace *cpuas;
    AddressSpaceDispatch *d;
2423 2424 2425

    /* since each CPU stores ram addresses in its TLB cache, we must
       reset the modified entries */
2426 2427 2428 2429 2430 2431 2432
    cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
    cpu_reloading_memory_map();
    /* The CPU and TLB are protected by the iothread lock.
     * We reload the dispatch pointer now because cpu_reloading_memory_map()
     * may have split the RCU critical section.
     */
    d = atomic_rcu_read(&cpuas->as->dispatch);
2433
    atomic_rcu_set(&cpuas->memory_dispatch, d);
2434
    tlb_flush(cpuas->cpu);
2435 2436
}

A
Avi Kivity 已提交
2437 2438
void address_space_init_dispatch(AddressSpace *as)
{
2439
    as->dispatch = NULL;
2440
    as->dispatch_listener = (MemoryListener) {
A
Avi Kivity 已提交
2441
        .begin = mem_begin,
2442
        .commit = mem_commit,
A
Avi Kivity 已提交
2443 2444 2445 2446
        .region_add = mem_add,
        .region_nop = mem_add,
        .priority = 0,
    };
2447
    memory_listener_register(&as->dispatch_listener, as);
A
Avi Kivity 已提交
2448 2449
}

2450 2451 2452 2453 2454
void address_space_unregister(AddressSpace *as)
{
    memory_listener_unregister(&as->dispatch_listener);
}

A
Avi Kivity 已提交
2455 2456 2457 2458
void address_space_destroy_dispatch(AddressSpace *as)
{
    AddressSpaceDispatch *d = as->dispatch;

2459 2460 2461 2462
    atomic_rcu_set(&as->dispatch, NULL);
    if (d) {
        call_rcu(d, address_space_dispatch_free, rcu);
    }
A
Avi Kivity 已提交
2463 2464
}

A
Avi Kivity 已提交
2465 2466
static void memory_map_init(void)
{
2467
    system_memory = g_malloc(sizeof(*system_memory));
2468

2469
    memory_region_init(system_memory, NULL, "system", UINT64_MAX);
2470
    address_space_init(&address_space_memory, system_memory, "memory");
2471

2472
    system_io = g_malloc(sizeof(*system_io));
2473 2474
    memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
                          65536);
2475
    address_space_init(&address_space_io, system_io, "I/O");
A
Avi Kivity 已提交
2476 2477 2478 2479 2480 2481 2482
}

MemoryRegion *get_system_memory(void)
{
    return system_memory;
}

2483 2484 2485 2486 2487
MemoryRegion *get_system_io(void)
{
    return system_io;
}

2488 2489
#endif /* !defined(CONFIG_USER_ONLY) */

B
bellard 已提交
2490 2491
/* physical memory access (slow version, mainly for debug) */
#if defined(CONFIG_USER_ONLY)
2492
int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
P
Paul Brook 已提交
2493
                        uint8_t *buf, int len, int is_write)
B
bellard 已提交
2494 2495 2496
{
    int l, flags;
    target_ulong page;
2497
    void * p;
B
bellard 已提交
2498 2499 2500 2501 2502 2503 2504 2505

    while (len > 0) {
        page = addr & TARGET_PAGE_MASK;
        l = (page + TARGET_PAGE_SIZE) - addr;
        if (l > len)
            l = len;
        flags = page_get_flags(page);
        if (!(flags & PAGE_VALID))
P
Paul Brook 已提交
2506
            return -1;
B
bellard 已提交
2507 2508
        if (is_write) {
            if (!(flags & PAGE_WRITE))
P
Paul Brook 已提交
2509
                return -1;
2510
            /* XXX: this code should not depend on lock_user */
A
aurel32 已提交
2511
            if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
P
Paul Brook 已提交
2512
                return -1;
A
aurel32 已提交
2513 2514
            memcpy(p, buf, l);
            unlock_user(p, addr, l);
B
bellard 已提交
2515 2516
        } else {
            if (!(flags & PAGE_READ))
P
Paul Brook 已提交
2517
                return -1;
2518
            /* XXX: this code should not depend on lock_user */
A
aurel32 已提交
2519
            if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
P
Paul Brook 已提交
2520
                return -1;
A
aurel32 已提交
2521
            memcpy(buf, p, l);
A
aurel32 已提交
2522
            unlock_user(p, addr, 0);
B
bellard 已提交
2523 2524 2525 2526 2527
        }
        len -= l;
        buf += l;
        addr += l;
    }
P
Paul Brook 已提交
2528
    return 0;
B
bellard 已提交
2529
}
B
bellard 已提交
2530

B
bellard 已提交
2531
#else
2532

2533
static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
A
Avi Kivity 已提交
2534
                                     hwaddr length)
2535
{
2536
    uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
2537 2538
    addr += memory_region_get_ram_addr(mr);

2539 2540 2541 2542 2543 2544 2545 2546 2547
    /* No early return if dirty_log_mask is or becomes 0, because
     * cpu_physical_memory_set_dirty_range will still call
     * xen_modified_memory.
     */
    if (dirty_log_mask) {
        dirty_log_mask =
            cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
    }
    if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
2548
        tb_lock();
2549
        tb_invalidate_phys_range(addr, addr + length);
2550
        tb_unlock();
2551
        dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
2552
    }
2553
    cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
2554 2555
}

2556
static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
2557
{
2558
    unsigned access_size_max = mr->ops->valid.max_access_size;
2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571

    /* Regions are assumed to support 1-4 byte accesses unless
       otherwise specified.  */
    if (access_size_max == 0) {
        access_size_max = 4;
    }

    /* Bound the maximum access by the alignment of the address.  */
    if (!mr->ops->impl.unaligned) {
        unsigned align_size_max = addr & -addr;
        if (align_size_max != 0 && align_size_max < access_size_max) {
            access_size_max = align_size_max;
        }
2572
    }
2573 2574 2575 2576

    /* Don't attempt accesses larger than the maximum.  */
    if (l > access_size_max) {
        l = access_size_max;
2577
    }
2578
    l = pow2floor(l);
2579 2580

    return l;
2581 2582
}

2583
static bool prepare_mmio_access(MemoryRegion *mr)
2584
{
2585 2586 2587 2588 2589 2590 2591 2592
    bool unlocked = !qemu_mutex_iothread_locked();
    bool release_lock = false;

    if (unlocked && mr->global_locking) {
        qemu_mutex_lock_iothread();
        unlocked = false;
        release_lock = true;
    }
2593
    if (mr->flush_coalesced_mmio) {
2594 2595 2596
        if (unlocked) {
            qemu_mutex_lock_iothread();
        }
2597
        qemu_flush_coalesced_mmio_buffer();
2598 2599 2600
        if (unlocked) {
            qemu_mutex_unlock_iothread();
        }
2601
    }
2602 2603

    return release_lock;
2604 2605
}

2606 2607 2608 2609 2610 2611
/* Called within RCU critical section.  */
static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr,
                                                MemTxAttrs attrs,
                                                const uint8_t *buf,
                                                int len, hwaddr addr1,
                                                hwaddr l, MemoryRegion *mr)
B
bellard 已提交
2612 2613
{
    uint8_t *ptr;
2614
    uint64_t val;
2615
    MemTxResult result = MEMTX_OK;
2616
    bool release_lock = false;
2617

2618
    for (;;) {
2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632
        if (!memory_access_is_direct(mr, true)) {
            release_lock |= prepare_mmio_access(mr);
            l = memory_access_size(mr, l, addr1);
            /* XXX: could force current_cpu to NULL to avoid
               potential bugs */
            switch (l) {
            case 8:
                /* 64 bit write access */
                val = ldq_p(buf);
                result |= memory_region_dispatch_write(mr, addr1, val, 8,
                                                       attrs);
                break;
            case 4:
                /* 32 bit write access */
2633
                val = (uint32_t)ldl_p(buf);
2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650
                result |= memory_region_dispatch_write(mr, addr1, val, 4,
                                                       attrs);
                break;
            case 2:
                /* 16 bit write access */
                val = lduw_p(buf);
                result |= memory_region_dispatch_write(mr, addr1, val, 2,
                                                       attrs);
                break;
            case 1:
                /* 8 bit write access */
                val = ldub_p(buf);
                result |= memory_region_dispatch_write(mr, addr1, val, 1,
                                                       attrs);
                break;
            default:
                abort();
B
bellard 已提交
2651 2652
            }
        } else {
2653
            /* RAM case */
2654
            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
2655 2656
            memcpy(ptr, buf, l);
            invalidate_and_set_dirty(mr, addr1, l);
B
bellard 已提交
2657
        }
2658 2659 2660 2661 2662 2663

        if (release_lock) {
            qemu_mutex_unlock_iothread();
            release_lock = false;
        }

B
bellard 已提交
2664 2665 2666
        len -= l;
        buf += l;
        addr += l;
2667 2668 2669 2670 2671 2672 2673

        if (!len) {
            break;
        }

        l = len;
        mr = address_space_translate(as, addr, &addr1, &l, true);
B
bellard 已提交
2674
    }
2675

2676
    return result;
B
bellard 已提交
2677
}
B
bellard 已提交
2678

2679 2680
MemTxResult address_space_write(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
                                const uint8_t *buf, int len)
A
Avi Kivity 已提交
2681
{
2682 2683 2684 2685 2686
    hwaddr l;
    hwaddr addr1;
    MemoryRegion *mr;
    MemTxResult result = MEMTX_OK;

2687 2688
    if (len > 0) {
        rcu_read_lock();
2689
        l = len;
2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708
        mr = address_space_translate(as, addr, &addr1, &l, true);
        result = address_space_write_continue(as, addr, attrs, buf, len,
                                              addr1, l, mr);
        rcu_read_unlock();
    }

    return result;
}

/* Called within RCU critical section.  */
MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr,
                                        MemTxAttrs attrs, uint8_t *buf,
                                        int len, hwaddr addr1, hwaddr l,
                                        MemoryRegion *mr)
{
    uint8_t *ptr;
    uint64_t val;
    MemTxResult result = MEMTX_OK;
    bool release_lock = false;
2709

2710
    for (;;) {
2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744
        if (!memory_access_is_direct(mr, false)) {
            /* I/O case */
            release_lock |= prepare_mmio_access(mr);
            l = memory_access_size(mr, l, addr1);
            switch (l) {
            case 8:
                /* 64 bit read access */
                result |= memory_region_dispatch_read(mr, addr1, &val, 8,
                                                      attrs);
                stq_p(buf, val);
                break;
            case 4:
                /* 32 bit read access */
                result |= memory_region_dispatch_read(mr, addr1, &val, 4,
                                                      attrs);
                stl_p(buf, val);
                break;
            case 2:
                /* 16 bit read access */
                result |= memory_region_dispatch_read(mr, addr1, &val, 2,
                                                      attrs);
                stw_p(buf, val);
                break;
            case 1:
                /* 8 bit read access */
                result |= memory_region_dispatch_read(mr, addr1, &val, 1,
                                                      attrs);
                stb_p(buf, val);
                break;
            default:
                abort();
            }
        } else {
            /* RAM case */
2745
            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756
            memcpy(buf, ptr, l);
        }

        if (release_lock) {
            qemu_mutex_unlock_iothread();
            release_lock = false;
        }

        len -= l;
        buf += l;
        addr += l;
2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768

        if (!len) {
            break;
        }

        l = len;
        mr = address_space_translate(as, addr, &addr1, &l, false);
    }

    return result;
}

2769 2770
MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
                                    MemTxAttrs attrs, uint8_t *buf, int len)
2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783
{
    hwaddr l;
    hwaddr addr1;
    MemoryRegion *mr;
    MemTxResult result = MEMTX_OK;

    if (len > 0) {
        rcu_read_lock();
        l = len;
        mr = address_space_translate(as, addr, &addr1, &l, false);
        result = address_space_read_continue(as, addr, attrs, buf, len,
                                             addr1, l, mr);
        rcu_read_unlock();
2784 2785 2786
    }

    return result;
}

MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
                             uint8_t *buf, int len, bool is_write)
{
    if (is_write) {
        return address_space_write(as, addr, attrs, (uint8_t *)buf, len);
    } else {
        return address_space_read(as, addr, attrs, (uint8_t *)buf, len);
    }
}

void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
                            int len, int is_write)
{
    address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
                     buf, len, is_write);
}
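
/*
 * Usage sketch (illustrative only): copying a few bytes to or from
 * guest-physical memory with the convenience wrapper; "gpa" and "buf"
 * are hypothetical.
 *
 *     uint8_t buf[4];
 *     cpu_physical_memory_rw(gpa, buf, sizeof(buf), 0);    read (is_write == 0)
 *     cpu_physical_memory_rw(gpa, buf, sizeof(buf), 1);    write (is_write == 1)
 *
 * Callers that need MemTxAttrs or a MemTxResult should use address_space_rw()
 * or address_space_read()/address_space_write() on an explicit AddressSpace.
 */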

enum write_rom_type {
    WRITE_DATA,
    FLUSH_CACHE,
};

2811
static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
2812
    hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
B
bellard 已提交
2813
{
2814
    hwaddr l;
B
bellard 已提交
2815
    uint8_t *ptr;
2816
    hwaddr addr1;
2817
    MemoryRegion *mr;
2818

2819
    rcu_read_lock();
B
bellard 已提交
2820
    while (len > 0) {
2821
        l = len;
2822
        mr = address_space_translate(as, addr, &addr1, &l, true);
2823

2824 2825
        if (!(memory_region_is_ram(mr) ||
              memory_region_is_romd(mr))) {
2826
            l = memory_access_size(mr, l, addr1);
B
bellard 已提交
2827 2828
        } else {
            /* ROM/RAM case */
2829
            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
2830 2831 2832
            switch (type) {
            case WRITE_DATA:
                memcpy(ptr, buf, l);
2833
                invalidate_and_set_dirty(mr, addr1, l);
2834 2835 2836 2837 2838
                break;
            case FLUSH_CACHE:
                flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
                break;
            }
B
bellard 已提交
2839 2840 2841 2842 2843
        }
        len -= l;
        buf += l;
        addr += l;
    }
2844
    rcu_read_unlock();
B
bellard 已提交
2845 2846
}

2847
/* used for ROM loading : can write in RAM and ROM */
2848
void cpu_physical_memory_write_rom(AddressSpace *as, hwaddr addr,
2849 2850
                                   const uint8_t *buf, int len)
{
2851
    cpu_physical_memory_write_rom_internal(as, addr, buf, len, WRITE_DATA);
2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865
}

void cpu_flush_icache_range(hwaddr start, int len)
{
    /*
     * This function should do the same thing as an icache flush that was
     * triggered from within the guest. For TCG we are always cache coherent,
     * so there is no need to flush anything. For KVM / Xen we need to flush
     * the host's instruction cache at least.
     */
    if (tcg_enabled()) {
        return;
    }

2866 2867
    cpu_physical_memory_write_rom_internal(&address_space_memory,
                                           start, NULL, len, FLUSH_CACHE);
2868 2869
}

2870
typedef struct {
2871
    MemoryRegion *mr;
2872
    void *buffer;
A
Avi Kivity 已提交
2873 2874
    hwaddr addr;
    hwaddr len;
F
Fam Zheng 已提交
2875
    bool in_use;
2876 2877 2878 2879
} BounceBuffer;

static BounceBuffer bounce;

2880
typedef struct MapClient {
2881
    QEMUBH *bh;
B
Blue Swirl 已提交
2882
    QLIST_ENTRY(MapClient) link;
2883 2884
} MapClient;

2885
QemuMutex map_client_list_lock;
B
Blue Swirl 已提交
2886 2887
static QLIST_HEAD(map_client_list, MapClient) map_client_list
    = QLIST_HEAD_INITIALIZER(map_client_list);
2888

2889 2890 2891 2892 2893 2894
static void cpu_unregister_map_client_do(MapClient *client)
{
    QLIST_REMOVE(client, link);
    g_free(client);
}

2895 2896 2897 2898 2899 2900
static void cpu_notify_map_clients_locked(void)
{
    MapClient *client;

    while (!QLIST_EMPTY(&map_client_list)) {
        client = QLIST_FIRST(&map_client_list);
2901 2902
        qemu_bh_schedule(client->bh);
        cpu_unregister_map_client_do(client);
2903 2904 2905
    }
}

2906
void cpu_register_map_client(QEMUBH *bh)
2907
{
2908
    MapClient *client = g_malloc(sizeof(*client));
2909

2910
    qemu_mutex_lock(&map_client_list_lock);
2911
    client->bh = bh;
B
Blue Swirl 已提交
2912
    QLIST_INSERT_HEAD(&map_client_list, client, link);
2913 2914 2915
    if (!atomic_read(&bounce.in_use)) {
        cpu_notify_map_clients_locked();
    }
2916
    qemu_mutex_unlock(&map_client_list_lock);
2917 2918
}

2919
void cpu_exec_init_all(void)
2920
{
2921
    qemu_mutex_init(&ram_list.mutex);
2922 2923 2924 2925 2926 2927 2928 2929
    /* The data structures we set up here depend on knowing the page size,
     * so no more changes can be made after this point.
     * In an ideal world, nothing we did before we had finished the
     * machine setup would care about the target page size, and we could
     * do this much later, rather than requiring board models to state
     * up front what their requirements are.
     */
    finalize_target_page_bits();
2930
    io_mem_init();
2931
    memory_map_init();
2932
    qemu_mutex_init(&map_client_list_lock);
2933 2934
}

2935
void cpu_unregister_map_client(QEMUBH *bh)
2936 2937 2938
{
    MapClient *client;

2939 2940 2941 2942 2943 2944
    qemu_mutex_lock(&map_client_list_lock);
    QLIST_FOREACH(client, &map_client_list, link) {
        if (client->bh == bh) {
            cpu_unregister_map_client_do(client);
            break;
        }
2945
    }
2946
    qemu_mutex_unlock(&map_client_list_lock);
2947 2948 2949 2950
}

static void cpu_notify_map_clients(void)
{
2951
    qemu_mutex_lock(&map_client_list_lock);
2952
    cpu_notify_map_clients_locked();
2953
    qemu_mutex_unlock(&map_client_list_lock);
2954 2955
}

2956 2957
bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_write)
{
2958
    MemoryRegion *mr;
2959 2960
    hwaddr l, xlat;

2961
    rcu_read_lock();
2962 2963
    while (len > 0) {
        l = len;
2964 2965 2966 2967
        mr = address_space_translate(as, addr, &xlat, &l, is_write);
        if (!memory_access_is_direct(mr, is_write)) {
            l = memory_access_size(mr, l, addr);
            if (!memory_region_access_valid(mr, xlat, l, is_write)) {
R
Roman Kapl 已提交
2968
                rcu_read_unlock();
2969 2970 2971 2972 2973 2974 2975
                return false;
            }
        }

        len -= l;
        addr += l;
    }
2976
    rcu_read_unlock();
2977 2978 2979
    return true;
}

2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004
static hwaddr
address_space_extend_translation(AddressSpace *as, hwaddr addr, hwaddr target_len,
                                 MemoryRegion *mr, hwaddr base, hwaddr len,
                                 bool is_write)
{
    hwaddr done = 0;
    hwaddr xlat;
    MemoryRegion *this_mr;

    for (;;) {
        target_len -= len;
        addr += len;
        done += len;
        if (target_len == 0) {
            return done;
        }

        len = target_len;
        this_mr = address_space_translate(as, addr, &xlat, &len, is_write);
        if (this_mr != mr || xlat != base + done) {
            return done;
        }
    }
}

/* Map a physical memory region into a host virtual address.
 * May map a subset of the requested range, given by and returned in *plen.
 * May return NULL if resources needed to perform the mapping are exhausted.
 * Use only for reads OR writes - not for read-modify-write operations.
 * Use cpu_register_map_client() to know when retrying the map operation is
 * likely to succeed.
 */
void *address_space_map(AddressSpace *as,
                        hwaddr addr,
                        hwaddr *plen,
                        bool is_write)
{
    hwaddr len = *plen;
    hwaddr l, xlat;
    MemoryRegion *mr;
    void *ptr;

    if (len == 0) {
        return NULL;
    }

    l = len;
    rcu_read_lock();
    mr = address_space_translate(as, addr, &xlat, &l, is_write);

    if (!memory_access_is_direct(mr, is_write)) {
        if (atomic_xchg(&bounce.in_use, true)) {
            rcu_read_unlock();
            return NULL;
        }
        /* Avoid unbounded allocations */
        l = MIN(l, TARGET_PAGE_SIZE);
        bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
        bounce.addr = addr;
        bounce.len = l;

        memory_region_ref(mr);
        bounce.mr = mr;
        if (!is_write) {
            address_space_read(as, addr, MEMTXATTRS_UNSPECIFIED,
                               bounce.buffer, l);
        }

        rcu_read_unlock();
        *plen = l;
        return bounce.buffer;
    }


    memory_region_ref(mr);
    *plen = address_space_extend_translation(as, addr, len, mr, xlat, l, is_write);
    ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen);
    rcu_read_unlock();

    return ptr;
}

/* Unmaps a memory region previously mapped by address_space_map().
 * Will also mark the memory as dirty if is_write == 1.  access_len gives
 * the amount of memory that was actually read or written by the caller.
 */
void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
                         int is_write, hwaddr access_len)
{
    if (buffer != bounce.buffer) {
        MemoryRegion *mr;
        ram_addr_t addr1;

        mr = memory_region_from_host(buffer, &addr1);
        assert(mr != NULL);
        if (is_write) {
            invalidate_and_set_dirty(mr, addr1, access_len);
        }
        if (xen_enabled()) {
            xen_invalidate_map_cache_entry(buffer);
        }
        memory_region_unref(mr);
        return;
    }
    if (is_write) {
        address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
                            bounce.buffer, access_len);
    }
    qemu_vfree(bounce.buffer);
    bounce.buffer = NULL;
    memory_region_unref(bounce.mr);
    atomic_mb_set(&bounce.in_use, false);
    cpu_notify_map_clients();
}
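
/*
 * DMA mapping sketch (illustrative only): the usual map/access/unmap pattern
 * for code that wants direct access to guest memory; "as", "addr" and "len"
 * are hypothetical, and a real caller must cope with *plen coming back
 * smaller than requested or with the map failing (bounce buffer in use).
 *
 *     hwaddr plen = len;
 *     void *p = address_space_map(as, addr, &plen, true);
 *     if (p) {
 *         memset(p, 0, plen);
 *         address_space_unmap(as, p, plen, true, plen);
 *     } else {
 *         cpu_register_map_client(bh);
 *     }
 *
 * where "bh" is a hypothetical QEMUBH that retries the transfer once the
 * bounce buffer is released.
 */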

void *cpu_physical_memory_map(hwaddr addr,
                              hwaddr *plen,
                              int is_write)
{
    return address_space_map(&address_space_memory, addr, plen, is_write);
}

void cpu_physical_memory_unmap(void *buffer, hwaddr len,
                               int is_write, hwaddr access_len)
{
    return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
}

#define ARG1_DECL                AddressSpace *as
#define ARG1                     as
#define SUFFIX
#define TRANSLATE(...)           address_space_translate(as, __VA_ARGS__)
#define IS_DIRECT(mr, is_write)  memory_access_is_direct(mr, is_write)
#define MAP_RAM(mr, ofs)         qemu_map_ram_ptr((mr)->ram_block, ofs)
#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
#define RCU_READ_LOCK(...)       rcu_read_lock()
#define RCU_READ_UNLOCK(...)     rcu_read_unlock()
#include "memory_ldst.inc.c"

int64_t address_space_cache_init(MemoryRegionCache *cache,
                                 AddressSpace *as,
                                 hwaddr addr,
                                 hwaddr len,
                                 bool is_write)
{
    hwaddr l, xlat;
    MemoryRegion *mr;
    void *ptr;

    assert(len > 0);

    l = len;
    mr = address_space_translate(as, addr, &xlat, &l, is_write);
    if (!memory_access_is_direct(mr, is_write)) {
        return -EINVAL;
    }

    l = address_space_extend_translation(as, addr, len, mr, xlat, l, is_write);
    ptr = qemu_ram_ptr_length(mr->ram_block, xlat, &l);

    cache->xlat = xlat;
    cache->is_write = is_write;
    cache->mr = mr;
    cache->ptr = ptr;
    cache->len = l;
    memory_region_ref(cache->mr);

    return l;
}

void address_space_cache_invalidate(MemoryRegionCache *cache,
                                    hwaddr addr,
                                    hwaddr access_len)
{
    assert(cache->is_write);
    invalidate_and_set_dirty(cache->mr, addr + cache->xlat, access_len);
}

void address_space_cache_destroy(MemoryRegionCache *cache)
{
    if (!cache->mr) {
        return;
    }

    if (xen_enabled()) {
        xen_invalidate_map_cache_entry(cache->ptr);
    }
    memory_region_unref(cache->mr);
}

/* Called from RCU critical section.  This function has the same
 * semantics as address_space_translate, but it only works on a
 * predefined range of a MemoryRegion that was mapped with
 * address_space_cache_init.
 */
static inline MemoryRegion *address_space_translate_cached(
    MemoryRegionCache *cache, hwaddr addr, hwaddr *xlat,
    hwaddr *plen, bool is_write)
{
    assert(addr < cache->len && *plen <= cache->len - addr);
    *xlat = addr + cache->xlat;
    return cache->mr;
}

#define ARG1_DECL                MemoryRegionCache *cache
#define ARG1                     cache
#define SUFFIX                   _cached
#define TRANSLATE(...)           address_space_translate_cached(cache, __VA_ARGS__)
#define IS_DIRECT(mr, is_write)  true
#define MAP_RAM(mr, ofs)         (cache->ptr + (ofs - cache->xlat))
#define INVALIDATE(mr, ofs, len) ((void)0)
#define RCU_READ_LOCK()          ((void)0)
#define RCU_READ_UNLOCK()        ((void)0)
#include "memory_ldst.inc.c"
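
/*
 * MemoryRegionCache sketch (illustrative only): code that repeatedly touches
 * a small, fixed guest-physical range (e.g. a virtio ring) can cache the
 * translation once and then use the _cached accessors generated above
 * (assuming the usual generated name address_space_ldl_cached); "as" and
 * "addr" are hypothetical.
 *
 *     MemoryRegionCache cache;
 *     if (address_space_cache_init(&cache, as, addr, 64, false) >= 0) {
 *         uint32_t v = address_space_ldl_cached(&cache, 0,
 *                                               MEMTXATTRS_UNSPECIFIED, NULL);
 *         ...
 *         address_space_cache_destroy(&cache);
 *     }
 */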

3195
/* virtual memory access for debug (includes writing to ROM) */
3196
int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
3197
                        uint8_t *buf, int len, int is_write)
B
bellard 已提交
3198 3199
{
    int l;
A
Avi Kivity 已提交
3200
    hwaddr phys_addr;
3201
    target_ulong page;
B
bellard 已提交
3202 3203

    while (len > 0) {
3204 3205 3206
        int asidx;
        MemTxAttrs attrs;

B
bellard 已提交
3207
        page = addr & TARGET_PAGE_MASK;
3208 3209
        phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
        asidx = cpu_asidx_from_attrs(cpu, attrs);
B
bellard 已提交
3210 3211 3212 3213 3214 3215
        /* if no physical page mapped, return an error */
        if (phys_addr == -1)
            return -1;
        l = (page + TARGET_PAGE_SIZE) - addr;
        if (l > len)
            l = len;
3216
        phys_addr += (addr & ~TARGET_PAGE_MASK);
3217
        if (is_write) {
3218 3219
            cpu_physical_memory_write_rom(cpu->cpu_ases[asidx].as,
                                          phys_addr, buf, l);
3220
        } else {
3221 3222
            address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
                             MEMTXATTRS_UNSPECIFIED,
3223
                             buf, l, 0);
3224
        }
B
bellard 已提交
3225 3226 3227 3228 3229 3230
        len -= l;
        buf += l;
        addr += l;
    }
    return 0;
}
3231 3232 3233 3234 3235 3236 3237 3238 3239 3240

/*
 * Allows code that needs to deal with migration bitmaps etc to still be built
 * target independent.
 */
size_t qemu_target_page_bits(void)
{
    return TARGET_PAGE_BITS;
}

P
Paul Brook 已提交
3241
#endif
B
bellard 已提交
3242

3243 3244 3245 3246
/*
 * A helper function for the _utterly broken_ virtio device model to find out if
 * it's running on a big endian machine. Don't do this at home kids!
 */
3247 3248
bool target_words_bigendian(void);
bool target_words_bigendian(void)
3249 3250 3251 3252 3253 3254 3255 3256
{
#if defined(TARGET_WORDS_BIGENDIAN)
    return true;
#else
    return false;
#endif
}

3257
#ifndef CONFIG_USER_ONLY
A
Avi Kivity 已提交
3258
bool cpu_physical_memory_is_io(hwaddr phys_addr)
3259
{
3260
    MemoryRegion *mr;
3261
    hwaddr l = 1;
3262
    bool res;
3263

3264
    rcu_read_lock();
3265 3266
    mr = address_space_translate(&address_space_memory,
                                 phys_addr, &phys_addr, &l, false);
3267

3268 3269 3270
    res = !(memory_region_is_ram(mr) || memory_region_is_romd(mr));
    rcu_read_unlock();
    return res;
3271
}

int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        ret = func(block->idstr, block->host, block->offset,
                   block->used_length, opaque);
        if (ret) {
            break;
        }
    }
    rcu_read_unlock();
    return ret;
}
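
/*
 * Iteration sketch (illustrative only): a walker that just reports every RAM
 * block; the callback signature mirrors the func() invocation above.
 *
 *     static int dump_block(const char *idstr, void *host, ram_addr_t offset,
 *                           ram_addr_t length, void *opaque)
 *     {
 *         printf("%s: host %p offset " RAM_ADDR_FMT " length " RAM_ADDR_FMT "\n",
 *                idstr, host, offset, length);
 *         return 0;
 *     }
 *
 *     qemu_ram_foreach_block(dump_block, NULL);
 */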
#endif