diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c index e44c319915acac43183176ce993c515932948dd8..e319ec1ad8aff043d54141807de1ec1ac8ec12e6 100644 --- a/backends/hostmem-file.c +++ b/backends/hostmem-file.c @@ -34,6 +34,7 @@ struct HostMemoryBackendFile { bool share; bool discard_data; char *mem_path; + uint64_t align; }; static void @@ -58,7 +59,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) path = object_get_canonical_path(OBJECT(backend)); memory_region_init_ram_from_file(&backend->mr, OBJECT(backend), path, - backend->size, fb->share, + backend->size, fb->align, fb->share, fb->mem_path, errp); g_free(path); } @@ -115,6 +116,40 @@ static void file_memory_backend_set_discard_data(Object *o, bool value, MEMORY_BACKEND_FILE(o)->discard_data = value; } +static void file_memory_backend_get_align(Object *o, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o); + uint64_t val = fb->align; + + visit_type_size(v, name, &val, errp); +} + +static void file_memory_backend_set_align(Object *o, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(o); + HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o); + Error *local_err = NULL; + uint64_t val; + + if (host_memory_backend_mr_inited(backend)) { + error_setg(&local_err, "cannot change property value"); + goto out; + } + + visit_type_size(v, name, &val, &local_err); + if (local_err) { + goto out; + } + fb->align = val; + + out: + error_propagate(errp, local_err); +} + static void file_backend_unparent(Object *obj) { HostMemoryBackend *backend = MEMORY_BACKEND(obj); @@ -145,6 +180,10 @@ file_backend_class_init(ObjectClass *oc, void *data) object_class_property_add_str(oc, "mem-path", get_mem_path, set_mem_path, &error_abort); + object_class_property_add(oc, "align", "int", + file_memory_backend_get_align, + file_memory_backend_set_align, + NULL, NULL, &error_abort); } static void file_backend_instance_finalize(Object *o) diff --git a/docs/nvdimm.txt b/docs/nvdimm.txt index 2d9f8c0e8cfd1043df7fa8f8bf4076e38be3ca65..e903d8bb093b53b8d9ae71c89d529bc33d8db492 100644 --- a/docs/nvdimm.txt +++ b/docs/nvdimm.txt @@ -122,3 +122,34 @@ Note: M >= size of RAM devices + size of statically plugged vNVDIMM devices + size of hotplugged vNVDIMM devices + +Alignment +--------- + +QEMU uses mmap(2) to maps vNVDIMM backends and aligns the mapping +address to the page size (getpagesize(2)) by default. However, some +types of backends may require an alignment different than the page +size. In that case, QEMU v2.12.0 and later provide 'align' option to +memory-backend-file to allow users to specify the proper alignment. + +For example, device dax require the 2 MB alignment, so we can use +following QEMU command line options to use it (/dev/dax0.0) as the +backend of vNVDIMM: + + -object memory-backend-file,id=mem1,share=on,mem-path=/dev/dax0.0,size=4G,align=2M + -device nvdimm,id=nvdimm1,memdev=mem1 + +Guest Data Persistence +---------------------- + +Though QEMU supports multiple types of vNVDIMM backends on Linux, +currently the only one that can guarantee the guest write persistence +is the device DAX on the real NVDIMM device (e.g., /dev/dax0.0), to +which all guest access do not involve any host-side kernel cache. + +When using other types of backends, it's suggested to set 'unarmed' +option of '-device nvdimm' to 'on', which sets the unarmed flag of the +guest NVDIMM region mapping structure. This unarmed flag indicates +guest software that this vNVDIMM device contains a region that cannot +accept persistent writes. In result, for example, the guest Linux +NVDIMM driver, marks such vNVDIMM device as read-only. diff --git a/exec.c b/exec.c index d28fc0cd3d9fd2e0a77aaa27d0343202d46bef88..629a5083851d5c3a4d3db64008196df052ce9778 100644 --- a/exec.c +++ b/exec.c @@ -1612,7 +1612,13 @@ static void *file_ram_alloc(RAMBlock *block, void *area; block->page_size = qemu_fd_getpagesize(fd); - block->mr->align = block->page_size; + if (block->mr->align % block->page_size) { + error_setg(errp, "alignment 0x%" PRIx64 + " must be multiples of page size 0x%zx", + block->mr->align, block->page_size); + return NULL; + } + block->mr->align = MAX(block->page_size, block->mr->align); #if defined(__s390x__) if (kvm_enabled()) { block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN); diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 6ceea196e70ab8d5ef0c5e7750fbc04caeda098c..59d6e4254ccde9d2534d4bee7f31522667f62962 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -138,6 +138,8 @@ struct NvdimmNfitMemDev { } QEMU_PACKED; typedef struct NvdimmNfitMemDev NvdimmNfitMemDev; +#define ACPI_NFIT_MEM_NOT_ARMED (1 << 3) + /* * NVDIMM Control Region Structure * @@ -284,6 +286,7 @@ static void nvdimm_build_structure_memdev(GArray *structures, DeviceState *dev) { NvdimmNfitMemDev *nfit_memdev; + NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev)); uint64_t size = object_property_get_uint(OBJECT(dev), PC_DIMM_SIZE_PROP, NULL); int slot = object_property_get_int(OBJECT(dev), PC_DIMM_SLOT_PROP, @@ -312,6 +315,10 @@ nvdimm_build_structure_memdev(GArray *structures, DeviceState *dev) /* Only one interleave for PMEM. */ nfit_memdev->interleave_ways = cpu_to_le16(1); + + if (nvdimm->unarmed) { + nfit_memdev->flags |= cpu_to_le16(ACPI_NFIT_MEM_NOT_ARMED); + } } /* diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 543f9bd6cc4cf5757b9799bdcbaa261c5676638d..a4537af4000f0a8c5250a2844b7b784c9701e24d 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -34,6 +34,8 @@ #include "hw/arm/arm.h" #include "hw/arm/primecell.h" #include "hw/arm/virt.h" +#include "hw/vfio/vfio-calxeda-xgmac.h" +#include "hw/vfio/vfio-amd-xgbe.h" #include "hw/devices.h" #include "net/net.h" #include "sysemu/block-backend.h" @@ -1357,7 +1359,7 @@ static void machvirt_init(MachineState *machine) break; } - cpuobj = object_new(machine->cpu_type); + cpuobj = object_new(possible_cpus->cpus[n].type); object_property_set_int(cpuobj, possible_cpus->cpus[n].arch_id, "mp-affinity", NULL); @@ -1573,6 +1575,7 @@ static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) sizeof(CPUArchId) * max_cpus); ms->possible_cpus->len = max_cpus; for (n = 0; n < ms->possible_cpus->len; n++) { + ms->possible_cpus->cpus[n].type = ms->cpu_type; ms->possible_cpus->cpus[n].arch_id = virt_cpu_mp_affinity(vms, n); ms->possible_cpus->cpus[n].props.has_thread_id = true; @@ -1591,7 +1594,8 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) * configuration of the particular instance. */ mc->max_cpus = 255; - mc->has_dynamic_sysbus = true; + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_CALXEDA_XGMAC); + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_AMD_XGBE); mc->block_default_type = IF_VIRTIO; mc->no_cdrom = 1; mc->pci_allow_0_address = true; diff --git a/hw/core/machine.c b/hw/core/machine.c index c857f3f9349221f8125a96648b7d32edd6180b0e..cdc1163dc6fb5b61d26d4de136639e21bb907c07 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -334,46 +334,61 @@ static bool machine_get_enforce_config_section(Object *obj, Error **errp) return ms->enforce_config_section; } -static void error_on_sysbus_device(SysBusDevice *sbdev, void *opaque) +void machine_class_allow_dynamic_sysbus_dev(MachineClass *mc, const char *type) { - error_report("Option '-device %s' cannot be handled by this machine", - object_class_get_name(object_get_class(OBJECT(sbdev)))); - exit(1); + strList *item = g_new0(strList, 1); + + item->value = g_strdup(type); + item->next = mc->allowed_dynamic_sysbus_devices; + mc->allowed_dynamic_sysbus_devices = item; } -static void machine_init_notify(Notifier *notifier, void *data) +static void validate_sysbus_device(SysBusDevice *sbdev, void *opaque) { - Object *machine = qdev_get_machine(); - ObjectClass *oc = object_get_class(machine); - MachineClass *mc = MACHINE_CLASS(oc); + MachineState *machine = opaque; + MachineClass *mc = MACHINE_GET_CLASS(machine); + bool allowed = false; + strList *wl; - if (mc->has_dynamic_sysbus) { - /* Our machine can handle dynamic sysbus devices, we're all good */ - return; + for (wl = mc->allowed_dynamic_sysbus_devices; + !allowed && wl; + wl = wl->next) { + allowed |= !!object_dynamic_cast(OBJECT(sbdev), wl->value); + } + + if (!allowed) { + error_report("Option '-device %s' cannot be handled by this machine", + object_class_get_name(object_get_class(OBJECT(sbdev)))); + exit(1); } +} + +static void machine_init_notify(Notifier *notifier, void *data) +{ + MachineState *machine = MACHINE(qdev_get_machine()); /* - * Loop through all dynamically created devices and check whether there - * are sysbus devices among them. If there are, error out. + * Loop through all dynamically created sysbus devices and check if they are + * all allowed. If a device is not allowed, error out. */ - foreach_dynamic_sysbus_device(error_on_sysbus_device, NULL); + foreach_dynamic_sysbus_device(validate_sysbus_device, machine); } HotpluggableCPUList *machine_query_hotpluggable_cpus(MachineState *machine) { int i; - Object *cpu; HotpluggableCPUList *head = NULL; - const char *cpu_type; + MachineClass *mc = MACHINE_GET_CLASS(machine); + + /* force board to initialize possible_cpus if it hasn't been done yet */ + mc->possible_cpu_arch_ids(machine); - cpu = machine->possible_cpus->cpus[0].cpu; - assert(cpu); /* Boot cpu is always present */ - cpu_type = object_get_typename(cpu); for (i = 0; i < machine->possible_cpus->len; i++) { + Object *cpu; HotpluggableCPUList *list_item = g_new0(typeof(*list_item), 1); HotpluggableCPU *cpu_item = g_new0(typeof(*cpu_item), 1); - cpu_item->type = g_strdup(cpu_type); + cpu_item->type = g_strdup(machine->possible_cpus->cpus[i].type); cpu_item->vcpus_count = machine->possible_cpus->cpus[i].vcpus_count; cpu_item->props = g_memdup(&machine->possible_cpus->cpus[i].props, sizeof(*cpu_item->props)); diff --git a/hw/core/qdev.c b/hw/core/qdev.c index 11112951a52278c1cbf849314fe5b5503b5841b5..f739753e3a1d39510b7bd42acf73aae904bc7455 100644 --- a/hw/core/qdev.c +++ b/hw/core/qdev.c @@ -253,19 +253,31 @@ void qdev_set_legacy_instance_id(DeviceState *dev, int alias_id, dev->alias_required_for_version = required_for_version; } +HotplugHandler *qdev_get_machine_hotplug_handler(DeviceState *dev) +{ + MachineState *machine; + MachineClass *mc; + Object *m_obj = qdev_get_machine(); + + if (object_dynamic_cast(m_obj, TYPE_MACHINE)) { + machine = MACHINE(m_obj); + mc = MACHINE_GET_CLASS(machine); + if (mc->get_hotplug_handler) { + return mc->get_hotplug_handler(machine, dev); + } + } + + return NULL; +} + HotplugHandler *qdev_get_hotplug_handler(DeviceState *dev) { - HotplugHandler *hotplug_ctrl = NULL; + HotplugHandler *hotplug_ctrl; if (dev->parent_bus && dev->parent_bus->hotplug_handler) { hotplug_ctrl = dev->parent_bus->hotplug_handler; - } else if (object_dynamic_cast(qdev_get_machine(), TYPE_MACHINE)) { - MachineState *machine = MACHINE(qdev_get_machine()); - MachineClass *mc = MACHINE_GET_CLASS(machine); - - if (mc->get_hotplug_handler) { - hotplug_ctrl = mc->get_hotplug_handler(machine, dev); - } + } else { + hotplug_ctrl = qdev_get_machine_hotplug_handler(dev); } return hotplug_ctrl; } diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 55686bf5d8a1f085a10069c660759a133c926654..ccc50baa85e0bf3e4f2dd90555e3febea58b9e1c 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1148,7 +1148,8 @@ void pc_cpus_init(PCMachineState *pcms) pcms->apic_id_limit = x86_cpu_apic_id_from_index(max_cpus - 1) + 1; possible_cpus = mc->possible_cpu_arch_ids(ms); for (i = 0; i < smp_cpus; i++) { - pc_new_cpu(ms->cpu_type, possible_cpus->cpus[i].arch_id, &error_fatal); + pc_new_cpu(possible_cpus->cpus[i].type, possible_cpus->cpus[i].arch_id, + &error_fatal); } } @@ -2307,6 +2308,7 @@ static const CPUArchIdList *pc_possible_cpu_arch_ids(MachineState *ms) for (i = 0; i < ms->possible_cpus->len; i++) { X86CPUTopoInfo topo; + ms->possible_cpus->cpus[i].type = ms->cpu_type; ms->possible_cpus->cpus[i].vcpus_count = 1; ms->possible_cpus->cpus[i].arch_id = x86_cpu_apic_id_from_index(i); x86_topo_ids_from_apicid(ms->possible_cpus->cpus[i].arch_id, diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 5c6c608fcba07ff64a9e1f1c4732ab928eea179f..ed3a0b8ff79a9d7965ee2ce63d5190eea0d8c7ad 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -42,6 +42,8 @@ #include "exec/address-spaces.h" #include "hw/i386/pc.h" #include "hw/i386/ich9.h" +#include "hw/i386/amd_iommu.h" +#include "hw/i386/intel_iommu.h" #include "hw/smbios/smbios.h" #include "hw/ide/pci.h" #include "hw/ide/ahci.h" @@ -299,7 +301,8 @@ static void pc_q35_machine_options(MachineClass *m) m->default_machine_opts = "firmware=bios-256k.bin"; m->default_display = "std"; m->no_floppy = 1; - m->has_dynamic_sysbus = true; + machine_class_allow_dynamic_sysbus_dev(m, TYPE_AMD_IOMMU_DEVICE); + machine_class_allow_dynamic_sysbus_dev(m, TYPE_INTEL_IOMMU_DEVICE); m->max_cpus = 288; } diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c index 952fce5ec825752511314e5add209e8ee43f90e8..61e677f92ff2d407b01f60d0c95e5ea8db861465 100644 --- a/hw/mem/nvdimm.c +++ b/hw/mem/nvdimm.c @@ -25,6 +25,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "qapi/visitor.h" +#include "qapi-visit.h" #include "hw/mem/nvdimm.h" static void nvdimm_get_label_size(Object *obj, Visitor *v, const char *name, @@ -64,11 +65,36 @@ out: error_propagate(errp, local_err); } +static bool nvdimm_get_unarmed(Object *obj, Error **errp) +{ + NVDIMMDevice *nvdimm = NVDIMM(obj); + + return nvdimm->unarmed; +} + +static void nvdimm_set_unarmed(Object *obj, bool value, Error **errp) +{ + NVDIMMDevice *nvdimm = NVDIMM(obj); + Error *local_err = NULL; + + if (memory_region_size(&nvdimm->nvdimm_mr)) { + error_setg(&local_err, "cannot change property value"); + goto out; + } + + nvdimm->unarmed = value; + + out: + error_propagate(errp, local_err); +} + static void nvdimm_init(Object *obj) { - object_property_add(obj, "label-size", "int", + object_property_add(obj, NVDIMM_LABLE_SIZE_PROP, "int", nvdimm_get_label_size, nvdimm_set_label_size, NULL, NULL, NULL); + object_property_add_bool(obj, NVDIMM_UNARMED_PROP, + nvdimm_get_unarmed, nvdimm_set_unarmed, NULL); } static MemoryRegion *nvdimm_get_memory_region(PCDIMMDevice *dimm, Error **errp) diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c index 753ac0e4eab3424e56621dc4f93987480f3c7176..4313484b21bb60329f3dabba7c607347e6b353d4 100644 --- a/hw/nvram/fw_cfg.c +++ b/hw/nvram/fw_cfg.c @@ -784,7 +784,7 @@ void fw_cfg_add_file_callback(FWCfgState *s, const char *filename, * index and "i - 1" is the one being copied from, thus the * unusual start and end in the for statement. */ - for (i = count + 1; i > index; i--) { + for (i = count; i > index; i--) { s->files->f[i] = s->files->f[i - 1]; s->files->f[i].select = cpu_to_be16(FW_CFG_FILE_FIRST + i); s->entries[0][FW_CFG_FILE_FIRST + i] = @@ -833,7 +833,6 @@ void *fw_cfg_modify_file(FWCfgState *s, const char *filename, assert(s->files); index = be32_to_cpu(s->files->count); - assert(index < fw_cfg_file_slots(s)); for (i = 0; i < index; i++) { if (strcmp(filename, s->files->f[i].name) == 0) { @@ -843,6 +842,9 @@ void *fw_cfg_modify_file(FWCfgState *s, const char *filename, return ptr; } } + + assert(index < fw_cfg_file_slots(s)); + /* add new one */ fw_cfg_add_file_callback(s, filename, NULL, NULL, NULL, data, len, true); return NULL; diff --git a/hw/ppc/e500plat.c b/hw/ppc/e500plat.c index e59e80fb9e13c62ecb6e9fcf70d5b9add8fdd755..81d03e103803c75b89134ebc0e1a41334994b295 100644 --- a/hw/ppc/e500plat.c +++ b/hw/ppc/e500plat.c @@ -12,9 +12,11 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "e500.h" +#include "hw/net/fsl_etsec/etsec.h" #include "hw/boards.h" #include "sysemu/device_tree.h" #include "sysemu/kvm.h" +#include "hw/sysbus.h" #include "hw/pci/pci.h" #include "hw/ppc/openpic.h" #include "kvm_ppc.h" @@ -63,7 +65,7 @@ static void e500plat_machine_init(MachineClass *mc) mc->desc = "generic paravirt e500 platform"; mc->init = e500plat_init; mc->max_cpus = 32; - mc->has_dynamic_sysbus = true; + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_ETSEC_COMMON); mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("e500v2_v30"); } diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 499ab647d8829dad32dea7988b06cb33df7ed864..a781dd22e727585e38fff1f0189f1e196393ad5b 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -2226,11 +2226,6 @@ static void spapr_init_cpus(sPAPRMachineState *spapr) int boot_cores_nr = smp_cpus / smp_threads; int i; - if (!type) { - error_report("Unable to find sPAPR CPU Core definition"); - exit(1); - } - possible_cpus = mc->possible_cpu_arch_ids(machine); if (mc->has_hotpluggable_cpus) { if (smp_cpus % smp_threads) { @@ -3545,6 +3540,7 @@ static int64_t spapr_get_default_cpu_node_id(const MachineState *ms, int idx) static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine) { int i; + const char *core_type; int spapr_max_cores = max_cpus / smp_threads; MachineClass *mc = MACHINE_GET_CLASS(machine); @@ -3556,12 +3552,19 @@ static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine) return machine->possible_cpus; } + core_type = spapr_get_cpu_core_type(machine->cpu_type); + if (!core_type) { + error_report("Unable to find sPAPR CPU Core definition"); + exit(1); + } + machine->possible_cpus = g_malloc0(sizeof(CPUArchIdList) + sizeof(CPUArchId) * spapr_max_cores); machine->possible_cpus->len = spapr_max_cores; for (i = 0; i < machine->possible_cpus->len; i++) { int core_id = i * smp_threads; + machine->possible_cpus->cpus[i].type = core_type; machine->possible_cpus->cpus[i].vcpus_count = smp_threads; machine->possible_cpus->cpus[i].arch_id = core_id; machine->possible_cpus->cpus[i].props.has_core_id = true; @@ -3843,7 +3846,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data) mc->default_boot_order = ""; mc->default_ram_size = 512 * M_BYTE; mc->kvm_type = spapr_kvm_type; - mc->has_dynamic_sysbus = true; + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_SPAPR_PCI_HOST_BRIDGE); mc->pci_allow_0_address = true; mc->get_hotplug_handler = spapr_get_hotplug_handler; hc->pre_plug = spapr_machine_device_pre_plug; diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index 35df7e19c5af10e734fc9f7dd94c0eae40df8596..3807dcb097a728250b09d19f2be036a2551b6478 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -414,6 +414,7 @@ static const CPUArchIdList *s390_possible_cpu_arch_ids(MachineState *ms) sizeof(CPUArchId) * max_cpus); ms->possible_cpus->len = max_cpus; for (i = 0; i < ms->possible_cpus->len; i++) { + ms->possible_cpus->cpus[i].type = ms->cpu_type; ms->possible_cpus->cpus[i].vcpus_count = 1; ms->possible_cpus->cpus[i].arch_id = i; ms->possible_cpus->cpus[i].props.has_core_id = true; diff --git a/hw/xen/xen_backend.c b/hw/xen/xen_backend.c index 0f849a26d28f25fddd58c5cab3c3c06d82ccac74..7445b506ac610dc35283f28072f0dca893d02dff 100644 --- a/hw/xen/xen_backend.c +++ b/hw/xen/xen_backend.c @@ -564,7 +564,7 @@ static void xen_set_dynamic_sysbus(void) ObjectClass *oc = object_get_class(machine); MachineClass *mc = MACHINE_CLASS(oc); - mc->has_dynamic_sysbus = true; + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_XENSYSDEV); } int xen_be_register(const char *type, struct XenDevOps *ops) diff --git a/include/exec/memory.h b/include/exec/memory.h index a4cabdf44c353ed840b3b16c86f8bbdc6e265814..07c5d6d59796b92f21152ab99f949f9dc21e4c0a 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -465,6 +465,8 @@ void memory_region_init_resizeable_ram(MemoryRegion *mr, * @name: Region name, becomes part of RAMBlock name used in migration stream * must be unique within any device * @size: size of the region. + * @align: alignment of the region base address; if 0, the default alignment + * (getpagesize()) will be used. * @share: %true if memory must be mmaped with the MAP_SHARED flag * @path: the path in which to allocate the RAM. * @errp: pointer to Error*, to store an error if it happens. @@ -476,6 +478,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, struct Object *owner, const char *name, uint64_t size, + uint64_t align, bool share, const char *path, Error **errp); diff --git a/include/hw/boards.h b/include/hw/boards.h index 156b16f7a6b5e10ae2c7f4d3574ca0b1b74a88d9..efb0a9edfdf1c15dac98a6582156b3643e1a6233 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -76,10 +76,14 @@ void machine_set_cpu_numa_node(MachineState *machine, const CpuInstanceProperties *props, Error **errp); +void machine_class_allow_dynamic_sysbus_dev(MachineClass *mc, const char *type); + + /** * CPUArchId: * @arch_id - architecture-dependent CPU ID of present or possible CPU * @cpu - pointer to corresponding CPU object if it's present on NULL otherwise + * @type - QOM class name of possible @cpu object * @props - CPU object properties, initialized by board * #vcpus_count - number of threads provided by @cpu object */ @@ -88,6 +92,7 @@ typedef struct { int64_t vcpus_count; CpuInstanceProperties props; Object *cpu; + const char *type; } CPUArchId; /** @@ -179,7 +184,6 @@ struct MachineClass { no_floppy:1, no_cdrom:1, no_sdcard:1, - has_dynamic_sysbus:1, pci_allow_0_address:1, legacy_fw_cfg_order:1; int is_default; @@ -197,6 +201,7 @@ struct MachineClass { bool ignore_memory_transaction_failures; int numa_mem_align_shift; const char **valid_cpu_types; + strList *allowed_dynamic_sysbus_devices; bool auto_enable_numa_with_memhp; void (*numa_auto_assign_ram)(MachineClass *mc, NodeInfo *nodes, int nb_nodes, ram_addr_t size); diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h index 03e1ff9558a16737f1af1371a5a25fac3f67e8d5..7fd87c4e1cef470e4727b724f359dd5ef1ef3c04 100644 --- a/include/hw/mem/nvdimm.h +++ b/include/hw/mem/nvdimm.h @@ -47,6 +47,10 @@ #define NVDIMM_CLASS(oc) OBJECT_CLASS_CHECK(NVDIMMClass, (oc), TYPE_NVDIMM) #define NVDIMM_GET_CLASS(obj) OBJECT_GET_CLASS(NVDIMMClass, (obj), \ TYPE_NVDIMM) + +#define NVDIMM_LABLE_SIZE_PROP "label-size" +#define NVDIMM_UNARMED_PROP "unarmed" + struct NVDIMMDevice { /* private */ PCDIMMDevice parent_obj; @@ -71,6 +75,14 @@ struct NVDIMMDevice { * guest via ACPI NFIT and _FIT method if NVDIMM hotplug is supported. */ MemoryRegion nvdimm_mr; + + /* + * The 'on' value results in the unarmed flag set in ACPI NFIT, + * which can be used to notify guest implicitly that the host + * backend (e.g., files on HDD, /dev/pmemX, etc.) cannot guarantee + * the guest write persistence. + */ + bool unarmed; }; typedef struct NVDIMMDevice NVDIMMDevice; diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h index 0a71bf83f04700af621e00949695a5ea53345e14..51473eee7ba3b66e98882e0c0a3bc9fc0eb10ded 100644 --- a/include/hw/qdev-core.h +++ b/include/hw/qdev-core.h @@ -286,6 +286,7 @@ DeviceState *qdev_try_create(BusState *bus, const char *name); void qdev_init_nofail(DeviceState *dev); void qdev_set_legacy_instance_id(DeviceState *dev, int alias_id, int required_for_version); +HotplugHandler *qdev_get_machine_hotplug_handler(DeviceState *dev); HotplugHandler *qdev_get_hotplug_handler(DeviceState *dev); void qdev_unplug(DeviceState *dev, Error **errp); void qdev_simple_device_unplug_cb(HotplugHandler *hotplug_dev, diff --git a/include/qemu/memfd.h b/include/qemu/memfd.h index 745a8c501ee1ec1a66129ebef7e2d97a885000a6..41c24d807c450d5e8c045137f2955a5319f78573 100644 --- a/include/qemu/memfd.h +++ b/include/qemu/memfd.h @@ -16,6 +16,7 @@ #define F_SEAL_WRITE 0x0008 /* prevent writes */ #endif +int qemu_memfd_create(const char *name, size_t size, unsigned int seals); void *qemu_memfd_alloc(const char *name, size_t size, unsigned int seals, int *fd); void qemu_memfd_free(void *ptr, size_t size, int fd); diff --git a/memory.c b/memory.c index 4b41fb837b40c7b0dc6b3bd979f0431ad1232863..449a1429b946c57034a286d0f50cc43f321069ff 100644 --- a/memory.c +++ b/memory.c @@ -1570,6 +1570,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, struct Object *owner, const char *name, uint64_t size, + uint64_t align, bool share, const char *path, Error **errp) @@ -1578,6 +1579,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, mr->ram = true; mr->terminates = true; mr->destructor = memory_region_destructor_ram; + mr->align = align; mr->ram_block = qemu_ram_alloc_from_file(size, mr, share, path, errp); mr->dirty_log_mask = tcg_enabled() ? (1 << DIRTY_MEMORY_CODE) : 0; } diff --git a/numa.c b/numa.c index 7b9c33ad1208047fc73e7f52faf2eea7beeb5f5a..83675a03f3cb65cc945d3bed92951464ae7256d0 100644 --- a/numa.c +++ b/numa.c @@ -456,7 +456,7 @@ static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner, if (mem_path) { #ifdef __linux__ Error *err = NULL; - memory_region_init_ram_from_file(mr, owner, name, ram_size, false, + memory_region_init_ram_from_file(mr, owner, name, ram_size, 0, false, mem_path, &err); if (err) { error_report_err(err); diff --git a/qdev-monitor.c b/qdev-monitor.c index b4abb4b5ea0289b479f0b03d29082d376d54a273..c436616446c96dfc2a23601f2b55c7461d66d495 100644 --- a/qdev-monitor.c +++ b/qdev-monitor.c @@ -613,28 +613,33 @@ DeviceState *qdev_device_add(QemuOpts *opts, Error **errp) if (bus) { qdev_set_parent_bus(dev, bus); + } else if (qdev_hotplug && !qdev_get_machine_hotplug_handler(dev)) { + /* No bus, no machine hotplug handler --> device is not hotpluggable */ + error_setg(&err, "Device '%s' can not be hotplugged on this machine", + driver); + goto err_del_dev; } qdev_set_id(dev, qemu_opts_id(opts)); /* set properties */ if (qemu_opt_foreach(opts, set_property, dev, &err)) { - error_propagate(errp, err); - object_unparent(OBJECT(dev)); - object_unref(OBJECT(dev)); - return NULL; + goto err_del_dev; } dev->opts = opts; object_property_set_bool(OBJECT(dev), true, "realized", &err); if (err != NULL) { - error_propagate(errp, err); dev->opts = NULL; - object_unparent(OBJECT(dev)); - object_unref(OBJECT(dev)); - return NULL; + goto err_del_dev; } return dev; + +err_del_dev: + error_propagate(errp, err); + object_unparent(OBJECT(dev)); + object_unref(OBJECT(dev)); + return NULL; } diff --git a/qemu-options.hx b/qemu-options.hx index 678181c5991dca1d4b66b3fb0efeeb439c4a2ea4..5ff741a4af4f8ee00bff3be2ca92f77951f5469e 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -169,7 +169,9 @@ ETEXI DEF("numa", HAS_ARG, QEMU_OPTION_numa, "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n" "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n" - "-numa dist,src=source,dst=destination,val=distance\n", QEMU_ARCH_ALL) + "-numa dist,src=source,dst=destination,val=distance\n" + "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n", + QEMU_ARCH_ALL) STEXI @item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}] @itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}] @@ -3972,18 +3974,24 @@ property must be set. These objects are placed in the @table @option -@item -object memory-backend-file,id=@var{id},size=@var{size},mem-path=@var{dir},share=@var{on|off},discard-data=@var{on|off} +@item -object memory-backend-file,id=@var{id},size=@var{size},mem-path=@var{dir},share=@var{on|off},discard-data=@var{on|off},merge=@var{on|off},dump=@var{on|off},prealloc=@var{on|off},host-nodes=@var{host-nodes},policy=@var{default|preferred|bind|interleave},align=@var{align} Creates a memory file backend object, which can be used to back -the guest RAM with huge pages. The @option{id} parameter is a -unique ID that will be used to reference this memory region -when configuring the @option{-numa} argument. The @option{size} -option provides the size of the memory region, and accepts -common suffixes, eg @option{500M}. The @option{mem-path} provides -the path to either a shared memory or huge page filesystem mount. +the guest RAM with huge pages. + +The @option{id} parameter is a unique ID that will be used to reference this +memory region when configuring the @option{-numa} argument. + +The @option{size} option provides the size of the memory region, and accepts +common suffixes, eg @option{500M}. + +The @option{mem-path} provides the path to either a shared memory or huge page +filesystem mount. + The @option{share} boolean option determines whether the memory region is marked as private to QEMU, or shared. The latter allows a co-operating external process to access the QEMU memory region. + Setting the @option{discard-data} boolean option to @var{on} indicates that file contents can be destroyed when QEMU exits, to avoid unnecessarily flushing data to the backing file. Note @@ -3991,6 +3999,48 @@ that @option{discard-data} is only an optimization, and QEMU might not discard file contents if it aborts unexpectedly or is terminated using SIGKILL. +The @option{merge} boolean option enables memory merge, also known as +MADV_MERGEABLE, so that Kernel Samepage Merging will consider the pages for +memory deduplication. + +Setting the @option{dump} boolean option to @var{off} excludes the memory from +core dumps. This feature is also known as MADV_DONTDUMP. + +The @option{prealloc} boolean option enables memory preallocation. + +The @option{host-nodes} option binds the memory range to a list of NUMA host +nodes. + +The @option{policy} option sets the NUMA policy to one of the following values: + +@table @option +@item @var{default} +default host policy + +@item @var{preferred} +prefer the given host node list for allocation + +@item @var{bind} +restrict memory allocation to the given host node list + +@item @var{interleave} +interleave memory allocations across the given host node list +@end table + +The @option{align} option specifies the base address alignment when +QEMU mmap(2) @option{mem-path}, and accepts common suffixes, eg +@option{2M}. Some backend store specified by @option{mem-path} +requires an alignment different than the default one used by QEMU, eg +the device DAX /dev/dax0.0 requires 2M alignment rather than 4K. In +such cases, users can specify the required alignment via this option. + +@item -object memory-backend-ram,id=@var{id},merge=@var{on|off},dump=@var{on|off},prealloc=@var{on|off},size=@var{size},host-nodes=@var{host-nodes},policy=@var{default|preferred|bind|interleave} + +Creates a memory backend object, which can be used to back the guest RAM. +Memory backend objects offer more control than the @option{-m} option that is +traditionally used to define guest RAM. Please refer to +@option{memory-backend-file} for a description of the options. + @item -object rng-random,id=@var{id},filename=@var{/dev/random} Creates a random number generator backend which obtains entropy from diff --git a/scripts/device-crash-test b/scripts/device-crash-test index 827d8ec2af5deed5a335967601e8e8a2058ba0f6..7417177ebb448f96620827e0d385f2d573a5136d 100755 --- a/scripts/device-crash-test +++ b/scripts/device-crash-test @@ -207,11 +207,9 @@ ERROR_WHITELIST = [ # Known crashes will generate error messages, but won't be fatal. # Those entries must be removed once we fix the crashes. {'exitcode':-6, 'log':r"Device 'serial0' is in use", 'loglevel':logging.ERROR}, - {'exitcode':-6, 'log':r"spapr_rtas_register: Assertion .*rtas_table\[token\]\.name.* failed", 'loglevel':logging.ERROR}, {'exitcode':-6, 'log':r"qemu_net_client_setup: Assertion `!peer->peer' failed", 'loglevel':logging.ERROR}, {'exitcode':-6, 'log':r'RAMBlock "[\w.-]+" already registered', 'loglevel':logging.ERROR}, {'exitcode':-6, 'log':r"find_ram_offset: Assertion `size != 0' failed.", 'loglevel':logging.ERROR}, - {'exitcode':-6, 'log':r"puv3_load_kernel: Assertion `kernel_filename != NULL' failed", 'loglevel':logging.ERROR}, {'exitcode':-6, 'log':r"add_cpreg_to_hashtable: code should not be reached", 'loglevel':logging.ERROR}, {'exitcode':-6, 'log':r"qemu_alloc_display: Assertion `surface->image != NULL' failed", 'loglevel':logging.ERROR}, {'exitcode':-6, 'log':r"Unexpected error in error_set_from_qdev_prop_error", 'loglevel':logging.ERROR}, @@ -219,16 +217,10 @@ ERROR_WHITELIST = [ {'exitcode':-6, 'log':r"Object .* is not an instance of type generic-pc-machine", 'loglevel':logging.ERROR}, {'exitcode':-6, 'log':r"Object .* is not an instance of type e500-ccsr", 'loglevel':logging.ERROR}, {'exitcode':-6, 'log':r"vmstate_register_with_alias_id: Assertion `!se->compat \|\| se->instance_id == 0' failed", 'loglevel':logging.ERROR}, - {'exitcode':-11, 'device':'stm32f205-soc', 'loglevel':logging.ERROR, 'expected':True}, - {'exitcode':-11, 'device':'xlnx,zynqmp', 'loglevel':logging.ERROR, 'expected':True}, - {'exitcode':-11, 'device':'mips-cps', 'loglevel':logging.ERROR, 'expected':True}, {'exitcode':-11, 'device':'gus', 'loglevel':logging.ERROR, 'expected':True}, - {'exitcode':-11, 'device':'a9mpcore_priv', 'loglevel':logging.ERROR, 'expected':True}, - {'exitcode':-11, 'device':'a15mpcore_priv', 'loglevel':logging.ERROR, 'expected':True}, {'exitcode':-11, 'device':'isa-serial', 'loglevel':logging.ERROR, 'expected':True}, {'exitcode':-11, 'device':'sb16', 'loglevel':logging.ERROR, 'expected':True}, {'exitcode':-11, 'device':'cs4231a', 'loglevel':logging.ERROR, 'expected':True}, - {'exitcode':-11, 'device':'arm-gicv3', 'loglevel':logging.ERROR, 'expected':True}, {'exitcode':-11, 'machine':'isapc', 'device':'.*-iommu', 'loglevel':logging.ERROR, 'expected':True}, # everything else (including SIGABRT and SIGSEGV) will be a fatal error: diff --git a/util/memfd.c b/util/memfd.c index 412e94a405fc97110c4f4ce2508a04d1a5ad5ab0..dce61f9d211b8eef674ffa917df4a0685842f103 100644 --- a/util/memfd.c +++ b/util/memfd.c @@ -27,8 +27,6 @@ #include "qemu/osdep.h" -#include - #include "qemu/memfd.h" #if defined CONFIG_LINUX && !defined CONFIG_MEMFD @@ -53,6 +51,38 @@ static int memfd_create(const char *name, unsigned int flags) #define MFD_ALLOW_SEALING 0x0002U #endif +int qemu_memfd_create(const char *name, size_t size, unsigned int seals) +{ + int mfd = -1; + +#ifdef CONFIG_LINUX + unsigned int flags = MFD_CLOEXEC; + + if (seals) { + flags |= MFD_ALLOW_SEALING; + } + + mfd = memfd_create(name, flags); + if (mfd < 0) { + return -1; + } + + if (ftruncate(mfd, size) == -1) { + perror("ftruncate"); + close(mfd); + return -1; + } + + if (seals && fcntl(mfd, F_ADD_SEALS, seals) == -1) { + perror("fcntl"); + close(mfd); + return -1; + } +#endif + + return mfd; +} + /* * This is a best-effort helper for shared memory allocation, with * optional sealing. The helper will do his best to allocate using @@ -63,35 +93,14 @@ void *qemu_memfd_alloc(const char *name, size_t size, unsigned int seals, int *fd) { void *ptr; - int mfd = -1; - - *fd = -1; - -#ifdef CONFIG_LINUX - if (seals) { - mfd = memfd_create(name, MFD_ALLOW_SEALING | MFD_CLOEXEC); - } + int mfd = qemu_memfd_create(name, size, seals); + /* some systems have memfd without sealing */ if (mfd == -1) { - /* some systems have memfd without sealing */ - mfd = memfd_create(name, MFD_CLOEXEC); - seals = 0; + mfd = qemu_memfd_create(name, size, 0); } -#endif - - if (mfd != -1) { - if (ftruncate(mfd, size) == -1) { - perror("ftruncate"); - close(mfd); - return NULL; - } - if (seals && fcntl(mfd, F_ADD_SEALS, seals) == -1) { - perror("fcntl"); - close(mfd); - return NULL; - } - } else { + if (mfd == -1) { const char *tmpdir = g_get_tmp_dir(); gchar *fname; diff --git a/vl.c b/vl.c index 2586f259521c9c4df4b14e7c632de7e4f2612ae7..e725ecbc0851db576d91481335daa968ea8e1259 100644 --- a/vl.c +++ b/vl.c @@ -4611,8 +4611,6 @@ int main(int argc, char **argv, char **envp) current_machine->boot_order = boot_order; current_machine->cpu_model = cpu_model; - parse_numa_opts(current_machine); - /* parse features once if machine provides default cpu_type */ if (machine_class->default_cpu_type) { current_machine->cpu_type = machine_class->default_cpu_type; @@ -4621,6 +4619,7 @@ int main(int argc, char **argv, char **envp) cpu_parse_cpu_model(machine_class->default_cpu_type, cpu_model); } } + parse_numa_opts(current_machine); machine_run_board_init(current_machine);