numa.c 10.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
/*
 * NUMA parameter parsing routines
 *
 * Copyright (c) 2014 Fujitsu Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "sysemu/sysemu.h"
#include "exec/cpu-common.h"
#include "qemu/bitmap.h"
#include "qom/cpu.h"
29 30
#include "qemu/error-report.h"
#include "include/exec/cpu-common.h" /* for RAM_ADDR_FMT */
31 32 33 34
#include "qapi-visit.h"
#include "qapi/opts-visitor.h"
#include "qapi/dealloc-visitor.h"
#include "qapi/qmp/qerror.h"
35
#include "hw/boards.h"
36
#include "sysemu/hostmem.h"
H
Hu Tao 已提交
37
#include "qmp-commands.h"
38 39 40 41 42 43 44 45

QemuOptsList qemu_numa_opts = {
    .name = "numa",
    .implied_opt_name = "type",
    .head = QTAILQ_HEAD_INITIALIZER(qemu_numa_opts.head),
    .desc = { { 0 } } /* validated with OptsVisitor */
};

46 47
static int have_memdevs = -1;

48
static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
49
{
50 51
    uint16_t nodenr;
    uint16List *cpus = NULL;
52

53 54
    if (node->has_nodeid) {
        nodenr = node->nodeid;
55
    } else {
56
        nodenr = nb_numa_nodes;
57 58
    }

59 60 61 62
    if (nodenr >= MAX_NODES) {
        error_setg(errp, "Max number of NUMA nodes reached: %"
                   PRIu16 "\n", nodenr);
        return;
63 64
    }

65 66 67 68 69 70 71
    for (cpus = node->cpus; cpus; cpus = cpus->next) {
        if (cpus->value > MAX_CPUMASK_BITS) {
            error_setg(errp, "CPU number %" PRIu16 " is bigger than %d",
                       cpus->value, MAX_CPUMASK_BITS);
            return;
        }
        bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1);
72 73
    }

74 75 76 77 78 79 80 81 82 83 84 85 86 87
    if (node->has_mem && node->has_memdev) {
        error_setg(errp, "qemu: cannot specify both mem= and memdev=\n");
        return;
    }

    if (have_memdevs == -1) {
        have_memdevs = node->has_memdev;
    }
    if (node->has_memdev != have_memdevs) {
        error_setg(errp, "qemu: memdev option must be specified for either "
                   "all or no nodes\n");
        return;
    }

88 89 90 91 92 93 94 95 96
    if (node->has_mem) {
        uint64_t mem_size = node->mem;
        const char *mem_str = qemu_opt_get(opts, "mem");
        /* Fix up legacy suffix-less format */
        if (g_ascii_isdigit(mem_str[strlen(mem_str) - 1])) {
            mem_size <<= 20;
        }
        numa_info[nodenr].node_mem = mem_size;
    }
97 98 99 100 101 102 103 104 105 106 107 108
    if (node->has_memdev) {
        Object *o;
        o = object_resolve_path_type(node->memdev, TYPE_MEMORY_BACKEND, NULL);
        if (!o) {
            error_setg(errp, "memdev=%s is ambiguous", node->memdev);
            return;
        }

        object_ref(o);
        numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL);
        numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
    }
109 110
}

111
int numa_init_func(QemuOpts *opts, void *opaque)
112
{
113 114
    NumaOptions *object = NULL;
    Error *err = NULL;
115

116 117 118 119
    {
        OptsVisitor *ov = opts_visitor_new(opts);
        visit_type_NumaOptions(opts_get_visitor(ov), &object, NULL, &err);
        opts_visitor_cleanup(ov);
120 121
    }

122 123 124
    if (err) {
        goto error;
    }
125

126 127 128 129 130
    switch (object->kind) {
    case NUMA_OPTIONS_KIND_NODE:
        numa_node_parse(object->node, opts, &err);
        if (err) {
            goto error;
131
        }
132 133 134 135 136
        nb_numa_nodes++;
        break;
    default:
        abort();
    }
137

138
    return 0;
139

140 141 142 143 144 145 146 147 148
error:
    qerror_report_err(err);
    error_free(err);

    if (object) {
        QapiDeallocVisitor *dv = qapi_dealloc_visitor_new();
        visit_type_NumaOptions(qapi_dealloc_get_visitor(dv),
                               &object, NULL, NULL);
        qapi_dealloc_visitor_cleanup(dv);
149
    }
150 151

    return -1;
152 153 154 155 156
}

void set_numa_nodes(void)
{
    if (nb_numa_nodes > 0) {
157
        uint64_t numa_total;
158 159 160 161 162 163 164 165 166 167
        int i;

        if (nb_numa_nodes > MAX_NODES) {
            nb_numa_nodes = MAX_NODES;
        }

        /* If no memory size if given for any node, assume the default case
         * and distribute the available memory equally across all nodes
         */
        for (i = 0; i < nb_numa_nodes; i++) {
168
            if (numa_info[i].node_mem != 0) {
169 170 171 172 173 174
                break;
            }
        }
        if (i == nb_numa_nodes) {
            uint64_t usedmem = 0;

M
Michael S. Tsirkin 已提交
175
            /* On Linux, each node's border has to be 8MB aligned,
176 177 178
             * the final node gets the rest.
             */
            for (i = 0; i < nb_numa_nodes - 1; i++) {
179 180 181
                numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
                                        ~((1 << 23UL) - 1);
                usedmem += numa_info[i].node_mem;
182
            }
183
            numa_info[i].node_mem = ram_size - usedmem;
184 185
        }

186 187
        numa_total = 0;
        for (i = 0; i < nb_numa_nodes; i++) {
188
            numa_total += numa_info[i].node_mem;
189 190 191 192 193 194 195 196
        }
        if (numa_total != ram_size) {
            error_report("total memory for NUMA nodes (%" PRIu64 ")"
                         " should equal RAM size (" RAM_ADDR_FMT ")",
                         numa_total, ram_size);
            exit(1);
        }

197
        for (i = 0; i < nb_numa_nodes; i++) {
198
            if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
199 200 201 202 203 204 205 206 207
                break;
            }
        }
        /* assigning the VCPUs round-robin is easier to implement, guest OSes
         * must cope with this anyway, because there are BIOSes out there in
         * real machines which also use this scheme.
         */
        if (i == nb_numa_nodes) {
            for (i = 0; i < max_cpus; i++) {
208
                set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
209 210 211 212 213 214 215 216 217 218 219 220
            }
        }
    }
}

void set_numa_modes(void)
{
    CPUState *cpu;
    int i;

    CPU_FOREACH(cpu) {
        for (i = 0; i < nb_numa_nodes; i++) {
221
            if (test_bit(cpu->cpu_index, numa_info[i].node_cpu)) {
222 223 224 225 226
                cpu->numa_node = i;
            }
        }
    }
}
227

228 229 230 231
static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner,
                                           const char *name,
                                           uint64_t ram_size)
{
232 233
    if (mem_path) {
#ifdef __linux__
234
        Error *err = NULL;
235
        memory_region_init_ram_from_file(mr, owner, name, ram_size, false,
236 237 238 239 240
                                         mem_path, &err);

        /* Legacy behavior: if allocation failed, fall back to
         * regular RAM allocation.
         */
241
        if (err) {
242 243 244 245
            qerror_report_err(err);
            error_free(err);
            memory_region_init_ram(mr, owner, name, ram_size);
        }
246 247 248 249 250 251 252
#else
        fprintf(stderr, "-mem-path not supported on this host\n");
        exit(1);
#endif
    } else {
        memory_region_init_ram(mr, owner, name, ram_size);
    }
253 254 255
    vmstate_register_ram_global(mr);
}

256 257 258 259
void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner,
                                          const char *name,
                                          uint64_t ram_size)
{
260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285
    uint64_t addr = 0;
    int i;

    if (nb_numa_nodes == 0 || !have_memdevs) {
        allocate_system_memory_nonnuma(mr, owner, name, ram_size);
        return;
    }

    memory_region_init(mr, owner, name, ram_size);
    for (i = 0; i < MAX_NODES; i++) {
        Error *local_err = NULL;
        uint64_t size = numa_info[i].node_mem;
        HostMemoryBackend *backend = numa_info[i].node_memdev;
        if (!backend) {
            continue;
        }
        MemoryRegion *seg = host_memory_backend_get_memory(backend, &local_err);
        if (local_err) {
            qerror_report_err(local_err);
            exit(1);
        }

        memory_region_add_subregion(mr, addr, seg);
        vmstate_register_ram_global(seg);
        addr += size;
    }
286
}
H
Hu Tao 已提交
287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369

static int query_memdev(Object *obj, void *opaque)
{
    MemdevList **list = opaque;
    Error *err = NULL;

    if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
        MemdevList *m = g_malloc0(sizeof(*m));

        m->value = g_malloc0(sizeof(*m->value));

        m->value->size = object_property_get_int(obj, "size",
                                                 &err);
        if (err) {
            goto error;
        }

        m->value->merge = object_property_get_bool(obj, "merge",
                                                   &err);
        if (err) {
            goto error;
        }

        m->value->dump = object_property_get_bool(obj, "dump",
                                                  &err);
        if (err) {
            goto error;
        }

        m->value->prealloc = object_property_get_bool(obj,
                                                      "prealloc", &err);
        if (err) {
            goto error;
        }

        m->value->policy = object_property_get_enum(obj,
                                                    "policy",
                                                    HostMemPolicy_lookup,
                                                    &err);
        if (err) {
            goto error;
        }

        object_property_get_uint16List(obj, "host-nodes",
                                       &m->value->host_nodes, &err);
        if (err) {
            goto error;
        }

        m->next = *list;
        *list = m;
    }

    return 0;
error:
    return -1;
}

MemdevList *qmp_query_memdev(Error **errp)
{
    Object *obj;
    MemdevList *list = NULL, *m;

    obj = object_resolve_path("/objects", NULL);
    if (obj == NULL) {
        return NULL;
    }

    if (object_child_foreach(obj, query_memdev, &list) != 0) {
        goto error;
    }

    return list;

error:
    while (list) {
        m = list;
        list = list->next;
        g_free(m->value);
        g_free(m);
    }
    return NULL;
}