/* * virnuma.c: helper APIs for managing numa * * Copyright (C) 2011-2013 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library. If not, see * . * */ #include #define NUMA_MAX_N_CPUS 4096 #if WITH_NUMACTL # define NUMA_VERSION1_COMPATIBILITY 1 # include # if LIBNUMA_API_VERSION > 1 # undef NUMA_MAX_N_CPUS # define NUMA_MAX_N_CPUS (numa_all_cpus_ptr->size) # endif #endif /* WITH_NUMACTL */ #include #include #include "virnuma.h" #include "vircommand.h" #include "virerror.h" #include "virlog.h" #include "viralloc.h" #include "virbitmap.h" #include "virstring.h" #include "virfile.h" #include "nodeinfo.h" #define VIR_FROM_THIS VIR_FROM_NONE VIR_LOG_INIT("util.numa"); VIR_ENUM_IMPL(virDomainNumatuneMemMode, VIR_DOMAIN_NUMATUNE_MEM_LAST, "strict", "preferred", "interleave"); VIR_ENUM_IMPL(virNumaTuneMemPlacementMode, VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_LAST, "default", "static", "auto"); #if HAVE_NUMAD char * virNumaGetAutoPlacementAdvice(unsigned short vcpus, unsigned long long balloon) { virCommandPtr cmd = NULL; char *output = NULL; cmd = virCommandNewArgList(NUMAD, "-w", NULL); virCommandAddArgFormat(cmd, "%d:%llu", vcpus, VIR_DIV_UP(balloon, 1024)); virCommandSetOutputBuffer(cmd, &output); if (virCommandRun(cmd, NULL) < 0) virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Failed to query numad for the " "advisory nodeset")); virCommandFree(cmd); return output; } #else char * virNumaGetAutoPlacementAdvice(unsigned short vcpus ATTRIBUTE_UNUSED, unsigned long long balloon ATTRIBUTE_UNUSED) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("numad is not available on this host")); return NULL; } #endif #if WITH_NUMACTL int virNumaSetupMemoryPolicy(virNumaTuneDef numatune, virBitmapPtr nodemask) { nodemask_t mask; int mode = -1; int node = -1; int ret = -1; int bit = 0; size_t i; int maxnode = 0; virBitmapPtr tmp_nodemask = NULL; if (numatune.memory.placement_mode == VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_STATIC) { if (!numatune.memory.nodemask) return 0; VIR_DEBUG("Set NUMA memory policy with specified nodeset"); tmp_nodemask = numatune.memory.nodemask; } else if (numatune.memory.placement_mode == VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO) { VIR_DEBUG("Set NUMA memory policy with advisory nodeset from numad"); tmp_nodemask = nodemask; } else { return 0; } if (numa_available() < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Host kernel is not aware of NUMA.")); return -1; } maxnode = numa_max_node(); maxnode = maxnode < NUMA_NUM_NODES ? maxnode : NUMA_NUM_NODES; /* Convert nodemask to NUMA bitmask. */ nodemask_zero(&mask); bit = -1; while ((bit = virBitmapNextSetBit(tmp_nodemask, bit)) >= 0) { if (bit > maxnode) { virReportError(VIR_ERR_INTERNAL_ERROR, _("NUMA node %d is out of range"), bit); return -1; } nodemask_set(&mask, bit); } mode = numatune.memory.mode; if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) { numa_set_bind_policy(1); numa_set_membind(&mask); numa_set_bind_policy(0); } else if (mode == VIR_DOMAIN_NUMATUNE_MEM_PREFERRED) { int nnodes = 0; for (i = 0; i < NUMA_NUM_NODES; i++) { if (nodemask_isset(&mask, i)) { node = i; nnodes++; } } if (nnodes != 1) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("NUMA memory tuning in 'preferred' mode " "only supports single node")); goto cleanup; } numa_set_bind_policy(0); numa_set_preferred(node); } else if (mode == VIR_DOMAIN_NUMATUNE_MEM_INTERLEAVE) { numa_set_interleave_mask(&mask); } else { /* XXX: Shouldn't go here, as we already do checking when * parsing domain XML. */ virReportError(VIR_ERR_XML_ERROR, "%s", _("Invalid mode for memory NUMA tuning.")); goto cleanup; } ret = 0; cleanup: return ret; } bool virNumaIsAvailable(void) { return numa_available() != -1; } /** * virNumaGetMaxNode: * Get the highest node number available on the current system. * (See the node numbers in /sys/devices/system/node/ ). * * Returns the highest NUMA node id on success, -1 on error. */ int virNumaGetMaxNode(void) { int ret; if (!virNumaIsAvailable()) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("NUMA isn't available on this host")); return -1; } if ((ret = numa_max_node()) < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Failed to request maximum NUMA node id")); return -1; } return ret; } /** * virNumaGetNodeMemory: * @node: identifier of the requested NUMA node * @memsize: returns the total size of memory in the NUMA node * @memfree: returns the total free memory in a NUMA node * * Returns the size of the memory in one NUMA node in bytes via the @size * argument and free memory of a node in the @free argument. The caller has to * guarantee that @node is in range (see virNumaGetMaxNode). * * Returns 0 on success, -1 on error. Does not report errors. */ int virNumaGetNodeMemory(int node, unsigned long long *memsize, unsigned long long *memfree) { long long node_size; long long node_free; if (memsize) *memsize = 0; if (memfree) *memfree = 0; if ((node_size = numa_node_size64(node, &node_free)) < 0) return -1; if (memsize) *memsize = node_size; if (memfree) *memfree = node_free; return 0; } /** * virNumaGetNodeCPUs: * @node: identifier of the requested NUMA node * @cpus: returns a bitmap of CPUs in @node * * Returns count of CPUs in the selected node and sets the map of the cpus to * @cpus. On error if the @node doesn't exist in the system this function * returns -2 and sets @cpus to NULL. On other errors -1 is returned, @cpus * is set to NULL and an error is reported. */ # define n_bits(var) (8 * sizeof(var)) # define MASK_CPU_ISSET(mask, cpu) \ (((mask)[((cpu) / n_bits(*(mask)))] >> ((cpu) % n_bits(*(mask)))) & 1) int virNumaGetNodeCPUs(int node, virBitmapPtr *cpus) { unsigned long *mask = NULL; unsigned long *allonesmask = NULL; virBitmapPtr cpumap = NULL; int ncpus = 0; int max_n_cpus = virNumaGetMaxCPUs(); int mask_n_bytes = max_n_cpus / 8; size_t i; int ret = -1; *cpus = NULL; if (VIR_ALLOC_N(mask, mask_n_bytes / sizeof(*mask)) < 0) goto cleanup; if (VIR_ALLOC_N(allonesmask, mask_n_bytes / sizeof(*mask)) < 0) goto cleanup; memset(allonesmask, 0xff, mask_n_bytes); /* The first time this returns -1, ENOENT if node doesn't exist... */ if (numa_node_to_cpus(node, mask, mask_n_bytes) < 0) { VIR_WARN("NUMA topology for cell %d is not available, ignoring", node); ret = -2; goto cleanup; } /* second, third... times it returns an all-1's mask */ if (memcmp(mask, allonesmask, mask_n_bytes) == 0) { VIR_DEBUG("NUMA topology for cell %d is invalid, ignoring", node); ret = -2; goto cleanup; } if (!(cpumap = virBitmapNew(max_n_cpus))) goto cleanup; for (i = 0; i < max_n_cpus; i++) { if (MASK_CPU_ISSET(mask, i)) { ignore_value(virBitmapSetBit(cpumap, i)); ncpus++; } } *cpus = cpumap; cpumap = NULL; ret = ncpus; cleanup: VIR_FREE(mask); VIR_FREE(allonesmask); VIR_FREE(cpumap); return ret; } # undef MASK_CPU_ISSET # undef n_bits #else int virNumaSetupMemoryPolicy(virNumaTuneDef numatune, virBitmapPtr nodemask ATTRIBUTE_UNUSED) { if (numatune.memory.nodemask) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("libvirt is compiled without NUMA tuning support")); return -1; } return 0; } bool virNumaIsAvailable(void) { return false; } int virNumaGetMaxNode(void) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("NUMA isn't available on this host")); return -1; } int virNumaGetNodeMemory(int node ATTRIBUTE_UNUSED, unsigned long long *memsize, unsigned long long *memfree) { if (memsize) *memsize = 0; if (memfree) *memfree = 0; VIR_DEBUG("NUMA isn't available on this host"); return -1; } int virNumaGetNodeCPUs(int node ATTRIBUTE_UNUSED, virBitmapPtr *cpus) { *cpus = NULL; virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("NUMA isn't available on this host")); return -1; } #endif /** * virNumaGetMaxCPUs: * * Get the maximum count of CPUs supportable in the host. * * Returns the count of CPUs supported. */ unsigned int virNumaGetMaxCPUs(void) { return NUMA_MAX_N_CPUS; } #ifdef HAVE_NUMA_BITMASK_ISBITSET /** * virNumaNodeIsAvailable: * @node: node to check * * On some hosts the set of NUMA nodes isn't continuous. * Use this function to test if the @node is available. * * Returns: true if @node is available, * false if @node doesn't exist */ bool virNumaNodeIsAvailable(int node) { return numa_bitmask_isbitset(numa_nodes_ptr, node); } /** * virNumaGetDistances: * @node: identifier of the requested NUMA node * @distances: array of distances to sibling nodes * @ndistances: size of @distances * * Get array of distances to sibling nodes from @node. If a * distances[x] equals to zero, the node x is not enabled or * doesn't exist. As a special case, if @node itself refers to * disabled or nonexistent NUMA node, then @distances and * @ndistances are set to NULL and zero respectively. * * The distances are a bit of magic. For a local node the value * is 10, for remote it's typically 20 meaning that time penalty * for accessing a remote node is two time bigger than when * accessing a local node. * * Returns 0 on success, -1 otherwise. */ int virNumaGetDistances(int node, int **distances, int *ndistances) { int ret = -1; int max_node; size_t i; if (!virNumaNodeIsAvailable(node)) { VIR_DEBUG("Node %d does not exist", node); *distances = NULL; *ndistances = 0; return 0; } if ((max_node = virNumaGetMaxNode()) < 0) goto cleanup; if (VIR_ALLOC_N(*distances, max_node) < 0) goto cleanup; *ndistances = max_node + 1; for (i = 0; i<= max_node; i++) { if (!virNumaNodeIsAvailable(node)) continue; (*distances)[i] = numa_distance(node, i); } ret = 0; cleanup: return ret; } #else bool virNumaNodeIsAvailable(int node) { int max_node = virNumaGetMaxNode(); if (max_node < 0) return false; /* Do we have anything better? */ return (node >= 0) && (node < max_node); } int virNumaGetDistances(int node ATTRIBUTE_UNUSED, int **distances, int *ndistances) { *distances = NULL; *ndistances = 0; VIR_DEBUG("NUMA distance information isn't availble on this host"); return 0; } #endif #define HUGEPAGES_NUMA_PREFIX "/sys/devices/system/node/" #define HUGEPAGES_SYSTEM_PREFIX "/sys/kernel/mm/hugepages/" #define HUGEPAGES_PREFIX "hugepages-" static int virNumaGetHugePageInfoPath(char **path, int node, unsigned int page_size, const char *suffix) { int ret = -1; if (node == -1) { /* We are aiming at overall system info */ if (page_size) { /* And even on specific huge page size */ if (virAsprintf(path, HUGEPAGES_SYSTEM_PREFIX HUGEPAGES_PREFIX "%ukB/%s", page_size, suffix ? suffix : "") < 0) goto cleanup; } else { if (VIR_STRDUP(*path, HUGEPAGES_SYSTEM_PREFIX) < 0) goto cleanup; } } else { /* We are aiming on specific NUMA node */ if (page_size) { /* And even on specific huge page size */ if (virAsprintf(path, HUGEPAGES_NUMA_PREFIX "node%d/hugepages/" HUGEPAGES_PREFIX "%ukB/%s", node, page_size, suffix ? suffix : "") < 0) goto cleanup; } else { if (virAsprintf(path, HUGEPAGES_NUMA_PREFIX "node%d/hugepages/", node) < 0) goto cleanup; } } ret = 0; cleanup: return ret; } /** * virNumaGetHugePageInfo: * @node: NUMA node id * @page_size: which huge page are we interested in * @page_avail: total number of huge pages in the pool * @page_free: the number of free huge pages in the pool * * For given NUMA node and huge page size fetch information on * total number of huge pages in the pool (both free and taken) * and count for free huge pages in the pool. * * If you're interested in just one bit, pass NULL to the other one. * * As a special case, if @node == -1, overall info is fetched * from the system. * * Returns 0 on success, -1 otherwise (with error reported). */ static int virNumaGetHugePageInfo(int node, unsigned int page_size, unsigned int *page_avail, unsigned int *page_free) { int ret = -1; char *path = NULL; char *buf = NULL; char *end; if (page_avail) { if (virNumaGetHugePageInfoPath(&path, node, page_size, "nr_hugepages") < 0) goto cleanup; if (virFileReadAll(path, 1024, &buf) < 0) goto cleanup; if (virStrToLong_ui(buf, &end, 10, page_avail) < 0 || *end != '\n') { virReportError(VIR_ERR_INTERNAL_ERROR, _("unable to parse: %s"), buf); goto cleanup; } VIR_FREE(buf); VIR_FREE(path); } if (page_free) { if (virNumaGetHugePageInfoPath(&path, node, page_size, "free_hugepages") < 0) goto cleanup; if (virFileReadAll(path, 1024, &buf) < 0) goto cleanup; if (virStrToLong_ui(buf, &end, 10, page_free) < 0 || *end != '\n') { virReportError(VIR_ERR_INTERNAL_ERROR, _("unable to parse: %s"), buf); goto cleanup; } } ret = 0; cleanup: VIR_FREE(buf); VIR_FREE(path); return ret; } /** * virNumaGetPageInfo: * @node: NUMA node id * @page_size: which huge page are we interested in (in KiB) * @page_avail: total number of huge pages in the pool * @page_free: the number of free huge pages in the pool * * For given NUMA node and page size fetch information on * total number of pages in the pool (both free and taken) * and count for free pages in the pool. * * If you're interested in just one bit, pass NULL to the other one. * * As a special case, if @node == -1, overall info is fetched * from the system. * * Returns 0 on success, -1 otherwise (with error reported). */ int virNumaGetPageInfo(int node, unsigned int page_size, unsigned int *page_avail, unsigned int *page_free) { int ret = -1; long system_page_size = sysconf(_SC_PAGESIZE); /* sysconf() returns page size in bytes, * the @page_size is however in kibibytes */ if (page_size == system_page_size / 1024) { #if 0 unsigned long long memsize, memfree; /* TODO: come up with better algorithm that takes huge pages into * account. The problem is huge pages cut off regular memory. */ if (node == -1) { if (nodeGetMemory(&memsize, &memfree) < 0) goto cleanup; } else { if (virNumaGetNodeMemory(node, &memsize, &memfree) < 0) goto cleanup; } if (page_avail) *page_avail = memsize / system_page_size; if (page_free) *page_free = memfree / system_page_size; #else virReportError(VIR_ERR_ARGUMENT_UNSUPPORTED, "%s", _("system page size are not supported yet")); goto cleanup; #endif /* 0 */ } else { if (virNumaGetHugePageInfo(node, page_size, page_avail, page_free) < 0) goto cleanup; } ret = 0; cleanup: return ret; } /** * virNumaGetPages: * @node: NUMA node id * @pages_size: list of pages supported on @node * @pages_avail: list of the pool sizes on @node * @pages_free: list of free pages on @node * @npages: the lists size * * For given NUMA node fetch info on pages. The size of pages * (e.g. 4K, 2M, 1G) is stored into @pages_size, the size of the * pool is then stored into @pages_avail and the number of free * pages in the pool is stored into @pages_free. * * If you're interested only in some lists, pass NULL to the * other ones. * * As a special case, if @node == -1, overall info is fetched * from the system. * * Returns 0 on success, -1 otherwise. */ int virNumaGetPages(int node, unsigned int **pages_size, unsigned int **pages_avail, unsigned int **pages_free, size_t *npages) { int ret = -1; char *path = NULL; DIR *dir = NULL; int direrr; struct dirent *entry; unsigned int *tmp_size = NULL, *tmp_avail = NULL, *tmp_free = NULL; unsigned int ntmp = 0; size_t i; bool exchange; #if 0 /* This has to be disabled until the time the issue in * virNumaGetPageInfo is resolved. Sorry. */ long system_page_size; /* sysconf() returns page size in bytes, * but we are storing the page size in kibibytes. */ system_page_size = sysconf(_SC_PAGESIZE) / 1024; /* We know that ordinary system pages are supported * if nothing else is. */ if (VIR_REALLOC_N(tmp_size, 1) < 0 || VIR_REALLOC_N(tmp_avail, 1) < 0 || VIR_REALLOC_N(tmp_free, 1) < 0) goto cleanup; if (virNumaGetPageInfo(node, system_page_size, &tmp_avail[ntmp], &tmp_free[ntmp]) < 0) goto cleanup; tmp_size[ntmp] = system_page_size; ntmp++; #endif /* 0 */ /* Now that we got ordinary system pages, lets get info on huge pages */ if (virNumaGetHugePageInfoPath(&path, node, 0, NULL) < 0) goto cleanup; if (!(dir = opendir(path))) { virReportSystemError(errno, _("unable to open path: %s"), path); goto cleanup; } while ((direrr = virDirRead(dir, &entry, path)) > 0) { const char *page_name = entry->d_name; unsigned int page_size, page_avail = 0, page_free = 0; char *end; /* Just to give you a hint, we're dealing with this: * hugepages-2048kB/ or hugepages-1048576kB/ */ if (!STRPREFIX(entry->d_name, HUGEPAGES_PREFIX)) continue; page_name += strlen(HUGEPAGES_PREFIX); if (virStrToLong_ui(page_name, &end, 10, &page_size) < 0 || STRCASENEQ(end, "kB")) { virReportError(VIR_ERR_INTERNAL_ERROR, _("unable to parse %s"), entry->d_name); goto cleanup; } /* Querying more detailed info makes sense only sometimes */ if ((pages_avail || pages_free) && virNumaGetHugePageInfo(node, page_size, &page_avail, &page_free) < 0) goto cleanup; if (VIR_REALLOC_N(tmp_size, ntmp + 1) < 0 || VIR_REALLOC_N(tmp_avail, ntmp + 1) < 0 || VIR_REALLOC_N(tmp_free, ntmp + 1) < 0) goto cleanup; tmp_size[ntmp] = page_size; tmp_avail[ntmp] = page_avail; tmp_free[ntmp] = page_free; ntmp++; } if (direrr < 0) goto cleanup; /* Just to produce nice output, sort the arrays by increasing page size */ do { exchange = false; for (i = 0; i < ntmp -1; i++) { if (tmp_size[i] > tmp_size[i + 1]) { exchange = true; SWAP(tmp_size[i], tmp_size[i + 1]); SWAP(tmp_avail[i], tmp_avail[i + 1]); SWAP(tmp_free[i], tmp_free[i + 1]); } } } while (exchange); if (pages_size) { *pages_size = tmp_size; tmp_size = NULL; } if (pages_avail) { *pages_avail = tmp_avail; tmp_avail = NULL; } if (pages_free) { *pages_free = tmp_free; tmp_free = NULL; } *npages = ntmp; ret = 0; cleanup: VIR_FREE(tmp_free); VIR_FREE(tmp_avail); VIR_FREE(tmp_size); closedir(dir); VIR_FREE(path); return ret; }