/*
 * qemu_cgroup.c: QEMU cgroup management
 *
 * Copyright (C) 2006-2013 Red Hat, Inc.
 * Copyright (C) 2006 Daniel P. Berrange
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * Author: Daniel P. Berrange <berrange@redhat.com>
 */

#include <config.h>

#include "qemu_cgroup.h"
27
#include "qemu_domain.h"
28
#include "qemu_process.h"
29
#include "vircgroup.h"
30
#include "virlog.h"
31
#include "viralloc.h"
32
#include "virerror.h"
33
#include "domain_audit.h"
34
#include "virscsi.h"
35
#include "virstring.h"
36 37 38 39 40 41 42

#define VIR_FROM_THIS VIR_FROM_QEMU

/* Device nodes allowed for a guest when the driver configuration does
 * not provide its own cgroupDeviceACL (see qemuSetupDevicesCgroup).
 * The list is NULL terminated. */
static const char *const defaultDeviceACL[] = {
    "/dev/null",
    "/dev/full",
    "/dev/zero",
    "/dev/random",
    "/dev/urandom",
    "/dev/ptmx",
    "/dev/kvm",
    "/dev/kqemu",
    "/dev/rtc",
    "/dev/hpet",
    "/dev/vfio/vfio",
    NULL,
};
/* Character device major numbers that are allowed as a whole:
 * audited as "pty" and "sound" respectively. */
#define DEVICE_PTY_MAJOR 136
#define DEVICE_SND_MAJOR 116

49
static int
50
qemuSetupDiskPathAllow(virDomainDiskDefPtr disk,
51 52 53
                       const char *path,
                       size_t depth ATTRIBUTE_UNUSED,
                       void *opaque)
54
{
55 56
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
57 58 59
    int rc;

    VIR_DEBUG("Process path %s for disk", path);
60
    rc = virCgroupAllowDevicePath(priv->cgroup, path,
61 62
                                  (disk->readonly ? VIR_CGROUP_DEVICE_READ
                                   : VIR_CGROUP_DEVICE_RW));
63
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path,
64
                             disk->readonly ? "r" : "rw", rc);
65 66
    if (rc < 0) {
        if (rc == -EACCES) { /* Get this for root squash NFS */
67 68 69 70 71 72 73 74 75 76 77 78
            VIR_DEBUG("Ignoring EACCES for %s", path);
        } else {
            virReportSystemError(-rc,
                                 _("Unable to allow access for disk path %s"),
                                 path);
            return -1;
        }
    }
    return 0;
}


79 80 81
/*
 * qemuSetupDiskCgroup:
 * @vm: domain object whose cgroup is updated
 * @disk: disk whose paths should be allowed
 *
 * Runs qemuSetupDiskPathAllow over every path of @disk. Does nothing
 * (and returns 0) when the domain cgroup has no devices controller.
 *
 * Returns the result of virDomainDiskDefForeachPath, or 0 when skipped.
 */
int
qemuSetupDiskCgroup(virDomainObjPtr vm,
                    virDomainDiskDefPtr disk)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return virDomainDiskDefForeachPath(disk, true,
                                           qemuSetupDiskPathAllow, vm);

    return 0;
}


93 94 95 96 97
/*
 * qemuTeardownDiskPathDeny:
 * @disk: unused
 * @path: path to remove from the devices whitelist
 * @depth: unused
 * @opaque: the virDomainObjPtr whose cgroup is updated
 *
 * Per-path callback for virDomainDiskDefForeachPath. Denies read, write
 * and mknod access to @path in the domain's devices cgroup and records
 * the attempt in the audit log.
 *
 * Returns 0 on success or when the failure was -EACCES (ignored),
 * -1 with an error reported otherwise.
 */
static int
qemuTeardownDiskPathDeny(virDomainDiskDefPtr disk ATTRIBUTE_UNUSED,
                         const char *path,
                         size_t depth ATTRIBUTE_UNUSED,
                         void *opaque)
{
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int rc;

    VIR_DEBUG("Process path %s for disk", path);
    rc = virCgroupDenyDevicePath(priv->cgroup, path, VIR_CGROUP_DEVICE_RWM);
    virDomainAuditCgroupPath(vm, priv->cgroup, "deny", path, "rwm", rc);

    if (rc >= 0)
        return 0;

    if (rc == -EACCES) {
        /* Get this for root squash NFS */
        VIR_DEBUG("Ignoring EACCES for %s", path);
        return 0;
    }

    virReportSystemError(-rc,
                         _("Unable to deny access for disk path %s"),
                         path);
    return -1;
}


121 122 123
/*
 * qemuTeardownDiskCgroup:
 * @vm: domain object whose cgroup is updated
 * @disk: disk whose paths should be denied again
 *
 * Runs qemuTeardownDiskPathDeny over every path of @disk. Does nothing
 * (and returns 0) when the domain cgroup has no devices controller.
 *
 * Returns the result of virDomainDiskDefForeachPath, or 0 when skipped.
 */
int
qemuTeardownDiskCgroup(virDomainObjPtr vm,
                       virDomainDiskDefPtr disk)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return virDomainDiskDefForeachPath(disk, true,
                                           qemuTeardownDiskPathDeny, vm);

    return 0;
}

137
static int
138 139
qemuSetupChrSourceCgroup(virDomainDefPtr def,
                         virDomainChrSourceDefPtr dev,
140
                         void *opaque)
141
{
142 143
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
144 145
    int rc;

146
    if (dev->type != VIR_DOMAIN_CHR_TYPE_DEV)
147 148
        return 0;

149
    VIR_DEBUG("Process path '%s' for device", dev->data.file.path);
150

151
    rc = virCgroupAllowDevicePath(priv->cgroup, dev->data.file.path,
152
                                  VIR_CGROUP_DEVICE_RW);
153
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
154
                             dev->data.file.path, "rw", rc);
155
    if (rc < 0) {
156 157
        virReportSystemError(-rc,
                             _("Unable to allow device %s for %s"),
158
                             dev->data.file.path, def->name);
159 160 161 162 163 164
        return -1;
    }

    return 0;
}

165 166 167 168 169
/*
 * qemuSetupChardevCgroup:
 * @def: domain definition
 * @dev: character device whose source is processed
 * @opaque: the virDomainObjPtr whose cgroup is updated
 *
 * Adapter with the callback signature used by virDomainChrDefForeach;
 * forwards the device's source to qemuSetupChrSourceCgroup.
 */
static int
qemuSetupChardevCgroup(virDomainDefPtr def,
                       virDomainChrDefPtr dev,
                       void *opaque)
{
    virDomainChrSourceDefPtr source = &dev->source;

    return qemuSetupChrSourceCgroup(def, source, opaque);
}


/*
 * qemuSetupTPMCgroup:
 * @def: domain definition
 * @dev: TPM device definition
 * @opaque: the virDomainObjPtr whose cgroup is updated
 *
 * For a passthrough TPM, allows the backing character device source
 * via qemuSetupChrSourceCgroup. Nothing is done for other types.
 *
 * Returns 0 when nothing had to be done, otherwise the result of
 * qemuSetupChrSourceCgroup.
 */
static int
qemuSetupTPMCgroup(virDomainDefPtr def,
                   virDomainTPMDefPtr dev,
                   void *opaque)
{
    switch (dev->type) {
    case VIR_DOMAIN_TPM_TYPE_PASSTHROUGH:
        return qemuSetupChrSourceCgroup(def,
                                        &dev->data.passthrough.source,
                                        opaque);
    case VIR_DOMAIN_TPM_TYPE_LAST:
        break;
    }

    return 0;
}

193

194 195 196 197
/*
 * qemuSetupHostUsbDeviceCgroup:
 * @dev: unused
 * @path: USB device node to allow
 * @opaque: the virDomainObjPtr whose cgroup is updated
 *
 * Callback for virUSBDeviceFileIterate: allows read-write access to
 * @path in the domain's devices cgroup and audits the attempt.
 *
 * Returns 0 on success, -1 with an error reported on failure.
 */
static int
qemuSetupHostUsbDeviceCgroup(virUSBDevicePtr dev ATTRIBUTE_UNUSED,
                             const char *path,
                             void *opaque)
{
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int rc;

    VIR_DEBUG("Process path '%s' for USB device", path);
    rc = virCgroupAllowDevicePath(priv->cgroup, path, VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path, "rw", rc);
    if (rc >= 0)
        return 0;

    virReportSystemError(-rc,
                         _("Unable to allow device %s"),
                         path);
    return -1;
}

217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
/*
 * qemuSetupHostScsiDeviceCgroup:
 * @dev: SCSI device being processed; queried for its read-only flag
 * @path: SCSI device node to allow
 * @opaque: the virDomainObjPtr whose cgroup is updated
 *
 * Callback for virSCSIDeviceFileIterate: allows @path in the domain's
 * devices cgroup, read-only when the device is flagged read-only and
 * read-write otherwise, and audits the attempt.
 *
 * Returns 0 on success, -1 with an error reported on failure.
 */
static int
qemuSetupHostScsiDeviceCgroup(virSCSIDevicePtr dev,
                              const char *path,
                              void *opaque)
{
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
    bool readonly;
    int rc;

    VIR_DEBUG("Process path '%s' for SCSI device", path);

    /* Query once so the cgroup permission and the audit record
     * are guaranteed to agree. */
    readonly = virSCSIDeviceGetReadonly(dev);

    rc = virCgroupAllowDevicePath(priv->cgroup, path,
                                  readonly ?
                                  VIR_CGROUP_DEVICE_READ :
                                  VIR_CGROUP_DEVICE_RW);

    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path,
                             readonly ? "r" : "rw", rc);
    if (rc < 0) {
        virReportSystemError(-rc,
                             _("Unable to allow device %s"),
                             path);
        return -1;
    }

    return 0;
}
244

245 246 247 248 249 250 251
/*
 * qemuSetupHostdevCGroup:
 * @vm: domain object whose cgroup is updated
 * @dev: host device being assigned to the domain
 *
 * Allows the device nodes needed by @dev in the domain's devices cgroup.
 * It is called for *all* hostdev devices, but only subsystem-mode
 * devices are acted upon:
 *   - PCI devices using the VFIO backend: the IOMMU group device node
 *   - USB devices (unless marked missing): every node reported by
 *     virUSBDeviceFileIterate
 *   - SCSI devices: every node reported by virSCSIDeviceFileIterate
 *
 * Returns 0 on success or when there was nothing to do (including when
 * the cgroup has no devices controller), -1 on failure.
 */
int
qemuSetupHostdevCGroup(virDomainObjPtr vm,
                       virDomainHostdevDefPtr dev)
{
    int ret = -1;
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virPCIDevicePtr pci = NULL;
    virUSBDevicePtr usb = NULL;
    virSCSIDevicePtr scsi = NULL;
    char *path = NULL;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS) {

        switch (dev->source.subsys.type) {
        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI:
            if (dev->source.subsys.u.pci.backend
                == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
                int rc;

                pci = virPCIDeviceNew(dev->source.subsys.u.pci.addr.domain,
                                      dev->source.subsys.u.pci.addr.bus,
                                      dev->source.subsys.u.pci.addr.slot,
                                      dev->source.subsys.u.pci.addr.function);
                if (!pci)
                    goto cleanup;

                if (!(path = virPCIDeviceGetIOMMUGroupDev(pci)))
                    goto cleanup;

                VIR_DEBUG("Cgroup allow %s for PCI device assignment", path);
                rc = virCgroupAllowDevicePath(priv->cgroup, path,
                                              VIR_CGROUP_DEVICE_RW);
                virDomainAuditCgroupPath(vm, priv->cgroup,
                                         "allow", path, "rw", rc);
                if (rc < 0) {
                    virReportSystemError(-rc,
                                         _("Unable to allow access "
                                           "for device path %s"),
                                         path);
                    goto cleanup;
                }
            }
            break;

        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB:
            /* NB: hostdev->missing wasn't previously checked in the
             * case of hotplug, only when starting a domain. Now it is
             * always checked, and the cgroup setup skipped if true.
             */
            if (dev->missing)
                break;
            if ((usb = virUSBDeviceNew(dev->source.subsys.u.usb.bus,
                                       dev->source.subsys.u.usb.device,
                                       NULL)) == NULL) {
                goto cleanup;
            }

            /* qemuSetupHostUsbDeviceCgroup never references the usb
             * object itself; it only needs the path handed to it by
             * the iterator.
             */
            if (virUSBDeviceFileIterate(usb, qemuSetupHostUsbDeviceCgroup,
                                        vm) < 0) {
                goto cleanup;
            }
            break;

        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_SCSI:
            if ((scsi = virSCSIDeviceNew(dev->source.subsys.u.scsi.adapter,
                                         dev->source.subsys.u.scsi.bus,
                                         dev->source.subsys.u.scsi.target,
                                         dev->source.subsys.u.scsi.unit,
                                         dev->readonly)) == NULL)
                goto cleanup;

            if (virSCSIDeviceFileIterate(scsi,
                                         qemuSetupHostScsiDeviceCgroup,
                                         vm) < 0)
                goto cleanup;
            /* Explicit break: do not rely on falling into 'default' */
            break;

        default:
            break;
        }
    }

    ret = 0;
cleanup:
    virPCIDeviceFree(pci);
    virUSBDeviceFree(usb);
    virSCSIDeviceFree(scsi);
    VIR_FREE(path);
    return ret;
}

/*
 * qemuTeardownHostdevCgroup:
 * @vm: domain object whose cgroup is updated
 * @dev: host device being removed from the domain
 *
 * Revokes the cgroup access granted for @dev. Only subsystem-mode PCI
 * devices using the VFIO backend have anything revoked (the IOMMU group
 * device node); every other kind of hostdev returns success untouched.
 *
 * NOTE(review): qemuSetupHostdevCGroup allows USB and SCSI nodes, but
 * nothing here denies them again - confirm whether that is intended.
 *
 * Returns 0 on success or when there was nothing to do, -1 on failure.
 */
int
qemuTeardownHostdevCgroup(virDomainObjPtr vm,
                          virDomainHostdevDefPtr dev)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virPCIDevicePtr pci = NULL;
    char *path = NULL;
    int ret = -1;
    int rc;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    /* Anything that is not a VFIO-backed PCI subsystem device needs
     * no teardown. */
    if (dev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS ||
        dev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI ||
        dev->source.subsys.u.pci.backend
        != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO)
        return 0;

    pci = virPCIDeviceNew(dev->source.subsys.u.pci.addr.domain,
                          dev->source.subsys.u.pci.addr.bus,
                          dev->source.subsys.u.pci.addr.slot,
                          dev->source.subsys.u.pci.addr.function);
    if (!pci)
        goto cleanup;

    path = virPCIDeviceGetIOMMUGroupDev(pci);
    if (!path)
        goto cleanup;

    VIR_DEBUG("Cgroup deny %s for PCI device assignment", path);
    rc = virCgroupDenyDevicePath(priv->cgroup, path, VIR_CGROUP_DEVICE_RWM);
    virDomainAuditCgroupPath(vm, priv->cgroup, "deny", path, "rwm", rc);
    if (rc < 0) {
        virReportSystemError(-rc,
                             _("Unable to deny access "
                               "for device path %s"),
                             path);
        goto cleanup;
    }

    ret = 0;
cleanup:
    virPCIDeviceFree(pci);
    VIR_FREE(path);
    return ret;
}

410 411 412 413 414
/*
 * qemuSetupBlkioCgroup:
 * @vm: domain object whose blkio tuning is applied
 *
 * Applies the domain-wide block I/O weight and every non-zero per-device
 * weight from the domain definition to the domain's cgroup.
 *
 * Returns 0 on success (or when no blkio controller exists and no blkio
 * tuning was requested), -1 with an error reported otherwise.
 */
static int
qemuSetupBlkioCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    size_t idx;
    int rc;

    if (!virCgroupHasController(priv->cgroup,
                                VIR_CGROUP_CONTROLLER_BLKIO)) {
        /* Without the controller, only fail if tuning was requested */
        if (!vm->def->blkio.weight && !vm->def->blkio.ndevices)
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Block I/O tuning is not available on this host"));
        return -1;
    }

    if (vm->def->blkio.weight != 0) {
        rc = virCgroupSetBlkioWeight(priv->cgroup, vm->def->blkio.weight);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to set io weight for domain %s"),
                                 vm->def->name);
            return -1;
        }
    }

    for (idx = 0; idx < vm->def->blkio.ndevices; idx++) {
        virBlkioDeviceWeightPtr devWeight = &vm->def->blkio.devices[idx];

        /* A zero weight means nothing to set for this device */
        if (!devWeight->weight)
            continue;

        rc = virCgroupSetBlkioDeviceWeight(priv->cgroup, devWeight->path,
                                           devWeight->weight);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to set io device weight "
                                   "for domain %s"),
                                 vm->def->name);
            return -1;
        }
    }

    return 0;
}

458

459 460 461 462 463 464 465 466 467 468 469 470 471
/*
 * qemuSetupMemoryCgroup:
 * @vm: domain object whose memory tuning is applied
 *
 * Sets the memory hard limit (as computed by qemuDomainMemoryLimit) and,
 * when configured, the soft limit and swap hard limit on the domain's
 * cgroup.
 *
 * Returns 0 on success (or when no memory controller exists and no
 * limits were requested), -1 with an error reported otherwise.
 */
static int
qemuSetupMemoryCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int rc;

    if (!virCgroupHasController(priv->cgroup,VIR_CGROUP_CONTROLLER_MEMORY)) {
        /* Without the controller, only fail if a limit was requested */
        if (vm->def->mem.hard_limit == 0 &&
            vm->def->mem.soft_limit == 0 &&
            vm->def->mem.swap_hard_limit == 0)
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Memory cgroup is not available on this host"));
        return -1;
    }

    /* The hard limit is always set, whatever the configuration says */
    rc = virCgroupSetMemoryHardLimit(priv->cgroup,
                                     qemuDomainMemoryLimit(vm->def));
    if (rc != 0) {
        virReportSystemError(-rc,
                             _("Unable to set memory hard limit for domain %s"),
                             vm->def->name);
        return -1;
    }

    if (vm->def->mem.soft_limit != 0) {
        rc = virCgroupSetMemorySoftLimit(priv->cgroup,
                                         vm->def->mem.soft_limit);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to set memory soft limit for domain %s"),
                                 vm->def->name);
            return -1;
        }
    }

    if (vm->def->mem.swap_hard_limit != 0) {
        rc = virCgroupSetMemSwapHardLimit(priv->cgroup,
                                          vm->def->mem.swap_hard_limit);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to set swap hard limit for domain %s"),
                                 vm->def->name);
            return -1;
        }
    }

    return 0;
}


509 510 511 512 513 514 515 516 517
/*
 * qemuSetupDevicesCgroup:
 * @driver: QEMU driver (source of the device ACL configuration)
 * @vm: domain object whose devices cgroup is populated
 *
 * Denies all devices and then whitelists, in this order: every disk
 * path, the pty major, the sound major (when applicable), each existing
 * entry of the device ACL, character devices, the TPM and all host
 * devices.
 *
 * Returns 0 on success, when the devices controller is absent, or when
 * the deny-all step fails with -EPERM (whitelisting is then skipped);
 * -1 with an error reported otherwise.
 */
static int
qemuSetupDevicesCgroup(virQEMUDriverPtr driver,
                       virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = NULL;
    const char *const *acl = NULL;
    const char *const *entry;
    bool allowSound = false;
    int ret = -1;
    int rc;
    size_t n;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    rc = virCgroupDenyAllDevices(priv->cgroup);
    virDomainAuditCgroup(vm, priv->cgroup, "deny", "all", rc == 0);
    if (rc == -EPERM) {
        VIR_WARN("Group devices ACL is not accessible, disabling whitelisting");
        return 0;
    }
    if (rc != 0) {
        virReportSystemError(-rc,
                             _("Unable to deny all devices for %s"), vm->def->name);
        goto cleanup;
    }

    for (n = 0; n < vm->def->ndisks; n++) {
        if (qemuSetupDiskCgroup(vm, vm->def->disks[n]) < 0)
            goto cleanup;
    }

    rc = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_PTY_MAJOR,
                                   VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_PTY_MAJOR,
                              "pty", "rw", rc == 0);
    if (rc != 0) {
        virReportSystemError(-rc, "%s",
                             _("unable to allow /dev/pts/ devices"));
        goto cleanup;
    }

    cfg = virQEMUDriverGetConfig(driver);
    if (cfg->cgroupDeviceACL)
        acl = (const char *const *)cfg->cgroupDeviceACL;
    else
        acl = defaultDeviceACL;

    /* Sound devices are allowed when the guest has a sound card and
     * either no graphics at all, VNC with host audio enabled in the
     * driver config, or SDL as the first graphics device. */
    if (vm->def->nsounds) {
        if (!vm->def->ngraphics) {
            allowSound = true;
        } else if (vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_VNC &&
                   cfg->vncAllowHostAudio) {
            allowSound = true;
        } else if (vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_SDL) {
            allowSound = true;
        }
    }

    if (allowSound) {
        rc = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_SND_MAJOR,
                                       VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_SND_MAJOR,
                                  "sound", "rw", rc == 0);
        if (rc != 0) {
            virReportSystemError(-rc, "%s",
                                 _("unable to allow /dev/snd/ devices"));
            goto cleanup;
        }
    }

    for (entry = acl; *entry != NULL; entry++) {
        /* ACL entries that do not exist on this host are skipped */
        if (access(*entry, F_OK) < 0) {
            VIR_DEBUG("Ignoring non-existant device %s",
                      *entry);
            continue;
        }

        rc = virCgroupAllowDevicePath(priv->cgroup, *entry,
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, priv->cgroup, "allow", *entry, "rw", rc);
        /* -ENOENT is tolerated; any other failure is fatal */
        if (rc < 0 && rc != -ENOENT) {
            virReportSystemError(-rc,
                                 _("unable to allow device %s"),
                                 *entry);
            goto cleanup;
        }
    }

    if (virDomainChrDefForeach(vm->def, true,
                               qemuSetupChardevCgroup, vm) < 0)
        goto cleanup;

    if (vm->def->tpm &&
        qemuSetupTPMCgroup(vm->def, vm->def->tpm, vm) < 0)
        goto cleanup;

    for (n = 0; n < vm->def->nhostdevs; n++) {
        if (qemuSetupHostdevCGroup(vm, vm->def->hostdevs[n]) < 0)
            goto cleanup;
    }

    ret = 0;
cleanup:
    virObjectUnref(cfg);
    return ret;
}


615 616
static int
qemuSetupCpusetCgroup(virDomainObjPtr vm,
617 618
                      virBitmapPtr nodemask,
                      virCapsPtr caps)
619 620
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
621 622
    char *mem_mask = NULL;
    char *cpu_mask = NULL;
623 624 625 626 627 628 629 630 631 632 633 634 635
    int rc;
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

    if ((vm->def->numatune.memory.nodemask ||
         (vm->def->numatune.memory.placement_mode ==
          VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)) &&
        vm->def->numatune.memory.mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {

        if (vm->def->numatune.memory.placement_mode ==
            VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)
636
            mem_mask = virBitmapFormat(nodemask);
637
        else
638
            mem_mask = virBitmapFormat(vm->def->numatune.memory.nodemask);
639

640
        if (!mem_mask) {
641 642 643 644 645
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("failed to convert memory nodemask"));
            goto cleanup;
        }

646
        rc = virCgroupSetCpusetMems(priv->cgroup, mem_mask);
647 648 649 650 651 652 653 654 655

        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to set cpuset.mems for domain %s"),
                                 vm->def->name);
            goto cleanup;
        }
    }

656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674
    if (vm->def->cpumask ||
        (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)) {

        if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
            virBitmapPtr cpumap;
            if (!(cpumap = virCapabilitiesGetCpusForNodemask(caps, nodemask)))
                goto cleanup;
            cpu_mask = virBitmapFormat(cpumap);
            virBitmapFree(cpumap);
        } else {
            cpu_mask = virBitmapFormat(vm->def->cpumask);
        }

        if (!cpu_mask) {
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("failed to convert cpu mask"));
            goto cleanup;
        }

675
        if ((rc = virCgroupSetCpusetCpus(priv->cgroup, cpu_mask)) != 0) {
676 677 678 679 680 681 682
            virReportSystemError(-rc,
                                 _("Unable to set cpuset.cpus for domain %s"),
                                 vm->def->name);
            goto cleanup;
        }
    }

683 684
    ret = 0;
cleanup:
685 686
    VIR_FREE(mem_mask);
    VIR_FREE(cpu_mask);
687 688 689 690
    return ret;
}


691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720
/*
 * qemuSetupCpuCgroup:
 * @vm: domain object whose CPU shares are applied
 *
 * Applies the configured cputune shares value, if any, to the domain's
 * cgroup.
 *
 * Returns 0 on success (or when no cpu controller exists and no shares
 * were requested), -1 with an error reported otherwise.
 */
static int
qemuSetupCpuCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int rc;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        /* Without the controller, only fail if shares were requested */
        if (vm->def->cputune.shares) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                           _("CPU tuning is not available on this host"));
            return -1;
        }
        return 0;
    }

    if (vm->def->cputune.shares) {
        rc = virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares);
        if (rc != 0) {
            /* Message used to say "io cpu shares", a leftover from
             * the blkio code; this is a CPU setting. */
            virReportSystemError(-rc,
                                 _("Unable to set cpu shares for domain %s"),
                                 vm->def->name);
            return -1;
        }
    }

    return 0;
}


721 722 723 724
/*
 * qemuInitCgroup:
 * @driver: QEMU driver (source of the configuration)
 * @vm: domain object whose cgroup is (re)created
 * @startup: when true and the domain has no resource definition,
 *           a default one with partition "/machine" is installed
 *
 * Frees any existing cgroup of the domain and creates a new one, either
 * below the configured resource partition or below the "qemu" driver
 * group. Nothing is done for an unprivileged driver. When the parent
 * group cannot be created because of -ENXIO, -EPERM or -EACCES, the
 * error is ignored and the domain is left without a cgroup.
 *
 * Returns 0 on success; on failure -1 or the negative value returned by
 * the failing virCgroupNew* call.
 */
int
qemuInitCgroup(virQEMUDriverPtr driver,
               virDomainObjPtr vm,
               bool startup)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
    virCgroupPtr parent = NULL;
    int rc = -1;

    if (!cfg->privileged)
        goto done;

    virCgroupFree(&priv->cgroup);

    if (startup && !vm->def->resource) {
        virDomainResourceDefPtr res;

        if (VIR_ALLOC(res) < 0)
            goto cleanup;

        if (VIR_STRDUP(res->partition, "/machine") < 0) {
            VIR_FREE(res);
            goto cleanup;
        }

        vm->def->resource = res;
    }

    if (vm->def->resource && vm->def->resource->partition) {
        const char *partition = vm->def->resource->partition;

        if (partition[0] != '/') {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("Resource partition '%s' must start with '/'"),
                           partition);
            goto cleanup;
        }

        /* We only auto-create the default partition. In other
         * cases we expect the sysadmin/app to have done so */
        rc = virCgroupNewPartition(partition,
                                   STREQ(partition, "/machine"),
                                   cfg->cgroupControllers,
                                   &parent);
        if (rc == -ENXIO || rc == -EPERM || rc == -EACCES) {
            /* No cgroups mounts == success */
            VIR_DEBUG("No cgroups present/configured/accessible, ignoring error");
            goto done;
        }
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to initialize %s cgroup"),
                                 partition);
            goto cleanup;
        }

        rc = virCgroupNewDomainPartition(parent,
                                         "qemu",
                                         vm->def->name,
                                         true,
                                         &priv->cgroup);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to create cgroup for %s"),
                                 vm->def->name);
            goto cleanup;
        }
    } else {
        rc = virCgroupNewDriver("qemu",
                                true,
                                cfg->cgroupControllers,
                                &parent);
        if (rc == -ENXIO || rc == -EPERM || rc == -EACCES) {
            /* No cgroups mounts == success */
            VIR_DEBUG("No cgroups present/configured/accessible, ignoring error");
            goto done;
        }
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to create cgroup for %s"),
                                 vm->def->name);
            goto cleanup;
        }

        rc = virCgroupNewDomainDriver(parent,
                                      vm->def->name,
                                      true,
                                      &priv->cgroup);
        if (rc != 0) {
            virReportSystemError(-rc,
                                 _("Unable to create cgroup for %s"),
                                 vm->def->name);
            goto cleanup;
        }
    }

done:
    rc = 0;
cleanup:
    virCgroupFree(&parent);
    virObjectUnref(cfg);
    return rc;
}


829 830 831 832
/*
 * qemuSetupCgroup:
 * @driver: QEMU driver
 * @vm: domain object to set up
 * @nodemask: NUMA node set forwarded to the cpuset setup
 *
 * Creates the domain's cgroup via qemuInitCgroup and then configures the
 * devices, blkio, memory, cpu and cpuset controllers, in that order,
 * stopping at the first failure.
 *
 * Returns 0 on success or when no cgroup was created for the domain,
 * -1 on failure.
 */
int
qemuSetupCgroup(virQEMUDriverPtr driver,
                virDomainObjPtr vm,
                virBitmapPtr nodemask)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virCapsPtr caps = NULL;
    int ret = -1;

    if (qemuInitCgroup(driver, vm, true) < 0)
        return -1;

    /* qemuInitCgroup may succeed without creating a cgroup */
    if (!priv->cgroup)
        return 0;

    caps = virQEMUDriverGetCapabilities(driver, false);
    if (!caps)
        goto cleanup;

    /* Short-circuit evaluation keeps the original ordering and stops
     * at the first controller that fails. */
    if (qemuSetupDevicesCgroup(driver, vm) < 0 ||
        qemuSetupBlkioCgroup(vm) < 0 ||
        qemuSetupMemoryCgroup(vm) < 0 ||
        qemuSetupCpuCgroup(vm) < 0 ||
        qemuSetupCpusetCgroup(vm, nodemask, caps) < 0)
        goto cleanup;

    ret = 0;
cleanup:
    virObjectUnref(caps);
    return ret;
}

868 869 870 871
/*
 * qemuSetupCgroupVcpuBW:
 * @cgroup: cgroup to configure
 * @period: CFS period to set; 0 leaves the period untouched
 * @quota: CFS quota to set; 0 leaves the quota untouched
 *
 * Sets the CPU bandwidth period and/or quota on @cgroup. If a new period
 * was applied and setting the quota then fails, the previous period is
 * restored.
 *
 * Returns 0 on success (including when both values are 0), -1 with an
 * error reported on failure.
 */
int
qemuSetupCgroupVcpuBW(virCgroupPtr cgroup,
                      unsigned long long period,
                      long long quota)
{
    unsigned long long prevPeriod = 0;
    int rc;

    if (period == 0 && quota == 0)
        return 0;

    if (period) {
        /* remember the old period so it can be restored if setting
         * the quota fails */
        rc = virCgroupGetCpuCfsPeriod(cgroup, &prevPeriod);
        if (rc < 0) {
            virReportSystemError(-rc,
                                 "%s", _("Unable to get cpu bandwidth period"));
            return -1;
        }

        rc = virCgroupSetCpuCfsPeriod(cgroup, period);
        if (rc < 0) {
            virReportSystemError(-rc,
                                 "%s", _("Unable to set cpu bandwidth period"));
            return -1;
        }
    }

    if (!quota)
        return 0;

    rc = virCgroupSetCpuCfsQuota(cgroup, quota);
    if (rc >= 0)
        return 0;

    virReportSystemError(-rc,
                         "%s", _("Unable to set cpu bandwidth quota"));

    /* roll the period back, but only if it was changed above */
    if (period) {
        rc = virCgroupSetCpuCfsPeriod(cgroup, prevPeriod);
        if (rc < 0)
            virReportSystemError(-rc, "%s",
                                 _("Unable to rollback cpu bandwidth period"));
    }

    return -1;
}

918 919 920 921 922
/**
 * qemuSetupCgroupVcpuPin:
 * @cgroup: cgroup to apply the pinning to
 * @vcpupin: array of vcpu pinning definitions
 * @nvcpupin: number of entries in @vcpupin
 * @vcpuid: id of the vcpu whose pinning should be applied
 *
 * Searches @vcpupin for the first entry whose vcpuid equals @vcpuid and
 * applies that entry's cpumask to @cgroup via
 * qemuSetupCgroupEmulatorPin().
 *
 * Returns the result of qemuSetupCgroupEmulatorPin() for the matching
 * entry, or -1 (without reporting an error) when no entry matches.
 */
int
qemuSetupCgroupVcpuPin(virCgroupPtr cgroup,
                       virDomainVcpuPinDefPtr *vcpupin,
                       int nvcpupin,
                       int vcpuid)
{
    size_t idx = 0;

    while (idx < nvcpupin) {
        virDomainVcpuPinDefPtr entry = vcpupin[idx];

        if (vcpuid == entry->vcpuid)
            return qemuSetupCgroupEmulatorPin(cgroup, entry->cpumask);

        idx++;
    }

    return -1;
}

935 936 937
/**
 * qemuSetupCgroupEmulatorPin:
 * @cgroup: cgroup whose cpuset.cpus is written
 * @cpumask: bitmap of CPUs to allow
 *
 * Formats @cpumask as a string with virBitmapFormat() and passes it to
 * virCgroupSetCpusetCpus() for @cgroup.
 *
 * Returns -1 if the bitmap could not be formatted; otherwise the value
 * returned by virCgroupSetCpusetCpus() (negative values are reported as
 * a system error before returning).
 */
int
qemuSetupCgroupEmulatorPin(virCgroupPtr cgroup,
                           virBitmapPtr cpumask)
{
    char *cpus_str = virBitmapFormat(cpumask);
    int ret;

    if (cpus_str == NULL) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("failed to convert cpu mask"));
        return -1;
    }

    ret = virCgroupSetCpusetCpus(cgroup, cpus_str);
    if (ret < 0)
        virReportSystemError(-ret,
                             "%s",
                             _("Unable to set cpuset.cpus"));

    VIR_FREE(cpus_str);
    return ret;
}

963 964
/**
 * qemuSetupCgroupForVcpu:
 * @vm: domain object
 *
 * For every known vcpu thread of @vm, creates a per-vcpu cgroup below
 * the domain cgroup, adds the vcpu thread to it, applies the domain's
 * cputune period/quota (when either is non-zero) and, if the cpuset
 * controller is available, the matching vcpupin entry from the domain
 * definition.
 *
 * Returns 0 on success, and also when the domain has no cgroup or the
 * vcpu thread ids are unknown or not distinct from the main process.
 * Returns -1 on failure; in that case the cgroup being set up for the
 * current vcpu is removed.
 */
int
qemuSetupCgroupForVcpu(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    unsigned long long period = def->cputune.period;
    long long quota = def->cputune.quota;
    virCgroupPtr cgroup_vcpu = NULL;
    size_t vcpu;
    size_t pin;
    int err;

    /* Bandwidth tuning was requested but cannot be honoured without the
     * cpu controller. */
    if ((period != 0 || quota != 0) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    /* We are trying to setup cgroups for CPU pinning, which can also be done
     * with virProcessInfoSetAffinity, thus the lack of cgroups is not fatal
     * here.
     */
    if (!priv->cgroup)
        return 0;

    if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) {
        /* If we don't know VCPU<->PID mapping or all vcpu runs in the same
         * thread, we cannot control each vcpu.
         */
        VIR_WARN("Unable to get vcpus' pids.");
        return 0;
    }

    for (vcpu = 0; vcpu < priv->nvcpupids; vcpu++) {
        err = virCgroupNewVcpu(priv->cgroup, vcpu, true, &cgroup_vcpu);
        if (err < 0) {
            virReportSystemError(-err,
                                 _("Unable to create vcpu cgroup for %s(vcpu:"
                                   " %zu)"),
                                 def->name, vcpu);
            goto error;
        }

        /* move the thread for vcpu to sub dir */
        err = virCgroupAddTask(cgroup_vcpu, priv->vcpupids[vcpu]);
        if (err < 0) {
            virReportSystemError(-err,
                                 _("unable to add vcpu %zu task %d to cgroup"),
                                 vcpu, priv->vcpupids[vcpu]);
            goto error;
        }

        if ((period != 0 || quota != 0) &&
            qemuSetupCgroupVcpuBW(cgroup_vcpu, period, quota) < 0)
            goto error;

        /* Set vcpupin in cgroup if vcpupin xml is provided */
        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
            /* Only call qemuSetupCgroupVcpuPin() when a pinning entry for
             * this vcpu exists, since it fails when none is found. */
            for (pin = 0; pin < def->cputune.nvcpupin; pin++) {
                if (def->cputune.vcpupin[pin]->vcpuid == vcpu) {
                    if (qemuSetupCgroupVcpuPin(cgroup_vcpu,
                                               def->cputune.vcpupin,
                                               def->cputune.nvcpupin,
                                               vcpu) < 0)
                        goto error;

                    break;
                }
            }
        }

        virCgroupFree(&cgroup_vcpu);
    }

    return 0;

error:
    /* Tear down the cgroup of the vcpu that was being processed. */
    if (cgroup_vcpu) {
        virCgroupRemove(cgroup_vcpu);
        virCgroupFree(&cgroup_vcpu);
    }

    return -1;
}

1052 1053 1054 1055
/**
 * qemuSetupCgroupForEmulator:
 * @driver: QEMU driver, passed on to qemuPrepareCpumap()
 * @vm: domain object
 * @nodemask: node mask, passed on to qemuPrepareCpumap() for automatic
 *            placement
 *
 * Creates the emulator cgroup below the domain cgroup, moves the tasks
 * of the domain cgroup into it, and then applies CPU pinning and the
 * emulator period/quota from the domain definition.
 *
 * The CPU mask is chosen in this order: a map built by
 * qemuPrepareCpumap() when placement mode is automatic, else the
 * emulatorpin mask, else the domain-wide cpumask. It is only applied
 * when the cpuset controller is available.
 *
 * Returns 0 on success or when the domain has no cgroup, -1 on failure.
 * On failure after the emulator cgroup was created, that cgroup is
 * removed again.
 */
int
qemuSetupCgroupForEmulator(virQEMUDriverPtr driver,
                           virDomainObjPtr vm,
                           virBitmapPtr nodemask)
{
    virBitmapPtr cpumask = NULL;
    virBitmapPtr cpumap = NULL;
    virCgroupPtr cgroup_emulator = NULL;
    virDomainDefPtr def = vm->def;
    qemuDomainObjPrivatePtr priv = vm->privateData;
    unsigned long long period = vm->def->cputune.emulator_period;
    long long quota = vm->def->cputune.emulator_quota;
    int rc;

    if ((period || quota) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    if (priv->cgroup == NULL)
        return 0; /* Not supported, so claim success */

    rc = virCgroupNewEmulator(priv->cgroup, true, &cgroup_emulator);
    if (rc < 0) {
        virReportSystemError(-rc,
                             _("Unable to create emulator cgroup for %s"),
                             vm->def->name);
        goto cleanup;
    }

    rc = virCgroupMoveTask(priv->cgroup, cgroup_emulator);
    if (rc < 0) {
        virReportSystemError(-rc,
                             _("Unable to move tasks from domain cgroup to "
                               "emulator cgroup for %s"),
                             vm->def->name);
        goto cleanup;
    }

    if (def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
        /* cpumap is owned by this function and freed on every exit path */
        if (!(cpumap = qemuPrepareCpumap(driver, nodemask)))
            goto cleanup;
        cpumask = cpumap;
    } else if (def->cputune.emulatorpin) {
        cpumask = def->cputune.emulatorpin->cpumask;
    } else if (def->cpumask) {
        cpumask = def->cpumask;
    }

    if (cpumask) {
        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
            rc = qemuSetupCgroupEmulatorPin(cgroup_emulator, cpumask);
            if (rc < 0)
                goto cleanup;
        }
        cpumask = NULL; /* sanity */
    }

    if (period || quota) {
        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
            if ((rc = qemuSetupCgroupVcpuBW(cgroup_emulator, period,
                                            quota)) < 0)
                goto cleanup;
        }
    }

    virCgroupFree(&cgroup_emulator);
    virBitmapFree(cpumap);
    return 0;

cleanup:
    virBitmapFree(cpumap);

    if (cgroup_emulator) {
        virCgroupRemove(cgroup_emulator);
        virCgroupFree(&cgroup_emulator);
    }

    /* Always signal failure explicitly. Returning rc here would report
     * success when qemuPrepareCpumap() fails, because rc then still holds
     * the non-negative result of the preceding virCgroupMoveTask() call. */
    return -1;
}
1134

1135 1136
/**
 * qemuRemoveCgroup:
 * @vm: domain object
 *
 * Removes the domain's cgroup via virCgroupRemove().
 *
 * Returns 0 when the domain has no cgroup (treated as success),
 * otherwise the return value of virCgroupRemove().
 */
int
qemuRemoveCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    /* Not supported, so claim success */
    if (!priv->cgroup)
        return 0;

    return virCgroupRemove(priv->cgroup);
}

1146 1147
/**
 * qemuAddToCgroup:
 * @vm: domain object
 *
 * Adds the calling process (as identified by getpid()) to the domain's
 * cgroup.
 *
 * Returns 0 on success or when the domain has no cgroup, -1 after
 * reporting an error if virCgroupAddTask() returns non-zero.
 */
int
qemuAddToCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int err;

    /* Not supported, so claim success */
    if (!priv->cgroup)
        return 0;

    err = virCgroupAddTask(priv->cgroup, getpid());
    if (err == 0)
        return 0;

    virReportSystemError(-err,
                         _("unable to add domain %s task %d to cgroup"),
                         vm->def->name, getpid());
    return -1;
}