/*
 * qemu_cgroup.c: QEMU cgroup management
 *
 * Copyright (C) 2006-2015 Red Hat, Inc.
 * Copyright (C) 2006 Daniel P. Berrange
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * Author: Daniel P. Berrange <berrange@redhat.com>
 */

#include <config.h>

#include "qemu_cgroup.h"
27
#include "qemu_domain.h"
28
#include "qemu_process.h"
29
#include "vircgroup.h"
30
#include "virlog.h"
31
#include "viralloc.h"
32
#include "virerror.h"
33
#include "domain_audit.h"
34
#include "virscsi.h"
35
#include "virstring.h"
36
#include "virfile.h"
37
#include "virtypedparam.h"
38
#include "virnuma.h"
39
#include "virsystemd.h"
40 41 42

#define VIR_FROM_THIS VIR_FROM_QEMU

43 44
VIR_LOG_INIT("qemu.qemu_cgroup");

45 46 47 48
/* Device nodes allowed for every domain when the driver configuration
 * does not provide its own cgroup device ACL.  NULL terminated. */
static const char *const defaultDeviceACL[] = {
    "/dev/null",
    "/dev/full",
    "/dev/zero",
    "/dev/random",
    "/dev/urandom",
    "/dev/ptmx",
    "/dev/kvm",
    "/dev/kqemu",
    "/dev/rtc",
    "/dev/hpet",
    "/dev/vfio/vfio",
    NULL,
};

/* Character device major numbers that are allowed as a whole
 * (any minor) for pty and sound devices respectively. */
#define DEVICE_PTY_MAJOR 136
#define DEVICE_SND_MAJOR 116

55

56
static int
57 58 59
qemuSetupImagePathCgroup(virDomainObjPtr vm,
                         const char *path,
                         bool readonly)
60
{
61
    qemuDomainObjPrivatePtr priv = vm->privateData;
62
    int perms = VIR_CGROUP_DEVICE_READ;
63
    int ret;
64

65
    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
66 67
        return 0;

68
    if (!readonly)
69
        perms |= VIR_CGROUP_DEVICE_WRITE;
70

71
    VIR_DEBUG("Allow path %s, perms: %s",
72
              path, virCgroupGetDevicePermsString(perms));
73

74
    ret = virCgroupAllowDevicePath(priv->cgroup, path, perms, true);
75

76
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path,
77 78
                             virCgroupGetDevicePermsString(perms),
                             ret == 0);
79 80

    return ret;
81 82 83
}


84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
/**
 * qemuSetupImageCgroupInternal:
 * @vm: domain whose devices cgroup is updated
 * @src: storage source to allow
 * @forceReadonly: grant read-only access even if @src is writable
 *
 * Allows the path of @src in the devices cgroup.  Sources without a
 * path, or which are not local storage, are skipped.
 *
 * Returns 0 when skipped, otherwise the result of
 * qemuSetupImagePathCgroup().
 */
static int
qemuSetupImageCgroupInternal(virDomainObjPtr vm,
                             virStorageSourcePtr src,
                             bool forceReadonly)
{
    if (src->path && virStorageSourceIsLocalStorage(src)) {
        bool readonly = src->readonly || forceReadonly;

        return qemuSetupImagePathCgroup(vm, src->path, readonly);
    }

    VIR_DEBUG("Not updating cgroups for disk path '%s', type: %s",
              NULLSTR(src->path), virStorageTypeToString(src->type));
    return 0;
}


99
int
100 101
qemuSetupImageCgroup(virDomainObjPtr vm,
                     virStorageSourcePtr src)
102
{
103
    return qemuSetupImageCgroupInternal(vm, src, false);
104 105 106 107 108 109 110
}


/**
 * qemuTeardownImageCgroup:
 * @vm: domain whose devices cgroup is updated
 * @src: storage source to deny
 *
 * Denies read, write and mknod access to the path of @src in the
 * devices cgroup and reports the outcome through
 * virDomainAuditCgroupPath().  Nothing is done when the devices
 * controller is not available, or when @src has no path or is not
 * local storage.
 *
 * Returns 0 when skipped, otherwise the result of
 * virCgroupDenyDevicePath().
 */
int
qemuTeardownImageCgroup(virDomainObjPtr vm,
                        virStorageSourcePtr src)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    const int perms = VIR_CGROUP_DEVICE_READ |
                      VIR_CGROUP_DEVICE_WRITE |
                      VIR_CGROUP_DEVICE_MKNOD;
    int rc;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (src->path && virStorageSourceIsLocalStorage(src)) {
        VIR_DEBUG("Deny path %s", src->path);

        rc = virCgroupDenyDevicePath(priv->cgroup, src->path, perms, true);

        virDomainAuditCgroupPath(vm, priv->cgroup, "deny", src->path,
                                 virCgroupGetDevicePermsString(perms),
                                 rc == 0);
        return rc;
    }

    VIR_DEBUG("Not updating cgroups for disk path '%s', type: %s",
              NULLSTR(src->path), virStorageTypeToString(src->type));
    return 0;
}


138 139 140
/**
 * qemuSetupDiskCgroup:
 * @vm: domain whose devices cgroup is updated
 * @disk: disk whose whole backing chain should be allowed
 *
 * Walks the backing chain of @disk starting at its top image.  Only
 * the top image may be set up read-write; every backing image below
 * it is forced to read-only.
 *
 * Returns 0 on success, -1 as soon as one image fails.
 */
int
qemuSetupDiskCgroup(virDomainObjPtr vm,
                    virDomainDiskDefPtr disk)
{
    virStorageSourcePtr img = disk->src;
    bool forceReadonly = false;

    while (img) {
        if (qemuSetupImageCgroupInternal(vm, img, forceReadonly) < 0)
            return -1;

        /* everything below the top level image is read-only */
        forceReadonly = true;
        img = img->backingStore;
    }

    return 0;
}


157 158 159
/**
 * qemuTeardownDiskCgroup:
 * @vm: domain whose devices cgroup is updated
 * @disk: disk whose whole backing chain should be denied
 *
 * Calls qemuTeardownImageCgroup() for every image in the backing
 * chain of @disk, starting at the top image.
 *
 * Returns 0 on success, -1 as soon as one image fails.
 */
int
qemuTeardownDiskCgroup(virDomainObjPtr vm,
                       virDomainDiskDefPtr disk)
{
    virStorageSourcePtr img = disk->src;

    while (img) {
        if (qemuTeardownImageCgroup(vm, img) < 0)
            return -1;

        img = img->backingStore;
    }

    return 0;
}

171

172
static int
173
qemuSetupChrSourceCgroup(virDomainObjPtr vm,
174
                         virDomainChrSourceDefPtr source)
175
{
176
    qemuDomainObjPrivatePtr priv = vm->privateData;
177
    int ret;
178

179
    if (source->type != VIR_DOMAIN_CHR_TYPE_DEV)
180 181
        return 0;

182
    VIR_DEBUG("Process path '%s' for device", source->data.file.path);
183

184
    ret = virCgroupAllowDevicePath(priv->cgroup, source->data.file.path,
185
                                   VIR_CGROUP_DEVICE_RW, false);
186
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
187
                             source->data.file.path, "rw", ret == 0);
188

189
    return ret;
190 191
}

192
static int
193
qemuSetupChardevCgroup(virDomainDefPtr def ATTRIBUTE_UNUSED,
194 195 196
                       virDomainChrDefPtr dev,
                       void *opaque)
{
197 198 199
    virDomainObjPtr vm = opaque;

    return qemuSetupChrSourceCgroup(vm, &dev->source);
200 201 202 203
}


static int
204
qemuSetupTPMCgroup(virDomainObjPtr vm)
205
{
206
    int ret = 0;
207
    virDomainTPMDefPtr dev = vm->def->tpm;
208 209 210

    switch (dev->type) {
    case VIR_DOMAIN_TPM_TYPE_PASSTHROUGH:
211
        ret = qemuSetupChrSourceCgroup(vm, &dev->data.passthrough.source);
212 213 214 215 216
        break;
    case VIR_DOMAIN_TPM_TYPE_LAST:
        break;
    }

217
    return ret;
218 219
}

220

221 222 223 224 225 226 227 228 229 230 231
/**
 * qemuSetupInputCgroup:
 * @vm: domain whose devices cgroup is updated
 * @dev: input device definition
 *
 * Allows read-write access to the evdev node of a passthrough input
 * device.  Input devices of any other type are ignored.
 *
 * Returns 0 when ignored, otherwise the result of
 * virCgroupAllowDevicePath().
 */
static int
qemuSetupInputCgroup(virDomainObjPtr vm,
                     virDomainInputDefPtr dev)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int rc;

    if (dev->type != VIR_DOMAIN_INPUT_TYPE_PASSTHROUGH)
        return 0;

    VIR_DEBUG("Process path '%s' for input device", dev->source.evdev);

    rc = virCgroupAllowDevicePath(priv->cgroup, dev->source.evdev,
                                  VIR_CGROUP_DEVICE_RW, false);
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
                             dev->source.evdev, "rw", rc == 0);

    return rc;
}


241
static int
242
qemuSetupHostUSBDeviceCgroup(virUSBDevicePtr dev ATTRIBUTE_UNUSED,
243 244
                             const char *path,
                             void *opaque)
245
{
246 247
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
248
    int ret;
249 250

    VIR_DEBUG("Process path '%s' for USB device", path);
251
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
252
                                   VIR_CGROUP_DEVICE_RW, false);
253
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path, "rw", ret == 0);
254

255
    return ret;
256 257
}

258
static int
259
qemuSetupHostSCSIDeviceCgroup(virSCSIDevicePtr dev ATTRIBUTE_UNUSED,
260 261 262 263 264
                              const char *path,
                              void *opaque)
{
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
265
    int ret;
266 267 268

    VIR_DEBUG("Process path '%s' for SCSI device", path);

269 270 271
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   virSCSIDeviceGetReadonly(dev) ?
                                   VIR_CGROUP_DEVICE_READ :
272
                                   VIR_CGROUP_DEVICE_RW, false);
273 274

    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path,
275
                             virSCSIDeviceGetReadonly(dev) ? "r" : "rw", ret == 0);
276

277
    return ret;
278
}
279

280
int
281
qemuSetupHostdevCgroup(virDomainObjPtr vm,
282 283 284 285
                       virDomainHostdevDefPtr dev)
{
    int ret = -1;
    qemuDomainObjPrivatePtr priv = vm->privateData;
286
    virDomainHostdevSubsysUSBPtr usbsrc = &dev->source.subsys.u.usb;
287
    virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci;
288
    virDomainHostdevSubsysSCSIPtr scsisrc = &dev->source.subsys.u.scsi;
289
    virPCIDevicePtr pci = NULL;
290
    virUSBDevicePtr usb = NULL;
291
    virSCSIDevicePtr scsi = NULL;
292 293 294 295 296 297 298 299 300 301 302 303 304 305
    char *path = NULL;

    /* currently this only does something for PCI devices using vfio
     * for device assignment, but it is called for *all* hostdev
     * devices.
     */

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS) {

        switch (dev->source.subsys.type) {
        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI:
306
            if (pcisrc->backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
307
                int rv;
308

309 310 311 312
                pci = virPCIDeviceNew(pcisrc->addr.domain,
                                      pcisrc->addr.bus,
                                      pcisrc->addr.slot,
                                      pcisrc->addr.function);
313 314 315
                if (!pci)
                    goto cleanup;

316
                if (!(path = virPCIDeviceGetIOMMUGroupDev(pci)))
317 318 319
                    goto cleanup;

                VIR_DEBUG("Cgroup allow %s for PCI device assignment", path);
320
                rv = virCgroupAllowDevicePath(priv->cgroup, path,
321
                                              VIR_CGROUP_DEVICE_RW, false);
322
                virDomainAuditCgroupPath(vm, priv->cgroup,
323 324
                                         "allow", path, "rw", rv == 0);
                if (rv < 0)
325 326 327
                    goto cleanup;
            }
            break;
328 329 330 331 332 333 334 335

        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB:
            /* NB: hostdev->missing wasn't previously checked in the
             * case of hotplug, only when starting a domain. Now it is
             * always checked, and the cgroup setup skipped if true.
             */
            if (dev->missing)
                break;
336
            if ((usb = virUSBDeviceNew(usbsrc->bus, usbsrc->device,
337 338 339 340
                                       NULL)) == NULL) {
                goto cleanup;
            }

341
            /* oddly, qemuSetupHostUSBDeviceCgroup doesn't ever
342 343
             * reference the usb object we just created
             */
344
            if (virUSBDeviceFileIterate(usb, qemuSetupHostUSBDeviceCgroup,
345 346 347 348
                                        vm) < 0) {
                goto cleanup;
            }
            break;
349

350
        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_SCSI: {
351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369
            if (scsisrc->protocol ==
                VIR_DOMAIN_HOSTDEV_SCSI_PROTOCOL_TYPE_ISCSI) {
                virDomainHostdevSubsysSCSIiSCSIPtr iscsisrc = &scsisrc->u.iscsi;
                /* Follow qemuSetupDiskCgroup() and qemuSetImageCgroupInternal()
                 * which does nothing for non local storage
                 */
                VIR_DEBUG("Not updating cgroups for hostdev iSCSI path '%s'",
                          iscsisrc->path);
            } else {
                virDomainHostdevSubsysSCSIHostPtr scsihostsrc =
                    &scsisrc->u.host;
                if ((scsi = virSCSIDeviceNew(NULL,
                                             scsihostsrc->adapter,
                                             scsihostsrc->bus,
                                             scsihostsrc->target,
                                             scsihostsrc->unit,
                                             dev->readonly,
                                             dev->shareable)) == NULL)
                    goto cleanup;
370

371 372 373 374 375
                if (virSCSIDeviceFileIterate(scsi,
                                             qemuSetupHostSCSIDeviceCgroup,
                                             vm) < 0)
                    goto cleanup;
            }
376 377
            break;
        }
378

379 380 381 382 383 384
        default:
            break;
        }
    }

    ret = 0;
385
 cleanup:
386
    virPCIDeviceFree(pci);
387
    virUSBDeviceFree(usb);
388
    virSCSIDeviceFree(scsi);
389 390 391 392 393 394 395 396 397 398
    VIR_FREE(path);
    return ret;
}

/**
 * qemuTeardownHostdevCgroup:
 * @vm: domain whose devices cgroup is updated
 * @dev: host device definition
 *
 * Denies the IOMMU group device of a subsystem mode PCI host device
 * that uses the VFIO backend.  It is called for all hostdev devices;
 * every other kind of device (USB included) has nothing to tear down
 * here.
 *
 * Returns 0 on success or when there is nothing to do (including when
 * the devices controller is unavailable), -1 on failure.
 */
int
qemuTeardownHostdevCgroup(virDomainObjPtr vm,
                          virDomainHostdevDefPtr dev)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci;
    virPCIDevicePtr pci = NULL;
    char *iommuDev = NULL;
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS &&
        dev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI &&
        pcisrc->backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
        int rv;

        pci = virPCIDeviceNew(pcisrc->addr.domain,
                              pcisrc->addr.bus,
                              pcisrc->addr.slot,
                              pcisrc->addr.function);
        if (!pci)
            goto cleanup;

        iommuDev = virPCIDeviceGetIOMMUGroupDev(pci);
        if (!iommuDev)
            goto cleanup;

        VIR_DEBUG("Cgroup deny %s for PCI device assignment", iommuDev);
        rv = virCgroupDenyDevicePath(priv->cgroup, iommuDev,
                                     VIR_CGROUP_DEVICE_RWM, false);
        virDomainAuditCgroupPath(vm, priv->cgroup,
                                 "deny", iommuDev, "rwm", rv == 0);
        if (rv < 0)
            goto cleanup;
    }

    ret = 0;
 cleanup:
    virPCIDeviceFree(pci);
    VIR_FREE(iommuDev);
    return ret;
}

452 453 454 455
/**
 * qemuSetupBlkioCgroup:
 * @vm: domain whose blkio cgroup is configured
 *
 * Applies the block I/O tuning from the domain definition: the global
 * weight and, for each configured device, its weight, read/write IOPS
 * and read/write bytes-per-second limits.  Settings whose value is 0
 * are left untouched.  After each per-device value is written it is
 * read back into the definition.
 *
 * When the blkio controller is unavailable this is an error only if
 * the definition actually requests some block I/O tuning.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
qemuSetupBlkioCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    size_t idx;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_BLKIO)) {
        if (!vm->def->blkio.weight && !vm->def->blkio.ndevices)
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Block I/O tuning is not available on this host"));
        return -1;
    }

    if (vm->def->blkio.weight != 0) {
        if (virCgroupSetBlkioWeight(priv->cgroup, vm->def->blkio.weight) < 0)
            return -1;
    }

    for (idx = 0; idx < vm->def->blkio.ndevices; idx++) {
        virBlkioDevicePtr tune = &vm->def->blkio.devices[idx];

        if (tune->weight) {
            if (virCgroupSetBlkioDeviceWeight(priv->cgroup, tune->path,
                                              tune->weight) < 0)
                return -1;
            if (virCgroupGetBlkioDeviceWeight(priv->cgroup, tune->path,
                                              &tune->weight) < 0)
                return -1;
        }

        if (tune->riops) {
            if (virCgroupSetBlkioDeviceReadIops(priv->cgroup, tune->path,
                                                tune->riops) < 0)
                return -1;
            if (virCgroupGetBlkioDeviceReadIops(priv->cgroup, tune->path,
                                                &tune->riops) < 0)
                return -1;
        }

        if (tune->wiops) {
            if (virCgroupSetBlkioDeviceWriteIops(priv->cgroup, tune->path,
                                                 tune->wiops) < 0)
                return -1;
            if (virCgroupGetBlkioDeviceWriteIops(priv->cgroup, tune->path,
                                                 &tune->wiops) < 0)
                return -1;
        }

        if (tune->rbps) {
            if (virCgroupSetBlkioDeviceReadBps(priv->cgroup, tune->path,
                                               tune->rbps) < 0)
                return -1;
            if (virCgroupGetBlkioDeviceReadBps(priv->cgroup, tune->path,
                                               &tune->rbps) < 0)
                return -1;
        }

        if (tune->wbps) {
            if (virCgroupSetBlkioDeviceWriteBps(priv->cgroup, tune->path,
                                                tune->wbps) < 0)
                return -1;
            if (virCgroupGetBlkioDeviceWriteBps(priv->cgroup, tune->path,
                                                &tune->wbps) < 0)
                return -1;
        }
    }

    return 0;
}

516

517 518 519 520 521
/**
 * qemuSetupMemoryCgroup:
 * @vm: domain whose memory cgroup is configured
 *
 * Applies the hard, soft and swap hard memory limits from the domain
 * definition, each one only if it is set.
 *
 * When the memory controller is unavailable this is an error only if
 * at least one of those limits is set.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
qemuSetupMemoryCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) {
        if (!virMemoryLimitIsSet(vm->def->mem.hard_limit) &&
            !virMemoryLimitIsSet(vm->def->mem.soft_limit) &&
            !virMemoryLimitIsSet(vm->def->mem.swap_hard_limit))
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Memory cgroup is not available on this host"));
        return -1;
    }

    if (virMemoryLimitIsSet(vm->def->mem.hard_limit) &&
        virCgroupSetMemoryHardLimit(priv->cgroup,
                                    vm->def->mem.hard_limit) < 0)
        return -1;

    if (virMemoryLimitIsSet(vm->def->mem.soft_limit) &&
        virCgroupSetMemorySoftLimit(priv->cgroup,
                                    vm->def->mem.soft_limit) < 0)
        return -1;

    if (virMemoryLimitIsSet(vm->def->mem.swap_hard_limit) &&
        virCgroupSetMemSwapHardLimit(priv->cgroup,
                                     vm->def->mem.swap_hard_limit) < 0)
        return -1;

    return 0;
}


550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568
/**
 * qemuSetupFirmwareCgroup:
 * @vm: domain whose devices cgroup is updated
 *
 * Allows the firmware loader image and its NVRAM file, when the
 * domain definition has them.  The loader image is set up read-only
 * only if its readonly attribute is VIR_TRISTATE_BOOL_YES; the NVRAM
 * file is always set up read-write.
 *
 * Returns 0 on success or when no loader is configured, -1 on failure.
 */
static int
qemuSetupFirmwareCgroup(virDomainObjPtr vm)
{
    if (!vm->def->os.loader)
        return 0;

    if (vm->def->os.loader->path) {
        bool readonly = vm->def->os.loader->readonly == VIR_TRISTATE_BOOL_YES;

        if (qemuSetupImagePathCgroup(vm, vm->def->os.loader->path,
                                     readonly) < 0)
            return -1;
    }

    if (vm->def->os.loader->nvram) {
        if (qemuSetupImagePathCgroup(vm, vm->def->os.loader->nvram,
                                     false) < 0)
            return -1;
    }

    return 0;
}


569 570 571 572 573 574 575
/**
 * qemuSetupDevicesCgroup:
 * @driver: QEMU driver, used to fetch the driver configuration
 * @vm: domain whose devices cgroup is configured
 *
 * Builds the device whitelist of the domain: first denies all devices
 * and then allows, in this order, the firmware images, the disks, the
 * pty major, the sound major (depending on the graphics setup and the
 * driver configuration), the entries of the device ACL, character
 * devices, the TPM, host devices, input devices and RNG backends.
 *
 * If denying all devices fails with EPERM the error is reset, a
 * warning is logged and whitelisting is skipped altogether.
 *
 * Returns 0 on success or when whitelisting is skipped, -1 on failure.
 */
static int
qemuSetupDevicesCgroup(virQEMUDriverPtr driver,
                       virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = NULL;
    const char *const *deviceACL = NULL;
    const char *const *entry;
    bool allowSound;
    int rv = -1;
    int ret = -1;
    size_t i;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    rv = virCgroupDenyAllDevices(priv->cgroup);
    virDomainAuditCgroup(vm, priv->cgroup, "deny", "all", rv == 0);
    if (rv < 0) {
        if (!virLastErrorIsSystemErrno(EPERM))
            goto cleanup;

        virResetLastError();
        VIR_WARN("Group devices ACL is not accessible, disabling whitelisting");
        return 0;
    }

    if (qemuSetupFirmwareCgroup(vm) < 0)
        goto cleanup;

    for (i = 0; i < vm->def->ndisks; i++) {
        if (qemuSetupDiskCgroup(vm, vm->def->disks[i]) < 0)
            goto cleanup;
    }

    rv = virCgroupAllowDevice(priv->cgroup, 'c', DEVICE_PTY_MAJOR, -1,
                              VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_PTY_MAJOR,
                              "pty", "rw", rv == 0);
    if (rv < 0)
        goto cleanup;

    cfg = virQEMUDriverGetConfig(driver);
    if (cfg->cgroupDeviceACL)
        deviceACL = (const char *const *)cfg->cgroupDeviceACL;
    else
        deviceACL = defaultDeviceACL;

    /* Host audio is allowed for a domain with sound devices when either
     * it has no graphics and the configuration permits that, or its
     * first graphics device is SDL, or VNC with host audio permitted. */
    if (!vm->def->nsounds) {
        allowSound = false;
    } else if (!vm->def->ngraphics && cfg->nogfxAllowHostAudio) {
        allowSound = true;
    } else if (!vm->def->graphics) {
        allowSound = false;
    } else if (vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_VNC &&
               cfg->vncAllowHostAudio) {
        allowSound = true;
    } else {
        allowSound = (vm->def->graphics[0]->type ==
                      VIR_DOMAIN_GRAPHICS_TYPE_SDL);
    }

    if (allowSound) {
        rv = virCgroupAllowDevice(priv->cgroup, 'c', DEVICE_SND_MAJOR, -1,
                                  VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_SND_MAJOR,
                                  "sound", "rw", rv == 0);
        if (rv < 0)
            goto cleanup;
    }

    for (entry = deviceACL; *entry != NULL; entry++) {
        if (!virFileExists(*entry)) {
            VIR_DEBUG("Ignoring non-existent device %s", *entry);
            continue;
        }

        rv = virCgroupAllowDevicePath(priv->cgroup, *entry,
                                      VIR_CGROUP_DEVICE_RW, false);
        virDomainAuditCgroupPath(vm, priv->cgroup, "allow", *entry,
                                 "rw", rv == 0);
        /* a failure with ENOENT is tolerated */
        if (rv < 0 && !virLastErrorIsSystemErrno(ENOENT))
            goto cleanup;
    }

    if (virDomainChrDefForeach(vm->def, true,
                               qemuSetupChardevCgroup, vm) < 0)
        goto cleanup;

    if (vm->def->tpm) {
        if (qemuSetupTPMCgroup(vm) < 0)
            goto cleanup;
    }

    for (i = 0; i < vm->def->nhostdevs; i++) {
        if (qemuSetupHostdevCgroup(vm, vm->def->hostdevs[i]) < 0)
            goto cleanup;
    }

    for (i = 0; i < vm->def->ninputs; i++) {
        if (qemuSetupInputCgroup(vm, vm->def->inputs[i]) < 0)
            goto cleanup;
    }

    for (i = 0; i < vm->def->nrngs; i++) {
        if (vm->def->rngs[i]->backend != VIR_DOMAIN_RNG_BACKEND_RANDOM)
            continue;

        VIR_DEBUG("Setting Cgroup ACL for RNG device");
        rv = virCgroupAllowDevicePath(priv->cgroup,
                                      vm->def->rngs[i]->source.file,
                                      VIR_CGROUP_DEVICE_RW, false);
        virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
                                 vm->def->rngs[i]->source.file,
                                 "rw", rv == 0);
        /* a failure with ENOENT is tolerated */
        if (rv < 0 && !virLastErrorIsSystemErrno(ENOENT))
            goto cleanup;
    }

    ret = 0;
 cleanup:
    virObjectUnref(cfg);
    return ret;
}


684
int
685
qemuSetupCpusetMems(virDomainObjPtr vm)
686
{
687
    virCgroupPtr cgroup_temp = NULL;
688
    qemuDomainObjPrivatePtr priv = vm->privateData;
689
    virDomainNumatuneMemMode mode;
690
    char *mem_mask = NULL;
691 692 693 694 695
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

696 697
    if (virDomainNumatuneGetMode(vm->def->numa, -1, &mode) < 0 ||
        mode != VIR_DOMAIN_NUMATUNE_MEM_STRICT)
698 699
        return 0;

700
    if (virDomainNumatuneMaybeFormatNodeset(vm->def->numa,
701
                                            priv->autoNodeset,
702
                                            &mem_mask, -1) < 0)
703
        goto cleanup;
704

705
    if (mem_mask)
J
John Ferlan 已提交
706 707
        if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
                               false, &cgroup_temp) < 0 ||
708
            virCgroupSetCpusetMems(cgroup_temp, mem_mask) < 0)
709
            goto cleanup;
710

711 712 713
    ret = 0;
 cleanup:
    VIR_FREE(mem_mask);
714
    virCgroupFree(&cgroup_temp);
715 716 717 718 719
    return ret;
}


static int
720
qemuSetupCpusetCgroup(virDomainObjPtr vm)
721 722 723 724 725 726
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

727 728 729
    if (virCgroupSetCpusetMemoryMigrate(priv->cgroup, true) < 0)
        return -1;

730
    return 0;
731 732 733
}


734
static int
735 736
qemuSetupCpuCgroup(virQEMUDriverPtr driver,
                   virDomainObjPtr vm)
737 738
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
739 740 741 742
    virObjectEventPtr event = NULL;
    virTypedParameterPtr eventParams = NULL;
    int eventNparams = 0;
    int eventMaxparams = 0;
743 744

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
745
       if (vm->def->cputune.sharesSpecified) {
746 747 748 749 750 751 752 753
           virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                          _("CPU tuning is not available on this host"));
           return -1;
       } else {
           return 0;
       }
    }

754 755 756 757 758 759 760
    if (vm->def->cputune.sharesSpecified) {
        unsigned long long val;
        if (virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares) < 0)
            return -1;

        if (virCgroupGetCpuShares(priv->cgroup, &val) < 0)
            return -1;
761 762 763 764
        if (vm->def->cputune.shares != val) {
            vm->def->cputune.shares = val;
            if (virTypedParamsAddULLong(&eventParams, &eventNparams,
                                        &eventMaxparams,
765
                                        VIR_DOMAIN_TUNABLE_CPU_CPU_SHARES,
766 767 768 769 770 771
                                        val) < 0)
                return -1;

            event = virDomainEventTunableNewFromObj(vm, eventParams, eventNparams);
        }

772
        qemuDomainEventQueue(driver, event);
773
    }
774 775 776 777 778

    return 0;
}


779
static int
780
qemuInitCgroup(virQEMUDriverPtr driver,
781 782 783
               virDomainObjPtr vm,
               size_t nnicindexes,
               int *nicindexes)
784
{
785
    int ret = -1;
786 787 788
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);

789
    if (!virQEMUDriverIsPrivileged(driver))
790 791
        goto done;

792 793 794
    if (!virCgroupAvailable())
        goto done;

795 796
    virCgroupFree(&priv->cgroup);

797
    if (!vm->def->resource) {
798 799
        virDomainResourceDefPtr res;

800
        if (VIR_ALLOC(res) < 0)
801
            goto cleanup;
802

803
        if (VIR_STRDUP(res->partition, "/machine") < 0) {
804 805 806 807 808
            VIR_FREE(res);
            goto cleanup;
        }

        vm->def->resource = res;
809 810
    }

811 812 813 814 815 816
    if (vm->def->resource->partition[0] != '/') {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Resource partition '%s' must start with '/'"),
                       vm->def->resource->partition);
        goto cleanup;
    }
817

818 819 820 821 822 823 824 825 826 827 828 829
    /*
     * We need to do this because of systemd-machined, because
     * CreateMachine requires the name to be a valid hostname.
     */
    priv->machineName = virSystemdMakeMachineName("qemu",
                                                  vm->def->id,
                                                  vm->def->name,
                                                  virQEMUDriverIsPrivileged(driver));
    if (!priv->machineName)
        goto cleanup;

    if (virCgroupNewMachine(priv->machineName,
830 831 832 833 834
                            "qemu",
                            vm->def->uuid,
                            NULL,
                            vm->pid,
                            false,
835
                            nnicindexes, nicindexes,
836 837 838
                            vm->def->resource->partition,
                            cfg->cgroupControllers,
                            &priv->cgroup) < 0) {
839 840
        if (virCgroupNewIgnoreError())
            goto done;
841

842 843
        goto cleanup;
    }
844

845
 done:
846
    ret = 0;
847
 cleanup:
848 849 850
    virObjectUnref(cfg);
    return ret;
}
851

852 853 854
static void
qemuRestoreCgroupState(virDomainObjPtr vm)
{
855
    char *mem_mask = NULL;
856
    char *nodeset = NULL;
857 858
    int empty = -1;
    qemuDomainObjPrivatePtr priv = vm->privateData;
859
    size_t i = 0;
860
    virBitmapPtr all_nodes;
861
    virCgroupPtr cgroup_temp = NULL;
862 863 864 865 866 867 868 869

    if (!(all_nodes = virNumaGetHostNodeset()))
        goto error;

    if (!(mem_mask = virBitmapFormat(all_nodes)))
        goto error;

    if ((empty = virCgroupHasEmptyTasks(priv->cgroup,
870
                                        VIR_CGROUP_CONTROLLER_CPUSET)) <= 0)
871 872 873 874 875
        goto error;

    if (virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0)
        goto error;

876 877 878 879 880 881
    for (i = 0; i < virDomainDefGetVcpusMax(vm->def); i++) {
        virDomainVcpuInfoPtr vcpu = virDomainDefGetVcpu(vm->def, i);

        if (!vcpu->online)
            continue;

J
John Ferlan 已提交
882 883
        if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_VCPU, i,
                               false, &cgroup_temp) < 0 ||
884 885 886 887 888
            virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
            virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
            virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
            goto cleanup;

889
        VIR_FREE(nodeset);
890 891 892
        virCgroupFree(&cgroup_temp);
    }

893 894 895
    for (i = 0; i < vm->def->niothreadids; i++) {
        if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_IOTHREAD,
                               vm->def->iothreadids[i]->iothread_id,
J
John Ferlan 已提交
896
                               false, &cgroup_temp) < 0 ||
897 898 899 900 901
            virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
            virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
            virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
            goto cleanup;

902
        VIR_FREE(nodeset);
903 904 905
        virCgroupFree(&cgroup_temp);
    }

J
John Ferlan 已提交
906 907
    if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
                           false, &cgroup_temp) < 0 ||
908 909 910 911 912
        virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
        virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
        virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
        goto cleanup;

913 914
 cleanup:
    VIR_FREE(mem_mask);
915
    VIR_FREE(nodeset);
916
    virBitmapFree(all_nodes);
917
    virCgroupFree(&cgroup_temp);
918 919 920 921 922 923 924
    return;

 error:
    virResetLastError();
    VIR_DEBUG("Couldn't restore cgroups to meaningful state");
    goto cleanup;
}
925 926 927 928 929 930 931 932 933

/*
 * qemuConnectCgroup:
 * @driver: QEMU driver
 * @vm: domain object of an already running QEMU process
 *
 * Detects the cgroup of the process identified by vm->pid and stores
 * it in the domain's private data, replacing any cgroup held there.
 * The machine name is looked up by PID as well; failing to obtain it
 * is not fatal (the error is reset and the name stays NULL).
 * Finally qemuRestoreCgroupState() is run on the domain.
 *
 * Returns 0 on success, which includes the cases where the driver is
 * not privileged or cgroups are not available (nothing is done then),
 * and -1 if virCgroupNewDetectMachine() fails.
 */
int
qemuConnectCgroup(virQEMUDriverPtr driver,
                  virDomainObjPtr vm)
{
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int ret = -1;

    if (!virQEMUDriverIsPrivileged(driver))
        goto done;

    if (!virCgroupAvailable())
        goto done;

    virCgroupFree(&priv->cgroup);

    if (virCgroupNewDetectMachine(vm->def->name,
                                  "qemu",
                                  vm->def->id,
                                  virQEMUDriverIsPrivileged(driver),
                                  vm->pid,
                                  cfg->cgroupControllers,
                                  &priv->cgroup) < 0)
        goto cleanup;

    /* Release a name possibly left over from an earlier setup so that
     * the assignment below cannot leak it. */
    VIR_FREE(priv->machineName);
    priv->machineName = virSystemdGetMachineNameByPID(vm->pid);
    if (!priv->machineName)
        virResetLastError();

    qemuRestoreCgroupState(vm);

 done:
    ret = 0;
 cleanup:
    virObjectUnref(cfg);
    return ret;
}

964 965
int
qemuSetupCgroup(virQEMUDriverPtr driver,
966 967 968
                virDomainObjPtr vm,
                size_t nnicindexes,
                int *nicindexes)
969
{
970
    qemuDomainObjPrivatePtr priv = vm->privateData;
971
    int ret = -1;
972

973 974 975 976 977 978
    if (!vm->pid) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Cannot setup cgroups until process is started"));
        return -1;
    }

979
    if (qemuInitCgroup(driver, vm, nnicindexes, nicindexes) < 0)
980
        return -1;
981

982
    if (!priv->cgroup)
983
        return 0;
984

985 986
    if (qemuSetupDevicesCgroup(driver, vm) < 0)
        goto cleanup;
987

988 989
    if (qemuSetupBlkioCgroup(vm) < 0)
        goto cleanup;
990

991 992
    if (qemuSetupMemoryCgroup(vm) < 0)
        goto cleanup;
993

994
    if (qemuSetupCpuCgroup(driver, vm) < 0)
995
        goto cleanup;
996

997
    if (qemuSetupCpusetCgroup(vm) < 0)
998
        goto cleanup;
999

1000
    ret = 0;
1001
 cleanup:
1002
    return ret;
1003 1004
}

1005 1006 1007 1008
/*
 * qemuSetupCgroupVcpuBW:
 * @cgroup: cgroup to tune
 * @period: CFS period to set, 0 to leave the period alone
 * @quota: CFS quota to set, 0 to leave the quota alone
 *
 * Applies @period and then @quota to @cgroup.  When a new period was
 * applied and setting the quota fails afterwards, the period that was
 * in effect before is written back; the error raised by the failed
 * quota update is preserved across that rollback.
 *
 * Returns 0 on success (including when both values are 0),
 * -1 on failure.
 */
int
qemuSetupCgroupVcpuBW(virCgroupPtr cgroup,
                      unsigned long long period,
                      long long quota)
{
    unsigned long long prev_period = 0;
    virErrorPtr orig_err;

    if (!period && !quota)
        return 0;

    if (period != 0) {
        /* remember the current period so it can be rolled back */
        if (virCgroupGetCpuCfsPeriod(cgroup, &prev_period) < 0 ||
            virCgroupSetCpuCfsPeriod(cgroup, period) < 0)
            return -1;
    }

    if (quota == 0 || virCgroupSetCpuCfsQuota(cgroup, quota) >= 0)
        return 0;

    /* Setting the quota failed; without a period change there is
     * nothing to undo. */
    if (period == 0)
        return -1;

    orig_err = virSaveLastError();
    ignore_value(virCgroupSetCpuCfsPeriod(cgroup, prev_period));
    if (orig_err) {
        virSetError(orig_err);
        virFreeError(orig_err);
    }

    return -1;
}

1043

1044
int
1045 1046
qemuSetupCgroupCpusetCpus(virCgroupPtr cgroup,
                          virBitmapPtr cpumask)
1047
{
1048
    int ret = -1;
1049 1050
    char *new_cpus = NULL;

1051
    if (!(new_cpus = virBitmapFormat(cpumask)))
1052 1053
        goto cleanup;

1054
    if (virCgroupSetCpusetCpus(cgroup, new_cpus) < 0)
1055 1056
        goto cleanup;

1057
    ret = 0;
1058
 cleanup:
1059
    VIR_FREE(new_cpus);
1060
    return ret;
1061 1062
}

1063

1064
int
1065
qemuSetupCgroupForEmulator(virDomainObjPtr vm)
1066
{
1067
    virBitmapPtr cpumask = NULL;
1068
    virCgroupPtr cgroup_emulator = NULL;
1069
    virDomainDefPtr def = vm->def;
1070
    qemuDomainObjPrivatePtr priv = vm->privateData;
1071 1072
    unsigned long long period = vm->def->cputune.emulator_period;
    long long quota = vm->def->cputune.emulator_quota;
1073

1074
    if ((period || quota) &&
1075
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
1076 1077 1078 1079 1080
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

1081 1082 1083 1084 1085 1086 1087 1088 1089
    /*
     * If CPU cgroup controller is not initialized here, then we need
     * neither period nor quota settings.  And if CPUSET controller is
     * not initialized either, then there's nothing to do anyway.
     */
    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

J
John Ferlan 已提交
1090 1091
    if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
                           true, &cgroup_emulator) < 0)
1092 1093
        goto cleanup;

1094
    if (virCgroupMoveTask(priv->cgroup, cgroup_emulator) < 0)
1095
        goto cleanup;
1096

1097
    if (def->cputune.emulatorpin)
1098
        cpumask = def->cputune.emulatorpin;
1099 1100
    else if (def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)
        cpumask = priv->autoCpuset;
1101
    else if (def->cpumask)
1102 1103 1104
        cpumask = def->cpumask;

    if (cpumask) {
1105
        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET) &&
1106
            qemuSetupCgroupCpusetCpus(cgroup_emulator, cpumask) < 0)
1107
            goto cleanup;
H
Hu Tao 已提交
1108
    }
1109

1110
    if (period || quota) {
1111 1112 1113 1114
        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
            qemuSetupCgroupVcpuBW(cgroup_emulator, period,
                                  quota) < 0)
            goto cleanup;
1115 1116
    }

1117 1118 1119
    virCgroupFree(&cgroup_emulator);
    return 0;

1120
 cleanup:
1121 1122 1123 1124 1125
    if (cgroup_emulator) {
        virCgroupRemove(cgroup_emulator);
        virCgroupFree(&cgroup_emulator);
    }

1126
    return -1;
1127
}
1128

1129

1130
int
1131
qemuRemoveCgroup(virDomainObjPtr vm)
1132
{
1133
    qemuDomainObjPrivatePtr priv = vm->privateData;
1134

1135
    if (priv->cgroup == NULL)
1136 1137
        return 0; /* Not supported, so claim success */

1138
    if (virCgroupTerminateMachine(priv->machineName) < 0) {
1139 1140 1141 1142
        if (!virCgroupNewIgnoreError())
            VIR_DEBUG("Failed to terminate cgroup for %s", vm->def->name);
    }

1143
    return virCgroupRemove(priv->cgroup);
1144
}