qemu_cgroup.c 33.0 KB
Newer Older
1 2 3
/*
 * qemu_cgroup.c: QEMU cgroup management
 *
 * Copyright (C) 2006-2015 Red Hat, Inc.
 * Copyright (C) 2006 Daniel P. Berrange
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * Author: Daniel P. Berrange <berrange@redhat.com>
 */

#include <config.h>

#include "qemu_cgroup.h"
27
#include "qemu_domain.h"
28
#include "qemu_process.h"
29
#include "vircgroup.h"
30
#include "virlog.h"
31
#include "viralloc.h"
32
#include "virerror.h"
33
#include "domain_audit.h"
34
#include "virscsi.h"
35
#include "virstring.h"
36
#include "virfile.h"
37
#include "virtypedparam.h"
38
#include "virnuma.h"
39
#include "virsystemd.h"
40 41 42

#define VIR_FROM_THIS VIR_FROM_QEMU

43 44
VIR_LOG_INIT("qemu.qemu_cgroup");

45 46 47 48
/* Fallback list of device nodes allowed read-write in a guest's devices
 * cgroup; used when the driver configuration supplies no ACL of its own.
 * The list is NULL-terminated. */
static const char *const defaultDeviceACL[] = {
    "/dev/null",
    "/dev/full",
    "/dev/zero",
    "/dev/random",
    "/dev/urandom",
    "/dev/ptmx",
    "/dev/kvm",
    "/dev/kqemu",
    "/dev/rtc",
    "/dev/hpet",
    "/dev/vfio/vfio",
    NULL,
};

/* Character-device major numbers allowed as a whole ("pty" and "sound"). */
#define DEVICE_PTY_MAJOR 136
#define DEVICE_SND_MAJOR 116

55 56 57 58 59
/*
 * qemuSetImageCgroupInternal:
 * @vm: domain whose devices cgroup is updated
 * @src: storage source to grant or revoke
 * @deny: true to revoke access to @src, false to grant it
 * @forceReadonly: when granting, never add write permission
 *
 * Allows or denies @src->path in the domain's devices cgroup and records
 * the attempt in the audit log. Nothing is done when the devices
 * controller is absent, or when @src has no path or is not local storage.
 *
 * Returns 0 on success, when nothing was done, or when the cgroup call
 * failed with EACCES (that error is reset); otherwise the result of the
 * cgroup allow/deny call.
 */
static int
qemuSetImageCgroupInternal(virDomainObjPtr vm,
                           virStorageSourcePtr src,
                           bool deny,
                           bool forceReadonly)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    const char *action;
    int perms;
    int rc;

    if (!virCgroupHasController(priv->cgroup,
                                VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (!src->path || !virStorageSourceIsLocalStorage(src)) {
        VIR_DEBUG("Not updating cgroups for disk path '%s', type: %s",
                  NULLSTR(src->path), virStorageTypeToString(src->type));
        return 0;
    }

    if (deny) {
        /* Revoke everything, not just what was granted earlier. */
        action = "deny";
        perms = VIR_CGROUP_DEVICE_READ |
                VIR_CGROUP_DEVICE_WRITE |
                VIR_CGROUP_DEVICE_MKNOD;

        VIR_DEBUG("Deny path %s", src->path);

        rc = virCgroupDenyDevicePath(priv->cgroup, src->path, perms);
    } else {
        action = "allow";
        perms = VIR_CGROUP_DEVICE_READ;
        if (!(src->readonly || forceReadonly))
            perms |= VIR_CGROUP_DEVICE_WRITE;

        VIR_DEBUG("Allow path %s, perms: %s",
                  src->path, virCgroupGetDevicePermsString(perms));

        rc = virCgroupAllowDevicePath(priv->cgroup, src->path, perms);
    }

    virDomainAuditCgroupPath(vm, priv->cgroup,
                             action,
                             src->path,
                             virCgroupGetDevicePermsString(perms),
                             rc == 0);

    if (rc >= 0)
        return rc;

    /* Get this for root squash NFS */
    if (virLastErrorIsSystemErrno(EACCES)) {
        VIR_DEBUG("Ignoring EACCES for %s", src->path);
        virResetLastError();
        return 0;
    }

    return rc;
}


109 110 111 112 113 114 115 116 117
/*
 * qemuSetImageCgroup:
 * @vm: domain whose devices cgroup is updated
 * @src: storage source to grant or revoke
 * @deny: true to revoke access, false to grant it
 *
 * Public entry point for a single image; write access is never forced
 * off, so the permission follows @src->readonly.
 *
 * Returns the result of qemuSetImageCgroupInternal().
 */
int
qemuSetImageCgroup(virDomainObjPtr vm,
                   virStorageSourcePtr src,
                   bool deny)
{
    const bool forceReadonly = false;

    return qemuSetImageCgroupInternal(vm, src, deny, forceReadonly);
}


118 119 120
/*
 * qemuSetupDiskCgroup:
 * @vm: domain whose devices cgroup is updated
 * @disk: disk whose image chain is granted
 *
 * Walks the backing chain starting at @disk->src and allows every image.
 * Only the top-level image may get write access; all backing images are
 * forced read-only.
 *
 * Returns 0 on success, -1 as soon as one image fails.
 */
int
qemuSetupDiskCgroup(virDomainObjPtr vm,
                    virDomainDiskDefPtr disk)
{
    virStorageSourcePtr img = disk->src;
    bool topLevel = true;

    while (img) {
        if (qemuSetImageCgroupInternal(vm, img, false, !topLevel) < 0)
            return -1;

        /* everything below the first image is a backing file */
        topLevel = false;
        img = img->backingStore;
    }

    return 0;
}


137 138 139
/*
 * qemuTeardownDiskCgroup:
 * @vm: domain whose devices cgroup is updated
 * @disk: disk whose image chain is revoked
 *
 * Denies every image in the backing chain starting at @disk->src.
 *
 * Returns 0 on success, -1 as soon as one image fails (later images in
 * the chain are then left untouched).
 */
int
qemuTeardownDiskCgroup(virDomainObjPtr vm,
                       virDomainDiskDefPtr disk)
{
    virStorageSourcePtr img = disk->src;

    while (img) {
        if (qemuSetImageCgroup(vm, img, true) < 0)
            return -1;

        img = img->backingStore;
    }

    return 0;
}

151

152
static int
153
qemuSetupChrSourceCgroup(virDomainObjPtr vm,
154
                         virDomainChrSourceDefPtr source)
155
{
156
    qemuDomainObjPrivatePtr priv = vm->privateData;
157
    int ret;
158

159
    if (source->type != VIR_DOMAIN_CHR_TYPE_DEV)
160 161
        return 0;

162
    VIR_DEBUG("Process path '%s' for device", source->data.file.path);
163

164
    ret = virCgroupAllowDevicePath(priv->cgroup, source->data.file.path,
165
                                   VIR_CGROUP_DEVICE_RW);
166
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
167
                             source->data.file.path, "rw", ret == 0);
168

169
    return ret;
170 171
}

172
static int
173
qemuSetupChardevCgroup(virDomainDefPtr def ATTRIBUTE_UNUSED,
174 175 176
                       virDomainChrDefPtr dev,
                       void *opaque)
{
177 178 179
    virDomainObjPtr vm = opaque;

    return qemuSetupChrSourceCgroup(vm, &dev->source);
180 181 182 183
}


static int
184
qemuSetupTPMCgroup(virDomainObjPtr vm)
185
{
186
    int ret = 0;
187
    virDomainTPMDefPtr dev = vm->def->tpm;
188 189 190

    switch (dev->type) {
    case VIR_DOMAIN_TPM_TYPE_PASSTHROUGH:
191
        ret = qemuSetupChrSourceCgroup(vm, &dev->data.passthrough.source);
192 193 194 195 196
        break;
    case VIR_DOMAIN_TPM_TYPE_LAST:
        break;
    }

197
    return ret;
198 199
}

200

201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
/*
 * qemuSetupInputCgroup:
 * @vm: domain whose devices cgroup is updated
 * @dev: input device definition
 *
 * Grants read-write access to the evdev node of a passthrough input
 * device and audits the attempt. Other input types need no cgroup change.
 *
 * Returns 0 when nothing had to be done, otherwise the result of
 * virCgroupAllowDevicePath().
 */
static int
qemuSetupInputCgroup(virDomainObjPtr vm,
                     virDomainInputDefPtr dev)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int rc;

    if (dev->type != VIR_DOMAIN_INPUT_TYPE_PASSTHROUGH)
        return 0;

    VIR_DEBUG("Process path '%s' for input device", dev->source.evdev);

    rc = virCgroupAllowDevicePath(priv->cgroup, dev->source.evdev,
                                  VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
                             dev->source.evdev, "rw", rc == 0);

    return rc;
}


221
static int
222
qemuSetupHostUSBDeviceCgroup(virUSBDevicePtr dev ATTRIBUTE_UNUSED,
223 224
                             const char *path,
                             void *opaque)
225
{
226 227
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
228
    int ret;
229 230

    VIR_DEBUG("Process path '%s' for USB device", path);
231 232 233
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path, "rw", ret == 0);
234

235
    return ret;
236 237
}

238
static int
239
qemuSetupHostSCSIDeviceCgroup(virSCSIDevicePtr dev ATTRIBUTE_UNUSED,
240 241 242 243 244
                              const char *path,
                              void *opaque)
{
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
245
    int ret;
246 247 248

    VIR_DEBUG("Process path '%s' for SCSI device", path);

249 250 251 252
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   virSCSIDeviceGetReadonly(dev) ?
                                   VIR_CGROUP_DEVICE_READ :
                                   VIR_CGROUP_DEVICE_RW);
253 254

    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path,
255
                             virSCSIDeviceGetReadonly(dev) ? "r" : "rw", ret == 0);
256

257
    return ret;
258
}
259

260
int
261
qemuSetupHostdevCgroup(virDomainObjPtr vm,
262 263 264 265
                       virDomainHostdevDefPtr dev)
{
    int ret = -1;
    qemuDomainObjPrivatePtr priv = vm->privateData;
266
    virDomainHostdevSubsysUSBPtr usbsrc = &dev->source.subsys.u.usb;
267
    virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci;
268
    virDomainHostdevSubsysSCSIPtr scsisrc = &dev->source.subsys.u.scsi;
269
    virPCIDevicePtr pci = NULL;
270
    virUSBDevicePtr usb = NULL;
271
    virSCSIDevicePtr scsi = NULL;
272 273 274 275 276 277 278 279 280 281 282 283 284 285
    char *path = NULL;

    /* currently this only does something for PCI devices using vfio
     * for device assignment, but it is called for *all* hostdev
     * devices.
     */

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS) {

        switch (dev->source.subsys.type) {
        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI:
286
            if (pcisrc->backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
287
                int rv;
288

289 290 291 292
                pci = virPCIDeviceNew(pcisrc->addr.domain,
                                      pcisrc->addr.bus,
                                      pcisrc->addr.slot,
                                      pcisrc->addr.function);
293 294 295
                if (!pci)
                    goto cleanup;

296
                if (!(path = virPCIDeviceGetIOMMUGroupDev(pci)))
297 298 299
                    goto cleanup;

                VIR_DEBUG("Cgroup allow %s for PCI device assignment", path);
300
                rv = virCgroupAllowDevicePath(priv->cgroup, path,
301 302
                                              VIR_CGROUP_DEVICE_RW);
                virDomainAuditCgroupPath(vm, priv->cgroup,
303 304
                                         "allow", path, "rw", rv == 0);
                if (rv < 0)
305 306 307
                    goto cleanup;
            }
            break;
308 309 310 311 312 313 314 315

        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB:
            /* NB: hostdev->missing wasn't previously checked in the
             * case of hotplug, only when starting a domain. Now it is
             * always checked, and the cgroup setup skipped if true.
             */
            if (dev->missing)
                break;
316
            if ((usb = virUSBDeviceNew(usbsrc->bus, usbsrc->device,
317 318 319 320
                                       NULL)) == NULL) {
                goto cleanup;
            }

321
            /* oddly, qemuSetupHostUSBDeviceCgroup doesn't ever
322 323
             * reference the usb object we just created
             */
324
            if (virUSBDeviceFileIterate(usb, qemuSetupHostUSBDeviceCgroup,
325 326 327 328
                                        vm) < 0) {
                goto cleanup;
            }
            break;
329

330
        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_SCSI: {
331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349
            if (scsisrc->protocol ==
                VIR_DOMAIN_HOSTDEV_SCSI_PROTOCOL_TYPE_ISCSI) {
                virDomainHostdevSubsysSCSIiSCSIPtr iscsisrc = &scsisrc->u.iscsi;
                /* Follow qemuSetupDiskCgroup() and qemuSetImageCgroupInternal()
                 * which does nothing for non local storage
                 */
                VIR_DEBUG("Not updating cgroups for hostdev iSCSI path '%s'",
                          iscsisrc->path);
            } else {
                virDomainHostdevSubsysSCSIHostPtr scsihostsrc =
                    &scsisrc->u.host;
                if ((scsi = virSCSIDeviceNew(NULL,
                                             scsihostsrc->adapter,
                                             scsihostsrc->bus,
                                             scsihostsrc->target,
                                             scsihostsrc->unit,
                                             dev->readonly,
                                             dev->shareable)) == NULL)
                    goto cleanup;
350

351 352 353 354 355
                if (virSCSIDeviceFileIterate(scsi,
                                             qemuSetupHostSCSIDeviceCgroup,
                                             vm) < 0)
                    goto cleanup;
            }
356 357
            break;
        }
358

359 360 361 362 363 364
        default:
            break;
        }
    }

    ret = 0;
365
 cleanup:
366
    virPCIDeviceFree(pci);
367
    virUSBDeviceFree(usb);
368
    virSCSIDeviceFree(scsi);
369 370 371 372 373 374 375 376 377 378
    VIR_FREE(path);
    return ret;
}

/*
 * qemuTeardownHostdevCgroup:
 * @vm: domain whose devices cgroup is updated
 * @dev: host device being removed
 *
 * Revokes the access granted for @dev. It is called for all hostdevs,
 * but only subsystem-mode PCI devices using the VFIO backend are acted
 * upon: their IOMMU group device is denied (rwm) and the attempt audited.
 *
 * Returns 0 on success or when nothing had to be done (including when
 * the devices controller is absent), -1 on failure.
 */
int
qemuTeardownHostdevCgroup(virDomainObjPtr vm,
                       virDomainHostdevDefPtr dev)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virPCIDevicePtr pci = NULL;
    char *path = NULL;
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS) {
        ret = 0;
        goto cleanup;
    }

    switch (dev->source.subsys.type) {
    case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI: {
        virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci;
        int rv;

        if (pcisrc->backend != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO)
            break;

        pci = virPCIDeviceNew(pcisrc->addr.domain,
                              pcisrc->addr.bus,
                              pcisrc->addr.slot,
                              pcisrc->addr.function);
        if (!pci)
            goto cleanup;

        path = virPCIDeviceGetIOMMUGroupDev(pci);
        if (!path)
            goto cleanup;

        VIR_DEBUG("Cgroup deny %s for PCI device assignment", path);
        rv = virCgroupDenyDevicePath(priv->cgroup, path,
                                     VIR_CGROUP_DEVICE_RWM);
        virDomainAuditCgroupPath(vm, priv->cgroup,
                                 "deny", path, "rwm", rv == 0);
        if (rv < 0)
            goto cleanup;
        break;
    }

    case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB:
        /* nothing to tear down for USB */
        break;

    default:
        break;
    }

    ret = 0;

 cleanup:
    virPCIDeviceFree(pci);
    VIR_FREE(path);
    return ret;
}

432 433 434 435
/*
 * qemuSetupBlkioCgroup:
 * @vm: domain whose blkio tuning is applied
 *
 * Applies the domain-wide blkio weight and the per-device weight and
 * IOPS/BPS throttles to the cgroup. Each per-device value that is set is
 * immediately read back into the definition, so the definition holds the
 * value the cgroup reports.
 *
 * Returns 0 on success or when no blkio controller exists and no tuning
 * was requested; -1 on failure or when tuning was requested without a
 * blkio controller.
 */
static int
qemuSetupBlkioCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    size_t idx;

    if (!virCgroupHasController(priv->cgroup,
                                VIR_CGROUP_CONTROLLER_BLKIO)) {
        if (!vm->def->blkio.weight && !vm->def->blkio.ndevices)
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Block I/O tuning is not available on this host"));
        return -1;
    }

    if (vm->def->blkio.weight != 0 &&
        virCgroupSetBlkioWeight(priv->cgroup, vm->def->blkio.weight) < 0)
        return -1;

    for (idx = 0; idx < vm->def->blkio.ndevices; idx++) {
        virBlkioDevicePtr dev = &vm->def->blkio.devices[idx];

        if (dev->weight) {
            if (virCgroupSetBlkioDeviceWeight(priv->cgroup, dev->path,
                                              dev->weight) < 0)
                return -1;
            if (virCgroupGetBlkioDeviceWeight(priv->cgroup, dev->path,
                                              &dev->weight) < 0)
                return -1;
        }

        if (dev->riops) {
            if (virCgroupSetBlkioDeviceReadIops(priv->cgroup, dev->path,
                                                dev->riops) < 0)
                return -1;
            if (virCgroupGetBlkioDeviceReadIops(priv->cgroup, dev->path,
                                                &dev->riops) < 0)
                return -1;
        }

        if (dev->wiops) {
            if (virCgroupSetBlkioDeviceWriteIops(priv->cgroup, dev->path,
                                                 dev->wiops) < 0)
                return -1;
            if (virCgroupGetBlkioDeviceWriteIops(priv->cgroup, dev->path,
                                                 &dev->wiops) < 0)
                return -1;
        }

        if (dev->rbps) {
            if (virCgroupSetBlkioDeviceReadBps(priv->cgroup, dev->path,
                                               dev->rbps) < 0)
                return -1;
            if (virCgroupGetBlkioDeviceReadBps(priv->cgroup, dev->path,
                                               &dev->rbps) < 0)
                return -1;
        }

        if (dev->wbps) {
            if (virCgroupSetBlkioDeviceWriteBps(priv->cgroup, dev->path,
                                                dev->wbps) < 0)
                return -1;
            if (virCgroupGetBlkioDeviceWriteBps(priv->cgroup, dev->path,
                                                &dev->wbps) < 0)
                return -1;
        }
    }

    return 0;
}

496

497 498 499 500 501
/*
 * qemuSetupMemoryCgroup:
 * @vm: domain whose memory limits are applied
 *
 * Applies the hard, soft and swap hard memory limits from the domain
 * definition to the cgroup, skipping any limit that is not set.
 *
 * Returns 0 on success or when no memory controller exists and no limit
 * was requested; -1 on failure or when a limit was requested without a
 * memory controller.
 */
static int
qemuSetupMemoryCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) {
        if (!virMemoryLimitIsSet(vm->def->mem.hard_limit) &&
            !virMemoryLimitIsSet(vm->def->mem.soft_limit) &&
            !virMemoryLimitIsSet(vm->def->mem.swap_hard_limit))
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Memory cgroup is not available on this host"));
        return -1;
    }

    if (virMemoryLimitIsSet(vm->def->mem.hard_limit) &&
        virCgroupSetMemoryHardLimit(priv->cgroup,
                                    vm->def->mem.hard_limit) < 0)
        return -1;

    if (virMemoryLimitIsSet(vm->def->mem.soft_limit) &&
        virCgroupSetMemorySoftLimit(priv->cgroup,
                                    vm->def->mem.soft_limit) < 0)
        return -1;

    if (virMemoryLimitIsSet(vm->def->mem.swap_hard_limit) &&
        virCgroupSetMemSwapHardLimit(priv->cgroup,
                                     vm->def->mem.swap_hard_limit) < 0)
        return -1;

    return 0;
}


530 531 532 533 534 535 536
/*
 * qemuSetupDevicesCgroup:
 * @driver: QEMU driver, used to fetch the configuration
 * @vm: domain whose devices cgroup is populated
 *
 * Denies all devices and then allows, in this order: the domain's disks,
 * the pty major, optionally the sound major, the configured (or default)
 * device ACL, character devices, the TPM, host devices, input devices
 * and RNG backends that read from a file. Every change is audited.
 *
 * Returns 0 on success, when the devices controller is absent, or when
 * denying all devices fails with EPERM (a warning is logged and
 * whitelisting is skipped); -1 on any other failure.
 */
static int
qemuSetupDevicesCgroup(virQEMUDriverPtr driver,
                       virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = NULL;
    const char *const *acl = NULL;
    bool allowSound = false;
    int rc;
    int ret = -1;
    size_t i;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    rc = virCgroupDenyAllDevices(priv->cgroup);
    virDomainAuditCgroup(vm, priv->cgroup, "deny", "all", rc == 0);
    if (rc < 0) {
        if (!virLastErrorIsSystemErrno(EPERM))
            goto cleanup;

        virResetLastError();
        VIR_WARN("Group devices ACL is not accessible, disabling whitelisting");
        return 0;
    }

    for (i = 0; i < vm->def->ndisks; i++) {
        if (qemuSetupDiskCgroup(vm, vm->def->disks[i]) < 0)
            goto cleanup;
    }

    rc = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_PTY_MAJOR,
                                   VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_PTY_MAJOR,
                              "pty", "rw", rc == 0);
    if (rc < 0)
        goto cleanup;

    cfg = virQEMUDriverGetConfig(driver);
    if (cfg->cgroupDeviceACL)
        acl = (const char *const *)cfg->cgroupDeviceACL;
    else
        acl = defaultDeviceACL;

    /* Host sound devices are exposed only to guests with a sound card,
     * and only when the configuration or the first graphics device's
     * type permits host audio. */
    if (vm->def->nsounds) {
        if (!vm->def->ngraphics && cfg->nogfxAllowHostAudio) {
            allowSound = true;
        } else if (vm->def->graphics) {
            int gfxType = vm->def->graphics[0]->type;

            allowSound = (gfxType == VIR_DOMAIN_GRAPHICS_TYPE_VNC &&
                          cfg->vncAllowHostAudio) ||
                         gfxType == VIR_DOMAIN_GRAPHICS_TYPE_SDL;
        }
    }

    if (allowSound) {
        rc = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_SND_MAJOR,
                                       VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_SND_MAJOR,
                                  "sound", "rw", rc == 0);
        if (rc < 0)
            goto cleanup;
    }

    for (i = 0; acl[i] != NULL; i++) {
        if (!virFileExists(acl[i])) {
            VIR_DEBUG("Ignoring non-existent device %s", acl[i]);
            continue;
        }

        rc = virCgroupAllowDevicePath(priv->cgroup, acl[i],
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, priv->cgroup, "allow", acl[i], "rw",
                                 rc == 0);
        /* a node that vanished in the meantime (ENOENT) is not fatal */
        if (rc < 0 && !virLastErrorIsSystemErrno(ENOENT))
            goto cleanup;
    }

    if (virDomainChrDefForeach(vm->def,
                               true,
                               qemuSetupChardevCgroup,
                               vm) < 0)
        goto cleanup;

    if (vm->def->tpm && qemuSetupTPMCgroup(vm) < 0)
        goto cleanup;

    for (i = 0; i < vm->def->nhostdevs; i++) {
        if (qemuSetupHostdevCgroup(vm, vm->def->hostdevs[i]) < 0)
            goto cleanup;
    }

    for (i = 0; i < vm->def->ninputs; i++) {
        if (qemuSetupInputCgroup(vm, vm->def->inputs[i]) < 0)
            goto cleanup;
    }

    for (i = 0; i < vm->def->nrngs; i++) {
        if (vm->def->rngs[i]->backend != VIR_DOMAIN_RNG_BACKEND_RANDOM)
            continue;

        VIR_DEBUG("Setting Cgroup ACL for RNG device");
        rc = virCgroupAllowDevicePath(priv->cgroup,
                                      vm->def->rngs[i]->source.file,
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
                                 vm->def->rngs[i]->source.file,
                                 "rw", rc == 0);
        if (rc < 0 && !virLastErrorIsSystemErrno(ENOENT))
            goto cleanup;
    }

    ret = 0;

 cleanup:
    virObjectUnref(cfg);
    return ret;
}


642
int
643
qemuSetupCpusetMems(virDomainObjPtr vm)
644
{
645
    virCgroupPtr cgroup_temp = NULL;
646
    qemuDomainObjPrivatePtr priv = vm->privateData;
647
    virDomainNumatuneMemMode mode;
648
    char *mem_mask = NULL;
649 650 651 652 653
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

654 655
    if (virDomainNumatuneGetMode(vm->def->numa, -1, &mode) < 0 ||
        mode != VIR_DOMAIN_NUMATUNE_MEM_STRICT)
656 657
        return 0;

658
    if (virDomainNumatuneMaybeFormatNodeset(vm->def->numa,
659
                                            priv->autoNodeset,
660
                                            &mem_mask, -1) < 0)
661
        goto cleanup;
662

663
    if (mem_mask)
J
John Ferlan 已提交
664 665
        if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
                               false, &cgroup_temp) < 0 ||
666
            virCgroupSetCpusetMems(cgroup_temp, mem_mask) < 0)
667
            goto cleanup;
668

669 670 671
    ret = 0;
 cleanup:
    VIR_FREE(mem_mask);
672
    virCgroupFree(&cgroup_temp);
673 674 675 676 677
    return ret;
}


static int
678
qemuSetupCpusetCgroup(virDomainObjPtr vm)
679 680 681 682 683 684
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

685 686 687
    if (virCgroupSetCpusetMemoryMigrate(priv->cgroup, true) < 0)
        return -1;

688
    return 0;
689 690 691
}


692
static int
693 694
qemuSetupCpuCgroup(virQEMUDriverPtr driver,
                   virDomainObjPtr vm)
695 696
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
697 698 699 700
    virObjectEventPtr event = NULL;
    virTypedParameterPtr eventParams = NULL;
    int eventNparams = 0;
    int eventMaxparams = 0;
701 702

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
703
       if (vm->def->cputune.sharesSpecified) {
704 705 706 707 708 709 710 711
           virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                          _("CPU tuning is not available on this host"));
           return -1;
       } else {
           return 0;
       }
    }

712 713 714 715 716 717 718
    if (vm->def->cputune.sharesSpecified) {
        unsigned long long val;
        if (virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares) < 0)
            return -1;

        if (virCgroupGetCpuShares(priv->cgroup, &val) < 0)
            return -1;
719 720 721 722
        if (vm->def->cputune.shares != val) {
            vm->def->cputune.shares = val;
            if (virTypedParamsAddULLong(&eventParams, &eventNparams,
                                        &eventMaxparams,
723
                                        VIR_DOMAIN_TUNABLE_CPU_CPU_SHARES,
724 725 726 727 728 729
                                        val) < 0)
                return -1;

            event = virDomainEventTunableNewFromObj(vm, eventParams, eventNparams);
        }

730
        qemuDomainEventQueue(driver, event);
731
    }
732 733 734 735 736

    return 0;
}


737
static int
738
qemuInitCgroup(virQEMUDriverPtr driver,
739 740 741
               virDomainObjPtr vm,
               size_t nnicindexes,
               int *nicindexes)
742
{
743
    int ret = -1;
744 745 746
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);

747
    if (!virQEMUDriverIsPrivileged(driver))
748 749
        goto done;

750 751 752
    if (!virCgroupAvailable())
        goto done;

753 754
    virCgroupFree(&priv->cgroup);

755
    if (!vm->def->resource) {
756 757
        virDomainResourceDefPtr res;

758
        if (VIR_ALLOC(res) < 0)
759
            goto cleanup;
760

761
        if (VIR_STRDUP(res->partition, "/machine") < 0) {
762 763 764 765 766
            VIR_FREE(res);
            goto cleanup;
        }

        vm->def->resource = res;
767 768
    }

769 770 771 772 773 774
    if (vm->def->resource->partition[0] != '/') {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Resource partition '%s' must start with '/'"),
                       vm->def->resource->partition);
        goto cleanup;
    }
775

776 777 778 779 780 781 782 783 784 785 786 787
    /*
     * We need to do this because of systemd-machined, because
     * CreateMachine requires the name to be a valid hostname.
     */
    priv->machineName = virSystemdMakeMachineName("qemu",
                                                  vm->def->id,
                                                  vm->def->name,
                                                  virQEMUDriverIsPrivileged(driver));
    if (!priv->machineName)
        goto cleanup;

    if (virCgroupNewMachine(priv->machineName,
788 789 790 791 792
                            "qemu",
                            vm->def->uuid,
                            NULL,
                            vm->pid,
                            false,
793
                            nnicindexes, nicindexes,
794 795 796
                            vm->def->resource->partition,
                            cfg->cgroupControllers,
                            &priv->cgroup) < 0) {
797 798
        if (virCgroupNewIgnoreError())
            goto done;
799

800 801
        goto cleanup;
    }
802

803
 done:
804
    ret = 0;
805
 cleanup:
806 807 808
    virObjectUnref(cfg);
    return ret;
}
809

810 811 812
static void
qemuRestoreCgroupState(virDomainObjPtr vm)
{
813
    char *mem_mask = NULL;
814
    char *nodeset = NULL;
815 816
    int empty = -1;
    qemuDomainObjPrivatePtr priv = vm->privateData;
817
    size_t i = 0;
818
    virBitmapPtr all_nodes;
819
    virCgroupPtr cgroup_temp = NULL;
820 821 822 823 824 825 826 827

    if (!(all_nodes = virNumaGetHostNodeset()))
        goto error;

    if (!(mem_mask = virBitmapFormat(all_nodes)))
        goto error;

    if ((empty = virCgroupHasEmptyTasks(priv->cgroup,
828
                                        VIR_CGROUP_CONTROLLER_CPUSET)) <= 0)
829 830 831 832 833
        goto error;

    if (virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0)
        goto error;

834 835 836 837 838 839
    for (i = 0; i < virDomainDefGetVcpusMax(vm->def); i++) {
        virDomainVcpuInfoPtr vcpu = virDomainDefGetVcpu(vm->def, i);

        if (!vcpu->online)
            continue;

J
John Ferlan 已提交
840 841
        if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_VCPU, i,
                               false, &cgroup_temp) < 0 ||
842 843 844 845 846
            virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
            virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
            virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
            goto cleanup;

847
        VIR_FREE(nodeset);
848 849 850
        virCgroupFree(&cgroup_temp);
    }

851 852 853
    for (i = 0; i < vm->def->niothreadids; i++) {
        if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_IOTHREAD,
                               vm->def->iothreadids[i]->iothread_id,
J
John Ferlan 已提交
854
                               false, &cgroup_temp) < 0 ||
855 856 857 858 859
            virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
            virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
            virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
            goto cleanup;

860
        VIR_FREE(nodeset);
861 862 863
        virCgroupFree(&cgroup_temp);
    }

J
John Ferlan 已提交
864 865
    if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
                           false, &cgroup_temp) < 0 ||
866 867 868 869 870
        virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
        virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
        virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
        goto cleanup;

871 872
 cleanup:
    VIR_FREE(mem_mask);
873
    VIR_FREE(nodeset);
874
    virBitmapFree(all_nodes);
875
    virCgroupFree(&cgroup_temp);
876 877 878 879 880 881 882
    return;

 error:
    virResetLastError();
    VIR_DEBUG("Couldn't restore cgroups to meaningful state");
    goto cleanup;
}
883 884 885 886 887 888 889 890 891

/**
 * qemuConnectCgroup:
 * @driver: QEMU driver
 * @vm: domain object
 *
 * Re-attaches @vm to the cgroup of its running process: the previous
 * priv->cgroup is freed, a new one is looked up with
 * virCgroupNewDetectMachine(), priv->machineName is filled from
 * virSystemdGetMachineNameByPID() (a failure there only resets the
 * last error) and qemuRestoreCgroupState() is run.
 *
 * Returns 0 on success, or when the driver is not privileged or
 * virCgroupAvailable() is false; -1 if the cgroup lookup failed.
 */
int
qemuConnectCgroup(virQEMUDriverPtr driver,
                  virDomainObjPtr vm)
{
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int ret = -1;

    /* Nothing to connect to: treated as success */
    if (!virQEMUDriverIsPrivileged(driver) ||
        !virCgroupAvailable()) {
        ret = 0;
        goto cleanup;
    }

    virCgroupFree(&priv->cgroup);

    if (virCgroupNewDetectMachine(vm->def->name, "qemu", vm->def->id,
                                  virQEMUDriverIsPrivileged(driver),
                                  vm->pid, cfg->cgroupControllers,
                                  &priv->cgroup) < 0)
        goto cleanup;

    /* A missing machine name is tolerated; only the error is dropped */
    if (!(priv->machineName = virSystemdGetMachineNameByPID(vm->pid)))
        virResetLastError();

    qemuRestoreCgroupState(vm);
    ret = 0;

 cleanup:
    virObjectUnref(cfg);
    return ret;
}

922 923
int
qemuSetupCgroup(virQEMUDriverPtr driver,
924 925 926
                virDomainObjPtr vm,
                size_t nnicindexes,
                int *nicindexes)
927
{
928
    qemuDomainObjPrivatePtr priv = vm->privateData;
929
    int ret = -1;
930

931 932 933 934 935 936
    if (!vm->pid) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Cannot setup cgroups until process is started"));
        return -1;
    }

937
    if (qemuInitCgroup(driver, vm, nnicindexes, nicindexes) < 0)
938
        return -1;
939

940
    if (!priv->cgroup)
941
        return 0;
942

943 944
    if (qemuSetupDevicesCgroup(driver, vm) < 0)
        goto cleanup;
945

946 947
    if (qemuSetupBlkioCgroup(vm) < 0)
        goto cleanup;
948

949 950
    if (qemuSetupMemoryCgroup(vm) < 0)
        goto cleanup;
951

952
    if (qemuSetupCpuCgroup(driver, vm) < 0)
953
        goto cleanup;
954

955
    if (qemuSetupCpusetCgroup(vm) < 0)
956
        goto cleanup;
957

958
    ret = 0;
959
 cleanup:
960
    return ret;
961 962
}

963 964 965 966
/**
 * qemuSetupCgroupVcpuBW:
 * @cgroup: cgroup to tune
 * @period: CFS period to set, 0 to leave it alone
 * @quota: CFS quota to set, 0 to leave it alone
 *
 * Applies @period and then @quota to @cgroup.  If the quota cannot be
 * set after the period was changed, the previous period is written
 * back (ignoring the result) while the error from the quota failure is
 * kept as the last error.
 *
 * Returns 0 on success (including when both values are 0), -1 on error.
 */
int
qemuSetupCgroupVcpuBW(virCgroupPtr cgroup,
                      unsigned long long period,
                      long long quota)
{
    unsigned long long old_period = 0;
    virErrorPtr saved;

    if (period == 0 && quota == 0)
        return 0;

    if (period != 0) {
        /* remember the old period so it can be rolled back below */
        if (virCgroupGetCpuCfsPeriod(cgroup, &old_period) < 0 ||
            virCgroupSetCpuCfsPeriod(cgroup, period) < 0)
            return -1;
    }

    if (quota == 0 || virCgroupSetCpuCfsQuota(cgroup, quota) >= 0)
        return 0;

    /* Setting the quota failed; no period was changed, nothing to undo */
    if (period == 0)
        return -1;

    saved = virSaveLastError();
    ignore_value(virCgroupSetCpuCfsPeriod(cgroup, old_period));
    if (saved) {
        virSetError(saved);
        virFreeError(saved);
    }

    return -1;
}

1001

1002
int
1003 1004
qemuSetupCgroupCpusetCpus(virCgroupPtr cgroup,
                          virBitmapPtr cpumask)
1005
{
1006
    int ret = -1;
1007 1008
    char *new_cpus = NULL;

1009
    if (!(new_cpus = virBitmapFormat(cpumask)))
1010 1011
        goto cleanup;

1012
    if (virCgroupSetCpusetCpus(cgroup, new_cpus) < 0)
1013 1014
        goto cleanup;

1015
    ret = 0;
1016
 cleanup:
1017
    VIR_FREE(new_cpus);
1018
    return ret;
1019 1020
}

1021

1022
int
1023
qemuSetupCgroupForEmulator(virDomainObjPtr vm)
1024
{
1025
    virBitmapPtr cpumask = NULL;
1026
    virCgroupPtr cgroup_emulator = NULL;
1027
    virDomainDefPtr def = vm->def;
1028
    qemuDomainObjPrivatePtr priv = vm->privateData;
1029 1030
    unsigned long long period = vm->def->cputune.emulator_period;
    long long quota = vm->def->cputune.emulator_quota;
1031

1032
    if ((period || quota) &&
1033
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
1034 1035 1036 1037 1038
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

1039 1040 1041 1042 1043 1044 1045 1046 1047
    /*
     * If CPU cgroup controller is not initialized here, then we need
     * neither period nor quota settings.  And if CPUSET controller is
     * not initialized either, then there's nothing to do anyway.
     */
    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

J
John Ferlan 已提交
1048 1049
    if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
                           true, &cgroup_emulator) < 0)
1050 1051
        goto cleanup;

1052
    if (virCgroupMoveTask(priv->cgroup, cgroup_emulator) < 0)
1053
        goto cleanup;
1054

1055
    if (def->cputune.emulatorpin)
1056
        cpumask = def->cputune.emulatorpin;
1057 1058
    else if (def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)
        cpumask = priv->autoCpuset;
1059
    else if (def->cpumask)
1060 1061 1062
        cpumask = def->cpumask;

    if (cpumask) {
1063
        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET) &&
1064
            qemuSetupCgroupCpusetCpus(cgroup_emulator, cpumask) < 0)
1065
            goto cleanup;
H
Hu Tao 已提交
1066
    }
1067

1068
    if (period || quota) {
1069 1070 1071 1072
        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
            qemuSetupCgroupVcpuBW(cgroup_emulator, period,
                                  quota) < 0)
            goto cleanup;
1073 1074
    }

1075 1076 1077
    virCgroupFree(&cgroup_emulator);
    return 0;

1078
 cleanup:
1079 1080 1081 1082 1083
    if (cgroup_emulator) {
        virCgroupRemove(cgroup_emulator);
        virCgroupFree(&cgroup_emulator);
    }

1084
    return -1;
1085
}
1086

1087

1088
int
1089
qemuRemoveCgroup(virDomainObjPtr vm)
1090
{
1091
    qemuDomainObjPrivatePtr priv = vm->privateData;
1092

1093
    if (priv->cgroup == NULL)
1094 1095
        return 0; /* Not supported, so claim success */

1096
    if (virCgroupTerminateMachine(priv->machineName) < 0) {
1097 1098 1099 1100
        if (!virCgroupNewIgnoreError())
            VIR_DEBUG("Failed to terminate cgroup for %s", vm->def->name);
    }

1101
    return virCgroupRemove(priv->cgroup);
1102 1103
}

1104 1105
/**
 * qemuAddToCgroup:
 * @vm: domain object
 *
 * Currently performs no cgroup operation: both the case where the
 * domain has no cgroup and the case where it has one report success.
 *
 * Returns 0.
 */
int
qemuAddToCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    if (!priv->cgroup) {
        /* Not supported, so claim success */
        return 0;
    }

    /* A cgroup exists, but there is nothing to do with it here */
    return 0;
}