qemu_cgroup.c 36.2 KB
Newer Older
1 2 3
/*
 * qemu_cgroup.c: QEMU cgroup management
 *
 * Copyright (C) 2006-2014 Red Hat, Inc.
 * Copyright (C) 2006 Daniel P. Berrange
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * Author: Daniel P. Berrange <berrange@redhat.com>
 */

#include <config.h>

#include "qemu_cgroup.h"
27
#include "qemu_domain.h"
28
#include "qemu_process.h"
29
#include "vircgroup.h"
30
#include "virlog.h"
31
#include "viralloc.h"
32
#include "virerror.h"
33
#include "domain_audit.h"
34
#include "virscsi.h"
35
#include "virstring.h"
36
#include "virfile.h"
37
#include "virtypedparam.h"
38 39 40

#define VIR_FROM_THIS VIR_FROM_QEMU

41 42
VIR_LOG_INIT("qemu.qemu_cgroup");

43 44 45 46
/* Device nodes granted read-write access by qemuSetupDevicesCgroup()
 * when the driver configuration supplies no ACL of its own.  Entries
 * that do not exist on the host are skipped by that function.  The
 * list is NULL-terminated. */
static const char *const defaultDeviceACL[] = {
    "/dev/null",
    "/dev/full",
    "/dev/zero",
    "/dev/random",
    "/dev/urandom",
    "/dev/ptmx",
    "/dev/kvm",
    "/dev/kqemu",
    "/dev/rtc",
    "/dev/hpet",
    "/dev/vfio/vfio",
    NULL,
};

/* Character-device major numbers that are allowed as a whole:
 * 136 is audited as "pty", 116 as "sound". */
#define DEVICE_PTY_MAJOR 136
#define DEVICE_SND_MAJOR 116

53 54 55 56 57
/**
 * qemuSetImageCgroupInternal:
 * @vm: domain object whose cgroup is updated
 * @src: storage source whose path is allowed or denied
 * @deny: true to revoke access to the path, false to grant it
 * @forceReadonly: when granting, never add write permission
 *
 * Updates the devices cgroup ACL for the path of @src and records an
 * audit entry for the attempt.  Nothing is done when the devices
 * controller is absent, when @src has no path, or when @src is not
 * local storage.  A failure whose last error is EACCES is treated as
 * success.
 *
 * Returns 0 when nothing had to be done or the update succeeded,
 * otherwise the negative value reported by the cgroup helper.
 */
static int
qemuSetImageCgroupInternal(virDomainObjPtr vm,
                           virStorageSourcePtr src,
                           bool deny,
                           bool forceReadonly)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    const char *action = "allow";
    int perms = VIR_CGROUP_DEVICE_READ;
    int rc;

    if (!virCgroupHasController(priv->cgroup,
                                VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (!src->path || !virStorageSourceIsLocalStorage(src)) {
        VIR_DEBUG("Not updating cgroups for disk path '%s', type: %s",
                  NULLSTR(src->path), virStorageTypeToString(src->type));
        return 0;
    }

    if (!deny) {
        /* write access only for writable images that are not forced r/o */
        if (!(src->readonly || forceReadonly))
            perms |= VIR_CGROUP_DEVICE_WRITE;

        VIR_DEBUG("Allow path %s, perms: %s",
                  src->path, virCgroupGetDevicePermsString(perms));

        rc = virCgroupAllowDevicePath(priv->cgroup, src->path, perms);
    } else {
        /* revoke everything: read, write and mknod */
        action = "deny";
        perms |= VIR_CGROUP_DEVICE_WRITE | VIR_CGROUP_DEVICE_MKNOD;

        VIR_DEBUG("Deny path %s", src->path);

        rc = virCgroupDenyDevicePath(priv->cgroup, src->path, perms);
    }

    virDomainAuditCgroupPath(vm, priv->cgroup, action, src->path,
                             virCgroupGetDevicePermsString(perms),
                             rc == 0);

    /* Get this for root squash NFS */
    if (rc < 0 && virLastErrorIsSystemErrno(EACCES)) {
        VIR_DEBUG("Ignoring EACCES for %s", src->path);
        virResetLastError();
        rc = 0;
    }

    return rc;
}


107 108 109 110 111 112 113 114 115
/**
 * qemuSetImageCgroup:
 * @vm: domain object
 * @src: storage source to allow or deny
 * @deny: true to revoke access, false to grant it
 *
 * Public entry point for a single image; forwards to
 * qemuSetImageCgroupInternal() without forcing read-only access.
 *
 * Returns whatever qemuSetImageCgroupInternal() returns.
 */
int
qemuSetImageCgroup(virDomainObjPtr vm,
                   virStorageSourcePtr src,
                   bool deny)
{
    const bool forceReadonly = false;

    return qemuSetImageCgroupInternal(vm, src, deny, forceReadonly);
}


116 117 118
/**
 * qemuSetupDiskCgroup:
 * @vm: domain object
 * @disk: disk whose image chain is to be allowed
 *
 * Walks the backing chain starting at @disk->src and grants access to
 * every image.  Only the first image in the chain may receive write
 * permission; all of its backing images are forced read-only.
 *
 * Returns 0 on success, -1 as soon as one image cannot be set up.
 */
int
qemuSetupDiskCgroup(virDomainObjPtr vm,
                    virDomainDiskDefPtr disk)
{
    virStorageSourcePtr img;
    bool topImage = true;

    for (img = disk->src; img; img = img->backingStore) {
        if (qemuSetImageCgroupInternal(vm, img, false, !topImage) < 0)
            return -1;

        /* everything below the top level image is read-only */
        topImage = false;
    }

    return 0;
}


135 136 137
/**
 * qemuTeardownDiskCgroup:
 * @vm: domain object
 * @disk: disk whose image chain is to be denied
 *
 * Revokes cgroup access for each image in the backing chain of
 * @disk, stopping at the first failure.
 *
 * Returns 0 on success, -1 on the first image that cannot be denied.
 */
int
qemuTeardownDiskCgroup(virDomainObjPtr vm,
                       virDomainDiskDefPtr disk)
{
    virStorageSourcePtr img = disk->src;

    while (img) {
        if (qemuSetImageCgroup(vm, img, true) < 0)
            return -1;

        img = img->backingStore;
    }

    return 0;
}

149

150
static int
151
qemuSetupChrSourceCgroup(virDomainDefPtr def ATTRIBUTE_UNUSED,
152
                         virDomainChrSourceDefPtr dev,
153
                         void *opaque)
154
{
155 156
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
157
    int ret;
158

159
    if (dev->type != VIR_DOMAIN_CHR_TYPE_DEV)
160 161
        return 0;

162
    VIR_DEBUG("Process path '%s' for device", dev->data.file.path);
163

164 165
    ret = virCgroupAllowDevicePath(priv->cgroup, dev->data.file.path,
                                   VIR_CGROUP_DEVICE_RW);
166
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
167
                             dev->data.file.path, "rw", ret == 0);
168

169
    return ret;
170 171
}

172 173 174 175 176
/**
 * qemuSetupChardevCgroup:
 * @def: domain definition
 * @dev: character device
 * @opaque: the virDomainObjPtr of the domain
 *
 * Adapter with the callback shape expected by virDomainChrDefForeach():
 * hands the source of @dev to qemuSetupChrSourceCgroup().
 *
 * Returns the result of qemuSetupChrSourceCgroup().
 */
static int
qemuSetupChardevCgroup(virDomainDefPtr def,
                       virDomainChrDefPtr dev,
                       void *opaque)
{
    virDomainChrSourceDefPtr source = &dev->source;

    return qemuSetupChrSourceCgroup(def, source, opaque);
}


/**
 * qemuSetupTPMCgroup:
 * @def: domain definition
 * @dev: TPM device
 * @opaque: the virDomainObjPtr of the domain
 *
 * Allows the host device used by a passthrough TPM.  Any other TPM
 * type needs no cgroup change here.
 *
 * Returns 0 when nothing was done, otherwise the result of
 * qemuSetupChrSourceCgroup().
 */
static int
qemuSetupTPMCgroup(virDomainDefPtr def,
                   virDomainTPMDefPtr dev,
                   void *opaque)
{
    switch (dev->type) {
    case VIR_DOMAIN_TPM_TYPE_PASSTHROUGH:
        return qemuSetupChrSourceCgroup(def,
                                        &dev->data.passthrough.source,
                                        opaque);

    case VIR_DOMAIN_TPM_TYPE_LAST:
        break;
    }

    return 0;
}

200

201
static int
202
qemuSetupHostUSBDeviceCgroup(virUSBDevicePtr dev ATTRIBUTE_UNUSED,
203 204
                             const char *path,
                             void *opaque)
205
{
206 207
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
208
    int ret;
209 210

    VIR_DEBUG("Process path '%s' for USB device", path);
211 212 213
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path, "rw", ret == 0);
214

215
    return ret;
216 217
}

218
static int
219
qemuSetupHostSCSIDeviceCgroup(virSCSIDevicePtr dev ATTRIBUTE_UNUSED,
220 221 222 223 224
                              const char *path,
                              void *opaque)
{
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
225
    int ret;
226 227 228

    VIR_DEBUG("Process path '%s' for SCSI device", path);

229 230 231 232
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   virSCSIDeviceGetReadonly(dev) ?
                                   VIR_CGROUP_DEVICE_READ :
                                   VIR_CGROUP_DEVICE_RW);
233 234

    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path,
235
                             virSCSIDeviceGetReadonly(dev) ? "r" : "rw", ret == 0);
236

237
    return ret;
238
}
239

240 241 242 243 244 245
/**
 * qemuSetupHostdevCGroup:
 * @vm: domain object
 * @dev: host device being assigned
 *
 * Grants the cgroup device access needed by @dev.  It is called for
 * all hostdev devices and acts only on subsystem-mode ones:
 *  - PCI with the VFIO backend: the IOMMU group device node, rw;
 *  - USB: every node reported by virUSBDeviceFileIterate(), unless
 *    the device is flagged missing;
 *  - SCSI: every node reported by virSCSIDeviceFileIterate(); iSCSI
 *    sources are skipped.
 * Nothing is done when the devices controller is absent.
 *
 * Returns 0 on success or when nothing had to be done, -1 on error.
 */
int
qemuSetupHostdevCGroup(virDomainObjPtr vm,
                       virDomainHostdevDefPtr dev)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainHostdevSubsysUSBPtr usbsrc = &dev->source.subsys.u.usb;
    virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci;
    virDomainHostdevSubsysSCSIPtr scsisrc = &dev->source.subsys.u.scsi;
    virPCIDevicePtr pci = NULL;
    virUSBDevicePtr usb = NULL;
    virSCSIDevicePtr scsi = NULL;
    char *path = NULL;
    int rc;
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS)
        goto done;

    switch (dev->source.subsys.type) {
    case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI:
        /* only VFIO assignment needs a device node opened */
        if (pcisrc->backend != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO)
            break;

        pci = virPCIDeviceNew(pcisrc->addr.domain,
                              pcisrc->addr.bus,
                              pcisrc->addr.slot,
                              pcisrc->addr.function);
        if (!pci)
            goto cleanup;

        path = virPCIDeviceGetIOMMUGroupDev(pci);
        if (!path)
            goto cleanup;

        VIR_DEBUG("Cgroup allow %s for PCI device assignment", path);
        rc = virCgroupAllowDevicePath(priv->cgroup, path,
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, priv->cgroup,
                                 "allow", path, "rw", rc == 0);
        if (rc < 0)
            goto cleanup;
        break;

    case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB:
        /* NB: hostdev->missing wasn't previously checked in the
         * case of hotplug, only when starting a domain. Now it is
         * always checked, and the cgroup setup skipped if true.
         */
        if (dev->missing)
            break;

        usb = virUSBDeviceNew(usbsrc->bus, usbsrc->device, NULL);
        if (!usb)
            goto cleanup;

        /* the callback never looks at the usb object itself, only at
         * the path handed to it by the iterator
         */
        if (virUSBDeviceFileIterate(usb, qemuSetupHostUSBDeviceCgroup,
                                    vm) < 0)
            goto cleanup;
        break;

    case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_SCSI: {
        virDomainHostdevSubsysSCSIHostPtr scsihostsrc = &scsisrc->u.host;

        if (scsisrc->protocol ==
            VIR_DOMAIN_HOSTDEV_SCSI_PROTOCOL_TYPE_ISCSI) {
            /* Follow qemuSetupDiskCgroup() and
             * qemuSetImageCgroupInternal(), which do nothing for
             * non local storage
             */
            VIR_DEBUG("Not updating cgroups for hostdev iSCSI path '%s'",
                      scsisrc->u.iscsi.path);
            break;
        }

        scsi = virSCSIDeviceNew(NULL,
                                scsihostsrc->adapter,
                                scsihostsrc->bus,
                                scsihostsrc->target,
                                scsihostsrc->unit,
                                dev->readonly,
                                dev->shareable);
        if (!scsi)
            goto cleanup;

        if (virSCSIDeviceFileIterate(scsi,
                                     qemuSetupHostSCSIDeviceCgroup,
                                     vm) < 0)
            goto cleanup;
        break;
    }

    default:
        break;
    }

 done:
    ret = 0;
 cleanup:
    virPCIDeviceFree(pci);
    virUSBDeviceFree(usb);
    virSCSIDeviceFree(scsi);
    VIR_FREE(path);
    return ret;
}

/**
 * qemuTeardownHostdevCgroup:
 * @vm: domain object
 * @dev: host device being removed
 *
 * Revokes read, write and mknod access to the IOMMU group device
 * node of a subsystem-mode PCI hostdev that uses the VFIO backend.
 * It is called for all hostdev devices; every other kind (USB
 * included) has nothing to tear down here.  Nothing is done when the
 * devices controller is absent.
 *
 * Returns 0 on success or when nothing had to be done, -1 on error.
 */
int
qemuTeardownHostdevCgroup(virDomainObjPtr vm,
                          virDomainHostdevDefPtr dev)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci;
    virPCIDevicePtr pci = NULL;
    char *path = NULL;
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS &&
        dev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI &&
        pcisrc->backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
        int rc;

        pci = virPCIDeviceNew(pcisrc->addr.domain,
                              pcisrc->addr.bus,
                              pcisrc->addr.slot,
                              pcisrc->addr.function);
        if (!pci)
            goto cleanup;

        path = virPCIDeviceGetIOMMUGroupDev(pci);
        if (!path)
            goto cleanup;

        VIR_DEBUG("Cgroup deny %s for PCI device assignment", path);
        rc = virCgroupDenyDevicePath(priv->cgroup, path,
                                     VIR_CGROUP_DEVICE_RWM);
        virDomainAuditCgroupPath(vm, priv->cgroup,
                                 "deny", path, "rwm", rc == 0);
        if (rc < 0)
            goto cleanup;
    }

    ret = 0;
 cleanup:
    virPCIDeviceFree(pci);
    VIR_FREE(path);
    return ret;
}

412 413 414 415
/**
 * qemuSetupBlkioCgroup:
 * @vm: domain object
 *
 * Applies the block I/O tuning from the domain definition: the global
 * weight and, per configured device, weight, read/write IOPS and
 * read/write bytes-per-second.  Settings that are zero are left
 * untouched.  Without the blkio controller the call succeeds only if
 * no block I/O tuning was requested.
 *
 * Returns 0 on success, -1 on error.
 */
static int
qemuSetupBlkioCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    size_t idx;

    if (!virCgroupHasController(priv->cgroup,
                                VIR_CGROUP_CONTROLLER_BLKIO)) {
        if (!def->blkio.weight && !def->blkio.ndevices)
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Block I/O tuning is not available on this host"));
        return -1;
    }

    if (def->blkio.weight != 0 &&
        virCgroupSetBlkioWeight(priv->cgroup, def->blkio.weight) < 0)
        return -1;

    for (idx = 0; idx < def->blkio.ndevices; idx++) {
        virBlkioDevicePtr tune = &def->blkio.devices[idx];

        if (tune->weight &&
            virCgroupSetBlkioDeviceWeight(priv->cgroup, tune->path,
                                          tune->weight) < 0)
            return -1;

        if (tune->riops &&
            virCgroupSetBlkioDeviceReadIops(priv->cgroup, tune->path,
                                            tune->riops) < 0)
            return -1;

        if (tune->wiops &&
            virCgroupSetBlkioDeviceWriteIops(priv->cgroup, tune->path,
                                             tune->wiops) < 0)
            return -1;

        if (tune->rbps &&
            virCgroupSetBlkioDeviceReadBps(priv->cgroup, tune->path,
                                           tune->rbps) < 0)
            return -1;

        if (tune->wbps &&
            virCgroupSetBlkioDeviceWriteBps(priv->cgroup, tune->path,
                                            tune->wbps) < 0)
            return -1;
    }

    return 0;
}

466

467 468 469 470 471
/**
 * qemuSetupMemoryCgroup:
 * @vm: domain object
 *
 * Applies the hard, soft and swap hard memory limits from the domain
 * definition; limits that are zero are left untouched.  Without the
 * memory controller the call succeeds only if no limit was requested.
 *
 * Returns 0 on success, -1 on error.
 */
static int
qemuSetupMemoryCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) {
        if (def->mem.hard_limit == 0 &&
            def->mem.soft_limit == 0 &&
            def->mem.swap_hard_limit == 0)
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Memory cgroup is not available on this host"));
        return -1;
    }

    if (def->mem.hard_limit != 0 &&
        virCgroupSetMemoryHardLimit(priv->cgroup, def->mem.hard_limit) < 0)
        return -1;

    if (def->mem.soft_limit != 0 &&
        virCgroupSetMemorySoftLimit(priv->cgroup, def->mem.soft_limit) < 0)
        return -1;

    if (def->mem.swap_hard_limit != 0 &&
        virCgroupSetMemSwapHardLimit(priv->cgroup,
                                     def->mem.swap_hard_limit) < 0)
        return -1;

    return 0;
}


500 501 502 503 504 505 506
/**
 * qemuSetupDevicesCgroup:
 * @driver: QEMU driver, used to fetch the configuration
 * @vm: domain object
 *
 * Builds the device whitelist of the domain: first denies every
 * device, then allows, in this order, the disk image chains, the pty
 * major, the sound major (only when the graphics/audio configuration
 * permits host audio), each existing path of the configured or
 * default device ACL, character devices, the TPM, host devices and
 * RNG devices using the "random" backend.
 *
 * Nothing is done without the devices controller.  If denying all
 * devices fails with EPERM, whitelisting is skipped and the call
 * still succeeds.  ENOENT failures on ACL and RNG paths are ignored.
 *
 * Returns 0 on success, -1 on error.
 */
static int
qemuSetupDevicesCgroup(virQEMUDriverPtr driver,
                       virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = NULL;
    const char *const *deviceACL = NULL;
    bool allowSound;
    size_t idx;
    int rv = -1;
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    rv = virCgroupDenyAllDevices(priv->cgroup);
    virDomainAuditCgroup(vm, priv->cgroup, "deny", "all", rv == 0);
    if (rv < 0) {
        if (!virLastErrorIsSystemErrno(EPERM))
            goto cleanup;

        virResetLastError();
        VIR_WARN("Group devices ACL is not accessible, disabling whitelisting");
        return 0;
    }

    for (idx = 0; idx < vm->def->ndisks; idx++) {
        if (qemuSetupDiskCgroup(vm, vm->def->disks[idx]) < 0)
            goto cleanup;
    }

    rv = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_PTY_MAJOR,
                                   VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_PTY_MAJOR,
                              "pty", "rw", rv == 0);
    if (rv < 0)
        goto cleanup;

    cfg = virQEMUDriverGetConfig(driver);
    if (cfg->cgroupDeviceACL)
        deviceACL = (const char *const *)cfg->cgroupDeviceACL;
    else
        deviceACL = defaultDeviceACL;

    /* Host audio is reachable when the guest has a sound device and
     * either has no graphics while the config allows audio in that
     * case, or its first graphics device is SDL, or is VNC with VNC
     * host audio enabled in the config.
     */
    allowSound = vm->def->nsounds &&
        ((!vm->def->ngraphics && cfg->nogfxAllowHostAudio) ||
         (vm->def->graphics &&
          ((vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_VNC &&
            cfg->vncAllowHostAudio) ||
           (vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_SDL))));

    if (allowSound) {
        rv = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_SND_MAJOR,
                                       VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_SND_MAJOR,
                                  "sound", "rw", rv == 0);
        if (rv < 0)
            goto cleanup;
    }

    for (idx = 0; deviceACL[idx] != NULL; idx++) {
        const char *aclPath = deviceACL[idx];

        if (!virFileExists(aclPath)) {
            VIR_DEBUG("Ignoring non-existent device %s", aclPath);
            continue;
        }

        rv = virCgroupAllowDevicePath(priv->cgroup, aclPath,
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, priv->cgroup, "allow", aclPath,
                                 "rw", rv == 0);
        if (rv < 0 && !virLastErrorIsSystemErrno(ENOENT))
            goto cleanup;
    }

    if (virDomainChrDefForeach(vm->def, true,
                               qemuSetupChardevCgroup, vm) < 0)
        goto cleanup;

    if (vm->def->tpm &&
        qemuSetupTPMCgroup(vm->def, vm->def->tpm, vm) < 0)
        goto cleanup;

    for (idx = 0; idx < vm->def->nhostdevs; idx++) {
        if (qemuSetupHostdevCGroup(vm, vm->def->hostdevs[idx]) < 0)
            goto cleanup;
    }

    for (idx = 0; idx < vm->def->nrngs; idx++) {
        const char *rngPath;

        if (vm->def->rngs[idx]->backend != VIR_DOMAIN_RNG_BACKEND_RANDOM)
            continue;

        rngPath = vm->def->rngs[idx]->source.file;

        VIR_DEBUG("Setting Cgroup ACL for RNG device");
        rv = virCgroupAllowDevicePath(priv->cgroup, rngPath,
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
                                 rngPath, "rw", rv == 0);
        if (rv < 0 && !virLastErrorIsSystemErrno(ENOENT))
            goto cleanup;
    }

    ret = 0;
 cleanup:
    virObjectUnref(cfg);
    return ret;
}


610
int
611
qemuSetupCpusetMems(virDomainObjPtr vm)
612
{
613
    virCgroupPtr cgroup_temp = NULL;
614
    qemuDomainObjPrivatePtr priv = vm->privateData;
615
    char *mem_mask = NULL;
616 617 618 619 620
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

621 622 623 624
    if (virDomainNumatuneGetMode(vm->def->numatune, -1) !=
        VIR_DOMAIN_NUMATUNE_MEM_STRICT)
        return 0;

625
    if (virDomainNumatuneMaybeFormatNodeset(vm->def->numatune,
626
                                            priv->autoNodeset,
627
                                            &mem_mask, -1) < 0)
628
        goto cleanup;
629

630 631 632 633 634
    if (mem_mask)
        if (virCgroupNewEmulator(priv->cgroup, false, &cgroup_temp) < 0 ||
            virCgroupSetCpusetMems(cgroup_temp, mem_mask) < 0 ||
            virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0)
            goto cleanup;
635

636 637 638
    ret = 0;
 cleanup:
    VIR_FREE(mem_mask);
639
    virCgroupFree(&cgroup_temp);
640 641 642 643 644 645 646 647 648 649 650 651 652 653 654
    return ret;
}


static int
qemuSetupCpusetCgroup(virDomainObjPtr vm,
                      virCapsPtr caps)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    char *cpu_mask = NULL;
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

655 656 657 658 659
    if (vm->def->cpumask ||
        (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)) {

        if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
            virBitmapPtr cpumap;
660
            if (!(cpumap = virCapabilitiesGetCpusForNodemask(caps, priv->autoNodeset)))
661 662 663 664 665 666 667
                goto cleanup;
            cpu_mask = virBitmapFormat(cpumap);
            virBitmapFree(cpumap);
        } else {
            cpu_mask = virBitmapFormat(vm->def->cpumask);
        }

668
        if (!cpu_mask)
669 670
            goto cleanup;

671
        if (virCgroupSetCpusetCpus(priv->cgroup, cpu_mask) < 0)
672 673 674
            goto cleanup;
    }

675
    ret = 0;
676
 cleanup:
677
    VIR_FREE(cpu_mask);
678 679 680 681
    return ret;
}


682
static int
683 684
qemuSetupCpuCgroup(virQEMUDriverPtr driver,
                   virDomainObjPtr vm)
685 686
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
687 688 689 690
    virObjectEventPtr event = NULL;
    virTypedParameterPtr eventParams = NULL;
    int eventNparams = 0;
    int eventMaxparams = 0;
691 692

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
693
       if (vm->def->cputune.sharesSpecified) {
694 695 696 697 698 699 700 701
           virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                          _("CPU tuning is not available on this host"));
           return -1;
       } else {
           return 0;
       }
    }

702 703 704 705 706 707 708
    if (vm->def->cputune.sharesSpecified) {
        unsigned long long val;
        if (virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares) < 0)
            return -1;

        if (virCgroupGetCpuShares(priv->cgroup, &val) < 0)
            return -1;
709 710 711 712
        if (vm->def->cputune.shares != val) {
            vm->def->cputune.shares = val;
            if (virTypedParamsAddULLong(&eventParams, &eventNparams,
                                        &eventMaxparams,
713
                                        VIR_DOMAIN_TUNABLE_CPU_CPU_SHARES,
714 715 716 717 718 719 720
                                        val) < 0)
                return -1;

            event = virDomainEventTunableNewFromObj(vm, eventParams, eventNparams);
        }

        if (event)
721
            qemuDomainEventQueue(driver, event);
722
    }
723 724 725 726 727

    return 0;
}


728
static int
729
qemuInitCgroup(virQEMUDriverPtr driver,
730
               virDomainObjPtr vm)
731
{
732
    int ret = -1;
733 734 735
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);

736 737 738
    if (!cfg->privileged)
        goto done;

739 740 741
    if (!virCgroupAvailable())
        goto done;

742 743
    virCgroupFree(&priv->cgroup);

744
    if (!vm->def->resource) {
745 746
        virDomainResourceDefPtr res;

747
        if (VIR_ALLOC(res) < 0)
748
            goto cleanup;
749

750
        if (VIR_STRDUP(res->partition, "/machine") < 0) {
751 752 753 754 755
            VIR_FREE(res);
            goto cleanup;
        }

        vm->def->resource = res;
756 757
    }

758 759 760 761 762 763
    if (vm->def->resource->partition[0] != '/') {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Resource partition '%s' must start with '/'"),
                       vm->def->resource->partition);
        goto cleanup;
    }
764 765 766 767 768 769 770 771 772 773 774

    if (virCgroupNewMachine(vm->def->name,
                            "qemu",
                            cfg->privileged,
                            vm->def->uuid,
                            NULL,
                            vm->pid,
                            false,
                            vm->def->resource->partition,
                            cfg->cgroupControllers,
                            &priv->cgroup) < 0) {
775 776
        if (virCgroupNewIgnoreError())
            goto done;
777

778 779
        goto cleanup;
    }
780

781
 done:
782
    ret = 0;
783
 cleanup:
784 785 786
    virObjectUnref(cfg);
    return ret;
}
787

788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804

/**
 * qemuConnectCgroup:
 * @driver: QEMU driver, used to fetch the configuration
 * @vm: domain object
 *
 * Detects the existing machine cgroup of the domain's process and
 * stores it in priv->cgroup, releasing any cgroup held there before.
 * Succeeds without doing anything when the driver is not privileged
 * or cgroups are not available.
 *
 * Returns 0 on success, -1 on error.
 */
int
qemuConnectCgroup(virQEMUDriverPtr driver,
                  virDomainObjPtr vm)
{
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
    qemuDomainObjPrivatePtr priv = vm->privateData;
    const char *partition = NULL;
    int ret = -1;

    if (!cfg->privileged || !virCgroupAvailable())
        goto done;

    virCgroupFree(&priv->cgroup);

    /* a domain without a resource definition has no partition hint */
    if (vm->def->resource)
        partition = vm->def->resource->partition;

    if (virCgroupNewDetectMachine(vm->def->name,
                                  "qemu",
                                  vm->pid,
                                  partition,
                                  cfg->cgroupControllers,
                                  &priv->cgroup) < 0)
        goto cleanup;

 done:
    ret = 0;
 cleanup:
    virObjectUnref(cfg);
    return ret;
}

822 823
int
qemuSetupCgroup(virQEMUDriverPtr driver,
824
                virDomainObjPtr vm)
825
{
826
    qemuDomainObjPrivatePtr priv = vm->privateData;
827
    virCapsPtr caps = NULL;
828
    int ret = -1;
829

830 831 832 833 834 835
    if (!vm->pid) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Cannot setup cgroups until process is started"));
        return -1;
    }

836
    if (qemuInitCgroup(driver, vm) < 0)
837
        return -1;
838

839
    if (!priv->cgroup)
840
        return 0;
841

842 843 844
    if (!(caps = virQEMUDriverGetCapabilities(driver, false)))
        goto cleanup;

845 846
    if (qemuSetupDevicesCgroup(driver, vm) < 0)
        goto cleanup;
847

848 849
    if (qemuSetupBlkioCgroup(vm) < 0)
        goto cleanup;
850

851 852
    if (qemuSetupMemoryCgroup(vm) < 0)
        goto cleanup;
853

854
    if (qemuSetupCpuCgroup(driver, vm) < 0)
855
        goto cleanup;
856

857
    if (qemuSetupCpusetCgroup(vm, caps) < 0)
858
        goto cleanup;
859

860
    ret = 0;
861
 cleanup:
862
    virObjectUnref(caps);
863
    return ret;
864 865
}

866 867 868 869
/**
 * qemuSetupCgroupVcpuBW:
 * @cgroup: cgroup to tune
 * @period: CFS period to set, 0 to leave it unchanged
 * @quota: CFS quota to set, 0 to leave it unchanged
 *
 * Sets the CFS bandwidth of @cgroup.  When both a period and a quota
 * are given and setting the quota fails, the previous period is
 * restored (best effort) and the original error is preserved.
 *
 * Returns 0 on success or when both values are 0, -1 on error.
 */
int
qemuSetupCgroupVcpuBW(virCgroupPtr cgroup,
                      unsigned long long period,
                      long long quota)
{
    unsigned long long old_period = 0;

    if (period == 0 && quota == 0)
        return 0;

    if (period) {
        /* remember the old period so a failed quota can be rolled back */
        if (virCgroupGetCpuCfsPeriod(cgroup, &old_period) < 0)
            return -1;

        if (virCgroupSetCpuCfsPeriod(cgroup, period) < 0)
            return -1;
    }

    if (quota && virCgroupSetCpuCfsQuota(cgroup, quota) < 0) {
        if (period) {
            virErrorPtr saved = virSaveLastError();

            ignore_value(virCgroupSetCpuCfsPeriod(cgroup, old_period));
            if (saved) {
                virSetError(saved);
                virFreeError(saved);
            }
        }

        return -1;
    }

    return 0;
}

904 905 906 907 908
/**
 * qemuSetupCgroupVcpuPin:
 * @cgroup: cgroup of the vcpu
 * @vcpupin: array of vcpu pinning definitions
 * @nvcpupin: number of entries in @vcpupin
 * @vcpuid: id of the vcpu to pin
 *
 * Looks up the pinning definition of @vcpuid and applies its cpumask
 * to @cgroup through qemuSetupCgroupEmulatorPin().
 *
 * Returns the result of qemuSetupCgroupEmulatorPin() for the first
 * matching entry, -1 when no entry matches or @nvcpupin is not
 * positive.
 */
int
qemuSetupCgroupVcpuPin(virCgroupPtr cgroup,
                       virDomainVcpuPinDefPtr *vcpupin,
                       int nvcpupin,
                       int vcpuid)
{
    size_t i;

    /* a negative count would turn into a huge unsigned loop bound */
    if (nvcpupin <= 0)
        return -1;

    for (i = 0; i < (size_t)nvcpupin; i++) {
        if (vcpuid == vcpupin[i]->vcpuid)
            return qemuSetupCgroupEmulatorPin(cgroup, vcpupin[i]->cpumask);
    }

    return -1;
}

920 921 922 923 924 925 926 927 928
/**
 * qemuSetupCgroupIOThreadsPin:
 * @cgroup: cgroup of the IOThread
 * @iothreadspin: array of IOThread pinning definitions
 * @niothreadspin: number of entries in @iothreadspin
 * @iothreadid: id of the IOThread to pin
 *
 * Looks up the pinning definition whose vcpuid field equals
 * @iothreadid and applies its cpumask to @cgroup through
 * qemuSetupCgroupEmulatorPin().
 *
 * Returns the result of qemuSetupCgroupEmulatorPin() for the first
 * matching entry, -1 when no entry matches or @niothreadspin is not
 * positive.
 */
int
qemuSetupCgroupIOThreadsPin(virCgroupPtr cgroup,
                            virDomainVcpuPinDefPtr *iothreadspin,
                            int niothreadspin,
                            int iothreadid)
{
    size_t i;

    /* a negative count would turn into a huge unsigned loop bound */
    if (niothreadspin <= 0)
        return -1;

    for (i = 0; i < (size_t)niothreadspin; i++) {
        if (iothreadid == iothreadspin[i]->vcpuid)
            return qemuSetupCgroupEmulatorPin(cgroup, iothreadspin[i]->cpumask);
    }

    return -1;
}

936 937 938
/**
 * qemuSetupCgroupEmulatorPin:
 * @cgroup: cgroup to pin
 * @cpumask: CPUs the cgroup may run on
 *
 * Formats @cpumask as a string and writes it to cpuset.cpus of
 * @cgroup.
 *
 * Returns 0 on success, -1 if formatting or the write fails.
 */
int
qemuSetupCgroupEmulatorPin(virCgroupPtr cgroup,
                           virBitmapPtr cpumask)
{
    char *cpus = virBitmapFormat(cpumask);
    int ret = -1;

    /* the write is attempted only when formatting produced a string */
    if (cpus && virCgroupSetCpusetCpus(cgroup, cpus) >= 0)
        ret = 0;

    VIR_FREE(cpus);
    return ret;
}

955 956
/*
 * qemuSetupCgroupForVcpu:
 * @vm: domain object
 *
 * For each known vcpu thread: creates a per-vcpu child cgroup below the
 * domain's cgroup, moves the vcpu thread into it, applies the domain's
 * cputune period/quota (when either is set) and, when the CPUSET
 * controller is available and a pinning entry for that vcpu exists,
 * applies the vcpu pinning.
 *
 * Returns 0 on success or when there is nothing to do (no usable
 * controllers, no cgroup, or no per-vcpu thread ids), -1 on failure.
 * On failure only the child cgroup being worked on is removed.
 */
int
qemuSetupCgroupForVcpu(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    unsigned long long period = def->cputune.period;
    long long quota = def->cputune.quota;
    bool tuneBandwidth = period || quota;
    virCgroupPtr vcpuGroup = NULL;
    size_t vcpu;
    size_t pin;

    /* Bandwidth tuning was requested but cannot be honoured. */
    if (tuneBandwidth &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    /*
     * Without the CPU controller neither period nor quota is needed
     * (checked above); without CPUSET there is no pinning to do either,
     * so there is nothing left for this function.
     */
    if (!(virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) ||
          virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)))
        return 0;

    /* CPU pinning can also be achieved with virProcessSetAffinity, so a
     * missing cgroup is not treated as an error.
     */
    if (!priv->cgroup)
        return 0;

    /* Either the VCPU<->PID mapping is unknown or every vcpu runs in
     * the main thread; individual vcpus cannot be controlled then.
     */
    if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) {
        VIR_WARN("Unable to get vcpus' pids.");
        return 0;
    }

    for (vcpu = 0; vcpu < priv->nvcpupids; vcpu++) {
        /* Create the child group and move the vcpu thread into it. */
        if (virCgroupNewVcpu(priv->cgroup, vcpu, true, &vcpuGroup) < 0 ||
            virCgroupAddTask(vcpuGroup, priv->vcpupids[vcpu]) < 0)
            goto error;

        if (tuneBandwidth &&
            qemuSetupCgroupVcpuBW(vcpuGroup, period, quota) < 0)
            goto error;

        /* Apply vcpupin only if the XML provides an entry for this vcpu. */
        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
            /* Make sure an entry exists first; qemuSetupCgroupVcpuPin
             * fails when it cannot find one. */
            for (pin = 0; pin < def->cputune.nvcpupin; pin++) {
                if (def->cputune.vcpupin[pin]->vcpuid == vcpu) {
                    if (qemuSetupCgroupVcpuPin(vcpuGroup,
                                               def->cputune.vcpupin,
                                               def->cputune.nvcpupin,
                                               vcpu) < 0)
                        goto error;

                    break;
                }
            }
        }

        virCgroupFree(&vcpuGroup);
    }

    return 0;

 error:
    if (vcpuGroup) {
        virCgroupRemove(vcpuGroup);
        virCgroupFree(&vcpuGroup);
    }

    return -1;
}

1040 1041
int
qemuSetupCgroupForEmulator(virQEMUDriverPtr driver,
1042
                           virDomainObjPtr vm)
1043
{
1044
    virBitmapPtr cpumask = NULL;
1045
    virBitmapPtr cpumap = NULL;
1046
    virCgroupPtr cgroup_emulator = NULL;
1047
    virDomainDefPtr def = vm->def;
1048
    qemuDomainObjPrivatePtr priv = vm->privateData;
1049 1050
    unsigned long long period = vm->def->cputune.emulator_period;
    long long quota = vm->def->cputune.emulator_quota;
1051

1052
    if ((period || quota) &&
1053
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
1054 1055 1056 1057 1058
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

1059 1060 1061 1062 1063 1064 1065 1066 1067
    /*
     * If CPU cgroup controller is not initialized here, then we need
     * neither period nor quota settings.  And if CPUSET controller is
     * not initialized either, then there's nothing to do anyway.
     */
    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

1068
    if (priv->cgroup == NULL)
1069 1070
        return 0; /* Not supported, so claim success */

1071
    if (virCgroupNewEmulator(priv->cgroup, true, &cgroup_emulator) < 0)
1072 1073
        goto cleanup;

1074
    if (virCgroupMoveTask(priv->cgroup, cgroup_emulator) < 0)
1075
        goto cleanup;
1076

1077
    if (def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
1078
        if (!(cpumap = qemuPrepareCpumap(driver, priv->autoNodeset)))
1079 1080 1081
            goto cleanup;
        cpumask = cpumap;
    } else if (def->cputune.emulatorpin) {
1082
        cpumask = def->cputune.emulatorpin->cpumask;
1083
    } else if (def->cpumask) {
1084
        cpumask = def->cpumask;
1085
    }
1086 1087

    if (cpumask) {
1088 1089 1090
        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET) &&
            qemuSetupCgroupEmulatorPin(cgroup_emulator, cpumask) < 0)
            goto cleanup;
H
Hu Tao 已提交
1091
    }
1092

1093
    if (period || quota) {
1094 1095 1096 1097
        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
            qemuSetupCgroupVcpuBW(cgroup_emulator, period,
                                  quota) < 0)
            goto cleanup;
1098 1099
    }

1100
    virCgroupFree(&cgroup_emulator);
1101
    virBitmapFree(cpumap);
1102 1103
    return 0;

1104
 cleanup:
1105 1106
    virBitmapFree(cpumap);

1107 1108 1109 1110 1111
    if (cgroup_emulator) {
        virCgroupRemove(cgroup_emulator);
        virCgroupFree(&cgroup_emulator);
    }

1112
    return -1;
1113
}
1114

1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146
/*
 * qemuSetupCgroupForIOThreads:
 * @vm: domain object
 *
 * For each known iothread: creates a per-iothread child cgroup below
 * the domain's cgroup, moves the iothread into it, applies the domain's
 * cputune period/quota (when either is set) and, when the CPUSET
 * controller is available and a pinning entry for that iothread exists,
 * applies the iothread pinning.
 *
 * Returns 0 on success or when there is nothing to do (no usable
 * controllers, no cgroup, or iothreads configured but no thread ids
 * known), -1 on failure. On failure only the child cgroup being worked
 * on is removed.
 */
int
qemuSetupCgroupForIOThreads(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    unsigned long long period = def->cputune.period;
    long long quota = def->cputune.quota;
    bool tuneBandwidth = period || quota;
    virCgroupPtr iothreadGroup = NULL;
    size_t idx;
    size_t pin;

    /* Bandwidth tuning was requested but cannot be honoured. */
    if (tuneBandwidth &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    /*
     * Without the CPU controller neither period nor quota is needed
     * (checked above); without CPUSET there is no pinning to do either,
     * so there is nothing left for this function.
     */
    if (!(virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) ||
          virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)))
        return 0;

    /* CPU pinning can also be achieved with virProcessSetAffinity, so a
     * missing cgroup is not treated as an error.
     */
    if (!priv->cgroup)
        return 0;

    if (def->iothreads && priv->niothreadpids == 0) {
        VIR_WARN("Unable to get iothreads' pids.");
        return 0;
    }

    for (idx = 0; idx < priv->niothreadpids; idx++) {
        /* IOThread ids run 1..n while the pid array is indexed 0..n-1,
         * hence the "idx + 1" wherever an iothread id is needed.
         */
        if (virCgroupNewIOThread(priv->cgroup, idx + 1, true,
                                 &iothreadGroup) < 0 ||
            virCgroupAddTask(iothreadGroup, priv->iothreadpids[idx]) < 0)
            goto error;

        if (tuneBandwidth &&
            qemuSetupCgroupVcpuBW(iothreadGroup, period, quota) < 0)
            goto error;

        /* Apply iothreadpin only if the XML provides an entry for it. */
        if (virCgroupHasController(priv->cgroup,
                                   VIR_CGROUP_CONTROLLER_CPUSET)) {
            /* Make sure an entry exists first; qemuSetupCgroupIOThreadsPin
             * fails when it cannot find one. */
            for (pin = 0; pin < def->cputune.niothreadspin; pin++) {
                if (def->cputune.iothreadspin[pin]->vcpuid == idx + 1) {
                    if (qemuSetupCgroupIOThreadsPin(iothreadGroup,
                                                    def->cputune.iothreadspin,
                                                    def->cputune.niothreadspin,
                                                    idx + 1) < 0)
                        goto error;

                    break;
                }
            }
        }

        virCgroupFree(&iothreadGroup);
    }

    return 0;

 error:
    if (iothreadGroup) {
        virCgroupRemove(iothreadGroup);
        virCgroupFree(&iothreadGroup);
    }

    return -1;
}

1203
int
1204 1205
qemuRemoveCgroup(virQEMUDriverPtr driver,
                 virDomainObjPtr vm)
1206
{
1207
    qemuDomainObjPrivatePtr priv = vm->privateData;
1208
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
1209

1210
    if (priv->cgroup == NULL)
1211 1212
        return 0; /* Not supported, so claim success */

1213 1214 1215 1216 1217 1218 1219
    if (virCgroupTerminateMachine(vm->def->name,
                                  "qemu",
                                  cfg->privileged) < 0) {
        if (!virCgroupNewIgnoreError())
            VIR_DEBUG("Failed to terminate cgroup for %s", vm->def->name);
    }

1220 1221
    virObjectUnref(cfg);

1222
    return virCgroupRemove(priv->cgroup);
1223 1224
}

1225 1226
/*
 * qemuAddToCgroup:
 * @vm: domain object
 *
 * Currently performs no work: it returns 0 both when the domain has no
 * cgroup and when it has one.
 *
 * Returns 0.
 */
int
qemuAddToCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    /* Not supported without a cgroup, so claim success */
    if (!priv->cgroup)
        return 0;

    return 0;
}