/*
 * qemu_cgroup.c: QEMU cgroup management
 *
 * Copyright (C) 2006-2014 Red Hat, Inc.
 * Copyright (C) 2006 Daniel P. Berrange
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * Author: Daniel P. Berrange <berrange@redhat.com>
 */

#include <config.h>

#include "qemu_cgroup.h"
27
#include "qemu_domain.h"
28
#include "qemu_process.h"
29
#include "vircgroup.h"
30
#include "virlog.h"
31
#include "viralloc.h"
32
#include "virerror.h"
33
#include "domain_audit.h"
34
#include "virscsi.h"
35
#include "virstring.h"
36
#include "virfile.h"
37
#include "virtypedparam.h"
38 39 40

#define VIR_FROM_THIS VIR_FROM_QEMU

41 42
VIR_LOG_INIT("qemu.qemu_cgroup");

43 44 45 46
/*
 * NULL-terminated list of device nodes that qemuSetupDevicesCgroup()
 * allows read-write when the driver configuration does not provide its
 * own cgroupDeviceACL.
 */
static const char *const defaultDeviceACL[] = {
    "/dev/null", "/dev/full", "/dev/zero",
    "/dev/random", "/dev/urandom",
    "/dev/ptmx", "/dev/kvm", "/dev/kqemu",
    "/dev/rtc", "/dev/hpet", "/dev/vfio/vfio",
    NULL,
};

/* Character-device major allowed for every guest; audited as "pty". */
#define DEVICE_PTY_MAJOR 136
/* Character-device major allowed when host audio is permitted; audited as "sound". */
#define DEVICE_SND_MAJOR 116

53 54 55 56 57
/*
 * qemuSetImageCgroupInternal:
 * @vm: domain object whose private cgroup is updated
 * @src: storage source whose path is allowed or denied
 * @deny: true to revoke access to the path, false to grant it
 * @forceReadonly: when granting, never add write permission
 *
 * Adjust the devices cgroup ACL for a single image. Nothing is done,
 * and 0 is returned, when the devices controller is not present or
 * when @src has no path or is not local storage. Every allow/deny
 * attempt is recorded through virDomainAuditCgroupPath().
 *
 * Returns 0 on success or when the failure was EACCES (which is
 * deliberately ignored), otherwise the value returned by
 * virCgroupAllowDevicePath() / virCgroupDenyDevicePath().
 */
static int
qemuSetImageCgroupInternal(virDomainObjPtr vm,
                           virStorageSourcePtr src,
                           bool deny,
                           bool forceReadonly)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int perms = VIR_CGROUP_DEVICE_READ;
    int ret;

    if (!virCgroupHasController(priv->cgroup,
                                VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (!src->path || !virStorageSourceIsLocalStorage(src)) {
        VIR_DEBUG("Not updating cgroups for disk path '%s', type: %s",
                  NULLSTR(src->path), virStorageTypeToString(src->type));
        return 0;
    }

    if (deny) {
        /* Revoke everything: read, write and mknod. */
        perms |= VIR_CGROUP_DEVICE_WRITE | VIR_CGROUP_DEVICE_MKNOD;

        VIR_DEBUG("Deny path %s", src->path);

        ret = virCgroupDenyDevicePath(priv->cgroup, src->path, perms);
    } else {
        /* Write access only for writable images the caller did not
         * force read-only. */
        if (!src->readonly && !forceReadonly)
            perms |= VIR_CGROUP_DEVICE_WRITE;

        VIR_DEBUG("Allow path %s, perms: %s",
                  src->path, virCgroupGetDevicePermsString(perms));

        ret = virCgroupAllowDevicePath(priv->cgroup, src->path, perms);
    }

    virDomainAuditCgroupPath(vm, priv->cgroup,
                             deny ? "deny" : "allow",
                             src->path,
                             virCgroupGetDevicePermsString(perms),
                             ret == 0);

    /* Get this for root squash NFS */
    if (ret < 0 &&
        virLastErrorIsSystemErrno(EACCES)) {
        VIR_DEBUG("Ignoring EACCES for %s", src->path);
        virResetLastError();
        ret = 0;
    }

    return ret;
}


107 108 109 110 111 112 113 114 115
/*
 * qemuSetImageCgroup:
 * @vm: domain object
 * @src: storage source to allow or deny
 * @deny: true to deny access, false to allow it
 *
 * Public entry point for a single image; forwards to
 * qemuSetImageCgroupInternal() without forcing read-only access.
 *
 * Returns whatever qemuSetImageCgroupInternal() returns.
 */
int
qemuSetImageCgroup(virDomainObjPtr vm,
                   virStorageSourcePtr src,
                   bool deny)
{
    const bool forceReadonly = false;

    return qemuSetImageCgroupInternal(vm, src, deny, forceReadonly);
}


116 117 118
/*
 * qemuSetupDiskCgroup:
 * @vm: domain object
 * @disk: disk whose image chain should be allowed
 *
 * Walk @disk's source and every element of its backingStore chain and
 * allow each one in the devices cgroup. Only the first (top level)
 * image may receive write permission; all backing images are forced
 * read-only.
 *
 * Returns 0 on success, -1 as soon as one image fails.
 */
int
qemuSetupDiskCgroup(virDomainObjPtr vm,
                    virDomainDiskDefPtr disk)
{
    virStorageSourcePtr next;
    bool forceReadonly = false;

    for (next = disk->src; next; next = next->backingStore) {
        if (qemuSetImageCgroupInternal(vm, next, false, forceReadonly) < 0)
            return -1;

        /* setup only the top level image for read-write */
        forceReadonly = true;
    }

    return 0;
}


135 136 137
/*
 * qemuTeardownDiskCgroup:
 * @vm: domain object
 * @disk: disk whose image chain should be denied
 *
 * Deny, in the devices cgroup, @disk's source and every element of its
 * backingStore chain.
 *
 * Returns 0 on success, -1 as soon as one image fails (later images in
 * the chain are then left untouched).
 */
int
qemuTeardownDiskCgroup(virDomainObjPtr vm,
                       virDomainDiskDefPtr disk)
{
    virStorageSourcePtr next;

    for (next = disk->src; next; next = next->backingStore) {
        if (qemuSetImageCgroup(vm, next, true) < 0)
            return -1;
    }

    return 0;
}

149

150
static int
151
qemuSetupChrSourceCgroup(virDomainDefPtr def ATTRIBUTE_UNUSED,
152
                         virDomainChrSourceDefPtr dev,
153
                         void *opaque)
154
{
155 156
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
157
    int ret;
158

159
    if (dev->type != VIR_DOMAIN_CHR_TYPE_DEV)
160 161
        return 0;

162
    VIR_DEBUG("Process path '%s' for device", dev->data.file.path);
163

164 165
    ret = virCgroupAllowDevicePath(priv->cgroup, dev->data.file.path,
                                   VIR_CGROUP_DEVICE_RW);
166
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
167
                             dev->data.file.path, "rw", ret == 0);
168

169
    return ret;
170 171
}

172 173 174 175 176
/*
 * qemuSetupChardevCgroup:
 * @def: domain definition
 * @dev: character device definition
 * @opaque: the virDomainObjPtr the cgroup belongs to
 *
 * Adapter used as the virDomainChrDefForeach() callback: applies
 * qemuSetupChrSourceCgroup() to the source of @dev.
 *
 * Returns the result of qemuSetupChrSourceCgroup().
 */
static int
qemuSetupChardevCgroup(virDomainDefPtr def,
                       virDomainChrDefPtr dev,
                       void *opaque)
{
    return qemuSetupChrSourceCgroup(def, &dev->source, opaque);
}


/*
 * qemuSetupTPMCgroup:
 * @def: domain definition
 * @dev: TPM device definition
 * @opaque: the virDomainObjPtr the cgroup belongs to
 *
 * For a passthrough TPM, allow its character device source through
 * qemuSetupChrSourceCgroup(). Other TPM types need nothing.
 *
 * Returns 0 when nothing had to be done, otherwise the result of
 * qemuSetupChrSourceCgroup().
 */
static int
qemuSetupTPMCgroup(virDomainDefPtr def,
                   virDomainTPMDefPtr dev,
                   void *opaque)
{
    int ret = 0;

    switch (dev->type) {
    case VIR_DOMAIN_TPM_TYPE_PASSTHROUGH:
        ret = qemuSetupChrSourceCgroup(def, &dev->data.passthrough.source,
                                       opaque);
        break;
    case VIR_DOMAIN_TPM_TYPE_LAST:
        break;
    }

    return ret;
}

200

201
static int
202
qemuSetupHostUSBDeviceCgroup(virUSBDevicePtr dev ATTRIBUTE_UNUSED,
203 204
                             const char *path,
                             void *opaque)
205
{
206 207
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
208
    int ret;
209 210

    VIR_DEBUG("Process path '%s' for USB device", path);
211 212 213
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path, "rw", ret == 0);
214

215
    return ret;
216 217
}

218
static int
219
qemuSetupHostSCSIDeviceCgroup(virSCSIDevicePtr dev ATTRIBUTE_UNUSED,
220 221 222 223 224
                              const char *path,
                              void *opaque)
{
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
225
    int ret;
226 227 228

    VIR_DEBUG("Process path '%s' for SCSI device", path);

229 230 231 232
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   virSCSIDeviceGetReadonly(dev) ?
                                   VIR_CGROUP_DEVICE_READ :
                                   VIR_CGROUP_DEVICE_RW);
233 234

    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path,
235
                             virSCSIDeviceGetReadonly(dev) ? "r" : "rw", ret == 0);
236

237
    return ret;
238
}
239

240 241 242 243 244 245
/*
 * qemuSetupHostdevCGroup:
 * @vm: domain object
 * @dev: host device definition being attached
 *
 * Allow, in the devices cgroup, the host nodes needed by @dev:
 *  - PCI with the VFIO backend: the IOMMU group device, read-write;
 *  - USB: every file reported by virUSBDeviceFileIterate(), unless the
 *    device is flagged as missing;
 *  - SCSI: every file reported by virSCSIDeviceFileIterate(), except
 *    for iSCSI sources which are skipped.
 * Other hostdev modes and subsystem types are ignored.
 *
 * Returns 0 on success or when the devices controller is absent,
 * -1 on failure.
 */
int
qemuSetupHostdevCGroup(virDomainObjPtr vm,
                       virDomainHostdevDefPtr dev)
{
    int ret = -1;
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainHostdevSubsysUSBPtr usbsrc = &dev->source.subsys.u.usb;
    virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci;
    virDomainHostdevSubsysSCSIPtr scsisrc = &dev->source.subsys.u.scsi;
    virPCIDevicePtr pci = NULL;
    virUSBDevicePtr usb = NULL;
    virSCSIDevicePtr scsi = NULL;
    char *path = NULL;

    /* This is called for *all* hostdev devices; anything that is not
     * a VFIO PCI, USB or host SCSI subsystem device falls through
     * without touching the cgroup.
     */

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS) {

        switch (dev->source.subsys.type) {
        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI:
            if (pcisrc->backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
                int rv;

                pci = virPCIDeviceNew(pcisrc->addr.domain,
                                      pcisrc->addr.bus,
                                      pcisrc->addr.slot,
                                      pcisrc->addr.function);
                if (!pci)
                    goto cleanup;

                if (!(path = virPCIDeviceGetIOMMUGroupDev(pci)))
                    goto cleanup;

                VIR_DEBUG("Cgroup allow %s for PCI device assignment", path);
                rv = virCgroupAllowDevicePath(priv->cgroup, path,
                                              VIR_CGROUP_DEVICE_RW);
                virDomainAuditCgroupPath(vm, priv->cgroup,
                                         "allow", path, "rw", rv == 0);
                if (rv < 0)
                    goto cleanup;
            }
            break;

        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB:
            /* NB: hostdev->missing wasn't previously checked in the
             * case of hotplug, only when starting a domain. Now it is
             * always checked, and the cgroup setup skipped if true.
             */
            if (dev->missing)
                break;
            if ((usb = virUSBDeviceNew(usbsrc->bus, usbsrc->device,
                                       NULL)) == NULL) {
                goto cleanup;
            }

            /* oddly, qemuSetupHostUSBDeviceCgroup doesn't ever
             * reference the usb object we just created
             */
            if (virUSBDeviceFileIterate(usb, qemuSetupHostUSBDeviceCgroup,
                                        vm) < 0) {
                goto cleanup;
            }
            break;

        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_SCSI: {
            if (scsisrc->protocol ==
                VIR_DOMAIN_HOSTDEV_SCSI_PROTOCOL_TYPE_ISCSI) {
                virDomainHostdevSubsysSCSIiSCSIPtr iscsisrc = &scsisrc->u.iscsi;
                /* Follow qemuSetupDiskCgroup() and qemuSetImageCgroupInternal()
                 * which does nothing for non local storage
                 */
                VIR_DEBUG("Not updating cgroups for hostdev iSCSI path '%s'",
                          iscsisrc->path);
            } else {
                virDomainHostdevSubsysSCSIHostPtr scsihostsrc =
                    &scsisrc->u.host;
                if ((scsi = virSCSIDeviceNew(NULL,
                                             scsihostsrc->adapter,
                                             scsihostsrc->bus,
                                             scsihostsrc->target,
                                             scsihostsrc->unit,
                                             dev->readonly,
                                             dev->shareable)) == NULL)
                    goto cleanup;

                if (virSCSIDeviceFileIterate(scsi,
                                             qemuSetupHostSCSIDeviceCgroup,
                                             vm) < 0)
                    goto cleanup;
            }
            break;
        }

        default:
            break;
        }
    }

    ret = 0;
 cleanup:
    virPCIDeviceFree(pci);
    virUSBDeviceFree(usb);
    virSCSIDeviceFree(scsi);
    VIR_FREE(path);
    return ret;
}

/*
 * qemuTeardownHostdevCgroup:
 * @vm: domain object
 * @dev: host device definition being detached
 *
 * Revoke the devices cgroup access granted for @dev. Only PCI devices
 * using the VFIO backend have anything to undo: their IOMMU group
 * device is denied for read, write and mknod. USB and all other
 * subsystem types are no-ops here.
 *
 * Returns 0 on success or when the devices controller is absent,
 * -1 on failure.
 */
int
qemuTeardownHostdevCgroup(virDomainObjPtr vm,
                          virDomainHostdevDefPtr dev)
{
    int ret = -1;
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci;
    virPCIDevicePtr pci = NULL;
    char *path = NULL;

    /* currently this only does something for PCI devices using vfio
     * for device assignment, but it is called for *all* hostdev
     * devices.
     */

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS) {

        switch (dev->source.subsys.type) {
        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI:
            if (pcisrc->backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
                int rv;

                pci = virPCIDeviceNew(pcisrc->addr.domain,
                                      pcisrc->addr.bus,
                                      pcisrc->addr.slot,
                                      pcisrc->addr.function);
                if (!pci)
                    goto cleanup;

                if (!(path = virPCIDeviceGetIOMMUGroupDev(pci)))
                    goto cleanup;

                VIR_DEBUG("Cgroup deny %s for PCI device assignment", path);
                rv = virCgroupDenyDevicePath(priv->cgroup, path,
                                             VIR_CGROUP_DEVICE_RWM);
                virDomainAuditCgroupPath(vm, priv->cgroup,
                                         "deny", path, "rwm", rv == 0);
                if (rv < 0)
                    goto cleanup;
            }
            break;
        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB:
            /* nothing to tear down for USB */
            break;
        default:
            break;
        }
    }

    ret = 0;
 cleanup:
    virPCIDeviceFree(pci);
    VIR_FREE(path);
    return ret;
}

412 413 414 415
/*
 * qemuSetupBlkioCgroup:
 * @vm: domain object
 *
 * Apply the domain's block I/O tuning to its cgroup: the global weight
 * and, for each configured device, the weight and the read/write
 * IOPS and bytes-per-second limits. Settings whose value is 0 are
 * skipped.
 *
 * If the blkio controller is missing, this fails only when the
 * definition actually requests blkio tuning.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
qemuSetupBlkioCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    size_t i;

    if (!virCgroupHasController(priv->cgroup,
                                VIR_CGROUP_CONTROLLER_BLKIO)) {
        if (vm->def->blkio.weight || vm->def->blkio.ndevices) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                           _("Block I/O tuning is not available on this host"));
            return -1;
        } else {
            return 0;
        }
    }

    if (vm->def->blkio.weight != 0 &&
        virCgroupSetBlkioWeight(priv->cgroup, vm->def->blkio.weight) < 0)
        return -1;

    if (vm->def->blkio.ndevices) {
        for (i = 0; i < vm->def->blkio.ndevices; i++) {
            virBlkioDevicePtr dev = &vm->def->blkio.devices[i];
            if (dev->weight &&
                (virCgroupSetBlkioDeviceWeight(priv->cgroup, dev->path,
                                               dev->weight) < 0))
                return -1;

            if (dev->riops &&
                (virCgroupSetBlkioDeviceReadIops(priv->cgroup, dev->path,
                                                 dev->riops) < 0))
                return -1;

            if (dev->wiops &&
                (virCgroupSetBlkioDeviceWriteIops(priv->cgroup, dev->path,
                                                  dev->wiops) < 0))
                return -1;

            if (dev->rbps &&
                (virCgroupSetBlkioDeviceReadBps(priv->cgroup, dev->path,
                                                dev->rbps) < 0))
                return -1;

            if (dev->wbps &&
                (virCgroupSetBlkioDeviceWriteBps(priv->cgroup, dev->path,
                                                 dev->wbps) < 0))
                return -1;
        }
    }

    return 0;
}

466

467 468 469 470 471
/*
 * qemuSetupMemoryCgroup:
 * @vm: domain object
 *
 * Apply the domain's hard, soft and swap-hard memory limits to its
 * cgroup. A limit of 0 is skipped.
 *
 * If the memory controller is missing, this fails only when the
 * definition actually sets one of those limits.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
qemuSetupMemoryCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) {
        if (vm->def->mem.hard_limit != 0 ||
            vm->def->mem.soft_limit != 0 ||
            vm->def->mem.swap_hard_limit != 0) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                           _("Memory cgroup is not available on this host"));
            return -1;
        } else {
            return 0;
        }
    }

    if (vm->def->mem.hard_limit != 0 &&
        virCgroupSetMemoryHardLimit(priv->cgroup, vm->def->mem.hard_limit) < 0)
        return -1;

    if (vm->def->mem.soft_limit != 0 &&
        virCgroupSetMemorySoftLimit(priv->cgroup, vm->def->mem.soft_limit) < 0)
        return -1;

    if (vm->def->mem.swap_hard_limit != 0 &&
        virCgroupSetMemSwapHardLimit(priv->cgroup, vm->def->mem.swap_hard_limit) < 0)
        return -1;

    return 0;
}


500 501 502 503 504 505 506
/*
 * qemuSetupDevicesCgroup:
 * @driver: QEMU driver, used to fetch the configuration
 * @vm: domain object
 *
 * Build the devices cgroup whitelist for the domain: deny everything,
 * then allow in turn the disks, the PTY major, optionally the sound
 * major, the configured (or default) device ACL, character devices,
 * the TPM, host devices and RNG backends of type "random".
 *
 * If denying all devices fails with EPERM, whitelisting is disabled
 * with a warning and 0 is returned. ENOENT from allowing an ACL entry
 * or an RNG source does not abort the setup.
 *
 * Returns 0 on success or when the devices controller is absent,
 * -1 on failure.
 */
static int
qemuSetupDevicesCgroup(virQEMUDriverPtr driver,
                       virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = NULL;
    const char *const *deviceACL = NULL;
    int rv = -1;
    int ret = -1;
    size_t i;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    rv = virCgroupDenyAllDevices(priv->cgroup);
    virDomainAuditCgroup(vm, priv->cgroup, "deny", "all", rv == 0);
    if (rv < 0) {
        if (virLastErrorIsSystemErrno(EPERM)) {
            virResetLastError();
            VIR_WARN("Group devices ACL is not accessible, disabling whitelisting");
            return 0;
        }

        goto cleanup;
    }

    for (i = 0; i < vm->def->ndisks; i++) {
        if (qemuSetupDiskCgroup(vm, vm->def->disks[i]) < 0)
            goto cleanup;
    }

    rv = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_PTY_MAJOR,
                                   VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_PTY_MAJOR,
                              "pty", "rw", rv == 0);
    if (rv < 0)
        goto cleanup;

    cfg = virQEMUDriverGetConfig(driver);
    deviceACL = cfg->cgroupDeviceACL ?
                (const char *const *)cfg->cgroupDeviceACL :
                defaultDeviceACL;

    /* Sound devices are allowed only when the guest has a sound card
     * and either has no graphics with nogfxAllowHostAudio set, or its
     * first graphics device is VNC with vncAllowHostAudio set, or SDL.
     */
    if (vm->def->nsounds &&
        ((!vm->def->ngraphics && cfg->nogfxAllowHostAudio) ||
         (vm->def->graphics &&
          ((vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_VNC &&
           cfg->vncAllowHostAudio) ||
           (vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_SDL))))) {
        rv = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_SND_MAJOR,
                                       VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_SND_MAJOR,
                                  "sound", "rw", rv == 0);
        if (rv < 0)
            goto cleanup;
    }

    for (i = 0; deviceACL[i] != NULL; i++) {
        if (!virFileExists(deviceACL[i])) {
            VIR_DEBUG("Ignoring non-existent device %s", deviceACL[i]);
            continue;
        }

        rv = virCgroupAllowDevicePath(priv->cgroup, deviceACL[i],
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, priv->cgroup, "allow", deviceACL[i], "rw", rv == 0);
        if (rv < 0 &&
            !virLastErrorIsSystemErrno(ENOENT))
            goto cleanup;
    }

    if (virDomainChrDefForeach(vm->def,
                               true,
                               qemuSetupChardevCgroup,
                               vm) < 0)
        goto cleanup;

    if (vm->def->tpm &&
        (qemuSetupTPMCgroup(vm->def,
                            vm->def->tpm,
                            vm) < 0))
        goto cleanup;

    for (i = 0; i < vm->def->nhostdevs; i++) {
        if (qemuSetupHostdevCGroup(vm, vm->def->hostdevs[i]) < 0)
            goto cleanup;
    }

    for (i = 0; i < vm->def->nrngs; i++) {
        if (vm->def->rngs[i]->backend == VIR_DOMAIN_RNG_BACKEND_RANDOM) {
            VIR_DEBUG("Setting Cgroup ACL for RNG device");
            rv = virCgroupAllowDevicePath(priv->cgroup,
                                          vm->def->rngs[i]->source.file,
                                          VIR_CGROUP_DEVICE_RW);
            virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
                                     vm->def->rngs[i]->source.file,
                                     "rw", rv == 0);
            if (rv < 0 &&
                !virLastErrorIsSystemErrno(ENOENT))
                goto cleanup;
        }
    }

    ret = 0;
 cleanup:
    virObjectUnref(cfg);
    return ret;
}


610
static int
611 612
qemuSetupCpusetMems(virDomainObjPtr vm,
                    virBitmapPtr nodemask)
613
{
614
    virCgroupPtr cgroup_temp = NULL;
615
    qemuDomainObjPrivatePtr priv = vm->privateData;
616
    char *mem_mask = NULL;
617 618 619 620 621
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

622 623 624 625
    if (virDomainNumatuneGetMode(vm->def->numatune, -1) !=
        VIR_DOMAIN_NUMATUNE_MEM_STRICT)
        return 0;

626 627
    if (virDomainNumatuneMaybeFormatNodeset(vm->def->numatune,
                                            nodemask,
628
                                            &mem_mask, -1) < 0)
629
        goto cleanup;
630

631 632 633 634 635
    if (mem_mask)
        if (virCgroupNewEmulator(priv->cgroup, false, &cgroup_temp) < 0 ||
            virCgroupSetCpusetMems(cgroup_temp, mem_mask) < 0 ||
            virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0)
            goto cleanup;
636

637 638 639
    ret = 0;
 cleanup:
    VIR_FREE(mem_mask);
640
    virCgroupFree(&cgroup_temp);
641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656
    return ret;
}


/*
 * qemuSetupCpusetCgroup:
 * @vm: domain object
 * @nodemask: NUMA node mask used for automatic placement
 * @caps: host capabilities used to map @nodemask to CPUs
 *
 * Write cpuset.cpus for the domain cgroup. With automatic placement
 * the CPU set is derived from @nodemask via
 * virCapabilitiesGetCpusForNodemask(); otherwise the domain's own
 * cpumask is used. Nothing is written when neither applies or when the
 * cpuset controller is absent.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
qemuSetupCpusetCgroup(virDomainObjPtr vm,
                      virBitmapPtr nodemask,
                      virCapsPtr caps)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    char *cpu_mask = NULL;
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

    if (vm->def->cpumask ||
        (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)) {

        if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
            virBitmapPtr cpumap;
            if (!(cpumap = virCapabilitiesGetCpusForNodemask(caps, nodemask)))
                goto cleanup;
            cpu_mask = virBitmapFormat(cpumap);
            virBitmapFree(cpumap);
        } else {
            cpu_mask = virBitmapFormat(vm->def->cpumask);
        }

        if (!cpu_mask)
            goto cleanup;

        if (virCgroupSetCpusetCpus(priv->cgroup, cpu_mask) < 0)
            goto cleanup;
    }

    ret = 0;
 cleanup:
    VIR_FREE(cpu_mask);
    return ret;
}


684
static int
685 686
qemuSetupCpuCgroup(virQEMUDriverPtr driver,
                   virDomainObjPtr vm)
687 688
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
689 690 691 692
    virObjectEventPtr event = NULL;
    virTypedParameterPtr eventParams = NULL;
    int eventNparams = 0;
    int eventMaxparams = 0;
693 694

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
695
       if (vm->def->cputune.sharesSpecified) {
696 697 698 699 700 701 702 703
           virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                          _("CPU tuning is not available on this host"));
           return -1;
       } else {
           return 0;
       }
    }

704 705 706 707 708 709 710
    if (vm->def->cputune.sharesSpecified) {
        unsigned long long val;
        if (virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares) < 0)
            return -1;

        if (virCgroupGetCpuShares(priv->cgroup, &val) < 0)
            return -1;
711 712 713 714
        if (vm->def->cputune.shares != val) {
            vm->def->cputune.shares = val;
            if (virTypedParamsAddULLong(&eventParams, &eventNparams,
                                        &eventMaxparams,
715
                                        VIR_DOMAIN_TUNABLE_CPU_CPU_SHARES,
716 717 718 719 720 721 722
                                        val) < 0)
                return -1;

            event = virDomainEventTunableNewFromObj(vm, eventParams, eventNparams);
        }

        if (event)
723
            qemuDomainEventQueue(driver, event);
724
    }
725 726 727 728 729

    return 0;
}


730
static int
731
qemuInitCgroup(virQEMUDriverPtr driver,
732
               virDomainObjPtr vm)
733
{
734
    int ret = -1;
735 736 737
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);

738 739 740
    if (!cfg->privileged)
        goto done;

741 742 743
    if (!virCgroupAvailable())
        goto done;

744 745
    virCgroupFree(&priv->cgroup);

746
    if (!vm->def->resource) {
747 748
        virDomainResourceDefPtr res;

749
        if (VIR_ALLOC(res) < 0)
750
            goto cleanup;
751

752
        if (VIR_STRDUP(res->partition, "/machine") < 0) {
753 754 755 756 757
            VIR_FREE(res);
            goto cleanup;
        }

        vm->def->resource = res;
758 759
    }

760 761 762 763 764 765
    if (vm->def->resource->partition[0] != '/') {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Resource partition '%s' must start with '/'"),
                       vm->def->resource->partition);
        goto cleanup;
    }
766 767 768 769 770 771 772 773 774 775 776

    if (virCgroupNewMachine(vm->def->name,
                            "qemu",
                            cfg->privileged,
                            vm->def->uuid,
                            NULL,
                            vm->pid,
                            false,
                            vm->def->resource->partition,
                            cfg->cgroupControllers,
                            &priv->cgroup) < 0) {
777 778
        if (virCgroupNewIgnoreError())
            goto done;
779

780 781
        goto cleanup;
    }
782

783
 done:
784
    ret = 0;
785
 cleanup:
786 787 788
    virObjectUnref(cfg);
    return ret;
}
789

790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806

/*
 * qemuConnectCgroup:
 * @driver: QEMU driver, used to fetch the configuration
 * @vm: domain object
 *
 * Detect the cgroup of an existing domain process via
 * virCgroupNewDetectMachine() and store it in the domain's private
 * data, replacing any previous one. When the driver is unprivileged
 * or cgroups are unavailable this succeeds without doing anything.
 *
 * Returns 0 on success, -1 on failure.
 */
int
qemuConnectCgroup(virQEMUDriverPtr driver,
                  virDomainObjPtr vm)
{
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int ret = -1;

    if (!cfg->privileged)
        goto done;

    if (!virCgroupAvailable())
        goto done;

    virCgroupFree(&priv->cgroup);

    if (virCgroupNewDetectMachine(vm->def->name,
                                  "qemu",
                                  vm->pid,
                                  vm->def->resource ?
                                  vm->def->resource->partition :
                                  NULL,
                                  cfg->cgroupControllers,
                                  &priv->cgroup) < 0)
        goto cleanup;

 done:
    ret = 0;
 cleanup:
    virObjectUnref(cfg);
    return ret;
}

824 825 826 827
/*
 * qemuSetupCgroup:
 * @driver: QEMU driver
 * @vm: domain object; its process must already be started
 * @nodemask: NUMA node mask forwarded to the cpuset setup
 *
 * Create the domain cgroup and configure, in order, the devices,
 * blkio, memory, cpu and cpuset controllers. If initialisation
 * succeeds without producing a cgroup, nothing further is done.
 *
 * Returns 0 on success, -1 on failure (including when the domain has
 * no pid yet).
 */
int
qemuSetupCgroup(virQEMUDriverPtr driver,
                virDomainObjPtr vm,
                virBitmapPtr nodemask)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virCapsPtr caps = NULL;
    int ret = -1;

    if (!vm->pid) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Cannot setup cgroups until process is started"));
        return -1;
    }

    if (qemuInitCgroup(driver, vm) < 0)
        return -1;

    if (!priv->cgroup)
        return 0;

    if (!(caps = virQEMUDriverGetCapabilities(driver, false)))
        goto cleanup;

    if (qemuSetupDevicesCgroup(driver, vm) < 0)
        goto cleanup;

    if (qemuSetupBlkioCgroup(vm) < 0)
        goto cleanup;

    if (qemuSetupMemoryCgroup(vm) < 0)
        goto cleanup;

    if (qemuSetupCpuCgroup(driver, vm) < 0)
        goto cleanup;

    if (qemuSetupCpusetCgroup(vm, nodemask, caps) < 0)
        goto cleanup;

    ret = 0;
 cleanup:
    virObjectUnref(caps);
    return ret;
}

869 870 871 872 873 874 875
/*
 * qemuSetupCgroupPostInit:
 * @vm: domain object
 * @nodemask: NUMA node mask forwarded to qemuSetupCpusetMems()
 *
 * Second-stage cgroup setup; currently limited to configuring
 * cpuset.mems through qemuSetupCpusetMems().
 *
 * Returns the result of qemuSetupCpusetMems().
 */
int
qemuSetupCgroupPostInit(virDomainObjPtr vm,
                        virBitmapPtr nodemask)
{
    int rc = qemuSetupCpusetMems(vm, nodemask);

    return rc;
}

876 877 878 879
/*
 * qemuSetupCgroupVcpuBW:
 * @cgroup: cgroup to configure
 * @period: CFS period to set, or 0 to leave it unchanged
 * @quota: CFS quota to set, or 0 to leave it unchanged
 *
 * Set the CFS period and/or quota on @cgroup. When both are 0 nothing
 * is done. If the period was changed and setting the quota then
 * fails, the previous period is restored on a best-effort basis while
 * preserving the original error.
 *
 * Returns 0 on success, -1 on failure.
 */
int
qemuSetupCgroupVcpuBW(virCgroupPtr cgroup,
                      unsigned long long period,
                      long long quota)
{
    /* Only meaningful when @period is non-zero; initialised so it never
     * holds an indeterminate value. */
    unsigned long long old_period = 0;

    if (period == 0 && quota == 0)
        return 0;

    if (period) {
        /* get old period, and we can rollback if set quota failed */
        if (virCgroupGetCpuCfsPeriod(cgroup, &old_period) < 0)
            return -1;

        if (virCgroupSetCpuCfsPeriod(cgroup, period) < 0)
            return -1;
    }

    if (quota &&
        virCgroupSetCpuCfsQuota(cgroup, quota) < 0)
        goto error;

    return 0;

 error:
    if (period) {
        /* Keep the quota error; the rollback must not overwrite it. */
        virErrorPtr saved = virSaveLastError();
        ignore_value(virCgroupSetCpuCfsPeriod(cgroup, old_period));
        if (saved) {
            virSetError(saved);
            virFreeError(saved);
        }
    }

    return -1;
}

914 915 916 917 918
/*
 * qemuSetupCgroupVcpuPin:
 * @cgroup: cgroup to configure
 * @vcpupin: array of vCPU pinning definitions
 * @nvcpupin: number of entries in @vcpupin
 * @vcpuid: id of the vCPU to pin
 *
 * Find the pinning entry whose vcpuid equals @vcpuid and apply its
 * CPU mask to @cgroup through qemuSetupCgroupEmulatorPin().
 *
 * Returns the result of qemuSetupCgroupEmulatorPin() for the matching
 * entry, or -1 when no entry matches.
 */
int
qemuSetupCgroupVcpuPin(virCgroupPtr cgroup,
                       virDomainVcpuPinDefPtr *vcpupin,
                       int nvcpupin,
                       int vcpuid)
{
    size_t i;

    for (i = 0; i < nvcpupin; i++) {
        if (vcpuid == vcpupin[i]->vcpuid)
            return qemuSetupCgroupEmulatorPin(cgroup, vcpupin[i]->cpumask);
    }

    return -1;
}

930 931 932 933 934 935 936 937 938
/*
 * qemuSetupCgroupIOThreadsPin:
 * @cgroup: cgroup to configure
 * @iothreadspin: array of IOThread pinning definitions
 * @niothreadspin: number of entries in @iothreadspin
 * @iothreadid: id of the IOThread to pin
 *
 * Find the pinning entry whose vcpuid field equals @iothreadid and
 * apply its CPU mask to @cgroup through qemuSetupCgroupEmulatorPin().
 *
 * Returns the result of qemuSetupCgroupEmulatorPin() for the matching
 * entry, or -1 when no entry matches.
 */
int
qemuSetupCgroupIOThreadsPin(virCgroupPtr cgroup,
                            virDomainVcpuPinDefPtr *iothreadspin,
                            int niothreadspin,
                            int iothreadid)
{
    size_t i;

    for (i = 0; i < niothreadspin; i++) {
        if (iothreadid == iothreadspin[i]->vcpuid)
            return qemuSetupCgroupEmulatorPin(cgroup, iothreadspin[i]->cpumask);
    }

    return -1;
}

946 947 948
/*
 * qemuSetupCgroupEmulatorPin:
 * @cgroup: cgroup to configure
 * @cpumask: bitmap of CPUs to allow
 *
 * Format @cpumask as a string and write it to cpuset.cpus of @cgroup.
 *
 * Returns 0 on success, -1 if formatting or the cgroup write fails.
 */
int
qemuSetupCgroupEmulatorPin(virCgroupPtr cgroup,
                           virBitmapPtr cpumask)
{
    int ret = -1;
    char *new_cpus = NULL;

    if (!(new_cpus = virBitmapFormat(cpumask)))
        goto cleanup;

    if (virCgroupSetCpusetCpus(cgroup, new_cpus) < 0)
        goto cleanup;

    ret = 0;
 cleanup:
    VIR_FREE(new_cpus);
    return ret;
}

965 966
/**
 * qemuSetupCgroupForVcpu:
 * @vm: domain object
 *
 * For each vcpu thread recorded in the domain's private data, obtains a
 * per-vcpu sub-cgroup via virCgroupNewVcpu(), moves the vcpu thread into
 * it, applies the domain's cputune period/quota (if either is set) and,
 * when the CPUSET controller is available, applies the matching vcpupin
 * definition.
 *
 * Returns 0 on success or when there is nothing to do (neither the CPU
 * nor the CPUSET controller is available, there is no cgroup, or the
 * vcpu thread ids are unknown), -1 on failure.  On failure the
 * sub-cgroup that was being set up at that point is removed.
 */
int
qemuSetupCgroupForVcpu(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    unsigned long long period = def->cputune.period;
    long long quota = def->cputune.quota;
    virCgroupPtr vcpuGroup = NULL;
    size_t vcpu;
    size_t pin;

    /* Bandwidth tuning was requested, so the CPU controller is mandatory. */
    if (period || quota) {
        if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                           _("cgroup cpu is required for scheduler tuning"));
            return -1;
        }
    }

    /*
     * Without the CPU controller neither period nor quota is needed,
     * and without the CPUSET controller there is no pinning to apply,
     * so with both missing there is nothing left to do.
     */
    if (!(virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) ||
          virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)))
        return 0;

    /* CPU pinning can also be achieved with virProcessSetAffinity, so
     * a missing cgroup is not treated as an error here.
     */
    if (!priv->cgroup)
        return 0;

    /* When the VCPU<->PID mapping is unknown, or all vcpus run in the
     * same thread as the main process, individual vcpus cannot be
     * controlled.
     */
    if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) {
        VIR_WARN("Unable to get vcpus' pids.");
        return 0;
    }

    for (vcpu = 0; vcpu < priv->nvcpupids; vcpu++) {
        if (virCgroupNewVcpu(priv->cgroup, vcpu, true, &vcpuGroup) < 0)
            goto cleanup;

        /* Move the vcpu thread into its own sub-cgroup. */
        if (virCgroupAddTask(vcpuGroup, priv->vcpupids[vcpu]) < 0)
            goto cleanup;

        if ((period || quota) &&
            qemuSetupCgroupVcpuBW(vcpuGroup, period, quota) < 0)
            goto cleanup;

        /* Apply the vcpupin setting, if one was provided for this vcpu. */
        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
            /* Only call qemuSetupCgroupVcpuPin when a definition for
             * this vcpu exists, since it fails when none is found. */
            for (pin = 0; pin < def->cputune.nvcpupin; pin++) {
                if (def->cputune.vcpupin[pin]->vcpuid == vcpu) {
                    if (qemuSetupCgroupVcpuPin(vcpuGroup,
                                               def->cputune.vcpupin,
                                               def->cputune.nvcpupin,
                                               vcpu) < 0)
                        goto cleanup;

                    break;
                }
            }
        }

        virCgroupFree(&vcpuGroup);
    }

    return 0;

 cleanup:
    /* Drop the sub-cgroup that was being configured when the failure hit. */
    if (vcpuGroup) {
        virCgroupRemove(vcpuGroup);
        virCgroupFree(&vcpuGroup);
    }

    return -1;
}

1050 1051 1052 1053
/**
 * qemuSetupCgroupForEmulator:
 * @driver: QEMU driver, passed to qemuPrepareCpumap() for automatic placement
 * @vm: domain object
 * @nodemask: node mask passed to qemuPrepareCpumap() for automatic placement
 *
 * Obtains the emulator sub-cgroup via virCgroupNewEmulator(), moves the
 * tasks of the domain's cgroup into it via virCgroupMoveTask(), then
 * applies a cpuset (if a cpumask can be determined and the CPUSET
 * controller is available) and the emulator period/quota (if set and
 * the CPU controller is available).
 *
 * Returns 0 on success or when there is nothing to do, -1 on failure.
 * On failure the emulator sub-cgroup, if it was obtained, is removed.
 */
int
qemuSetupCgroupForEmulator(virQEMUDriverPtr driver,
                           virDomainObjPtr vm,
                           virBitmapPtr nodemask)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    unsigned long long period = def->cputune.emulator_period;
    long long quota = def->cputune.emulator_quota;
    virCgroupPtr emulatorGroup = NULL;
    virBitmapPtr autoMap = NULL;   /* owned here, freed on every exit path */
    virBitmapPtr pinMask = NULL;   /* borrowed: points at autoMap or into def */

    /* Bandwidth tuning was requested, so the CPU controller is mandatory. */
    if (period || quota) {
        if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                           _("cgroup cpu is required for scheduler tuning"));
            return -1;
        }
    }

    /*
     * Without the CPU controller neither period nor quota is needed,
     * and without the CPUSET controller there is no pinning to apply,
     * so with both missing there is nothing left to do.
     */
    if (!(virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) ||
          virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)))
        return 0;

    if (!priv->cgroup)
        return 0; /* Not supported, so claim success */

    if (virCgroupNewEmulator(priv->cgroup, true, &emulatorGroup) < 0)
        goto cleanup;

    if (virCgroupMoveTask(priv->cgroup, emulatorGroup) < 0)
        goto cleanup;

    /* Choose the cpumask: automatic placement first, then an explicit
     * emulatorpin, then the domain-wide cpumask (which may be NULL). */
    if (def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
        autoMap = qemuPrepareCpumap(driver, nodemask);
        if (!autoMap)
            goto cleanup;
        pinMask = autoMap;
    } else if (def->cputune.emulatorpin) {
        pinMask = def->cputune.emulatorpin->cpumask;
    } else {
        pinMask = def->cpumask;
    }

    if (pinMask &&
        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET) &&
        qemuSetupCgroupEmulatorPin(emulatorGroup, pinMask) < 0)
        goto cleanup;

    if ((period || quota) &&
        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
        qemuSetupCgroupVcpuBW(emulatorGroup, period, quota) < 0)
        goto cleanup;

    virCgroupFree(&emulatorGroup);
    virBitmapFree(autoMap);
    return 0;

 cleanup:
    virBitmapFree(autoMap);

    /* Drop the emulator sub-cgroup if it had already been obtained. */
    if (emulatorGroup) {
        virCgroupRemove(emulatorGroup);
        virCgroupFree(&emulatorGroup);
    }

    return -1;
}
1125

1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157
/**
 * qemuSetupCgroupForIOThreads:
 * @vm: domain object
 *
 * For each iothread recorded in the domain's private data, obtains a
 * per-iothread sub-cgroup via virCgroupNewIOThread(), moves the iothread
 * into it, applies the domain's cputune period/quota (if either is set)
 * and, when the CPUSET controller is available, applies the matching
 * iothreadspin definition.
 *
 * Returns 0 on success or when there is nothing to do (neither the CPU
 * nor the CPUSET controller is available, there is no cgroup, or the
 * iothread ids are unknown), -1 on failure.  On failure the sub-cgroup
 * that was being set up at that point is removed.
 */
int
qemuSetupCgroupForIOThreads(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    unsigned long long period = def->cputune.period;
    long long quota = def->cputune.quota;
    virCgroupPtr iothreadGroup = NULL;
    size_t idx;
    size_t pin;

    /* Bandwidth tuning was requested, so the CPU controller is mandatory. */
    if (period || quota) {
        if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                           _("cgroup cpu is required for scheduler tuning"));
            return -1;
        }
    }

    /*
     * Without the CPU controller neither period nor quota is needed,
     * and without the CPUSET controller there is no pinning to apply,
     * so with both missing there is nothing left to do.
     */
    if (!(virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) ||
          virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)))
        return 0;

    /* CPU pinning can also be achieved with virProcessSetAffinity, so
     * a missing cgroup is not treated as an error here.
     */
    if (!priv->cgroup)
        return 0;

    if (def->iothreads && priv->niothreadpids == 0) {
        VIR_WARN("Unable to get iothreads' pids.");
        return 0;
    }

    for (idx = 0; idx < priv->niothreadpids; idx++) {
        /* IOThreads are numbered 1..n while the pid array is indexed
         * 0..n-1, hence the offset by one.
         */
        size_t iothreadId = idx + 1;

        if (virCgroupNewIOThread(priv->cgroup, iothreadId, true,
                                 &iothreadGroup) < 0)
            goto cleanup;

        /* Move the iothread into its own sub-cgroup. */
        if (virCgroupAddTask(iothreadGroup, priv->iothreadpids[idx]) < 0)
            goto cleanup;

        if ((period || quota) &&
            qemuSetupCgroupVcpuBW(iothreadGroup, period, quota) < 0)
            goto cleanup;

        /* Apply the iothreadpin setting, if one was provided. */
        if (virCgroupHasController(priv->cgroup,
                                   VIR_CGROUP_CONTROLLER_CPUSET)) {
            /* Only call qemuSetupCgroupIOThreadsPin when a definition
             * for this iothread exists, since it fails when none is
             * found. */
            for (pin = 0; pin < def->cputune.niothreadspin; pin++) {
                /* IOThreads are numbered/named 1..n */
                if (def->cputune.iothreadspin[pin]->vcpuid == iothreadId) {
                    if (qemuSetupCgroupIOThreadsPin(iothreadGroup,
                                                    def->cputune.iothreadspin,
                                                    def->cputune.niothreadspin,
                                                    iothreadId) < 0)
                        goto cleanup;

                    break;
                }
            }
        }

        virCgroupFree(&iothreadGroup);
    }

    return 0;

 cleanup:
    /* Drop the sub-cgroup that was being configured when the failure hit. */
    if (iothreadGroup) {
        virCgroupRemove(iothreadGroup);
        virCgroupFree(&iothreadGroup);
    }

    return -1;
}

1214
int
1215 1216
qemuRemoveCgroup(virQEMUDriverPtr driver,
                 virDomainObjPtr vm)
1217
{
1218
    qemuDomainObjPrivatePtr priv = vm->privateData;
1219
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
1220

1221
    if (priv->cgroup == NULL)
1222 1223
        return 0; /* Not supported, so claim success */

1224 1225 1226 1227 1228 1229 1230
    if (virCgroupTerminateMachine(vm->def->name,
                                  "qemu",
                                  cfg->privileged) < 0) {
        if (!virCgroupNewIgnoreError())
            VIR_DEBUG("Failed to terminate cgroup for %s", vm->def->name);
    }

1231 1232
    virObjectUnref(cfg);

1233
    return virCgroupRemove(priv->cgroup);
1234 1235
}

1236 1237
/**
 * qemuAddToCgroup:
 * @vm: domain object
 *
 * Currently performs no work beyond checking whether the domain has a
 * cgroup.
 *
 * Returns 0 in all cases.
 */
int
qemuAddToCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    if (!priv->cgroup) {
        /* Not supported, so claim success */
        return 0;
    }

    return 0;
}