/*
 * qemu_cgroup.c: QEMU cgroup management
 *
 * Copyright (C) 2006-2014 Red Hat, Inc.
 * Copyright (C) 2006 Daniel P. Berrange
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * Author: Daniel P. Berrange <berrange@redhat.com>
 */

#include <config.h>

#include "qemu_cgroup.h"
27
#include "qemu_domain.h"
28
#include "qemu_process.h"
29
#include "vircgroup.h"
30
#include "virlog.h"
31
#include "viralloc.h"
32
#include "virerror.h"
33
#include "domain_audit.h"
34
#include "virscsi.h"
35
#include "virstring.h"
36
#include "virfile.h"
37
#include "virtypedparam.h"
38 39 40

#define VIR_FROM_THIS VIR_FROM_QEMU

41 42
VIR_LOG_INIT("qemu.qemu_cgroup");

43 44 45 46
/* Device nodes a domain is allowed to access when the driver
 * configuration does not provide its own cgroup device ACL.
 * The list is NULL terminated. */
static const char *const defaultDeviceACL[] = {
    "/dev/null", "/dev/full", "/dev/zero",
    "/dev/random", "/dev/urandom",
    "/dev/ptmx", "/dev/kvm", "/dev/kqemu",
    "/dev/rtc", "/dev/hpet", "/dev/vfio/vfio",
    NULL,
};
/* Character device major numbers that are allowed as a whole
 * (pty devices and sound devices respectively). */
#define DEVICE_PTY_MAJOR 136
#define DEVICE_SND_MAJOR 116

53 54 55 56 57
/**
 * qemuSetImageCgroupInternal:
 * @vm: domain object
 * @src: storage source whose path is allowed or denied
 * @deny: deny access to @src when true, allow it otherwise
 * @forceReadonly: when allowing, never grant write permission
 *
 * Update the devices cgroup ACL of @vm for the image @src and record
 * the attempt in the audit log.  Nothing is done when the devices
 * controller is not present, when @src has no path, or when @src is
 * not local storage.
 *
 * Returns 0 on success or when there was nothing to do, otherwise the
 * negative result of the cgroup call.  A failure caused by EACCES is
 * ignored and reported as success.
 */
static int
qemuSetImageCgroupInternal(virDomainObjPtr vm,
                           virStorageSourcePtr src,
                           bool deny,
                           bool forceReadonly)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int perms = VIR_CGROUP_DEVICE_READ;
    int ret;

    if (!virCgroupHasController(priv->cgroup,
                                VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (!src->path || !virStorageSourceIsLocalStorage(src)) {
        VIR_DEBUG("Not updating cgroups for disk path '%s', type: %s",
                  NULLSTR(src->path), virStorageTypeToString(src->type));
        return 0;
    }

    if (deny) {
        /* revoke read, write and mknod regardless of what was granted */
        perms |= VIR_CGROUP_DEVICE_WRITE | VIR_CGROUP_DEVICE_MKNOD;

        VIR_DEBUG("Deny path %s", src->path);

        ret = virCgroupDenyDevicePath(priv->cgroup, src->path, perms);
    } else {
        if (!src->readonly && !forceReadonly)
            perms |= VIR_CGROUP_DEVICE_WRITE;

        VIR_DEBUG("Allow path %s, perms: %s",
                  src->path, virCgroupGetDevicePermsString(perms));

        ret = virCgroupAllowDevicePath(priv->cgroup, src->path, perms);
    }

    virDomainAuditCgroupPath(vm, priv->cgroup,
                             deny ? "deny" : "allow",
                             src->path,
                             virCgroupGetDevicePermsString(perms),
                             ret == 0);

    /* Get this for root squash NFS */
    if (ret < 0 &&
        virLastErrorIsSystemErrno(EACCES)) {
        VIR_DEBUG("Ignoring EACCES for %s", src->path);
        virResetLastError();
        ret = 0;
    }

    return ret;
}


107 108 109 110 111 112 113 114 115
/**
 * qemuSetImageCgroup:
 * @vm: domain object
 * @src: storage source to allow or deny
 * @deny: deny access when true, allow it otherwise
 *
 * Public entry point for a single image; write access is never
 * forcibly withheld here.
 *
 * Returns the result of qemuSetImageCgroupInternal().
 */
int
qemuSetImageCgroup(virDomainObjPtr vm,
                   virStorageSourcePtr src,
                   bool deny)
{
    const bool forceReadonly = false;

    return qemuSetImageCgroupInternal(vm, src, deny, forceReadonly);
}


116 117 118
/**
 * qemuSetupDiskCgroup:
 * @vm: domain object
 * @disk: disk whose image chain should be allowed
 *
 * Allow every image in the backing chain of @disk in the devices
 * cgroup.  Only the top level image may get write access; all
 * backing images are set up read-only.
 *
 * Returns 0 on success, -1 as soon as one image fails.
 */
int
qemuSetupDiskCgroup(virDomainObjPtr vm,
                    virDomainDiskDefPtr disk)
{
    virStorageSourcePtr next;
    bool forceReadonly = false;

    for (next = disk->src; next; next = next->backingStore) {
        if (qemuSetImageCgroupInternal(vm, next, false, forceReadonly) < 0)
            return -1;

        /* setup only the top level image for read-write */
        forceReadonly = true;
    }

    return 0;
}


135 136 137
/**
 * qemuTeardownDiskCgroup:
 * @vm: domain object
 * @disk: disk whose image chain should be denied
 *
 * Deny every image in the backing chain of @disk in the devices
 * cgroup.
 *
 * Returns 0 on success, -1 as soon as one image fails (remaining
 * images in the chain are then left untouched).
 */
int
qemuTeardownDiskCgroup(virDomainObjPtr vm,
                       virDomainDiskDefPtr disk)
{
    virStorageSourcePtr next;

    for (next = disk->src; next; next = next->backingStore) {
        if (qemuSetImageCgroup(vm, next, true) < 0)
            return -1;
    }

    return 0;
}

149

150
static int
151
qemuSetupChrSourceCgroup(virDomainDefPtr def ATTRIBUTE_UNUSED,
152
                         virDomainChrSourceDefPtr dev,
153
                         void *opaque)
154
{
155 156
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
157
    int ret;
158

159
    if (dev->type != VIR_DOMAIN_CHR_TYPE_DEV)
160 161
        return 0;

162
    VIR_DEBUG("Process path '%s' for device", dev->data.file.path);
163

164 165
    ret = virCgroupAllowDevicePath(priv->cgroup, dev->data.file.path,
                                   VIR_CGROUP_DEVICE_RW);
166
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
167
                             dev->data.file.path, "rw", ret == 0);
168

169
    return ret;
170 171
}

172 173 174 175 176
/**
 * qemuSetupChardevCgroup:
 * @def: domain definition
 * @dev: character device
 * @opaque: the virDomainObjPtr of the domain
 *
 * Iterator callback that sets up the cgroup for the source of @dev.
 *
 * Returns the result of qemuSetupChrSourceCgroup().
 */
static int
qemuSetupChardevCgroup(virDomainDefPtr def,
                       virDomainChrDefPtr dev,
                       void *opaque)
{
    return qemuSetupChrSourceCgroup(def, &dev->source, opaque);
}


/**
 * qemuSetupTPMCgroup:
 * @def: domain definition
 * @dev: TPM device
 * @opaque: the virDomainObjPtr of the domain
 *
 * Allow the host character device used by a passthrough TPM.
 *
 * Returns 0 when nothing had to be done, otherwise the result of
 * qemuSetupChrSourceCgroup().
 */
static int
qemuSetupTPMCgroup(virDomainDefPtr def,
                   virDomainTPMDefPtr dev,
                   void *opaque)
{
    int ret = 0;

    switch (dev->type) {
    case VIR_DOMAIN_TPM_TYPE_PASSTHROUGH:
        ret = qemuSetupChrSourceCgroup(def, &dev->data.passthrough.source,
                                       opaque);
        break;
    case VIR_DOMAIN_TPM_TYPE_LAST:
        break;
    }

    return ret;
}

200

201
static int
202
qemuSetupHostUSBDeviceCgroup(virUSBDevicePtr dev ATTRIBUTE_UNUSED,
203 204
                             const char *path,
                             void *opaque)
205
{
206 207
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
208
    int ret;
209 210

    VIR_DEBUG("Process path '%s' for USB device", path);
211 212 213
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path, "rw", ret == 0);
214

215
    return ret;
216 217
}

218
static int
219
qemuSetupHostSCSIDeviceCgroup(virSCSIDevicePtr dev ATTRIBUTE_UNUSED,
220 221 222 223 224
                              const char *path,
                              void *opaque)
{
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
225
    int ret;
226 227 228

    VIR_DEBUG("Process path '%s' for SCSI device", path);

229 230 231 232
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   virSCSIDeviceGetReadonly(dev) ?
                                   VIR_CGROUP_DEVICE_READ :
                                   VIR_CGROUP_DEVICE_RW);
233 234

    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path,
235
                             virSCSIDeviceGetReadonly(dev) ? "r" : "rw", ret == 0);
236

237
    return ret;
238
}
239

240 241 242 243 244 245
/**
 * qemuSetupHostdevCGroup:
 * @vm: domain object
 * @dev: host device definition
 *
 * Allow the device nodes needed by @dev in the devices cgroup of @vm.
 * Only subsystem-mode hostdevs are acted upon:
 *  - PCI devices using the VFIO backend (their IOMMU group device),
 *  - USB devices that are not marked missing,
 *  - SCSI devices that do not use the iSCSI protocol.
 * Everything else is silently accepted.
 *
 * Returns 0 on success or when there was nothing to do, -1 on error.
 */
int
qemuSetupHostdevCGroup(virDomainObjPtr vm,
                       virDomainHostdevDefPtr dev)
{
    int ret = -1;
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainHostdevSubsysUSBPtr usbsrc = &dev->source.subsys.u.usb;
    virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci;
    virDomainHostdevSubsysSCSIPtr scsisrc = &dev->source.subsys.u.scsi;
    virPCIDevicePtr pci = NULL;
    virUSBDevicePtr usb = NULL;
    virSCSIDevicePtr scsi = NULL;
    char *path = NULL;

    /* this is called for *all* hostdev devices; the cases that do
     * not need any cgroup change simply fall through to success.
     */

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS) {

        switch (dev->source.subsys.type) {
        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI:
            if (pcisrc->backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
                int rv;

                pci = virPCIDeviceNew(pcisrc->addr.domain,
                                      pcisrc->addr.bus,
                                      pcisrc->addr.slot,
                                      pcisrc->addr.function);
                if (!pci)
                    goto cleanup;

                if (!(path = virPCIDeviceGetIOMMUGroupDev(pci)))
                    goto cleanup;

                VIR_DEBUG("Cgroup allow %s for PCI device assignment", path);
                rv = virCgroupAllowDevicePath(priv->cgroup, path,
                                              VIR_CGROUP_DEVICE_RW);
                virDomainAuditCgroupPath(vm, priv->cgroup,
                                         "allow", path, "rw", rv == 0);
                if (rv < 0)
                    goto cleanup;
            }
            break;

        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB:
            /* NB: hostdev->missing wasn't previously checked in the
             * case of hotplug, only when starting a domain. Now it is
             * always checked, and the cgroup setup skipped if true.
             */
            if (dev->missing)
                break;
            if ((usb = virUSBDeviceNew(usbsrc->bus, usbsrc->device,
                                       NULL)) == NULL) {
                goto cleanup;
            }

            /* oddly, qemuSetupHostUSBDeviceCgroup doesn't ever
             * reference the usb object we just created
             */
            if (virUSBDeviceFileIterate(usb, qemuSetupHostUSBDeviceCgroup,
                                        vm) < 0) {
                goto cleanup;
            }
            break;

        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_SCSI: {
            if (scsisrc->protocol ==
                VIR_DOMAIN_HOSTDEV_SCSI_PROTOCOL_TYPE_ISCSI) {
                virDomainHostdevSubsysSCSIiSCSIPtr iscsisrc = &scsisrc->u.iscsi;
                /* Follow qemuSetupDiskCgroup() and qemuSetImageCgroupInternal()
                 * which does nothing for non local storage
                 */
                VIR_DEBUG("Not updating cgroups for hostdev iSCSI path '%s'",
                          iscsisrc->path);
            } else {
                virDomainHostdevSubsysSCSIHostPtr scsihostsrc =
                    &scsisrc->u.host;
                if ((scsi = virSCSIDeviceNew(NULL,
                                             scsihostsrc->adapter,
                                             scsihostsrc->bus,
                                             scsihostsrc->target,
                                             scsihostsrc->unit,
                                             dev->readonly,
                                             dev->shareable)) == NULL)
                    goto cleanup;

                if (virSCSIDeviceFileIterate(scsi,
                                             qemuSetupHostSCSIDeviceCgroup,
                                             vm) < 0)
                    goto cleanup;
            }
            break;
        }

        default:
            break;
        }
    }

    ret = 0;
 cleanup:
    virPCIDeviceFree(pci);
    virUSBDeviceFree(usb);
    virSCSIDeviceFree(scsi);
    VIR_FREE(path);
    return ret;
}

/**
 * qemuTeardownHostdevCgroup:
 * @vm: domain object
 * @dev: host device definition
 *
 * Revoke the devices cgroup access granted for @dev.  Only
 * subsystem-mode PCI devices using the VFIO backend are acted upon:
 * their IOMMU group device is denied (read, write and mknod).
 *
 * Returns 0 on success or when there was nothing to do, -1 on error.
 */
int
qemuTeardownHostdevCgroup(virDomainObjPtr vm,
                          virDomainHostdevDefPtr dev)
{
    int ret = -1;
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci;
    virPCIDevicePtr pci = NULL;
    char *path = NULL;

    /* currently this only does something for PCI devices using vfio
     * for device assignment, but it is called for *all* hostdev
     * devices.
     */

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS) {

        switch (dev->source.subsys.type) {
        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI:
            if (pcisrc->backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
                int rv;

                pci = virPCIDeviceNew(pcisrc->addr.domain,
                                      pcisrc->addr.bus,
                                      pcisrc->addr.slot,
                                      pcisrc->addr.function);
                if (!pci)
                    goto cleanup;

                if (!(path = virPCIDeviceGetIOMMUGroupDev(pci)))
                    goto cleanup;

                VIR_DEBUG("Cgroup deny %s for PCI device assignment", path);
                rv = virCgroupDenyDevicePath(priv->cgroup, path,
                                             VIR_CGROUP_DEVICE_RWM);
                virDomainAuditCgroupPath(vm, priv->cgroup,
                                         "deny", path, "rwm", rv == 0);
                if (rv < 0)
                    goto cleanup;
            }
            break;
        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB:
            /* nothing to tear down for USB */
            break;
        default:
            break;
        }
    }

    ret = 0;
 cleanup:
    virPCIDeviceFree(pci);
    VIR_FREE(path);
    return ret;
}

412 413 414 415
/**
 * qemuSetupBlkioCgroup:
 * @vm: domain object
 *
 * Apply the block I/O tuning from the domain definition: the global
 * weight and, per configured device, weight, read/write IOPS and
 * read/write bytes per second.  Values equal to zero are skipped.
 *
 * Returns 0 on success, -1 on failure.  When the blkio controller is
 * missing this is an error only if some blkio tuning was requested.
 */
static int
qemuSetupBlkioCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    size_t i;

    if (!virCgroupHasController(priv->cgroup,
                                VIR_CGROUP_CONTROLLER_BLKIO)) {
        if (vm->def->blkio.weight || vm->def->blkio.ndevices) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                           _("Block I/O tuning is not available on this host"));
            return -1;
        } else {
            return 0;
        }
    }

    if (vm->def->blkio.weight != 0 &&
        virCgroupSetBlkioWeight(priv->cgroup, vm->def->blkio.weight) < 0)
        return -1;

    if (vm->def->blkio.ndevices) {
        for (i = 0; i < vm->def->blkio.ndevices; i++) {
            virBlkioDevicePtr dev = &vm->def->blkio.devices[i];
            if (dev->weight &&
                (virCgroupSetBlkioDeviceWeight(priv->cgroup, dev->path,
                                               dev->weight) < 0))
                return -1;

            if (dev->riops &&
                (virCgroupSetBlkioDeviceReadIops(priv->cgroup, dev->path,
                                                 dev->riops) < 0))
                return -1;

            if (dev->wiops &&
                (virCgroupSetBlkioDeviceWriteIops(priv->cgroup, dev->path,
                                                  dev->wiops) < 0))
                return -1;

            if (dev->rbps &&
                (virCgroupSetBlkioDeviceReadBps(priv->cgroup, dev->path,
                                                dev->rbps) < 0))
                return -1;

            if (dev->wbps &&
                (virCgroupSetBlkioDeviceWriteBps(priv->cgroup, dev->path,
                                                 dev->wbps) < 0))
                return -1;
        }
    }

    return 0;
}

466

467 468 469 470 471
/**
 * qemuSetupMemoryCgroup:
 * @vm: domain object
 *
 * Apply the hard, soft and swap hard memory limits from the domain
 * definition.  Limits equal to zero are skipped.
 *
 * Returns 0 on success, -1 on failure.  When the memory controller is
 * missing this is an error only if some limit was requested.
 */
static int
qemuSetupMemoryCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) {
        if (vm->def->mem.hard_limit != 0 ||
            vm->def->mem.soft_limit != 0 ||
            vm->def->mem.swap_hard_limit != 0) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                           _("Memory cgroup is not available on this host"));
            return -1;
        } else {
            return 0;
        }
    }

    if (vm->def->mem.hard_limit != 0 &&
        virCgroupSetMemoryHardLimit(priv->cgroup, vm->def->mem.hard_limit) < 0)
        return -1;

    if (vm->def->mem.soft_limit != 0 &&
        virCgroupSetMemorySoftLimit(priv->cgroup, vm->def->mem.soft_limit) < 0)
        return -1;

    if (vm->def->mem.swap_hard_limit != 0 &&
        virCgroupSetMemSwapHardLimit(priv->cgroup, vm->def->mem.swap_hard_limit) < 0)
        return -1;

    return 0;
}


500 501 502 503 504 505 506
/**
 * qemuSetupDevicesCgroup:
 * @driver: QEMU driver (source of the configured device ACL)
 * @vm: domain object
 *
 * Build the device whitelist of the domain: deny everything first,
 * then allow, in order, all disks, the pty major, the sound major
 * (only when host audio is permitted for the graphics setup), the
 * configured or default device ACL, host character devices, the TPM,
 * host devices and RNG devices with the "random" backend.
 *
 * Returns 0 on success, when the devices controller is missing, or
 * when the initial deny-all fails with EPERM (whitelisting is then
 * disabled); -1 on any other failure.
 */
static int
qemuSetupDevicesCgroup(virQEMUDriverPtr driver,
                       virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = NULL;
    const char *const *deviceACL = NULL;
    int rv = -1;
    int ret = -1;
    size_t i;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    rv = virCgroupDenyAllDevices(priv->cgroup);
    virDomainAuditCgroup(vm, priv->cgroup, "deny", "all", rv == 0);
    if (rv < 0) {
        if (virLastErrorIsSystemErrno(EPERM)) {
            virResetLastError();
            VIR_WARN("Group devices ACL is not accessible, disabling whitelisting");
            return 0;
        }

        goto cleanup;
    }

    for (i = 0; i < vm->def->ndisks; i++) {
        if (qemuSetupDiskCgroup(vm, vm->def->disks[i]) < 0)
            goto cleanup;
    }

    rv = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_PTY_MAJOR,
                                   VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_PTY_MAJOR,
                              "pty", "rw", rv == 0);
    if (rv < 0)
        goto cleanup;

    cfg = virQEMUDriverGetConfig(driver);
    deviceACL = cfg->cgroupDeviceACL ?
                (const char *const *)cfg->cgroupDeviceACL :
                defaultDeviceACL;

    /* sound devices are allowed only if the domain has a sound card
     * and the first graphics device (or the lack of any) permits
     * host audio */
    if (vm->def->nsounds &&
        ((!vm->def->ngraphics && cfg->nogfxAllowHostAudio) ||
         (vm->def->graphics &&
          ((vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_VNC &&
           cfg->vncAllowHostAudio) ||
           (vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_SDL))))) {
        rv = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_SND_MAJOR,
                                       VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_SND_MAJOR,
                                  "sound", "rw", rv == 0);
        if (rv < 0)
            goto cleanup;
    }

    for (i = 0; deviceACL[i] != NULL; i++) {
        if (!virFileExists(deviceACL[i])) {
            VIR_DEBUG("Ignoring non-existent device %s", deviceACL[i]);
            continue;
        }

        rv = virCgroupAllowDevicePath(priv->cgroup, deviceACL[i],
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, priv->cgroup, "allow", deviceACL[i], "rw", rv == 0);
        /* a device vanishing between the check and the call is not fatal */
        if (rv < 0 &&
            !virLastErrorIsSystemErrno(ENOENT))
            goto cleanup;
    }

    if (virDomainChrDefForeach(vm->def,
                               true,
                               qemuSetupChardevCgroup,
                               vm) < 0)
        goto cleanup;

    if (vm->def->tpm &&
        (qemuSetupTPMCgroup(vm->def,
                            vm->def->tpm,
                            vm) < 0))
        goto cleanup;

    for (i = 0; i < vm->def->nhostdevs; i++) {
        if (qemuSetupHostdevCGroup(vm, vm->def->hostdevs[i]) < 0)
            goto cleanup;
    }

    for (i = 0; i < vm->def->nrngs; i++) {
        if (vm->def->rngs[i]->backend == VIR_DOMAIN_RNG_BACKEND_RANDOM) {
            VIR_DEBUG("Setting Cgroup ACL for RNG device");
            rv = virCgroupAllowDevicePath(priv->cgroup,
                                          vm->def->rngs[i]->source.file,
                                          VIR_CGROUP_DEVICE_RW);
            virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
                                     vm->def->rngs[i]->source.file,
                                     "rw", rv == 0);
            if (rv < 0 &&
                !virLastErrorIsSystemErrno(ENOENT))
                goto cleanup;
        }
    }

    ret = 0;
 cleanup:
    virObjectUnref(cfg);
    return ret;
}


610
static int
611 612
qemuSetupCpusetMems(virDomainObjPtr vm,
                    virBitmapPtr nodemask)
613 614
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
615
    char *mem_mask = NULL;
616 617 618 619 620
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

621 622
    if (virDomainNumatuneMaybeFormatNodeset(vm->def->numatune,
                                            nodemask,
623
                                            &mem_mask, -1) < 0)
624
        goto cleanup;
625

626 627 628
    if (mem_mask &&
        virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0)
        goto cleanup;
629

630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648
    ret = 0;
 cleanup:
    VIR_FREE(mem_mask);
    return ret;
}


/**
 * qemuSetupCpusetCgroup:
 * @vm: domain object
 * @nodemask: NUMA node bitmap used with automatic placement
 * @caps: host capabilities used to map @nodemask to CPUs
 *
 * Restrict the CPUs of the domain cgroup.  With automatic placement
 * the CPU set is derived from @nodemask; otherwise the domain's
 * cpumask is used.  Nothing is written when neither applies.
 *
 * Returns 0 on success or when the cpuset controller is missing,
 * -1 on failure.
 */
static int
qemuSetupCpusetCgroup(virDomainObjPtr vm,
                      virBitmapPtr nodemask,
                      virCapsPtr caps)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    char *cpu_mask = NULL;
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

    if (vm->def->cpumask ||
        (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)) {

        if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
            virBitmapPtr cpumap;
            if (!(cpumap = virCapabilitiesGetCpusForNodemask(caps, nodemask)))
                goto cleanup;
            cpu_mask = virBitmapFormat(cpumap);
            virBitmapFree(cpumap);
        } else {
            cpu_mask = virBitmapFormat(vm->def->cpumask);
        }

        if (!cpu_mask)
            goto cleanup;

        if (virCgroupSetCpusetCpus(priv->cgroup, cpu_mask) < 0)
            goto cleanup;
    }

    ret = 0;
 cleanup:
    VIR_FREE(cpu_mask);
    return ret;
}


676 677 678 679
/**
 * qemuSetupCpuCgroup:
 * @vm: domain object
 *
 * Apply the configured CPU shares.  The value is read back after
 * being set; if it differs from the request, the domain definition
 * is updated and a tunable event carrying the new value is queued.
 *
 * Returns 0 on success, -1 on failure.  When the cpu controller is
 * missing this is an error only if shares were specified.
 */
static int
qemuSetupCpuCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virObjectEventPtr event = NULL;
    virTypedParameterPtr eventParams = NULL;
    int eventNparams = 0;
    int eventMaxparams = 0;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        if (vm->def->cputune.sharesSpecified) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                           _("CPU tuning is not available on this host"));
            return -1;
        } else {
            return 0;
        }
    }

    if (vm->def->cputune.sharesSpecified) {
        unsigned long long val;
        if (virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares) < 0)
            return -1;

        if (virCgroupGetCpuShares(priv->cgroup, &val) < 0)
            return -1;
        if (vm->def->cputune.shares != val) {
            vm->def->cputune.shares = val;
            if (virTypedParamsAddULLong(&eventParams, &eventNparams,
                                        &eventMaxparams,
                                        VIR_DOMAIN_TUNABLE_CPU_CPU_SHARES,
                                        val) < 0)
                return -1;

            event = virDomainEventTunableNewFromObj(vm, eventParams, eventNparams);
        }

        /* NOTE(review): the first argument is the domain's private
         * data rather than a driver object -- confirm this matches
         * what qemuDomainEventQueue() expects. */
        if (event)
            qemuDomainEventQueue(vm->privateData, event);
    }

    return 0;
}


721
static int
722
qemuInitCgroup(virQEMUDriverPtr driver,
723
               virDomainObjPtr vm)
724
{
725
    int ret = -1;
726 727 728
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);

729 730 731
    if (!cfg->privileged)
        goto done;

732 733 734
    if (!virCgroupAvailable())
        goto done;

735 736
    virCgroupFree(&priv->cgroup);

737
    if (!vm->def->resource) {
738 739
        virDomainResourceDefPtr res;

740
        if (VIR_ALLOC(res) < 0)
741
            goto cleanup;
742

743
        if (VIR_STRDUP(res->partition, "/machine") < 0) {
744 745 746 747 748
            VIR_FREE(res);
            goto cleanup;
        }

        vm->def->resource = res;
749 750
    }

751 752 753 754 755 756
    if (vm->def->resource->partition[0] != '/') {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Resource partition '%s' must start with '/'"),
                       vm->def->resource->partition);
        goto cleanup;
    }
757 758 759 760 761 762 763 764 765 766 767

    if (virCgroupNewMachine(vm->def->name,
                            "qemu",
                            cfg->privileged,
                            vm->def->uuid,
                            NULL,
                            vm->pid,
                            false,
                            vm->def->resource->partition,
                            cfg->cgroupControllers,
                            &priv->cgroup) < 0) {
768 769
        if (virCgroupNewIgnoreError())
            goto done;
770

771 772
        goto cleanup;
    }
773

774
 done:
775
    ret = 0;
776
 cleanup:
777 778 779
    virObjectUnref(cfg);
    return ret;
}
780

781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797

/**
 * qemuConnectCgroup:
 * @driver: QEMU driver
 * @vm: domain object
 *
 * Look up the existing machine cgroup of @vm via
 * virCgroupNewDetectMachine() and store it in the private data,
 * replacing any previously stored cgroup.
 *
 * Returns 0 on success, and also when the driver is unprivileged or
 * cgroups are unavailable; -1 on failure.
 */
int
qemuConnectCgroup(virQEMUDriverPtr driver,
                  virDomainObjPtr vm)
{
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int ret = -1;

    if (!cfg->privileged)
        goto done;

    if (!virCgroupAvailable())
        goto done;

    virCgroupFree(&priv->cgroup);

    if (virCgroupNewDetectMachine(vm->def->name,
                                  "qemu",
                                  vm->pid,
                                  vm->def->resource ?
                                  vm->def->resource->partition :
                                  NULL,
                                  cfg->cgroupControllers,
                                  &priv->cgroup) < 0)
        goto cleanup;

 done:
    ret = 0;
 cleanup:
    virObjectUnref(cfg);
    return ret;
}

815 816 817 818
/**
 * qemuSetupCgroup:
 * @driver: QEMU driver
 * @vm: domain object; its process must already be running
 * @nodemask: NUMA node bitmap used for cpuset placement
 *
 * Create the domain cgroup and configure the devices, blkio, memory,
 * cpu and cpuset controllers, in that order.
 *
 * Returns 0 on success or when no cgroup was created for the domain,
 * -1 on failure (including when the domain has no pid yet).
 */
int
qemuSetupCgroup(virQEMUDriverPtr driver,
                virDomainObjPtr vm,
                virBitmapPtr nodemask)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virCapsPtr caps = NULL;
    int ret = -1;

    if (!vm->pid) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Cannot setup cgroups until process is started"));
        return -1;
    }

    if (qemuInitCgroup(driver, vm) < 0)
        return -1;

    /* qemuInitCgroup() may succeed without creating a cgroup */
    if (!priv->cgroup)
        return 0;

    if (!(caps = virQEMUDriverGetCapabilities(driver, false)))
        goto cleanup;

    if (qemuSetupDevicesCgroup(driver, vm) < 0)
        goto cleanup;

    if (qemuSetupBlkioCgroup(vm) < 0)
        goto cleanup;

    if (qemuSetupMemoryCgroup(vm) < 0)
        goto cleanup;

    if (qemuSetupCpuCgroup(vm) < 0)
        goto cleanup;

    if (qemuSetupCpusetCgroup(vm, nodemask, caps) < 0)
        goto cleanup;

    ret = 0;
 cleanup:
    virObjectUnref(caps);
    return ret;
}

860 861 862 863 864 865 866
/**
 * qemuSetupCgroupPostInit:
 * @vm: domain object
 * @nodemask: NUMA node bitmap
 *
 * Second setup stage: applies the cpuset memory node restriction.
 *
 * Returns the result of qemuSetupCpusetMems().
 */
int
qemuSetupCgroupPostInit(virDomainObjPtr vm,
                        virBitmapPtr nodemask)
{
    int rc = qemuSetupCpusetMems(vm, nodemask);

    return rc;
}

867 868 869 870
/**
 * qemuSetupCgroupVcpuBW:
 * @cgroup: cgroup to configure
 * @period: CFS period to set, 0 to leave unchanged
 * @quota: CFS quota to set, 0 to leave unchanged
 *
 * Set the CFS bandwidth parameters of @cgroup.  If the period was
 * changed and setting the quota then fails, the previous period is
 * restored and the original error is kept as the last error.
 *
 * Returns 0 on success or when both values are 0, -1 on failure.
 */
int
qemuSetupCgroupVcpuBW(virCgroupPtr cgroup,
                      unsigned long long period,
                      long long quota)
{
    /* only meaningful when @period is non-zero; initialised so it
     * never holds an indeterminate value */
    unsigned long long old_period = 0;

    if (period == 0 && quota == 0)
        return 0;

    if (period) {
        /* get old period, and we can rollback if set quota failed */
        if (virCgroupGetCpuCfsPeriod(cgroup, &old_period) < 0)
            return -1;

        if (virCgroupSetCpuCfsPeriod(cgroup, period) < 0)
            return -1;
    }

    if (quota &&
        virCgroupSetCpuCfsQuota(cgroup, quota) < 0)
        goto error;

    return 0;

 error:
    if (period) {
        virErrorPtr saved = virSaveLastError();
        ignore_value(virCgroupSetCpuCfsPeriod(cgroup, old_period));
        if (saved) {
            virSetError(saved);
            virFreeError(saved);
        }
    }

    return -1;
}

905 906 907 908 909
/**
 * qemuSetupCgroupVcpuPin:
 * @cgroup: cgroup of the vcpu
 * @vcpupin: array of pinning definitions
 * @nvcpupin: number of entries in @vcpupin
 * @vcpuid: id of the vcpu to pin
 *
 * Find the pinning definition whose vcpuid equals @vcpuid and apply
 * its CPU mask to @cgroup.
 *
 * Returns the result of qemuSetupCgroupEmulatorPin() for the matching
 * entry, or -1 when no entry matches (or @nvcpupin is not positive).
 */
int
qemuSetupCgroupVcpuPin(virCgroupPtr cgroup,
                       virDomainVcpuPinDefPtr *vcpupin,
                       int nvcpupin,
                       int vcpuid)
{
    size_t i;

    /* Reject a non-positive count before it is converted to size_t;
     * a negative value would otherwise become a huge loop bound. */
    if (nvcpupin <= 0)
        return -1;

    for (i = 0; i < (size_t)nvcpupin; i++) {
        if (vcpuid == vcpupin[i]->vcpuid) {
            return qemuSetupCgroupEmulatorPin(cgroup, vcpupin[i]->cpumask);
        }
    }

    return -1;
}

922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938
/**
 * qemuSetupCgroupIOThreadsPin:
 * @cgroup: cgroup of the I/O thread
 * @iothreadspin: array of pinning definitions
 * @niothreadspin: number of entries in @iothreadspin
 * @iothreadid: id of the I/O thread to pin
 *
 * Find the pinning definition whose vcpuid field equals @iothreadid
 * and apply its CPU mask to @cgroup.
 *
 * Returns the result of qemuSetupCgroupEmulatorPin() for the matching
 * entry, or -1 when no entry matches (or @niothreadspin is not
 * positive).
 */
int
qemuSetupCgroupIOThreadsPin(virCgroupPtr cgroup,
                            virDomainVcpuPinDefPtr *iothreadspin,
                            int niothreadspin,
                            int iothreadid)
{
    size_t i;

    /* Reject a non-positive count before it is converted to size_t;
     * a negative value would otherwise become a huge loop bound. */
    if (niothreadspin <= 0)
        return -1;

    for (i = 0; i < (size_t)niothreadspin; i++) {
        if (iothreadid == iothreadspin[i]->vcpuid) {
            return qemuSetupCgroupEmulatorPin(cgroup, iothreadspin[i]->cpumask);
        }
    }

    return -1;
}

939 940 941
/**
 * qemuSetupCgroupEmulatorPin:
 * @cgroup: cgroup to restrict
 * @cpumask: bitmap of CPUs to allow
 *
 * Format @cpumask as a string and write it as the cpuset CPU list of
 * @cgroup.
 *
 * Returns 0 on success, -1 on failure.
 */
int
qemuSetupCgroupEmulatorPin(virCgroupPtr cgroup,
                           virBitmapPtr cpumask)
{
    int ret = -1;
    char *new_cpus = NULL;

    if (!(new_cpus = virBitmapFormat(cpumask)))
        goto cleanup;

    if (virCgroupSetCpusetCpus(cgroup, new_cpus) < 0)
        goto cleanup;

    ret = 0;
 cleanup:
    VIR_FREE(new_cpus);
    return ret;
}

958 959
/**
 * qemuSetupCgroupForVcpu:
 * @vm: domain object
 *
 * For every known vCPU thread, obtains a per-vCPU cgroup below the
 * domain cgroup (virCgroupNewVcpu), adds the vCPU thread to it
 * (virCgroupAddTask), applies the domain's cputune period/quota when
 * either is set, and applies the matching <vcpupin> cpumask when the
 * cpuset controller is available.
 *
 * Returns 0 on success or when there is nothing to do (no usable
 * controllers, no domain cgroup, or no per-vCPU thread IDs), -1 on
 * failure.  On failure the per-vCPU cgroup being processed at that
 * moment is removed and freed.
 */
int
qemuSetupCgroupForVcpu(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    unsigned long long period = vm->def->cputune.period;
    long long quota = vm->def->cputune.quota;
    virCgroupPtr vcpu_cgroup = NULL;
    size_t vcpu;
    size_t pin;

    /* Bandwidth tuning was requested but cannot be honoured without
     * the cpu controller. */
    if (period || quota) {
        if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                           _("cgroup cpu is required for scheduler tuning"));
            return -1;
        }
    }

    /*
     * Without the CPU controller neither period nor quota is needed,
     * and without the CPUSET controller as well there is nothing left
     * to configure.
     */
    if (!(virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) ||
          virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)))
        return 0;

    /* CPU pinning can also be done with virProcessSetAffinity, so the
     * lack of cgroups is not fatal here. */
    if (!priv->cgroup)
        return 0;

    /* Either the VCPU<->PID mapping is unknown or all vcpus run in the
     * same thread as the main process; individual vcpus cannot be
     * controlled then. */
    if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) {
        VIR_WARN("Unable to get vcpus' pids.");
        return 0;
    }

    for (vcpu = 0; vcpu < priv->nvcpupids; vcpu++) {
        /* Get the vcpu's sub-cgroup and move its thread there. */
        if (virCgroupNewVcpu(priv->cgroup, vcpu, true, &vcpu_cgroup) < 0 ||
            virCgroupAddTask(vcpu_cgroup, priv->vcpupids[vcpu]) < 0)
            goto cleanup;

        if ((period || quota) &&
            qemuSetupCgroupVcpuBW(vcpu_cgroup, period, quota) < 0)
            goto cleanup;

        /* Apply vcpupin only if the XML provides one for this vcpu. */
        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
            /* Check that a pinning entry for this vcpu exists first;
             * qemuSetupCgroupVcpuPin fails when there is none. */
            for (pin = 0; pin < def->cputune.nvcpupin; pin++) {
                if (def->cputune.vcpupin[pin]->vcpuid == vcpu) {
                    if (qemuSetupCgroupVcpuPin(vcpu_cgroup,
                                               def->cputune.vcpupin,
                                               def->cputune.nvcpupin,
                                               vcpu) < 0)
                        goto cleanup;

                    break;
                }
            }
        }

        virCgroupFree(&vcpu_cgroup);
    }

    return 0;

 cleanup:
    if (vcpu_cgroup) {
        virCgroupRemove(vcpu_cgroup);
        virCgroupFree(&vcpu_cgroup);
    }

    return -1;
}

1043 1044 1045 1046
/**
 * qemuSetupCgroupForEmulator:
 * @driver: QEMU driver, passed to qemuPrepareCpumap()
 * @vm: domain object
 * @nodemask: node mask passed to qemuPrepareCpumap() for automatic
 *            placement
 *
 * Obtains the emulator cgroup below the domain cgroup
 * (virCgroupNewEmulator), moves the tasks of the domain cgroup into it
 * (virCgroupMoveTask), then applies CPU pinning and the emulator
 * period/quota.
 *
 * The cpumask used for pinning is, in order of preference: the map
 * computed by qemuPrepareCpumap() when placement mode is "auto", the
 * <emulatorpin> mask, or the domain-wide cpumask.
 *
 * Returns 0 on success or when there is nothing to do, -1 on failure
 * (in which case the emulator cgroup is removed and freed).
 */
int
qemuSetupCgroupForEmulator(virQEMUDriverPtr driver,
                           virDomainObjPtr vm,
                           virBitmapPtr nodemask)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    unsigned long long period = vm->def->cputune.emulator_period;
    long long quota = vm->def->cputune.emulator_quota;
    virCgroupPtr emu_cgroup = NULL;
    virBitmapPtr auto_map = NULL;  /* allocated here, freed on every exit */
    virBitmapPtr pin_mask = NULL;  /* alias of auto_map or of a def mask */

    /* Bandwidth tuning was requested but cannot be honoured without
     * the cpu controller. */
    if (period || quota) {
        if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                           _("cgroup cpu is required for scheduler tuning"));
            return -1;
        }
    }

    /*
     * Without the CPU controller neither period nor quota is needed,
     * and without the CPUSET controller as well there is nothing left
     * to configure.
     */
    if (!(virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) ||
          virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)))
        return 0;

    if (!priv->cgroup)
        return 0; /* Not supported, so claim success */

    if (virCgroupNewEmulator(priv->cgroup, true, &emu_cgroup) < 0 ||
        virCgroupMoveTask(priv->cgroup, emu_cgroup) < 0)
        goto cleanup;

    /* Choose the mask to pin the emulator to. */
    if (def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
        auto_map = qemuPrepareCpumap(driver, nodemask);
        if (!auto_map)
            goto cleanup;
        pin_mask = auto_map;
    } else if (def->cputune.emulatorpin) {
        pin_mask = def->cputune.emulatorpin->cpumask;
    } else if (def->cpumask) {
        pin_mask = def->cpumask;
    }

    if (pin_mask &&
        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET) &&
        qemuSetupCgroupEmulatorPin(emu_cgroup, pin_mask) < 0)
        goto cleanup;

    if ((period || quota) &&
        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
        qemuSetupCgroupVcpuBW(emu_cgroup, period, quota) < 0)
        goto cleanup;

    virCgroupFree(&emu_cgroup);
    virBitmapFree(auto_map);
    return 0;

 cleanup:
    virBitmapFree(auto_map);

    if (emu_cgroup) {
        virCgroupRemove(emu_cgroup);
        virCgroupFree(&emu_cgroup);
    }

    return -1;
}
1118

1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150
/**
 * qemuSetupCgroupForIOThreads:
 * @vm: domain object
 *
 * For every known IOThread, obtains a per-IOThread cgroup below the
 * domain cgroup (virCgroupNewIOThread), adds the IOThread to it
 * (virCgroupAddTask), applies the domain's cputune period/quota when
 * either is set, and applies the matching iothreadspin cpumask when the
 * cpuset controller is available.
 *
 * IOThreads are numbered 1..n while the thread ID array is indexed
 * 0..n-1, hence the "+ 1" conversions below.
 *
 * Returns 0 on success or when there is nothing to do, -1 on failure.
 * On failure the per-IOThread cgroup being processed at that moment is
 * removed and freed.
 */
int
qemuSetupCgroupForIOThreads(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    unsigned long long period = vm->def->cputune.period;
    long long quota = vm->def->cputune.quota;
    virCgroupPtr iothread_cgroup = NULL;
    size_t idx;
    size_t pin;

    /* Bandwidth tuning was requested but cannot be honoured without
     * the cpu controller. */
    if (period || quota) {
        if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                           _("cgroup cpu is required for scheduler tuning"));
            return -1;
        }
    }

    /*
     * Without the CPU controller neither period nor quota is needed,
     * and without the CPUSET controller as well there is nothing left
     * to configure.
     */
    if (!(virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) ||
          virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)))
        return 0;

    /* CPU pinning can also be done with virProcessSetAffinity, so the
     * lack of cgroups is not fatal here. */
    if (!priv->cgroup)
        return 0;

    /* IOThreads are configured but none of their thread IDs is known. */
    if (def->iothreads && priv->niothreadpids == 0) {
        VIR_WARN("Unable to get iothreads' pids.");
        return 0;
    }

    for (idx = 0; idx < priv->niothreadpids; idx++) {
        /* Array index idx corresponds to IOThread number idx + 1. */
        if (virCgroupNewIOThread(priv->cgroup, idx + 1, true,
                                 &iothread_cgroup) < 0 ||
            virCgroupAddTask(iothread_cgroup, priv->iothreadpids[idx]) < 0)
            goto cleanup;

        if ((period || quota) &&
            qemuSetupCgroupVcpuBW(iothread_cgroup, period, quota) < 0)
            goto cleanup;

        /* Apply iothreadpin only if the XML provides one for this thread. */
        if (virCgroupHasController(priv->cgroup,
                                   VIR_CGROUP_CONTROLLER_CPUSET)) {
            /* Check that a pinning entry for this IOThread exists first;
             * qemuSetupCgroupIOThreadsPin fails when there is none. */
            for (pin = 0; pin < def->cputune.niothreadspin; pin++) {
                /* IOThreads are numbered/named 1..n */
                if (def->cputune.iothreadspin[pin]->vcpuid == idx + 1) {
                    if (qemuSetupCgroupIOThreadsPin(iothread_cgroup,
                                                    def->cputune.iothreadspin,
                                                    def->cputune.niothreadspin,
                                                    idx + 1) < 0)
                        goto cleanup;

                    break;
                }
            }
        }

        virCgroupFree(&iothread_cgroup);
    }

    return 0;

 cleanup:
    if (iothread_cgroup) {
        virCgroupRemove(iothread_cgroup);
        virCgroupFree(&iothread_cgroup);
    }

    return -1;
}

1207 1208
/**
 * qemuRemoveCgroup:
 * @vm: domain object
 *
 * Removes the domain's cgroup with virCgroupRemove().
 *
 * Returns the result of virCgroupRemove(), or 0 when the domain has no
 * cgroup (cgroups not supported, so success is claimed).
 */
int
qemuRemoveCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    return priv->cgroup ? virCgroupRemove(priv->cgroup) : 0;
}

1218 1219
/**
 * qemuAddToCgroup:
 * @vm: domain object
 *
 * Reads the domain's private cgroup pointer and reports success; no
 * task is added to any cgroup by this function as written.
 *
 * Returns 0 in all cases.
 */
int
qemuAddToCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    if (!priv->cgroup) {
        /* Not supported, so claim success */
        return 0;
    }

    /* A domain that does have a cgroup needs no action here either. */
    return 0;
}