qemu_cgroup.c 36.2 KB
Newer Older
1 2 3
/*
 * qemu_cgroup.c: QEMU cgroup management
 *
 * Copyright (C) 2006-2014 Red Hat, Inc.
 * Copyright (C) 2006 Daniel P. Berrange
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * Author: Daniel P. Berrange <berrange@redhat.com>
 */

#include <config.h>

#include "qemu_cgroup.h"
27
#include "qemu_domain.h"
28
#include "qemu_process.h"
29
#include "vircgroup.h"
30
#include "virlog.h"
31
#include "viralloc.h"
32
#include "virerror.h"
33
#include "domain_audit.h"
34
#include "virscsi.h"
35
#include "virstring.h"
36
#include "virfile.h"
37
#include "virtypedparam.h"
38 39 40

#define VIR_FROM_THIS VIR_FROM_QEMU

41 42
VIR_LOG_INIT("qemu.qemu_cgroup");

43 44 45 46
static const char *const defaultDeviceACL[] = {
    "/dev/null", "/dev/full", "/dev/zero",
    "/dev/random", "/dev/urandom",
    "/dev/ptmx", "/dev/kvm", "/dev/kqemu",
47
    "/dev/rtc", "/dev/hpet", "/dev/vfio/vfio",
48 49 50 51 52
    NULL,
};
#define DEVICE_PTY_MAJOR 136
#define DEVICE_SND_MAJOR 116

53 54 55 56 57
/*
 * Allow or deny a single storage source in the domain's devices cgroup.
 *
 * @vm: domain object whose private data holds the cgroup
 * @src: storage source to process
 * @deny: true to revoke read/write/mknod access, false to grant access
 * @forceReadonly: when granting, never add write permission
 *
 * Nothing is done (and 0 is returned) when the devices controller is not
 * available, or when @src has no path or is not local storage.  The
 * outcome of the cgroup call is always audited.  A failure whose last
 * error is EACCES is ignored and turned into success.
 *
 * Returns the cgroup call's result, or 0 in the ignored cases above.
 */
static int
qemuSetImageCgroupInternal(virDomainObjPtr vm,
                           virStorageSourcePtr src,
                           bool deny,
                           bool forceReadonly)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int perms = VIR_CGROUP_DEVICE_READ;
    int rc;

    if (!virCgroupHasController(priv->cgroup,
                                VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (!(src->path && virStorageSourceIsLocalStorage(src))) {
        VIR_DEBUG("Not updating cgroups for disk path '%s', type: %s",
                  NULLSTR(src->path), virStorageTypeToString(src->type));
        return 0;
    }

    if (!deny) {
        /* Write access only for sources that are writable and not forced
         * read-only by the caller */
        if (!(src->readonly || forceReadonly))
            perms |= VIR_CGROUP_DEVICE_WRITE;

        VIR_DEBUG("Allow path %s, perms: %s",
                  src->path, virCgroupGetDevicePermsString(perms));

        rc = virCgroupAllowDevicePath(priv->cgroup, src->path, perms);
    } else {
        perms |= VIR_CGROUP_DEVICE_WRITE | VIR_CGROUP_DEVICE_MKNOD;

        VIR_DEBUG("Deny path %s", src->path);

        rc = virCgroupDenyDevicePath(priv->cgroup, src->path, perms);
    }

    virDomainAuditCgroupPath(vm, priv->cgroup,
                             deny ? "deny" : "allow",
                             src->path,
                             virCgroupGetDevicePermsString(perms),
                             rc == 0);

    if (rc >= 0 || !virLastErrorIsSystemErrno(EACCES))
        return rc;

    /* Get this for root squash NFS */
    VIR_DEBUG("Ignoring EACCES for %s", src->path);
    virResetLastError();
    return 0;
}


107 108 109 110 111 112 113 114 115
/*
 * Allow (@deny == false) or deny (@deny == true) @src in the devices
 * cgroup of @vm, without forcing read-only access.
 *
 * Returns the result of qemuSetImageCgroupInternal().
 */
int
qemuSetImageCgroup(virDomainObjPtr vm,
                   virStorageSourcePtr src,
                   bool deny)
{
    const bool forceReadonly = false;

    return qemuSetImageCgroupInternal(vm, src, deny, forceReadonly);
}


116 117 118
/*
 * Grant @vm access to every image in the backing chain of @disk.
 *
 * Only the top level image may receive write permission; every backing
 * image below it is set up with read-only access forced.
 *
 * Returns 0 on success, -1 as soon as one image fails.
 */
int
qemuSetupDiskCgroup(virDomainObjPtr vm,
                    virDomainDiskDefPtr disk)
{
    virStorageSourcePtr img = disk->src;
    bool topLevel = true;

    while (img) {
        if (qemuSetImageCgroupInternal(vm, img, false, !topLevel) < 0)
            return -1;

        /* everything after the first image is a backing file */
        topLevel = false;
        img = img->backingStore;
    }

    return 0;
}


135 136 137
/*
 * Revoke @vm's access to every image in the backing chain of @disk.
 *
 * Returns 0 on success, -1 as soon as one image fails (later images in
 * the chain are then left untouched).
 */
int
qemuTeardownDiskCgroup(virDomainObjPtr vm,
                       virDomainDiskDefPtr disk)
{
    virStorageSourcePtr img = disk->src;

    while (img) {
        if (qemuSetImageCgroup(vm, img, true) < 0)
            return -1;
        img = img->backingStore;
    }

    return 0;
}

149

150
static int
151
qemuSetupChrSourceCgroup(virDomainDefPtr def ATTRIBUTE_UNUSED,
152
                         virDomainChrSourceDefPtr dev,
153
                         void *opaque)
154
{
155 156
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
157
    int ret;
158

159
    if (dev->type != VIR_DOMAIN_CHR_TYPE_DEV)
160 161
        return 0;

162
    VIR_DEBUG("Process path '%s' for device", dev->data.file.path);
163

164 165
    ret = virCgroupAllowDevicePath(priv->cgroup, dev->data.file.path,
                                   VIR_CGROUP_DEVICE_RW);
166
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
167
                             dev->data.file.path, "rw", ret == 0);
168

169
    return ret;
170 171
}

172 173 174 175 176
/*
 * Adapter with a virDomainChrDefPtr signature: hands the source of @dev
 * to qemuSetupChrSourceCgroup() and returns its result.
 */
static int
qemuSetupChardevCgroup(virDomainDefPtr def,
                       virDomainChrDefPtr dev,
                       void *opaque)
{
    virDomainChrSourceDefPtr source = &dev->source;

    return qemuSetupChrSourceCgroup(def, source, opaque);
}


/*
 * Set up cgroup access for a TPM device.
 *
 * For passthrough TPMs the character device source is handed to
 * qemuSetupChrSourceCgroup(); any other type needs nothing.
 *
 * Returns 0 when nothing had to be done, otherwise the helper's result.
 */
static int
qemuSetupTPMCgroup(virDomainDefPtr def,
                   virDomainTPMDefPtr dev,
                   void *opaque)
{
    switch (dev->type) {
    case VIR_DOMAIN_TPM_TYPE_PASSTHROUGH:
        return qemuSetupChrSourceCgroup(def, &dev->data.passthrough.source,
                                        opaque);

    case VIR_DOMAIN_TPM_TYPE_LAST:
        break;
    }

    return 0;
}

200

201
static int
202
qemuSetupHostUSBDeviceCgroup(virUSBDevicePtr dev ATTRIBUTE_UNUSED,
203 204
                             const char *path,
                             void *opaque)
205
{
206 207
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
208
    int ret;
209 210

    VIR_DEBUG("Process path '%s' for USB device", path);
211 212 213
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path, "rw", ret == 0);
214

215
    return ret;
216 217
}

218
static int
219
qemuSetupHostSCSIDeviceCgroup(virSCSIDevicePtr dev ATTRIBUTE_UNUSED,
220 221 222 223 224
                              const char *path,
                              void *opaque)
{
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
225
    int ret;
226 227 228

    VIR_DEBUG("Process path '%s' for SCSI device", path);

229 230 231 232
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   virSCSIDeviceGetReadonly(dev) ?
                                   VIR_CGROUP_DEVICE_READ :
                                   VIR_CGROUP_DEVICE_RW);
233 234

    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path,
235
                             virSCSIDeviceGetReadonly(dev) ? "r" : "rw", ret == 0);
236

237
    return ret;
238
}
239

240 241 242 243 244 245
/*
 * Grant @vm access to the host device nodes needed by hostdev @dev.
 *
 * Called for all hostdev devices.  Work is done only for subsystem mode
 * devices, and only when the devices controller is available:
 *   - PCI with the VFIO backend: the IOMMU group device is allowed rw
 *   - USB (unless flagged missing): each file of the device is allowed
 *   - SCSI (non-iSCSI): each file of the device is allowed
 * Everything else is a no-op.
 *
 * Returns 0 on success or when nothing had to be done, -1 on failure.
 */
int
qemuSetupHostdevCGroup(virDomainObjPtr vm,
                       virDomainHostdevDefPtr dev)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virPCIDevicePtr pci = NULL;
    virUSBDevicePtr usb = NULL;
    virSCSIDevicePtr scsi = NULL;
    char *iommuDev = NULL;
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS)
        return 0;

    switch (dev->source.subsys.type) {
    case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI: {
        virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci;
        int rv;

        if (pcisrc->backend != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO)
            break;

        if (!(pci = virPCIDeviceNew(pcisrc->addr.domain,
                                    pcisrc->addr.bus,
                                    pcisrc->addr.slot,
                                    pcisrc->addr.function)))
            goto cleanup;

        if (!(iommuDev = virPCIDeviceGetIOMMUGroupDev(pci)))
            goto cleanup;

        VIR_DEBUG("Cgroup allow %s for PCI device assignment", iommuDev);
        rv = virCgroupAllowDevicePath(priv->cgroup, iommuDev,
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, priv->cgroup,
                                 "allow", iommuDev, "rw", rv == 0);
        if (rv < 0)
            goto cleanup;
        break;
    }

    case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB: {
        virDomainHostdevSubsysUSBPtr usbsrc = &dev->source.subsys.u.usb;

        /* NB: hostdev->missing wasn't previously checked in the
         * case of hotplug, only when starting a domain. Now it is
         * always checked, and the cgroup setup skipped if true.
         */
        if (dev->missing)
            break;

        if (!(usb = virUSBDeviceNew(usbsrc->bus, usbsrc->device, NULL)))
            goto cleanup;

        /* the callback never looks at the usb object itself, only at
         * the paths the iterator feeds it */
        if (virUSBDeviceFileIterate(usb, qemuSetupHostUSBDeviceCgroup,
                                    vm) < 0)
            goto cleanup;
        break;
    }

    case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_SCSI: {
        virDomainHostdevSubsysSCSIPtr scsisrc = &dev->source.subsys.u.scsi;
        virDomainHostdevSubsysSCSIHostPtr scsihostsrc;

        if (scsisrc->protocol ==
            VIR_DOMAIN_HOSTDEV_SCSI_PROTOCOL_TYPE_ISCSI) {
            /* Same policy as the disk image code: non-local storage
             * gets no cgroup changes */
            VIR_DEBUG("Not updating cgroups for hostdev iSCSI path '%s'",
                      scsisrc->u.iscsi.path);
            break;
        }

        scsihostsrc = &scsisrc->u.host;
        if (!(scsi = virSCSIDeviceNew(NULL,
                                      scsihostsrc->adapter,
                                      scsihostsrc->bus,
                                      scsihostsrc->target,
                                      scsihostsrc->unit,
                                      dev->readonly,
                                      dev->shareable)))
            goto cleanup;

        if (virSCSIDeviceFileIterate(scsi,
                                     qemuSetupHostSCSIDeviceCgroup,
                                     vm) < 0)
            goto cleanup;
        break;
    }

    default:
        break;
    }

    ret = 0;
 cleanup:
    virPCIDeviceFree(pci);
    virUSBDeviceFree(usb);
    virSCSIDeviceFree(scsi);
    VIR_FREE(iommuDev);
    return ret;
}

/*
 * Revoke the cgroup access that was granted for hostdev @dev.
 *
 * Called for all hostdev devices, but only subsystem mode PCI devices
 * using the VFIO backend are acted on: their IOMMU group device is
 * denied read/write/mknod.  Nothing is torn down for USB or any other
 * type.
 *
 * Returns 0 on success or when nothing had to be done, -1 on failure.
 */
int
qemuTeardownHostdevCgroup(virDomainObjPtr vm,
                          virDomainHostdevDefPtr dev)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virPCIDevicePtr pci = NULL;
    char *iommuDev = NULL;
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS)
        return 0;

    switch (dev->source.subsys.type) {
    case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI: {
        virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci;
        int rv;

        if (pcisrc->backend != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO)
            break;

        if (!(pci = virPCIDeviceNew(pcisrc->addr.domain,
                                    pcisrc->addr.bus,
                                    pcisrc->addr.slot,
                                    pcisrc->addr.function)))
            goto cleanup;

        if (!(iommuDev = virPCIDeviceGetIOMMUGroupDev(pci)))
            goto cleanup;

        VIR_DEBUG("Cgroup deny %s for PCI device assignment", iommuDev);
        rv = virCgroupDenyDevicePath(priv->cgroup, iommuDev,
                                     VIR_CGROUP_DEVICE_RWM);
        virDomainAuditCgroupPath(vm, priv->cgroup,
                                 "deny", iommuDev, "rwm", rv == 0);
        if (rv < 0)
            goto cleanup;
        break;
    }

    case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB:
        /* nothing to tear down for USB */
        break;

    default:
        break;
    }

    ret = 0;
 cleanup:
    virPCIDeviceFree(pci);
    VIR_FREE(iommuDev);
    return ret;
}

412 413 414 415
/*
 * Apply the domain's block I/O tuning to its cgroup.
 *
 * Without the blkio controller this succeeds only when no tuning (global
 * weight or per-device entries) was requested; otherwise
 * VIR_ERR_CONFIG_UNSUPPORTED is reported.  With the controller, the
 * global weight and every non-zero per-device weight / read+write IOPS /
 * read+write bytes-per-second value is written.
 *
 * Returns 0 on success, -1 on the first failure.
 */
static int
qemuSetupBlkioCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    size_t idx;

    if (!virCgroupHasController(priv->cgroup,
                                VIR_CGROUP_CONTROLLER_BLKIO)) {
        if (!def->blkio.weight && !def->blkio.ndevices)
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Block I/O tuning is not available on this host"));
        return -1;
    }

    if (def->blkio.weight != 0 &&
        virCgroupSetBlkioWeight(priv->cgroup, def->blkio.weight) < 0)
        return -1;

    /* Zero valued settings mean "not configured" and are skipped */
    for (idx = 0; idx < def->blkio.ndevices; idx++) {
        virBlkioDevicePtr blk = &def->blkio.devices[idx];

        if (blk->weight &&
            virCgroupSetBlkioDeviceWeight(priv->cgroup, blk->path,
                                          blk->weight) < 0)
            return -1;

        if (blk->riops &&
            virCgroupSetBlkioDeviceReadIops(priv->cgroup, blk->path,
                                            blk->riops) < 0)
            return -1;

        if (blk->wiops &&
            virCgroupSetBlkioDeviceWriteIops(priv->cgroup, blk->path,
                                             blk->wiops) < 0)
            return -1;

        if (blk->rbps &&
            virCgroupSetBlkioDeviceReadBps(priv->cgroup, blk->path,
                                           blk->rbps) < 0)
            return -1;

        if (blk->wbps &&
            virCgroupSetBlkioDeviceWriteBps(priv->cgroup, blk->path,
                                            blk->wbps) < 0)
            return -1;
    }

    return 0;
}

466

467 468 469 470 471
/*
 * Apply the domain's memory limits to its cgroup.
 *
 * Without the memory controller this succeeds only when no hard, soft or
 * swap hard limit was requested; otherwise VIR_ERR_CONFIG_UNSUPPORTED is
 * reported.  With the controller every non-zero limit is written, in the
 * order hard, soft, swap hard.
 *
 * Returns 0 on success, -1 on the first failure.
 */
static int
qemuSetupMemoryCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) {
        if (def->mem.hard_limit == 0 &&
            def->mem.soft_limit == 0 &&
            def->mem.swap_hard_limit == 0)
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Memory cgroup is not available on this host"));
        return -1;
    }

    if (def->mem.hard_limit != 0 &&
        virCgroupSetMemoryHardLimit(priv->cgroup, def->mem.hard_limit) < 0)
        return -1;

    if (def->mem.soft_limit != 0 &&
        virCgroupSetMemorySoftLimit(priv->cgroup, def->mem.soft_limit) < 0)
        return -1;

    if (def->mem.swap_hard_limit != 0 &&
        virCgroupSetMemSwapHardLimit(priv->cgroup,
                                     def->mem.swap_hard_limit) < 0)
        return -1;

    return 0;
}


500 501 502 503 504 505 506
/*
 * Build the device whitelist of the domain's devices cgroup.
 *
 * All devices are denied first, then access is granted for: every disk
 * image chain, the pty major, the sound major (depending on graphics
 * configuration and driver settings), the configured or default device
 * ACL, character devices, the TPM, host devices and RNG backends of
 * type "random".
 *
 * If the initial deny-all fails with EPERM the whitelist is abandoned
 * with a warning and 0 is returned.  A missing devices controller is
 * also treated as success.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
qemuSetupDevicesCgroup(virQEMUDriverPtr driver,
                       virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    virQEMUDriverConfigPtr cfg = NULL;
    const char *const *acl = NULL;
    int rv = -1;
    int ret = -1;
    size_t i;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    rv = virCgroupDenyAllDevices(priv->cgroup);
    virDomainAuditCgroup(vm, priv->cgroup, "deny", "all", rv == 0);
    if (rv < 0) {
        if (!virLastErrorIsSystemErrno(EPERM))
            goto cleanup;

        virResetLastError();
        VIR_WARN("Group devices ACL is not accessible, disabling whitelisting");
        return 0;
    }

    for (i = 0; i < def->ndisks; i++) {
        if (qemuSetupDiskCgroup(vm, def->disks[i]) < 0)
            goto cleanup;
    }

    rv = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_PTY_MAJOR,
                                   VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_PTY_MAJOR,
                              "pty", "rw", rv == 0);
    if (rv < 0)
        goto cleanup;

    cfg = virQEMUDriverGetConfig(driver);
    if (cfg->cgroupDeviceACL)
        acl = (const char *const *)cfg->cgroupDeviceACL;
    else
        acl = defaultDeviceACL;

    if (def->nsounds) {
        bool allowSound = false;

        /* Host audio is reachable without graphics if the driver allows
         * it, with VNC as first graphics if the driver allows it, and
         * always with SDL as first graphics */
        if (!def->ngraphics && cfg->nogfxAllowHostAudio) {
            allowSound = true;
        } else if (def->graphics) {
            if (def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_VNC)
                allowSound = cfg->vncAllowHostAudio;
            else if (def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_SDL)
                allowSound = true;
        }

        if (allowSound) {
            rv = virCgroupAllowDeviceMajor(priv->cgroup, 'c',
                                           DEVICE_SND_MAJOR,
                                           VIR_CGROUP_DEVICE_RW);
            virDomainAuditCgroupMajor(vm, priv->cgroup, "allow",
                                      DEVICE_SND_MAJOR,
                                      "sound", "rw", rv == 0);
            if (rv < 0)
                goto cleanup;
        }
    }

    for (; *acl != NULL; acl++) {
        if (!virFileExists(*acl)) {
            VIR_DEBUG("Ignoring non-existent device %s", *acl);
            continue;
        }

        rv = virCgroupAllowDevicePath(priv->cgroup, *acl,
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, priv->cgroup, "allow", *acl, "rw",
                                 rv == 0);
        /* a node that vanished in the meantime is not fatal */
        if (rv < 0 &&
            !virLastErrorIsSystemErrno(ENOENT))
            goto cleanup;
    }

    if (virDomainChrDefForeach(def,
                               true,
                               qemuSetupChardevCgroup,
                               vm) < 0)
        goto cleanup;

    if (def->tpm &&
        qemuSetupTPMCgroup(def, def->tpm, vm) < 0)
        goto cleanup;

    for (i = 0; i < def->nhostdevs; i++) {
        if (qemuSetupHostdevCGroup(vm, def->hostdevs[i]) < 0)
            goto cleanup;
    }

    for (i = 0; i < def->nrngs; i++) {
        const char *rngFile;

        if (def->rngs[i]->backend != VIR_DOMAIN_RNG_BACKEND_RANDOM)
            continue;

        rngFile = def->rngs[i]->source.file;
        VIR_DEBUG("Setting Cgroup ACL for RNG device");
        rv = virCgroupAllowDevicePath(priv->cgroup, rngFile,
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
                                 rngFile, "rw", rv == 0);
        if (rv < 0 &&
            !virLastErrorIsSystemErrno(ENOENT))
            goto cleanup;
    }

    ret = 0;
 cleanup:
    virObjectUnref(cfg);
    return ret;
}


610
static int
611 612
qemuSetupCpusetMems(virDomainObjPtr vm,
                    virBitmapPtr nodemask)
613 614
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
615
    char *mem_mask = NULL;
616 617 618 619 620
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

621 622
    if (virDomainNumatuneMaybeFormatNodeset(vm->def->numatune,
                                            nodemask,
623
                                            &mem_mask, -1) < 0)
624
        goto cleanup;
625

626 627 628
    if (mem_mask &&
        virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0)
        goto cleanup;
629

630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648
    ret = 0;
 cleanup:
    VIR_FREE(mem_mask);
    return ret;
}


/*
 * Write the domain's allowed host CPUs to the cpuset controller.
 *
 * With automatic placement the CPU list is derived from @nodemask via
 * @caps; otherwise the domain's static cpumask is used.  When neither
 * applies, or the cpuset controller is missing, nothing is written.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
qemuSetupCpusetCgroup(virDomainObjPtr vm,
                      virBitmapPtr nodemask,
                      virCapsPtr caps)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    char *cpus = NULL;
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

    if (!vm->def->cpumask &&
        vm->def->placement_mode != VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)
        return 0;

    if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
        virBitmapPtr nodeCpus;

        if (!(nodeCpus = virCapabilitiesGetCpusForNodemask(caps, nodemask)))
            goto cleanup;
        cpus = virBitmapFormat(nodeCpus);
        virBitmapFree(nodeCpus);
    } else {
        cpus = virBitmapFormat(vm->def->cpumask);
    }

    if (!cpus)
        goto cleanup;

    if (virCgroupSetCpusetCpus(priv->cgroup, cpus) < 0)
        goto cleanup;

    ret = 0;
 cleanup:
    VIR_FREE(cpus);
    return ret;
}


676
static int
677 678
qemuSetupCpuCgroup(virQEMUDriverPtr driver,
                   virDomainObjPtr vm)
679 680
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
681 682 683 684
    virObjectEventPtr event = NULL;
    virTypedParameterPtr eventParams = NULL;
    int eventNparams = 0;
    int eventMaxparams = 0;
685 686

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
687
       if (vm->def->cputune.sharesSpecified) {
688 689 690 691 692 693 694 695
           virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                          _("CPU tuning is not available on this host"));
           return -1;
       } else {
           return 0;
       }
    }

696 697 698 699 700 701 702
    if (vm->def->cputune.sharesSpecified) {
        unsigned long long val;
        if (virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares) < 0)
            return -1;

        if (virCgroupGetCpuShares(priv->cgroup, &val) < 0)
            return -1;
703 704 705 706
        if (vm->def->cputune.shares != val) {
            vm->def->cputune.shares = val;
            if (virTypedParamsAddULLong(&eventParams, &eventNparams,
                                        &eventMaxparams,
707
                                        VIR_DOMAIN_TUNABLE_CPU_CPU_SHARES,
708 709 710 711 712 713 714
                                        val) < 0)
                return -1;

            event = virDomainEventTunableNewFromObj(vm, eventParams, eventNparams);
        }

        if (event)
715
            qemuDomainEventQueue(driver, event);
716
    }
717 718 719 720 721

    return 0;
}


722
static int
723
qemuInitCgroup(virQEMUDriverPtr driver,
724
               virDomainObjPtr vm)
725
{
726
    int ret = -1;
727 728 729
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);

730 731 732
    if (!cfg->privileged)
        goto done;

733 734 735
    if (!virCgroupAvailable())
        goto done;

736 737
    virCgroupFree(&priv->cgroup);

738
    if (!vm->def->resource) {
739 740
        virDomainResourceDefPtr res;

741
        if (VIR_ALLOC(res) < 0)
742
            goto cleanup;
743

744
        if (VIR_STRDUP(res->partition, "/machine") < 0) {
745 746 747 748 749
            VIR_FREE(res);
            goto cleanup;
        }

        vm->def->resource = res;
750 751
    }

752 753 754 755 756 757
    if (vm->def->resource->partition[0] != '/') {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Resource partition '%s' must start with '/'"),
                       vm->def->resource->partition);
        goto cleanup;
    }
758 759 760 761 762 763 764 765 766 767 768

    if (virCgroupNewMachine(vm->def->name,
                            "qemu",
                            cfg->privileged,
                            vm->def->uuid,
                            NULL,
                            vm->pid,
                            false,
                            vm->def->resource->partition,
                            cfg->cgroupControllers,
                            &priv->cgroup) < 0) {
769 770
        if (virCgroupNewIgnoreError())
            goto done;
771

772 773
        goto cleanup;
    }
774

775
 done:
776
    ret = 0;
777
 cleanup:
778 779 780
    virObjectUnref(cfg);
    return ret;
}
781

782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798

/*
 * Detect the existing machine cgroup of @vm (looked up by name, pid and
 * resource partition, if any) and store it in the domain's private data.
 *
 * Nothing is done (success) when the driver is unprivileged or cgroups
 * are not available.  Any previously stored cgroup is released first.
 *
 * Returns 0 on success, -1 if detection fails.
 */
int
qemuConnectCgroup(virQEMUDriverPtr driver,
                  virDomainObjPtr vm)
{
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int ret = -1;

    if (cfg->privileged && virCgroupAvailable()) {
        virCgroupFree(&priv->cgroup);

        if (virCgroupNewDetectMachine(vm->def->name,
                                      "qemu",
                                      vm->pid,
                                      vm->def->resource ?
                                      vm->def->resource->partition :
                                      NULL,
                                      cfg->cgroupControllers,
                                      &priv->cgroup) < 0)
            goto cleanup;
    }

    ret = 0;
 cleanup:
    virObjectUnref(cfg);
    return ret;
}

816 817 818 819
/*
 * Create the cgroup for a started domain and configure its controllers.
 *
 * Requires vm->pid to be set.  After qemuInitCgroup(), if the domain
 * ended up without a cgroup there is nothing more to do.  Otherwise the
 * devices, blkio, memory, cpu and cpuset settings are applied in that
 * order, stopping at the first failure.
 *
 * Returns 0 on success, -1 on failure.
 */
int
qemuSetupCgroup(virQEMUDriverPtr driver,
                virDomainObjPtr vm,
                virBitmapPtr nodemask)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virCapsPtr caps = NULL;
    int ret = -1;

    if (!vm->pid) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Cannot setup cgroups until process is started"));
        return -1;
    }

    if (qemuInitCgroup(driver, vm) < 0)
        return -1;

    if (!priv->cgroup)
        return 0;

    if (!(caps = virQEMUDriverGetCapabilities(driver, false)))
        goto cleanup;

    if (qemuSetupDevicesCgroup(driver, vm) < 0 ||
        qemuSetupBlkioCgroup(vm) < 0 ||
        qemuSetupMemoryCgroup(vm) < 0 ||
        qemuSetupCpuCgroup(driver, vm) < 0 ||
        qemuSetupCpusetCgroup(vm, nodemask, caps) < 0)
        goto cleanup;

    ret = 0;
 cleanup:
    virObjectUnref(caps);
    return ret;
}

861 862 863 864 865 866 867
/*
 * Second stage of cgroup setup: applies the cpuset memory node
 * restriction via qemuSetupCpusetMems() and returns its result.
 */
int
qemuSetupCgroupPostInit(virDomainObjPtr vm,
                        virBitmapPtr nodemask)
{
    int rc = qemuSetupCpusetMems(vm, nodemask);

    return rc;
}

868 869 870 871
/*
 * Set CFS bandwidth (period and/or quota) on @cgroup.
 *
 * A zero @period or @quota means "leave unchanged"; if both are zero the
 * call is a no-op.  When a new period was written and writing the quota
 * then fails, the previous period is restored while the original error
 * is kept as the last error.
 *
 * Returns 0 on success, -1 on failure.
 */
int
qemuSetupCgroupVcpuBW(virCgroupPtr cgroup,
                      unsigned long long period,
                      long long quota)
{
    unsigned long long prevPeriod = 0;

    if (period == 0 && quota == 0)
        return 0;

    if (period) {
        /* remember the current period so it can be rolled back */
        if (virCgroupGetCpuCfsPeriod(cgroup, &prevPeriod) < 0)
            return -1;

        if (virCgroupSetCpuCfsPeriod(cgroup, period) < 0)
            return -1;
    }

    if (!quota ||
        virCgroupSetCpuCfsQuota(cgroup, quota) >= 0)
        return 0;

    /* quota could not be set: undo the period change, if any */
    if (period) {
        virErrorPtr origErr = virSaveLastError();

        ignore_value(virCgroupSetCpuCfsPeriod(cgroup, prevPeriod));
        if (origErr) {
            virSetError(origErr);
            virFreeError(origErr);
        }
    }

    return -1;
}

906 907 908 909 910
/*
 * Apply the CPU pinning configured for one vCPU to @cgroup.
 *
 * @cgroup: cgroup to configure
 * @vcpupin: array of pinning definitions
 * @nvcpupin: number of entries in @vcpupin
 * @vcpuid: id of the vCPU to look up
 *
 * The first entry whose vcpuid matches @vcpuid is applied through
 * qemuSetupCgroupEmulatorPin().
 *
 * Returns that call's result, or -1 when no entry matches or @nvcpupin
 * is negative.
 */
int
qemuSetupCgroupVcpuPin(virCgroupPtr cgroup,
                       virDomainVcpuPinDefPtr *vcpupin,
                       int nvcpupin,
                       int vcpuid)
{
    size_t i;

    /* Comparing the size_t index against a negative int would convert
     * the count to a huge unsigned value and walk past the array */
    if (nvcpupin < 0)
        return -1;

    for (i = 0; i < (size_t)nvcpupin; i++) {
        if (vcpuid == vcpupin[i]->vcpuid)
            return qemuSetupCgroupEmulatorPin(cgroup, vcpupin[i]->cpumask);
    }

    return -1;
}

923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939
/*
 * Apply the CPU pinning configured for one IOThread to @cgroup.
 *
 * @cgroup: cgroup to configure
 * @iothreadspin: array of pinning definitions (the vcpuid field carries
 *                the IOThread id)
 * @niothreadspin: number of entries in @iothreadspin
 * @iothreadid: id of the IOThread to look up
 *
 * The first entry whose id matches @iothreadid is applied through
 * qemuSetupCgroupEmulatorPin().
 *
 * Returns that call's result, or -1 when no entry matches or
 * @niothreadspin is negative.
 */
int
qemuSetupCgroupIOThreadsPin(virCgroupPtr cgroup,
                            virDomainVcpuPinDefPtr *iothreadspin,
                            int niothreadspin,
                            int iothreadid)
{
    size_t i;

    /* Comparing the size_t index against a negative int would convert
     * the count to a huge unsigned value and walk past the array */
    if (niothreadspin < 0)
        return -1;

    for (i = 0; i < (size_t)niothreadspin; i++) {
        if (iothreadid == iothreadspin[i]->vcpuid)
            return qemuSetupCgroupEmulatorPin(cgroup,
                                              iothreadspin[i]->cpumask);
    }

    return -1;
}

940 941 942
/*
 * Restrict @cgroup to the host CPUs in @cpumask by writing the formatted
 * bitmap to the cpuset controller.
 *
 * Returns 0 on success, -1 if formatting or writing fails.
 */
int
qemuSetupCgroupEmulatorPin(virCgroupPtr cgroup,
                           virBitmapPtr cpumask)
{
    char *cpuList = virBitmapFormat(cpumask);
    int ret = -1;

    if (cpuList &&
        virCgroupSetCpusetCpus(cgroup, cpuList) >= 0)
        ret = 0;

    VIR_FREE(cpuList);
    return ret;
}

959 960
/*
 * qemuSetupCgroupForVcpu:
 * @vm: domain object
 *
 * For every known vCPU thread, creates a per-vCPU cgroup below the
 * domain's cgroup, moves the vCPU thread into it, applies the
 * configured CPU bandwidth (period/quota) if any, and, when the cpuset
 * controller is available, applies the matching <vcpupin> setting.
 *
 * Returns 0 on success or when there is nothing that can be done
 * (no usable controllers, no cgroup, or no per-vCPU thread IDs),
 * -1 on error.  On error the per-vCPU cgroup being set up at that
 * moment is removed.
 */
int
qemuSetupCgroupForVcpu(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    unsigned long long period = def->cputune.period;
    long long quota = def->cputune.quota;
    bool tune_bw = period || quota;
    virCgroupPtr vcpu_cg = NULL;
    size_t vcpu;
    size_t pin;

    /* Bandwidth tuning was requested but cannot be honoured. */
    if (tune_bw &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    /*
     * Without the CPU controller neither period nor quota is needed,
     * and without the CPUSET controller as well there is nothing left
     * to configure.
     */
    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

    /* CPU pinning can also be done with virProcessSetAffinity, so a
     * missing cgroup is not fatal here.
     */
    if (!priv->cgroup)
        return 0;

    /* If the VCPU<->PID mapping is unknown, or all vcpus run in the
     * same thread, individual vcpus cannot be controlled.
     */
    if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) {
        VIR_WARN("Unable to get vcpus' pids.");
        return 0;
    }

    for (vcpu = 0; vcpu < priv->nvcpupids; vcpu++) {
        if (virCgroupNewVcpu(priv->cgroup, vcpu, true, &vcpu_cg) < 0)
            goto error;

        /* Move the vcpu thread into its own sub-directory. */
        if (virCgroupAddTask(vcpu_cg, priv->vcpupids[vcpu]) < 0)
            goto error;

        if (tune_bw &&
            qemuSetupCgroupVcpuBW(vcpu_cg, period, quota) < 0)
            goto error;

        /* Apply vcpupin in the cgroup if the XML provides one. */
        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
            /* Only call qemuSetupCgroupVcpuPin when a definition for
             * this vcpu exists, since it fails otherwise. */
            for (pin = 0; pin < def->cputune.nvcpupin; pin++) {
                if (def->cputune.vcpupin[pin]->vcpuid == vcpu) {
                    if (qemuSetupCgroupVcpuPin(vcpu_cg,
                                               def->cputune.vcpupin,
                                               def->cputune.nvcpupin,
                                               vcpu) < 0)
                        goto error;

                    break;
                }
            }
        }

        virCgroupFree(&vcpu_cg);
    }

    return 0;

 error:
    if (vcpu_cg) {
        virCgroupRemove(vcpu_cg);
        virCgroupFree(&vcpu_cg);
    }

    return -1;
}

1044 1045 1046 1047
/*
 * qemuSetupCgroupForEmulator:
 * @driver: QEMU driver
 * @vm: domain object
 * @nodemask: NUMA node mask passed to qemuPrepareCpumap() when the
 *            domain uses automatic CPU placement
 *
 * Creates the emulator cgroup below the domain's cgroup, moves the
 * tasks of the domain cgroup into it, and then applies CPU pinning
 * (cpuset controller) and emulator bandwidth settings (cpu controller)
 * where configured and available.
 *
 * Returns 0 on success or when there is nothing to do, -1 on error.
 * On error the emulator cgroup, if it was created, is removed.
 */
int
qemuSetupCgroupForEmulator(virQEMUDriverPtr driver,
                           virDomainObjPtr vm,
                           virBitmapPtr nodemask)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    unsigned long long period = def->cputune.emulator_period;
    long long quota = def->cputune.emulator_quota;
    bool tune_bw = period || quota;
    virCgroupPtr emu_cg = NULL;
    virBitmapPtr auto_map = NULL;   /* owned here, freed on every exit */
    virBitmapPtr pin_mask = NULL;   /* borrowed pointer to the mask in use */

    /* Bandwidth tuning was requested but cannot be honoured. */
    if (tune_bw &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    /*
     * Without the CPU controller neither period nor quota is needed,
     * and without the CPUSET controller as well there is nothing left
     * to configure.
     */
    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

    /* Not supported, so claim success. */
    if (!priv->cgroup)
        return 0;

    if (virCgroupNewEmulator(priv->cgroup, true, &emu_cg) < 0)
        goto error;

    if (virCgroupMoveTask(priv->cgroup, emu_cg) < 0)
        goto error;

    /* Pick the CPU mask: automatic placement first, then an explicit
     * emulatorpin, then the domain-wide cpumask. */
    if (def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
        auto_map = qemuPrepareCpumap(driver, nodemask);
        if (!auto_map)
            goto error;
        pin_mask = auto_map;
    } else if (def->cputune.emulatorpin) {
        pin_mask = def->cputune.emulatorpin->cpumask;
    } else if (def->cpumask) {
        pin_mask = def->cpumask;
    }

    if (pin_mask &&
        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET) &&
        qemuSetupCgroupEmulatorPin(emu_cg, pin_mask) < 0)
        goto error;

    if (tune_bw &&
        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
        qemuSetupCgroupVcpuBW(emu_cg, period, quota) < 0)
        goto error;

    virCgroupFree(&emu_cg);
    virBitmapFree(auto_map);
    return 0;

 error:
    virBitmapFree(auto_map);

    if (emu_cg) {
        virCgroupRemove(emu_cg);
        virCgroupFree(&emu_cg);
    }

    return -1;
}
1119

1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151
/*
 * qemuSetupCgroupForIOThreads:
 * @vm: domain object
 *
 * For every known IOThread, creates a per-IOThread cgroup below the
 * domain's cgroup, moves the IOThread into it, applies the configured
 * CPU bandwidth (period/quota) if any, and, when the cpuset controller
 * is available, applies the matching iothreadpin setting.
 *
 * Returns 0 on success or when there is nothing that can be done
 * (no usable controllers, no cgroup, or no IOThread IDs known),
 * -1 on error.  On error the per-IOThread cgroup being set up at that
 * moment is removed.
 */
int
qemuSetupCgroupForIOThreads(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    unsigned long long period = def->cputune.period;
    long long quota = def->cputune.quota;
    bool tune_bw = period || quota;
    virCgroupPtr iothread_cg = NULL;
    size_t idx;
    size_t pin;

    /* Bandwidth tuning was requested but cannot be honoured. */
    if (tune_bw &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    /*
     * Without the CPU controller neither period nor quota is needed,
     * and without the CPUSET controller as well there is nothing left
     * to configure.
     */
    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

    /* CPU pinning can also be done with virProcessSetAffinity, so a
     * missing cgroup is not fatal here.
     */
    if (!priv->cgroup)
        return 0;

    if (def->iothreads && priv->niothreadpids == 0) {
        VIR_WARN("Unable to get iothreads' pids.");
        return 0;
    }

    for (idx = 0; idx < priv->niothreadpids; idx++) {
        /* IOThreads are numbered 1..n while the pid array is indexed
         * 0..n-1, hence the offset by one.
         */
        if (virCgroupNewIOThread(priv->cgroup, idx + 1, true,
                                 &iothread_cg) < 0)
            goto error;

        /* Move the IOThread into its own sub-directory. */
        if (virCgroupAddTask(iothread_cg, priv->iothreadpids[idx]) < 0)
            goto error;

        if (tune_bw &&
            qemuSetupCgroupVcpuBW(iothread_cg, period, quota) < 0)
            goto error;

        /* Apply iothreadpin in the cgroup if the XML provides one. */
        if (virCgroupHasController(priv->cgroup,
                                   VIR_CGROUP_CONTROLLER_CPUSET)) {
            /* Only call qemuSetupCgroupIOThreadsPin when a definition
             * for this IOThread exists, since it fails otherwise. */
            for (pin = 0; pin < def->cputune.niothreadspin; pin++) {
                /* IOThreads are numbered/named 1..n */
                if (def->cputune.iothreadspin[pin]->vcpuid == idx + 1) {
                    if (qemuSetupCgroupIOThreadsPin(iothread_cg,
                                                    def->cputune.iothreadspin,
                                                    def->cputune.niothreadspin,
                                                    idx + 1) < 0)
                        goto error;

                    break;
                }
            }
        }

        virCgroupFree(&iothread_cg);
    }

    return 0;

 error:
    if (iothread_cg) {
        virCgroupRemove(iothread_cg);
        virCgroupFree(&iothread_cg);
    }

    return -1;
}

1208
int
1209 1210
qemuRemoveCgroup(virQEMUDriverPtr driver,
                 virDomainObjPtr vm)
1211
{
1212
    qemuDomainObjPrivatePtr priv = vm->privateData;
1213
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
1214

1215
    if (priv->cgroup == NULL)
1216 1217
        return 0; /* Not supported, so claim success */

1218 1219 1220 1221 1222 1223 1224
    if (virCgroupTerminateMachine(vm->def->name,
                                  "qemu",
                                  cfg->privileged) < 0) {
        if (!virCgroupNewIgnoreError())
            VIR_DEBUG("Failed to terminate cgroup for %s", vm->def->name);
    }

1225 1226
    virObjectUnref(cfg);

1227
    return virCgroupRemove(priv->cgroup);
1228 1229
}

1230 1231
/*
 * qemuAddToCgroup:
 * @vm: domain object
 *
 * Currently performs no work: it returns 0 both when the domain has
 * no cgroup and when it has one.
 *
 * Returns 0.
 */
int
qemuAddToCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int ret = 0;

    /* Not supported, so claim success. */
    if (!priv->cgroup)
        return ret;

    return ret;
}