/*
 * qemu_cgroup.c: QEMU cgroup management
 *
 * Copyright (C) 2006-2014 Red Hat, Inc.
 * Copyright (C) 2006 Daniel P. Berrange
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * Author: Daniel P. Berrange <berrange@redhat.com>
 */

#include <config.h>

#include "qemu_cgroup.h"
27
#include "qemu_domain.h"
28
#include "qemu_process.h"
29
#include "vircgroup.h"
30
#include "virlog.h"
31
#include "viralloc.h"
32
#include "virerror.h"
33
#include "domain_audit.h"
34
#include "virscsi.h"
35
#include "virstring.h"
36
#include "virfile.h"
37
#include "virtypedparam.h"
38 39 40

#define VIR_FROM_THIS VIR_FROM_QEMU

41 42
VIR_LOG_INIT("qemu.qemu_cgroup");

43 44 45 46
/* NULL-terminated list of device nodes that qemuSetupDevicesCgroup()
 * allows read-write when the driver configuration does not provide
 * its own cgroupDeviceACL. */
static const char *const defaultDeviceACL[] = {
    "/dev/null",
    "/dev/full",
    "/dev/zero",
    "/dev/random",
    "/dev/urandom",
    "/dev/ptmx",
    "/dev/kvm",
    "/dev/kqemu",
    "/dev/rtc",
    "/dev/hpet",
    "/dev/vfio/vfio",
    NULL,
};

/* Character device major numbers that are allowed as a whole
 * (audited as "pty" and "sound" respectively). */
#define DEVICE_PTY_MAJOR 136
#define DEVICE_SND_MAJOR 116

53 54 55 56 57
/*
 * qemuSetImageCgroupInternal:
 * @vm: domain object
 * @src: storage source whose path is allowed or denied
 * @deny: true to deny the path, false to allow it
 * @forceReadonly: when allowing, never grant write access
 *
 * Updates the devices cgroup of @vm for a single image. Nothing is
 * done (and 0 is returned) when the devices controller is absent,
 * when @src has no path, or when @src is not local storage.
 * An EACCES failure is deliberately ignored.
 *
 * Returns 0 on success or when skipped, a negative value otherwise.
 */
static int
qemuSetImageCgroupInternal(virDomainObjPtr vm,
                           virStorageSourcePtr src,
                           bool deny,
                           bool forceReadonly)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int access = VIR_CGROUP_DEVICE_READ;
    int rc;

    if (!virCgroupHasController(priv->cgroup,
                                VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (!src->path || !virStorageSourceIsLocalStorage(src)) {
        VIR_DEBUG("Not updating cgroups for disk path '%s', type: %s",
                  NULLSTR(src->path), virStorageTypeToString(src->type));
        return 0;
    }

    if (!deny) {
        /* write access only for writable images that are not forced
         * to read-only by the caller */
        if (!(src->readonly || forceReadonly))
            access |= VIR_CGROUP_DEVICE_WRITE;

        VIR_DEBUG("Allow path %s, perms: %s",
                  src->path, virCgroupGetDevicePermsString(access));

        rc = virCgroupAllowDevicePath(priv->cgroup, src->path, access);
    } else {
        /* revoke every kind of access */
        access |= VIR_CGROUP_DEVICE_WRITE | VIR_CGROUP_DEVICE_MKNOD;

        VIR_DEBUG("Deny path %s", src->path);

        rc = virCgroupDenyDevicePath(priv->cgroup, src->path, access);
    }

    virDomainAuditCgroupPath(vm, priv->cgroup,
                             deny ? "deny" : "allow",
                             src->path,
                             virCgroupGetDevicePermsString(access),
                             rc == 0);

    /* Get this for root squash NFS */
    if (rc < 0 && virLastErrorIsSystemErrno(EACCES)) {
        VIR_DEBUG("Ignoring EACCES for %s", src->path);
        virResetLastError();
        rc = 0;
    }

    return rc;
}


107 108 109 110 111 112 113 114 115
/*
 * qemuSetImageCgroup:
 * @vm: domain object
 * @src: storage source to allow or deny
 * @deny: true to deny access, false to allow it
 *
 * Public entry point for a single image; read-only access is never
 * forced here, so an allowed writable image gets write permission.
 *
 * Returns the result of qemuSetImageCgroupInternal().
 */
int
qemuSetImageCgroup(virDomainObjPtr vm,
                   virStorageSourcePtr src,
                   bool deny)
{
    const bool forceReadonly = false;

    return qemuSetImageCgroupInternal(vm, src, deny, forceReadonly);
}


116 117 118
/*
 * qemuSetupDiskCgroup:
 * @vm: domain object
 * @disk: disk whose whole backing chain is allowed
 *
 * Walks the backing chain starting at @disk->src and allows each
 * image in the cgroup. Only the top-level image may be granted
 * write access; every backing image is forced to read-only.
 *
 * Returns 0 on success, -1 as soon as one image fails.
 */
int
qemuSetupDiskCgroup(virDomainObjPtr vm,
                    virDomainDiskDefPtr disk)
{
    virStorageSourcePtr img = disk->src;
    bool isTop = true;

    while (img) {
        /* anything below the top of the chain is read-only */
        if (qemuSetImageCgroupInternal(vm, img, false, !isTop) < 0)
            return -1;

        isTop = false;
        img = img->backingStore;
    }

    return 0;
}


135 136 137
/*
 * qemuTeardownDiskCgroup:
 * @vm: domain object
 * @disk: disk whose whole backing chain is denied
 *
 * Denies every image of the backing chain starting at @disk->src.
 *
 * Returns 0 on success, -1 as soon as one image fails (remaining
 * images in the chain are then left untouched).
 */
int
qemuTeardownDiskCgroup(virDomainObjPtr vm,
                       virDomainDiskDefPtr disk)
{
    virStorageSourcePtr img = disk->src;

    while (img) {
        if (qemuSetImageCgroup(vm, img, true) < 0)
            return -1;

        img = img->backingStore;
    }

    return 0;
}

149

150
static int
151
qemuSetupChrSourceCgroup(virDomainDefPtr def ATTRIBUTE_UNUSED,
152
                         virDomainChrSourceDefPtr dev,
153
                         void *opaque)
154
{
155 156
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
157
    int ret;
158

159
    if (dev->type != VIR_DOMAIN_CHR_TYPE_DEV)
160 161
        return 0;

162
    VIR_DEBUG("Process path '%s' for device", dev->data.file.path);
163

164 165
    ret = virCgroupAllowDevicePath(priv->cgroup, dev->data.file.path,
                                   VIR_CGROUP_DEVICE_RW);
166
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
167
                             dev->data.file.path, "rw", ret == 0);
168

169
    return ret;
170 171
}

172 173 174 175 176
/*
 * qemuSetupChardevCgroup:
 * @def: domain definition, forwarded as-is
 * @dev: character device definition
 * @opaque: the virDomainObjPtr of the domain
 *
 * Adapter used with virDomainChrDefForeach(): forwards the source of
 * @dev to qemuSetupChrSourceCgroup().
 */
static int
qemuSetupChardevCgroup(virDomainDefPtr def,
                       virDomainChrDefPtr dev,
                       void *opaque)
{
    virDomainChrSourceDefPtr source = &dev->source;

    return qemuSetupChrSourceCgroup(def, source, opaque);
}


/*
 * qemuSetupTPMCgroup:
 * @def: domain definition, forwarded as-is
 * @dev: TPM device definition
 * @opaque: the virDomainObjPtr of the domain
 *
 * For a passthrough TPM, sets up the cgroup for its character device
 * source. Any other type is a no-op.
 *
 * Returns 0 when nothing is done, otherwise the result of
 * qemuSetupChrSourceCgroup().
 */
static int
qemuSetupTPMCgroup(virDomainDefPtr def,
                   virDomainTPMDefPtr dev,
                   void *opaque)
{
    switch (dev->type) {
    case VIR_DOMAIN_TPM_TYPE_PASSTHROUGH:
        return qemuSetupChrSourceCgroup(def,
                                        &dev->data.passthrough.source,
                                        opaque);
    case VIR_DOMAIN_TPM_TYPE_LAST:
        break;
    }

    return 0;
}

200

201
static int
202
qemuSetupHostUSBDeviceCgroup(virUSBDevicePtr dev ATTRIBUTE_UNUSED,
203 204
                             const char *path,
                             void *opaque)
205
{
206 207
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
208
    int ret;
209 210

    VIR_DEBUG("Process path '%s' for USB device", path);
211 212 213
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path, "rw", ret == 0);
214

215
    return ret;
216 217
}

218
static int
219
qemuSetupHostSCSIDeviceCgroup(virSCSIDevicePtr dev ATTRIBUTE_UNUSED,
220 221 222 223 224
                              const char *path,
                              void *opaque)
{
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
225
    int ret;
226 227 228

    VIR_DEBUG("Process path '%s' for SCSI device", path);

229 230 231 232
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   virSCSIDeviceGetReadonly(dev) ?
                                   VIR_CGROUP_DEVICE_READ :
                                   VIR_CGROUP_DEVICE_RW);
233 234

    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path,
235
                             virSCSIDeviceGetReadonly(dev) ? "r" : "rw", ret == 0);
236

237
    return ret;
238
}
239

240 241 242 243 244 245
/*
 * qemuSetupHostdevCGroup:
 * @vm: domain object
 * @dev: host device definition
 *
 * Allows the host device nodes needed by @dev in the domain's
 * devices cgroup. It is called for all hostdev devices and acts on:
 *  - PCI devices using the VFIO backend (their IOMMU group device),
 *  - USB devices that are not marked missing,
 *  - SCSI devices that do not use the iSCSI protocol.
 * Everything else, and any device when the devices controller is
 * absent, is a no-op.
 *
 * Returns 0 on success or when nothing is done, -1 on failure.
 */
int
qemuSetupHostdevCGroup(virDomainObjPtr vm,
                       virDomainHostdevDefPtr dev)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virPCIDevicePtr pciDev = NULL;
    virUSBDevicePtr usbDev = NULL;
    virSCSIDevicePtr scsiDev = NULL;
    char *groupPath = NULL;
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    /* only subsystem-mode hostdevs need device nodes */
    if (dev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS)
        return 0;

    switch (dev->source.subsys.type) {
    case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI: {
        virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci;
        int rc;

        if (pcisrc->backend != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO)
            break;

        pciDev = virPCIDeviceNew(pcisrc->addr.domain,
                                 pcisrc->addr.bus,
                                 pcisrc->addr.slot,
                                 pcisrc->addr.function);
        if (!pciDev)
            goto cleanup;

        groupPath = virPCIDeviceGetIOMMUGroupDev(pciDev);
        if (!groupPath)
            goto cleanup;

        VIR_DEBUG("Cgroup allow %s for PCI device assignment", groupPath);
        rc = virCgroupAllowDevicePath(priv->cgroup, groupPath,
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, priv->cgroup,
                                 "allow", groupPath, "rw", rc == 0);
        if (rc < 0)
            goto cleanup;
        break;
    }

    case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB: {
        virDomainHostdevSubsysUSBPtr usbsrc = &dev->source.subsys.u.usb;

        /* NB: hostdev->missing wasn't previously checked in the
         * case of hotplug, only when starting a domain. Now it is
         * always checked, and the cgroup setup skipped if true.
         */
        if (dev->missing)
            break;

        usbDev = virUSBDeviceNew(usbsrc->bus, usbsrc->device, NULL);
        if (!usbDev)
            goto cleanup;

        /* oddly, qemuSetupHostUSBDeviceCgroup doesn't ever
         * reference the usb object we just created
         */
        if (virUSBDeviceFileIterate(usbDev, qemuSetupHostUSBDeviceCgroup,
                                    vm) < 0)
            goto cleanup;
        break;
    }

    case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_SCSI: {
        virDomainHostdevSubsysSCSIPtr scsisrc = &dev->source.subsys.u.scsi;

        if (scsisrc->protocol ==
            VIR_DOMAIN_HOSTDEV_SCSI_PROTOCOL_TYPE_ISCSI) {
            virDomainHostdevSubsysSCSIiSCSIPtr iscsisrc = &scsisrc->u.iscsi;

            /* Follow qemuSetupDiskCgroup() and qemuSetImageCgroupInternal()
             * which does nothing for non local storage
             */
            VIR_DEBUG("Not updating cgroups for hostdev iSCSI path '%s'",
                      iscsisrc->path);
            break;
        } else {
            virDomainHostdevSubsysSCSIHostPtr scsihostsrc = &scsisrc->u.host;

            scsiDev = virSCSIDeviceNew(NULL,
                                       scsihostsrc->adapter,
                                       scsihostsrc->bus,
                                       scsihostsrc->target,
                                       scsihostsrc->unit,
                                       dev->readonly,
                                       dev->shareable);
            if (!scsiDev)
                goto cleanup;

            if (virSCSIDeviceFileIterate(scsiDev,
                                         qemuSetupHostSCSIDeviceCgroup,
                                         vm) < 0)
                goto cleanup;
        }
        break;
    }

    default:
        break;
    }

    ret = 0;
 cleanup:
    virPCIDeviceFree(pciDev);
    virUSBDeviceFree(usbDev);
    virSCSIDeviceFree(scsiDev);
    VIR_FREE(groupPath);
    return ret;
}

/*
 * qemuTeardownHostdevCgroup:
 * @vm: domain object
 * @dev: host device definition
 *
 * Denies (rwm) the IOMMU group device of a PCI hostdev that uses the
 * VFIO backend. It is called for all hostdev devices, but every
 * other kind is a no-op, as is any device when the devices
 * controller is absent.
 *
 * Returns 0 on success or when nothing is done, -1 on failure.
 */
int
qemuTeardownHostdevCgroup(virDomainObjPtr vm,
                       virDomainHostdevDefPtr dev)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virPCIDevicePtr pciDev = NULL;
    char *groupPath = NULL;
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS)
        return 0;

    switch (dev->source.subsys.type) {
    case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI: {
        virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci;
        int rc;

        if (pcisrc->backend != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO)
            break;

        pciDev = virPCIDeviceNew(pcisrc->addr.domain,
                                 pcisrc->addr.bus,
                                 pcisrc->addr.slot,
                                 pcisrc->addr.function);
        if (!pciDev)
            goto cleanup;

        groupPath = virPCIDeviceGetIOMMUGroupDev(pciDev);
        if (!groupPath)
            goto cleanup;

        VIR_DEBUG("Cgroup deny %s for PCI device assignment", groupPath);
        rc = virCgroupDenyDevicePath(priv->cgroup, groupPath,
                                     VIR_CGROUP_DEVICE_RWM);
        virDomainAuditCgroupPath(vm, priv->cgroup,
                                 "deny", groupPath, "rwm", rc == 0);
        if (rc < 0)
            goto cleanup;
        break;
    }

    case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB:
        /* nothing to tear down for USB */
        break;

    default:
        break;
    }

    ret = 0;
 cleanup:
    virPCIDeviceFree(pciDev);
    VIR_FREE(groupPath);
    return ret;
}

412 413 414 415
/*
 * qemuSetupBlkioCgroup:
 * @vm: domain object
 *
 * Applies the block I/O tuning from the domain definition: the
 * global weight plus, per device, weight, read/write IOPS and
 * read/write bytes-per-second. Values equal to zero are not written.
 *
 * Returns 0 on success, -1 on failure. Requesting any blkio tuning
 * without the blkio controller is reported as an error.
 */
static int
qemuSetupBlkioCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    size_t idx;

    if (!virCgroupHasController(priv->cgroup,
                                VIR_CGROUP_CONTROLLER_BLKIO)) {
        /* no controller is fine as long as nothing was requested */
        if (!vm->def->blkio.weight && !vm->def->blkio.ndevices)
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Block I/O tuning is not available on this host"));
        return -1;
    }

    if (vm->def->blkio.weight != 0 &&
        virCgroupSetBlkioWeight(priv->cgroup, vm->def->blkio.weight) < 0)
        return -1;

    for (idx = 0; idx < vm->def->blkio.ndevices; idx++) {
        virBlkioDevicePtr tune = &vm->def->blkio.devices[idx];

        if (tune->weight &&
            virCgroupSetBlkioDeviceWeight(priv->cgroup, tune->path,
                                          tune->weight) < 0)
            return -1;

        if (tune->riops &&
            virCgroupSetBlkioDeviceReadIops(priv->cgroup, tune->path,
                                            tune->riops) < 0)
            return -1;

        if (tune->wiops &&
            virCgroupSetBlkioDeviceWriteIops(priv->cgroup, tune->path,
                                             tune->wiops) < 0)
            return -1;

        if (tune->rbps &&
            virCgroupSetBlkioDeviceReadBps(priv->cgroup, tune->path,
                                           tune->rbps) < 0)
            return -1;

        if (tune->wbps &&
            virCgroupSetBlkioDeviceWriteBps(priv->cgroup, tune->path,
                                            tune->wbps) < 0)
            return -1;
    }

    return 0;
}

466

467 468 469 470 471
/*
 * qemuSetupMemoryCgroup:
 * @vm: domain object
 *
 * Applies the hard, soft and swap hard memory limits from the domain
 * definition. Limits equal to zero are not written.
 *
 * Returns 0 on success, -1 on failure. Requesting any limit without
 * the memory controller is reported as an error.
 */
static int
qemuSetupMemoryCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) {
        /* no controller is fine as long as no limit was requested */
        if (vm->def->mem.hard_limit == 0 &&
            vm->def->mem.soft_limit == 0 &&
            vm->def->mem.swap_hard_limit == 0)
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Memory cgroup is not available on this host"));
        return -1;
    }

    if (vm->def->mem.hard_limit != 0 &&
        virCgroupSetMemoryHardLimit(priv->cgroup,
                                    vm->def->mem.hard_limit) < 0)
        return -1;

    if (vm->def->mem.soft_limit != 0 &&
        virCgroupSetMemorySoftLimit(priv->cgroup,
                                    vm->def->mem.soft_limit) < 0)
        return -1;

    if (vm->def->mem.swap_hard_limit != 0 &&
        virCgroupSetMemSwapHardLimit(priv->cgroup,
                                     vm->def->mem.swap_hard_limit) < 0)
        return -1;

    return 0;
}


500 501 502 503 504 505 506
/*
 * qemuSetupDevicesCgroup:
 * @driver: QEMU driver, used to obtain the configuration
 * @vm: domain object
 *
 * Builds the device whitelist of the domain: denies all devices,
 * then allows, in this order, the disks, the pty major, optionally
 * the sound major, the configured (or default) device ACL, character
 * devices, the TPM, host devices and RNG devices with the "random"
 * backend.
 *
 * If denying all devices fails with EPERM the whitelist is given up
 * with a warning and 0 is returned. ENOENT failures for ACL entries
 * and RNG sources are ignored.
 *
 * Returns 0 on success or when the devices controller is absent,
 * -1 on failure.
 */
static int
qemuSetupDevicesCgroup(virQEMUDriverPtr driver,
                       virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = NULL;
    const char *const *acl = NULL;
    const char *const *entry;
    bool hostAudio;
    int rc = -1;
    int ret = -1;
    size_t n;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    rc = virCgroupDenyAllDevices(priv->cgroup);
    virDomainAuditCgroup(vm, priv->cgroup, "deny", "all", rc == 0);
    if (rc < 0) {
        if (!virLastErrorIsSystemErrno(EPERM))
            goto cleanup;

        virResetLastError();
        VIR_WARN("Group devices ACL is not accessible, disabling whitelisting");
        return 0;
    }

    for (n = 0; n < vm->def->ndisks; n++) {
        if (qemuSetupDiskCgroup(vm, vm->def->disks[n]) < 0)
            goto cleanup;
    }

    rc = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_PTY_MAJOR,
                                   VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_PTY_MAJOR,
                              "pty", "rw", rc == 0);
    if (rc < 0)
        goto cleanup;

    cfg = virQEMUDriverGetConfig(driver);
    if (cfg->cgroupDeviceACL)
        acl = (const char *const *)cfg->cgroupDeviceACL;
    else
        acl = defaultDeviceACL;

    /* sound devices are exposed only when the domain has a sound card
     * and either has no graphics with nogfxAllowHostAudio set, or its
     * first graphics device is SDL, or VNC with vncAllowHostAudio set */
    hostAudio = vm->def->nsounds &&
        ((!vm->def->ngraphics && cfg->nogfxAllowHostAudio) ||
         (vm->def->graphics &&
          ((vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_SDL) ||
           (vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_VNC &&
            cfg->vncAllowHostAudio))));

    if (hostAudio) {
        rc = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_SND_MAJOR,
                                       VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_SND_MAJOR,
                                  "sound", "rw", rc == 0);
        if (rc < 0)
            goto cleanup;
    }

    for (entry = acl; *entry != NULL; entry++) {
        if (!virFileExists(*entry)) {
            VIR_DEBUG("Ignoring non-existent device %s", *entry);
            continue;
        }

        rc = virCgroupAllowDevicePath(priv->cgroup, *entry,
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, priv->cgroup, "allow", *entry,
                                 "rw", rc == 0);
        if (rc < 0 && !virLastErrorIsSystemErrno(ENOENT))
            goto cleanup;
    }

    if (virDomainChrDefForeach(vm->def, true,
                               qemuSetupChardevCgroup, vm) < 0)
        goto cleanup;

    if (vm->def->tpm &&
        qemuSetupTPMCgroup(vm->def, vm->def->tpm, vm) < 0)
        goto cleanup;

    for (n = 0; n < vm->def->nhostdevs; n++) {
        if (qemuSetupHostdevCGroup(vm, vm->def->hostdevs[n]) < 0)
            goto cleanup;
    }

    for (n = 0; n < vm->def->nrngs; n++) {
        if (vm->def->rngs[n]->backend != VIR_DOMAIN_RNG_BACKEND_RANDOM)
            continue;

        VIR_DEBUG("Setting Cgroup ACL for RNG device");
        rc = virCgroupAllowDevicePath(priv->cgroup,
                                      vm->def->rngs[n]->source.file,
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
                                 vm->def->rngs[n]->source.file,
                                 "rw", rc == 0);
        if (rc < 0 && !virLastErrorIsSystemErrno(ENOENT))
            goto cleanup;
    }

    ret = 0;
 cleanup:
    virObjectUnref(cfg);
    return ret;
}


610
int
611 612
qemuSetupCpusetMems(virDomainObjPtr vm,
                    virBitmapPtr nodemask)
613
{
614
    virCgroupPtr cgroup_temp = NULL;
615
    qemuDomainObjPrivatePtr priv = vm->privateData;
616
    char *mem_mask = NULL;
617 618 619 620 621
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

622 623 624 625
    if (virDomainNumatuneGetMode(vm->def->numatune, -1) !=
        VIR_DOMAIN_NUMATUNE_MEM_STRICT)
        return 0;

626 627
    if (virDomainNumatuneMaybeFormatNodeset(vm->def->numatune,
                                            nodemask,
628
                                            &mem_mask, -1) < 0)
629
        goto cleanup;
630

631 632 633 634 635
    if (mem_mask)
        if (virCgroupNewEmulator(priv->cgroup, false, &cgroup_temp) < 0 ||
            virCgroupSetCpusetMems(cgroup_temp, mem_mask) < 0 ||
            virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0)
            goto cleanup;
636

637 638 639
    ret = 0;
 cleanup:
    VIR_FREE(mem_mask);
640
    virCgroupFree(&cgroup_temp);
641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656
    return ret;
}


/*
 * qemuSetupCpusetCgroup:
 * @vm: domain object
 * @nodemask: node mask used for automatic placement
 * @caps: capabilities used to map @nodemask to CPUs
 *
 * Writes cpuset.cpus of the domain cgroup. With automatic placement
 * the CPU set is derived from @nodemask through @caps; otherwise the
 * domain's cpumask is used when one is defined. Without either,
 * nothing is written.
 *
 * Returns 0 on success or when nothing is done, -1 on failure.
 */
static int
qemuSetupCpusetCgroup(virDomainObjPtr vm,
                      virBitmapPtr nodemask,
                      virCapsPtr caps)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    char *cpus = NULL;
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

    if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
        virBitmapPtr nodeCpus;

        nodeCpus = virCapabilitiesGetCpusForNodemask(caps, nodemask);
        if (!nodeCpus)
            goto cleanup;

        cpus = virBitmapFormat(nodeCpus);
        virBitmapFree(nodeCpus);
    } else if (vm->def->cpumask) {
        cpus = virBitmapFormat(vm->def->cpumask);
    } else {
        /* neither automatic placement nor an explicit mask */
        return 0;
    }

    if (!cpus)
        goto cleanup;

    if (virCgroupSetCpusetCpus(priv->cgroup, cpus) < 0)
        goto cleanup;

    ret = 0;
 cleanup:
    VIR_FREE(cpus);
    return ret;
}


684
static int
685 686
qemuSetupCpuCgroup(virQEMUDriverPtr driver,
                   virDomainObjPtr vm)
687 688
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
689 690 691 692
    virObjectEventPtr event = NULL;
    virTypedParameterPtr eventParams = NULL;
    int eventNparams = 0;
    int eventMaxparams = 0;
693 694

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
695
       if (vm->def->cputune.sharesSpecified) {
696 697 698 699 700 701 702 703
           virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                          _("CPU tuning is not available on this host"));
           return -1;
       } else {
           return 0;
       }
    }

704 705 706 707 708 709 710
    if (vm->def->cputune.sharesSpecified) {
        unsigned long long val;
        if (virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares) < 0)
            return -1;

        if (virCgroupGetCpuShares(priv->cgroup, &val) < 0)
            return -1;
711 712 713 714
        if (vm->def->cputune.shares != val) {
            vm->def->cputune.shares = val;
            if (virTypedParamsAddULLong(&eventParams, &eventNparams,
                                        &eventMaxparams,
715
                                        VIR_DOMAIN_TUNABLE_CPU_CPU_SHARES,
716 717 718 719 720 721 722
                                        val) < 0)
                return -1;

            event = virDomainEventTunableNewFromObj(vm, eventParams, eventNparams);
        }

        if (event)
723
            qemuDomainEventQueue(driver, event);
724
    }
725 726 727 728 729

    return 0;
}


730
static int
731
qemuInitCgroup(virQEMUDriverPtr driver,
732
               virDomainObjPtr vm)
733
{
734
    int ret = -1;
735 736 737
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);

738 739 740
    if (!cfg->privileged)
        goto done;

741 742 743
    if (!virCgroupAvailable())
        goto done;

744 745
    virCgroupFree(&priv->cgroup);

746
    if (!vm->def->resource) {
747 748
        virDomainResourceDefPtr res;

749
        if (VIR_ALLOC(res) < 0)
750
            goto cleanup;
751

752
        if (VIR_STRDUP(res->partition, "/machine") < 0) {
753 754 755 756 757
            VIR_FREE(res);
            goto cleanup;
        }

        vm->def->resource = res;
758 759
    }

760 761 762 763 764 765
    if (vm->def->resource->partition[0] != '/') {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Resource partition '%s' must start with '/'"),
                       vm->def->resource->partition);
        goto cleanup;
    }
766 767 768 769 770 771 772 773 774 775 776

    if (virCgroupNewMachine(vm->def->name,
                            "qemu",
                            cfg->privileged,
                            vm->def->uuid,
                            NULL,
                            vm->pid,
                            false,
                            vm->def->resource->partition,
                            cfg->cgroupControllers,
                            &priv->cgroup) < 0) {
777 778
        if (virCgroupNewIgnoreError())
            goto done;
779

780 781
        goto cleanup;
    }
782

783
 done:
784
    ret = 0;
785
 cleanup:
786 787 788
    virObjectUnref(cfg);
    return ret;
}
789

790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806

/*
 * qemuConnectCgroup:
 * @driver: QEMU driver, used to obtain the configuration
 * @vm: domain object
 *
 * Detects the existing machine cgroup of @vm and stores it in the
 * private data, dropping any previously held cgroup first. Nothing
 * is done for an unprivileged driver or when cgroups are unavailable.
 *
 * Returns 0 on success or when skipped, -1 on failure.
 */
int
qemuConnectCgroup(virQEMUDriverPtr driver,
                  virDomainObjPtr vm)
{
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
    qemuDomainObjPrivatePtr priv = vm->privateData;
    const char *partition = NULL;
    int ret = -1;

    if (!cfg->privileged || !virCgroupAvailable())
        goto done;

    virCgroupFree(&priv->cgroup);

    if (vm->def->resource)
        partition = vm->def->resource->partition;

    if (virCgroupNewDetectMachine(vm->def->name,
                                  "qemu",
                                  vm->pid,
                                  partition,
                                  cfg->cgroupControllers,
                                  &priv->cgroup) < 0)
        goto cleanup;

 done:
    ret = 0;
 cleanup:
    virObjectUnref(cfg);
    return ret;
}

824 825 826 827
/*
 * qemuSetupCgroup:
 * @driver: QEMU driver
 * @vm: domain object; its process must already be started
 * @nodemask: node mask forwarded to the cpuset setup
 *
 * Creates the domain cgroup and configures, in this order, the
 * devices, blkio, memory, cpu and cpuset controllers. If no cgroup
 * ends up being created the function succeeds without doing more.
 *
 * Returns 0 on success, -1 on failure.
 */
int
qemuSetupCgroup(virQEMUDriverPtr driver,
                virDomainObjPtr vm,
                virBitmapPtr nodemask)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virCapsPtr caps = NULL;
    int ret = -1;

    if (!vm->pid) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Cannot setup cgroups until process is started"));
        return -1;
    }

    if (qemuInitCgroup(driver, vm) < 0)
        return -1;

    /* initialization may legitimately leave us without a cgroup */
    if (!priv->cgroup)
        return 0;

    caps = virQEMUDriverGetCapabilities(driver, false);
    if (!caps)
        goto cleanup;

    /* stops at the first controller that fails */
    if (qemuSetupDevicesCgroup(driver, vm) < 0 ||
        qemuSetupBlkioCgroup(vm) < 0 ||
        qemuSetupMemoryCgroup(vm) < 0 ||
        qemuSetupCpuCgroup(driver, vm) < 0 ||
        qemuSetupCpusetCgroup(vm, nodemask, caps) < 0)
        goto cleanup;

    ret = 0;
 cleanup:
    virObjectUnref(caps);
    return ret;
}

869 870 871 872
/*
 * qemuSetupCgroupVcpuBW:
 * @cgroup: cgroup to configure
 * @period: CFS period, 0 to leave unchanged
 * @quota: CFS quota, 0 to leave unchanged
 *
 * Sets the CFS period and/or quota of @cgroup. If the period was
 * changed and setting the quota then fails, the previous period is
 * restored on a best-effort basis while the original error is kept.
 *
 * Returns 0 on success or when both values are 0, -1 on failure.
 */
int
qemuSetupCgroupVcpuBW(virCgroupPtr cgroup,
                      unsigned long long period,
                      long long quota)
{
    unsigned long long previousPeriod;

    if (period == 0 && quota == 0)
        return 0;

    if (period) {
        /* get old period, and we can rollback if set quota failed */
        if (virCgroupGetCpuCfsPeriod(cgroup, &previousPeriod) < 0)
            return -1;

        if (virCgroupSetCpuCfsPeriod(cgroup, period) < 0)
            return -1;
    }

    if (!quota || virCgroupSetCpuCfsQuota(cgroup, quota) >= 0)
        return 0;

    /* setting the quota failed: roll the period back */
    if (period) {
        virErrorPtr origErr = virSaveLastError();

        ignore_value(virCgroupSetCpuCfsPeriod(cgroup, previousPeriod));
        if (origErr) {
            virSetError(origErr);
            virFreeError(origErr);
        }
    }

    return -1;
}

907 908 909 910 911
/*
 * qemuSetupCgroupVcpuPin:
 * @cgroup: cgroup to pin
 * @vcpupin: array of vcpu pinning definitions
 * @nvcpupin: number of entries in @vcpupin
 * @vcpuid: id of the vcpu to look up
 *
 * Finds the entry of @vcpupin whose vcpuid equals @vcpuid and applies
 * its cpumask to @cgroup through qemuSetupCgroupEmulatorPin().
 *
 * Returns the result of qemuSetupCgroupEmulatorPin() for the matching
 * entry, or -1 when no entry matches or @nvcpupin is not positive.
 */
int
qemuSetupCgroupVcpuPin(virCgroupPtr cgroup,
                       virDomainVcpuPinDefPtr *vcpupin,
                       int nvcpupin,
                       int vcpuid)
{
    size_t i;

    /* A negative count would be converted to a huge unsigned bound in
     * the comparison below and walk off the end of the array. */
    if (nvcpupin <= 0)
        return -1;

    for (i = 0; i < (size_t)nvcpupin; i++) {
        if (vcpuid == vcpupin[i]->vcpuid)
            return qemuSetupCgroupEmulatorPin(cgroup, vcpupin[i]->cpumask);
    }

    return -1;
}

923 924 925 926 927 928 929 930 931
/*
 * qemuSetupCgroupIOThreadsPin:
 * @cgroup: cgroup to pin
 * @iothreadspin: array of iothread pinning definitions
 * @niothreadspin: number of entries in @iothreadspin
 * @iothreadid: id of the iothread to look up
 *
 * Finds the entry of @iothreadspin whose vcpuid field equals
 * @iothreadid and applies its cpumask to @cgroup through
 * qemuSetupCgroupEmulatorPin().
 *
 * Returns the result of qemuSetupCgroupEmulatorPin() for the matching
 * entry, or -1 when no entry matches or @niothreadspin is not
 * positive.
 */
int
qemuSetupCgroupIOThreadsPin(virCgroupPtr cgroup,
                            virDomainVcpuPinDefPtr *iothreadspin,
                            int niothreadspin,
                            int iothreadid)
{
    size_t i;

    /* A negative count would be converted to a huge unsigned bound in
     * the comparison below and walk off the end of the array. */
    if (niothreadspin <= 0)
        return -1;

    for (i = 0; i < (size_t)niothreadspin; i++) {
        if (iothreadid == iothreadspin[i]->vcpuid)
            return qemuSetupCgroupEmulatorPin(cgroup, iothreadspin[i]->cpumask);
    }

    return -1;
}

939 940 941
/*
 * qemuSetupCgroupEmulatorPin:
 * @cgroup: cgroup to pin
 * @cpumask: bitmap of CPUs
 *
 * Formats @cpumask as a string and writes it to cpuset.cpus of
 * @cgroup.
 *
 * Returns 0 on success, -1 if formatting or writing fails.
 */
int
qemuSetupCgroupEmulatorPin(virCgroupPtr cgroup,
                           virBitmapPtr cpumask)
{
    char *cpus = virBitmapFormat(cpumask);
    int ret = -1;

    if (cpus && virCgroupSetCpusetCpus(cgroup, cpus) >= 0)
        ret = 0;

    VIR_FREE(cpus);
    return ret;
}

958 959
/*
 * qemuSetupCgroupForVcpu:
 * @vm: domain object whose vCPU threads are to be set up
 *
 * For every entry of priv->vcpupids a per-vCPU child cgroup is created
 * below priv->cgroup and the vCPU thread is added to it.  When a CFS
 * period or quota is configured in def->cputune it is applied to the
 * child group, and when the cpuset controller is present the matching
 * vcpupin entry (if any) is applied as well.
 *
 * Returns 0 on success or when there is nothing to do, -1 on failure.
 * On failure only the child cgroup that was being set up at that point
 * is removed here.
 */
int
qemuSetupCgroupForVcpu(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    virCgroupPtr vcpu_group = NULL;
    unsigned long long period = def->cputune.period;
    long long quota = def->cputune.quota;
    size_t vcpu;
    size_t pin;

    /* Bandwidth tuning was requested but the cpu controller is missing */
    if ((period || quota) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    /*
     * Without the CPU controller there is no period/quota to apply,
     * and without the CPUSET controller there is no pinning to apply,
     * so with neither of them there is nothing left to do.
     */
    if (!(virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) ||
          virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)))
        return 0;

    /* CPU pinning can also be done with virProcessSetAffinity, so a
     * missing cgroup is not treated as fatal.
     */
    if (!priv->cgroup)
        return 0;

    /* If the VCPU<->PID mapping is unknown, or the first vcpu pid is
     * the pid of the domain itself, vcpus cannot be controlled
     * individually.
     */
    if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) {
        VIR_WARN("Unable to get vcpus' pids.");
        return 0;
    }

    for (vcpu = 0; vcpu < priv->nvcpupids; vcpu++) {
        /* create the sub dir and move the vcpu thread into it */
        if (virCgroupNewVcpu(priv->cgroup, vcpu, true, &vcpu_group) < 0 ||
            virCgroupAddTask(vcpu_group, priv->vcpupids[vcpu]) < 0)
            goto error;

        if ((period || quota) &&
            qemuSetupCgroupVcpuBW(vcpu_group, period, quota) < 0)
            goto error;

        /* Apply vcpupin only if an entry for this vcpu exists; calling
         * qemuSetupCgroupVcpuPin without one would fail.
         */
        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
            for (pin = 0; pin < def->cputune.nvcpupin; pin++) {
                if (def->cputune.vcpupin[pin]->vcpuid == vcpu) {
                    if (qemuSetupCgroupVcpuPin(vcpu_group,
                                               def->cputune.vcpupin,
                                               def->cputune.nvcpupin,
                                               vcpu) < 0)
                        goto error;

                    break;
                }
            }
        }

        virCgroupFree(&vcpu_group);
    }

    return 0;

 error:
    if (vcpu_group) {
        virCgroupRemove(vcpu_group);
        virCgroupFree(&vcpu_group);
    }

    return -1;
}

1043 1044 1045 1046
/*
 * qemuSetupCgroupForEmulator:
 * @driver: QEMU driver, passed to qemuPrepareCpumap() for automatic placement
 * @vm: domain object
 * @nodemask: node mask, passed to qemuPrepareCpumap() for automatic placement
 *
 * Creates the emulator child cgroup below priv->cgroup, calls
 * virCgroupMoveTask() from priv->cgroup to the new group, and then
 * applies CPU pinning and the emulator_period / emulator_quota
 * settings when the corresponding controllers are available.
 *
 * Returns 0 on success or when there is nothing to do, -1 on failure.
 * On failure the emulator cgroup, if it was created, is removed again.
 */
int
qemuSetupCgroupForEmulator(virQEMUDriverPtr driver,
                           virDomainObjPtr vm,
                           virBitmapPtr nodemask)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    virCgroupPtr emu_group = NULL;
    virBitmapPtr auto_map = NULL;  /* allocated here, freed on both exits below */
    virBitmapPtr pin_mask = NULL;  /* either auto_map or a mask taken from def */
    unsigned long long period = def->cputune.emulator_period;
    long long quota = def->cputune.emulator_quota;

    /* Bandwidth tuning was requested but the cpu controller is missing */
    if ((period || quota) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    /*
     * Without the CPU controller there is no period/quota to apply,
     * and without the CPUSET controller there is no pinning to apply,
     * so with neither of them there is nothing left to do.
     */
    if (!(virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) ||
          virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)))
        return 0;

    if (!priv->cgroup)
        return 0; /* Not supported, so claim success */

    if (virCgroupNewEmulator(priv->cgroup, true, &emu_group) < 0 ||
        virCgroupMoveTask(priv->cgroup, emu_group) < 0)
        goto error;

    /* Pick the mask to pin to: automatic placement first, then an
     * explicit emulatorpin, then the domain-wide cpumask (may be NULL).
     */
    if (def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
        auto_map = qemuPrepareCpumap(driver, nodemask);
        if (!auto_map)
            goto error;
        pin_mask = auto_map;
    } else if (def->cputune.emulatorpin) {
        pin_mask = def->cputune.emulatorpin->cpumask;
    } else {
        pin_mask = def->cpumask;
    }

    if (pin_mask &&
        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET) &&
        qemuSetupCgroupEmulatorPin(emu_group, pin_mask) < 0)
        goto error;

    if ((period || quota) &&
        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
        qemuSetupCgroupVcpuBW(emu_group, period, quota) < 0)
        goto error;

    virCgroupFree(&emu_group);
    virBitmapFree(auto_map);
    return 0;

 error:
    virBitmapFree(auto_map);

    if (emu_group) {
        virCgroupRemove(emu_group);
        virCgroupFree(&emu_group);
    }

    return -1;
}
1118

1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150
/*
 * qemuSetupCgroupForIOThreads:
 * @vm: domain object whose IOThreads are to be set up
 *
 * For every entry of priv->iothreadpids a per-IOThread child cgroup is
 * created below priv->cgroup and the thread is added to it.  When a
 * CFS period or quota is configured in def->cputune it is applied to
 * the child group, and when the cpuset controller is present the
 * matching iothreadspin entry (if any) is applied as well.
 *
 * Returns 0 on success or when there is nothing to do, -1 on failure.
 * On failure only the child cgroup that was being set up at that point
 * is removed here.
 */
int
qemuSetupCgroupForIOThreads(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    virCgroupPtr iothread_group = NULL;
    unsigned long long period = def->cputune.period;
    long long quota = def->cputune.quota;
    size_t idx;
    size_t pin;

    /* Bandwidth tuning was requested but the cpu controller is missing */
    if ((period || quota) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    /*
     * Without the CPU controller there is no period/quota to apply,
     * and without the CPUSET controller there is no pinning to apply,
     * so with neither of them there is nothing left to do.
     */
    if (!(virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) ||
          virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)))
        return 0;

    /* CPU pinning can also be done with virProcessSetAffinity, so a
     * missing cgroup is not treated as fatal.
     */
    if (!priv->cgroup)
        return 0;

    if (def->iothreads && priv->niothreadpids == 0) {
        VIR_WARN("Unable to get iothreads' pids.");
        return 0;
    }

    for (idx = 0; idx < priv->niothreadpids; idx++) {
        /* IOThreads are numbered 1..n while the pid array is indexed
         * 0..n-1, hence the "+ 1" wherever an IOThread id is needed.
         */
        if (virCgroupNewIOThread(priv->cgroup, idx + 1, true,
                                 &iothread_group) < 0 ||
            virCgroupAddTask(iothread_group, priv->iothreadpids[idx]) < 0)
            goto error;

        if ((period || quota) &&
            qemuSetupCgroupVcpuBW(iothread_group, period, quota) < 0)
            goto error;

        /* Apply iothreadpin only if an entry for this IOThread exists;
         * calling qemuSetupCgroupIOThreadsPin without one would fail.
         */
        if (virCgroupHasController(priv->cgroup,
                                   VIR_CGROUP_CONTROLLER_CPUSET)) {
            for (pin = 0; pin < def->cputune.niothreadspin; pin++) {
                if (def->cputune.iothreadspin[pin]->vcpuid == idx + 1) {
                    if (qemuSetupCgroupIOThreadsPin(iothread_group,
                                                    def->cputune.iothreadspin,
                                                    def->cputune.niothreadspin,
                                                    idx + 1) < 0)
                        goto error;

                    break;
                }
            }
        }

        virCgroupFree(&iothread_group);
    }

    return 0;

 error:
    if (iothread_group) {
        virCgroupRemove(iothread_group);
        virCgroupFree(&iothread_group);
    }

    return -1;
}

1207
int
1208 1209
qemuRemoveCgroup(virQEMUDriverPtr driver,
                 virDomainObjPtr vm)
1210
{
1211
    qemuDomainObjPrivatePtr priv = vm->privateData;
1212
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
1213

1214
    if (priv->cgroup == NULL)
1215 1216
        return 0; /* Not supported, so claim success */

1217 1218 1219 1220 1221 1222 1223
    if (virCgroupTerminateMachine(vm->def->name,
                                  "qemu",
                                  cfg->privileged) < 0) {
        if (!virCgroupNewIgnoreError())
            VIR_DEBUG("Failed to terminate cgroup for %s", vm->def->name);
    }

1224 1225
    virObjectUnref(cfg);

1226
    return virCgroupRemove(priv->cgroup);
1227 1228
}

1229 1230
/*
 * qemuAddToCgroup:
 * @vm: domain object
 *
 * Performs no cgroup operation: success (0) is reported both when the
 * domain has no cgroup and when it has one.
 */
int
qemuAddToCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int ret = 0;

    if (!priv->cgroup)
        return ret; /* Not supported, so claim success */

    return ret;
}