/*
 * qemu_cgroup.c: QEMU cgroup management
 *
 * Copyright (C) 2006-2014 Red Hat, Inc.
 * Copyright (C) 2006 Daniel P. Berrange
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * Author: Daniel P. Berrange <berrange@redhat.com>
 */

#include <config.h>

#include "qemu_cgroup.h"
27
#include "qemu_domain.h"
28
#include "qemu_process.h"
29
#include "vircgroup.h"
30
#include "virlog.h"
31
#include "viralloc.h"
32
#include "virerror.h"
33
#include "domain_audit.h"
34
#include "virscsi.h"
35
#include "virstring.h"
36
#include "virfile.h"
37 38 39

#define VIR_FROM_THIS VIR_FROM_QEMU

40 41
VIR_LOG_INIT("qemu.qemu_cgroup");

42 43 44 45
/*
 * NULL-terminated list of device nodes granted read-write access by
 * qemuSetupDevicesCgroup() when the driver configuration does not
 * provide its own cgroup device ACL.
 */
static const char *const defaultDeviceACL[] = {
    "/dev/null", "/dev/full", "/dev/zero",
    "/dev/random", "/dev/urandom",
    "/dev/ptmx", "/dev/kvm", "/dev/kqemu",
    "/dev/rtc", "/dev/hpet", "/dev/vfio/vfio",
    NULL,
};

/* Character device major numbers whitelisted as a whole by
 * qemuSetupDevicesCgroup() (audited there as "pty" and "sound"). */
#define DEVICE_PTY_MAJOR 136
#define DEVICE_SND_MAJOR 116

52 53 54 55 56
/**
 * qemuSetImageCgroupInternal:
 * @vm: domain object; its private data holds the cgroup to update
 * @src: storage source whose path is allowed or denied
 * @deny: true to revoke access to the path, false to grant it
 * @forceReadonly: when granting, never add write permission
 *
 * Updates the devices cgroup ACL for a single storage source and emits
 * an audit record for the attempt.  Nothing is done when the devices
 * controller is absent, or when @src has no path or is not local
 * storage.
 *
 * Returns 0 when skipped, otherwise the result of the allow/deny call,
 * except that a failure whose last error is EACCES is reset and
 * reported as success.
 */
static int
qemuSetImageCgroupInternal(virDomainObjPtr vm,
                           virStorageSourcePtr src,
                           bool deny,
                           bool forceReadonly)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    const char *action = "allow";
    int perms = VIR_CGROUP_DEVICE_READ;
    int rc;

    if (!virCgroupHasController(priv->cgroup,
                                VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (!src->path || !virStorageSourceIsLocalStorage(src)) {
        VIR_DEBUG("Not updating cgroups for disk path '%s', type: %s",
                  NULLSTR(src->path), virStorageTypeToString(src->type));
        return 0;
    }

    if (deny) {
        /* Revoking always removes every permission bit. */
        action = "deny";
        perms |= VIR_CGROUP_DEVICE_WRITE | VIR_CGROUP_DEVICE_MKNOD;

        VIR_DEBUG("Deny path %s", src->path);

        rc = virCgroupDenyDevicePath(priv->cgroup, src->path, perms);
    } else {
        bool writable = !src->readonly && !forceReadonly;

        if (writable)
            perms |= VIR_CGROUP_DEVICE_WRITE;

        VIR_DEBUG("Allow path %s, perms: %s",
                  src->path, virCgroupGetDevicePermsString(perms));

        rc = virCgroupAllowDevicePath(priv->cgroup, src->path, perms);
    }

    virDomainAuditCgroupPath(vm, priv->cgroup, action, src->path,
                             virCgroupGetDevicePermsString(perms),
                             rc == 0);

    if (rc >= 0 || !virLastErrorIsSystemErrno(EACCES))
        return rc;

    /* Get this for root squash NFS */
    VIR_DEBUG("Ignoring EACCES for %s", src->path);
    virResetLastError();
    return 0;
}


106 107 108 109 110 111 112 113 114
/**
 * qemuSetImageCgroup:
 * @vm: domain object
 * @src: storage source to allow or deny
 * @deny: true to revoke access, false to grant it
 *
 * Public entry point for a single image: forwards to
 * qemuSetImageCgroupInternal() without forcing read-only access.
 *
 * Returns whatever qemuSetImageCgroupInternal() returns.
 */
int
qemuSetImageCgroup(virDomainObjPtr vm,
                   virStorageSourcePtr src,
                   bool deny)
{
    const bool forceReadonly = false;

    return qemuSetImageCgroupInternal(vm, src, deny, forceReadonly);
}


115 116 117
/**
 * qemuSetupDiskCgroup:
 * @vm: domain object
 * @disk: disk definition whose image chain is to be allowed
 *
 * Walks the disk's storage source and every backing store behind it,
 * allowing each in the devices cgroup.  Only the first (top level)
 * image may be granted write access; all backing images are forced
 * read-only.
 *
 * Returns 0 on success, -1 as soon as one image fails.
 */
int
qemuSetupDiskCgroup(virDomainObjPtr vm,
                    virDomainDiskDefPtr disk)
{
    virStorageSourcePtr img = disk->src;
    bool topLevel = true;

    while (img) {
        if (qemuSetImageCgroupInternal(vm, img, false, !topLevel) < 0)
            return -1;

        /* setup only the top level image for read-write */
        topLevel = false;
        img = img->backingStore;
    }

    return 0;
}


134 135 136
/**
 * qemuTeardownDiskCgroup:
 * @vm: domain object
 * @disk: disk definition whose image chain is to be denied
 *
 * Revokes devices cgroup access for the disk's storage source and for
 * every backing store behind it.
 *
 * Returns 0 on success, -1 as soon as one image fails (remaining
 * images in the chain are then left untouched).
 */
int
qemuTeardownDiskCgroup(virDomainObjPtr vm,
                       virDomainDiskDefPtr disk)
{
    virStorageSourcePtr img = disk->src;

    while (img) {
        if (qemuSetImageCgroup(vm, img, true) < 0)
            return -1;

        img = img->backingStore;
    }

    return 0;
}

148

149
static int
150
qemuSetupChrSourceCgroup(virDomainDefPtr def ATTRIBUTE_UNUSED,
151
                         virDomainChrSourceDefPtr dev,
152
                         void *opaque)
153
{
154 155
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
156
    int ret;
157

158
    if (dev->type != VIR_DOMAIN_CHR_TYPE_DEV)
159 160
        return 0;

161
    VIR_DEBUG("Process path '%s' for device", dev->data.file.path);
162

163 164
    ret = virCgroupAllowDevicePath(priv->cgroup, dev->data.file.path,
                                   VIR_CGROUP_DEVICE_RW);
165
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
166
                             dev->data.file.path, "rw", ret == 0);
167

168
    return ret;
169 170
}

171 172 173 174 175
/**
 * qemuSetupChardevCgroup:
 * @def: domain definition, passed through
 * @dev: character device definition
 * @opaque: the virDomainObjPtr being set up
 *
 * Iterator callback (handed to virDomainChrDefForeach() by
 * qemuSetupDevicesCgroup()) that applies qemuSetupChrSourceCgroup()
 * to the source embedded in @dev.
 *
 * Returns whatever qemuSetupChrSourceCgroup() returns.
 */
static int
qemuSetupChardevCgroup(virDomainDefPtr def,
                       virDomainChrDefPtr dev,
                       void *opaque)
{
    virDomainChrSourceDefPtr source = &dev->source;

    return qemuSetupChrSourceCgroup(def, source, opaque);
}


/**
 * qemuSetupTPMCgroup:
 * @def: domain definition, passed through
 * @dev: TPM device definition
 * @opaque: the virDomainObjPtr being set up
 *
 * For a passthrough TPM, allows the character device source backing
 * it via qemuSetupChrSourceCgroup().  Other TPM types need nothing.
 *
 * Returns 0 when nothing is done, otherwise the result of
 * qemuSetupChrSourceCgroup().
 */
static int
qemuSetupTPMCgroup(virDomainDefPtr def,
                   virDomainTPMDefPtr dev,
                   void *opaque)
{
    switch (dev->type) {
    case VIR_DOMAIN_TPM_TYPE_PASSTHROUGH:
        return qemuSetupChrSourceCgroup(def,
                                        &dev->data.passthrough.source,
                                        opaque);

    case VIR_DOMAIN_TPM_TYPE_LAST:
        break;
    }

    return 0;
}

199

200
static int
201
qemuSetupHostUSBDeviceCgroup(virUSBDevicePtr dev ATTRIBUTE_UNUSED,
202 203
                             const char *path,
                             void *opaque)
204
{
205 206
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
207
    int ret;
208 209

    VIR_DEBUG("Process path '%s' for USB device", path);
210 211 212
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path, "rw", ret == 0);
213

214
    return ret;
215 216
}

217
static int
218
qemuSetupHostSCSIDeviceCgroup(virSCSIDevicePtr dev ATTRIBUTE_UNUSED,
219 220 221 222 223
                              const char *path,
                              void *opaque)
{
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
224
    int ret;
225 226 227

    VIR_DEBUG("Process path '%s' for SCSI device", path);

228 229 230 231
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   virSCSIDeviceGetReadonly(dev) ?
                                   VIR_CGROUP_DEVICE_READ :
                                   VIR_CGROUP_DEVICE_RW);
232 233

    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path,
234
                             virSCSIDeviceGetReadonly(dev) ? "r" : "rw", ret == 0);
235

236
    return ret;
237
}
238

239 240 241 242 243 244
/**
 * qemuSetupHostdevCGroup:
 * @vm: domain object
 * @dev: host device definition
 *
 * Allows the host device nodes needed by @dev in the devices cgroup.
 * It is called for all hostdev devices but only acts on subsystem
 * mode devices:
 *  - PCI: only for the VFIO backend, allows the IOMMU group device;
 *  - USB: allows every file of the device, unless it is flagged
 *    missing;
 *  - SCSI: allows every file of a host SCSI device; iSCSI sources are
 *    skipped, like non-local storage for disks.
 *
 * Returns 0 on success or when nothing had to be done (including when
 * the devices controller is absent), -1 on failure.
 */
int
qemuSetupHostdevCGroup(virDomainObjPtr vm,
                       virDomainHostdevDefPtr dev)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virPCIDevicePtr pci = NULL;
    virUSBDevicePtr usb = NULL;
    virSCSIDevicePtr scsi = NULL;
    char *path = NULL;
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS)
        return 0;

    switch (dev->source.subsys.type) {
    case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI: {
        virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci;
        int rv;

        if (pcisrc->backend != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO)
            break;

        pci = virPCIDeviceNew(pcisrc->addr.domain,
                              pcisrc->addr.bus,
                              pcisrc->addr.slot,
                              pcisrc->addr.function);
        if (!pci)
            goto cleanup;

        path = virPCIDeviceGetIOMMUGroupDev(pci);
        if (!path)
            goto cleanup;

        VIR_DEBUG("Cgroup allow %s for PCI device assignment", path);
        rv = virCgroupAllowDevicePath(priv->cgroup, path,
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, priv->cgroup,
                                 "allow", path, "rw", rv == 0);
        if (rv < 0)
            goto cleanup;
        break;
    }

    case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB: {
        virDomainHostdevSubsysUSBPtr usbsrc = &dev->source.subsys.u.usb;

        /* NB: hostdev->missing wasn't previously checked in the
         * case of hotplug, only when starting a domain. Now it is
         * always checked, and the cgroup setup skipped if true.
         */
        if (dev->missing)
            break;

        usb = virUSBDeviceNew(usbsrc->bus, usbsrc->device, NULL);
        if (usb == NULL)
            goto cleanup;

        /* oddly, qemuSetupHostUSBDeviceCgroup doesn't ever
         * reference the usb object we just created
         */
        if (virUSBDeviceFileIterate(usb, qemuSetupHostUSBDeviceCgroup,
                                    vm) < 0)
            goto cleanup;
        break;
    }

    case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_SCSI: {
        virDomainHostdevSubsysSCSIPtr scsisrc = &dev->source.subsys.u.scsi;
        virDomainHostdevSubsysSCSIiSCSIPtr iscsisrc = &scsisrc->u.iscsi;
        virDomainHostdevSubsysSCSIHostPtr scsihostsrc = &scsisrc->u.host;

        if (scsisrc->protocol ==
            VIR_DOMAIN_HOSTDEV_SCSI_PROTOCOL_TYPE_ISCSI) {
            /* Follow qemuSetupDiskCgroup() and qemuSetImageCgroupInternal()
             * which does nothing for non local storage
             */
            VIR_DEBUG("Not updating cgroups for hostdev iSCSI path '%s'",
                      iscsisrc->path);
            break;
        }

        scsi = virSCSIDeviceNew(NULL,
                                scsihostsrc->adapter,
                                scsihostsrc->bus,
                                scsihostsrc->target,
                                scsihostsrc->unit,
                                dev->readonly,
                                dev->shareable);
        if (scsi == NULL)
            goto cleanup;

        if (virSCSIDeviceFileIterate(scsi,
                                     qemuSetupHostSCSIDeviceCgroup,
                                     vm) < 0)
            goto cleanup;
        break;
    }

    default:
        break;
    }

    ret = 0;
 cleanup:
    virPCIDeviceFree(pci);
    virUSBDeviceFree(usb);
    virSCSIDeviceFree(scsi);
    VIR_FREE(path);
    return ret;
}

/**
 * qemuTeardownHostdevCgroup:
 * @vm: domain object
 * @dev: host device definition
 *
 * Revokes the devices cgroup access granted for @dev.  It is called
 * for all hostdev devices, but currently only does something for PCI
 * devices using VFIO for device assignment, where the IOMMU group
 * device is denied; there is nothing to tear down for USB or any
 * other subsystem type.
 *
 * Returns 0 on success or when nothing had to be done (including when
 * the devices controller is absent), -1 on failure.
 */
int
qemuTeardownHostdevCgroup(virDomainObjPtr vm,
                          virDomainHostdevDefPtr dev)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci;
    virPCIDevicePtr pci = NULL;
    char *path = NULL;
    int ret = -1;
    int rv;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    /* Everything except a VFIO-backed PCI subsystem device is a no-op. */
    if (dev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS ||
        dev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI ||
        pcisrc->backend != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO)
        return 0;

    pci = virPCIDeviceNew(pcisrc->addr.domain,
                          pcisrc->addr.bus,
                          pcisrc->addr.slot,
                          pcisrc->addr.function);
    if (!pci)
        goto cleanup;

    path = virPCIDeviceGetIOMMUGroupDev(pci);
    if (!path)
        goto cleanup;

    VIR_DEBUG("Cgroup deny %s for PCI device assignment", path);
    rv = virCgroupDenyDevicePath(priv->cgroup, path,
                                 VIR_CGROUP_DEVICE_RWM);
    virDomainAuditCgroupPath(vm, priv->cgroup,
                             "deny", path, "rwm", rv == 0);
    if (rv < 0)
        goto cleanup;

    ret = 0;
 cleanup:
    virPCIDeviceFree(pci);
    VIR_FREE(path);
    return ret;
}

411 412 413 414
/**
 * qemuSetupBlkioCgroup:
 * @vm: domain object
 *
 * Applies the domain's block I/O tuning: the global weight and, for
 * each configured device, its weight and read/write IOPS and
 * bytes-per-second limits.  Values of zero are treated as "not set"
 * and skipped.
 *
 * If the blkio controller is absent this is an error only when some
 * tuning was actually requested.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
qemuSetupBlkioCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    size_t idx;

    if (!virCgroupHasController(priv->cgroup,
                                VIR_CGROUP_CONTROLLER_BLKIO)) {
        if (!def->blkio.weight && !def->blkio.ndevices)
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Block I/O tuning is not available on this host"));
        return -1;
    }

    if (def->blkio.weight != 0 &&
        virCgroupSetBlkioWeight(priv->cgroup, def->blkio.weight) < 0)
        return -1;

    for (idx = 0; idx < def->blkio.ndevices; idx++) {
        virBlkioDevicePtr tune = &def->blkio.devices[idx];

        if (tune->weight &&
            virCgroupSetBlkioDeviceWeight(priv->cgroup, tune->path,
                                          tune->weight) < 0)
            return -1;

        if (tune->riops &&
            virCgroupSetBlkioDeviceReadIops(priv->cgroup, tune->path,
                                            tune->riops) < 0)
            return -1;

        if (tune->wiops &&
            virCgroupSetBlkioDeviceWriteIops(priv->cgroup, tune->path,
                                             tune->wiops) < 0)
            return -1;

        if (tune->rbps &&
            virCgroupSetBlkioDeviceReadBps(priv->cgroup, tune->path,
                                           tune->rbps) < 0)
            return -1;

        if (tune->wbps &&
            virCgroupSetBlkioDeviceWriteBps(priv->cgroup, tune->path,
                                            tune->wbps) < 0)
            return -1;
    }

    return 0;
}

465

466 467 468 469 470
/**
 * qemuSetupMemoryCgroup:
 * @vm: domain object
 *
 * Applies the domain's hard, soft and swap hard memory limits; a
 * limit of zero is treated as "not set" and skipped.
 *
 * If the memory controller is absent this is an error only when some
 * limit was actually requested.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
qemuSetupMemoryCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) {
        if (def->mem.hard_limit == 0 &&
            def->mem.soft_limit == 0 &&
            def->mem.swap_hard_limit == 0)
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Memory cgroup is not available on this host"));
        return -1;
    }

    if (def->mem.hard_limit != 0 &&
        virCgroupSetMemoryHardLimit(priv->cgroup, def->mem.hard_limit) < 0)
        return -1;

    if (def->mem.soft_limit != 0 &&
        virCgroupSetMemorySoftLimit(priv->cgroup, def->mem.soft_limit) < 0)
        return -1;

    if (def->mem.swap_hard_limit != 0 &&
        virCgroupSetMemSwapHardLimit(priv->cgroup,
                                     def->mem.swap_hard_limit) < 0)
        return -1;

    return 0;
}


499 500 501 502 503 504 505
/**
 * qemuSetupDevicesCgroup:
 * @driver: QEMU driver, used to fetch the configuration
 * @vm: domain object
 *
 * Builds the devices cgroup whitelist for the domain: denies
 * everything, then allows in turn the disks, the PTY major, the sound
 * major (when host audio is permitted for the graphics setup), the
 * configured or default device ACL, character devices, the TPM, host
 * devices and RNG backends.
 *
 * If denying all devices fails with EPERM, whitelisting is disabled
 * with a warning and the function succeeds.  ENOENT failures while
 * allowing ACL or RNG paths are tolerated.
 *
 * Returns 0 on success (or when the devices controller is absent),
 * -1 on failure.
 */
static int
qemuSetupDevicesCgroup(virQEMUDriverPtr driver,
                       virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    virQEMUDriverConfigPtr cfg = NULL;
    const char *const *acl = NULL;
    bool hostAudio = false;
    int ret = -1;
    int rc = -1;
    size_t n;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    rc = virCgroupDenyAllDevices(priv->cgroup);
    virDomainAuditCgroup(vm, priv->cgroup, "deny", "all", rc == 0);
    if (rc < 0) {
        if (!virLastErrorIsSystemErrno(EPERM))
            goto cleanup;

        virResetLastError();
        VIR_WARN("Group devices ACL is not accessible, disabling whitelisting");
        return 0;
    }

    for (n = 0; n < def->ndisks; n++) {
        if (qemuSetupDiskCgroup(vm, def->disks[n]) < 0)
            goto cleanup;
    }

    rc = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_PTY_MAJOR,
                                   VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_PTY_MAJOR,
                              "pty", "rw", rc == 0);
    if (rc < 0)
        goto cleanup;

    cfg = virQEMUDriverGetConfig(driver);
    if (cfg->cgroupDeviceACL)
        acl = (const char *const *)cfg->cgroupDeviceACL;
    else
        acl = defaultDeviceACL;

    /* Host audio is wanted when the guest has a sound device and either
     * runs without graphics (and the config allows it), or its first
     * graphics device is SDL, or VNC with host audio allowed. */
    if (def->nsounds) {
        if (!def->ngraphics && cfg->nogfxAllowHostAudio) {
            hostAudio = true;
        } else if (def->graphics) {
            int gtype = def->graphics[0]->type;

            hostAudio = (gtype == VIR_DOMAIN_GRAPHICS_TYPE_VNC &&
                         cfg->vncAllowHostAudio) ||
                        gtype == VIR_DOMAIN_GRAPHICS_TYPE_SDL;
        }
    }

    if (hostAudio) {
        rc = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_SND_MAJOR,
                                       VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_SND_MAJOR,
                                  "sound", "rw", rc == 0);
        if (rc < 0)
            goto cleanup;
    }

    for (n = 0; acl[n] != NULL; n++) {
        const char *devpath = acl[n];

        if (!virFileExists(devpath)) {
            VIR_DEBUG("Ignoring non-existent device %s", devpath);
            continue;
        }

        rc = virCgroupAllowDevicePath(priv->cgroup, devpath,
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, priv->cgroup, "allow", devpath,
                                 "rw", rc == 0);
        if (rc < 0 && !virLastErrorIsSystemErrno(ENOENT))
            goto cleanup;
    }

    if (virDomainChrDefForeach(def, true, qemuSetupChardevCgroup, vm) < 0)
        goto cleanup;

    if (def->tpm && qemuSetupTPMCgroup(def, def->tpm, vm) < 0)
        goto cleanup;

    for (n = 0; n < def->nhostdevs; n++) {
        if (qemuSetupHostdevCGroup(vm, def->hostdevs[n]) < 0)
            goto cleanup;
    }

    for (n = 0; n < def->nrngs; n++) {
        const char *rngpath;

        if (def->rngs[n]->backend != VIR_DOMAIN_RNG_BACKEND_RANDOM)
            continue;

        VIR_DEBUG("Setting Cgroup ACL for RNG device");
        rngpath = def->rngs[n]->source.file;

        /* fix path when using the default */
        if (!rngpath)
            rngpath = "/dev/random";

        rc = virCgroupAllowDevicePath(priv->cgroup, rngpath,
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
                                 rngpath, "rw", rc == 0);
        if (rc < 0 && !virLastErrorIsSystemErrno(ENOENT))
            goto cleanup;
    }

    ret = 0;
 cleanup:
    virObjectUnref(cfg);
    return ret;
}


613
static int
614 615
qemuSetupCpusetMems(virDomainObjPtr vm,
                    virBitmapPtr nodemask)
616 617
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
618
    char *mem_mask = NULL;
619 620 621 622 623
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

624 625
    if (virDomainNumatuneMaybeFormatNodeset(vm->def->numatune,
                                            nodemask,
626
                                            &mem_mask, -1) < 0)
627
        goto cleanup;
628

629 630 631
    if (mem_mask &&
        virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0)
        goto cleanup;
632

633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651
    ret = 0;
 cleanup:
    VIR_FREE(mem_mask);
    return ret;
}


/**
 * qemuSetupCpusetCgroup:
 * @vm: domain object
 * @nodemask: NUMA node mask used for automatic placement
 * @caps: capabilities used to map @nodemask to CPUs
 *
 * Restricts the domain to a set of host CPUs through the cpuset
 * controller.  With automatic placement the CPUs are derived from
 * @nodemask; otherwise the domain's own cpumask is used.  Nothing is
 * done when neither applies.
 *
 * Returns 0 on success (or when the cpuset controller is absent),
 * -1 on failure.
 */
static int
qemuSetupCpusetCgroup(virDomainObjPtr vm,
                      virBitmapPtr nodemask,
                      virCapsPtr caps)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    char *cpus = NULL;
    bool autoPlacement;
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

    autoPlacement =
        vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO;

    if (!vm->def->cpumask && !autoPlacement)
        return 0;

    if (autoPlacement) {
        virBitmapPtr cpumap = virCapabilitiesGetCpusForNodemask(caps,
                                                                nodemask);

        if (!cpumap)
            return -1;

        cpus = virBitmapFormat(cpumap);
        virBitmapFree(cpumap);
    } else {
        cpus = virBitmapFormat(vm->def->cpumask);
    }

    if (cpus && virCgroupSetCpusetCpus(priv->cgroup, cpus) >= 0)
        ret = 0;

    VIR_FREE(cpus);
    return ret;
}


679 680 681 682 683 684
/**
 * qemuSetupCpuCgroup:
 * @vm: domain object
 *
 * Applies the domain's CPU shares when they were specified, then
 * reads the value back from the cgroup and stores it in the
 * definition.
 *
 * If the cpu controller is absent this is an error only when shares
 * were actually specified.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
qemuSetupCpuCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    unsigned long long effective;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        if (!vm->def->cputune.sharesSpecified)
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("CPU tuning is not available on this host"));
        return -1;
    }

    if (!vm->def->cputune.sharesSpecified)
        return 0;

    if (virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares) < 0)
        return -1;

    /* Record the value the cgroup actually reports. */
    if (virCgroupGetCpuShares(priv->cgroup, &effective) < 0)
        return -1;

    vm->def->cputune.shares = effective;
    return 0;
}


708
static int
709
qemuInitCgroup(virQEMUDriverPtr driver,
710
               virDomainObjPtr vm)
711
{
712
    int ret = -1;
713 714 715
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);

716 717 718
    if (!cfg->privileged)
        goto done;

719 720 721
    if (!virCgroupAvailable())
        goto done;

722 723
    virCgroupFree(&priv->cgroup);

724
    if (!vm->def->resource) {
725 726
        virDomainResourceDefPtr res;

727
        if (VIR_ALLOC(res) < 0)
728
            goto cleanup;
729

730
        if (VIR_STRDUP(res->partition, "/machine") < 0) {
731 732 733 734 735
            VIR_FREE(res);
            goto cleanup;
        }

        vm->def->resource = res;
736 737
    }

738 739 740 741 742 743
    if (vm->def->resource->partition[0] != '/') {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Resource partition '%s' must start with '/'"),
                       vm->def->resource->partition);
        goto cleanup;
    }
744 745 746 747 748 749 750 751 752 753 754

    if (virCgroupNewMachine(vm->def->name,
                            "qemu",
                            cfg->privileged,
                            vm->def->uuid,
                            NULL,
                            vm->pid,
                            false,
                            vm->def->resource->partition,
                            cfg->cgroupControllers,
                            &priv->cgroup) < 0) {
755 756
        if (virCgroupNewIgnoreError())
            goto done;
757

758 759
        goto cleanup;
    }
760

761
 done:
762
    ret = 0;
763
 cleanup:
764 765 766
    virObjectUnref(cfg);
    return ret;
}
767

768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784

/**
 * qemuConnectCgroup:
 * @driver: QEMU driver, used to fetch the configuration
 * @vm: domain object
 *
 * Detects the machine cgroup of the domain's process and stores it in
 * the domain's private data, replacing any previous one.  Nothing is
 * done when the driver is unprivileged or cgroups are unavailable.
 *
 * Returns 0 on success, -1 when detection fails.
 */
int
qemuConnectCgroup(virQEMUDriverPtr driver,
                  virDomainObjPtr vm)
{
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
    qemuDomainObjPrivatePtr priv = vm->privateData;
    const char *partition = NULL;
    int ret = -1;

    if (!cfg->privileged || !virCgroupAvailable())
        goto done;

    virCgroupFree(&priv->cgroup);

    if (vm->def->resource)
        partition = vm->def->resource->partition;

    if (virCgroupNewDetectMachine(vm->def->name,
                                  "qemu",
                                  vm->pid,
                                  partition,
                                  cfg->cgroupControllers,
                                  &priv->cgroup) < 0)
        goto cleanup;

 done:
    ret = 0;
 cleanup:
    virObjectUnref(cfg);
    return ret;
}

802 803 804 805
/**
 * qemuSetupCgroup:
 * @driver: QEMU driver
 * @vm: domain object; its process must already be started
 * @nodemask: NUMA node mask used for cpuset placement
 *
 * Creates the domain's cgroup and configures, in order, the devices,
 * blkio, memory, cpu and cpuset controllers.  If no cgroup ends up
 * being created there is nothing further to configure.
 *
 * Returns 0 on success, -1 on failure.
 */
int
qemuSetupCgroup(virQEMUDriverPtr driver,
                virDomainObjPtr vm,
                virBitmapPtr nodemask)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virCapsPtr caps = NULL;
    int ret = -1;

    if (!vm->pid) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Cannot setup cgroups until process is started"));
        return -1;
    }

    if (qemuInitCgroup(driver, vm) < 0)
        return -1;

    if (!priv->cgroup)
        return 0;

    caps = virQEMUDriverGetCapabilities(driver, false);
    if (!caps)
        goto cleanup;

    /* Stop at the first controller that fails to configure. */
    if (qemuSetupDevicesCgroup(driver, vm) < 0 ||
        qemuSetupBlkioCgroup(vm) < 0 ||
        qemuSetupMemoryCgroup(vm) < 0 ||
        qemuSetupCpuCgroup(vm) < 0 ||
        qemuSetupCpusetCgroup(vm, nodemask, caps) < 0)
        goto cleanup;

    ret = 0;
 cleanup:
    virObjectUnref(caps);
    return ret;
}

847 848 849 850 851 852 853
/**
 * qemuSetupCgroupPostInit:
 * @vm: domain object
 * @nodemask: node mask handed to the numatune formatter
 *
 * Applies the cpuset memory node restriction via
 * qemuSetupCpusetMems().
 *
 * Returns whatever qemuSetupCpusetMems() returns.
 */
int
qemuSetupCgroupPostInit(virDomainObjPtr vm,
                        virBitmapPtr nodemask)
{
    int rc = qemuSetupCpusetMems(vm, nodemask);

    return rc;
}

854 855 856 857
/**
 * qemuSetupCgroupVcpuBW:
 * @cgroup: cgroup to tune
 * @period: CFS period to set, 0 to leave unchanged
 * @quota: CFS quota to set, 0 to leave unchanged
 *
 * Sets the CFS bandwidth period and/or quota on @cgroup.  When both a
 * period and a quota are given and setting the quota fails, the
 * previous period is restored (best effort) and the original error is
 * preserved.
 *
 * Returns 0 on success or when there is nothing to set, -1 on failure.
 */
int
qemuSetupCgroupVcpuBW(virCgroupPtr cgroup,
                      unsigned long long period,
                      long long quota)
{
    /* Initialised so the rollback below can never read an indeterminate
     * value; it is only meaningful once the period has been fetched. */
    unsigned long long old_period = 0;

    if (period == 0 && quota == 0)
        return 0;

    if (period) {
        /* get old period, and we can rollback if set quota failed */
        if (virCgroupGetCpuCfsPeriod(cgroup, &old_period) < 0)
            return -1;

        if (virCgroupSetCpuCfsPeriod(cgroup, period) < 0)
            return -1;
    }

    if (!quota || virCgroupSetCpuCfsQuota(cgroup, quota) >= 0)
        return 0;

    /* Setting the quota failed: undo the period change while keeping
     * the quota error as the one reported to the caller. */
    if (period) {
        virErrorPtr saved = virSaveLastError();

        ignore_value(virCgroupSetCpuCfsPeriod(cgroup, old_period));
        if (saved) {
            virSetError(saved);
            virFreeError(saved);
        }
    }

    return -1;
}

892 893 894 895 896
/**
 * qemuSetupCgroupVcpuPin:
 * @cgroup: cgroup of the vcpu to pin
 * @vcpupin: array of vcpu pinning definitions
 * @nvcpupin: number of entries in @vcpupin
 * @vcpuid: id of the vcpu to pin
 *
 * Looks up the pinning definition for @vcpuid and applies its CPU
 * mask to @cgroup via qemuSetupCgroupEmulatorPin().
 *
 * A NULL array or a non-positive count is treated as "no definition
 * found"; this keeps a negative @nvcpupin from being converted to a
 * huge unsigned loop bound.
 *
 * Returns the result of applying the mask, or -1 when no definition
 * matches @vcpuid.
 */
int
qemuSetupCgroupVcpuPin(virCgroupPtr cgroup,
                       virDomainVcpuPinDefPtr *vcpupin,
                       int nvcpupin,
                       int vcpuid)
{
    size_t i;

    if (!vcpupin || nvcpupin <= 0)
        return -1;

    for (i = 0; i < (size_t)nvcpupin; i++) {
        if (vcpuid == vcpupin[i]->vcpuid)
            return qemuSetupCgroupEmulatorPin(cgroup, vcpupin[i]->cpumask);
    }

    return -1;
}

909 910 911
/**
 * qemuSetupCgroupEmulatorPin:
 * @cgroup: cgroup to restrict
 * @cpumask: bitmap of allowed host CPUs
 *
 * Formats @cpumask as a string and writes it as the cpuset CPU list
 * of @cgroup.
 *
 * Returns 0 on success, -1 if formatting or applying the mask fails.
 */
int
qemuSetupCgroupEmulatorPin(virCgroupPtr cgroup,
                           virBitmapPtr cpumask)
{
    char *cpus = virBitmapFormat(cpumask);
    int ret = -1;

    if (cpus && virCgroupSetCpusetCpus(cgroup, cpus) >= 0)
        ret = 0;

    VIR_FREE(cpus);
    return ret;
}

928 929
/**
 * qemuSetupCgroupForVcpu:
 * @vm: domain object
 *
 * Creates a sub-cgroup for every vcpu thread, moves the thread into
 * it, applies the domain's CFS period/quota when set, and, when the
 * cpuset controller is present, applies the matching vcpu pinning.
 *
 * Requesting period/quota without the cpu controller is an error.
 * Nothing is done when neither the cpu nor the cpuset controller is
 * present, when there is no cgroup, or when the vcpu thread ids are
 * unknown (a warning is logged for the latter).
 *
 * On failure the sub-cgroup being set up at that point is removed.
 *
 * Returns 0 on success or when nothing is done, -1 on failure.
 */
int
qemuSetupCgroupForVcpu(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    virCgroupPtr vcpuGroup = NULL;
    unsigned long long period = vm->def->cputune.period;
    long long quota = vm->def->cputune.quota;
    size_t vcpu;
    size_t pin;

    if ((period || quota) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    /*
     * If CPU cgroup controller is not initialized here, then we need
     * neither period nor quota settings.  And if CPUSET controller is
     * not initialized either, then there's nothing to do anyway.
     */
    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

    /* We are trying to setup cgroups for CPU pinning, which can also be done
     * with virProcessSetAffinity, thus the lack of cgroups is not fatal here.
     */
    if (priv->cgroup == NULL)
        return 0;

    /* If we don't know VCPU<->PID mapping or all vcpu runs in the same
     * thread, we cannot control each vcpu.
     */
    if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) {
        VIR_WARN("Unable to get vcpus' pids.");
        return 0;
    }

    for (vcpu = 0; vcpu < priv->nvcpupids; vcpu++) {
        if (virCgroupNewVcpu(priv->cgroup, vcpu, true, &vcpuGroup) < 0)
            goto cleanup;

        /* move the thread for vcpu to sub dir */
        if (virCgroupAddTask(vcpuGroup, priv->vcpupids[vcpu]) < 0)
            goto cleanup;

        if ((period || quota) &&
            qemuSetupCgroupVcpuBW(vcpuGroup, period, quota) < 0)
            goto cleanup;

        /* Set vcpupin in cgroup if vcpupin xml is provided */
        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
            /* find the right CPU to pin, otherwise
             * qemuSetupCgroupVcpuPin will fail. */
            for (pin = 0; pin < def->cputune.nvcpupin; pin++) {
                if (def->cputune.vcpupin[pin]->vcpuid == vcpu) {
                    if (qemuSetupCgroupVcpuPin(vcpuGroup,
                                               def->cputune.vcpupin,
                                               def->cputune.nvcpupin,
                                               vcpu) < 0)
                        goto cleanup;

                    break;
                }
            }
        }

        virCgroupFree(&vcpuGroup);
    }

    return 0;

 cleanup:
    if (vcpuGroup) {
        virCgroupRemove(vcpuGroup);
        virCgroupFree(&vcpuGroup);
    }

    return -1;
}

1013 1014 1015 1016
/**
 * qemuSetupCgroupForEmulator:
 * @driver: QEMU driver, passed to qemuPrepareCpumap() for automatic
 *          placement
 * @vm: domain object
 * @nodemask: node mask passed to qemuPrepareCpumap() for automatic
 *            placement
 *
 * Creates the emulator cgroup below the domain cgroup, moves the
 * tasks of the domain cgroup into it, and then applies CPU pinning
 * (when a CPU mask is available and the CPUSET controller is present)
 * and the emulator period/quota (when either is non-zero).
 *
 * The CPU mask is taken, in order of preference, from
 * qemuPrepareCpumap() when placement is automatic, from the
 * emulatorpin definition, or from the domain-wide cpumask.
 *
 * Returns 0 on success or when there is nothing to do; -1 on failure,
 * in which case the emulator cgroup, if it was created, is removed.
 */
int
qemuSetupCgroupForEmulator(virQEMUDriverPtr driver,
                           virDomainObjPtr vm,
                           virBitmapPtr nodemask)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    virCgroupPtr emu_group = NULL;
    virBitmapPtr auto_map = NULL;   /* allocated here, freed on exit */
    virBitmapPtr pin_mask = NULL;   /* mask actually used; never freed directly */
    unsigned long long period = def->cputune.emulator_period;
    long long quota = def->cputune.emulator_quota;
    bool tune_bw = period || quota;

    /* Bandwidth tuning was requested but cannot be honoured without
     * the CPU controller. */
    if (tune_bw &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    /*
     * Without the CPU controller there is no period/quota to apply,
     * and without the CPUSET controller there is no pinning to apply;
     * with neither there is nothing left to do.
     */
    if (!(virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) ||
          virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)))
        return 0;

    /* Not supported, so claim success */
    if (!priv->cgroup)
        return 0;

    /* Create the emulator sub-group and move the domain cgroup's
     * tasks into it. */
    if (virCgroupNewEmulator(priv->cgroup, true, &emu_group) < 0 ||
        virCgroupMoveTask(priv->cgroup, emu_group) < 0)
        goto error;

    /* Choose the CPU mask to pin to. */
    if (def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
        auto_map = qemuPrepareCpumap(driver, nodemask);
        if (!auto_map)
            goto error;
        pin_mask = auto_map;
    } else if (def->cputune.emulatorpin) {
        pin_mask = def->cputune.emulatorpin->cpumask;
    } else if (def->cpumask) {
        pin_mask = def->cpumask;
    }

    if (pin_mask &&
        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET) &&
        qemuSetupCgroupEmulatorPin(emu_group, pin_mask) < 0)
        goto error;

    if (tune_bw &&
        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
        qemuSetupCgroupVcpuBW(emu_group, period, quota) < 0)
        goto error;

    virCgroupFree(&emu_group);
    virBitmapFree(auto_map);
    return 0;

 error:
    virBitmapFree(auto_map);

    if (emu_group) {
        virCgroupRemove(emu_group);
        virCgroupFree(&emu_group);
    }

    return -1;
}
1088

1089 1090
/**
 * qemuRemoveCgroup:
 * @vm: domain object
 *
 * Removes the domain's cgroup with virCgroupRemove().
 *
 * Returns the result of virCgroupRemove(), or 0 when the domain has
 * no cgroup (not supported, so success is claimed).
 */
int
qemuRemoveCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    return priv->cgroup ? virCgroupRemove(priv->cgroup) : 0;
}

1100 1101
/**
 * qemuAddToCgroup:
 * @vm: domain object
 *
 * Checks whether the domain has a cgroup.  No task is added to any
 * cgroup by this function as written; both paths report success.
 *
 * Returns 0.
 */
int
qemuAddToCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    if (!priv->cgroup) {
        /* Not supported, so claim success */
        return 0;
    }

    /* A cgroup exists, but nothing is done with it here. */
    return 0;
}