/*
 * qemu_cgroup.c: QEMU cgroup management
 *
 * Copyright (C) 2006-2013 Red Hat, Inc.
 * Copyright (C) 2006 Daniel P. Berrange
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * Author: Daniel P. Berrange <berrange@redhat.com>
 */

#include <config.h>

#include "qemu_cgroup.h"
27
#include "qemu_domain.h"
28
#include "qemu_process.h"
29
#include "vircgroup.h"
30
#include "virlog.h"
31
#include "viralloc.h"
32
#include "virerror.h"
33
#include "domain_audit.h"
34
#include "virscsi.h"
35
#include "virstring.h"
36 37 38 39 40 41 42

#define VIR_FROM_THIS VIR_FROM_QEMU

/*
 * NULL-terminated list of device nodes that qemuSetupDevicesCgroup()
 * allows read/write for a domain when the driver configuration carries
 * no cgroupDeviceACL of its own.
 */
static const char *const defaultDeviceACL[] = {
    "/dev/null",
    "/dev/full",
    "/dev/zero",
    "/dev/random",
    "/dev/urandom",
    "/dev/ptmx",
    "/dev/kvm",
    "/dev/kqemu",
    "/dev/rtc",
    "/dev/hpet",
    "/dev/vfio/vfio",
    NULL,
};

/*
 * Character-device major numbers that qemuSetupDevicesCgroup() allows
 * as a whole; its audit records label them "pty" and "sound".
 */
#define DEVICE_PTY_MAJOR 136
#define DEVICE_SND_MAJOR 116

49
static int
50
qemuSetupDiskPathAllow(virDomainDiskDefPtr disk,
51 52 53
                       const char *path,
                       size_t depth ATTRIBUTE_UNUSED,
                       void *opaque)
54
{
55 56
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
57
    int ret;
58 59

    VIR_DEBUG("Process path %s for disk", path);
60 61 62
    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   (disk->readonly ? VIR_CGROUP_DEVICE_READ
                                    : VIR_CGROUP_DEVICE_RW));
63
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path,
64 65 66 67 68 69 70 71
                             disk->readonly ? "r" : "rw", ret == 0);

    /* Get this for root squash NFS */
    if (ret < 0 &&
        virLastErrorIsSystemErrno(EACCES)) {
        VIR_DEBUG("Ignoring EACCES for %s", path);
        virResetLastError();
        ret = 0;
72
    }
73
    return ret;
74 75 76
}


77 78 79
/*
 * qemuSetupDiskCgroup:
 * @vm: domain object
 * @disk: disk whose paths should become accessible
 *
 * Runs qemuSetupDiskPathAllow() over the paths of @disk via
 * virDomainDiskDefForeachPath().  Does nothing when the domain's cgroup
 * has no devices controller.
 *
 * Returns 0 when there is nothing to do, otherwise the result of the
 * path iteration.
 */
int
qemuSetupDiskCgroup(virDomainObjPtr vm,
                    virDomainDiskDefPtr disk)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    if (virCgroupHasController(priv->cgroup,
                               VIR_CGROUP_CONTROLLER_DEVICES)) {
        return virDomainDiskDefForeachPath(disk, true,
                                           qemuSetupDiskPathAllow, vm);
    }

    return 0;
}


91 92 93 94 95
/*
 * qemuTeardownDiskPathDeny:
 * @disk: unused
 * @path: path to deny in the device cgroup
 * @depth: unused
 * @opaque: the virDomainObjPtr whose private cgroup is updated
 *
 * Path callback handed to virDomainDiskDefForeachPath() by
 * qemuTeardownDiskCgroup().  Denies VIR_CGROUP_DEVICE_RWM on @path and
 * audits the attempt.
 *
 * Returns the result of virCgroupDenyDevicePath(), except that a
 * failure whose last error is EACCES is cleared and reported as 0.
 */
static int
qemuTeardownDiskPathDeny(virDomainDiskDefPtr disk ATTRIBUTE_UNUSED,
                         const char *path,
                         size_t depth ATTRIBUTE_UNUSED,
                         void *opaque)
{
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int rc;

    VIR_DEBUG("Process path %s for disk", path);

    rc = virCgroupDenyDevicePath(priv->cgroup, path, VIR_CGROUP_DEVICE_RWM);
    virDomainAuditCgroupPath(vm, priv->cgroup, "deny", path, "rwm", rc == 0);

    if (rc >= 0)
        return rc;

    /* EACCES shows up on root-squashed NFS; it is not treated as fatal */
    if (virLastErrorIsSystemErrno(EACCES)) {
        VIR_DEBUG("Ignoring EACCES for %s", path);
        virResetLastError();
        return 0;
    }

    return rc;
}


117 118 119
/*
 * qemuTeardownDiskCgroup:
 * @vm: domain object
 * @disk: disk whose paths should be denied again
 *
 * Runs qemuTeardownDiskPathDeny() over the paths of @disk via
 * virDomainDiskDefForeachPath().  Does nothing when the domain's cgroup
 * has no devices controller.
 *
 * Returns 0 when there is nothing to do, otherwise the result of the
 * path iteration.
 */
int
qemuTeardownDiskCgroup(virDomainObjPtr vm,
                       virDomainDiskDefPtr disk)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    if (virCgroupHasController(priv->cgroup,
                               VIR_CGROUP_CONTROLLER_DEVICES)) {
        return virDomainDiskDefForeachPath(disk, true,
                                           qemuTeardownDiskPathDeny, vm);
    }

    return 0;
}

133
static int
134
qemuSetupChrSourceCgroup(virDomainDefPtr def ATTRIBUTE_UNUSED,
135
                         virDomainChrSourceDefPtr dev,
136
                         void *opaque)
137
{
138 139
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
140
    int ret;
141

142
    if (dev->type != VIR_DOMAIN_CHR_TYPE_DEV)
143 144
        return 0;

145
    VIR_DEBUG("Process path '%s' for device", dev->data.file.path);
146

147 148
    ret = virCgroupAllowDevicePath(priv->cgroup, dev->data.file.path,
                                   VIR_CGROUP_DEVICE_RW);
149
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow",
150
                             dev->data.file.path, "rw", ret == 0);
151

152
    return ret;
153 154
}

155 156 157 158 159
/*
 * qemuSetupChardevCgroup:
 * @def: domain definition, passed through
 * @dev: character device
 * @opaque: passed through (the virDomainObjPtr)
 *
 * Adapter used with virDomainChrDefForeach(): forwards the source of
 * @dev to qemuSetupChrSourceCgroup() and returns its result.
 */
static int
qemuSetupChardevCgroup(virDomainDefPtr def,
                       virDomainChrDefPtr dev,
                       void *opaque)
{
    virDomainChrSourceDefPtr source = &dev->source;

    return qemuSetupChrSourceCgroup(def, source, opaque);
}


/*
 * qemuSetupTPMCgroup:
 * @def: domain definition, passed through
 * @dev: TPM device
 * @opaque: passed through (the virDomainObjPtr)
 *
 * For a passthrough TPM, hands its character source to
 * qemuSetupChrSourceCgroup().  Any other TPM type needs no cgroup
 * change.
 *
 * Returns 0 when nothing is done, otherwise the result of
 * qemuSetupChrSourceCgroup().
 */
static int
qemuSetupTPMCgroup(virDomainDefPtr def,
                   virDomainTPMDefPtr dev,
                   void *opaque)
{
    if (dev->type == VIR_DOMAIN_TPM_TYPE_PASSTHROUGH) {
        return qemuSetupChrSourceCgroup(def,
                                        &dev->data.passthrough.source,
                                        opaque);
    }

    return 0;
}

183

184 185 186 187
/*
 * qemuSetupHostUsbDeviceCgroup:
 * @dev: unused
 * @path: path to allow
 * @opaque: the virDomainObjPtr whose private cgroup is updated
 *
 * Callback passed to virUSBDeviceFileIterate(): allows read/write
 * access to @path in the domain's cgroup and audits the attempt.
 *
 * Returns the result of virCgroupAllowDevicePath().
 */
static int
qemuSetupHostUsbDeviceCgroup(virUSBDevicePtr dev ATTRIBUTE_UNUSED,
                             const char *path,
                             void *opaque)
{
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
    int rc;

    VIR_DEBUG("Process path '%s' for USB device", path);

    rc = virCgroupAllowDevicePath(priv->cgroup, path, VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path, "rw",
                             rc == 0);

    return rc;
}

201 202 203 204 205 206 207
/*
 * qemuSetupHostScsiDeviceCgroup:
 * @dev: SCSI device; its read-only flag selects the permission
 * @path: path to allow
 * @opaque: the virDomainObjPtr whose private cgroup is updated
 *
 * Callback passed to virSCSIDeviceFileIterate(): allows @path in the
 * domain's cgroup, read-only when virSCSIDeviceGetReadonly() says so
 * and read/write otherwise, and audits the attempt.
 *
 * Returns the result of virCgroupAllowDevicePath().
 */
static int
qemuSetupHostScsiDeviceCgroup(virSCSIDevicePtr dev,
                              const char *path,
                              void *opaque)
{
    virDomainObjPtr vm = opaque;
    qemuDomainObjPrivatePtr priv = vm->privateData;
    /* query once so the permission and the audit record cannot disagree */
    bool readonly = virSCSIDeviceGetReadonly(dev);
    int ret;

    VIR_DEBUG("Process path '%s' for SCSI device", path);

    ret = virCgroupAllowDevicePath(priv->cgroup, path,
                                   readonly ?
                                   VIR_CGROUP_DEVICE_READ :
                                   VIR_CGROUP_DEVICE_RW);

    virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path,
                             readonly ? "r" : "rw", ret == 0);

    return ret;
}
222

223 224 225 226 227 228 229
/*
 * qemuSetupHostdevCGroup:
 * @vm: domain whose device cgroup is updated
 * @dev: host device assigned to the domain
 *
 * Allows the domain's cgroup to reach the host nodes behind @dev.  It
 * is called for every hostdev, but only SUBSYS-mode devices are acted
 * on:
 *   - PCI using the VFIO backend: the IOMMU group device is allowed rw;
 *   - USB: paths reported by virUSBDeviceFileIterate() are allowed rw,
 *     unless dev->missing is set;
 *   - SCSI: paths reported by virSCSIDeviceFileIterate() are allowed by
 *     qemuSetupHostScsiDeviceCgroup() (dev->readonly is handed to
 *     virSCSIDeviceNew()).
 * Nothing is done when the cgroup has no devices controller.
 *
 * Returns 0 on success or when there is nothing to do, -1 on failure.
 */
int
qemuSetupHostdevCGroup(virDomainObjPtr vm,
                       virDomainHostdevDefPtr dev)
{
    int ret = -1;
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virPCIDevicePtr pci = NULL;
    virUSBDevicePtr usb = NULL;
    virSCSIDevicePtr scsi = NULL;
    char *path = NULL;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS) {

        switch (dev->source.subsys.type) {
        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI:
            if (dev->source.subsys.u.pci.backend
                == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
                int rv;

                pci = virPCIDeviceNew(dev->source.subsys.u.pci.addr.domain,
                                      dev->source.subsys.u.pci.addr.bus,
                                      dev->source.subsys.u.pci.addr.slot,
                                      dev->source.subsys.u.pci.addr.function);
                if (!pci)
                    goto cleanup;

                if (!(path = virPCIDeviceGetIOMMUGroupDev(pci)))
                    goto cleanup;

                VIR_DEBUG("Cgroup allow %s for PCI device assignment", path);
                rv = virCgroupAllowDevicePath(priv->cgroup, path,
                                              VIR_CGROUP_DEVICE_RW);
                virDomainAuditCgroupPath(vm, priv->cgroup,
                                         "allow", path, "rw", rv == 0);
                if (rv < 0)
                    goto cleanup;
            }
            break;

        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB:
            /* NB: hostdev->missing wasn't previously checked in the
             * case of hotplug, only when starting a domain. Now it is
             * always checked, and the cgroup setup skipped if true.
             */
            if (dev->missing)
                break;
            if ((usb = virUSBDeviceNew(dev->source.subsys.u.usb.bus,
                                       dev->source.subsys.u.usb.device,
                                       NULL)) == NULL) {
                goto cleanup;
            }

            /* oddly, qemuSetupHostUsbDeviceCgroup doesn't ever
             * reference the usb object we just created
             */
            if (virUSBDeviceFileIterate(usb, qemuSetupHostUsbDeviceCgroup,
                                        vm) < 0) {
                goto cleanup;
            }
            break;

        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_SCSI:
            if ((scsi = virSCSIDeviceNew(dev->source.subsys.u.scsi.adapter,
                                         dev->source.subsys.u.scsi.bus,
                                         dev->source.subsys.u.scsi.target,
                                         dev->source.subsys.u.scsi.unit,
                                         dev->readonly)) == NULL)
                goto cleanup;

            if (virSCSIDeviceFileIterate(scsi,
                                         qemuSetupHostScsiDeviceCgroup,
                                         vm) < 0)
                goto cleanup;
            /* explicit break: do not rely on falling into "default" */
            break;

        default:
            break;
        }
    }

    ret = 0;
cleanup:
    virPCIDeviceFree(pci);
    virUSBDeviceFree(usb);
    virSCSIDeviceFree(scsi);
    VIR_FREE(path);
    return ret;
}

/*
 * qemuTeardownHostdevCgroup:
 * @vm: domain whose device cgroup is updated
 * @dev: host device being removed from the domain
 *
 * Called for every hostdev, but only acts on SUBSYS-mode PCI devices
 * that use the VFIO backend: their IOMMU group device is denied "rwm"
 * again and the attempt is audited.  Nothing is done for any other
 * device, or when the cgroup has no devices controller.
 *
 * Returns 0 on success or when there is nothing to do, -1 on failure.
 */
int
qemuTeardownHostdevCgroup(virDomainObjPtr vm,
                       virDomainHostdevDefPtr dev)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virPCIDevicePtr pci = NULL;
    char *path = NULL;
    int ret = -1;
    int rc;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS &&
        dev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI &&
        dev->source.subsys.u.pci.backend ==
        VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {

        pci = virPCIDeviceNew(dev->source.subsys.u.pci.addr.domain,
                              dev->source.subsys.u.pci.addr.bus,
                              dev->source.subsys.u.pci.addr.slot,
                              dev->source.subsys.u.pci.addr.function);
        if (!pci)
            goto cleanup;

        path = virPCIDeviceGetIOMMUGroupDev(pci);
        if (!path)
            goto cleanup;

        VIR_DEBUG("Cgroup deny %s for PCI device assignment", path);
        rc = virCgroupDenyDevicePath(priv->cgroup, path,
                                     VIR_CGROUP_DEVICE_RWM);
        virDomainAuditCgroupPath(vm, priv->cgroup,
                                 "deny", path, "rwm", rc == 0);
        if (rc < 0)
            goto cleanup;
    }

    ret = 0;
cleanup:
    virPCIDeviceFree(pci);
    VIR_FREE(path);
    return ret;
}

378 379 380 381
/*
 * qemuSetupBlkioCgroup:
 * @vm: domain object
 *
 * Applies the domain's block I/O tuning: the global weight when it is
 * non-zero, then every per-device weight that is non-zero.
 *
 * Without a blkio controller this succeeds only if the definition asks
 * for no blkio tuning at all; otherwise VIR_ERR_CONFIG_UNSUPPORTED is
 * reported.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
qemuSetupBlkioCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    size_t idx;

    if (!virCgroupHasController(priv->cgroup,
                                VIR_CGROUP_CONTROLLER_BLKIO)) {
        if (!vm->def->blkio.weight && !vm->def->blkio.ndevices)
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Block I/O tuning is not available on this host"));
        return -1;
    }

    if (vm->def->blkio.weight != 0) {
        if (virCgroupSetBlkioWeight(priv->cgroup,
                                    vm->def->blkio.weight) < 0)
            return -1;
    }

    /* a zero ndevices simply makes this loop a no-op */
    for (idx = 0; idx < vm->def->blkio.ndevices; idx++) {
        virBlkioDeviceWeightPtr devweight = &vm->def->blkio.devices[idx];

        if (devweight->weight &&
            virCgroupSetBlkioDeviceWeight(priv->cgroup, devweight->path,
                                          devweight->weight) < 0)
            return -1;
    }

    return 0;
}

413

414 415 416 417 418 419 420 421 422 423 424 425
/*
 * qemuSetupMemoryCgroup:
 * @vm: domain object
 *
 * Applies the domain's memory tuning.  The hard limit is always set,
 * using the value computed by qemuDomainMemoryLimit(); the soft limit
 * and the swap hard limit are set only when non-zero in the definition.
 *
 * Without a memory controller this succeeds only if none of the three
 * limits is configured; otherwise VIR_ERR_CONFIG_UNSUPPORTED is
 * reported.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
qemuSetupMemoryCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    if (!virCgroupHasController(priv->cgroup,VIR_CGROUP_CONTROLLER_MEMORY)) {
        if (vm->def->mem.hard_limit == 0 &&
            vm->def->mem.soft_limit == 0 &&
            vm->def->mem.swap_hard_limit == 0)
            return 0;

        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Memory cgroup is not available on this host"));
        return -1;
    }

    if (virCgroupSetMemoryHardLimit(priv->cgroup,
                                    qemuDomainMemoryLimit(vm->def)) < 0)
        return -1;

    if (vm->def->mem.soft_limit != 0) {
        if (virCgroupSetMemorySoftLimit(priv->cgroup,
                                        vm->def->mem.soft_limit) < 0)
            return -1;
    }

    if (vm->def->mem.swap_hard_limit != 0) {
        if (virCgroupSetMemSwapHardLimit(priv->cgroup,
                                         vm->def->mem.swap_hard_limit) < 0)
            return -1;
    }

    return 0;
}


447 448 449 450 451 452 453
/*
 * qemuSetupDevicesCgroup:
 * @driver: QEMU driver, source of the configuration
 * @vm: domain object
 *
 * Builds the device whitelist of the domain's cgroup.  In order:
 *   1. deny all devices (if that fails with EPERM, whitelisting is
 *      abandoned with a warning and 0 is returned);
 *   2. allow every disk through qemuSetupDiskCgroup();
 *   3. allow the pty character major;
 *   4. allow the sound character major when the domain has sound
 *      devices and either no graphics, or a first graphics device that
 *      is SDL, or VNC with vncAllowHostAudio enabled;
 *   5. allow each existing entry of the configured device ACL (or of
 *      defaultDeviceACL when none is configured); an ENOENT failure for
 *      an entry is tolerated;
 *   6. allow character devices, the TPM and all host devices.
 * Every allow/deny attempt made here is audited.
 *
 * Returns 0 on success or when there is no devices controller, -1 on
 * failure.
 */
static int
qemuSetupDevicesCgroup(virQEMUDriverPtr driver,
                       virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    virQEMUDriverConfigPtr cfg = NULL;
    const char *const *acl = NULL;
    bool allow_sound;
    int rc = -1;
    int ret = -1;
    size_t n;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
        return 0;

    rc = virCgroupDenyAllDevices(priv->cgroup);
    virDomainAuditCgroup(vm, priv->cgroup, "deny", "all", rc == 0);
    if (rc < 0) {
        if (!virLastErrorIsSystemErrno(EPERM))
            goto cleanup;

        virResetLastError();
        VIR_WARN("Group devices ACL is not accessible, disabling whitelisting");
        return 0;
    }

    for (n = 0; n < def->ndisks; n++) {
        if (qemuSetupDiskCgroup(vm, def->disks[n]) < 0)
            goto cleanup;
    }

    rc = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_PTY_MAJOR,
                                   VIR_CGROUP_DEVICE_RW);
    virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_PTY_MAJOR,
                              "pty", "rw", rc == 0);
    if (rc < 0)
        goto cleanup;

    cfg = virQEMUDriverGetConfig(driver);
    if (cfg->cgroupDeviceACL)
        acl = (const char *const *)cfg->cgroupDeviceACL;
    else
        acl = defaultDeviceACL;

    allow_sound = def->nsounds &&
                  (!def->ngraphics ||
                   (def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_VNC &&
                    cfg->vncAllowHostAudio) ||
                   def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_SDL);

    if (allow_sound) {
        rc = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_SND_MAJOR,
                                       VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_SND_MAJOR,
                                  "sound", "rw", rc == 0);
        if (rc < 0)
            goto cleanup;
    }

    for (n = 0; acl[n] != NULL; n++) {
        const char *node = acl[n];

        if (access(node, F_OK) < 0) {
            VIR_DEBUG("Ignoring non-existant device %s",
                      node);
            continue;
        }

        rc = virCgroupAllowDevicePath(priv->cgroup, node,
                                      VIR_CGROUP_DEVICE_RW);
        virDomainAuditCgroupPath(vm, priv->cgroup, "allow", node, "rw",
                                 rc == 0);
        /* ENOENT is tolerated here; any other failure is fatal */
        if (rc < 0 && !virLastErrorIsSystemErrno(ENOENT))
            goto cleanup;
    }

    if (virDomainChrDefForeach(def, true, qemuSetupChardevCgroup, vm) < 0)
        goto cleanup;

    if (def->tpm && qemuSetupTPMCgroup(def, def->tpm, vm) < 0)
        goto cleanup;

    for (n = 0; n < def->nhostdevs; n++) {
        if (qemuSetupHostdevCGroup(vm, def->hostdevs[n]) < 0)
            goto cleanup;
    }

    ret = 0;
cleanup:
    virObjectUnref(cfg);
    return ret;
}


542 543
static int
qemuSetupCpusetCgroup(virDomainObjPtr vm,
544 545
                      virBitmapPtr nodemask,
                      virCapsPtr caps)
546 547
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
548 549
    char *mem_mask = NULL;
    char *cpu_mask = NULL;
550 551 552 553 554 555 556 557 558 559 560 561
    int ret = -1;

    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
        return 0;

    if ((vm->def->numatune.memory.nodemask ||
         (vm->def->numatune.memory.placement_mode ==
          VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)) &&
        vm->def->numatune.memory.mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {

        if (vm->def->numatune.memory.placement_mode ==
            VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)
562
            mem_mask = virBitmapFormat(nodemask);
563
        else
564
            mem_mask = virBitmapFormat(vm->def->numatune.memory.nodemask);
565

566
        if (!mem_mask) {
567 568 569 570 571
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("failed to convert memory nodemask"));
            goto cleanup;
        }

572
        if (virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0)
573 574 575
            goto cleanup;
    }

576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594
    if (vm->def->cpumask ||
        (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)) {

        if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
            virBitmapPtr cpumap;
            if (!(cpumap = virCapabilitiesGetCpusForNodemask(caps, nodemask)))
                goto cleanup;
            cpu_mask = virBitmapFormat(cpumap);
            virBitmapFree(cpumap);
        } else {
            cpu_mask = virBitmapFormat(vm->def->cpumask);
        }

        if (!cpu_mask) {
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("failed to convert cpu mask"));
            goto cleanup;
        }

595
        if (virCgroupSetCpusetCpus(priv->cgroup, cpu_mask) < 0)
596 597 598
            goto cleanup;
    }

599 600
    ret = 0;
cleanup:
601 602
    VIR_FREE(mem_mask);
    VIR_FREE(cpu_mask);
603 604 605 606
    return ret;
}


607 608 609 610 611 612 613 614 615 616 617 618 619 620 621
/*
 * qemuSetupCpuCgroup:
 * @vm: domain object
 *
 * Applies cputune.shares to the domain's cgroup when it is non-zero.
 * If shares are requested but the cgroup has no cpu controller,
 * VIR_ERR_CONFIG_UNSUPPORTED is reported.
 *
 * Returns 0 on success or when no shares are configured, -1 on failure.
 */
static int
qemuSetupCpuCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    bool have_cpu = virCgroupHasController(priv->cgroup,
                                           VIR_CGROUP_CONTROLLER_CPU);

    if (!vm->def->cputune.shares)
        return 0;

    if (!have_cpu) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("CPU tuning is not available on this host"));
        return -1;
    }

    if (virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares) < 0)
        return -1;

    return 0;
}


630 631 632 633
/*
 * qemuInitCgroup:
 * @driver: QEMU driver, source of the configuration
 * @vm: domain object
 * @startup: when true and the definition has no resource element, a
 *           default one with partition "/machine" is attached
 *
 * (Re)creates priv->cgroup for the domain.  Nothing is done, and
 * success is returned, when the driver is unprivileged, when cgroups
 * are unavailable, or when creating the parent group fails in a way
 * virCgroupNewIgnoreError() accepts.
 *
 * With a resource partition configured the domain group is created
 * below that partition, which must start with '/'.  Otherwise it is
 * created below the "qemu" driver group.
 *
 * Returns 0 on success, -1 on failure.
 */
int
qemuInitCgroup(virQEMUDriverPtr driver,
               virDomainObjPtr vm,
               bool startup)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
    virCgroupPtr parent = NULL;
    const char *partition = NULL;
    int ret = -1;

    if (!cfg->privileged || !virCgroupAvailable())
        goto done;

    virCgroupFree(&priv->cgroup);

    if (!vm->def->resource && startup) {
        virDomainResourceDefPtr res;

        if (VIR_ALLOC(res) < 0)
            goto cleanup;

        if (VIR_STRDUP(res->partition, "/machine") < 0) {
            VIR_FREE(res);
            goto cleanup;
        }

        vm->def->resource = res;
    }

    if (vm->def->resource)
        partition = vm->def->resource->partition;

    if (partition) {
        if (partition[0] != '/') {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("Resource partition '%s' must start with '/'"),
                           partition);
            goto cleanup;
        }

        /* Only the default partition is auto-created; any other one is
         * expected to have been set up by the sysadmin/app already. */
        if (virCgroupNewPartition(partition,
                                  STREQ(partition, "/machine"),
                                  cfg->cgroupControllers,
                                  &parent) < 0) {
            if (virCgroupNewIgnoreError())
                goto done;

            goto cleanup;
        }

        if (virCgroupNewDomainPartition(parent, "qemu", vm->def->name,
                                        true, &priv->cgroup) < 0)
            goto cleanup;
    } else {
        if (virCgroupNewDriver("qemu", true, cfg->cgroupControllers,
                               &parent) < 0) {
            if (virCgroupNewIgnoreError())
                goto done;

            goto cleanup;
        }

        if (virCgroupNewDomainDriver(parent, vm->def->name,
                                     true, &priv->cgroup) < 0)
            goto cleanup;
    }

done:
    ret = 0;
cleanup:
    virCgroupFree(&parent);
    virObjectUnref(cfg);
    return ret;
}


715 716 717 718
/*
 * qemuSetupCgroup:
 * @driver: QEMU driver
 * @vm: domain object
 * @nodemask: node set forwarded to the cpuset setup
 *
 * Creates the domain's cgroup with qemuInitCgroup() and, if one was
 * actually created, configures it in this order: devices, blkio,
 * memory, cpu, cpuset.  The first failing step aborts the sequence.
 *
 * Returns 0 on success or when no cgroup is in use, -1 on failure.
 */
int
qemuSetupCgroup(virQEMUDriverPtr driver,
                virDomainObjPtr vm,
                virBitmapPtr nodemask)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virCapsPtr caps = NULL;
    int ret = -1;

    if (qemuInitCgroup(driver, vm, true) < 0)
        return -1;

    /* initialization may legitimately leave the domain without a cgroup */
    if (!priv->cgroup)
        return 0;

    caps = virQEMUDriverGetCapabilities(driver, false);
    if (!caps)
        goto cleanup;

    if (qemuSetupDevicesCgroup(driver, vm) < 0 ||
        qemuSetupBlkioCgroup(vm) < 0 ||
        qemuSetupMemoryCgroup(vm) < 0 ||
        qemuSetupCpuCgroup(vm) < 0 ||
        qemuSetupCpusetCgroup(vm, nodemask, caps) < 0)
        goto cleanup;

    ret = 0;
cleanup:
    virObjectUnref(caps);
    return ret;
}

754 755 756 757
/*
 * qemuSetupCgroupVcpuBW:
 * @cgroup: cgroup to tune
 * @period: CFS period to set, 0 to leave it untouched
 * @quota: CFS quota to set, 0 to leave it untouched
 *
 * Sets the CFS period and/or quota of @cgroup.  When a period was set
 * and setting the quota then fails, the previous period is restored
 * (best effort) while the quota error is kept as the reported error.
 *
 * Returns 0 on success or when both values are 0, -1 on failure.
 */
int
qemuSetupCgroupVcpuBW(virCgroupPtr cgroup,
                      unsigned long long period,
                      long long quota)
{
    unsigned long long prev_period;
    virErrorPtr err;

    if (period == 0 && quota == 0)
        return 0;

    if (period) {
        /* remember the current period so a quota failure can undo ours */
        if (virCgroupGetCpuCfsPeriod(cgroup, &prev_period) < 0)
            return -1;

        if (virCgroupSetCpuCfsPeriod(cgroup, period) < 0)
            return -1;
    }

    if (!quota || virCgroupSetCpuCfsQuota(cgroup, quota) >= 0)
        return 0;

    /* quota failed: roll the period back without losing the error */
    if (period) {
        err = virSaveLastError();
        ignore_value(virCgroupSetCpuCfsPeriod(cgroup, prev_period));
        if (err) {
            virSetError(err);
            virFreeError(err);
        }
    }

    return -1;
}

792 793 794 795 796
/*
 * qemuSetupCgroupVcpuPin:
 * @cgroup: cgroup of the vcpu
 * @vcpupin: array of vcpu pinning definitions
 * @nvcpupin: number of entries in @vcpupin
 * @vcpuid: id of the vcpu to pin
 *
 * Looks up the first entry of @vcpupin whose vcpuid equals @vcpuid and
 * applies its cpumask to @cgroup via qemuSetupCgroupEmulatorPin().
 *
 * Returns the result of qemuSetupCgroupEmulatorPin() for the matching
 * entry, or -1 when no entry matches (including when @nvcpupin is not
 * positive).
 */
int
qemuSetupCgroupVcpuPin(virCgroupPtr cgroup,
                       virDomainVcpuPinDefPtr *vcpupin,
                       int nvcpupin,
                       int vcpuid)
{
    /* signed index: comparing a size_t against the int count would turn
     * a negative count into a huge bound and read past the array */
    int i;

    for (i = 0; i < nvcpupin; i++) {
        if (vcpuid == vcpupin[i]->vcpuid) {
            return qemuSetupCgroupEmulatorPin(cgroup, vcpupin[i]->cpumask);
        }
    }

    return -1;
}

809 810 811
/*
 * qemuSetupCgroupEmulatorPin:
 * @cgroup: cgroup to restrict
 * @cpumask: CPUs the cgroup may run on
 *
 * Formats @cpumask as a string and writes it to the cpuset CPUs of
 * @cgroup.
 *
 * Returns 0 on success, -1 if the mask cannot be formatted or set.
 */
int
qemuSetupCgroupEmulatorPin(virCgroupPtr cgroup,
                           virBitmapPtr cpumask)
{
    char *cpus = virBitmapFormat(cpumask);
    int ret;

    if (!cpus) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("failed to convert cpu mask"));
        return -1;
    }

    ret = virCgroupSetCpusetCpus(cgroup, cpus) < 0 ? -1 : 0;

    VIR_FREE(cpus);
    return ret;
}

832 833
/*
 * qemuSetupCgroupForVcpu:
 * @vm: domain object
 *
 * Creates one sub-cgroup per vcpu thread, moves the thread into it and
 * applies the domain's vcpu period/quota and, when the cpuset
 * controller is present, the matching vcpupin entry.
 *
 * Requesting a period or quota without a cpu controller is an error.
 * Having no cgroup at all, or no usable vcpu<->pid mapping, is not:
 * the function then returns 0 without doing anything.
 *
 * On failure the vcpu cgroup being set up is removed again.
 *
 * Returns 0 on success, -1 on failure.
 */
int
qemuSetupCgroupForVcpu(virDomainObjPtr vm)
{
    virCgroupPtr cgroup_vcpu = NULL;
    qemuDomainObjPrivatePtr priv = vm->privateData;
    virDomainDefPtr def = vm->def;
    unsigned long long period = vm->def->cputune.period;
    long long quota = vm->def->cputune.quota;
    bool tune_bw = period || quota;
    size_t vcpu;
    size_t pin;

    if (tune_bw &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    /* CPU pinning can also be done with virProcessInfoSetAffinity, so
     * running without cgroups is not fatal here. */
    if (priv->cgroup == NULL)
        return 0;

    /* Without a vcpu<->pid mapping, or with all vcpus in the main
     * thread, individual vcpus cannot be controlled. */
    if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) {
        VIR_WARN("Unable to get vcpus' pids.");
        return 0;
    }

    for (vcpu = 0; vcpu < priv->nvcpupids; vcpu++) {
        if (virCgroupNewVcpu(priv->cgroup, vcpu, true, &cgroup_vcpu) < 0)
            goto error;

        /* move the vcpu thread into its own sub-group */
        if (virCgroupAddTask(cgroup_vcpu, priv->vcpupids[vcpu]) < 0)
            goto error;

        if (tune_bw &&
            qemuSetupCgroupVcpuBW(cgroup_vcpu, period, quota) < 0)
            goto error;

        if (virCgroupHasController(priv->cgroup,
                                   VIR_CGROUP_CONTROLLER_CPUSET)) {
            /* only call qemuSetupCgroupVcpuPin when a pin entry for this
             * vcpu exists, since it fails when it finds none */
            for (pin = 0; pin < def->cputune.nvcpupin; pin++) {
                if (def->cputune.vcpupin[pin]->vcpuid == vcpu) {
                    if (qemuSetupCgroupVcpuPin(cgroup_vcpu,
                                               def->cputune.vcpupin,
                                               def->cputune.nvcpupin,
                                               vcpu) < 0)
                        goto error;

                    break;
                }
            }
        }

        virCgroupFree(&cgroup_vcpu);
    }

    return 0;

error:
    if (cgroup_vcpu) {
        virCgroupRemove(cgroup_vcpu);
        virCgroupFree(&cgroup_vcpu);
    }

    return -1;
}

909 910 911 912
/*
 * qemuSetupCgroupForEmulator:
 * @driver: QEMU driver
 * @vm: domain object
 * @nodemask: node set used when CPU placement is automatic
 *
 * Creates the emulator sub-cgroup, moves the tasks of the domain's
 * cgroup into it and applies emulator pinning and emulator
 * period/quota.
 *
 * The CPU mask is taken, in order of preference, from @nodemask (via
 * qemuPrepareCpumap()) for automatic placement, from the emulatorpin
 * definition, or from the domain's cpumask.  It is applied only when
 * the cpuset controller is present.
 *
 * Requesting a period or quota without a cpu controller is an error.
 * Having no cgroup at all is not.  On failure the emulator cgroup is
 * removed again.
 *
 * Returns 0 on success, -1 on failure.
 */
int
qemuSetupCgroupForEmulator(virQEMUDriverPtr driver,
                           virDomainObjPtr vm,
                           virBitmapPtr nodemask)
{
    virBitmapPtr cpumask = NULL;
    virBitmapPtr cpumap = NULL;
    virCgroupPtr cgroup_emulator = NULL;
    virDomainDefPtr def = vm->def;
    qemuDomainObjPrivatePtr priv = vm->privateData;
    unsigned long long period = vm->def->cputune.emulator_period;
    long long quota = vm->def->cputune.emulator_quota;
    int ret = -1;

    if ((period || quota) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    if (priv->cgroup == NULL)
        return 0; /* Not supported, so claim success */

    if (virCgroupNewEmulator(priv->cgroup, true, &cgroup_emulator) < 0)
        goto cleanup;

    if (virCgroupMoveTask(priv->cgroup, cgroup_emulator) < 0)
        goto cleanup;

    if (def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
        cpumap = qemuPrepareCpumap(driver, nodemask);
        if (!cpumap)
            goto cleanup;
        cpumask = cpumap;
    } else if (def->cputune.emulatorpin) {
        cpumask = def->cputune.emulatorpin->cpumask;
    } else if (def->cpumask) {
        cpumask = def->cpumask;
    }

    if (cpumask &&
        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET) &&
        qemuSetupCgroupEmulatorPin(cgroup_emulator, cpumask) < 0)
        goto cleanup;

    if ((period || quota) &&
        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
        qemuSetupCgroupVcpuBW(cgroup_emulator, period, quota) < 0)
        goto cleanup;

    ret = 0;

cleanup:
    virBitmapFree(cpumap);

    /* the half-configured emulator group is removed only on failure */
    if (ret < 0 && cgroup_emulator)
        virCgroupRemove(cgroup_emulator);
    virCgroupFree(&cgroup_emulator);

    return ret;
}
975

976 977
/*
 * qemuRemoveCgroup:
 * @vm: domain object
 *
 * Removes the domain's cgroup through virCgroupRemove().
 *
 * Returns 0 when the domain has no cgroup, otherwise the result of
 * virCgroupRemove().
 */
int
qemuRemoveCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    /* no cgroup in use: nothing to remove, report success */
    if (!priv->cgroup)
        return 0;

    return virCgroupRemove(priv->cgroup);
}

987 988
/*
 * qemuAddToCgroup:
 * @vm: domain object
 *
 * Adds the calling process (getpid()) to the domain's cgroup.
 *
 * Returns 0 on success or when the domain has no cgroup, -1 if the
 * task cannot be added.
 */
int
qemuAddToCgroup(virDomainObjPtr vm)
{
    qemuDomainObjPrivatePtr priv = vm->privateData;

    /* no cgroup in use: nothing to join, report success */
    if (!priv->cgroup)
        return 0;

    return virCgroupAddTask(priv->cgroup, getpid()) < 0 ? -1 : 0;
}