/*
 * Copyright (C) 2010-2014 Red Hat, Inc.
 * Copyright IBM Corp. 2008
 *
 * lxc_cgroup.c: LXC cgroup helpers
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 */

#include <config.h>

#include "lxc_cgroup.h"
#include "lxc_container.h"
26
#include "domain_cgroup.h"
27
#include "virfile.h"
28
#include "virerror.h"
29
#include "virlog.h"
30
#include "virstring.h"
31
#include "virsystemd.h"
J
Ján Tomko 已提交
32
#include "virutil.h"
33 34 35

#define VIR_FROM_THIS VIR_FROM_LXC

36 37
VIR_LOG_INIT("lxc.lxc_cgroup");

38 39 40
/*
 * virLXCCgroupSetupCpuTune:
 * @def: domain definition holding the cputune settings
 * @cgroup: cgroup the settings are applied to
 *
 * Applies the CPU shares (only when they were explicitly specified)
 * and then the CPU period/quota from @def to @cgroup. When shares are
 * applied, the value handed back by virCgroupSetupCpuShares() is
 * stored back into @def->cputune.shares.
 *
 * Returns -1 if applying the shares failed, otherwise the result of
 * virCgroupSetupCpuPeriodQuota().
 */
static int virLXCCgroupSetupCpuTune(virDomainDefPtr def,
                                    virCgroupPtr cgroup)
{
    unsigned long long appliedShares;

    if (def->cputune.sharesSpecified) {
        if (virCgroupSetupCpuShares(cgroup, def->cputune.shares,
                                    &appliedShares) < 0)
            return -1;

        /* Keep the definition in sync with what the helper reported. */
        def->cputune.shares = appliedShares;
    }

    return virCgroupSetupCpuPeriodQuota(cgroup,
                                        def->cputune.period,
                                        def->cputune.quota);
}


53 54 55 56
/*
 * virLXCCgroupSetupCpusetTune:
 * @def: domain definition holding CPU placement and NUMA tuning
 * @cgroup: cgroup the cpuset settings are applied to
 * @nodemask: node set passed on to virDomainNumatuneMaybeFormatNodeset()
 *
 * Pins the cgroup to @def->cpumask unless the placement mode is "auto"
 * or no mask is set. Afterwards, restricts the cgroup's memory nodes,
 * but only when the domain's NUMA memory mode is "strict": the cpuset
 * memory restriction is a hard limit, so it must not be applied for
 * the other modes, and nothing is done when no mode can be obtained.
 *
 * Returns 0 on success (including when nothing had to be applied),
 * -1 on failure.
 */
static int virLXCCgroupSetupCpusetTune(virDomainDefPtr def,
                                       virCgroupPtr cgroup,
                                       virBitmapPtr nodemask)
{
    g_autofree char *mask = NULL;
    virDomainNumatuneMemMode mode;

    if (def->placement_mode != VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO &&
        def->cpumask &&
        virCgroupSetupCpusetCpus(cgroup, def->cpumask) < 0) {
        return -1;
    }

    /* Only strict mode gets cpuset.mems; every other mode (or a lookup
     * failure) leaves the cgroup's memory nodes untouched. */
    if (virDomainNumatuneGetMode(def->numa, -1, &mode) < 0 ||
        mode != VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
        return 0;
    }

    if (virDomainNumatuneMaybeFormatNodeset(def->numa, nodemask,
                                            &mask, -1) < 0)
        return -1;

    /* The formatter may leave @mask unset, in which case there is
     * nothing to write. */
    if (mask && virCgroupSetCpusetMems(cgroup, mask) < 0)
        return -1;

    return 0;
}


82 83 84
/*
 * virLXCCgroupSetupBlkioTune:
 * @def: domain definition holding the blkio settings
 * @cgroup: cgroup the settings are applied to
 *
 * Hands @def->blkio to virDomainCgroupSetupBlkio() and returns its
 * result unchanged.
 */
static int virLXCCgroupSetupBlkioTune(virDomainDefPtr def,
                                      virCgroupPtr cgroup)
{
    int rc = virDomainCgroupSetupBlkio(cgroup, def->blkio);

    return rc;
}


/*
 * virLXCCgroupSetupMemTune:
 * @def: domain definition holding the memory settings
 * @cgroup: cgroup the settings are applied to
 *
 * Sets the cgroup memory value to the domain's initial memory and then
 * applies the remaining memory tunables from @def->mem.
 *
 * Returns -1 if setting the memory value failed, otherwise the result
 * of virDomainCgroupSetupMemtune().
 */
static int virLXCCgroupSetupMemTune(virDomainDefPtr def,
                                    virCgroupPtr cgroup)
{
    int rc = virCgroupSetMemory(cgroup, virDomainDefGetMemoryInitial(def));

    if (rc < 0)
        return -1;

    return virDomainCgroupSetupMemtune(cgroup, def->mem);
}


99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
/*
 * virLXCCgroupGetMemSwapUsage:
 * @cgroup: cgroup to query
 * @meminfo: receives the value in its swapusage field
 *
 * Returns the result of virCgroupGetMemSwapUsage().
 */
static int virLXCCgroupGetMemSwapUsage(virCgroupPtr cgroup,
                                       virLXCMeminfoPtr meminfo)
{
    int rc = virCgroupGetMemSwapUsage(cgroup, &meminfo->swapusage);

    return rc;
}


/*
 * virLXCCgroupGetMemSwapTotal:
 * @cgroup: cgroup to query
 * @meminfo: receives the value in its swaptotal field
 *
 * Returns the result of virCgroupGetMemSwapHardLimit().
 */
static int virLXCCgroupGetMemSwapTotal(virCgroupPtr cgroup,
                                       virLXCMeminfoPtr meminfo)
{
    int rc = virCgroupGetMemSwapHardLimit(cgroup, &meminfo->swaptotal);

    return rc;
}


/*
 * virLXCCgroupGetMemUsage:
 * @cgroup: cgroup to query
 * @meminfo: receives the value in its memusage field
 *
 * Queries the memory usage of @cgroup and stores it, widened to
 * unsigned long long, in @meminfo->memusage.
 *
 * Returns the result of virCgroupGetMemoryUsage().
 */
static int virLXCCgroupGetMemUsage(virCgroupPtr cgroup,
                                   virLXCMeminfoPtr meminfo)
{
    int ret;
    /* Start from 0 so that @meminfo->memusage gets a defined value even
     * if the helper fails without storing a result. */
    unsigned long memUsage = 0;

    ret = virCgroupGetMemoryUsage(cgroup, &memUsage);
    meminfo->memusage = (unsigned long long)memUsage;

    return ret;
}


/*
 * virLXCCgroupGetMemTotal:
 * @cgroup: cgroup to query
 * @meminfo: receives the value in its memtotal field
 *
 * Returns the result of virCgroupGetMemoryHardLimit().
 */
static int virLXCCgroupGetMemTotal(virCgroupPtr cgroup,
                                   virLXCMeminfoPtr meminfo)
{
    int rc = virCgroupGetMemoryHardLimit(cgroup, &meminfo->memtotal);

    return rc;
}


/*
 * virLXCCgroupGetMemStat:
 * @cgroup: cgroup to query
 * @meminfo: receives the values in its cached, inactive_anon,
 *           active_anon, inactive_file, active_file and unevictable
 *           fields
 *
 * Returns the result of virCgroupGetMemoryStat().
 */
static int virLXCCgroupGetMemStat(virCgroupPtr cgroup,
                                  virLXCMeminfoPtr meminfo)
{
    int rc;

    rc = virCgroupGetMemoryStat(cgroup,
                                &meminfo->cached,
                                &meminfo->inactive_anon,
                                &meminfo->active_anon,
                                &meminfo->inactive_file,
                                &meminfo->active_file,
                                &meminfo->unevictable);
    return rc;
}


/*
 * virLXCCgroupGetMeminfo:
 * @meminfo: structure filled with the collected memory figures
 *
 * Obtains a cgroup via virCgroupNewSelf() and fills @meminfo from it:
 * memory statistics, memory total, memory usage, swap total and swap
 * usage, in that order. Collection stops at the first helper that
 * fails; the cgroup handle is released in either case.
 *
 * Returns 0 if every query succeeded, -1 otherwise.
 */
int virLXCCgroupGetMeminfo(virLXCMeminfoPtr meminfo)
{
    virCgroupPtr self;
    int rc = -1;

    if (virCgroupNewSelf(&self) < 0)
        return -1;

    /* Short-circuit evaluation keeps the queries in order and stops
     * at the first failure. */
    if (virLXCCgroupGetMemStat(self, meminfo) >= 0 &&
        virLXCCgroupGetMemTotal(self, meminfo) >= 0 &&
        virLXCCgroupGetMemUsage(self, meminfo) >= 0 &&
        virLXCCgroupGetMemSwapTotal(self, meminfo) >= 0 &&
        virLXCCgroupGetMemSwapUsage(self, meminfo) >= 0)
        rc = 0;

    virCgroupFree(&self);
    return rc;
}



177 178 179 180 181 182 183 184 185 186
/* One entry of the device allow-list table used by
 * virLXCCgroupSetupDeviceACL(). */
typedef struct _virLXCCgroupDevicePolicy virLXCCgroupDevicePolicy;
typedef virLXCCgroupDevicePolicy *virLXCCgroupDevicePolicyPtr;

struct _virLXCCgroupDevicePolicy {
    char type;  /* device type passed to virCgroupAllowDevice(); the table
                 * in this file uses 'c' and ends with a 0 sentinel */
    int major;  /* device major number */
    int minor;  /* device minor number */
};


187
int
J
Ján Tomko 已提交
188
virLXCSetupHostUSBDeviceCgroup(virUSBDevicePtr dev G_GNUC_UNUSED,
189 190 191 192 193 194
                               const char *path,
                               void *opaque)
{
    virCgroupPtr cgroup = opaque;

    VIR_DEBUG("Process path '%s' for USB device", path);
195
    if (virCgroupAllowDevicePath(cgroup, path,
196
                                 VIR_CGROUP_DEVICE_RWM, false) < 0)
197 198 199 200 201 202 203
        return -1;

    return 0;
}


int
J
Ján Tomko 已提交
204
virLXCTeardownHostUSBDeviceCgroup(virUSBDevicePtr dev G_GNUC_UNUSED,
205 206 207 208 209 210
                                  const char *path,
                                  void *opaque)
{
    virCgroupPtr cgroup = opaque;

    VIR_DEBUG("Process path '%s' for USB device", path);
211
    if (virCgroupDenyDevicePath(cgroup, path,
212
                                VIR_CGROUP_DEVICE_RWM, false) < 0)
213 214 215 216 217
        return -1;

    return 0;
}

218 219 220 221

/*
 * virLXCCgroupSetupDeviceACL:
 * @def: domain definition listing disks, filesystems, hostdevs, timers
 * @cgroup: cgroup whose device access list is built
 *
 * Denies all devices on @cgroup and then re-allows, in order:
 *  - mknod for all devices, if the MKNOD capability feature is "on";
 *  - a fixed table of character devices (RWM);
 *  - local block devices backing non-empty disks (read or read/write
 *    depending on the readonly flag, plus mknod);
 *  - block-type filesystem sources (read or read/write);
 *  - USB subsystem hostdevs that are not marked missing, and storage /
 *    misc capability hostdevs (read/write plus mknod);
 *  - all minors of LXC_DEV_MAJ_PTY (RWM);
 *  - /dev/rtc0 and /dev/hpet (read only) for matching timers that are
 *    present and whose device node exists.
 *
 * Returns 0 on success, -1 as soon as any step fails.
 */
static int virLXCCgroupSetupDeviceACL(virDomainDefPtr def,
                                      virCgroupPtr cgroup)
{
    /* Table of always-allowed character devices; a zero type ends it. */
    static virLXCCgroupDevicePolicy devices[] = {
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM},
        {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM},
        {'c', LXC_DEV_MAJ_TTY, LXC_DEV_MIN_TTY},
        {'c', LXC_DEV_MAJ_TTY, LXC_DEV_MIN_PTMX},
        {'c', LXC_DEV_MAJ_FUSE, LXC_DEV_MIN_FUSE},
        {0,   0, 0}};
    virLXCCgroupDevicePolicyPtr policy;
    size_t i;

    if (virCgroupDenyAllDevices(cgroup) < 0)
        return -1;

    /* white list mknod if CAP_MKNOD has to be kept */
    if (def->caps_features[VIR_DOMAIN_PROCES_CAPS_FEATURE_MKNOD] ==
        VIR_TRISTATE_SWITCH_ON &&
        virCgroupAllowAllDevices(cgroup, VIR_CGROUP_DEVICE_MKNOD) < 0)
        return -1;

    for (policy = devices; policy->type != 0; policy++) {
        if (virCgroupAllowDevice(cgroup,
                                 policy->type,
                                 policy->major,
                                 policy->minor,
                                 VIR_CGROUP_DEVICE_RWM) < 0)
            return -1;
    }

    VIR_DEBUG("Allowing any disk block devs");
    for (i = 0; i < def->ndisks; i++) {
        int perms;

        /* Only non-empty, local block sources need a device rule. */
        if (virStorageSourceIsEmpty(def->disks[i]->src) ||
            !virStorageSourceIsBlockLocal(def->disks[i]->src))
            continue;

        if (def->disks[i]->src->readonly)
            perms = VIR_CGROUP_DEVICE_READ;
        else
            perms = VIR_CGROUP_DEVICE_RW;

        if (virCgroupAllowDevicePath(cgroup,
                                     virDomainDiskGetSource(def->disks[i]),
                                     perms | VIR_CGROUP_DEVICE_MKNOD,
                                     false) < 0)
            return -1;
    }

    VIR_DEBUG("Allowing any filesystem block devs");
    for (i = 0; i < def->nfss; i++) {
        int perms;

        if (def->fss[i]->type != VIR_DOMAIN_FS_TYPE_BLOCK)
            continue;

        if (def->fss[i]->readonly)
            perms = VIR_CGROUP_DEVICE_READ;
        else
            perms = VIR_CGROUP_DEVICE_RW;

        if (virCgroupAllowDevicePath(cgroup,
                                     def->fss[i]->src->path,
                                     perms,
                                     false) < 0)
            return -1;
    }

    VIR_DEBUG("Allowing any hostdev block devs");
    for (i = 0; i < def->nhostdevs; i++) {
        virDomainHostdevDefPtr hostdev = def->hostdevs[i];

        switch (hostdev->mode) {
        case VIR_DOMAIN_HOSTDEV_MODE_SUBSYS: {
            virDomainHostdevSubsysUSBPtr usbsrc;
            virUSBDevicePtr usb;
            int rc;

            /* Only USB devices that are not marked missing are handled. */
            if (hostdev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB ||
                hostdev->missing)
                break;

            usbsrc = &hostdev->source.subsys.u.usb;
            usb = virUSBDeviceNew(usbsrc->bus, usbsrc->device, NULL);
            if (usb == NULL)
                return -1;

            /* Allow every path reported for the device, then drop the
             * temporary handle regardless of the outcome. */
            rc = virUSBDeviceFileIterate(usb, virLXCSetupHostUSBDeviceCgroup,
                                         cgroup);
            virUSBDeviceFree(usb);
            if (rc < 0)
                return -1;
            break;
        }

        case VIR_DOMAIN_HOSTDEV_MODE_CAPABILITIES:
            switch (hostdev->source.caps.type) {
            case VIR_DOMAIN_HOSTDEV_CAPS_TYPE_STORAGE:
                if (virCgroupAllowDevicePath(cgroup,
                                             hostdev->source.caps.u.storage.block,
                                             VIR_CGROUP_DEVICE_RW |
                                             VIR_CGROUP_DEVICE_MKNOD,
                                             false) < 0)
                    return -1;
                break;

            case VIR_DOMAIN_HOSTDEV_CAPS_TYPE_MISC:
                if (virCgroupAllowDevicePath(cgroup,
                                             hostdev->source.caps.u.misc.chardev,
                                             VIR_CGROUP_DEVICE_RW |
                                             VIR_CGROUP_DEVICE_MKNOD,
                                             false) < 0)
                    return -1;
                break;

            default:
                break;
            }
            break;

        default:
            break;
        }
    }

    /* A minor of -1 is passed for the PTY major. */
    if (virCgroupAllowDevice(cgroup, 'c', LXC_DEV_MAJ_PTY, -1,
                             VIR_CGROUP_DEVICE_RWM) < 0)
        return -1;

    VIR_DEBUG("Allowing timers char devices");

    /* Sync'ed with Host clock */
    for (i = 0; i < def->clock.ntimers; i++) {
        virDomainTimerDefPtr timer = def->clock.timers[i];
        const char *timerPath = NULL;

        /* Check if "present" is set to "no" otherwise enable it. */
        if (!timer->present)
            continue;

        /* Only the RTC and HPET timers map to a device node here. */
        switch ((virDomainTimerNameType)timer->name) {
        case VIR_DOMAIN_TIMER_NAME_RTC:
            timerPath = "/dev/rtc0";
            break;
        case VIR_DOMAIN_TIMER_NAME_HPET:
            timerPath = "/dev/hpet";
            break;
        case VIR_DOMAIN_TIMER_NAME_PLATFORM:
        case VIR_DOMAIN_TIMER_NAME_TSC:
        case VIR_DOMAIN_TIMER_NAME_KVMCLOCK:
        case VIR_DOMAIN_TIMER_NAME_HYPERVCLOCK:
        case VIR_DOMAIN_TIMER_NAME_PIT:
        case VIR_DOMAIN_TIMER_NAME_ARMVTIMER:
        case VIR_DOMAIN_TIMER_NAME_LAST:
            break;
        }

        if (timerPath == NULL)
            continue;

        if (!virFileExists(timerPath)) {
            VIR_DEBUG("Ignoring non-existent device %s", timerPath);
            continue;
        }

        if (virCgroupAllowDevicePath(cgroup, timerPath,
                                     VIR_CGROUP_DEVICE_READ,
                                     false) < 0)
            return -1;
    }

    VIR_DEBUG("Device whitelist complete");

    return 0;
}


383
/*
 * virLXCCgroupCreate:
 * @def: domain definition (uuid, resource partition, idmap)
 * @initpid: pid handed to virCgroupNewMachine()
 * @nnicindexes: number of entries in @nicindexes
 * @nicindexes: NIC indexes handed to virCgroupNewMachine()
 *
 * Creates the machine cgroup for the "lxc" driver under the domain's
 * resource partition, which must start with '/'. When the domain has
 * a uid mapping, ownership of the cgroup is set to the target of the
 * first uid and gid map entries.
 *
 * Returns the new cgroup, or NULL on failure. If setting the owner
 * fails, the freshly created cgroup handle is freed before returning.
 */
virCgroupPtr virLXCCgroupCreate(virDomainDefPtr def,
                                pid_t initpid,
                                size_t nnicindexes,
                                int *nicindexes)
{
    g_autofree char *machineName = virLXCDomainGetMachineName(def, 0);
    virCgroupPtr machineCgroup = NULL;

    if (!machineName)
        return NULL;

    if (def->resource->partition[0] != '/') {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Resource partition '%s' must start with '/'"),
                       def->resource->partition);
        return NULL;
    }

    if (virCgroupNewMachine(machineName, "lxc", def->uuid, NULL,
                            initpid, true,
                            nnicindexes, nicindexes,
                            def->resource->partition,
                            -1, 0,
                            &machineCgroup) < 0)
        return NULL;

    /* setup control group permissions for user namespace */
    if (def->idmap.uidmap &&
        virCgroupSetOwner(machineCgroup,
                          def->idmap.uidmap[0].target,
                          def->idmap.gidmap[0].target,
                          (1 << VIR_CGROUP_CONTROLLER_SYSTEMD)) < 0) {
        virCgroupFree(&machineCgroup);
        return NULL;
    }

    return machineCgroup;
}


int virLXCCgroupSetup(virDomainDefPtr def,
430 431
                      virCgroupPtr cgroup,
                      virBitmapPtr nodemask)
432
{
433
    if (virLXCCgroupSetupCpuTune(def, cgroup) < 0)
434
        return -1;
435

436
    if (virLXCCgroupSetupCpusetTune(def, cgroup, nodemask) < 0)
437
        return -1;
438

439
    if (virLXCCgroupSetupBlkioTune(def, cgroup) < 0)
440
        return -1;
441 442

    if (virLXCCgroupSetupMemTune(def, cgroup) < 0)
443
        return -1;
444 445

    if (virLXCCgroupSetupDeviceACL(def, cgroup) < 0)
446
        return -1;
447

448
    return 0;
449
}