lxc_container.c 39.7 KB
Newer Older
1
/*
2
 * Copyright (C) 2008-2012 Red Hat, Inc.
3
 * Copyright (C) 2008 IBM Corp.
4 5 6 7 8
 *
 * lxc_container.c: container-side setup for LXC guests (stdio, network
 *                  interfaces, root pivot, filesystem mounts, device nodes)
 *
 * Authors:
 *  David L. Leskovec <dlesko at linux.vnet.ibm.com>
9
 *  Daniel P. Berrange <berrange@redhat.com>
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <config.h>

#include <fcntl.h>
#include <limits.h>
#include <stdlib.h>
31
#include <stdio.h>
32 33
#include <sys/ioctl.h>
#include <sys/mount.h>
34
#include <sys/wait.h>
35
#include <sys/stat.h>
36
#include <unistd.h>
37 38 39 40 41 42 43
#include <mntent.h>

/* Yes, we want linux private one, for _syscall2() macro */
#include <linux/unistd.h>

/* For MS_MOVE */
#include <linux/fs.h>
44

D
Daniel P. Berrange 已提交
45
#if HAVE_CAPNG
46
# include <cap-ng.h>
D
Daniel P. Berrange 已提交
47
#endif
48

49 50 51 52
#if HAVE_LIBBLKID
# include <blkid/blkid.h>
#endif

53
#include "virterror_internal.h"
54
#include "logging.h"
55 56
#include "lxc_container.h"
#include "util.h"
57
#include "memory.h"
58
#include "virnetdevveth.h"
59
#include "uuid.h"
E
Eric Blake 已提交
60
#include "virfile.h"
61
#include "command.h"
62
#include "virnetdev.h"
63

64 65
#define VIR_FROM_THIS VIR_FROM_LXC

66 67 68 69 70 71
/*
 * GLibc headers are behind the kernel, so we define these
 * constants if they're not present already.
 */

#ifndef CLONE_NEWPID
72
# define CLONE_NEWPID  0x20000000
73 74
#endif
#ifndef CLONE_NEWUTS
75
# define CLONE_NEWUTS  0x04000000
76 77
#endif
#ifndef CLONE_NEWUSER
78
# define CLONE_NEWUSER 0x10000000
79 80
#endif
#ifndef CLONE_NEWIPC
81
# define CLONE_NEWIPC  0x08000000
82 83
#endif
#ifndef CLONE_NEWNET
84
# define CLONE_NEWNET  0x40000000 /* New network namespace */
85 86 87 88 89 90 91 92
#endif

/* messages between parent and container */
typedef char lxc_message_t;
#define LXC_CONTINUE_MSG 'c'

typedef struct __lxc_child_argv lxc_child_argv_t;
struct __lxc_child_argv {
93
    virDomainDefPtr config;
94 95
    unsigned int nveths;
    char **veths;
96
    int monitor;
97 98
    char **ttyPaths;
    size_t nttyPaths;
99
    int handshakefd;
100 101 102
};


103
/**
104
 * lxcContainerBuildInitCmd:
105
 * @vmDef: pointer to vm definition structure
106
 *
107
 * Build a virCommandPtr for launching the container 'init' process
108
 *
109
 * Returns a virCommandPtr
110
 */
111
static virCommandPtr lxcContainerBuildInitCmd(virDomainDefPtr vmDef)
112
{
113
    char uuidstr[VIR_UUID_STRING_BUFLEN];
114
    virCommandPtr cmd;
115 116 117

    virUUIDFormat(vmDef->uuid, uuidstr);

118 119 120 121
    cmd = virCommandNew(vmDef->os.init);

    virCommandAddEnvString(cmd, "PATH=/bin:/sbin");
    virCommandAddEnvString(cmd, "TERM=linux");
122
    virCommandAddEnvString(cmd, "container=lxc-libvirt");
123 124
    virCommandAddEnvPair(cmd, "LIBVIRT_LXC_UUID", uuidstr);
    virCommandAddEnvPair(cmd, "LIBVIRT_LXC_NAME", vmDef->name);
125 126
    if (vmDef->os.cmdline)
        virCommandAddEnvPair(cmd, "LIBVIRT_LXC_CMDLINE", vmDef->os.cmdline);
127

128
    return cmd;
129 130 131
}

/**
 * lxcContainerSetStdio:
 * @control: control FD from parent
 * @ttyfd: FD of tty to set as the container console
 * @handshakefd: handshake FD, kept open alongside @control and @ttyfd
 *
 * Starts a new session, makes the given tty the controlling terminal,
 * closes every descriptor other than @control, @ttyfd and @handshakefd,
 * and finally duplicates the tty onto stdin, stdout and stderr.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerSetStdio(int control, int ttyfd, int handshakefd)
{
    int rc = -1;
    int open_max, i;

    if (setsid() < 0) {
        virReportSystemError(errno, "%s",
                             _("setsid failed"));
        goto cleanup;
    }

    if (ioctl(ttyfd, TIOCSCTTY, NULL) < 0) {
        /* The message names the ioctl that was actually issued */
        virReportSystemError(errno, "%s",
                             _("ioctl(TIOCSCTTY) failed"));
        goto cleanup;
    }

    /* Just in case someone forgot to set FD_CLOEXEC, explicitly
     * close all FDs before executing the container */
    open_max = sysconf (_SC_OPEN_MAX);
    for (i = 0; i < open_max; i++) {
        if (i != ttyfd && i != control && i != handshakefd) {
            /* VIR_FORCE_CLOSE needs an lvalue, so close via a copy */
            int tmpfd = i;
            VIR_FORCE_CLOSE(tmpfd);
        }
    }

    if (dup2(ttyfd, 0) < 0) {
        virReportSystemError(errno, "%s",
                             _("dup2(stdin) failed"));
        goto cleanup;
    }

    if (dup2(ttyfd, 1) < 0) {
        virReportSystemError(errno, "%s",
                             _("dup2(stdout) failed"));
        goto cleanup;
    }

    if (dup2(ttyfd, 2) < 0) {
        virReportSystemError(errno, "%s",
                             _("dup2(stderr) failed"));
        goto cleanup;
    }

    rc = 0;

cleanup:
    VIR_DEBUG("rc=%d", rc);
    return rc;
}

/**
193
 * lxcContainerSendContinue:
194
 * @control: control FD to child
195
 *
196 197
 * Sends the continue message via the socket pair stored in the vm
 * structure.
198 199 200
 *
 * Returns 0 on success or -1 in case of error
 */
201
int lxcContainerSendContinue(int control)
202 203
{
    int rc = -1;
204 205
    lxc_message_t msg = LXC_CONTINUE_MSG;
    int writeCount = 0;
206

207 208 209
    writeCount = safewrite(control, &msg, sizeof(msg));
    if (writeCount != sizeof(msg)) {
        goto error_out;
210 211
    }

212 213 214
    rc = 0;
error_out:
    return rc;
215 216
}

217
/**
218
 * lxcContainerWaitForContinue:
219
 * @control: Control FD from parent
220 221 222 223 224 225 226
 *
 * This function will wait for the container continue message from the
 * parent process.  It will send this message on the socket pair stored in
 * the vm structure once it has completed the post clone container setup.
 *
 * Returns 0 on success or -1 in case of error
 */
227
int lxcContainerWaitForContinue(int control)
228 229 230 231
{
    lxc_message_t msg;
    int readLen;

232
    readLen = saferead(control, &msg, sizeof(msg));
233 234 235 236 237 238 239
    if (readLen != sizeof(msg)) {
        if (readLen >= 0)
            errno = EIO;
        return -1;
    }
    if (msg != LXC_CONTINUE_MSG) {
        errno = EINVAL;
240
        return -1;
241 242
    }

243
    return 0;
244 245
}

246

247
/**
 * lxcContainerRenameAndEnableInterfaces:
 * @nveths: number of interfaces
 * @veths: interface names
 *
 * Renames each interface in @veths to ethN, with N ascending from zero
 * in array order, and brings each renamed interface online. When @veths
 * is non-NULL the loopback device "lo" is brought online as well.
 *
 * Returns 0 on success or nonzero in case of error
 */
static int lxcContainerRenameAndEnableInterfaces(unsigned int nveths,
                                                 char **veths)
{
    int rc = 0;
    unsigned int i;
    char *newname = NULL;

    for (i = 0 ; i < nveths ; i++) {
        /* i is unsigned, so it must be formatted with %u, not %d */
        if (virAsprintf(&newname, "eth%u", i) < 0) {
            virReportOOMError();
            rc = -1;
            goto error_out;
        }

        VIR_DEBUG("Renaming %s to %s", veths[i], newname);
        rc = virNetDevSetName(veths[i], newname);
        if (rc < 0)
            goto error_out;

        VIR_DEBUG("Enabling %s", newname);
        rc = virNetDevSetOnline(newname, true);
        if (rc < 0)
            goto error_out;

        VIR_FREE(newname);
    }

    /* enable lo device only if there were other net devices */
    if (veths)
        rc = virNetDevSetOnline("lo", true);

error_out:
    /* Also covers the name left allocated when the loop bailed out */
    VIR_FREE(newname);
    return rc;
}

294

295
/*_syscall2(int, pivot_root, char *, newroot, const char *, oldroot)*/
296 297 298 299 300 301 302
extern int pivot_root(const char * new_root,const char * put_old);

/*
 * qsort() comparator for an array of C strings (mount point paths).
 *
 * @a, @b: pointers to the array elements, i.e. pointers to 'char *'
 *
 * Returns the strcmp() result with the operands swapped, which yields
 * a descending order: a path sorts before any path that is a prefix
 * of it.
 */
static int lxcContainerChildMountSort(const void *a, const void *b)
{
    /* Keep the const qualifier of the elements instead of casting it away */
    const char *const *sa = a;
    const char *const *sb = b;

    /* Deliberately reversed args - we need to unmount deepest
       children first */
    return strcmp(*sb, *sa);
}

308
#ifndef MS_REC
309
# define MS_REC          16384
310 311 312
#endif

#ifndef MNT_DETACH
313
# define MNT_DETACH      0x00000002
314 315 316
#endif

#ifndef MS_PRIVATE
317
# define MS_PRIVATE              (1<<18)
318 319 320
#endif

#ifndef MS_SLAVE
321
# define MS_SLAVE                (1<<19)
322 323
#endif

324 325
/*
 * Switch the root filesystem of the calling process to root->src.
 *
 * Steps, in order:
 *   1. make "/" recursively private,
 *   2. mount a tmpfs on <root->src>/.oldroot,
 *   3. recursively bind-mount root->src on <tmpfs>/new (remounted
 *      read-only when root->readonly is set),
 *   4. chdir into that bind mount and pivot_root(".", ".oldroot"),
 *   5. chdir to "/".
 *
 * Returns 0 on success, -1 on failure with an error reported.
 */
static int lxcContainerPivotRoot(virDomainFSDefPtr root)
{
    int ret;
    char *oldroot = NULL, *newroot = NULL;

    ret = -1;

    /* root->parent must be private, so make / private. */
    if (mount("", "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to make root private"));
        goto err;
    }

    if (virAsprintf(&oldroot, "%s/.oldroot", root->src) < 0) {
        virReportOOMError();
        goto err;
    }

    if (virFileMakePath(oldroot) < 0) {
        virReportSystemError(errno,
                             _("Failed to create %s"),
                             oldroot);
        goto err;
    }

    /* Create a tmpfs root since old and new roots must be
     * on separate filesystems */
    if (mount("tmprootfs", oldroot, "tmpfs", 0, NULL) < 0) {
        virReportSystemError(errno,
                             _("Failed to mount empty tmpfs at %s"),
                             oldroot);
        goto err;
    }

    /* Create a directory called 'new' in tmpfs */
    if (virAsprintf(&newroot, "%s/new", oldroot) < 0) {
        virReportOOMError();
        goto err;
    }

    if (virFileMakePath(newroot) < 0) {
        virReportSystemError(errno,
                             _("Failed to create %s"),
                             newroot);
        goto err;
    }

    /* ... and mount our root onto it */
    if (mount(root->src, newroot, NULL, MS_BIND|MS_REC, NULL) < 0) {
        virReportSystemError(errno,
                             _("Failed to bind new root %s into tmpfs"),
                             root->src);
        goto err;
    }

    if (root->readonly) {
        /* Read-only is applied as a remount of the bind mount made above */
        if (mount(root->src, newroot, NULL, MS_BIND|MS_REC|MS_RDONLY|MS_REMOUNT, NULL) < 0) {
            virReportSystemError(errno,
                                 _("Failed to make new root %s readonly"),
                                 root->src);
            goto err;
        }
    }

    /* Now we change directory into the root->src bind mount at
     * '/new' inside the tmpfs, then pivot into it */
    if (chdir(newroot) < 0) {
        virReportSystemError(errno,
                             _("Failed to chdir into %s"), newroot);
        goto err;
    }

    /* The old root directory will live at /.oldroot after
     * this and will soon be unmounted completely */
    if (pivot_root(".", ".oldroot") < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to pivot root"));
        goto err;
    }

    /* CWD is undefined after pivot_root, so go to / */
    if (chdir("/") < 0) {
        /* Report the failure so -1 never comes back without an error */
        virReportSystemError(errno, "%s",
                             _("Failed to chdir into /"));
        goto err;
    }

    ret = 0;

err:
    VIR_FREE(oldroot);
    VIR_FREE(newroot);

    return ret;
}

418

419
/*
 * Mount the fixed set of basic filesystems listed in the table below.
 *
 * @srcprefix: prepended to the source path of entries flagged
 *             needPrefix; may be NULL, in which case no prefix is used
 * @pivotRoot: entries flagged onlyPivotRoot are skipped when false
 *
 * For each entry the destination directory is created first. Entries
 * whose source is an absolute path that is not readable are skipped.
 *
 * Returns 0 on success, -1 on failure with an error reported.
 */
static int lxcContainerMountBasicFS(const char *srcprefix, bool pivotRoot)
{
    const struct {
        bool onlyPivotRoot;
        bool needPrefix;
        const char *src;
        const char *dst;
        const char *type;
        const char *opts;
        int mflags;
    } mnts[] = {
        /* When we want to make a bind mount readonly, for unknown reasons,
         * it is currently necessary to bind it once, and then remount the
         * bind with the readonly flag. If this is not done, then the original
         * mount point in the main OS becomes readonly too which is not what
         * we want. Hence some things have two entries here.
         */
        { true, false, "devfs", "/dev", "tmpfs", "mode=755", MS_NOSUID },
        { false, false, "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV },
        { false, false, "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND },
        { false, false, "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY },
        { false, true, "/sys", "/sys", NULL, NULL, MS_BIND },
        { false, true, "/sys", "/sys", NULL, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY },
        { false, true, "/selinux", "/selinux", NULL, NULL, MS_BIND },
        { false, true, "/selinux", "/selinux", NULL, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY },
    };
    int i, rc = -1;

    VIR_DEBUG("Mounting basic filesystems %s pivotRoot=%d", NULLSTR(srcprefix), pivotRoot);

    for (i = 0 ; i < ARRAY_CARDINALITY(mnts) ; i++) {
        char *src = NULL;
        const char *srcpath = NULL;

        VIR_DEBUG("Consider %s onlyPivotRoot=%d",
                  mnts[i].src, mnts[i].onlyPivotRoot);
        if (mnts[i].onlyPivotRoot && !pivotRoot)
            continue;

        if (virFileMakePath(mnts[i].dst) < 0) {
            /* The directory being created is the destination */
            virReportSystemError(errno,
                                 _("Failed to mkdir %s"),
                                 mnts[i].dst);
            goto cleanup;
        }

        if (mnts[i].needPrefix && srcprefix) {
            if (virAsprintf(&src, "%s%s", srcprefix, mnts[i].src) < 0) {
                virReportOOMError();
                goto cleanup;
            }
            srcpath = src;
        } else {
            srcpath = mnts[i].src;
        }

        /* Skip if mount doesn't exist in source */
        if ((srcpath[0] == '/') &&
            (access(srcpath, R_OK) < 0)) {
            /* src may have been allocated above; do not leak it */
            VIR_FREE(src);
            continue;
        }

        /* type and opts are NULL for several entries; guard the %s */
        VIR_DEBUG("Mount %s on %s type=%s flags=%x, opts=%s",
                  srcpath, mnts[i].dst, NULLSTR(mnts[i].type),
                  mnts[i].mflags, NULLSTR(mnts[i].opts));
        if (mount(srcpath, mnts[i].dst, mnts[i].type, mnts[i].mflags, mnts[i].opts) < 0) {
            /* Report while errno and srcpath are still intact, then free */
            virReportSystemError(errno,
                                 _("Failed to mount %s on %s type %s"),
                                 srcpath, mnts[i].dst, NULLSTR(mnts[i].type));
            VIR_FREE(src);
            goto cleanup;
        }
        VIR_FREE(src);
    }

    rc = 0;

cleanup:
    VIR_DEBUG("rc=%d", rc);
    return rc;
}


/*
 * Move the mount found at /.oldroot<root->src>/dev/pts to /dev/pts,
 * creating /dev/pts first.
 *
 * Returns 0 on success, -1 on failure with an error reported.
 */
static int lxcContainerMountDevFS(virDomainFSDefPtr root)
{
    int ret = -1;
    char *oldDevpts = NULL;

    if (virAsprintf(&oldDevpts, "/.oldroot%s/dev/pts", root->src) < 0) {
        virReportOOMError();
        goto cleanup;
    }

    if (virFileMakePath("/dev/pts") < 0) {
        virReportSystemError(errno, "%s", _("Cannot create /dev/pts"));
        goto cleanup;
    }

    VIR_DEBUG("Trying to move %s to %s", oldDevpts, "/dev/pts");
    if (mount(oldDevpts, "/dev/pts", NULL, MS_MOVE, NULL) < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to mount /dev/pts in container"));
        goto cleanup;
    }

    ret = 0;

 cleanup:
    VIR_FREE(oldDevpts);

    return ret;
}

531
static int lxcContainerPopulateDevices(char **ttyPaths, size_t nttyPaths)
532
{
533
    size_t i;
534 535 536 537 538 539 540 541 542 543 544 545
    const struct {
        int maj;
        int min;
        mode_t mode;
        const char *path;
    } devs[] = {
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL, 0666, "/dev/null" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO, 0666, "/dev/zero" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL, 0666, "/dev/full" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM, 0666, "/dev/random" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM, 0666, "/dev/urandom" },
    };
546 547 548 549

    /* Populate /dev/ with a few important bits */
    for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) {
        dev_t dev = makedev(devs[i].maj, devs[i].min);
550
        if (mknod(devs[i].path, S_IFCHR, dev) < 0 ||
551
            chmod(devs[i].path, devs[i].mode)) {
552
            virReportSystemError(errno,
553
                                 _("Failed to make device %s"),
554
                                 devs[i].path);
555 556 557 558
            return -1;
        }
    }

559
    if (access("/dev/pts/ptmx", W_OK) == 0) {
560 561 562 563
        /* We have private devpts capability, so bind that */
        if (virFileTouch("/dev/ptmx", 0666) < 0)
            return -1;

S
Serge Hallyn 已提交
564
        if (mount("/dev/pts/ptmx", "/dev/ptmx", "ptmx", MS_BIND, NULL) < 0) {
565
            virReportSystemError(errno, "%s",
566 567 568 569 570 571 572 573 574 575
                                 _("Failed to bind /dev/pts/ptmx on to /dev/ptmx"));
            return -1;
        }
    } else {
        /* Legacy devpts, so we need to just use shared one */
        dev_t dev = makedev(LXC_DEV_MAJ_TTY, LXC_DEV_MIN_PTMX);
        if (mknod("/dev/ptmx", S_IFCHR, dev) < 0 ||
            chmod("/dev/ptmx", 0666)) {
            virReportSystemError(errno, "%s",
                                 _("Failed to make device /dev/ptmx"));
576 577 578 579
            return -1;
        }
    }

580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600
    for (i = 0 ; i < nttyPaths ; i++) {
        char *tty;
        if (virAsprintf(&tty, "/dev/tty%zu", i+1) < 0) {
            virReportOOMError();
            return -1;
        }
        if (symlink(ttyPaths[i], tty) < 0) {
            VIR_FREE(tty);
            virReportSystemError(errno,
                                 _("Failed to symlink %s to %s"),
                                 ttyPaths[i], tty);
            return -1;
        }
        VIR_FREE(tty);
        if (i == 0 &&
            symlink(ttyPaths[i], "/dev/console") < 0) {
            virReportSystemError(errno,
                                 _("Failed to symlink %s to /dev/console"),
                                 ttyPaths[i]);
            return -1;
        }
601
    }
602 603 604 605
    return 0;
}


606 607
/*
 * Bind-mount the directory <srcprefix><fs->src> onto fs->dst.
 *
 * @fs: filesystem definition (src, dst and readonly are used)
 * @srcprefix: prefix prepended to fs->src
 *
 * The destination directory is created first. When fs->readonly is
 * set the bind mount is remounted read-only.
 *
 * Returns 0 on success, -1 on failure with an error reported.
 */
static int lxcContainerMountFSBind(virDomainFSDefPtr fs,
                                   const char *srcprefix)
{
    char *src = NULL;
    int ret = -1;

    if (virAsprintf(&src, "%s%s", srcprefix, fs->src) < 0) {
        virReportOOMError();
        goto cleanup;
    }

    if (virFileMakePath(fs->dst) < 0) {
        virReportSystemError(errno,
                             _("Failed to create %s"),
                             fs->dst);
        goto cleanup;
    }

    if (mount(src, fs->dst, NULL, MS_BIND, NULL) < 0) {
        virReportSystemError(errno,
                             _("Failed to bind mount directory %s to %s"),
                             src, fs->dst);
        goto cleanup;
    }

    if (fs->readonly) {
        VIR_DEBUG("Binding %s readonly", fs->dst);
        if (mount(fs->dst, fs->dst, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
            virReportSystemError(errno,
                                 _("Failed to make directory %s readonly"),
                                 fs->dst);
            /* A read-only request that left the mount writable is a
             * failure, not a success with a logged error */
            goto cleanup;
        }
    }

    ret = 0;

cleanup:
    VIR_FREE(src);
    return ret;
}


648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728
#ifdef HAVE_LIBBLKID
/*
 * Probe @src with libblkid for its filesystem type.
 *
 * @src: path opened read-only and handed to the blkid probe
 * @type: set to a newly allocated copy of the detected "TYPE" value,
 *        or to NULL when nothing was detected
 *
 * Returns 0 on success (including "nothing found", with *type NULL),
 * -1 on failure with an error reported.
 */
static int
lxcContainerMountDetectFilesystem(const char *src, char **type)
{
    blkid_probe probe = NULL;
    const char *fstype = NULL;
    int probeResult;
    int result = -1;
    int fd;

    *type = NULL;

    fd = open(src, O_RDONLY);
    if (fd < 0) {
        virReportSystemError(errno,
                             _("Unable to open filesystem %s"), src);
        return -1;
    }

    probe = blkid_new_probe();
    if (probe == NULL) {
        virReportSystemError(errno, "%s",
                             _("Unable to create blkid library handle"));
        goto cleanup;
    }

    if (blkid_probe_set_device(probe, fd, 0, 0) < 0) {
        virReportSystemError(EINVAL,
                             _("Unable to associate device %s with blkid library"),
                             src);
        goto cleanup;
    }

    /* Only superblock probing, and only the TYPE value, is requested */
    blkid_probe_enable_superblocks(probe, 1);
    blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_TYPE);

    probeResult = blkid_do_safeprobe(probe);

    if (probeResult == 1) {
        /* Nothing found, return success with *type == NULL */
        result = 0;
        goto cleanup;
    }

    if (probeResult == -2) {
        virReportSystemError(EINVAL,
                             _("Too many filesystems detected for %s"),
                             src);
        goto cleanup;
    }

    if (probeResult != 0) {
        virReportSystemError(errno,
                             _("Unable to detect filesystem for %s"),
                             src);
        goto cleanup;
    }

    if (blkid_probe_lookup_value(probe, "TYPE", &fstype, NULL) < 0) {
        virReportSystemError(ENOENT,
                             _("Unable to find filesystem type for %s"),
                             src);
        goto cleanup;
    }

    *type = strdup(fstype);
    if (*type == NULL) {
        virReportOOMError();
        goto cleanup;
    }

    result = 0;

cleanup:
    VIR_FORCE_CLOSE(fd);
    if (probe)
        blkid_free_probe(probe);
    return result;
}
#else /* ! HAVE_LIBBLKID */
/*
 * Variant used when libblkid is not available: never detects anything.
 *
 * Sets *type to NULL and returns 0.
 */
static int
lxcContainerMountDetectFilesystem(const char *src ATTRIBUTE_UNUSED,
                                  char **type)
{
    /* No libblkid, so just return success with no detected type */
    *type = NULL;
    return 0;
}
#endif /* ! HAVE_LIBBLKID */
729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785

/*
 * This function attempts to do automatic detection of filesystem
 * type following the same rules as the util-linux 'mount' binary.
 *
 * No format probing is done here. We go straight to using
 * /etc/filesystems, and then /proc/filesystems (both looked up
 * under @srcprefix), trying each listed type in turn until a
 * mount succeeds.
 *
 * @fs: filesystem definition; fs->dst is the mount target
 * @fsflags: flags passed to mount()
 * @src: device to mount
 * @srcprefix: prefix prepended to the filesystem list paths
 *
 * Returns 0 once a mount succeeded, -1 otherwise with an error reported.
 */
static int lxcContainerMountFSBlockAuto(virDomainFSDefPtr fs,
                                        int fsflags,
                                        const char *src,
                                        const char *srcprefix)
{
    FILE *fp = NULL;
    int ret = -1;
    bool tryProc = false;
    bool gotStar = false;
    char *fslist = NULL;
    char *line = NULL;
    const char *type;

    VIR_DEBUG("src=%s srcprefix=%s dst=%s", src, srcprefix, fs->dst);

    /* First time around we use /etc/filesystems */
retry:
    if (virAsprintf(&fslist, "%s%s",
                    srcprefix, tryProc ? "/proc/filesystems" : "/etc/filesystems") < 0) {
        virReportOOMError();
        goto cleanup;
    }

    VIR_DEBUG("Open fslist %s", fslist);
    if (!(fp = fopen(fslist, "r"))) {
        /* If /etc/filesystems does not exist, then we need to retry
         * with /proc/filesystems next
         */
        if (errno == ENOENT &&
            !tryProc) {
            tryProc = true;
            VIR_FREE(fslist);
            goto retry;
        }

        virReportSystemError(errno,
                             _("Unable to read %s"),
                             fslist);
        goto cleanup;
    }

    while (!feof(fp)) {
        size_t n = 0;
        char *eol;

        VIR_FREE(line);
        if (getline(&line, &n, fp) <= 0) {
            if (feof(fp))
                break;

            /* A genuine read error: do not fail silently */
            virReportSystemError(errno,
                                 _("Unable to read %s"),
                                 fslist);
            goto cleanup;
        }

        /* Types flagged 'nodev' have no backing device, so skip them */
        if (strstr(line, "nodev"))
            continue;

        eol = strchr(line, '\n');
        if (eol)
            *eol = '\0';

        type = line;
        virSkipSpaces(&type);

        /*
         * /etc/filesystems is only allowed to contain '*' on the last line
         */
        if (gotStar && !tryProc) {
            lxcError(VIR_ERR_INTERNAL_ERROR,
                     _("%s has unexpected '*' before last line"),
                     fslist);
            goto cleanup;
        }

        /* An '*' on the last line in /etc/filesystems
         * means try /proc/filesystems next. We don't
         * jump immediately though, since we need to see
         * if any more lines follow
         */
        if (!tryProc &&
            STREQ(type, "*"))
            gotStar = true;

        VIR_DEBUG("Trying mount %s with %s", src, type);
        if (mount(src, fs->dst, type, fsflags, NULL) < 0) {
            /* These errnos indicate a bogus filesystem type for
             * the image we have, so skip to the next type
             */
            if (errno == EINVAL || errno == ENODEV)
                continue;

            virReportSystemError(errno,
                                 _("Failed to mount device %s to %s"),
                                 src, fs->dst);
            goto cleanup;
        }

        ret = 0;
        break;
    }

    /* We've got to the end of /etc/filesystems and saw
     * a '*', so we must try /proc/filesystems next
     */
    if (ret != 0 &&
        !tryProc &&
        gotStar) {
        tryProc = true;
        VIR_FREE(fslist);
        VIR_FORCE_FCLOSE(fp);
        goto retry;
    }

    if (ret != 0) {
        virReportSystemError(ENODEV,
                             _("Failed to mount device %s to %s, unable to detect filesystem"),
                             src, fs->dst);
    }

    VIR_DEBUG("Done mounting filesystem ret=%d tryProc=%d", ret, tryProc);

cleanup:
    VIR_FREE(line);
    /* The list path is heap-allocated on every pass; release it here */
    VIR_FREE(fslist);
    VIR_FORCE_FCLOSE(fp);
    return ret;
}


/*
 * Mount a block device 'src' on fs->dst, automatically
 * probing for filesystem type
 *
 * @fs: filesystem definition; fs->dst is created and used as the
 *      target, fs->readonly adds MS_RDONLY
 * @src: device to mount
 * @srcprefix: passed through to the list-based fallback
 *
 * When lxcContainerMountDetectFilesystem() yields a format it is used
 * directly; otherwise lxcContainerMountFSBlockAuto() is tried.
 *
 * Returns 0 on success, -1 on failure.
 */
static int lxcContainerMountFSBlockHelper(virDomainFSDefPtr fs,
                                          const char *src,
                                          const char *srcprefix)
{
    int fsflags = 0;
    int ret = -1;
    char *format = NULL;

    if (fs->readonly)
        fsflags |= MS_RDONLY;

    if (virFileMakePath(fs->dst) < 0) {
        virReportSystemError(errno,
                             _("Failed to create %s"),
                             fs->dst);
        goto cleanup;
    }

    if (lxcContainerMountDetectFilesystem(src, &format) < 0)
        goto cleanup;

    if (format) {
        VIR_DEBUG("Mount %s with detected format %s", src, format);
        if (mount(src, fs->dst, format, fsflags, NULL) < 0) {
            virReportSystemError(errno,
                                 _("Failed to mount device %s to %s as %s"),
                                 src, fs->dst, format);
            goto cleanup;
        }
        ret = 0;
    } else {
        ret = lxcContainerMountFSBlockAuto(fs, fsflags, src, srcprefix);
    }

cleanup:
    /* The detected format string is allocated for us; release it */
    VIR_FREE(format);
    return ret;
}


/*
 * Mount the block device <srcprefix><fs->src> on fs->dst by building
 * the full source path and delegating to
 * lxcContainerMountFSBlockHelper().
 *
 * Returns the helper's result, or -1 if the path could not be allocated.
 */
static int lxcContainerMountFSBlock(virDomainFSDefPtr fs,
                                    const char *srcprefix)
{
    int rc = -1;
    char *fullsrc = NULL;

    if (virAsprintf(&fullsrc, "%s%s", srcprefix, fs->src) >= 0) {
        rc = lxcContainerMountFSBlockHelper(fs, fullsrc, srcprefix);

        VIR_DEBUG("Done mounting filesystem ret=%d", rc);
    } else {
        virReportOOMError();
    }

    VIR_FREE(fullsrc);
    return rc;
}


/*
 * Mount one filesystem according to its type.
 *
 * @fs: filesystem definition
 * @srcprefix: prefix prepended to the source path by the per-type helpers
 *
 * VIR_DOMAIN_FS_TYPE_MOUNT is bind-mounted and VIR_DOMAIN_FS_TYPE_BLOCK
 * is mounted as a block device. Every other type is rejected.
 *
 * Returns 0 on success, -1 on failure with an error reported.
 */
static int lxcContainerMountFS(virDomainFSDefPtr fs,
                               const char *srcprefix)
{
    switch (fs->type) {
    case VIR_DOMAIN_FS_TYPE_MOUNT:
        if (lxcContainerMountFSBind(fs, srcprefix) < 0)
            return -1;
        break;
    case VIR_DOMAIN_FS_TYPE_BLOCK:
        if (lxcContainerMountFSBlock(fs, srcprefix) < 0)
            return -1;
        break;
    case VIR_DOMAIN_FS_TYPE_FILE:
        lxcError(VIR_ERR_INTERNAL_ERROR,
                 _("Unexpected filesystem type %s"),
                 virDomainFSTypeToString(fs->type));
        /* An error was reported, so the caller must see a failure */
        return -1;
    default:
        lxcError(VIR_ERR_CONFIG_UNSUPPORTED,
                 _("Cannot mount filesystem type %s"),
                 virDomainFSTypeToString(fs->type));
        return -1;
    }
    return 0;
}


/*
 * Mount every filesystem listed in vmDef->fss via lxcContainerMountFS().
 *
 * @vmDef: domain definition supplying the fss array and nfss count
 * @dstprefix: passed to lxcContainerMountFS() as its prefix argument
 * @skipRoot: when true, entries whose dst is "/" are not mounted
 *
 * Returns 0 when all mounts succeeded, -1 as soon as one fails.
 */
static int lxcContainerMountAllFS(virDomainDefPtr vmDef,
                                  const char *dstprefix,
                                  bool skipRoot)
{
    size_t idx = 0;

    VIR_DEBUG("Mounting %s %d", dstprefix, skipRoot);

    /* Pull in rest of container's mounts */
    while (idx < vmDef->nfss) {
        bool isRoot = skipRoot && STREQ(vmDef->fss[idx]->dst, "/");

        if (!isRoot &&
            lxcContainerMountFS(vmDef->fss[idx], dstprefix) < 0) {
            return -1;
        }

        idx++;
    }

    VIR_DEBUG("Mounted all filesystems");
    return 0;
}


static int lxcContainerUnmountOldFS(void)
{
978
    struct mntent mntent;
979 980 981 982
    char **mounts = NULL;
    int nmounts = 0;
    FILE *procmnt;
    int i;
983
    char mntbuf[1024];
984 985 986
    int saveErrno;
    const char *failedUmount = NULL;
    int ret = -1;
987 988

    if (!(procmnt = setmntent("/proc/mounts", "r"))) {
989
        virReportSystemError(errno, "%s",
990
                             _("Failed to read /proc/mounts"));
991 992
        return -1;
    }
993
    while (getmntent_r(procmnt, &mntent, mntbuf, sizeof(mntbuf)) != NULL) {
994
        VIR_DEBUG("Got %s", mntent.mnt_dir);
995
        if (!STRPREFIX(mntent.mnt_dir, "/.oldroot"))
996 997 998
            continue;

        if (VIR_REALLOC_N(mounts, nmounts+1) < 0) {
999
            virReportOOMError();
1000
            goto cleanup;
1001
        }
1002
        if (!(mounts[nmounts++] = strdup(mntent.mnt_dir))) {
1003
            virReportOOMError();
1004
            goto cleanup;
1005
        }
1006
        VIR_DEBUG("Grabbed %s", mntent.mnt_dir);
1007 1008
    }

1009 1010 1011
    if (mounts)
        qsort(mounts, nmounts, sizeof(mounts[0]),
              lxcContainerChildMountSort);
1012 1013

    for (i = 0 ; i < nmounts ; i++) {
1014
        VIR_DEBUG("Umount %s", mounts[i]);
1015
        if (umount(mounts[i]) < 0) {
1016 1017 1018 1019 1020 1021 1022
            char ebuf[1024];
            failedUmount = mounts[i];
            saveErrno = errno;
            VIR_WARN("Failed to unmount '%s', trying to detach root '%s': %s",
                     failedUmount, mounts[nmounts-1],
                     virStrerror(errno, ebuf, sizeof(ebuf)));
            break;
1023 1024
        }
    }
1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048

    if (failedUmount) {
        /* This detaches the old root filesystem */
        if (umount2(mounts[nmounts-1], MNT_DETACH) < 0) {
            virReportSystemError(saveErrno,
                                 _("Failed to unmount '%s' and could not detach old root '%s'"),
                                 failedUmount, mounts[nmounts-1]);
            goto cleanup;
        }
        /* This unmounts the tmpfs on which the old root filesystem was hosted */
        if (umount(mounts[nmounts-1]) < 0) {
            virReportSystemError(saveErrno,
                                 _("Failed to unmount '%s' and could not unmount old root '%s'"),
                                 failedUmount, mounts[nmounts-1]);
            goto cleanup;
        }
    }

    ret = 0;

cleanup:
    for (i = 0 ; i < nmounts ; i++)
        VIR_FREE(mounts[i]);
    endmntent(procmnt);
1049 1050
    VIR_FREE(mounts);

1051
    return ret;
1052 1053 1054 1055 1056 1057 1058 1059
}


/* Got a FS mapped to /, we're going the pivot_root
 * approach to do a better-chroot-than-chroot
 * this is based on this thread http://lkml.org/lkml/2008/3/5/29
 */
static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef,
1060 1061 1062
                                      virDomainFSDefPtr root,
                                      char **ttyPaths,
                                      size_t nttyPaths)
1063
{
1064
    /* Gives us a private root, leaving all parent OS mounts on /.oldroot */
1065 1066 1067
    if (lxcContainerPivotRoot(root) < 0)
        return -1;

1068
    /* Mounts the core /proc, /sys, etc filesystems */
1069
    if (lxcContainerMountBasicFS("/.oldroot", true) < 0)
1070 1071 1072 1073
        return -1;

    /* Mounts /dev and /dev/pts */
    if (lxcContainerMountDevFS(root) < 0)
1074 1075
        return -1;

1076
    /* Populates device nodes in /dev/ */
1077
    if (lxcContainerPopulateDevices(ttyPaths, nttyPaths) < 0)
1078 1079
        return -1;

1080
    /* Sets up any non-root mounts from guest config */
1081
    if (lxcContainerMountAllFS(vmDef, "/.oldroot", true) < 0)
1082 1083
        return -1;

1084
    /* Gets rid of all remaining mounts from host OS, including /.oldroot itself */
1085 1086 1087 1088 1089 1090
    if (lxcContainerUnmountOldFS() < 0)
        return -1;

    return 0;
}

1091

1092 1093 1094 1095
/*
 * Nothing is mapped to /, so the container keeps using the main root
 * with the configured extra filesystems mapped in.
 *
 * Returns 0 on success, -1 on failure.
 */
static int lxcContainerSetupExtraMounts(virDomainDefPtr vmDef)
{
    int rc;

    VIR_DEBUG("def=%p", vmDef);

    /*
     * Mark / as a recursive slave mount: new filesystems in the host
     * OS still propagate to the container, while changes made in the
     * container stay private.
     */
    rc = mount("", "/", NULL, MS_SLAVE|MS_REC, NULL);
    if (rc < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to make / slave"));
        return -1;
    }

    VIR_DEBUG("Mounting config FS");
    rc = lxcContainerMountAllFS(vmDef, "", false);
    if (rc < 0)
        return -1;

    /* Mounts the core /proc, /sys, etc filesystems */
    rc = lxcContainerMountBasicFS(NULL, false);
    if (rc < 0)
        return -1;

    VIR_DEBUG("Mounting completed");
    return 0;
}

1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139

/*
 * Replace the source path of every filesystem in vmDef->fss with the
 * path produced by virFileResolveAllLinks().  The previous fs->src
 * string is freed and the resolved string takes its place.
 *
 * Returns 0 on success, -1 as soon as one path fails to resolve
 * (entries processed before that keep their resolved paths).
 */
static int lxcContainerResolveSymlinks(virDomainDefPtr vmDef)
{
    size_t idx;

    for (idx = 0; idx < vmDef->nfss; idx++) {
        virDomainFSDefPtr entry = vmDef->fss[idx];
        char *resolved;

        if (virFileResolveAllLinks(entry->src, &resolved) < 0)
            return -1;

        VIR_DEBUG("Resolved '%s' to %s", entry->src, resolved);

        VIR_FREE(entry->src);
        entry->src = resolved;
    }

    return 0;
}

1140
static int lxcContainerSetupMounts(virDomainDefPtr vmDef,
1141 1142 1143
                                   virDomainFSDefPtr root,
                                   char **ttyPaths,
                                   size_t nttyPaths)
1144
{
1145 1146 1147
    if (lxcContainerResolveSymlinks(vmDef) < 0)
        return -1;

1148
    if (root)
1149
        return lxcContainerSetupPivotRoot(vmDef, root, ttyPaths, nttyPaths);
1150 1151 1152 1153
    else
        return lxcContainerSetupExtraMounts(vmDef);
}

D
Daniel P. Berrange 已提交
1154 1155 1156 1157 1158 1159 1160

/*
 * This is running as the 'init' process inside the container.
 * It removes some capabilities that could be dangerous to the
 * host system, since they are not currently "containerized".
 *
 * Without libcap-ng support only a warning is logged.
 *
 * Returns 0 on success (or when libcap-ng is not compiled in),
 * -1 if the capabilities could not be updated or applied.
 */
static int lxcContainerDropCapabilities(void)
{
#if HAVE_CAPNG
    int rc;

    capng_get_caps_process();

    rc = capng_updatev(CAPNG_DROP,
                       CAPNG_EFFECTIVE | CAPNG_PERMITTED |
                       CAPNG_INHERITABLE | CAPNG_BOUNDING_SET,
                       CAP_SYS_BOOT, /* No use of reboot */
                       CAP_SYS_MODULE, /* No kernel module loading */
                       CAP_SYS_TIME, /* No changing the clock */
                       CAP_AUDIT_CONTROL, /* No messing with auditing status */
                       CAP_MAC_ADMIN, /* No messing with LSM config */
                       -1 /* sentinel */);
    if (rc < 0) {
        lxcError(VIR_ERR_INTERNAL_ERROR,
                 _("Failed to remove capabilities: %d"), rc);
        return -1;
    }

    rc = capng_apply(CAPNG_SELECT_BOTH);
    if (rc < 0) {
        lxcError(VIR_ERR_INTERNAL_ERROR,
                 _("Failed to apply capabilities: %d"), rc);
        return -1;
    }

    /* We do not need to call capng_lock() in this case. The bounding
     * set restriction will prevent them reacquiring sys_boot/module/time,
     * etc which is all that matters for the container. Once inside the
     * container it is fine for SECURE_NOROOT / SECURE_NO_SETUID_FIXUP to
     * be unmasked  - they can never escape the bounding set. */

#else
    VIR_WARN("libcap-ng support not compiled in, unable to clear capabilities");
#endif
    return 0;
}


1200
/**
1201 1202
 * lxcContainerChild:
 * @data: pointer to container arguments
1203 1204 1205 1206 1207 1208 1209 1210 1211
 *
 * This function is run in the process clone()'d in lxcStartContainer.
 * Perform a number of container setup tasks:
 *     Setup container file system
 *     mount container /proca
 * Then exec's the container init
 *
 * Returns 0 on success or -1 in case of error
 */
1212
static int lxcContainerChild( void *data )
1213
{
1214
    lxc_child_argv_t *argv = data;
1215
    virDomainDefPtr vmDef = argv->config;
1216
    int ttyfd = -1;
1217
    int ret = -1;
1218
    char *ttyPath = NULL;
1219
    virDomainFSDefPtr root;
1220
    virCommandPtr cmd = NULL;
1221 1222

    if (NULL == vmDef) {
1223
        lxcError(VIR_ERR_INTERNAL_ERROR,
J
Jim Meyering 已提交
1224
                 "%s", _("lxcChild() passed invalid vm definition"));
1225
        goto cleanup;
1226 1227
    }

1228 1229 1230
    cmd = lxcContainerBuildInitCmd(vmDef);
    virCommandWriteArgLog(cmd, 1);

1231
    root = virDomainGetRootFilesystem(vmDef);
1232

1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243
    if (argv->nttyPaths) {
        if (root) {
            if (virAsprintf(&ttyPath, "%s%s", root->src, argv->ttyPaths[0]) < 0) {
                virReportOOMError();
                goto cleanup;
            }
        } else {
            if (!(ttyPath = strdup(argv->ttyPaths[0]))) {
                virReportOOMError();
                goto cleanup;
            }
1244 1245
        }
    } else {
1246
        if (!(ttyPath = strdup("/dev/null"))) {
1247
            virReportOOMError();
1248
            goto cleanup;
1249 1250
        }
    }
1251

1252
    VIR_DEBUG("Container TTY path: %s", ttyPath);
1253 1254

    ttyfd = open(ttyPath, O_RDWR|O_NOCTTY);
1255
    if (ttyfd < 0) {
1256
        virReportSystemError(errno,
1257
                             _("Failed to open tty %s"),
1258
                             ttyPath);
1259
        goto cleanup;
1260
    }
1261

1262
    if (lxcContainerSetupMounts(vmDef, root, argv->ttyPaths, argv->nttyPaths) < 0)
1263
        goto cleanup;
1264

1265 1266 1267 1268 1269 1270 1271
    if (!virFileExists(vmDef->os.init)) {
        virReportSystemError(errno,
                    _("cannot find init path '%s' relative to container root"),
                    vmDef->os.init);
        goto cleanup;
    }

1272
    /* Wait for interface devices to show up */
1273 1274 1275
    if (lxcContainerWaitForContinue(argv->monitor) < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to read the container continue message"));
1276
        goto cleanup;
1277 1278
    }
    VIR_DEBUG("Received container continue message");
1279

1280 1281
    /* rename and enable interfaces */
    if (lxcContainerRenameAndEnableInterfaces(argv->nveths,
1282
                                              argv->veths) < 0) {
1283
        goto cleanup;
1284
    }
1285

1286
    /* drop a set of root capabilities */
D
Daniel P. Berrange 已提交
1287
    if (lxcContainerDropCapabilities() < 0)
1288
        goto cleanup;
1289

1290 1291 1292 1293 1294 1295 1296
    if (lxcContainerSendContinue(argv->handshakefd) < 0) {
        virReportSystemError(errno, "%s",
                            _("failed to send continue signal to controller"));
        goto cleanup;
    }

    if (lxcContainerSetStdio(argv->monitor, ttyfd, argv->handshakefd) < 0) {
1297 1298
        goto cleanup;
    }
1299

1300
    ret = 0;
1301
cleanup:
1302 1303
    VIR_FREE(ttyPath);
    VIR_FORCE_CLOSE(ttyfd);
1304
    VIR_FORCE_CLOSE(argv->monitor);
1305
    VIR_FORCE_CLOSE(argv->handshakefd);
1306 1307

    if (ret == 0) {
E
Eric Blake 已提交
1308
        /* this function will only return if an error occurred */
1309 1310 1311
        ret = virCommandExec(cmd);
    }

1312 1313
    virCommandFree(cmd);
    return ret;
1314
}
1315

1316 1317
/*
 * Tells whether user namespaces should be enabled for containers.
 *
 * Currently always answers 0 (no); the probe via
 * lxcContainerAvailable() is compiled out.
 */
static int userns_supported(void)
{
#if 0
    return lxcContainerAvailable(LXC_CONTAINER_FEATURE_USER) == 0;
#else
    /*
     * put off using userns until uid mapping is implemented
     */
    return 0;
#endif
}

1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348
/**
 * lxcContainerGetAlt32bitArch:
 * @arch: name of a 64bit architecture, may be NULL
 *
 * Looks up the 32bit architecture whose personality is available on
 * the given 64bit Linux architecture.
 *
 * Returns a static string naming the 32bit architecture, or NULL if
 * @arch is NULL or is not listed as having a 32bit counterpart.
 */
const char *lxcContainerGetAlt32bitArch(const char *arch)
{
    /* Any Linux 64bit arch which has a 32bit
     * personality available should be listed here */
    static const struct {
        const char *arch64;
        const char *arch32;
    } alternates[] = {
        { "x86_64",   "i686" },
        { "s390x",    "s390" },
        { "ppc64",    "ppc" },
        { "parisc64", "parisc" },
        { "sparc64",  "sparc" },
        { "mips64",   "mips" },
    };
    size_t i;

    /* Comparing against a NULL string would be undefined behaviour */
    if (!arch)
        return NULL;

    for (i = 0; i < sizeof(alternates) / sizeof(alternates[0]); i++) {
        if (STREQ(arch, alternates[i].arch64))
            return alternates[i].arch32;
    }

    return NULL;
}


1349 1350
/**
 * lxcContainerStart:
1351 1352 1353 1354 1355
 * @def: pointer to virtual machine structure
 * @nveths: number of interfaces
 * @veths: interface names
 * @control: control FD to the container
 * @ttyPath: path of tty to set as the container console
1356 1357 1358 1359 1360
 *
 * Starts a container process by calling clone() with the namespace flags
 *
 * Returns PID of container on success or -1 in case of error
 */
1361
int lxcContainerStart(virDomainDefPtr def,
1362 1363
                      unsigned int nveths,
                      char **veths,
1364
                      int control,
1365
                      int handshakefd,
1366 1367
                      char **ttyPaths,
                      size_t nttyPaths)
1368 1369
{
    pid_t pid;
E
Eric Blake 已提交
1370
    int cflags;
1371 1372
    int stacksize = getpagesize() * 4;
    char *stack, *stacktop;
1373 1374
    lxc_child_argv_t args = { def, nveths, veths, control,
                              ttyPaths, nttyPaths, handshakefd};
1375 1376 1377

    /* allocate a stack for the container */
    if (VIR_ALLOC_N(stack, stacksize) < 0) {
1378
        virReportOOMError();
1379 1380 1381 1382
        return -1;
    }
    stacktop = stack + stacksize;

E
Eric Blake 已提交
1383
    cflags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWIPC|SIGCHLD;
1384

1385
    if (userns_supported()) {
1386
        VIR_DEBUG("Enable user namespaces");
E
Eric Blake 已提交
1387
        cflags |= CLONE_NEWUSER;
1388
    }
1389

1390
    if (def->nets != NULL) {
1391
        VIR_DEBUG("Enable network namespaces");
E
Eric Blake 已提交
1392
        cflags |= CLONE_NEWNET;
1393
    }
1394

E
Eric Blake 已提交
1395
    pid = clone(lxcContainerChild, stacktop, cflags, &args);
1396
    VIR_FREE(stack);
1397
    VIR_DEBUG("clone() completed, new container PID is %d", pid);
1398 1399

    if (pid < 0) {
1400
        virReportSystemError(errno, "%s",
1401
                             _("Failed to run clone container"));
1402 1403 1404 1405 1406 1407
        return -1;
    }

    return pid;
}

1408 1409
/*
 * Minimal clone() entry point: ignores its argument and terminates
 * the child immediately with a zero exit status.
 */
ATTRIBUTE_NORETURN static int
lxcContainerDummyChild(void *argv ATTRIBUTE_UNUSED)
{
    _exit(EXIT_SUCCESS);
}

int lxcContainerAvailable(int features)
{
1416
    int flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|
1417 1418 1419 1420 1421
        CLONE_NEWIPC|SIGCHLD;
    int cpid;
    char *childStack;
    char *stack;

1422 1423 1424
    if (features & LXC_CONTAINER_FEATURE_USER)
        flags |= CLONE_NEWUSER;

1425 1426 1427 1428
    if (features & LXC_CONTAINER_FEATURE_NET)
        flags |= CLONE_NEWNET;

    if (VIR_ALLOC_N(stack, getpagesize() * 4) < 0) {
1429
        VIR_DEBUG("Unable to allocate stack");
1430 1431 1432 1433 1434 1435 1436 1437
        return -1;
    }

    childStack = stack + (getpagesize() * 4);

    cpid = clone(lxcContainerDummyChild, childStack, flags, NULL);
    VIR_FREE(stack);
    if (cpid < 0) {
1438
        char ebuf[1024] ATTRIBUTE_UNUSED;
1439
        VIR_DEBUG("clone call returned %s, container support is not enabled",
1440
              virStrerror(errno, ebuf, sizeof ebuf));
1441
        return -1;
E
Eric Blake 已提交
1442 1443
    } else if (virPidWait(cpid, NULL) < 0) {
        return -1;
1444 1445
    }

1446
    VIR_DEBUG("Mounted all filesystems");
1447
    return 0;
1448
}